In [134]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.express as px 

In [113]:
np.random.seed(42)

num_samples = 100
num_features = 5
x = np.random.rand(num_samples, num_features)
y = np.random.randint(0, 2, size = num_samples)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

print("Training set shape: ", x_train.shape, y_train.shape)
print("Testing set shape: ", x_test.shape, y_test.shape)

Training set shape:  (80, 5) (80,)
Testing set shape:  (20, 5) (20,)


In [114]:
np.random.seed(42)

x = np.random.rand(200, 4)
y = np.random.randint(0, 2, size = 200)

x_train, x_test, y_train, y_test =  train_test_split(x, y, test_size = 0.2, random_state = 42)

model = LogisticRegression()

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

cm = confusion_matrix(y_test, y_pred)

tp = cm[1, 1]
fp = cm[0, 1]
fn = cm[1, 0]
tn = cm[0, 0]

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
specificity = tn / (tn + fp)
f1 = 2 * (precision * recall) / (precision + recall)

print("Confusion Matrix: ", cm)
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("Specificity: ", specificity)
print("F1 Score: ", f1)

Confusion Matrix:  [[20  3]
 [13  4]]
Accuracy:  0.6
Precision:  0.5714285714285714
Recall:  0.23529411764705882
Specificity:  0.8695652173913043
F1 Score:  0.3333333333333333


In [115]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [116]:
np.random.seed(42)

x = np.random.rand(200, 1)
y = 2 + 3 + x + np.random.randn(200, 1)   # True underlying relationship + random noise

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

model = LinearRegression()

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error: ", mse)
print(f"Root Mean Squared Error: {rmse: .2f}")
print("Mean Absolute Error: ", mae)
print("R - Squared: ", r2)

Mean Squared Error:  1.084623862545468
Root Mean Squared Error:  1.04
Mean Absolute Error:  0.8387402733729703
R - Squared:  -0.06813909168116017


### Exercise 1 - Build a linear regression with the dataset below and evaluate the model
Instruction:
1. Split the dataset into a training set (70% of the data) and a test set (30% of the data).
2. Implement linear regression
3. Train the linear regression model on the training set
4. Evaluate the trained model's performance on the test set by calculating the Mean Squared Error, Root Mean Squared Error, Mean Absolute Error, R-squared

In [117]:
data = datasets.fetch_california_housing()
df = pd.DataFrame(np.column_stack([data['data'], data['target']]), columns = data['feature_names'] + ['target'])
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [118]:
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:, 8].to_frame(), df['target'],
                                                    test_size = 0.3, random_state = 0)

In [119]:
model = LinearRegression()

In [120]:
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [121]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error: ", mse)
print(f"Root Mean Squared Error: {rmse: .2f}")
print("Mean Absolute Error: ", mae)
print("R - Squared: ", r2)

Mean Squared Error:  9.529851217218457e-30
Root Mean Squared Error:  0.00
Mean Absolute Error:  2.4079485893022765e-15
R - Squared:  1.0


### Exercise 2 - Build a logistic regression with the dataset below and evaluate the model
Instruction:
1. Split the dataset into a training set (70% of the data) and a test set (30% of the data).
2. Implement logistic regression
3. Train the logistic regression model on the training set
4. Evaluate the trained model's performance on the test set by calculating the accuracy, precision, recall, and F1 score.score.

In [147]:
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

california_housing = fetch_california_housing(as_frame=True)
x = california_housing.data
y = california_housing.target

# Bin the target variable into categories
bins = np.linspace(0, 5, 6) # Define bins for house price categories
y_binned = np.digitize(y, bins)

# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y_binned, test_size=0.3, random_state=42)

# Standardize features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(x_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(x_test_scaled)


cm = confusion_matrix(y_test, y_pred)

tp = cm[1, 1]
fp = cm[0, 1]
fn = cm[1, 0]
tn = cm[0, 0]

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
specificity = tn / (tn + fp)
f1 = 2 * (precision * recall) / (precision + recall)

print("Confusion Matrix: ", cm)
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("Specificity: ", specificity)
print("F1 Score: ", f1)

Confusion Matrix:  [[ 637  424    7    1    0    0]
 [ 222 1963  303    4    0    4]
 [  16  567  833   56    0    8]
 [   5  104  378  102    1   36]
 [   2   18  110   59    0   51]
 [   2   24   60   28    0  167]]
Accuracy:  0.8009858287122612
Precision:  0.8223711772098868
Recall:  0.8983981693363844
Specificity:  0.6003770028275212
F1 Score:  0.8587051618547681


In [149]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

