In [62]:
%pip install seaborn matplotlib scikit-learn scipy pandas numpy
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.datasets import fetch_california_housing
import warnings
warnings.filterwarnings('ignore')

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# Q1: K-Fold Cross Validation for Multiple Linear Regression

In [63]:
# Load the California housing data and attach the target values as a
# 'price' column alongside the feature columns.
california = fetch_california_housing()
df_house = pd.DataFrame(
    data=california.data,
    columns=california.feature_names,
).assign(price=california.target)

In [64]:
# Separate the predictors from the 'price' target, then standardize the
# predictors with a scaler fitted on every row of X.
y = df_house['price']
X = df_house.drop(columns='price')
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [65]:
# 5-fold cross-validation of an ordinary-least-squares fit on X_scaled.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores = []       # R2 on the held-out fold, one entry per fold
beta_matrices = []   # fitted coefficient vector per fold (intercept first)

for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Prepend a column of ones so the first coefficient is the intercept.
    X_train_with_intercept = np.column_stack([np.ones(X_train.shape[0]), X_train])
    X_test_with_intercept = np.column_stack([np.ones(X_test.shape[0]), X_test])

    # Solve the least-squares problem directly instead of forming
    # inv(X.T @ X): lstsq is numerically more stable and still returns a
    # (minimum-norm) solution when X.T @ X is singular or ill-conditioned.
    beta, *_ = np.linalg.lstsq(
        X_train_with_intercept,
        np.asarray(y_train, dtype=float),
        rcond=None,
    )
    y_pred = X_test_with_intercept @ beta

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    beta_matrices.append(beta)

    print(f'R2 Score: {r2:.4f}')

R2 Score: 0.5758
R2 Score: 0.6137
R2 Score: 0.6086
R2 Score: 0.6213
R2 Score: 0.5875


In [66]:
# Pick the coefficient vector from the fold with the highest R2.
best_idx = np.asarray(r2_scores).argmax()
best_beta = beta_matrices[best_idx]
print(f'Best R2 Score: {r2_scores[best_idx]:.4f}')

# 70/30 split of the same scaled data used for the folds above.
# NOTE(review): best_beta was fitted on the training rows of one fold of
# X_scaled, and this hold-out is drawn independently from X_scaled, so the
# two row sets can overlap -- confirm this is the intended evaluation.
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)
X_train_final_with_intercept = np.hstack(
    [np.ones((X_train_final.shape[0], 1)), X_train_final]
)
X_test_final_with_intercept = np.hstack(
    [np.ones((X_test_final.shape[0], 1)), X_test_final]
)

# Score the selected coefficients on the 30% hold-out.
y_pred_final = X_test_final_with_intercept @ best_beta
final_r2 = r2_score(y_test_final, y_pred_final)
print(f'Final Test R2 Score: {final_r2:.4f}')

Best R2 Score: 0.6213
Final Test R2 Score: 0.6022


# Q2: Validation Set for Multiple Linear Regression (Gradient Descent)

In [67]:
# First carve off 30% as the test set, then take 20% of the remainder as the
# validation set; what is left is the training set.
X_temp, X_test_q2, y_temp, y_test_q2 = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)
X_train_q2, X_val_q2, y_train_q2, y_val_q2 = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=42
)

# Report each partition's row count and its share of the full dataset.
total_rows = len(X_scaled)
for label, part in (('Train', X_train_q2), ('Validation', X_val_q2), ('Test', X_test_q2)):
    print(f'{label} set size: {len(part)} ({len(part)/total_rows*100:.1f}%)')

Train set size: 11558 (56.0%)
Validation set size: 2890 (14.0%)
Test set size: 6192 (30.0%)


In [68]:
def gradient_descent(X, y, learning_rate, iterations, random_state=None):
    """Fit linear-regression coefficients with batch gradient descent.

    A column of ones is prepended to ``X`` so the first returned coefficient
    is the intercept. Each iteration moves ``theta`` against the gradient
    ``X.T @ (X @ theta - y) / m`` scaled by ``learning_rate``.

    Parameters
    ----------
    X : array-like of shape (m, n_features) or (m,)
        Feature values, without an intercept column.
    y : array-like of shape (m,)
        Target values; converted to a float ndarray.
    learning_rate : float
        Step size applied to the gradient on each iteration.
    iterations : int
        Number of gradient steps to take.
    random_state : int or None, default None
        Seed for the random initial ``theta``. ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    numpy.ndarray of shape (n_features + 1,)
        Fitted coefficients, intercept first.

    Raises
    ------
    ValueError
        If ``y`` is empty or ``X`` and ``y`` have different numbers of rows.
    """
    X = np.asarray(X, dtype=float)
    # Plain ndarray so the arithmetic below is purely positional.
    y = np.asarray(y, dtype=float)
    m = len(y)
    if m == 0:
        raise ValueError('gradient_descent requires at least one sample')
    if X.shape[0] != m:
        raise ValueError(
            f'X has {X.shape[0]} rows but y has {m} values'
        )

    X_with_intercept = np.column_stack([np.ones(m), X])
    n_coefficients = X_with_intercept.shape[1]

    if random_state is None:
        # Unseeded call: draw from the global RNG exactly as before.
        theta = np.random.randn(n_coefficients)
    else:
        theta = np.random.RandomState(random_state).randn(n_coefficients)

    for _ in range(iterations):
        errors = X_with_intercept @ theta - y
        gradient = (X_with_intercept.T @ errors) / m
        theta = theta - learning_rate * gradient

    return theta

In [69]:
# Sweep several learning rates, recording the fitted coefficients and the
# validation / test R2 obtained with each.
learning_rates = [0.001, 0.01, 0.1, 1]
coefficients = {}
val_r2_scores = {}
test_r2_scores = {}

# The evaluation design matrices do not depend on the learning rate, so
# build them once instead of on every pass through the loop.
X_val_with_intercept = np.column_stack([np.ones(X_val_q2.shape[0]), X_val_q2])
X_test_with_intercept = np.column_stack([np.ones(X_test_q2.shape[0]), X_test_q2])

for lr in learning_rates:
    # gradient_descent draws its initial theta from NumPy's global RNG.
    # Re-seeding before each run gives every learning rate the same starting
    # point, so the comparison is fair and the results are reproducible.
    np.random.seed(42)
    theta = gradient_descent(X_train_q2, y_train_q2, lr, 1000)
    coefficients[lr] = theta

    val_pred = X_val_with_intercept @ theta
    test_pred = X_test_with_intercept @ theta

    val_r2 = r2_score(y_val_q2, val_pred)
    test_r2 = r2_score(y_test_q2, test_pred)

    val_r2_scores[lr] = val_r2
    test_r2_scores[lr] = test_r2

    print(f'Learning Rate: {lr}, Validation R2: {val_r2:.4f}, Test R2: {test_r2:.4f}')

Learning Rate: 0.001, Validation R2: -2.4416, Test R2: -1.8808
Learning Rate: 0.01, Validation R2: 0.5009, Test R2: 0.4901
Learning Rate: 0.1, Validation R2: 0.6113, Test R2: 0.5970
Learning Rate: 1, Validation R2: 0.6113, Test R2: 0.5970


In [70]:
# Select the learning rate with the highest validation R2 (the first one
# wins on ties) and report its validation and test scores.
best_lr, best_val_r2 = max(val_r2_scores.items(), key=lambda item: item[1])
best_coefficients = coefficients[best_lr]
print(f'Best Learning Rate: {best_lr}')
print(f'Best Validation R2: {best_val_r2:.4f}')
print(f'Corresponding Test R2: {test_r2_scores[best_lr]:.4f}')

Best Learning Rate: 0.1
Best Validation R2: 0.6113
Corresponding Test R2: 0.5970


# Q3: Pre-processing and Multiple Linear Regression

In [71]:
# Column names for the header-less imports-85 data file, in file order.
column_names = """
    symboling normalized_losses make fuel_type aspiration
    num_doors body_style drive_wheels engine_location wheel_base
    length width height curb_weight engine_type num_cylinders
    engine_size fuel_system bore stroke compression_ratio
    horsepower peak_rpm city_mpg highway_mpg price
""".split()

# Read the file, treating '?' entries as missing values.
url_car = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
df_car = pd.read_csv(
    url_car,
    names=column_names,
    na_values='?',
)

In [72]:
# Discard rows that have no target value.
df_car = df_car.dropna(subset=['price'])

# Numeric columns other than the target: fill gaps with the column median.
numeric_cols = df_car.select_dtypes(include=[np.number]).columns
numeric_fill = {
    name: df_car[name].median() for name in numeric_cols if name != 'price'
}
df_car = df_car.fillna(value=numeric_fill)

# Object-typed columns: fill gaps with the first most-frequent value.
categorical_cols = df_car.select_dtypes(include=['object']).columns
categorical_fill = {name: df_car[name].mode()[0] for name in categorical_cols}
df_car = df_car.fillna(value=categorical_fill)

In [73]:
# Translate the spelled-out counts into integers; any value missing from a
# mapping becomes NaN (Series.map behaviour).
door_mapping = dict(two=2, four=4)
cylinder_mapping = dict(two=2, three=3, four=4, five=5, six=6, eight=8, twelve=12)

for column, mapping in (
    ('num_doors', door_mapping),
    ('num_cylinders', cylinder_mapping),
):
    df_car[column] = df_car[column].map(mapping)

In [74]:
# One-hot encode the two columns below, dropping the first level of each.
onehot_cols = ['body_style', 'drive_wheels']
df_car = pd.get_dummies(
    data=df_car,
    columns=onehot_cols,
    drop_first=True,
)

In [75]:
# Replace each listed column with integer labels. A single encoder is
# re-fitted per column, so afterwards `le` holds the classes of the last one.
le_cols = ['make', 'aspiration', 'engine_location', 'fuel_type']
le = LabelEncoder()

df_car = df_car.assign(
    **{name: le.fit_transform(df_car[name]) for name in le_cols}
)

In [76]:
# Collapse each column to a 0/1 flag: 1 when the value's text contains the
# given substring, 0 otherwise.
for column, needle in (('fuel_system', 'pfi'), ('engine_type', 'ohc')):
    as_text = df_car[column].astype(str)
    df_car[column] = as_text.str.contains(needle, regex=False).astype(int)

In [77]:
# Split the car data into predictors and the 'price' target, then standardize
# the predictors with a scaler fitted on every row.
y_car = df_car['price']
X_car = df_car.drop(columns='price')

scaler_car = StandardScaler()
X_car_scaled = scaler_car.fit(X_car).transform(X_car)

In [78]:
# Baseline: linear regression on all scaled features, scored on a 30% hold-out.
X_train_car, X_test_car, y_train_car, y_test_car = train_test_split(
    X_car_scaled, y_car, test_size=0.3, random_state=42
)

lr_model = LinearRegression().fit(X_train_car, y_train_car)
y_pred_car = lr_model.predict(X_test_car)
r2_original = r2_score(y_true=y_test_car, y_pred=y_pred_car)
print(f'Original R2 Score: {r2_original:.4f}')

Original R2 Score: 0.8734


In [79]:
# Project the scaled features onto the principal components needed to keep
# 95% of the variance, and report the dimensionality before and after.
pca = PCA(n_components=0.95)
X_car_pca = pca.fit_transform(X_car_scaled)

n_original_features = X_car_scaled.shape[1]
n_pca_features = X_car_pca.shape[1]
print(f'Original features: {n_original_features}')
print(f'PCA features: {n_pca_features}')

Original features: 29
PCA features: 16


In [80]:
# Same regression as the baseline, but on the PCA-reduced features, using the
# same split parameters so the scores are comparable.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_car_pca, y_car, test_size=0.3, random_state=42
)

lr_pca = LinearRegression().fit(X_train_pca, y_train_pca)
y_pred_pca = lr_pca.predict(X_test_pca)
r2_pca = r2_score(y_true=y_test_pca, y_pred=y_pred_pca)
r2_delta = r2_pca - r2_original  # positive means PCA scored higher
print(f'PCA R2 Score: {r2_pca:.4f}')
print(f'Performance improvement: {r2_delta:.4f}')

PCA R2 Score: 0.8617
Performance improvement: -0.0117


In [81]:
# Collect the headline numbers from Q1-Q3 and print them as one summary.
summary_lines = [
    '\nWhat i get to learn from this assignment and my solutions',
    f'Q1 Best K-Fold R2: {max(r2_scores):.4f}',
    f'Q1 Final Test R2: {final_r2:.4f}',
    f'Q2 Best Learning Rate: {best_lr}',
    f'Q2 Best Validation R2: {val_r2_scores[best_lr]:.4f}',
    f'Q3 Original R2: {r2_original:.4f}',
    f'Q3 PCA R2: {r2_pca:.4f}',
    f'Q3 PCA Improvement: {"Yes" if r2_pca > r2_original else "No"}',
]
print('\n'.join(summary_lines))


What i get to learn from this assignment and my solutions
Q1 Best K-Fold R2: 0.6213
Q1 Final Test R2: 0.6022
Q2 Best Learning Rate: 0.1
Q2 Best Validation R2: 0.6113
Q3 Original R2: 0.8734
Q3 PCA R2: 0.8617
Q3 PCA Improvement: No
