<a href="https://colab.research.google.com/github/jesse-venson/Machine-learning/blob/main/ML_Assign_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split

Question **1**

In [None]:
# Read the housing data, then separate the target column from the features.
df = pd.read_csv('USA_Housing.csv')
y = df['Price'].to_numpy()
X = df.drop(columns=['Price']).to_numpy()

In [None]:
# Standardise every feature column; the statistics are computed from all rows of X.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Five shuffled folds; the fixed seed makes the fold assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold accumulators: coefficient vectors, R2 scores and held-out predictions.
betas = []
r2_scores = []
y_preds = []


In [None]:
# Start from empty accumulators so that re-running this cell does not append a
# second set of folds to results left over from an earlier run.
betas, r2_scores, y_preds = [], [], []

# NOTE(review): X_scaled was standardised on all rows before the folds were
# made, so each fold's held-out rows contributed to the scaling statistics.
for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Prepend a column of ones so the first coefficient acts as the intercept.
    X_train2 = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
    X_test2 = np.hstack([np.ones((X_test.shape[0], 1)), X_test])

    # Least-squares fit. lstsq works on the design matrix itself instead of
    # forming X^T X (which squares the condition number) and still returns
    # the minimum-norm solution that pinv(X^T X) @ X^T @ y would give.
    beta, *_ = np.linalg.lstsq(X_train2, y_train, rcond=None)

    # Score the fold on its held-out rows and record everything.
    y_pred = X_test2 @ beta
    r2 = r2_score(y_test, y_pred)
    betas.append(beta)
    r2_scores.append(r2)
    y_preds.append(y_pred)

In [None]:
# Keep the coefficients from the fold with the highest held-out R2.
best_idx = np.array(r2_scores).argmax()
best_beta = betas[best_idx]


In [None]:
# Hold out 30% of the standardised rows for testing; the seed fixes the split.
X_train_70, X_test_30, y_train_70, y_test_30 = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=42,
)

# Insert a leading column of ones into each matrix for the intercept term.
X_train2_70 = np.insert(X_train_70, 0, 1.0, axis=1)
X_test2_30 = np.insert(X_test_30, 0, 1.0, axis=1)

In [None]:
# Least-squares coefficients for the 70% training split. lstsq solves on the
# design matrix directly rather than inverting X^T X, which avoids squaring
# the condition number; it returns the same minimum-norm solution.
beta_70, *_ = np.linalg.lstsq(X_train2_70, y_train_70, rcond=None)

In [None]:
# Predict the held-out 30% and measure the fit with R2.
y_pred_30 = np.matmul(X_test2_30, beta_70)
r2_30 = r2_score(y_true=y_test_30, y_pred=y_pred_30)

Question **2**

In [None]:
# Rebuild features and target from the housing frame, this time as pandas objects.
y = df['Price']
X = df.drop(columns=['Price'])

In [None]:
# Reserve 30% of the rows as the final test set; the rest is split again for validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
# Carve the validation set out of the training portion only (20% of it),
# so the test rows stay untouched.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    random_state=42,
)

In [None]:
def gradient_descent(X, y, learning_rate, iterations):
    """Fit linear-regression weights with batch gradient descent.

    Minimises the mean squared error of ``X @ weights`` against ``y``,
    starting from all-zero weights and taking ``iterations`` full-batch steps.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Design matrix. No intercept column is added here; include one in
        ``X`` if a bias term is wanted.
    y : array-like of length m
        Target values. A 1-D sequence, a pandas Series, or an (m, 1) column
        are all accepted and flattened to shape (m,).
    learning_rate : float
        Step size applied to the gradient on each iteration.
    iterations : int
        Number of gradient steps to take.

    Returns
    -------
    numpy.ndarray of shape (n,)
        The weights after the final step. They may be non-finite if the
        learning rate is too large for the scale of ``X``.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, has no rows, or ``y`` does not have one value
        per row of ``X``.
    """
    # Work on plain float arrays so the arithmetic is positional and never
    # depends on pandas index alignment between X and y.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1)

    if X.ndim != 2:
        raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
    m, n = X.shape
    if m == 0:
        raise ValueError("X must contain at least one row")
    if y.shape[0] != m:
        raise ValueError(f"y has {y.shape[0]} values but X has {m} rows")

    # Initialize weights (including bias, when X carries an intercept column)
    weights = np.zeros(n)

    for _ in range(iterations):
        errors = X @ weights - y
        # Gradient of (1 / 2m) * sum(errors ** 2) with respect to the weights.
        gradient = (1 / m) * (X.T @ errors)
        weights -= learning_rate * gradient

    return weights

In [None]:
def add_intercept(X):
    """Return ``X`` as an array with a leading column of ones.

    ``X`` must be 2-D with a ``shape`` attribute; the result has one more
    column than ``X``, and its first column is the constant 1 used for the
    intercept term.
    """
    n_rows = X.shape[0]
    bias_column = np.ones((n_rows, 1))
    return np.concatenate((bias_column, np.asanyarray(X)), axis=1)

# Gradient descent with the fixed step sizes tried below is only stable when
# the features are on a comparable, roughly unit scale, so standardise them
# first. The scaler is fitted on the training rows only and then reused for
# the validation and test rows, keeping held-out data out of the statistics.
gd_scaler = StandardScaler().fit(X_train)

# Scale first, then prepend the intercept column so the column of ones is
# left untouched by the scaling.
X_train_i = add_intercept(gd_scaler.transform(X_train))
X_val_i = add_intercept(gd_scaler.transform(X_val))
X_test_i = add_intercept(gd_scaler.transform(X_test))

In [None]:

# Candidate step sizes and the number of gradient steps used for each.
learning_rates = [0.001, 0.01, 0.1, 1]
iterations = 1000

# Best model so far, chosen by validation R2 (the test R2 is only reported).
best_lr = None
best_weights = None
best_val_r2 = -np.inf
best_test_r2 = None

# One record per learning rate, kept for later inspection.
results = []

for lr in learning_rates:
    weights = gradient_descent(X_train_i, y_train, lr, iterations)

    # Predict on validation and test sets
    val_pred = X_val_i.dot(weights)
    test_pred = X_test_i.dot(weights)

    # A step size that is too large makes the weights overflow to inf/NaN;
    # r2_score rejects such predictions, so record the run and move on.
    is_finite = all(
        np.all(np.isfinite(values)) for values in (weights, val_pred, test_pred)
    )
    if not is_finite:
        print(f"Learning rate {lr}: gradient descent diverged, skipping")
        results.append({
            'learning_rate': lr,
            'weights': weights,
            'val_r2': np.nan,
            'test_r2': np.nan
        })
        continue

    val_r2 = r2_score(y_val, val_pred)
    test_r2 = r2_score(y_test, test_pred)

    results.append({
        'learning_rate': lr,
        'weights': weights,
        'val_r2': val_r2,
        'test_r2': test_r2
    })

    # Track best model by validation R2
    if val_r2 > best_val_r2:
        best_val_r2 = val_r2
        best_weights = weights
        best_lr = lr
        best_test_r2 = test_r2

# best_lr stays None (and best_val_r2 stays -inf) if every run diverged.
print(f"Best learning rate: {best_lr}")
print(f"Best validation R2: {best_val_r2}")
print(f"Test R2 at best learning rate: {best_test_r2}")
print(f"Best regression coefficients: {best_weights}")

Question **3**

In [None]:
# Location of the raw data file; it has no header row, so names are given below.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# Column names, in file order.
columns = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price",
]

# '?' entries in the file are read as missing values (NaN).
df = pd.read_csv(
    url,
    names=columns,
    na_values='?',
)

In [None]:
# A notebook cell only renders its last bare expression, so show both results
# explicitly (display is provided by the IPython kernel).
display(df.head())        # first rows of the raw frame
display(df.isna().sum())  # number of missing values per column

In [None]:
# Rows without a price cannot serve as regression targets, and the target is
# deliberately excluded from the imputation below, so drop those rows first.
df = df.dropna(subset=['price']).reset_index(drop=True)

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols.remove('price')  # never impute the target

# Numeric feature columns: fill gaps with the column mean. The result is
# assigned back to the frame because df[col].fillna(..., inplace=True) acts on
# an intermediate Series and is not guaranteed to update df itself.
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].astype(float).mean())

# Categorical columns: fill gaps with the most frequent value (mode).
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Spelled-out counts and the integer each one stands for.
words_to_num = dict(two=2, three=3, four=4, five=5, six=6, eight=8, twelve=12)

# Both columns store numbers as words; translate them through the lookup.
# Any value that is not a key of the lookup becomes NaN.
for word_col in ('num_doors', 'num_cylinders'):
    df[word_col] = df[word_col].map(words_to_num)

In [None]:
# One-hot encode the two multi-level categoricals, dropping the first level of
# each so the indicator columns are not perfectly collinear.
one_hot_cols = ['body_style', 'drive_wheels']
df = pd.get_dummies(data=df, columns=one_hot_cols, drop_first=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns replaced by integer codes. The single encoder is refitted for each
# column in turn, so afterwards it only holds the classes of the last one.
label_cols = ['make', 'aspiration', 'engine_location', 'fuel_type']
le = LabelEncoder()

df = df.assign(**{name: le.fit_transform(df[name]) for name in label_cols})

In [None]:
# List the columns that are still stored as generic Python objects.
column_dtypes = df.dtypes
print(column_dtypes.loc[column_dtypes == 'object'])

In [None]:
# Collapse each column to a 0/1 flag: 1 when the label contains the substring.
df['fuel_system'] = df['fuel_system'].map(lambda label: int('pfi' in label))
df['engine_type'] = df['engine_type'].map(lambda label: int('ohc' in label))

In [None]:
from sklearn.preprocessing import StandardScaler

# Target as float; every remaining column is a feature.
y = df['price'].astype(float)
X = df.drop(columns=['price'])

# Standardise the features; the statistics come from all rows of X.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Prints the missing-value count per column; every count should be 0.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# 70/30 split of the standardised features; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=42,
)

# Ordinary least-squares baseline on all features.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

# Held-out fit quality: R2 and root-mean-squared error.
r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"R2 score on test set: {r2_test:.4f}")
print(f"RMSE on test set: {rmse_test:.4f}")

In [None]:
from sklearn.decomposition import PCA

# Keep the smallest number of principal components whose cumulative explained
# variance reaches 95% (a float in (0, 1) selects by variance, not by count).
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

n_original = X_scaled.shape[1]
n_reduced = X_pca.shape[1]
print(f"Original number of features: {n_original}")
print(f"Reduced number of features: {n_reduced}")

# Same split proportions and seed as the full-feature model.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca,
    y,
    test_size=0.3,
    random_state=42,
)

# Linear regression on the reduced feature set.
lr_model_pca = LinearRegression()
lr_model_pca.fit(X_train_pca, y_train_pca)
y_pred_pca = lr_model_pca.predict(X_test_pca)

# Held-out fit quality for the PCA model.
r2_pca = r2_score(y_test_pca, y_pred_pca)
rmse_pca = np.sqrt(mean_squared_error(y_test_pca, y_pred_pca))
print(f"R2 score on PCA test set: {r2_pca:.4f}")
print(f"RMSE on PCA test set: {rmse_pca:.4f}")