<a href="https://colab.research.google.com/github/HathawayQAQ/COMP551-Machine-Learning/blob/main/Assignment1/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Statement

In [1]:
import pandas as pd
import numpy as np

!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt



# Task1: Acquire, preprocess, and analyze the data
## Data Preperation
1. Dataset1: Infrared Thermography Temperature (regression): [link](https://archive.ics.uci.edu/dataset/925/infrared+thermography+temperature+dataset)
2. Dataset 2: CDC Diabetes Health Indicators (classification): [link](https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators)

## Data acquisition

In [2]:
infrared_thermography_temperature = fetch_ucirepo(id=925)
X = infrared_thermography_temperature.data.features
y = infrared_thermography_temperature.data.targets

## Preprocessing

In [3]:
# Handle missing values
nan_rows = X.isnull().any(axis=1)
X_clean = X[~nan_rows]
y_clean = y[~nan_rows]

# Handle categorical features
categorical_columns = ['Age', 'Gender', 'Ethnicity']
X_dummies = pd.get_dummies(X_clean, columns=categorical_columns, drop_first=True)

# Convert boolean columns to integer
bool_columns = X_dummies.select_dtypes(include=['bool']).columns
for col in bool_columns:
    X_dummies[col] = X_dummies[col].astype(int)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_dummies)

# Select the target variable
y_final = y_clean['aveOralM']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_final, test_size=0.2, random_state=42)

# Task2: Implement the Models

## 1. Linear Regression (Analytical Solution)

In [5]:
class LinearRegressionModel:
    def __init__(self):
        self.weights = None

    def fit(self, features, target):
        num_samples, num_features = features.shape
        features_with_bias = np.column_stack([np.ones(num_samples), features])  # Added bias term (ones)
        self.weights, loss = self.calculate_least_squares_loss(features_with_bias, target)
        return (loss, self.weights)

    def calculate_least_squares_loss(self, features, target):
        weights = np.linalg.inv(features.T @ features) @ features.T @ target
        loss = np.mean((target - features @ weights) ** 2)
        return weights, loss

    def predict(self, features):
        features_with_bias = np.column_stack([np.ones(features.shape[0]), features])  # Added bias term (ones)
        return features_with_bias @ self.weights


In [6]:
class LinearRegressionSGD:
  def __init__(self):
        self.weights = None
  def fit(self, features, target, max_iterations=1000, tolerance=1e-5, learning_rate=1e-2, verbose=False, batch_size=16):
        num_samples, num_features = features.shape
        features_with_bias = np.column_stack([np.ones(num_samples), features])
        target = target.astype(int)
        rng = np.random.default_rng()

        if self.weights is None:
            self.weights = 0.001 * rng.standard_normal(num_features + 1)  # Random initialization

        loss_history = []
        for iteration in range(max_iterations):
            batch_indices = rng.choice(np.arange(num_samples), batch_size, replace=False)
            features_batch = features_with_bias[batch_indices, :]
            target_batch = target[batch_indices]

            gradient, loss = self.linear_loss(features_batch, target_batch)
            loss_history.append(loss)

            self.weights -= learning_rate * gradient  # Update weights

            if verbose and iteration % 100 == 0:
                print(f"Iteration {iteration}: Loss {loss}")

        return loss_history

  def predict(self, features):
      features_with_bias = np.column_stack([np.ones(features.shape[0]), features])
      return features_with_bias @ self.weights

  def linear_loss(self, features, target):
      num_samples = features.shape[0]
      gradient = np.dot(features.T, np.dot(features, self.weights) - target) / num_samples
      loss = np.sum((target - features @ self.weights) ** 2) / num_samples
      return gradient, loss

# Task3: Run Experiments


## Experiment 1: Report performance of linear regression

In [7]:
model = LinearRegressionModel()
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print("Experiment 1: Linear Regression Performance")
print(f"Training MSE: {train_mse:.4f}")
print(f"Test MSE: {test_mse:.4f}")
print(f"Training R-squared: {train_r2:.4f}")
print(f"Test R-squared: {test_r2:.4f}")
print(f"MAE (Train): {mae_train:.4f}")
print(f"MAE (Test): {mae_test:.4f}")

Experiment 1: Linear Regression Performance
Training MSE: 0.0617
Test MSE: 0.0667
Training R-squared: 0.7756
Test R-squared: 0.6646
MAE (Train): 0.1950
MAE (Test): 0.2032


## Experiment 2: Report weights of features

In [11]:
def report_top_features(model, feature_names, top_n=10):
    weights = model.weights[1:]  # Exclude bias term
    feature_importance = pd.DataFrame({'Feature': feature_names, 'Weight': weights})
    feature_importance['Abs_Weight'] = abs(feature_importance['Weight'])
    return feature_importance.sort_values('Abs_Weight', ascending=False).head(top_n)

feature_names = X_dummies.columns.tolist()
lr_model = LinearRegressionModel()
lr_model.fit(X_train, y_train)
print("\nTop 10 features (Linear Regression):")
print(report_top_features(lr_model, feature_names))
sgd_model = LinearRegressionSGD()
sgd_model.fit(X_train, y_train, verbose=True)
print("\nTop 10 features (SGD Linear Regression):")
print(report_top_features(sgd_model, feature_names))


Top 10 features (Linear Regression):
        Feature    Weight  Abs_Weight
19  canthi4Max1  0.357622    0.357622
18   canthiMax1 -0.352782    0.352782
27       T_Max1  0.301124    0.301124
12        T_LC1  0.300552    0.300552
11    T_RC_Max1  0.230246    0.230246
28        T_OR1  0.212044    0.212044
4     Max1R13_1 -0.209065    0.209065
8         T_RC1 -0.159100    0.159100
29    T_OR_Max1 -0.157372    0.157372
9     T_RC_Dry1  0.143177    0.143177


KeyError: '[666, 30, 783] not in index'

## Experiment 3: Sample growing subsets of training data

In [10]:
train_sizes = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
train_scores = []
test_scores = []

for size in train_sizes:
    x_train_subset, _, y_train_subset, _ = train_test_split(X_train, y_train, train_size=size, random_state=42)
    model = LinearRegression()
    model.fit(x_train_subset, y_train_subset)

    train_pred = model.predict(x_train_subset)
    test_pred = model.predict(X_test)

    train_scores.append(r2_score(y_train_subset, train_pred))
    test_scores.append(r2_score(y_test, test_pred))

plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_scores, label='Train R-squared')
plt.plot(train_sizes, test_scores, label='Test R-squared')
plt.xlabel('Training Set Size')
plt.ylabel('R-squared Score')
plt.title('Learning Curve')
plt.legend()
plt.show()

NameError: name 'LinearRegression' is not defined

## Experiment 4: Try different mini-batch sizes

In [None]:
batch_sizes = [8, 16, 32, 64, 128]
batch_scores = []

def safe_r2_score(y_true, y_pred):
    try:
        return r2_score(y_true, y_pred)
    except ValueError:
        return float('-inf')

for batch_size in batch_sizes:
    model = SGDLinearRegression(batch_size=batch_size)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = safe_r2_score(y_test, y_pred)
    batch_scores.append(score)
    print(f"Batch size {batch_size}: Test R2= {score:.4f}")

plt.figure(figsize=(10, 6))
plt.plot(batch_sizes, batch_scores, marker='o')
plt.xlabel('Batch Size')
plt.ylabel('Test R2 Score')
plt.title('Effect of Batch Size on Model Performance')
plt.xscale('log')
plt.show()

print("\nExperiment 4: Mini-batch Size Performance")
for batch_size, score in zip(batch_sizes, batch_scores):
    print(f"Batch size {batch_size}: Test R2 = {score:.4f}")

## Experiment 5: Try different learning rates

In [None]:
learning_rates = [0.001, 0.01, 0.1]
lr_scores = []

for lr in learning_rates:
    model = SGDLinearRegression(learning_rate=lr)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = safe_r2_score(y_test, y_pred)
    lr_scores.append(score)
    print(f"Learning rate {lr}: Test R-squared = {score:.4f}")

print("\nExperiment 5: Learning Rate Performance")
for lr, score in zip(learning_rates, lr_scores):
    print(f"Learning rate {lr}: Test R-squared = {score:.4f}")

## Experiment 6: Compare analytical solution with mini-batch SGD

In [None]:
analytical_model = LinearRegression()
analytical_model.fit(X_train, y_train)
analytical_pred = analytical_model.predict(X_test)
analytical_score = r2_score(y_test, analytical_pred)

sgd_model = SGDLinearRegression()
sgd_model.fit(X_train, y_train)
sgd_pred = sgd_model.predict(X_test)
sgd_score = r2_score(y_test, sgd_pred)

print("\nExperiment 6: Analytical vs SGD Performance")
print(f"Analytical solution: Test R-squared = {analytical_score:.4f}")
print(f"Mini-batch SGD: Test R-squared = {sgd_score:.4f}")