In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from utils.data_acquisition import  fetch_housing_dataset, fetch_wine_dataset
from utils.metrics import mse
from utils.data_preparation import custom_train_test_split, pre_processings, remove_outliers
from models.models import LinearRegression, LogisticRegression
from models.optimizers import GradientDescent, StochasticGradientDescent

## Data Loading


In [2]:
SEED=0
np.random.seed(SEED)

housing_df = fetch_housing_dataset()
wine_df = fetch_wine_dataset()

# Experiment 1: Basic Performance Comparison

For both datasets, perform an 80/20 train/test split and report the performance metrics on both the training set and test set for each model. Please include metrics such as Mean Squared Error (MSE) for Linear Regression and accuracy, precision, recall, and F1-score for Logistic Regression

### Data Preparation

In [3]:
TEST_SIZE = 0.20
DEFAULT_BATCH_SIZE = 8

### Data Cleaning ###
# dropping CHAS column because of its heavy imbalance
housing_df_cleaned = housing_df.drop(columns='CHAS')

# remove outliers 
housing_df_cleaned = remove_outliers(housing_df_cleaned)
#wine_df_cleaned = remove_outliers(wine_df)
wine_df_cleaned = wine_df.copy()

### Data Splitting ###
# perform train test split
one_hot_wine_classes = pd.get_dummies(wine_df['class']).to_numpy(dtype=int)
X_wine_train, X_wine_test, y_wine_train, y_wine_test = custom_train_test_split(wine_df_cleaned.drop(['class'], axis=1).to_numpy(),
                                                                        one_hot_wine_classes,
                                                                        test_size=TEST_SIZE, stratify=wine_df_cleaned['class'], random_seed=SEED)

X_housing_train, X_housing_test, y_housing_train, y_housing_test = custom_train_test_split(housing_df_cleaned.drop(['MEDV'], axis=1).to_numpy(),
                                                                                    housing_df_cleaned.MEDV.to_numpy().reshape(-1,1), test_size=TEST_SIZE, random_seed=SEED)

### Data Normalization ###
# normalize data after train test split
X_wine_train, X_wine_test = pre_processings(X_wine_train, X_wine_test, scale=True)
X_housing_train, X_housing_test = pre_processings(X_housing_train, X_housing_test, scale=True)

# TODO: feature extraction

  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]


### Linear Regression

In [4]:
# Linear regression fit
lin_reg = LinearRegression()

# Analytic fit
lin_reg.fit(X_housing_train, y_housing_train, analytic_fit=True, verbose=False)
y_preds_with_analytic_fit = lin_reg.predict(X_housing_test)
min_ms_error = mse(y_housing_test, lin_reg.predict(X_housing_test))
w_analytic = np.copy(lin_reg.w)

# Mini-batch SGD fit
lin_reg.fit(X_housing_train, y_housing_train, optimizer_class=GradientDescent, batch_size=DEFAULT_BATCH_SIZE, verbose=False)
y_preds_with_grad = lin_reg.predict(X_housing_test)
ms_error = mse(y_housing_test, y_preds_with_grad)


print(f"L2 norm between best weights and SGD weights: {np.linalg.norm(w_analytic - lin_reg.w)}")
print(f"Mean squarred error from analytic fit: {min_ms_error}")
print(f"Mean squared error from SGD with mini-batch size {DEFAULT_BATCH_SIZE}: {ms_error}")

L2 norm between best weights and SGD weights: 10618.46391320338
Mean squarred error from analytic fit: 45.29202731361068
Mean squared error from SGD with mini-batch size 8: 81.27003293821912


### Logistic Regression

In [5]:
np.random.seed(0)
optimizer_kwargs = {'max_iters': 4e4,
                    'learning_rate': 0.05,
                    'verbose': False}

# Logistic regression fit
log_reg = LogisticRegression()

# Learn model parameters with gradient descent
log_reg.fit(X_wine_train, y_wine_train, optimizer_class=GradientDescent, **optimizer_kwargs)
y_wine_preds_gd = log_reg.predict(X_wine_test)
w_gd = np.copy(log_reg.w)

gd_accuracy = accuracy_score(y_wine_test, y_wine_preds_gd)
gd_recall = recall_score(y_wine_test, y_wine_preds_gd, average='weighted')
gd_precision = precision_score(y_wine_test, y_wine_preds_gd, average='weighted')
gd_f1_score = f1_score(y_wine_test, y_wine_preds_gd, average='weighted')

# Learn model parameters with stochastic gradient descent
log_reg.fit(X_wine_train, y_wine_train, optimizer_class=StochasticGradientDescent, batch_size=DEFAULT_BATCH_SIZE, **optimizer_kwargs)
y_wine_preds_sgd = log_reg.predict(X_wine_test)
w_sgd = np.copy(log_reg.w)

sgd_accuracy = accuracy_score(y_wine_test, y_wine_preds_sgd)
sgd_recall = recall_score(y_wine_test, y_wine_preds_sgd, average='weighted')
sgd_precision = precision_score(y_wine_test, y_wine_preds_sgd, average='weighted')
sgd_f1_score = f1_score(y_wine_test, y_wine_preds_sgd, average='weighted')

print("l2 norm between sgd weights and mini-batch weights: ", np.linalg.norm(w_gd - w_sgd))
print(f"Accuracy:\n \tGD: {gd_accuracy:.5g}, SGD with mini-batch of size {DEFAULT_BATCH_SIZE}: {sgd_accuracy:.5g}")
print(f"Recall:\n \tGD: {gd_recall:.5g}, SGD with mini-batch of size {DEFAULT_BATCH_SIZE}: {sgd_recall:.5g}")
print(f"Precision:\n \tGD: {gd_precision:.5g}, SGD with mini-batch of size {DEFAULT_BATCH_SIZE}: {sgd_precision:.5g}")
print(f"F1 score:\n \tGD: {gd_f1_score:.5g}, SGD with mini-batch of size {DEFAULT_BATCH_SIZE}: {sgd_f1_score:.5g}")

l2 norm between sgd weights and mini-batch weights:  0.2202785916063604
Accuracy:
 	GD: 0.77143, SGD with mini-batch of size 8: 0.77143
Recall:
 	GD: 0.77143, SGD with mini-batch of size 8: 0.77143
Precision:
 	GD: 0.77815, SGD with mini-batch of size 8: 0.77815
F1 score:
 	GD: 0.77389, SGD with mini-batch of size 8: 0.77389


# Experiment 2: Cross-Validation

For both data sets, use a 5-fold cross-validation technique and report the performance metrics on both the training set and test set for each model. Again, include appropriate performance metrics for each model. Check this link for more information.

Note: 5-fold cross-validation is a technique where the dataset is divided into five equal parts (folds), and a model is trained and evaluated five times, each time using a different fold as the validation set and the remain- ing four folds for training.


### Data Preparation

In [6]:
TEST_SIZE = 0.20
DEFAULT_BATCH_SIZE = 8

### Data Cleaning ###
# dropping CHAS column because of its heavy imbalance
housing_df_cleaned = housing_df.drop(columns='CHAS')

# remove outliers 
housing_df_cleaned = remove_outliers(housing_df_cleaned)
#wine_df_cleaned = remove_outliers(wine_df)
wine_df_cleaned = wine_df.copy()

### Data Splitting ###
# perform train test split
one_hot_wine_classes = pd.get_dummies(wine_df['class']).to_numpy(dtype=int)
X_wine_train, X_wine_test, y_wine_train, y_wine_test = custom_train_test_split(wine_df_cleaned.drop(['class'], axis=1).to_numpy(),
                                                                        one_hot_wine_classes,
                                                                        test_size=TEST_SIZE, stratify=wine_df_cleaned['class'], random_seed=SEED)

X_housing_train, X_housing_test, y_housing_train, y_housing_test = custom_train_test_split(housing_df_cleaned.drop(['MEDV'], axis=1).to_numpy(),
                                                                                    housing_df_cleaned.MEDV.to_numpy().reshape(-1,1), test_size=TEST_SIZE, random_seed=SEED)

### Data Normalization ###
# normalize data after train test split
X_wine_train, X_wine_test = pre_processings(X_wine_train, X_wine_test, scale=True)
X_housing_train, X_housing_test = pre_processings(X_housing_train, X_housing_test, scale=True)

# TODO: feature extraction

  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]
  cleaned_df = cleaned_df[np.abs(z_scores) <= z_max]


### Linear Regression

### Logistic Regression