In [47]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [48]:
DATA_FOLDER = '../data/'
DATA_ZIP = DATA_FOLDER + 'datasets.zip'

DATA_TRAIN_PATH = DATA_FOLDER + 'train.csv'
DATA_TRAIN_PATH_CLEAN = DATA_FOLDER + 'train_clean.csv'
DATA_TEST_PATH = DATA_FOLDER + 'test.csv' 

## Load the training data into feature matrix, class labels, and event ids:

In [49]:
from proj1_helpers import *
from split_data import *

y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
y_clean, tX_clean, ids_clean = load_csv_data(DATA_TRAIN_PATH_CLEAN)

split_ratio = 0.8
tX_train, tX_validation, y_train, y_validation = split_data(tX, y, split_ratio)
tX_train_clean, tX_validation_clean, y_train_clean, y_validation_clean = split_data(tX_clean, y_clean, split_ratio)

In [50]:
print("Raw set : ")
row_format = "{:<15}" * 3
print(row_format.format("", "Training", "Validation"))
print(row_format.format("Features", str(tX_train.shape), str(tX_validation.shape)))
print(row_format.format("Labels", str(y_train.shape), str(y_validation.shape)))

print("\nClean set : ")
row_format = "{:<15}" * 3
print(row_format.format("", "Training", "Validation"))
print(row_format.format("Features", str(tX_train_clean.shape), str(tX_validation_clean.shape)))
print(row_format.format("Labels", str(y_train_clean.shape), str(y_validation_clean.shape)))

Raw set : 
               Training       Validation     
Features       (200000, 30)   (50000, 30)    
Labels         (200000,)      (50000,)       

Clean set : 
               Training       Validation     
Features       (60122, 23)    (15031, 23)    
Labels         (60122,)       (15031,)       


## Do your thing crazy machine learning thing here :) ...

### Gradient Descent

In [51]:
from stochastic_gradient_descent import *
from gradient_descent import *
from costs import *


max_iters = 100
gamma = 0.01
initial_w = np.zeros(tX_train_clean.shape[1])

loss, weights_gd = gradient_descent(y_train_clean, tX_train_clean, initial_w, max_iters, gamma)
print(loss)
loss, weights_sgd = stochastic_gradient_descent(y_train_clean, tX_train_clean, initial_w, max_iters, gamma)
print(loss)

0.376365887110707
0.4341114083570188


In [52]:
# Not compared with the not cleaned set, because it needs normalizing to work
print("GD Training loss : {}".format(compute_loss(y_train_clean, tX_train_clean, weights_gd)))
print("GD Validation loss : {}".format(compute_loss(y_validation_clean, tX_validation_clean, weights_gd)))
print("SGD Clean training loss : {}".format(compute_loss(y_train_clean, tX_train_clean, weights_sgd)))
print("SGD Clean validation loss : {}".format(compute_loss(y_validation_clean, tX_validation_clean, weights_sgd)))

GD Training loss : 0.3762862825347247
GD Validation loss : 0.37743871877826035
SGD Clean training loss : 0.37743989171235026
SGD Clean validation loss : 0.3784985620998615


### Least Squares

In [53]:
from least_squares import *

weights_clean, t_loss_clean = least_squares(y_train_clean, tX_train_clean)
weights, t_loss = least_squares(y_train, tX_train)

In [54]:
print("Training loss : {}".format(compute_loss(y_train, tX_train, weights)))
print("Validation loss : {}".format(compute_loss(y_validation, tX_validation, weights)))
print("Clean training loss : {}".format(compute_loss(y_train_clean, tX_train_clean, weights_clean)))
print("Clean validation loss : {}".format(compute_loss(y_validation_clean, tX_validation_clean, weights_clean)))

Training loss : 0.3394268495585198
Validation loss : 0.3407678470839749
Clean training loss : 0.3680652063481529
Clean validation loss : 0.36665641257030934


### Ridge Regression

In [55]:
from ridge_regression import *
lambda_ = 1

weights, loss = ridge_regression(y_train, tX_train, lambda_)
weights_clean, loss_clean = ridge_regression(y_train_clean, tX_train_clean, lambda_)

In [56]:
print("Training loss : {}".format(compute_loss(y_train, tX_train, weights)))
print("Validation loss : {}".format(compute_loss(y_validation, tX_validation, weights)))
print("Clean training loss : {}".format(compute_loss(y_train_clean, tX_train_clean, weights_clean)))
print("Clean validation loss : {}".format(compute_loss(y_validation_clean, tX_validation_clean, weights_clean)))

Training loss : 0.339430792348136
Validation loss : 0.34075291469042784
Clean training loss : 0.36806562265821613
Clean validation loss : 0.3666566538617766


### Logistic Regression

In [84]:
from logistic_regression import *

max_iters = 50
gamma = 0.1
initial_w_train = np.ones(tX_train.shape[1])
initial_w_clean = np.ones(tX_clean.shape[1])

weights, loss = logistic_regression(y_train, tX_train, initial_w_train, max_iters, gamma) 
weights_clean, loss_clean = logistic_regression(y_clean, tX_clean, initial_w_clean, max_iters, gamma) 

Maximum weight :  3100.0  Minimum weight :  -979.0
Maximum weight :  5399.0  Minimum weight :  -2419.0
Maximum weight :  8201.0  Minimum weight :  -3621.0
Maximum weight :  10502.0  Minimum weight :  -4369.0
Maximum weight :  12601.0  Minimum weight :  -5562.0
Maximum weight :  15098.0  Minimum weight :  -6409.0
Maximum weight :  16998.0  Minimum weight :  -7226.0
Maximum weight :  19995.0  Minimum weight :  -8097.0
Maximum weight :  22694.0  Minimum weight :  -9260.0
Maximum weight :  24693.0  Minimum weight :  -10186.0
Maximum weight :  27096.0  Minimum weight :  -10943.0
Maximum weight :  29894.0  Minimum weight :  -11962.0
Maximum weight :  33293.0  Minimum weight :  -13090.0
Maximum weight :  35490.0  Minimum weight :  -13592.0
Maximum weight :  39186.0  Minimum weight :  -15166.0
Maximum weight :  42882.0  Minimum weight :  -16188.0
Maximum weight :  45277.0  Minimum weight :  -17252.0
Maximum weight :  47975.0  Minimum weight :  -18297.0
Maximum weight :  50271.0  Minimum weight

In [79]:
print("Training loss : {}".format(calculate_loss(y_train, tX_train, weights)))
print("Validation loss : {}".format(calculate_loss(y_validation, tX_validation, weights)))
print("Clean training loss : {}".format(calculate_loss(y_train_clean, tX_train_clean, weights_clean)))
print("Clean validation loss : {}".format(calculate_loss(y_validation_clean, tX_validation_clean, weights_clean)))

Training loss : inf
Validation loss : inf
Clean training loss : -4375318.130747814
Clean validation loss : -1068856.3995614946


### Regularized Logistic Regression

## Generate predictions and save ouput in csv format for submission:

In [59]:
#This enable to run all cells without running this one when unnecessary
if False :
    _, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
    OUTPUT_PATH = DATA_FOLDER + 'submission.csv' 
    y_pred = predict_labels(weights, tX_test)
    create_csv_submission(ids_test, y_pred, OUTPUT_PATH)
else :
    print("Change False to True to generate prediction")

Change False to True to generate prediction
