In [25]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
# Locations of the project datasets, all expressed relative to DATA_FOLDER.
DATA_FOLDER = '../data/'
DATA_ZIP = '{}datasets.zip'.format(DATA_FOLDER)

# CSV files: raw training set, cleaned training set, and test set.
DATA_TRAIN_PATH = '{}train.csv'.format(DATA_FOLDER)
DATA_TRAIN_PATH_CLEAN = '{}train_clean.csv'.format(DATA_FOLDER)
DATA_TEST_PATH = '{}test.csv'.format(DATA_FOLDER)

## Load the training data into feature matrix, class labels, and event ids:

In [27]:
from proj1_helpers import *
from split_data import *
from data_processing import *

# Read the raw training CSV, then derive a cleaned copy of the same data.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
y_clean, tX_clean, ids_clean = clean_training(y, tX, ids)

# Fraction handed to split_data for both the raw and the cleaned set.
split_ratio = 0.8

# Raw data: training / validation partitions.
raw_split = split_data(tX, y, split_ratio)
tX_train, tX_validation, y_train, y_validation = raw_split

# Cleaned data: same ratio, split independently of the raw set.
clean_split = split_data(tX_clean, y_clean, split_ratio)
tX_train_clean, tX_validation_clean, y_train_clean, y_validation_clean = clean_split

TODO: only remove outliers from a feature when its range (max - min) exceeds a threshold, e.g. 10.
    Open question: how to handle DER_deltar_tau_lep and PRI_jet_all_pt.


In [28]:
# Three left-aligned columns, each 15 characters wide.
row_format = "{:<15}" * 3


def print_shape_table(title, features_train, features_validation, labels_train, labels_validation):
    """Print `title` followed by a small table of training/validation shapes.

    Each argument after `title` only needs a `.shape` attribute; the shapes
    are rendered with `str()` and laid out using `row_format`.
    """
    table = (
        ("", "Training", "Validation"),
        ("Features", str(features_train.shape), str(features_validation.shape)),
        ("Labels", str(labels_train.shape), str(labels_validation.shape)),
    )
    print(title)
    for cells in table:
        print(row_format.format(*cells))


print_shape_table("Raw set : ", tX_train, tX_validation, y_train, y_validation)
print_shape_table("\nClean set : ", tX_train_clean, tX_validation_clean,
                  y_train_clean, y_validation_clean)

Raw set : 
               Training       Validation     
Features       (200000, 30)   (50000, 30)    
Labels         (200000,)      (50000,)       

Clean set : 
               Training       Validation     
Features       (35536, 29)    (8884, 29)     
Labels         (35536,)       (8884,)        


## Testing every basic model on cleaned data

In [29]:
from pipeline import *

### (Stochastic) Gradient Descent

In [31]:
# Hyper-parameters shared by the gradient descent and stochastic gradient descent runs.
max_iters = 100
gamma = 0.01
# One zero-initialised weight per column of the cleaned training matrix.
initial_w = np.zeros(tX_train_clean.shape[1])

# NOTE(review): the same `initial_w` array object is handed to both runs;
# confirm model_data does not update it in place, otherwise the second run
# would not start from zeros.
descent_settings = dict(initial_w=initial_w, max_iters=max_iters, gamma=gamma)

# NOTE(review): the result is unpacked here as (loss, weights), while the
# least-squares / ridge / logistic cells unpack (weights, loss) - confirm the
# return order of model_data for each model name.
loss_gd, weights_gd = model_data(
    y_train_clean, tX_train_clean, 'gradient_descent', **descent_settings)
loss_sgd, weights_sgd = model_data(
    y_train_clean, tX_train_clean, 'stochastic_gradient_descent', **descent_settings)

# Evaluate both weight vectors on the held-out part of the cleaned data.
validation_loss_gd = get_loss(y_validation_clean, tX_validation_clean, weights_gd)
validation_loss_sgd = get_loss(y_validation_clean, tX_validation_clean, weights_sgd)

# The raw (uncleaned) set is left out of this comparison: it needs normalizing to work.
print("GD Training loss : {}".format(loss_gd))
print("GD Validation loss : {}".format(validation_loss_gd))
print("SGD Training loss : {}".format(loss_sgd))
print("SGD Validation loss : {}".format(validation_loss_sgd))

GD Training loss : 0.34483741738725987
GD Validation loss : 0.34342161403433336
SGD Training loss : 0.34177970796002916
SGD Validation loss : 0.3434787185463746


### Least Squares

In [32]:
# Fit least squares on the raw training split, then on the cleaned one.
weights, t_loss = model_data(y_train, tX_train, 'least_squares')
weights_clean, t_loss_clean = model_data(y_train_clean, tX_train_clean, 'least_squares')

# Score each fit on its own validation split.
validation_loss = get_loss(y_validation, tX_validation, weights)
validation_loss_clean = get_loss(y_validation_clean, tX_validation_clean, weights_clean)

print("Training loss : {}".format(t_loss))
print("Validation loss : {}".format(validation_loss))
print("Clean Training loss : {}".format(t_loss_clean))
print("Clean Validation loss : {}".format(validation_loss_clean))

Training loss : 0.3394268495585329
Validation loss : 0.3407678463202393
Clean Training loss : 0.3357623766686237
Clean Validation loss : 0.33477092497039646


### Ridge Regression

In [33]:
# Regularization strength for the ridge fits below.
lambda_ = 1

# Fix: `lambda_` used to be assigned here but never forwarded, so the value
# shown in this cell had no effect on the fit. It is now passed explicitly,
# using the same `lambda_=` keyword this notebook already passes to model_data
# for 'regularized_logistic_regression'.
weights, t_loss = model_data(y_train, tX_train, 'ridge_regression', lambda_=lambda_)
weights_clean, t_loss_clean = model_data(y_train_clean, tX_train_clean, 'ridge_regression',
                                         lambda_=lambda_)

# Compare training loss with the loss on the matching validation split.
print("Training loss : {}".format(t_loss))
print("Validation loss : {}".format(get_loss(y_validation, tX_validation, weights)))
print("Clean Training loss : {}".format(t_loss_clean))
print("Clean Validation loss : {}".format(get_loss(y_validation_clean, tX_validation_clean, weights_clean)))

Training loss : 0.3394307923454776
Validation loss : 0.34075291469501606
Clean Training loss : 0.339382825229168
Clean Validation loss : 0.3382086297912849


### (Penalized) Logistic Regression

In [43]:
# Hyper-parameters for the plain and the penalized logistic regression runs.
max_iters = 500
gamma = 0.05
lambda_ = 0.1

# The penalized run takes the same settings plus the penalty weight.
logistic_settings = dict(max_iters=max_iters, gamma=gamma)
penalized_settings = dict(logistic_settings, lambda_=lambda_)

weights_log, loss_log = model_data(
    y_train_clean, tX_train_clean, 'logistic_regression', **logistic_settings)
weights_p_log, loss_p_log = model_data(
    y_train_clean, tX_train_clean, 'regularized_logistic_regression', **penalized_settings)

# Validation figures come from get_log_likelihood on the cleaned validation split.
validation_loss_log = get_log_likelihood(y_validation_clean, tX_validation_clean, weights_log)
validation_loss_p_log = get_log_likelihood(y_validation_clean, tX_validation_clean, weights_p_log)

print("Training loss : {}".format(loss_log))
print("Validation loss : {}".format(validation_loss_log))
print("Penalized Training loss : {}".format(loss_p_log))
print("Penalized Validation loss : {}".format(validation_loss_p_log))

Loss iteration 0 : 0.6931471805599454
Loss iteration 10 : 0.6121541577800205
Loss iteration 20 : 0.5714140668567727
Loss iteration 30 : 0.5680319905079382
Loss iteration 40 : 0.552866994997688
Loss iteration 50 : 0.5364454721121777
Loss iteration 60 : 0.5173986797126201
Loss iteration 70 : 0.5419424567815786
Loss iteration 80 : 0.5340840895766843
Loss iteration 90 : 0.5116140201286051
Loss iteration 100 : 0.5129195681826513
Loss iteration 110 : 0.521962869896383
Loss iteration 120 : 0.513112280664071
Loss iteration 130 : 0.51627839940326
Loss iteration 140 : 0.49278750671297383
Loss iteration 150 : 0.49984882809781456
Loss iteration 160 : 0.5027763479641189
Loss iteration 170 : 0.49566712253110184
Loss iteration 180 : 0.511815494613356
Loss iteration 190 : 0.4838054655245143
Loss iteration 200 : 0.4948972245236529
Loss iteration 210 : 0.5133422360548738
Loss iteration 220 : 0.49586842321791863
Loss iteration 230 : 0.502072854324075
Loss iteration 240 : 0.5235567903325151
Loss iteration

## Generate predictions and save output in CSV format for submission:

In [112]:
# Guard flag: keeps "Run All" from reloading the test set and rewriting the
# submission file when that is unnecessary. Change False to True to generate it.
GENERATE_SUBMISSION = False
# Chooses the prediction helper. Only for logistic regression for now; will do
# something nicer later.
USE_LOGISTIC_MODEL = True

if GENERATE_SUBMISSION:
    y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
    y_test_clean, tX_test_clean, ids_test_clean = clean_test(y_test, tX_test, ids_test)
    OUTPUT_PATH = DATA_FOLDER + 'submission.csv'
    if USE_LOGISTIC_MODEL:
        # Fix: this branch used `tX_test_clean_poly`, which is never defined in
        # this notebook (NameError once the cell is enabled); the cleaned test
        # features computed just above are used instead.
        # Fix: the logistic helper is now paired with `weights_log`, the
        # weights of the 'logistic_regression' fit; `weights_clean` holds the
        # ridge-regression weights at this point.
        # NOTE(review): if a polynomial expansion is reintroduced, apply it to
        # both the training and the test features before predicting.
        y_pred = predict_labels_logistic(weights_log, tX_test_clean)
    else:
        y_pred = predict_labels(weights_clean, tX_test_clean)
    print(y_pred)
    create_csv_submission(ids_test_clean, y_pred, OUTPUT_PATH)
else:
    print("Change False to True to generate prediction")

[-1. -1.  1. ...  1.  1.  1.]
