In [1]:
# TO RUN IT: CHANGE MY PATH
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
# change path if necessary
import sys
# Project root. Set the ML_PROJECT1_PATH environment variable to run the
# notebook on another machine without editing this cell; when the variable is
# not set, the original hard-coded location is used.
my_path = os.environ.get(
    'ML_PROJECT1_PATH',
    r'D:\Documents\etudes\epfl\MA1\cours\MachineLearning\Project1',
)
# Put <my_path>/code/SUBMISSION first on the module search path so that the
# project modules imported below are found.
sys.path.insert(0, os.path.join(my_path, 'code', 'SUBMISSION'))

# imports
import numpy as np 
import matplotlib.pyplot as plt

from proj1_helpers import predict_labels 
from implementations_enhanced import *
from costs import *
from optimize_hyperparams import *
from cross_validation import *
from step_wise import *
from extend_features import *

# Build dataset

In [2]:
from proj1_helpers import load_csv_data 

# load the raw training CSV without sub-sampling
train_csv = my_path + r'/data/train.csv'
y_raw, input_data_raw, ids = load_csv_data(train_csv, sub_sample=False)

In [3]:
from outliers import handle_outliers

# handle outliers: entries equal to -999 are treated as outliers and, with the
# 'mean' strategy, replaced by the mean value of their feature (this is what
# the recorded output of the cell reports).
# Produces the cleaned feature matrix X0 and the label vector y.
X0, y = handle_outliers(input_data_raw, y_raw, -999, 'mean')

-999 are replaced by the mean value of the feature


In [4]:
# remap the labels in place from {-1, 1} to {0, 1}
y[y == -1] = 0

# Build features

In [5]:
# feature names: first row of the CSV, without its first two columns
header_row = np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows=1)
names = list(header_row[2:])

# degree handed to extend_features
degree = 1

# extend features (the recorded output reports added momentum and logarithmic
# features); X0 is overwritten with the extended matrix
X0, features = extend_features(X0, names, degree, is_add_log=True)

---------------------------
Features have been set to the power(s): [1]
16 Features of the momentum have been added
4 logarithmic features have been added.


In [6]:
# standardize candidates to give to the stepwise.
# Only the first value returned by standardize is kept; the other two are
# discarded. X0 is overwritten, so re-running this cell standardizes the
# already standardized matrix again.
X0 ,_,_ = standardize(X0)

# stepwise input: every column of X0 is a candidate feature
all_candidates = X0

# Least Squares

In [7]:
# least-squares configuration shared by the stepwise and degree-search cells
model = {
    'method': 'ls',
    'loss': 'rmse',
    'k_fold': 5,
    'debug_mode': 1,
}

# R2 type: 'McFadden' or 'loss'
R2_method = 'McFadden'

# estimate R2 error through cross validation (1 or 0)
cv = 0

In [10]:
# stepwise feature selection with the least-squares model.
# The recorded output shows one feature being chosen per step together with
# the adjusted R2 reached. Returns the adjusted R2 values (best_R2adj) and the
# column indices of the chosen features (idx_features).
# NOTE(review): the recorded run of this cell ends with
# "IndexError: list index out of range"; the cause is not visible from this
# notebook - check stepwise before relying on best_R2adj / idx_features.
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

--------------------------------------------------------------------------------------------
Feature chosen:  DER_sum_pt_power_1 (index : 9 ) |  R2adj =  0.0320794565365
--------------------------------------------------------------------------------------------
Feature chosen:  PRI_lep_pt_mom_comp2 (index : 35 ) |  R2adj =  0.0411265530781
--------------------------------------------------------------------------------------------
Feature chosen:  PRI_lep_eta_power_1 (index : 17 ) |  R2adj =  0.0476859241161
--------------------------------------------------------------------------------------------
Feature chosen:  PRI_jet_subleading_phi_power_1 (index : 28 ) |  R2adj =  0.0513540500361
--------------------------------------------------------------------------------------------


IndexError: list index out of range

In [9]:
# report the stepwise results, leaving out the last entry of best_R2adj
results_r2_stepwise(best_R2adj[:-1], idx_features)

step 1 : R2 adjusted = 0.0320794565365
step 2 : R2 adjusted = 0.0411265530781
step 3 : R2 adjusted = 0.0476859241161
step 4 : R2 adjusted = 0.0513540500361
step 5 : R2 adjusted = 0.0541217005304
step 6 : R2 adjusted = 0.0619509471832
step 7 : R2 adjusted = 0.0636698162768
step 8 : R2 adjusted = 0.0645875366324
step 9 : R2 adjusted = 0.065537927064
step 10 : R2 adjusted = 0.0663104102943
step 11 : R2 adjusted = 0.0670029136786
step 12 : R2 adjusted = 0.0676125537267
step 13 : R2 adjusted = 0.0685500923705
step 14 : R2 adjusted = 0.0701328057152
step 15 : R2 adjusted = 0.0708869018379
step 16 : R2 adjusted = 0.071204342374
step 17 : R2 adjusted = 0.0715130336785
step 18 : R2 adjusted = 0.0721154218984
step 19 : R2 adjusted = 0.0727543538223
step 20 : R2 adjusted = 0.0730407667631
step 21 : R2 adjusted = 0.0734366365681
step 22 : R2 adjusted = 0.0737501026612
step 23 : R2 adjusted = 0.0738255571317
step 24 : R2 adjusted = 0.0738659884399
step 25 : R2 adjusted = 0.0739037116954
step 26 : R

In [None]:
# adjusted R2 values returned by the stepwise selection
plt.plot(best_R2adj)
plt.title('Best R2 adjusted')
plt.xlabel('Num features')
plt.ylabel('R2_adj values')
# Save BEFORE showing: with the inline backend plt.show() closes the current
# figure, so a savefig issued afterwards would write an empty image.
plt.savefig("R2LS")
plt.show()

# Cross-validation with the selected features

### Degree optimization + training/testing losses

In [None]:
# keep only the columns chosen by the stepwise selection
X = X0[:, idx_features]

# initial weights: one zero per selected column
n_selected = X.shape[1]
model['initial_w'] = np.zeros(n_selected)

# degree search range handed to optimize_degree
deg_min, deg_max, deg_steps = 1, 7, 1

(w_tr_tot, loss_tr_tot, loss_te_tot,
 deg_opt, success_rate) = optimize_degree(
    y, X, deg_min, deg_max, deg_steps, model, debug_mode=1)


In [None]:
# mean and variance of the training / testing losses returned by optimize_degree
mean_tr, var_tr = np.mean(loss_tr_tot), np.var(loss_tr_tot)
mean_te, var_te = np.mean(loss_te_tot), np.var(loss_te_tot)

# training summary
print("TRAINING:")
print('Mean = ', mean_tr, ' | Var = ', var_tr)
print('---------------------------------------------------')
# testing summary
print("TESTING:")
print('Mean = ', mean_te, ' | Var = ', var_te)
print('\n')
print("Success Rate : ", success_rate)

In [None]:
# degree returned as deg_opt by optimize_degree for the least-squares model
print("Optimum degree : ", deg_opt)

# Least Squares Gradient Descent

## Build features

In [None]:
# rebuild X0 and y from the raw data (-999 entries handled with the 'mean' strategy)
X0, y = handle_outliers(input_data_raw, y_raw, -999, 'mean')

# remap the labels in place from {-1, 1} to {0, 1}
y[y == -1] = 0

# feature names: first row of the CSV, without its first two columns
header_row = np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows=1)
names = list(header_row[2:])

# settings forwarded to extend_features
log = True
degree = 1
X0, features = extend_features(X0, names, degree, log)

In [None]:
# Stepwise candidates: the standardized version of X0 (only the first value
# returned by standardize is kept). The former `all_candidates = X0` line was
# a dead assignment, overwritten immediately by this one.
# NOTE(review): X0 itself is left unstandardized here, whereas the Least
# Squares section overwrites X0 with its standardized version - confirm this
# is intended, since X0 is sliced again for the degree optimisation.
all_candidates, _, _ = standardize(X0)
all_candidates.shape

In [None]:
# gradient-descent configuration used by the stepwise selection
model = {
    'method': 'lsgd',
    'loss': 'rmse',
    # hyperparameters
    'max_iters': 5000,
    'gamma': 1e-1,
    'threshold': 1e-4,
    # other
    'debug_mode': 1,
    'k_fold': 10,
}

# R2 type: 'loss' or 'McFadden'
R2_method = 'McFadden'

# estimate R2 error through cross validation (1 or 0)
cv = 0

In [None]:
# step-wise feature selection with the gradient-descent model configured in
# the previous cell. Returns the adjusted R2 values (best_R2adj) and the
# column indices of the chosen features (idx_features).
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

In [None]:
# audible "done" signal: shells out to an external `play` command
# (presumably SoX - needs to be installed and on PATH)
duration = 1  # second
freq = 440  # Hz
beep_cmd = 'play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq)
os.system(beep_cmd)

In [None]:
# report the stepwise results, leaving out the last entry of best_R2adj
results_r2_stepwise(best_R2adj[:-1], idx_features)

In [None]:
# show the full list of adjusted R2 values returned by stepwise
best_R2adj

In [None]:
# adjusted R2 values returned by the gradient-descent stepwise selection
plt.plot(best_R2adj)
plt.title('Best R2 adjusted')
plt.xlabel('Num features')
plt.ylabel('R2_adj values')
# Save BEFORE showing: with the inline backend plt.show() closes the current
# figure, so a savefig issued afterwards would write an empty image.
# The file is named "R2LSGD" so that it no longer overwrites the "R2LS" figure
# written by the Least Squares section.
plt.savefig("R2LSGD")
plt.show()

# Cross-validation with the selected features

### Degree optimization

In [None]:
# ------------------------------- BUILD THE MODEL WITH THE SELECTED FEATURES
# keep only the columns chosen by the stepwise selection
X = X0[:, idx_features]

# model settings for the degree search (gamma is 1e-2 here, 1e-1 during the stepwise)
model.update(
    initial_w=np.zeros(X.shape[1]),
    gamma=1e-2,
    max_iters=5000,
    debug_mode=1,
)

# degree search range handed to optimize_degree
deg_min, deg_max, deg_steps = 1, 7, 1

(w_tr_tot, loss_tr_tot, loss_te_tot,
 deg_opt, success_rate) = optimize_degree(
    y, X, deg_min, deg_max, deg_steps, model, debug_mode=1)
#w_tr_tot, loss_tr_tot, loss_te_tot, success_rate = cross_validation(y,X,model,debug_mode=1)

In [None]:
# mean and variance of the training / testing losses returned by optimize_degree
mean_tr, var_tr = np.mean(loss_tr_tot), np.var(loss_tr_tot)
mean_te, var_te = np.mean(loss_te_tot), np.var(loss_te_tot)

# training summary
print("TRAINING:")
print('Mean = ', mean_tr, ' | Var = ', var_tr)
print('---------------------------------------------------')
# testing summary
print("TESTING:")
print('Mean = ', mean_te, ' | Var = ', var_te)
print('\n')
print("Success Rate : ", success_rate)

In [None]:
# show the degree returned as deg_opt by optimize_degree for the gradient-descent model
deg_opt

In [None]:
# audible "done" signal: shells out to an external `play` command
# (presumably SoX - needs to be installed and on PATH)
duration = 1  # second
freq = 440  # Hz
beep_cmd = 'play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq)
os.system(beep_cmd)

# Least Squares Stochastic Gradient Descent

## Build features 

In [None]:
# rebuild X0 and y from the raw data (-999 entries handled with the 'mean' strategy)
X0, y = handle_outliers(input_data_raw, y_raw, -999, 'mean')

# remap the labels in place from {-1, 1} to {0, 1}
y[y == -1] = 0

# feature names: first row of the CSV, without its first two columns
header_row = np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows=1)
names = list(header_row[2:])

# settings forwarded to extend_features
log = True
degree = 1
X0, features = extend_features(X0, names, degree, log)

In [None]:
# Stepwise candidates: X0 exactly as returned by extend_features.
# NOTE(review): unlike the two previous sections, the candidates are not
# passed through standardize here - confirm this is intended.
all_candidates = X0
# sanity check: shape of the candidate matrix and number of feature names
print(all_candidates.shape)
print(len(features))

In [None]:
# stochastic-gradient-descent configuration used by the stepwise selection
model = {
    'method': 'lssgd',
    'loss': 'rmse',
    # hyperparameters
    'batch_size': 50,
    'max_iters': 25,
    'gamma': 1e-1,
    'threshold': 1e-4,
    # other
    'debug_mode': 0,
    'k_fold': 10,
}

# R2 type: 'loss', 'Tjur' or 'McFadden'
R2_method = 'McFadden'

# estimate R2 error through cross validation (1 or 0)
cv = 0

In [None]:
# step-wise feature selection with the stochastic-gradient-descent model
# configured in the previous cell. Returns the adjusted R2 values (best_R2adj)
# and the column indices of the chosen features (idx_features).
# NOTE(review): no random seed is set anywhere in this notebook; if the
# 'lssgd' method draws random batches, results will differ between runs.
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

In [None]:
# report the stepwise results, leaving out the last entry of best_R2adj
results_r2_stepwise(best_R2adj[:-1], idx_features)

In [None]:
# audible "done" signal: shells out to an external `play` command
# (presumably SoX - needs to be installed and on PATH)
duration = 1  # second
freq = 440  # Hz
beep_cmd = 'play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq)
os.system(beep_cmd)

# Cross-validation with the selected features

In [None]:
# ------------------------------- BUILD THE MODEL WITH THE SELECTED FEATURES
# keep only the columns chosen by the stepwise selection
X = X0[:, idx_features]

# initial weights: one zero per selected column
n_selected = X.shape[1]
model['initial_w'] = np.zeros(n_selected)
#model['debug_mode'] = 1

# degree search range handed to optimize_degree
deg_min, deg_max, deg_steps = 1, 7, 1

(w_tr_tot, loss_tr_tot, loss_te_tot,
 deg_opt, success_rate) = optimize_degree(
    y, X, deg_min, deg_max, deg_steps, model, debug_mode=1)


In [None]:
# mean and variance of the training / testing losses returned by optimize_degree
mean_tr, var_tr = np.mean(loss_tr_tot), np.var(loss_tr_tot)
mean_te, var_te = np.mean(loss_te_tot), np.var(loss_te_tot)

# training summary
print("TRAINING:")
print('Mean = ', mean_tr, ' | Var = ', var_tr)
print('---------------------------------------------------')
# testing summary
print("TESTING:")
print('Mean = ', mean_te, ' | Var = ', var_te)
print('\n')
print("Success Rate : ", success_rate)

In [None]:
# audible "done" signal: shells out to an external `play` command
# (presumably SoX - needs to be installed and on PATH)
duration = 1  # second
freq = 440  # Hz
beep_cmd = 'play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq)
os.system(beep_cmd)