In [2]:
# TO RUN IT: CHANGE MY PATH
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Project root: edit this to point at your local copy of the project before running.
import os
import sys
my_path = r'D:\Documents\etudes\epfl\MA1\cours\MachineLearning\Project1'
# Put the shared helper modules (<my_path>/code/COMMON) first on the import path.
# os.path.join supplies the platform's separator, so the lookup is no longer tied
# to Windows-style backslashes once my_path is changed to a non-Windows location.
sys.path.insert(0, os.path.join(my_path, 'code', 'COMMON'))

# imports
import numpy as np 
import matplotlib.pyplot as plt

from proj1_helpers import predict_labels 
from implementations import *
from labels import idx_2labels
from costs import *
from optimize_hyperparams import *
from cross_validation import *
from step_wise import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Build dataset

In [3]:
import os
from proj1_helpers import load_csv_data

# load raw data from <my_path>/data/train.csv (no sub-sampling).
# The path is assembled with os.path.join so it does not depend on a hard-coded
# backslash separator; the later cells already read the same file with '/'.
# load_csv_data returns three values, unpacked here as y_raw, input_data_raw and ids.
y_raw, input_data_raw, ids = load_csv_data(os.path.join(my_path, 'data', 'train.csv'), sub_sample=False)

In [4]:
from outliers import handle_outliers

# handle outliers: entries equal to -999 are treated with the 'median' strategy
# (the cell output reports that they are replaced by the feature's median value)
X0, y = handle_outliers(
    input_data_raw,
    y_raw,
    -999,
    'median',
)

-999 are replaced by the median value of the feature


In [5]:
from standard import standardize

# standardize the features; the mean and std returned by standardize() are kept
X0, mean_X0, std_X0 = standardize(X0)

# recode the labels in place: every -1 becomes 0, so y is in {0,1} instead of {-1,1}
y[y == -1] = 0

# Least Squares

In [6]:
# model settings for plain least squares; k_fold is the number of cross-validation folds
model = {
    'method': 'ls',
    'loss': 'rmse',
    'k_fold': 2,
}

# which R2 definition to use: 'loss', 'Tjur' or 'McFadden'
R2_method = 'McFadden'

# 1 -> estimate the R2 error through cross validation, 0 -> do not
cv = 1

In [7]:
# candidates offered to the step-wise procedure: every column of X0
all_candidates = X0

# feature names: read only the header row of the training CSV and drop its first two columns
all_features = list(
    np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows=1)[2:]
)

# (index, name) pairs, one per feature, in column order
features = list(enumerate(all_features))

In [8]:
# step-wise feature selection with the least-squares settings in `model`.
# Prints each chosen feature as it goes and returns two values, unpacked as
# best_R2adj and idx_features (both are passed to results_r2_stepwise afterwards).
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so this run
# did not complete — re-run to completion before trusting the results displayed below.
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_deltaeta_jet_jet (index : 4 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )


KeyboardInterrupt: 

In [8]:
# show the selection summary; the last entry of best_R2adj is left out of the report
results_r2_stepwise(best_R2adj[:-1], idx_features)

step 1 : R2 adjusted = 0.0320794565365
step 2 : R2 adjusted = 0.0411265530781
step 3 : R2 adjusted = 0.0481890133613
step 4 : R2 adjusted = 0.0514358016916
step 5 : R2 adjusted = 0.0535750884708
step 6 : R2 adjusted = 0.0557072759709
step 7 : R2 adjusted = 0.0580645296234
step 8 : R2 adjusted = 0.0601138475268
step 9 : R2 adjusted = 0.06122894488
step 10 : R2 adjusted = 0.0619636030891
step 11 : R2 adjusted = 0.0625479329125
step 12 : R2 adjusted = 0.063028005549
step 13 : R2 adjusted = 0.0634340068141
step 14 : R2 adjusted = 0.0636483769198
step 15 : R2 adjusted = 0.0637074735784
step 16 : R2 adjusted = 0.0637673586557
step 17 : R2 adjusted = 0.06377695
step 18 : R2 adjusted = 0.0637770496404
-------------------------------------------------------
Number of features chosen: 18
Indices of features chosen:  [1, 13, 4, 11, 7, 2, 16, 10, 19, 12, 23, 5, 8, 26, 21, 22, 0, 9]


# Least Squares Gradient Descent

In [9]:
# model settings for least squares with gradient descent
model = {
    'method': 'lsgd',
    'loss': 'rmse',
    # hyperparameters
    'max_iters': 10000,
    'gamma': 1e-2,
    'threshold': 1e-2,
    # other
    'debug_mode': 0,
    # number of cross-validation folds
    'k_fold': 5,
}

# which R2 definition to use: 'loss', 'Tjur' or 'McFadden'
R2_method = 'McFadden'

# 1 -> estimate the R2 error through cross validation, 0 -> do not
cv = 0

In [10]:
# candidates offered to the step-wise procedure: every column of X0
all_candidates = X0

# feature names: read only the header row of the training CSV and drop its first two columns
all_features = list(
    np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows=1)[2:]
)

# (index, name) pairs, one per feature, in column order
features = list(enumerate(all_features))

In [11]:
# step-wise feature selection with the gradient-descent least-squares settings in `model`.
# Prints each chosen feature as it goes and returns two values, unpacked as
# best_R2adj and idx_features (both are passed to results_r2_stepwise afterwards).
# NOTE(review): presumably slow with max_iters = 10000 per fit — confirm the runtime.
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_deltaeta_jet_jet (index : 4 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  PRI_lep_pt (index : 16 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_lep_eta_centrality (index : 12 )
-------------------------------------------------
Feature chosen:  DER_pt_tot (index : 8 )
---------------------------

In [12]:
# show the selection summary; the last entry of best_R2adj is left out of the report
results_r2_stepwise(best_R2adj[:-1], idx_features)

step 1 : R2 adjusted = 0.0302643899746
step 2 : R2 adjusted = 0.0407803259247
step 3 : R2 adjusted = 0.0481265396332
step 4 : R2 adjusted = 0.0510054618756
step 5 : R2 adjusted = 0.053328173018
step 6 : R2 adjusted = 0.0550977702977
step 7 : R2 adjusted = 0.0579690443817
step 8 : R2 adjusted = 0.0589765998121
step 9 : R2 adjusted = 0.0597583059702
step 10 : R2 adjusted = 0.0601371888532
step 11 : R2 adjusted = 0.0603386903934
step 12 : R2 adjusted = 0.0605544020324
step 13 : R2 adjusted = 0.060744306797
step 14 : R2 adjusted = 0.0611699064015
step 15 : R2 adjusted = 0.061404488233
step 16 : R2 adjusted = 0.0615374878159
step 17 : R2 adjusted = 0.0615492211022
step 18 : R2 adjusted = 0.0615592758769
-------------------------------------------------------
Number of features chosen: 18
Indices of features chosen:  [1, 13, 4, 11, 7, 2, 16, 19, 12, 8, 26, 21, 22, 9, 5, 0, 18, 25]


# Least Squares Stochastic Gradient Descent

In [13]:
# model settings for least squares with stochastic gradient descent
model = {
    'method': 'lssgd',
    'loss': 'rmse',
    # hyperparameters
    'batch_size': 50,
    'max_iters': 10000,
    'gamma': 1e-2,
    'threshold': 1e-2,
    # other
    'debug_mode': 0,
    # number of cross-validation folds
    'k_fold': 5,
}

# which R2 definition to use: 'loss', 'Tjur' or 'McFadden'
R2_method = 'McFadden'

# 1 -> estimate the R2 error through cross validation, 0 -> do not
cv = 0

In [14]:
# candidates offered to the step-wise procedure: every column of X0
all_candidates = X0

# feature names: read only the header row of the training CSV and drop its first two columns
all_features = list(
    np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows=1)[2:]
)

# (index, name) pairs, one per feature, in column order
features = list(enumerate(all_features))

In [15]:
# step-wise feature selection with the stochastic-gradient-descent settings in `model`.
# Prints each chosen feature as it goes and returns two values, unpacked as
# best_R2adj and idx_features (both are passed to results_r2_stepwise afterwards).
# NOTE(review): no random seed is set anywhere in the visible cells; if the 'lssgd'
# method samples mini-batches randomly, the selected features may vary between runs — confirm.
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_pt_tot (index : 8 )


In [16]:
# show the selection summary; the last entry of best_R2adj is left out of the report
results_r2_stepwise(best_R2adj[:-1], idx_features)

step 1 : R2 adjusted = 0.0624206324159
step 2 : R2 adjusted = 0.0688784543044
step 3 : R2 adjusted = 0.0690815408877
-------------------------------------------------------
Number of features chosen: 3
Indices of features chosen:  [1, 13, 8]


# Logistic Regression (regularized or not)

In [17]:
# model settings for logistic regression
model = {
    # 'lrr' if regularized with a penalization 'lambda_', otherwise 'lr'
    'method': 'lr',
    'loss': 'loglikelihood',
    # 'gd' (gradient_descent) or 'newton'
    'method_minimization': 'gd',
    'threshold': 1,
    # model hyperparameters
    'max_iters': 10000,
    'gamma': 1e-5,
    # optimize if cv = 1
    'lambda_': 1e-5,
    # other
    'debug_mode': 0,
    # number of cross-validation folds
    'k_fold': 5,
}

# which R2 definition to use: 'loss', 'Tjur' or 'McFadden'
R2_method = 'McFadden'

# 1 -> estimate the R2 error through cross validation, 0 -> do not
cv = 0

In [18]:
# candidates offered to the step-wise procedure: every column of X0
all_candidates = X0

# feature names: read only the header row of the training CSV and drop its first two columns
all_features = list(
    np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows=1)[2:]
)

# (index, name) pairs, one per feature, in column order
features = list(enumerate(all_features))

In [21]:
# step-wise feature selection with the logistic-regression settings in `model`.
# Returns two values, unpacked as best_R2adj and idx_features (both are passed to
# results_r2_stepwise afterwards).
# NOTE(review): the saved output of this cell is a KeyboardInterrupt with no feature
# chosen, so best_R2adj / idx_features were not updated by this run.
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

KeyboardInterrupt: 

In [None]:
# show the selection summary; the last entry of best_R2adj is left out of the report
results_r2_stepwise(best_R2adj[:-1], idx_features)