In [9]:
# To run this notebook, point `my_path` (set below) at your local copy of the project
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Project root. It can be overridden without editing the notebook by exporting
# PROJECT1_ML_PATH before starting Jupyter; when the variable is unset the
# original hard-coded location is used, so existing setups keep working.
import os
import sys
my_path = os.environ.get('PROJECT1_ML_PATH', r'/home/ilaria/Scrivania/Machine_Learning/Project_1/Project1_ML')

# Make the project's helper modules importable. The membership check keeps
# re-runs of this cell from stacking the same entry onto sys.path again.
if my_path + r'/code/COMMON' not in sys.path:
    sys.path.insert(0, my_path + r'/code/COMMON')

# imports
import numpy as np 
import matplotlib.pyplot as plt

from proj1_helpers import predict_labels 
from implementations import *
from labels import idx_2labels
from costs import *
from optimize_hyperparams import *
from cross_validation import *
from step_wise import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Build dataset

In [10]:
from proj1_helpers import load_csv_data 

# load raw data
# Reads train.csv from the data folder under my_path, passing sub_sample=False.
# The three returned values are kept as y_raw, input_data_raw and ids
# (presumably labels, feature matrix and event ids -- confirm in proj1_helpers).
y_raw, input_data_raw, ids = load_csv_data(my_path + r'/data/train.csv', sub_sample=False)

In [11]:
from outliers import handle_outliers

# handle outliers
# -999 and 'median' are forwarded to handle_outliers; the message printed by
# this cell reports that -999 entries are replaced by the feature's median.
# NOTE(review): y is returned alongside X0, so rows might be filtered as well --
# confirm in the outliers module.
X0, y = handle_outliers(input_data_raw, y_raw, -999, 'median')

-999 are replaced by the median value of the feature


In [12]:
from standard import standardize

# Replace X0 by the output of standardize(); the two extra return values are
# kept as mean_X0 and std_X0 (presumably the per-feature statistics -- confirm).
X0, mean_X0, std_X0 = standardize(X0)

# Recode the labels in place: every -1 becomes 0, so y lives in {0,1}
y[y == -1] = 0

# Least Squares

In [13]:
# Model settings handed to the step-wise search.
# k_fold is presumably only used when cv is switched on -- confirm in stepwise().
model = {
    'method': 'ls',
    'loss': 'rmse',
    'k_fold': 2,
}

# R2 flavour; the alternative noted here is 'loss'
R2_method = 'McFadden'

# Switch for estimating the R2 error through cross validation (1 = on, 0 = off)
cv = 0

In [14]:
# Candidate pool for the step-wise search: every column of X0, followed by
# log(1 + |x|) of columns 0, 1, 2 and 5, each appended as one extra column.
# NOTE(review): X0 is the output of standardize() at this point, so the logs are
# taken on those values rather than on the raw masses -- confirm this is intended.
all_candidates = np.concatenate(
    [X0] + [np.log(1 + np.abs(X0[:, col])).reshape(len(X0), 1) for col in (0, 1, 2, 5)],
    axis=1,
)

# Feature names come from the header row of the training CSV; the first two
# header fields are skipped (presumably the id and label columns -- confirm).
all_features = list(np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows=1)[2:])

# Names for the four log columns, in the same order as they were appended above
all_features.extend([
    'log(DER_mass_MMC)',
    'log(DER_mass_transverse_met_lep)',
    'log(DER_mass_vis)',
    'log(DER_mass_jet_jet)',
])

# (index, name) pairs, one per candidate column
features = list(enumerate(all_features))

In [15]:
# step-wise
# Runs stepwise() with the model settings, R2 type, candidate matrix,
# (index, name) feature list, labels and cv switch defined in the cells above.
# Its two results are kept as best_R2adj and idx_features for the display cell below.
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  log(DER_mass_MMC) (index : 30 )
-------------------------------------------------
Feature chosen:  DER_mass_MMC (index : 0 )
-------------------------------------------------
Feature chosen:  log(DER_mass_jet_jet) (index : 33 )
-------------------------------------------------
Feature chosen:  log(DER_mass_vis) (index : 32 )
-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_lep_eta_centrality (index : 12 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )
-------------------------------------------------
Feature chosen:  PRI_jet_all_pt (index : 29 )
-------------------------------------------------
Feature chosen:  PRI_lep_pt (index : 16 )
----------

In [16]:
# Show the selection results. The last entry of best_R2adj is left out
# (presumably the step that ended the search -- confirm in stepwise()).
results_r2_stepwise(best_R2adj[:-1], idx_features)

step 1 : R2 adjusted = 0.0320794565365
step 2 : R2 adjusted = 0.047166942174
step 3 : R2 adjusted = 0.0612482205189
step 4 : R2 adjusted = 0.0668010566278
step 5 : R2 adjusted = 0.0717001634344
step 6 : R2 adjusted = 0.0757292538246
step 7 : R2 adjusted = 0.0781177637565
step 8 : R2 adjusted = 0.0800173636369
step 9 : R2 adjusted = 0.0807079498126
step 10 : R2 adjusted = 0.0812871433531
step 11 : R2 adjusted = 0.0822125400866
step 12 : R2 adjusted = 0.0827256747397
step 13 : R2 adjusted = 0.0833855822215
step 14 : R2 adjusted = 0.0834893390277
step 15 : R2 adjusted = 0.0835659325506
step 16 : R2 adjusted = 0.083659872517
step 17 : R2 adjusted = 0.0837093679881
step 18 : R2 adjusted = 0.0837332227061
step 19 : R2 adjusted = 0.0837459084975
step 20 : R2 adjusted = 0.0838071786813
step 21 : R2 adjusted = 0.0838343878265
step 22 : R2 adjusted = 0.0838448106646
step 23 : R2 adjusted = 0.0838500404853
step 24 : R2 adjusted = 0.0838532382908
---------------------------------------------------

# Least Squares Gradient Descent

In [None]:
# Model settings handed to the step-wise search.
# max_iters, gamma and threshold are the hyperparameters (gamma is presumably the
# step size and threshold a stopping tolerance -- confirm in implementations);
# debug_mode is 0; k_fold is presumably only used when cv is switched on.
model = {
    'method': 'lsgd',
    'loss': 'rmse',
    'max_iters': 10000,
    'gamma': 1e-2,
    'threshold': 1e-2,
    'debug_mode': 0,
    'k_fold': 5,
}

# R2 flavour: 'loss', 'Tjur' or 'McFadden'
R2_method = 'McFadden'

# Switch for estimating the R2 error through cross validation (1 = on, 0 = off)
cv = 0

In [None]:
# Candidate pool for the step-wise search: the columns of X0, unchanged
all_candidates = X0

# Feature names come from the header row of the training CSV; the first two
# header fields are skipped (presumably the id and label columns -- confirm).
all_features = list(np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows=1)[2:])

# (index, name) pairs, one per candidate column
features = list(enumerate(all_features))

In [None]:
# step-wise
# Runs stepwise() with the model settings, R2 type, candidate matrix,
# (index, name) feature list, labels and cv switch defined in the cells above.
# Its two results are kept as best_R2adj and idx_features for the display cell below.
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

In [None]:
# Show the selection results. The last entry of best_R2adj is left out
# (presumably the step that ended the search -- confirm in stepwise()).
results_r2_stepwise(best_R2adj[:-1], idx_features)

# Least Squares Stochastic Gradient Descent

In [None]:
# Model settings handed to the step-wise search.
# batch_size, max_iters, gamma and threshold are the hyperparameters (gamma is
# presumably the step size and threshold a stopping tolerance -- confirm in
# implementations); debug_mode is 0; k_fold is presumably only used when cv is on.
# NOTE(review): no random seed is set in the visible cells before this stochastic
# run -- confirm whether the implementation seeds itself.
model = {
    'method': 'lssgd',
    'loss': 'rmse',
    'batch_size': 50,
    'max_iters': 10000,
    'gamma': 1e-2,
    'threshold': 1e-2,
    'debug_mode': 0,
    'k_fold': 5,
}

# R2 flavour: 'loss', 'Tjur' or 'McFadden'
R2_method = 'McFadden'

# Switch for estimating the R2 error through cross validation (1 = on, 0 = off)
cv = 0

In [None]:
# Candidate pool for the step-wise search: the columns of X0, unchanged
all_candidates = X0

# Feature names come from the header row of the training CSV; the first two
# header fields are skipped (presumably the id and label columns -- confirm).
all_features = list(np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows=1)[2:])

# (index, name) pairs, one per candidate column
features = list(enumerate(all_features))

In [None]:
# step-wise
# Runs stepwise() with the model settings, R2 type, candidate matrix,
# (index, name) feature list, labels and cv switch defined in the cells above.
# Its two results are kept as best_R2adj and idx_features for the display cell below.
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

In [None]:
# Show the selection results. The last entry of best_R2adj is left out
# (presumably the step that ended the search -- confirm in stepwise()).
results_r2_stepwise(best_R2adj[:-1], idx_features)