In [1]:
# TO RUN IT: CHANGE MY PATH
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Project root. The original author's checkout stays the default, so existing
# runs behave exactly as before; set the PROJECT1_ML_PATH environment variable
# to point the notebook at another checkout without editing this cell.
import os
import sys
my_path = os.environ.get(
    'PROJECT1_ML_PATH',
    r'/home/ilaria/Scrivania/Machine_Learning/Project_1/Project1_ML',
)

# Put <my_path>/code/COMMON at the front of the import search path so the
# project modules imported below can be resolved from there.
common_dir = my_path + r'/code/COMMON'
if sys.path[:1] != [common_dir]:  # re-running the cell must not stack duplicates
    sys.path.insert(0, common_dir)

# imports
import numpy as np 
import matplotlib.pyplot as plt

from proj1_helpers import predict_labels 
from implementations import *
from labels import idx_2labels
from costs import *
from optimize_hyperparams import *
from cross_validation import *
from step_wise import *

# Build dataset

In [2]:
from proj1_helpers import load_csv_data 

# Load the raw training set from <my_path>/data/train.csv with sub-sampling
# switched off (sub_sample=False). The three returned values are stored under
# the names y_raw, input_data_raw and ids.
# NOTE(review): presumably labels, feature matrix and event ids; the exact
# shapes/dtypes are defined by proj1_helpers.load_csv_data - confirm there.
y_raw, input_data_raw, ids = load_csv_data(my_path + r'/data/train.csv', sub_sample=False)

In [3]:
from outliers import handle_outliers

# Handle outliers: entries equal to -999 are replaced by the median value of
# the feature (as reported in this cell's recorded output). The call returns
# two values, kept as the feature matrix X0 and the labels y.
# NOTE(review): whether y is a copy of y_raw or the very same array is decided
# inside handle_outliers - confirm, because y is modified in place later on.
X0, y = handle_outliers(input_data_raw, y_raw, -999, 'median')

-999 are replaced by the median value of the feature


In [4]:
from standard import standardize

# Standardize the features; the two extra return values are kept as
# mean_X0 and std_X0 (presumably the statistics used - confirm in `standard`).
# NOTE(review): X0 is rebound here, so running this cell twice feeds the
# already-standardized matrix back in and overwrites mean_X0 / std_X0.
X0, mean_X0, std_X0 = standardize(X0)

# Recode the labels in place from {-1, 1} to {0, 1} using a boolean mask.
y[y == -1] = 0

# Least Squares

In [5]:
# Model settings for this section, passed to the step-wise selection below:
# method 'ls' with the 'rmse' loss, and k_fold = 3 for cross validation.
model = {
    'method': 'ls',
    'loss': 'rmse',
    'k_fold': 3,
}

# Flavour of R2 to use: 'McFadden' or 'loss'.
R2_method = 'McFadden'

# 1 -> estimate the R2 error through cross validation, 0 -> do not.
cv = 1

In [6]:
# step wise candidates: every column of X0 plus log-transformed mass columns

# Each entry pairs a column index of X0 with the name given to its
# log-transformed copy. Keeping index and name side by side means the
# appended columns and the appended names cannot drift apart.
log_mass_columns = [
    (0, 'log(DER_mass_MMC)'),
    (1, 'log(DER_mass_transverse_met_lep)'),
    (2, 'log(DER_mass_vis)'),
    (5, 'log(DER_mass_jet_jet)'),
]

# adding logarithmic transformation of masses: log(1 + |x|), computed for all
# selected columns at once and appended to the right of X0 in the listed order
log_mass_idx = [col for col, _ in log_mass_columns]
log_masses = np.log(1 + np.abs(X0[:, log_mass_idx]))
all_candidates = np.concatenate((X0, log_masses), axis=1)

# (the original cell had an empty "adding the degree" placeholder here;
#  no degree features are added)

# get feature names: header row of train.csv without its first two columns
header_row = np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows=1)
all_features = list(header_row[2:])

# adding logarithm names, in the same order as the appended columns
all_features.extend(name for _, name in log_mass_columns)

# Every candidate column needs exactly one name; otherwise the (index, name)
# pairs built below would silently label the wrong columns.
assert all_candidates.shape[1] == len(all_features), (
    'candidate columns and feature names are out of sync: '
    '%d columns vs %d names' % (all_candidates.shape[1], len(all_features))
)

# list of feature names and indices, as (column index, name) tuples
features = list(enumerate(all_features))

In [7]:
# Run the step-wise feature selection with the model settings, R2 flavour,
# candidate matrix, (index, name) feature list, labels and cross-validation
# flag defined in the cells above. The call returns two values, stored as
# best_R2adj and idx_features, which the next cell hands to results_r2_stepwise.
# NOTE(review): presumably a forward selection driven by adjusted R2 - confirm
# in the stepwise implementation.
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

0.6564506258025032
0.6559826239304957
0.6590546362185449
0.6893667574670299
0.6891987567950272
0.6925227700910803
0.6566066264265057
0.6561266245064981
0.6592586370345481
0.6639266557066228
0.6639986559946239
0.666998667994672
0.6892107568430274
0.6882627530510123
0.6919467677870711
0.6839787359149436
0.6833427333709334
0.6860427441709767
0.6841347365389462
0.6827787311149245
0.6864507458029833
0.6566066264265057
0.6561266245064981
0.6592586370345481
0.6566066264265057
0.6561266245064981
0.6592586370345481
0.6504146016584066
0.6508346033384134
0.6532106128424514
0.6566066264265057
0.6561266245064981
0.6592586370345481
0.6571466285865143
0.6566426265705063
0.6593906375625502
0.679814719258877
0.6782907131628526
0.6817947271789088
0.6626066504266017
0.6625226500906004
0.6658706634826539
0.6566066264265057
0.6561266245064981
0.6592586370345481
0.6566066264265057
0.6561266245064981
0.6592586370345481
0.6566066264265057
0.6561266245064981
0.6592586370345481
0.6566066264265057
0.656126624506

0.784935139740559
0.7781311125244501
0.7783471133884535
0.7789831159324637
0.7833991335965343
0.7841791367165468
0.7845271381085525
0.7845871383485534
0.7854511418045672
0.7857991431965727
0.7804591218364874
0.7815871263485054
0.7817191268765075
0.7806511226044904
0.7814191256765027
0.7815751263005052
0.7795711182844731
0.7791991167964671
0.7802791211164845
0.7806991227964912
0.7815991263965056
0.7816111264445058
0.7805311221244885
0.7817791271165084
0.7817791271165084
0.7803391213564854
0.781623126492506
0.781623126492506
0.7806631226524906
0.7817671270685083
0.7817071268285073
0.779871119484478
0.7811191244764979
0.7812151248604995
0.7810351241404966
0.7822951291805167
0.7821391285565142
0.7806511226044904
0.7816711266845068
0.7816351265405062
0.7807711230844924
0.7816111264445058
0.781563126252505
0.7808431233724935
0.7817191268765075
0.7816711266845068
0.78187512750051
0.7824271297085188
0.78187512750051
0.7808431233724935
0.7817671270685083
0.7815991263965056
0.78062712250849
0.78

0.790815163260653
0.7922911691646767
0.7915951663806655
0.7909831639326558
0.7916911667646671
0.7915951663806655
0.790311161244645
0.7915831663326653
0.7914031656126624
0.7901671606686427
0.7916431665726663
0.7916671666686667
0.7903471613886456
0.7917151668606675
0.7921711686846747
0.7899991599966399
0.7925191700766803
0.7916431665726663
0.7902151608606435
0.7916551666206665
0.7916791667166668
0.7903351613406454
0.7918351673406694
0.7914151656606626
0.7906711626846508
0.7919191676766707
0.7915951663806655
0.7902271609086436
0.7916431665726663
0.7917391669566678
0.7911991647966592
0.7915351661406645
0.7918831675326702
0.7908871635486542
0.792315169260677
0.7915231660926644
0.7898071592286369
0.7912711650846603
0.7916671666686667
0.7902151608606435
0.7916551666206665
0.7916551666206665
0.7902991611966448
0.7916551666206665
0.7921591686366746
0.7899991599966399
0.7913671654686619
0.7916311665246661
0.7903591614366458
0.7917991671966688
0.7915711662846652
0.7902751611006444
0.7915831663326

0.7942711770847083
0.7934911739646958
0.7954231816927267
0.794187176748707
0.7934911739646958
0.7953271813087253
0.7942231768927076
0.7934191736766947
0.7955311821247285
0.7944031776127104
0.7938511754047016
0.7951471805887224
0.7942951771807087
0.7938391753567015
0.7950991803967216
0.7942831771327086
0.7938271753087013
0.7949311797247189
0.7943551774207097
0.7936351745406982
0.7954711818847275
0.7942951771807087
0.7935391741566966
0.7954111816447266
0.7944031776127104
0.7936831747326989
0.7950751803007212
0.7943551774207097
0.7934911739646958
0.7954711818847275
0.7942831771327086
0.7934431737726951
0.7954231816927267
0.7941511766047065
0.7934671738686955
0.7954951819807279
0.7934071736286945
0.7929151716606866
0.7953151812607251
-------------------------------------------------
Feature chosen:  PRI_jet_leading_pt (index : 23 )
0.7946071784287138
0.7937551750207
0.7951711806847227
0.7938511754047016
0.7937911751647007
0.7947151788607154
0.7942591770367081
0.7936351745406982
0.794883179

0.7948591794367178
0.7937671750687003
0.7935991743966976
0.7949431797727191
0.7938871755487021
0.7934791739166956
0.7947871791487165
0.7935991743966976
0.7935271741086964
0.7947631790527162
0.7937911751647007
0.7934431737726951
0.7948231792927172
0.7939111756447026
0.7935271741086964
0.7947871791487165
0.7936231744926979
0.7934791739166956
0.794751179004716
0.7936711746846987
0.7935391741566966
0.7947151788607154
0.793815175260701
0.7935031740126961
0.7946671786687147
0.7936231744926979
0.7934911739646958
0.7947991791967168


In [8]:
# Report the step-wise outcome: every entry of best_R2adj except the last
# one, together with idx_features.
results_r2_stepwise(best_R2adj[:-1], idx_features)

step 1 : R2 adjusted = 0.999997666501
step 2 : R2 adjusted = 0.999997745163
step 3 : R2 adjusted = 0.999997821002
step 4 : R2 adjusted = 0.999997851439
step 5 : R2 adjusted = 0.99999787885
step 6 : R2 adjusted = 0.99999790152
step 7 : R2 adjusted = 0.999997915057
step 8 : R2 adjusted = 0.999997925986
step 9 : R2 adjusted = 0.999997929942
step 10 : R2 adjusted = 0.999997932186
step 11 : R2 adjusted = 0.999997939145
step 12 : R2 adjusted = 0.999997941674
step 13 : R2 adjusted = 0.999997945507
step 14 : R2 adjusted = 0.99999794611
step 15 : R2 adjusted = 0.999997946571
step 16 : R2 adjusted = 0.999997947132
step 17 : R2 adjusted = 0.999997947434
step 18 : R2 adjusted = 0.99999794758
step 19 : R2 adjusted = 0.999997947672
step 20 : R2 adjusted = 0.99999794792
step 21 : R2 adjusted = 0.9999979482
step 22 : R2 adjusted = 0.999997948258
step 23 : R2 adjusted = 0.999997948297
step 24 : R2 adjusted = 0.999997948298
-------------------------------------------------------
Number of features chose

# Least Squares Gradient Descent

In [None]:
# Model settings for this section, passed to the step-wise selection below.
model = {
    # parameters
    'method': 'lsgd',
    'loss': 'rmse',
    # hyperparameters
    'max_iters': 10000,
    'gamma': 1e-2,
    'threshold': 1e-2,
    # other
    'debug_mode': 0,
    # number of folds (set even though cv is 0 in this cell)
    'k_fold': 5,
}

# Flavour of R2 to use: 'loss', 'Tjur' or 'McFadden'.
R2_method = 'McFadden'

# 1 -> estimate the R2 error through cross validation, 0 -> do not.
cv = 0

In [None]:
# Candidate pool for the step-wise search: X0 itself (same object, no copy).
all_candidates = X0

# Feature names: header row of train.csv without its first two columns.
header_row = np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows=1)
all_features = list(header_row[2:])

# (column index, name) pairs, one per feature name.
features = list(enumerate(all_features))

In [None]:
# Run the step-wise feature selection with the model settings, R2 flavour,
# candidate matrix, (index, name) feature list, labels and cross-validation
# flag defined in the cells above. The call returns two values, stored as
# best_R2adj and idx_features, which the next cell hands to results_r2_stepwise.
# NOTE(review): presumably a forward selection driven by adjusted R2 - confirm
# in the stepwise implementation.
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

In [None]:
# Report the step-wise outcome: every entry of best_R2adj except the last
# one, together with idx_features.
results_r2_stepwise(best_R2adj[:-1], idx_features)

# Least Squares Stochastic Gradient Descent

In [None]:
# Model settings for this section, passed to the step-wise selection below.
model = {
    # parameters
    'method': 'lssgd',
    'loss': 'rmse',
    # hyperparameters
    'batch_size': 50,
    'max_iters': 10000,
    'gamma': 1e-2,
    'threshold': 1e-2,
    # other
    'debug_mode': 0,
    # number of folds (set even though cv is 0 in this cell)
    'k_fold': 5,
}

# Flavour of R2 to use: 'loss', 'Tjur' or 'McFadden'.
R2_method = 'McFadden'

# 1 -> estimate the R2 error through cross validation, 0 -> do not.
cv = 0

In [None]:
# Candidate pool for the step-wise search: X0 itself (same object, no copy).
all_candidates = X0

# Feature names: header row of train.csv without its first two columns.
header_row = np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows=1)
all_features = list(header_row[2:])

# (column index, name) pairs, one per feature name.
features = list(enumerate(all_features))

In [None]:
# Run the step-wise feature selection with the model settings, R2 flavour,
# candidate matrix, (index, name) feature list, labels and cross-validation
# flag defined in the cells above. The call returns two values, stored as
# best_R2adj and idx_features, which the next cell hands to results_r2_stepwise.
# NOTE(review): presumably a forward selection driven by adjusted R2 - confirm
# in the stepwise implementation.
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

In [None]:
# Report the step-wise outcome: every entry of best_R2adj except the last
# one, together with idx_features.
results_r2_stepwise(best_R2adj[:-1], idx_features)