## Table of contents <a class="anchor" id="top"></a>
* [Missing values handler](#missing-value-handler)
    * [Drop columns containing missing values (M1)](#m1)
    * [Replace missing values by mean (M2)](#m2)
    * [Split by mass, replace missing values by mean (M3)](#m3)
    * [Split by jet_num, replace missing values by mean (M4)](#m4)
    * [Split by jet_num and mass, replace missing values by mean (M5)](#m5)
* [Remove outliers and feature augmentation](#jetnumRO)
    * [Split by jet_num and remove outliers](#jetnumRO)
    * [Split by jet_num and remove outliers + polynomial](#jetnumROPE3)
    * [Split by jet_num, mass and remove outliers](#jetnummassRO)
    * [Split by jet_num, mass and remove outliers + polynomial](#jetnummassROPE2)
<!--     * [Split by jet_num, mass and remove outliers + polynomial](#jetnummassROPE3) -->
* [Degree search](#deg2)
    * [Degree 2](#deg2)    
    * [Degree 3](#deg3)    
    * [Degree 4](#deg4)    
    * [Degree 5](#deg5)    
* [Crossfeatures augmentation](#crossfeat)
* [Nonlinear augmentation](#nonlinear)


In [103]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [208]:
from crossval import GridSearchCV, CrossVal, PartitionCrossVal, MultiPartitionCrossVal, MassPartitionCrossVal
from implementations import logistic_regression, reg_logistic_regression, \
least_squares, least_squares_GD, least_squares_SGD,ridge_regression 
from proj1_helpers import load_csv_data, predict_labels, acc_score, f1_score
from preprocessing import NonLinearTransformer, Normalizer, Imputer, PolynomialFeature, Pipeline, remove_outliers

In [94]:
# Lookup table from feature (column) name to an integer position, 0..29.
# NOTE(review): presumably these are the column indices of tX as returned by
# load_csv_data -- confirm against the CSV header order. The table is not
# referenced by any cell visible in this part of the notebook.
col2id={'DER_mass_MMC': 0,
 'DER_mass_transverse_met_lep': 1,
 'DER_mass_vis': 2,
 'DER_pt_h': 3,
 'DER_deltaeta_jet_jet': 4,
 'DER_mass_jet_jet': 5,
 'DER_prodeta_jet_jet': 6,
 'DER_deltar_tau_lep': 7,
 'DER_pt_tot': 8,
 'DER_sum_pt': 9,
 'DER_pt_ratio_lep_tau': 10,
 'DER_met_phi_centrality': 11,
 'DER_lep_eta_centrality': 12,
 'PRI_tau_pt': 13,
 'PRI_tau_eta': 14,
 'PRI_tau_phi': 15,
 'PRI_lep_pt': 16,
 'PRI_lep_eta': 17,
 'PRI_lep_phi': 18,
 'PRI_met': 19,
 'PRI_met_phi': 20,
 'PRI_met_sumet': 21,
 'PRI_jet_num': 22,
 'PRI_jet_leading_pt': 23,
 'PRI_jet_leading_eta': 24,
 'PRI_jet_leading_phi': 25,
 'PRI_jet_subleading_pt': 26,
 'PRI_jet_subleading_eta': 27,
 'PRI_jet_subleading_phi': 28,
 'PRI_jet_all_pt': 29}

In [39]:
# Load the training CSV. load_csv_data returns three values, bound here to
# y, tX and ids (presumably labels, feature matrix and sample ids -- confirm
# in proj1_helpers).
DATA_TRAIN_PATH = 'data/train.csv' # TODO: download train data and supply path here
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
# Load the test CSV the same way; its first return value is discarded.
# NOTE(review): binding `_` at cell level shadows IPython's "last output" variable.
DATA_TEST_PATH = './data/test.csv' # TODO: download test data and supply path here
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

# Missing values handler <a class="anchor" id="missing-value-handler"></a>

## Drop columns containing missing values (M1) <a class="anchor" id="m1"></a>
[Back to top](#top)

In [112]:
# M1 preprocessing: Imputer(dropnan=True) followed by Normalizer. Per the section
# title, dropnan=True drops the columns that contain missing values
# (presumably -- confirm in preprocessing.Imputer).
pipeline = Pipeline(Imputer(dropnan=True), Normalizer())

In [113]:
# Baseline for M1: cross-validate each unregularised model with the plain
# CrossVal splitter and print one line per model:
#   validation score means, validation score stds, training score means.
# Each printed array has one entry per metric passed in, i.e. [acc_score, f1_score].
# gamma=1 and early_stopping=True are forwarded through crossval.fit
# (presumably on to the model function -- confirm in crossval.py).
# NOTE(review): in the recorded output least_squares_SGD sits at ~0.51 accuracy
# with a large std; gamma=1 may be too large a step for it -- verify.
for model in [least_squares, least_squares_GD, least_squares_SGD,logistic_regression]:
    crossval = CrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX, pipeline,gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.733416   0.55798222] [0.00264334 0.00421453] [0.73361   0.5583651]
[0.73362    0.55702553] [0.00261534 0.00419102] [0.733706   0.55699752]
[0.509272   0.44025098] [0.06297554 0.07137247] [0.511443   0.44245289]
[0.738572   0.57714405] [0.00246676 0.0040193 ] [0.738782   0.57743596]


In [114]:
# Grid-search lambda_ for the two regularised models on the M1 pipeline.
# The grid is 10 log-spaced values from 1e-4 to 1 (np.logspace(-4, 0, 10)).
# CrossVal is handed to GridSearchCV as the cross-validation class to use.
# The final print shows `scores` and `best_params`; in the recorded output
# `scores` is a (means, stds) pair and `best_params` a tuple of (name, value) pairs.
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [ridge_regression, reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, CrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline,gamma=1, early_stopping=True)
    print(scores, best_params)
    # Separator between the two models' outputs.
    print("===============")

{'lambda_': 0.0001} [0.733376   0.55796932] [0.733545   0.55825366]
{'lambda_': 0.0002782559402207126} [0.73338    0.55792048] [0.733539   0.55820275]
{'lambda_': 0.000774263682681127} [0.733352   0.55771931] [0.733545   0.55809685]
{'lambda_': 0.002154434690031882} [0.733256   0.55731897] [0.733478  0.5576165]
{'lambda_': 0.005994842503189409} [0.7332     0.55658681] [0.733284   0.55663925]
{'lambda_': 0.016681005372000592} [0.732556   0.55424095] [0.732704   0.55433443]
{'lambda_': 0.046415888336127774} [0.730004   0.54996802] [0.730025  0.5500043]
{'lambda_': 0.12915496650148828} [0.7245     0.54741535] [0.724527   0.54740954]
{'lambda_': 0.3593813663804626} [0.71522    0.55092996] [0.715052   0.55060178]
{'lambda_': 1.0} [0.705476  0.5577718] [0.705472   0.55768781]
([0.73338, 0.5579204818612903], [0.002663831826523588, 0.004154174894547088]) (('lambda_', 0.0002782559402207126),)
{'lambda_': 0.0001} [0.738676   0.57882732] [0.738903   0.57928302]
{'lambda_': 0.0002782559402207126} 

## Replace missing values by mean (M2)<a class="anchor" id="m2"></a>
[Back to top](#top)

In [115]:
# M2 preprocessing: Imputer(replacenan='mean') followed by Normalizer. Per the
# section title, missing values are replaced by the mean
# (presumably the per-column mean -- confirm in preprocessing.Imputer).
pipeline = Pipeline(Imputer(replacenan='mean'), Normalizer())

In [116]:
# Baseline for M2: same loop as for M1, now with mean imputation in `pipeline`.
# Prints validation means, validation stds and training means per model,
# each as [acc_score, f1_score].
# NOTE(review): the recorded output for least_squares_GD is ~0.378 accuracy,
# i.e. below chance, on both validation and training folds. That pattern looks
# like divergence with gamma=1 rather than a modelling result -- verify.
for model in [least_squares, least_squares_GD, least_squares_SGD,logistic_regression]:
    crossval = CrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX, pipeline,gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.744224   0.56999449] [0.00209702 0.00374195] [0.7445    0.5705157]
[0.3781    0.3703198] [0.00210185 0.00298912] [0.378043   0.37031729]
[0.5114     0.43036795] [0.08503598 0.05039708] [0.509891 0.429814]
[0.750148   0.59361684] [0.00259235 0.00422535] [0.750302   0.59383493]


In [117]:
# Grid-search lambda_ (10 log-spaced values in [1e-4, 1]) for the regularised
# models on the M2 pipeline, scored with the plain CrossVal splitter.
# NOTE(review): the recorded best lambda_ for ridge_regression is 1e-4, the
# lower edge of the grid; the optimum may lie below the searched range.
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [ridge_regression, reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, CrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline,gamma=1, early_stopping=True)
    print(scores, best_params)
    print("===============")

{'lambda_': 0.0001} [0.744376   0.57028432] [0.744492   0.57051084]
{'lambda_': 0.0002782559402207126} [0.744368   0.57026558] [0.74446    0.57043247]
{'lambda_': 0.000774263682681127} [0.744312  0.5700613] [0.744379  0.5701901]
{'lambda_': 0.002154434690031882} [0.7441     0.56938921] [0.744188   0.56960201]
{'lambda_': 0.005994842503189409} [0.743596   0.56787804] [0.743696  0.5680658]
{'lambda_': 0.016681005372000592} [0.742356   0.56431825] [0.742369   0.56443155]
{'lambda_': 0.046415888336127774} [0.73944    0.55849718] [0.739481  0.5585552]
{'lambda_': 0.12915496650148828} [0.73368    0.55205343] [0.734016   0.55268371]
{'lambda_': 0.3593813663804626} [0.72692    0.55262579] [0.727066   0.55288919]
{'lambda_': 1.0} [0.718992   0.55666795] [0.719056  0.5568439]
([0.744376, 0.5702843235303472], [0.0020825906943036194, 0.0037660942246723387]) (('lambda_', 0.0001),)
{'lambda_': 0.0001} [0.750228   0.59516439] [0.7505    0.5956501]
{'lambda_': 0.0002782559402207126} [0.75018    0.5948

## Split by mass, replace missing values by mean (M3)<a class="anchor" id="m3"></a>
[Back to top](#top)

In [205]:
# M3 preprocessing: mean imputation + normalisation (same steps as M2).
# The split by mass named in the section title is not part of this pipeline;
# it comes from the MassPartitionCrossVal class used in the following cells.
pipeline = Pipeline(Imputer(replacenan='mean'), Normalizer())

In [209]:
# Baseline for M3: cross-validate least_squares and logistic_regression with
# MassPartitionCrossVal (per the section title, the data is split by mass --
# presumably on whether DER_mass_MMC is missing; confirm in crossval.py).
# Prints validation means, validation stds and training means as [acc, f1].
for model in [least_squares,logistic_regression]:
    crossval = MassPartitionCrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX,pipeline, gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.750008   0.59514438] [0.00199369 0.00339488] [0.750324   0.59559433]
[0.753556   0.60852245] [0.00208681 0.00362732] [0.753726   0.60870044]


In [210]:
# Grid-search lambda_ (10 log-spaced values in [1e-4, 1]) for the regularised
# models, with MassPartitionCrossVal as the cross-validation class.
# NOTE(review): recorded best lambda_ for ridge_regression is 1e-4 (grid edge).
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [ridge_regression, reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, MassPartitionCrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline, gamma=1, early_stopping=True)
    print(scores, best_params)
    print("===============")

{'lambda_': 0.0001} [0.750136   0.59526166] [0.750296  0.5955293]
{'lambda_': 0.0002782559402207126} [0.750128   0.59518082] [0.750278   0.59545282]
{'lambda_': 0.000774263682681127} [0.750052   0.59491875] [0.750231   0.59526847]
{'lambda_': 0.002154434690031882} [0.749948   0.59441474] [0.750118   0.59471444]
{'lambda_': 0.005994842503189409} [0.749644   0.59308875] [0.749532   0.59291341]
{'lambda_': 0.016681005372000592} [0.748024   0.58901013] [0.748138   0.58910529]
{'lambda_': 0.046415888336127774} [0.744956   0.58283521] [0.744993   0.58297516]
{'lambda_': 0.12915496650148828} [0.740196   0.57647005] [0.740267   0.57651994]
{'lambda_': 0.3593813663804626} [0.73428    0.57182029] [0.734316   0.57186626]
{'lambda_': 1.0} [0.727284   0.56808277] [0.727202   0.56785994]
([0.750136, 0.5952616577038311], [0.0021391736722388825, 0.0036923080162243878]) (('lambda_', 0.0001),)
{'lambda_': 0.0001} [0.75408    0.61080592] [0.754114   0.61085689]
{'lambda_': 0.0002782559402207126} [0.75418

## Split by jet_num, replace missing values by mean (M4)<a class="anchor" id="m4"></a>
[Back to top](#top)

In [118]:
# M4 preprocessing: mean imputation + normalisation (same steps as M2).
# The split by jet_num named in the section title comes from PartitionCrossVal
# in the following cells, not from this pipeline.
pipeline = Pipeline(Imputer(replacenan='mean'), Normalizer())

In [126]:
# Baseline for M4: cross-validate the four unregularised models with
# PartitionCrossVal (per the section title, split by jet_num -- presumably the
# PRI_jet_num column; confirm in crossval.py).
# Prints validation means, validation stds and training means as [acc, f1].
# NOTE(review): least_squares_SGD is again at ~0.50 accuracy in the recorded
# output; gamma=1 may be too large a step for it -- verify.
for model in [least_squares, least_squares_GD, least_squares_SGD,logistic_regression]:
    crossval = PartitionCrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX, pipeline,gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.75872    0.59655483] [0.00180891 0.00406981] [0.758954   0.59705581]
[0.758756  0.5954365] [0.00170882 0.00417422] [0.759126   0.59618379]
[0.503996   0.43833642] [0.02553224 0.0311066 ] [0.502664 0.436871]
[0.764596   0.62159128] [0.00141565 0.00269419] [0.764854   0.62211497]


In [127]:
# Grid-search lambda_ (10 log-spaced values in [1e-4, 1]) for the regularised
# models, with PartitionCrossVal as the cross-validation class.
# NOTE(review): recorded best lambda_ for ridge_regression is 1e-4 (grid edge).
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [ridge_regression, reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, PartitionCrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline,gamma=1, early_stopping=True)
    print(scores, best_params)
    print("===============")

{'lambda_': 0.0001} [0.758752   0.59660425] [0.758928   0.59695846]
{'lambda_': 0.0002782559402207126} [0.758744   0.59654744] [0.758902   0.59687322]
{'lambda_': 0.000774263682681127} [0.75874    0.59640938] [0.758917   0.59681811]
{'lambda_': 0.002154434690031882} [0.75854    0.59589022] [0.758847   0.59647721]
{'lambda_': 0.005994842503189409} [0.7583     0.59480363] [0.758691   0.59563256]
{'lambda_': 0.016681005372000592} [0.758124  0.5936173] [0.758454   0.59427333]
{'lambda_': 0.046415888336127774} [0.757748   0.59305666] [0.757968   0.59339727]
{'lambda_': 0.12915496650148828} [0.755672   0.59511464] [0.755794   0.59515877]
{'lambda_': 0.3593813663804626} [0.752708   0.60514969] [0.752711   0.60503437]
{'lambda_': 1.0} [0.747508   0.61543173] [0.747452   0.61518867]
([0.758752, 0.5966042536836839], [0.0017038591491082794, 0.0038388256009084065]) (('lambda_', 0.0001),)
{'lambda_': 0.0001} [0.76464    0.62295469] [0.765066   0.62360103]
{'lambda_': 0.0002782559402207126} [0.76467

## Split by jet_num and mass, replace missing values by mean (M5)<a class="anchor" id="m5"></a>
[Back to top](#top)

In [138]:
# M5 preprocessing: mean imputation + normalisation (same steps as M2).
# The jet_num-and-mass split named in the section title comes from
# MultiPartitionCrossVal in the following cells, not from this pipeline.
pipeline = Pipeline(Imputer(replacenan='mean'), Normalizer())

In [222]:
# Baseline for M5: cross-validate least_squares and logistic_regression with
# MultiPartitionCrossVal (per the section title, split by jet_num and mass).
# Prints validation means, validation stds and training means as [acc, f1].
for model in [least_squares,logistic_regression]:
    crossval = MultiPartitionCrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX,pipeline, gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.76506    0.62092549] [0.00124849 0.00342657] [0.765245   0.62128437]
[0.768544   0.63469502] [0.00126519 0.00263315] [0.769164   0.63574293]


In [223]:
# Grid-search lambda_ (10 log-spaced values in [1e-4, 1]) for the regularised
# models, with MultiPartitionCrossVal as the cross-validation class.
# NOTE(review): recorded best lambda_ for ridge_regression is 1e-4 (grid edge).
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [ridge_regression, reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, MultiPartitionCrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline, gamma=1, early_stopping=True)
    print(scores, best_params)
    print("===============")

{'lambda_': 0.0001} [0.76498    0.62082485] [0.765223   0.62123776]
{'lambda_': 0.0002782559402207126} [0.764936   0.62069289] [0.765219   0.62121043]
{'lambda_': 0.000774263682681127} [0.764952   0.62063066] [0.765223   0.62113739]
{'lambda_': 0.002154434690031882} [0.76486    0.62024957] [0.765107  0.6206906]
{'lambda_': 0.005994842503189409} [0.764544   0.61912517] [0.764741   0.61954442]
{'lambda_': 0.016681005372000592} [0.763576  0.6168024] [0.763998   0.61746875]
{'lambda_': 0.046415888336127774} [0.761904   0.61400009] [0.762277   0.61445047]
{'lambda_': 0.12915496650148828} [0.759068   0.61292073] [0.759542   0.61359157]
{'lambda_': 0.3593813663804626} [0.75552    0.61652005] [0.755557   0.61656353]
{'lambda_': 1.0} [0.750436   0.62009769] [0.75066    0.62043514]
([0.76498, 0.6208248503380357], [0.0012773096727105796, 0.0035513760993357312]) (('lambda_', 0.0001),)
{'lambda_': 0.0001} [0.768488  0.6354252] [0.769148   0.63655496]
{'lambda_': 0.0002782559402207126} [0.768476   0

# Remove outliers and feature augmentation

## Split by jet_num, replace missing values by mean, remove_outliers (jetnum RO)<a class="anchor" id="jetnumRO"></a>
[Back to top](#top)

In [221]:
# jetnum RO preprocessing: mean imputation + normalisation.
# Outlier removal is not a pipeline step here; it is passed separately to
# fit(...) as addition_on_train=remove_outliers in the following cells.
pipeline = Pipeline(Imputer(replacenan='mean'), Normalizer())

In [139]:
# jetnum RO baseline: PartitionCrossVal (split by jet_num per the section
# title) with remove_outliers passed as addition_on_train.
# NOTE(review): the keyword name suggests remove_outliers is applied to the
# training folds only, so training scores would be computed on filtered data
# and not be directly comparable to validation scores -- confirm in crossval.py.
# Prints validation means, validation stds and training means as [acc, f1].
for model in [least_squares, least_squares_GD, least_squares_SGD,logistic_regression]:
    crossval = PartitionCrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX, pipeline,addition_on_train=remove_outliers,gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.764424   0.62930314] [0.00169037 0.00359936] [0.77215664 0.63146595]
[0.763728  0.6267087] [0.00158693 0.00357505] [0.77147121 0.62825639]
[0.4898     0.41357878] [0.02948141 0.02992582] [0.49048395 0.4219911 ]
[0.768372   0.64280944] [0.00125275 0.00292847] [0.77607964 0.64618231]


In [140]:
# Grid-search lambda_ (10 log-spaced values in [1e-4, 1]) for the regularised
# models with PartitionCrossVal, forwarding addition_on_train=remove_outliers.
# NOTE(review): recorded best lambda_ for ridge_regression is 1e-4 (grid edge).
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [ridge_regression, reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, PartitionCrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline,addition_on_train=remove_outliers,gamma=1, early_stopping=True)
    print(scores, best_params)
    print("===============")

{'lambda_': 0.0001} [0.764404   0.62926345] [0.77214342 0.63139708]
{'lambda_': 0.0002782559402207126} [0.76434    0.62912996] [0.77214672 0.63135311]
{'lambda_': 0.000774263682681127} [0.764256   0.62892603] [0.77214892 0.63126319]
{'lambda_': 0.002154434690031882} [0.764136   0.62862582] [0.77209376 0.63088094]
{'lambda_': 0.005994842503189409} [0.763288   0.62679535] [0.7718166  0.62976067]
{'lambda_': 0.016681005372000592} [0.761968   0.62452233] [0.77100985 0.62789566]
{'lambda_': 0.046415888336127774} [0.757812   0.61939591] [0.76864871 0.62487746]
{'lambda_': 0.12915496650148828} [0.753024   0.61660352] [0.76607455 0.62447905]
{'lambda_': 0.3593813663804626} [0.7457     0.61361915] [0.76046783 0.62256536]
{'lambda_': 1.0} [0.737116   0.61038452] [0.7531635  0.61980249]
([0.764404, 0.6292634518419062], [0.0017022526251998977, 0.0037054251044410648]) (('lambda_', 0.0001),)
{'lambda_': 0.0001} [0.768516  0.6436737] [0.77605311 0.64732763]
{'lambda_': 0.0002782559402207126} [0.76839

## Split by jet_num, replace missing values by mean, polynomial, remove_outliers (jetnum RO + PE3)<a class="anchor" id="jetnumROPE3"></a>
[Back to top](#top)

In [141]:
# jetnum RO + PE3 preprocessing: mean imputation, then PolynomialFeature with
# degree=3, then normalisation. Normalizer runs last, so it also sees the
# expanded polynomial columns.
pipeline = Pipeline(Imputer(replacenan='mean'), PolynomialFeature(degree=3), Normalizer())

In [143]:
# jetnum RO + PE3 baseline: PartitionCrossVal with degree-3 polynomial features
# and remove_outliers passed as addition_on_train.
# Prints validation means, validation stds and training means as [acc, f1].
# NOTE(review): least_squares_SGD stays at ~0.49 accuracy in the recorded
# output, as in the other sections; the gamma=1 step size is a likely cause.
for model in [least_squares, least_squares_GD, least_squares_SGD,logistic_regression]:
    crossval = PartitionCrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX,pipeline,addition_on_train=remove_outliers, gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.805308   0.70402386] [0.00165576 0.00354422] [0.81131298 0.70978454]
[0.789008  0.6820523] [0.00518431 0.00882208] [0.79453823 0.68069351]
[0.489496   0.40235515] [0.01739672 0.02478912] [0.48887908 0.40673618]
[0.814228   0.71668776] [0.00129    0.00259927] [0.81260782 0.7137965 ]


In [144]:
# Grid-search lambda_ (10 log-spaced values in [1e-4, 1]) for the regularised
# models on the degree-3 pipeline, with PartitionCrossVal and remove_outliers.
# NOTE(review): recorded scores fall monotonically as lambda_ grows and the
# best value is 1e-4 (grid edge); the grid may need to extend lower.
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [ridge_regression, reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, PartitionCrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline,addition_on_train=remove_outliers,gamma=1, early_stopping=True)
    print(scores, best_params)
    print("===============")

{'lambda_': 0.0001} [0.809248   0.70566301] [0.8089881  0.70586912]
{'lambda_': 0.0002782559402207126} [0.808664   0.70309957] [0.80766598 0.70310381]
{'lambda_': 0.000774263682681127} [0.807184   0.69940843] [0.80580396 0.69899333]
{'lambda_': 0.002154434690031882} [0.804488   0.69412675] [0.80275665 0.6922365 ]
{'lambda_': 0.005994842503189409} [0.79962    0.68592577] [0.79745807 0.68152491]
{'lambda_': 0.016681005372000592} [0.791072   0.67336629] [0.7901234  0.66744205]
{'lambda_': 0.046415888336127774} [0.779644   0.65834988] [0.78272023 0.6550044 ]
{'lambda_': 0.12915496650148828} [0.766632   0.64528304] [0.77557925 0.64699434]
{'lambda_': 0.3593813663804626} [0.751328   0.63456209] [0.76644752 0.64167097]
{'lambda_': 1.0} [0.735168  0.6240507] [0.75356371 0.6338536 ]
([0.809248, 0.7056630086591363], [0.0008203999024865722, 0.0023977348536597795]) (('lambda_', 0.0001),)
{'lambda_': 0.0001} [0.813832   0.71568632] [0.8122476  0.71296468]
{'lambda_': 0.0002782559402207126} [0.81330

## Split by jet_num and mass, replace missing values by mean, remove outliers (jetnum-mass RO)<a class="anchor" id="jetnummassRO"></a>
[Back to top](#top)

In [211]:
# jetnum-mass RO preprocessing: mean imputation + normalisation.
# Partitioning and outlier removal are supplied by the cross-validation calls
# in the following cells, not by this pipeline.
pipeline = Pipeline(Imputer(replacenan='mean'), Normalizer())

In [212]:
# jetnum-mass RO baseline: MultiPartitionCrossVal (split by jet_num and mass
# per the section title) with remove_outliers passed as addition_on_train.
# Prints validation means, validation stds and training means as [acc, f1].
for model in [least_squares,logistic_regression]:
    crossval = MultiPartitionCrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX,pipeline,addition_on_train=remove_outliers, gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.768708   0.64238216] [0.00138648 0.00355112] [0.77503108 0.64470516]
[0.769792   0.64972449] [0.00101922 0.00191239] [0.77632131 0.65158801]


In [213]:
# Grid-search lambda_ (10 log-spaced values in [1e-4, 1]) for the regularised
# models with MultiPartitionCrossVal, forwarding addition_on_train=remove_outliers.
# NOTE(review): recorded best lambda_ for ridge_regression is 1e-4 (grid edge).
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [ridge_regression, reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, MultiPartitionCrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline,addition_on_train=remove_outliers, gamma=1, early_stopping=True)
    print(scores, best_params)
    print("===============")

{'lambda_': 0.0001} [0.768684   0.64231392] [0.7749701  0.64461421]
{'lambda_': 0.0002782559402207126} [0.768636   0.64224889] [0.77494234 0.64451997]
{'lambda_': 0.000774263682681127} [0.76844    0.64179065] [0.77486913 0.64427709]
{'lambda_': 0.002154434690031882} [0.768288   0.64137823] [0.77468049 0.64370049]
{'lambda_': 0.005994842503189409} [0.767648   0.64023608] [0.77434316 0.64267931]
{'lambda_': 0.016681005372000592} [0.76586    0.63701553] [0.77329257 0.64031468]
{'lambda_': 0.046415888336127774} [0.762128   0.63182072] [0.77108705 0.63659723]
{'lambda_': 0.12915496650148828} [0.757492   0.62711335] [0.76779414 0.63256494]
{'lambda_': 0.3593813663804626} [0.751216   0.62227331] [0.76344066 0.62864758]
{'lambda_': 1.0} [0.7429     0.61575067] [0.75686258 0.62255   ]
([0.768684, 0.6423139187288733], [0.0013822532329497443, 0.003564923450373585]) (('lambda_', 0.0001),)
{'lambda_': 0.0001} [0.770044   0.65071269] [0.77638673 0.65262096]
{'lambda_': 0.0002782559402207126} [0.7700

## Split by jet_num and mass, replace missing values by mean, polynomial, remove_outliers (jetnum-mass RO+PE2)<a class="anchor" id="jetnummassROPE2"></a>
[Back to top](#top)

In [245]:
# jetnum-mass RO + PE2 preprocessing: mean imputation, PolynomialFeature with
# degree=2, then normalisation of the expanded feature set.
pipeline = Pipeline(Imputer(replacenan='mean'), PolynomialFeature(degree=2), Normalizer())

In [246]:
# jetnum-mass RO + PE2 baseline: only logistic_regression is evaluated here,
# with MultiPartitionCrossVal and remove_outliers passed as addition_on_train.
# The single-element list keeps the cell shaped like the multi-model cells.
# Prints validation means, validation stds and training means as [acc, f1].
for model in [logistic_regression]:
    crossval = MultiPartitionCrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX,pipeline,addition_on_train=remove_outliers, gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.816232   0.71990382] [0.00135343 0.00253639] [0.81486677 0.7186922 ]


In [247]:
# Grid-search lambda_ (10 log-spaced values in [1e-4, 1]) for
# reg_logistic_regression on the degree-2 pipeline, with MultiPartitionCrossVal
# and remove_outliers.
# NOTE(review): recorded best lambda_ is 1e-4 (grid edge), and its scores are
# slightly below the unregularised logistic_regression run on the same setup.
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, MultiPartitionCrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline,addition_on_train=remove_outliers, gamma=1, early_stopping=True)
    print(scores, best_params)
    print("===============")

{'lambda_': 0.0001} [0.815796   0.71901768] [0.81433227 0.7175902 ]
{'lambda_': 0.0002782559402207126} [0.81488    0.71735152] [0.81357107 0.7159376 ]
{'lambda_': 0.000774263682681127} [0.812352   0.71287548] [0.81073    0.71055767]
{'lambda_': 0.002154434690031882} [0.806148   0.70279406] [0.80459187 0.69914821]
{'lambda_': 0.005994842503189409} [0.795616   0.68785536] [0.79552121 0.68301445]
{'lambda_': 0.016681005372000592} [0.782704   0.67014912] [0.78694585 0.66827023]
{'lambda_': 0.046415888336127774} [0.771516   0.65686069] [0.78042703 0.6585401 ]
{'lambda_': 0.12915496650148828} [0.761404   0.64700258] [0.773943   0.65128392]
{'lambda_': 0.3593813663804626} [0.749296   0.63574665] [0.76492043 0.64166992]
{'lambda_': 1.0} [0.718136  0.5921672] [0.73403344 0.59372247]
([0.815796, 0.7190176763813689], [0.0013261915397106317, 0.002567150206303843]) (('lambda_', 0.0001),)


## Split by jet_num and mass, replace missing values by mean, polynomial, remove_outliers (jetnum-mass RO+PE3)<a class="anchor" id="jetnummassROPE3"></a>
[Back to top](#top)

In [156]:
# jetnum-mass RO + PE3 preprocessing: mean imputation, PolynomialFeature with
# degree=3, then normalisation of the expanded feature set.
pipeline = Pipeline(Imputer(replacenan='mean'), PolynomialFeature(degree=3), Normalizer())

In [160]:
# jetnum-mass RO + PE3 baseline: all four unregularised models, with
# MultiPartitionCrossVal and remove_outliers passed as addition_on_train.
# Prints validation means, validation stds and training means as [acc, f1].
# NOTE(review): least_squares_SGD is at ~0.50 accuracy in the recorded output;
# the gamma=1 step size is a likely cause -- verify.
for model in [least_squares, least_squares_GD, least_squares_SGD,logistic_regression]:
    crossval = MultiPartitionCrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX,pipeline,addition_on_train=remove_outliers, gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.807396   0.70958592] [0.00156892 0.00316496] [0.81362943 0.71811213]
[0.788928  0.6851926] [0.00796543 0.01305507] [0.79623247 0.68835827]
[0.495324   0.40950154] [0.0299664  0.04229156] [0.4950122 0.4168855]
[0.817216   0.72494503] [0.00107788 0.00254394] [0.81606788 0.7237359 ]


In [162]:
# Grid-search lambda_ (10 log-spaced values in [1e-4, 1]) for the regularised
# models on the degree-3 pipeline, with MultiPartitionCrossVal and remove_outliers.
# NOTE(review): recorded best lambda_ for ridge_regression is 1e-4 (grid edge).
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [ridge_regression, reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, MultiPartitionCrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline,addition_on_train=remove_outliers, gamma=1, early_stopping=True)
    print(scores, best_params)
    print("===============")

{'lambda_': 0.0001} [0.81332    0.71492009] [0.8119489  0.71554668]
{'lambda_': 0.0002782559402207126} [0.813184   0.71385586] [0.81125514 0.71397586]
{'lambda_': 0.000774263682681127} [0.811764  0.7105651] [0.80999311 0.71116026]
{'lambda_': 0.002154434690031882} [0.809392   0.70599772] [0.80717084 0.70534567]
{'lambda_': 0.005994842503189409} [0.804788   0.69817336] [0.80220941 0.69566008]
{'lambda_': 0.016681005372000592} [0.796824   0.68686077] [0.79492902 0.68217072]
{'lambda_': 0.046415888336127774} [0.78496    0.67110478] [0.78645567 0.66788181]
{'lambda_': 0.12915496650148828} [0.771648  0.6563666] [0.77937688 0.65819679]
{'lambda_': 0.3593813663804626} [0.757732   0.64442959] [0.77146282 0.65101188]
{'lambda_': 1.0} [0.740708   0.62961718] [0.75896261 0.63930799]
([0.8133199999999998, 0.7149200941966323], [0.0011819306240215476, 0.0024035432720495853]) (('lambda_', 0.0001),)
{'lambda_': 0.0001} [0.816976   0.72423903] [0.8156877  0.72293546]
{'lambda_': 0.0002782559402207126} 

# Grid search degree

## Degree 2<a class="anchor" id="deg2"></a>
[Back to top](#top)

In [233]:
# Degree search, degree 2: mean imputation, PolynomialFeature(degree=2),
# normalisation. Unlike the "RO" sections, the following cells do not pass
# remove_outliers to fit(...).
pipeline = Pipeline(Imputer(replacenan='mean'), PolynomialFeature(degree=2), Normalizer())

In [234]:
# Degree 2: cross-validate logistic_regression with MultiPartitionCrossVal,
# without outlier removal.
# Prints validation means, validation stds and training means as [acc, f1].
for model in [logistic_regression]:
    crossval = MultiPartitionCrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX,pipeline, gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.814008   0.71339693] [0.00117389 0.00266381] [0.814849   0.71476799]


In [235]:
# Degree 2: grid-search lambda_ (10 log-spaced values in [1e-4, 1]) for
# reg_logistic_regression with MultiPartitionCrossVal, without outlier removal.
# NOTE(review): recorded best lambda_ is 1e-4 (grid edge); accuracy collapses
# to ~0.62 at lambda_=1.
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, MultiPartitionCrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline, gamma=1, early_stopping=True)
    print(scores, best_params)
    print("===============")

{'lambda_': 0.0001} [0.813716   0.71291014] [0.814437   0.71401943]
{'lambda_': 0.0002782559402207126} [0.81276    0.71108953] [0.813508   0.71223328]
{'lambda_': 0.000774263682681127} [0.810128   0.70629762] [0.81105    0.70760872]
{'lambda_': 0.002154434690031882} [0.805216   0.69705609] [0.805743   0.69793082]
{'lambda_': 0.005994842503189409} [0.797388   0.68321683] [0.798038   0.68420306]
{'lambda_': 0.016681005372000592} [0.789164   0.66914296] [0.789592   0.66990038]
{'lambda_': 0.046415888336127774} [0.781276   0.65836647] [0.781852   0.65920094]
{'lambda_': 0.12915496650148828} [0.768108   0.64645248] [0.768283   0.64680931]
{'lambda_': 0.3593813663804626} [0.739368   0.61829387] [0.739629   0.61847744]
{'lambda_': 1.0} [0.623132   0.43794767] [0.623093   0.43781832]
([0.813716, 0.7129101374485363], [0.001177617934646034, 0.0026320535893520794]) (('lambda_', 0.0001),)


## Degree 3<a class="anchor" id="deg3"></a>
[Back to top](#top)

In [194]:
# Degree search, degree 3: mean imputation, PolynomialFeature(degree=3),
# normalisation. No outlier removal is passed in the following cells.
pipeline = Pipeline(Imputer(replacenan='mean'), PolynomialFeature(degree=3), Normalizer())

In [195]:
# Degree 3: cross-validate all four unregularised models with
# MultiPartitionCrossVal, without outlier removal.
# Prints validation means, validation stds and training means as [acc, f1].
# NOTE(review): in the recorded output least_squares_GD scores ~0.44 accuracy
# here, but ~0.79 in the otherwise similar run that passes remove_outliers.
# That gap points at gamma=1 diverging on unfiltered data -- verify.
for model in [least_squares, least_squares_GD, least_squares_SGD,logistic_regression]:
    crossval = MultiPartitionCrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX,pipeline, gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.804412  0.6959638] [0.00155603 0.00334826] [0.805698   0.69789394]
[0.442856  0.4426387] [0.00167186 0.00224319] [0.442716   0.44244972]
[0.493076   0.40087998] [0.02183357 0.0376951 ] [0.491935   0.39936275]
[0.809404   0.70602775] [0.00121269 0.00283265] [0.810515   0.70765823]


In [196]:
# Degree 3: grid-search lambda_ (10 log-spaced values in [1e-4, 1]) for the
# regularised models with MultiPartitionCrossVal, without outlier removal.
# NOTE(review): recorded best lambda_ for ridge_regression is 1e-4 (grid edge).
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [ridge_regression, reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, MultiPartitionCrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline, gamma=1, early_stopping=True)
    print(scores, best_params)
    print("===============")

{'lambda_': 0.0001} [0.803428   0.69362701] [0.80461    0.69545739]
{'lambda_': 0.0002782559402207126} [0.802412   0.69142343] [0.803505   0.69324586]
{'lambda_': 0.000774263682681127} [0.800636   0.68793518] [0.80151    0.68936094]
{'lambda_': 0.002154434690031882} [0.797756 0.682586] [0.79871    0.68405312]
{'lambda_': 0.005994842503189409} [0.794248   0.67596725] [0.795201   0.67736812]
{'lambda_': 0.016681005372000592} [0.78962    0.66716196] [0.790196   0.66786904]
{'lambda_': 0.046415888336127774} [0.783948   0.65712791] [0.78474    0.65830904]
{'lambda_': 0.12915496650148828} [0.778552   0.65117323] [0.779016  0.6518554]
{'lambda_': 0.3593813663804626} [0.772044   0.65074805] [0.772548   0.65141496]
{'lambda_': 1.0} [0.7612     0.65026703] [0.7615     0.65067494]
([0.803428, 0.6936270053142235], [0.001566172404302897, 0.003611208928045982]) (('lambda_', 0.0001),)
{'lambda_': 0.0001} [0.813536   0.71401707] [0.814134   0.71486012]
{'lambda_': 0.0002782559402207126} [0.812092   0.

## Degree 4<a class="anchor" id="deg4"></a>
[Back to top](#top)

In [242]:
# Degree search, degree 4: mean imputation, PolynomialFeature(degree=4),
# normalisation. No outlier removal is passed in the following cells.
pipeline = Pipeline(Imputer(replacenan='mean'),PolynomialFeature(degree=4), Normalizer())

In [243]:
# Degree 4: cross-validate logistic_regression with MultiPartitionCrossVal,
# without outlier removal.
# Prints validation means, validation stds and training means as [acc, f1].
for model in [logistic_regression]:
    crossval = MultiPartitionCrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX,pipeline, gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.809928   0.70746394] [0.00134287 0.00282417] [0.811144  0.7093707]


In [244]:
# Degree 4: grid-search lambda_ (10 log-spaced values in [1e-4, 1]) for
# reg_logistic_regression with MultiPartitionCrossVal, without outlier removal.
# NOTE(review): recorded best lambda_ is 1e-4 (grid edge).
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, MultiPartitionCrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline, gamma=1, early_stopping=True)
    print(scores, best_params)
    print("===============")

{'lambda_': 0.0001} [0.813052   0.71398582] [0.813845   0.71515177]
{'lambda_': 0.0002782559402207126} [0.812476   0.71315009] [0.813074   0.71395606]
{'lambda_': 0.000774263682681127} [0.810116   0.70797974] [0.811096   0.70951455]
{'lambda_': 0.002154434690031882} [0.80456    0.69875729] [0.805847   0.70086143]
{'lambda_': 0.005994842503189409} [0.796508  0.6847419] [0.797401   0.68620281]
{'lambda_': 0.016681005372000592} [0.787084   0.66841462] [0.787492   0.66899264]
{'lambda_': 0.046415888336127774} [0.77478    0.65092243] [0.775189  0.6511619]
{'lambda_': 0.12915496650148828} [0.756816   0.63726899] [0.757161   0.63794537]
{'lambda_': 0.3593813663804626} [0.711576  0.5964134] [0.71107    0.59597678]
{'lambda_': 1.0} [0.60398    0.41774552] [0.604066   0.41759048]
([0.8130520000000001, 0.7139858194691941], [0.0012612121153874262, 0.00373780125897389]) (('lambda_', 0.0001),)


## Degree 5<a class="anchor" id="deg5"></a>
[Back to top](#top)

In [239]:
# Degree search, degree 5: mean imputation, PolynomialFeature(degree=5),
# normalisation. No outlier removal is passed in the following cells.
pipeline = Pipeline(Imputer(replacenan='mean'), PolynomialFeature(degree=5), Normalizer())

In [240]:
# Degree 5: cross-validate logistic_regression with MultiPartitionCrossVal,
# without outlier removal.
# Prints validation means, validation stds and training means as [acc, f1].
for model in [logistic_regression]:
    crossval = MultiPartitionCrossVal(model,predict_labels,[acc_score, f1_score],refit=False)
    _,_, scores_mean, scores_std, train_scores_mean = crossval.fit(y, tX,pipeline,gamma=1, early_stopping=True)
    print(scores_mean, scores_std, train_scores_mean)

[0.810688   0.70920358] [0.00123188 0.00262929] [0.812125   0.71137924]


In [241]:
# Degree 5: grid-search lambda_ (10 log-spaced values in [1e-4, 1]) for
# reg_logistic_regression with MultiPartitionCrossVal, without outlier removal.
# NOTE(review): recorded best lambda_ is 1e-4 (grid edge).
lambda_grids = {'lambda_':np.logspace(-4, 0, 10).tolist()}
for model in [reg_logistic_regression]:
    gridsearch = GridSearchCV(model,predict_labels,[acc_score,f1_score],lambda_grids, MultiPartitionCrossVal, refit=False)
    _,_, scores, best_params, _ = gridsearch.fit(y, tX, pipeline, gamma=1, early_stopping=True)
    print(scores, best_params)
    print("===============")

{'lambda_': 0.0001} [0.813376   0.71465139] [0.814645   0.71650224]
{'lambda_': 0.0002782559402207126} [0.812256   0.71253257] [0.813495   0.71432638]
{'lambda_': 0.000774263682681127} [0.809368   0.70764561] [0.810864   0.71009459]
{'lambda_': 0.002154434690031882} [0.805232   0.69968277] [0.806103   0.70079968]
{'lambda_': 0.005994842503189409} [0.796764   0.68509711] [0.797646   0.68635812]
{'lambda_': 0.016681005372000592} [0.787164  0.6689665] [0.788083   0.67027612]
{'lambda_': 0.046415888336127774} [0.774196   0.65240601] [0.775012   0.65342794]
{'lambda_': 0.12915496650148828} [0.75556    0.63788034] [0.756044   0.63857053]
{'lambda_': 0.3593813663804626} [0.701744   0.58969104] [0.701707   0.58990584]
{'lambda_': 1.0} [0.607604   0.43040041] [0.607433   0.43003086]
([0.8133760000000001, 0.7146513930993174], [0.0012568786735401198, 0.003652745075022674]) (('lambda_', 0.0001),)


# Test cross features<a class="anchor" id="crossfeat"></a>
[Back to top](#top)

In [172]:
# Degree-3 polynomial expansion with cross_feat enabled, followed by the
# normalizer. No Imputer step is included in this pipeline.
pipeline = Pipeline(
    PolynomialFeature(degree=3, cross_feat=True),
    Normalizer(),
)

In [173]:
# Cross-validate plain logistic regression with the current `pipeline`;
# remove_outliers is supplied through the addition_on_train hook.
model = logistic_regression
crossval = MultiPartitionCrossVal(
    model,
    predict_labels,
    [acc_score, f1_score],
    refit=False,
)
# fit() yields five values; the first two are not needed here.
_, _, scores_mean, scores_std, train_scores_mean = crossval.fit(
    y,
    tX,
    pipeline,
    addition_on_train=remove_outliers,
    gamma=1,
    early_stopping=True,
)
print(scores_mean, scores_std, train_scores_mean)

[0.701196   0.61763487] [0.00852133 0.00777065] [0.79227946 0.67365436]


In [174]:
# Ten log-spaced regularisation strengths between 1e-4 and 1 (inclusive).
lambda_grids = dict(lambda_=np.logspace(-4, 0, num=10).tolist())

# Grid-search lambda_ for regularised logistic regression with the current
# `pipeline`; remove_outliers is supplied through the addition_on_train hook.
model = reg_logistic_regression
gridsearch = GridSearchCV(
    model,
    predict_labels,
    [acc_score, f1_score],
    lambda_grids,
    MultiPartitionCrossVal,
    refit=False,
)
# Only the third and fourth returned values (scores, best parameters) are kept.
_, _, scores, best_params, _ = gridsearch.fit(
    y,
    tX,
    pipeline,
    addition_on_train=remove_outliers,
    gamma=1,
    early_stopping=True,
)
print(scores, best_params)
print("===============")

{'lambda_': 0.0001} [0.721116   0.61040352] [0.75472871 0.59387318]
{'lambda_': 0.0002782559402207126} [0.717204   0.61266196] [0.75177774 0.61013748]
{'lambda_': 0.000774263682681127} [0.701676   0.59665187] [0.73425524 0.59312896]
{'lambda_': 0.002154434690031882} [0.703296  0.5930756] [0.7441308  0.60819716]
{'lambda_': 0.005994842503189409} [0.689908   0.57718048] [0.73059638 0.5891272 ]
{'lambda_': 0.016681005372000592} [0.6554     0.54786897] [0.70582058 0.576512  ]
{'lambda_': 0.046415888336127774} [0.648988   0.53377204] [0.68117336 0.51951268]
{'lambda_': 0.12915496650148828} [0.595616   0.49874908] [0.64193844 0.51493984]
{'lambda_': 0.3593813663804626} [0.539016   0.45122599] [0.57650176 0.46925994]
{'lambda_': 1.0} [0.55412    0.43351646] [0.53857923 0.38042379]
([0.7211160000000001, 0.6104035202362443], [0.01020421403146758, 0.01700496303751363]) (('lambda_', 0.0001),)


# Test non-linear transformation<a class="anchor" id="nonlinear"></a>
[Back to top](#top)

In [188]:
# Non-linear step built from two callables, sqrt(|x|) and sin, followed by a
# degree-3 polynomial expansion and the normalizer.
pipeline = Pipeline(
    NonLinearTransformer([lambda v: np.sqrt(np.abs(v)), np.sin]),
    PolynomialFeature(degree=3),
    Normalizer(),
)

In [189]:
# Cross-validate plain logistic regression with the current `pipeline`;
# remove_outliers is supplied through the addition_on_train hook.
model = logistic_regression
crossval = MultiPartitionCrossVal(
    model,
    predict_labels,
    [acc_score, f1_score],
    refit=False,
)
# fit() yields five values; the first two are not needed here.
_, _, scores_mean, scores_std, train_scores_mean = crossval.fit(
    y,
    tX,
    pipeline,
    addition_on_train=remove_outliers,
    gamma=1,
    early_stopping=True,
)
print(scores_mean, scores_std, train_scores_mean)

[0.812504   0.72334777] [0.00103347 0.00192255] [0.81489463 0.72716698]


In [190]:
# Ten log-spaced regularisation strengths between 1e-4 and 1 (inclusive).
lambda_grids = dict(lambda_=np.logspace(-4, 0, num=10).tolist())

# Grid-search lambda_ for regularised logistic regression with the current
# `pipeline`; remove_outliers is supplied through the addition_on_train hook.
model = reg_logistic_regression
gridsearch = GridSearchCV(
    model,
    predict_labels,
    [acc_score, f1_score],
    lambda_grids,
    MultiPartitionCrossVal,
    refit=False,
)
# Only the third and fourth returned values (scores, best parameters) are kept.
_, _, scores, best_params, _ = gridsearch.fit(
    y,
    tX,
    pipeline,
    addition_on_train=remove_outliers,
    gamma=1,
    early_stopping=True,
)
print(scores, best_params)
print("===============")

{'lambda_': 0.0001} [0.80854    0.71467158] [0.80875082 0.717147  ]
{'lambda_': 0.0002782559402207126} [0.80666    0.70937121] [0.80666107 0.71154452]
{'lambda_': 0.000774263682681127} [0.801616   0.70095647] [0.80160967 0.70316896]
{'lambda_': 0.002154434690031882} [0.7911     0.68782986] [0.7902298 0.6933037]
{'lambda_': 0.005994842503189409} [0.77596    0.67165008] [0.77606838 0.6793397 ]
{'lambda_': 0.016681005372000592} [0.764348   0.64192728] [0.76677595 0.64262441]
{'lambda_': 0.046415888336127774} [0.744092   0.62025099] [0.75417765 0.62983176]
{'lambda_': 0.12915496650148828} [0.693576   0.57847097] [0.71434349 0.60368109]
{'lambda_': 0.3593813663804626} [0.62362    0.52920554] [0.63937697 0.55349507]
{'lambda_': 1.0} [0.569052   0.41150758] [0.57437016 0.39015329]
([0.80854, 0.7146715809459194], [0.0008798181630314519, 0.006476959784086239]) (('lambda_', 0.0001),)


In [191]:
# Non-linear step built from two callables, sqrt(|x|) and sin, followed by a
# degree-2 polynomial expansion with cross_feat enabled and the normalizer.
pipeline = Pipeline(
    NonLinearTransformer([lambda v: np.sqrt(np.abs(v)), np.sin]),
    PolynomialFeature(degree=2, cross_feat=True),
    Normalizer(),
)

In [192]:
# Cross-validate plain logistic regression with the current `pipeline`;
# remove_outliers is supplied through the addition_on_train hook.
model = logistic_regression
crossval = MultiPartitionCrossVal(
    model,
    predict_labels,
    [acc_score, f1_score],
    refit=False,
)
# fit() yields five values; the first two are not needed here.
_, _, scores_mean, scores_std, train_scores_mean = crossval.fit(
    y,
    tX,
    pipeline,
    addition_on_train=remove_outliers,
    gamma=1,
    early_stopping=True,
)
print(scores_mean, scores_std, train_scores_mean)

[0.760368   0.67337469] [0.00253786 0.00364473] [0.80486955 0.71214998]


In [193]:
# Ten log-spaced regularisation strengths between 1e-4 and 1 (inclusive).
lambda_grids = dict(lambda_=np.logspace(-4, 0, num=10).tolist())

# Grid-search lambda_ for regularised logistic regression with the current
# `pipeline`; remove_outliers is supplied through the addition_on_train hook.
# NOTE(review): the recorded output below stops after four candidates with a
# KeyboardInterrupt, so this search has no saved final result.
model = reg_logistic_regression
gridsearch = GridSearchCV(
    model,
    predict_labels,
    [acc_score, f1_score],
    lambda_grids,
    MultiPartitionCrossVal,
    refit=False,
)
# Only the third and fourth returned values (scores, best parameters) are kept.
_, _, scores, best_params, _ = gridsearch.fit(
    y,
    tX,
    pipeline,
    addition_on_train=remove_outliers,
    gamma=1,
    early_stopping=True,
)
print(scores, best_params)
print("===============")

{'lambda_': 0.0001} [0.743892   0.64487943] [0.75502702 0.64869473]
{'lambda_': 0.0002782559402207126} [0.744136   0.64085874] [0.75641389 0.64617246]
{'lambda_': 0.000774263682681127} [0.747976  0.6421104] [0.75813889 0.64377433]
{'lambda_': 0.002154434690031882} [0.72592   0.6145374] [0.73299394 0.60620009]


KeyboardInterrupt: 