In [2]:
# TO RUN IT: set `my_path` (assigned further down in this cell) to your local project folder.
# Render matplotlib figures inline, directly in the notebook output.
%matplotlib inline
# Load IPython's autoreload extension and select mode 2, so that imported modules
# are reloaded before each cell executes (edits to the project's .py files are
# picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

# Project root on the local machine -- change this to match your own checkout.
import os
import sys
my_path = r'D:\Documents\etudes\epfl\MA1\cours\MachineLearning\Project1'

# Put <project root>/code/COMMON first on the module search path (presumably where
# the helper modules imported below live). The path is assembled with os.path.join
# instead of a hard-coded '\' so the entry is a valid directory path on any OS;
# on Windows the resulting string is the same as before.
sys.path.insert(0, os.path.join(my_path, 'code', 'COMMON'))

# imports
import numpy as np 
import matplotlib.pyplot as plt

from implementations import *
from labels import idx_2labels
from costs import *
from optimize_hyperparams import *
from cross_validation import *
from step_wise import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Build dataset

In [3]:
from proj1_helpers import load_csv_data

# Load the full training CSV (sub_sample=False).
# NOTE(review): the three returned values are presumably the labels, the feature
# matrix and the event ids -- confirm against proj1_helpers.load_csv_data.
# The file path uses '/' so that it matches the way the same file is addressed when
# the header is read later in this notebook, and so it also resolves on
# non-Windows systems (a literal '\' is only a separator on Windows).
y_raw, input_data_raw, ids = load_csv_data(my_path + '/data/train.csv', sub_sample=False)

In [4]:
from outliers import handle_outliers

# Clean the raw inputs: -999 is passed as the value to treat as an outlier and
# 'median' as the replacement strategy.
X0, y = handle_outliers(input_data_raw, y_raw, -999, 'median')

# Recode the labels in place so that y takes values in {0, 1} instead of {-1, 1}:
# every entry equal to -1 is overwritten with 0.
y[y == -1] = 0

-999 are replaced by the median value of the feature


In [5]:
# Read only the first row of the training CSV (the header) as strings ...
csv_header_row = np.genfromtxt(
    my_path + r'/data/train.csv',
    delimiter=",",
    dtype=str,
    max_rows=1,
)
# ... and keep everything after the first two columns as the raw feature names.
all_features_raw = list(csv_header_row[2:])

In [20]:
# Highest power of each raw feature to include in the candidate set.
degree = 1

# Polynomial expansion of the cleaned inputs up to `degree`.
phi = build_poly(X0, degree)

# Candidate matrix: every column of phi except the first one.
# NOTE(review): presumably column 0 is the constant term added by build_poly -- confirm.
all_candidates = phi[:, 1:]

# One label per (feature, power) pair, iterating over features first and over the
# powers 1..degree second.
# NOTE(review): for degree > 1 these labels line up with the columns of phi only if
# build_poly also groups its columns feature by feature -- verify before raising
# `degree` (with degree == 1 the order is unambiguous).
all_features = [
    raw_name + '_power_' + str(power)
    for raw_name in all_features_raw
    for power in range(1, degree + 1)
]

In [21]:
# Show the candidate matrix dimensions, then the generated feature labels (one per line).
print(all_candidates.shape, all_features, sep='\n')

(250000, 30)
['DER_mass_MMC_power_1', 'DER_mass_transverse_met_lep_power_1', 'DER_mass_vis_power_1', 'DER_pt_h_power_1', 'DER_deltaeta_jet_jet_power_1', 'DER_mass_jet_jet_power_1', 'DER_prodeta_jet_jet_power_1', 'DER_deltar_tau_lep_power_1', 'DER_pt_tot_power_1', 'DER_sum_pt_power_1', 'DER_pt_ratio_lep_tau_power_1', 'DER_met_phi_centrality_power_1', 'DER_lep_eta_centrality_power_1', 'PRI_tau_pt_power_1', 'PRI_tau_eta_power_1', 'PRI_tau_phi_power_1', 'PRI_lep_pt_power_1', 'PRI_lep_eta_power_1', 'PRI_lep_phi_power_1', 'PRI_met_power_1', 'PRI_met_phi_power_1', 'PRI_met_sumet_power_1', 'PRI_jet_num_power_1', 'PRI_jet_leading_pt_power_1', 'PRI_jet_leading_eta_power_1', 'PRI_jet_leading_phi_power_1', 'PRI_jet_subleading_pt_power_1', 'PRI_jet_subleading_eta_power_1', 'PRI_jet_subleading_phi_power_1', 'PRI_jet_all_pt_power_1']


In [22]:
# adding logarithmic transformation of masses
all_candidates = np.concatenate((all_candidates, np.log(1+np.abs(X0[:,0])).reshape(len(X0),1)),axis =1)
all_candidates = np.concatenate((all_candidates, np.log(1+np.abs(X0[:,1])).reshape(len(X0),1)),axis =1)
all_candidates = np.concatenate((all_candidates, np.log(1+np.abs(X0[:,2])).reshape(len(X0),1)),axis =1)
all_candidates = np.concatenate((all_candidates, np.log(1+np.abs(X0[:,5])).reshape(len(X0),1)),axis =1)

# adding logarithm names
all_features.append('log(DER_mass_MMC)')
all_features.append('log(DER_mass_transverse_met_lep)')
all_features.append('log(DER_mass_vis)')
all_features.append('log(DER_mass_jet_jet)')

In [23]:
# Standardize the candidate matrix; the result overwrites `all_candidates`.
# standardize() returns three values and only the first is kept here.
# NOTE(review): the two discarded values are presumably the statistics used for the
# scaling (e.g. mean and std); they would be needed to apply the same scaling to
# another data set -- confirm against the helper's definition.
all_candidates,_ ,_ = standardize(all_candidates) 

In [25]:
# Pair every candidate label with its column position: [(0, name_0), (1, name_1), ...]
features = list(enumerate(all_features))

# Quick visual check: candidate matrix shape, the raw feature names, and the
# number and content of the (index, name) pairs.
print(all_candidates.shape)
print(all_features_raw)
print(len(features))
print(features)

(250000, 34)
['DER_mass_MMC', 'DER_mass_transverse_met_lep', 'DER_mass_vis', 'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_deltar_tau_lep', 'DER_pt_tot', 'DER_sum_pt', 'DER_pt_ratio_lep_tau', 'DER_met_phi_centrality', 'DER_lep_eta_centrality', 'PRI_tau_pt', 'PRI_tau_eta', 'PRI_tau_phi', 'PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_met', 'PRI_met_phi', 'PRI_met_sumet', 'PRI_jet_num', 'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'PRI_jet_all_pt']
34
[(0, 'DER_mass_MMC_power_1'), (1, 'DER_mass_transverse_met_lep_power_1'), (2, 'DER_mass_vis_power_1'), (3, 'DER_pt_h_power_1'), (4, 'DER_deltaeta_jet_jet_power_1'), (5, 'DER_mass_jet_jet_power_1'), (6, 'DER_prodeta_jet_jet_power_1'), (7, 'DER_deltar_tau_lep_power_1'), (8, 'DER_pt_tot_power_1'), (9, 'DER_sum_pt_power_1'), (10, 'DER_pt_ratio_lep_tau_power_1'), (11, 'DER_met_phi_centrality_power_1'), (12, 

# Logistic Regression (no penalization)

In [None]:
# Settings for the step-wise selection, gathered in one dict.
model = {
    # model parameters
    # 'lr' = logistic regression without penalization; use 'lrr' when it is
    # regularized with a penalization 'lambda_'
    'method': 'lr',
    'loss': 'loglikelihood',

    # model hyperparameter
    'gamma': 1e-5,

    # other
    # 'gd' (gradient descent) or 'newton' (Newton descent leads to a
    # non-invertible matrix "S", which is too big)
    'method_minimization': 'gd',
    'threshold': 1,
    'max_iters': 10000,
    'debug_mode': 1,

    # fold count, presumably used when the cross validation below is switched on
    'k_fold': 10,
}

# R2 type: 'loss', 'Tjur' or 'McFadden'
R2_method = 'McFadden'

# estimate the R2 error through cross validation (1) or not (0)
cv = 1

In [None]:
# Run the step-wise selection with the configuration defined above, the standardized
# candidate matrix, the (index, name) feature list, the {0, 1} labels and the
# cross-validation switch.
# NOTE(review): going by the variable names, the call returns the adjusted-R2 values
# and the indices of the selected features -- confirm against step_wise.stepwise.
best_R2adj, idx_features = stepwise(model, R2_method, all_candidates, features, y, cv)

In [None]:
# Display the selected features; the last entry of best_R2adj is left out of the report.
results_r2_stepwise(best_R2adj[:-1], idx_features)