# Features selection with STEPWISE

In [1]:
import sys
my_path = r'/home/ilaria/Scrivania/Machine_Learning/Project_1/Project1_ML'
sys.path.insert(0,my_path + r'/code/COMMON')

import numpy as np 
import matplotlib.pyplot as plt
from proj1_helpers import load_csv_data, predict_labels 
from implementations import *
from outliers import handle_outliers
from labels import idx_2labels
from standard import standardize

In [2]:
yb, input_data, ids = load_csv_data(my_path + r'/data/train.csv', sub_sample=False)

In [3]:
input_data.shape

(250000, 30)

In [4]:
input_data, Y = handle_outliers(input_data,yb,-999,'mean') # substiution with mean because the standardization
                                                           #can be affected, otherwise we should delete the whole row
ind_back, ind_sig = idx_2labels(Y, [-1,1])

input_data, mean_X, std_X = standardize(input_data)    

-999 are replaced by the mean value of the feature


In [5]:
# Subdived the X features space in single features
all_features = np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows = 1)[2:]
# converting array in list in order to simplify the adding of features
all_features = list(all_features)

features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))


## R^2  as stopping criteria : R2 with error , R2 of Tjur or R2 of McFadden

The only way to use the stepwise is using R2 of Tjur or McFadden because of the binary values of the indipendent variable, but the error was also used 

In [6]:
def results_r2_stepwise(list_r2_adj,indices_features):
    print("R2 asjusted values:")
    
    for i in range(len(list_r2_adj)):
        print(list_r2_adj[i])
    print("-------------------------------------------------------")
    print("Number of features chosen:", len(indices_features))
    print("\n")
    print("Indices of features chosen: ", indices_features)
    

## Least square model

### No cross validation

#### Use of the error before binarization (R2 with loss)

In [7]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = least_squares(Y,X)  # The loss cannot be used as a measure for the feature selection because it's a 
y = predict_labels(w0, X)

sse = loss
sst = np.sum((Y - Y.mean())**2)  #lack of information
R2 = np.abs((sst-sse)/sst)
R2adj_0 = R2 - (k/(n-k-1)*(1-R2))

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        ws , loss = least_squares(Y,X)
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        #y = predict_labels(ws,X)   #***** NO USE OF PREDICTION
        SSE = loss
        SST = np.sum((Y- Y.mean())**2)
        R2 = np.abs((SST-SSE)/SST)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break
        

-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_deltaeta_jet_jet (index : 4 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  PRI_lep_pt (index : 16 )
-------------------------------------------------
Feature chosen:  DER_pt_ratio_lep_tau (index : 10 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_lep_eta_centrality (index : 12 )
----------------

In [8]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.999998246996
0.999998316254
0.999998366571
0.999998394757
0.99999840969
0.999998424917
0.99999844367
0.999998459265
0.999998468735
0.999998474896
0.99999847968
0.999998482105
0.999998484408
0.999998486981
0.99999848737
0.999998488309
0.999998488373
0.999998488384
0.99999848839
0.999998488392
0.999998488393
-------------------------------------------------------
Number of features chosen: 21


Indices of features chosen:  [1, 13, 4, 11, 7, 2, 16, 10, 19, 12, 23, 8, 5, 26, 22, 21, 0, 18, 6, 3, 28]


#### Use of the probability of the 2 events (R2 Tjur)

In [10]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))


In [11]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = least_squares(Y,X) 
y = predict_labels(w0, X)
ind_back, ind_sig = idx_2labels(y, [-1,1])

y_ = X.dot(w0)
R2 = 0
R2adj_0 = R2 - (k/(n-k-1)*(1-R2))

#fix the R2adj_max
R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        ws , loss = least_squares(Y,X)
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        y = predict_labels(ws,X)          
        ind_back, ind_sig = idx_2labels(y, [-1,1])
        
        if len(ind_sig) == 0 or len(ind_back) ==0:
            print('No signal detected')
            R2_adj.append(0)
            
        else: 
            
            y_ = X.dot(ws)

            R2 = np.abs((np.mean(y_[ind_sig]) - np.mean(y_[ind_back])))
            R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
            
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)

    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen

        #idx_features.append(np.where(all_candidates[:,ind_max] == input_data))
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)

        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])

        del(X)

    else:
        break

No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
-------------------------------------------------
Feature chosen:  DER_mass_jet_jet (index : 5 )
-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_pt_h (index : 3 )
-------------------------------------------------
Feature chosen:  PRI_jet_subleading_pt (index : 26 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
---------------

In [12]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.6926161766
0.728794138821
0.761245047368
0.779841035607
0.788068513396
0.792258513061
0.795905022014
0.796761064607
0.796904922471
0.797161806573
0.797239276567
0.797263761998
-------------------------------------------------------
Number of features chosen: 12


Indices of features chosen:  [5, 13, 3, 26, 2, 19, 7, 29, 8, 28, 27, 24]


#### Use of the likelihood (McFadden)

In [13]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))
   

In [14]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = least_squares(Y,X)  # The loss cannot be used as a measure for the feature selection because it's a 
y = predict_labels(w0, X)
loglike0 = np.sum(np.log(1+np.exp(X.dot(w0))) - y*(X.dot(w0)))

R2 = 0        # For the definition of McFadden 1-1 = 0
R2adj_0 = 0

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        ws , loss = least_squares(Y,X)
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        y = predict_labels(ws,X)   
        
        loglike = np.sum(np.log(1+np.exp(X.dot(ws))) - y*(X.dot(ws)))
        
        R2 = 1-(loglike/loglike0)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break
        

-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  DER_deltaeta_jet_jet (index : 4 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )
-------------------------------------------------
Feature chosen:  DER_lep_eta_centrality (index : 12 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_pt_h (index : 3 )
-------------------------------------------------
Feature chosen:  PRI_lep_pt (index : 16 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
-----------------------------

In [15]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.144851442193
0.245175579312
0.352166652978
0.411696418459
0.424697884095
0.432275696692
0.443221001254
0.473450393588
0.477523535661
0.48312018156
0.490469990509
0.492634836048
0.494301061305
0.495636490642
0.496260491241
0.496303686644
0.496337978842
0.496353401017
0.496363839013
0.496372915688
0.496373777213
0.496375121667
0.496376035883
0.496376218564
-------------------------------------------------------
Number of features chosen: 24


Indices of features chosen:  [13, 1, 4, 11, 12, 19, 3, 16, 2, 7, 23, 8, 22, 21, 5, 18, 0, 20, 17, 28, 27, 24, 14, 15]


### Using cross validation

I am not sure that cross validation can be used we don't have to estimate any hyperparameters

#### R2 with likelihood (McFadden)

In [11]:
from split_data import split_data

sys.path.insert(0,my_path + r'/code/ilaria')
from i_cross_validation_methods import *


In [12]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))

In [18]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = least_squares(Y,X)  # The loss cannot be used as a measure for the feature selection because it's a 
y = predict_labels(w0, X)
loglike0 = np.sum(np.log(1+np.exp(X.dot(w0))) - y*(X.dot(w0)))

R2 = 0        # For the definition of McFadden 1-1 = 0
R2adj_0 = 0

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        
        # CROSS-VALIDATION
        
        w_tr_tot, loss_tr_tot, loss_te_tot = cross_validation_ls(Y,X)
        ws = w_tr_tot[np.argmin(loss_te_tot)]
        
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        y = predict_labels(ws,X)   
        
        loglike = np.sum(np.log(1+np.exp(X.dot(ws))) - y*(X.dot(ws)))
        
        R2 = 1-(loglike/loglike0)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break

-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  DER_deltaeta_jet_jet (index : 4 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )
-------------------------------------------------
Feature chosen:  DER_lep_eta_centrality (index : 12 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_pt_h (index : 3 )
-------------------------------------------------
Feature chosen:  PRI_lep_pt (index : 16 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
-----------------------------

In [19]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.140551720143
0.241171400994
0.34855071277
0.408699253641
0.421523878725
0.429019886186
0.440216159411
0.469680633536
0.473784812642
0.479594192023
0.486825612512
0.491425220535
0.493145754342
0.494560586668
0.494838404348
0.494894893433
0.494942204952
0.49498971502
0.495234470107
0.495273038038
0.49528255555
0.495288753521
0.495294129816
0.495297050008
0.495297102825
-------------------------------------------------------
Number of features chosen: 25


Indices of features chosen:  [13, 1, 4, 11, 12, 19, 3, 16, 2, 7, 23, 8, 22, 21, 6, 18, 0, 26, 5, 20, 28, 17, 24, 15, 27]


## Ridge regression

I used only lambda as hyperparameter because I am not building a polynomial model. But we can add different features transformation (square or log or power), so maybe we can test the polynomial after the best features are selected 

### No cross validation

In [29]:
# set lambda
lambda_ = 0.1

#### Using the loss from ridge regression

In [30]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))


In [31]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = ridge_regression(Y,X,0)  # start with lambda = 0  
y = predict_labels(w0, X)

sse = loss
sst = np.sum((Y - Y.mean())**2)  #lack of information
R2 = np.abs((sst-sse)/sst)
R2adj_0 = R2 - (k/(n-k-1)*(1-R2))

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        
        ws, loss = ridge_regression(Y,X,lambda_)
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        SSE = loss
        SST = np.sum((Y- Y.mean())**2)   # it has no sense
        R2 = np.abs((SST-SSE)/SST)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break
        

-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  DER_mass_jet_jet (index : 5 )
-------------------------------------------------
Feature chosen:  DER_pt_h (index : 3 )
-------------------------------------------------
Feature chosen:  PRI_jet_leading_pt (index : 23 )
-------------------------------------------------
Feature chosen:  DER_sum_pt (index : 9 )


In [32]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.999998071398
0.999998125118
0.999998156156
0.99999816116
0.999998162901
-------------------------------------------------------
Number of features chosen: 5


Indices of features chosen:  [1, 5, 3, 23, 9]


#### Use of the probability of the 2 events (R2 Tjur)

In [33]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))

In [34]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = ridge_regression(Y,X,0)  
y = predict_labels(w0, X)
ind_back, ind_sig = idx_2labels(y, [-1,1])

y_ = X.dot(w0)
R2 = 0
R2adj_0 = R2 - (k/(n-k-1)*(1-R2))

#fix the R2adj_max
R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        ws, loss = ridge_regression(Y,X,lambda_)
        
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        y = predict_labels(ws,X)          
        ind_back, ind_sig = idx_2labels(y, [-1,1])
        
        if len(ind_sig) == 0 or len(ind_back) ==0:
            print('No signal detected')
            R2_adj.append(0)
            
        else: 
            
            y_ = X.dot(ws)

            R2 = np.abs((np.mean(y_[ind_sig]) - np.mean(y_[ind_back])))
            R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
            
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)

    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen

        #idx_features.append(np.where(all_candidates[:,ind_max] == input_data))
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)

        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])

        del(X)

    else:
        break

No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
-------------------------------------------------
Feature chosen:  DER_mass_jet_jet (index : 5 )
-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_pt_h (index : 3 )
-------------------------------------------------
Feature chosen:  PRI_jet_subleading_pt (index : 26 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
---------------

In [35]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.692616156508
0.728792953194
0.76124434214
0.779840055236
0.788067405044
0.792257399157
0.796829855419
0.79745084454
0.797887020756
0.798245461927
0.798280198671
-------------------------------------------------------
Number of features chosen: 11


Indices of features chosen:  [5, 13, 3, 26, 2, 19, 7, 29, 8, 28, 24]


#### Use of the likelihood (McFadden)

In [36]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))

In [37]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = ridge_regression(Y,X,0)  # lambda set to 0 
y = predict_labels(w0, X)
loglike0 = np.sum(np.log(1+np.exp(X.dot(w0))) - y*(X.dot(w0)))

R2 = 0        # For the definition of McFadden 1-1 = 0
R2adj_0 = 0

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        #CROSS VALIDATION
        ws, loss = ridge_regression(Y,X,lambda_)
        
        y = predict_labels(ws,X)   
        
        loglike = np.sum(np.log(1+np.exp(X.dot(ws))) - y*(X.dot(ws)))
        
        R2 = 1-(loglike/loglike0)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break
        

-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  DER_deltaeta_jet_jet (index : 4 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )
-------------------------------------------------
Feature chosen:  DER_lep_eta_centrality (index : 12 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_pt_h (index : 3 )
-------------------------------------------------
Feature chosen:  PRI_lep_pt (index : 16 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
-----------------------------

In [38]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.144853348482
0.245180002335
0.351918698681
0.411338325947
0.423331940403
0.43097057345
0.441893631956
0.472160646904
0.476281211645
0.481808467415
0.489134303934
0.491353088345
0.492962043085
0.494275999895
0.494839320244
0.494884261023
0.494922206234
0.494939710937
0.494952765391
0.494960315292
0.494962799368
0.494966276145
0.49496939301
0.494971976117
0.494972314289
-------------------------------------------------------
Number of features chosen: 25


Indices of features chosen:  [13, 1, 4, 11, 12, 19, 3, 16, 2, 7, 23, 8, 22, 21, 5, 18, 0, 20, 17, 28, 27, 24, 14, 15, 25]


### With cross validation (it's needed in order to choose the best hyperparameter lambda)

#### Using the loss of ridge regression as the error of R2

In [9]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))


In [13]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = ridge_regression(Y,X,0)  # start with lambda = 0  
y = predict_labels(w0, X)

sse = loss
sst = np.sum((Y - Y.mean())**2)  #lack of information
R2 = np.abs((sst-sse)/sst)
R2adj_0 = R2 - (k/(n-k-1)*(1-R2))

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        #CROSS VALIDATION
        w_tr_tot, loss_tr_tot, loss_te_tot = cross_validation_rr(Y,X)
        
        ws = w_tr_tot[np.argmin(loss_te_tot)]
        loss = np.min(loss_te_tot)
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        SSE = loss
        SST = np.sum((Y- Y.mean())**2)
        R2 = np.abs((SST-SSE)/SST)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break
        

In [14]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
-------------------------------------------------------
Number of features chosen: 0


Indices of features chosen:  []


#### Use of the probability of the 2 events (R2 Tjur)

In [21]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))

In [22]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = ridge_regression(Y,X,0)  
y = predict_labels(w0, X)
ind_back, ind_sig = idx_2labels(y, [-1,1])

y_ = X.dot(w0)
R2 = 0
R2adj_0 = R2 - (k/(n-k-1)*(1-R2))

#fix the R2adj_max
R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        #CROSS VALIDATION
        w_tr_tot, loss_tr_tot, loss_te_tot = cross_validation_rr(Y,X)
        ws = w_tr_tot[np.argmin(loss_te_tot)]
        
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        y = predict_labels(ws,X)          
        ind_back, ind_sig = idx_2labels(y, [-1,1])
        
        if len(ind_sig) == 0 or len(ind_back) ==0:
            print('No signal detected')
            R2_adj.append(0)
            
        else: 
            
            y_ = X.dot(ws)

            R2 = np.abs((np.mean(y_[ind_sig]) - np.mean(y_[ind_back])))
            R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
            
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)

    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen

        #idx_features.append(np.where(all_candidates[:,ind_max] == input_data))
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)

        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])

        del(X)

    else:
        break

No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
-------------------------------------------------
Feature chosen:  DER_mass_jet_jet (index : 5 )
-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_pt_h (index : 3 )
-------------------------------------------------
Feature chosen:  PRI_jet_subleading_pt (index : 26 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
---------------

In [23]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.692583907083
0.728802362346
0.761239966883
0.779850696222
0.788063711181
0.792264545618
0.795889920268
0.796756354309
0.7969376439
0.797170043648
0.797259734087
0.797271889943
-------------------------------------------------------
Number of features chosen: 12


Indices of features chosen:  [5, 13, 3, 26, 2, 19, 7, 29, 8, 28, 27, 24]


#### Use of the likelihood (McFadden)

In [24]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))

In [26]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = ridge_regression(Y,X,0)  # lambda set to 0 
y = predict_labels(w0, X)
loglike0 = np.sum(np.log(1+np.exp(X.dot(w0))) - y*(X.dot(w0)))

R2 = 0        # For the definition of McFadden 1-1 = 0
R2adj_0 = 0

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        #CROSS VALIDATION
        w_tr_tot, loss_tr_tot, loss_te_tot = cross_validation_rr(Y,X)
        ws = w_tr_tot[np.argmin(loss_te_tot)]
        
        y = predict_labels(ws,X)   
        
        loglike = np.sum(np.log(1+np.exp(X.dot(ws))) - y*(X.dot(ws)))
        
        R2 = 1-(loglike/loglike0)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break
        

-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  DER_deltaeta_jet_jet (index : 4 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )
-------------------------------------------------
Feature chosen:  DER_lep_eta_centrality (index : 12 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_pt_h (index : 3 )
-------------------------------------------------
Feature chosen:  PRI_lep_pt (index : 16 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
-----------------------------

In [27]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.144857765242
0.245184572292
0.352177585943
0.411708293799
0.424710904491
0.432292091945
0.443241438144
0.47347719231
0.477550694143
0.483149352784
0.4905044717
0.4926683267
0.494334962769
0.495673376908
0.496299873075
0.496344537618
0.496380190427
0.496397587194
0.496409721352
0.496420779729
0.496423639692
0.49642657291
0.496428791606
0.496430850605
0.496431354733
-------------------------------------------------------
Number of features chosen: 25


Indices of features chosen:  [13, 1, 4, 11, 12, 19, 3, 16, 2, 7, 23, 8, 22, 21, 5, 18, 0, 20, 17, 28, 27, 24, 15, 14, 25]


## Logistic regression

### No cross validation (hyperparameter fixed)

In [28]:
# Initialization
max_iters = 100
threshold = 1e-8
gamma = 0.1
method = 'gd'

#### Use of the likelihood (McFadden)

I used only this method because it's the most reasonable

In [None]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))

In [None]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

initial_w = np.zeros((X.shape[1], 1))

w0, loss = logistic_regression(Y,X, initial_w, max_iters, gamma, method)
y = predict_labels(w0, X)
loglike0 = np.sum(np.log(1+np.exp(X.dot(w0))) - y*(X.dot(w0)))

R2 = 0        # For the definition of McFadden 1-1 = 0
R2adj_0 = 0

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        #CROSS VALIDATION
        
        initial_w = np.zeros((X.shape[1], 1))
        w_tr_tot, loss_tr_tot, loss_te_tot = cross_validation_lr()
        ws = w_tr_tot[np.argmin(loss_te_tot)]
        
        y = predict_labels(ws,X)   
        
        loglike = np.sum(np.log(1+np.exp(X.dot(ws))) - y*(X.dot(ws)))
        
        R2 = 1-(loglike/loglike0)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break

In [None]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

### With cross correlation

#### Use of the likelihood (McFadden)