# Features selection with STEPWISE

In [3]:
import sys
my_path = r'C:\Users\utente\Documents\GitHub\Project1_ML'
sys.path.insert(0,my_path + r'\code\COMMON')

import numpy as np 
import matplotlib.pyplot as plt
from proj1_helpers import load_csv_data, predict_labels 
from implementations import *
from outliers import handle_outliers
from labels import idx_2labels
from standard import standardize
from costs import compute_loglikelihood_reg
from optimize_hyperparams import *
from cross_validation import *

## Load data and outliers management

In [4]:
yb, input_data, ids = load_csv_data(my_path + r'/data/train.csv', sub_sample=False)

In [5]:
input_data.shape

(250000, 30)

In [6]:
input_data, Y = handle_outliers(input_data,yb,-999,'mean') # substiution with mean because the standardization
                                                           #can be affected, otherwise we should delete the whole row
ind_back, ind_sig = idx_2labels(Y, [-1,1])
Y[ind_back] = 0

input_data, mean_X, std_X = standardize(input_data)    

-999 are replaced by the mean value of the feature


In [7]:
# Subdived the X features space in single features
all_features = np.genfromtxt(my_path + r'/data/train.csv', delimiter=",", dtype=str, max_rows = 1)[2:]
# converting array in list in order to simplify the adding of features
all_features = list(all_features)

features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))


## R^2  as stopping criteria : 1) R2 with error , 2) R2 of Tjur or 3) R2 of McFadden

The only way to use the stepwise is using R2 of Tjur or McFadden because of the binary values of the indipendent variable, but the error was also used 

In [8]:
def results_r2_stepwise(list_r2_adj,indices_features):
    print("R2 asjusted values:")
    
    for i in range(len(list_r2_adj)):
        print(list_r2_adj[i])
    print("-------------------------------------------------------")
    print("Number of features chosen:", len(indices_features))
    print("\n")
    print("Indices of features chosen: ", indices_features)
    

## A) Least square model

### No cross validation (NC)

#### 1) NC: Use of the error before binarization (R2 with loss)

In [9]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = least_squares(Y,X)  # The loss cannot be used as a measure for the feature selection because it's a 
y = predict_labels(w0, X)

sse = loss
sst = np.sum((Y - Y.mean())**2)  #lack of information
R2 = np.abs((sst-sse)/sst)
R2adj_0 = R2 - (k/(n-k-1)*(1-R2))

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        ws , loss = least_squares(Y,X)
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        #y = predict_labels(ws,X)   #***** NO USE OF PREDICTION
        SSE = loss
        SST = np.sum((Y- Y.mean())**2)
        R2 = np.abs((SST-SSE)/SST)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break
        

-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_deltaeta_jet_jet (index : 4 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  PRI_lep_pt (index : 16 )
-------------------------------------------------
Feature chosen:  DER_pt_ratio_lep_tau (index : 10 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_lep_eta_centrality (index : 12 )
----------------

In [10]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.999998246996
0.999998316254
0.999998366571
0.999998394757
0.99999840969
0.999998424917
0.99999844367
0.999998459265
0.999998468735
0.999998474896
0.99999847968
0.999998482105
0.999998484408
0.999998486981
0.99999848737
0.999998488309
0.999998488373
0.999998488384
0.99999848839
0.999998488392
0.999998488393
-------------------------------------------------------
Number of features chosen: 21


Indices of features chosen:  [1, 13, 4, 11, 7, 2, 16, 10, 19, 12, 23, 8, 5, 26, 22, 21, 0, 18, 6, 3, 28]


#### 2) NC: Use of the probability of the 2 events (R2 Tjur)

In [11]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))


In [12]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = least_squares(Y,X) 
y = predict_labels(w0, X)
ind_back, ind_sig = idx_2labels(y, [0,1])

y_ = X.dot(w0)
R2 = 0
R2adj_0 = R2 - (k/(n-k-1)*(1-R2))

#fix the R2adj_max
R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        ws , loss = least_squares(Y,X)
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        y = predict_labels(ws,X)          
        ind_back, ind_sig = idx_2labels(y, [0,1])
        
        if len(ind_sig) == 0 or len(ind_back) ==0:
            print('No signal detected')
            R2_adj.append(0)
            
        else: 
            
            y_ = X.dot(ws)

            R2 = np.abs((np.mean(y_[ind_sig]) - np.mean(y_[ind_back])))
            R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
            
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)

    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen

        #idx_features.append(np.where(all_candidates[:,ind_max] == input_data))
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)

        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])

        del(X)

    else:
        break

No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
-------------------------------------------------
Feature chosen:  DER_mass_jet_jet (index : 5 )
-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_pt_h (index : 3 )
-------------------------------------------------
Feature chosen:  PRI_jet_subleading_pt (index : 26 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
---------------

In [13]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.346306088284
0.364393069363
0.380616523588
0.389912517644
0.394024256458
0.396117256195
0.397938510559
0.398364531728
0.398434460516
0.398560902406
0.398597637228
0.398607879751
-------------------------------------------------------
Number of features chosen: 12


Indices of features chosen:  [5, 13, 3, 26, 2, 19, 7, 29, 8, 28, 27, 24]


#### 3) NC: Use of the likelihood (McFadden)

In [14]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))
   

In [15]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = least_squares(Y,X)  # The loss cannot be used as a measure for the feature selection because it's a 
y = predict_labels(w0, X)
loglike0 = compute_loglikelihood_reg(y,X,w0)#np.sum(np.log(1+np.exp(X.dot(w0))) - y*(X.dot(w0)))

R2 = 0        # For the definition of McFadden 1-1 = 0
R2adj_0 = 0

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        ws , loss = least_squares(Y,X)
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        y = predict_labels(ws,X)   
        
        loglike = compute_loglikelihood_reg(y,X,ws) #np.sum(np.log(1+np.exp(X.dot(ws))) - y*(X.dot(ws)))
        
        R2 = 1-(loglike/loglike0)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break
        

-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )
-------------------------------------------------
Feature chosen:  DER_sum_pt (index : 9 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  PRI_lep_phi (index : 18 )
-------------------------------------------------
Feature chosen:  PRI_jet_subleading_phi (index : 28 )
-------------------------------------------------
Feature chosen:  PRI_jet_leading_eta (index : 24 )


In [16]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.127199992503
0.161949175423
0.162775033532
0.163218892521
0.163296584222
0.163326074239
0.163331726027
-------------------------------------------------------
Number of features chosen: 7


Indices of features chosen:  [1, 11, 9, 2, 18, 28, 24]


In [17]:
loglike/loglike0

0.8376553003945949

### Using cross validation (C)

I am not sure that cross validation can be used we don't have to estimate any hyperparameters

#### 3) C: R2 with likelihood (McFadden)

In [18]:
from split_data import split_data

In [19]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))

In [20]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = least_squares(Y,X)  # The loss cannot be used as a measure for the feature selection because it's a 
y = predict_labels(w0, X)
loglike0 = np.sum(np.log(1+np.exp(X.dot(w0))) - y*(X.dot(w0)))

R2 = 0   # For the definition of McFadden 1-1 = 0
R2adj_0 = 0

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

# parameters for cross validation
arg_ls = dict()
arg_ls['method'] = 'ls'
arg_ls['loss'] = 'rmse'
arg_ls['k_fold'] = 10


for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        
        # CROSS-VALIDATION
        
        w_tr_tot, loss_tr_tot, loss_te_tot = cross_validation(Y, X, arg_ls)
        
        ws = w_tr_tot[np.argmin(loss_te_tot)]
        
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        y = predict_labels(ws,X)   
        
        loglike = np.sum(np.log(1+np.exp(X.dot(ws))) - y*(X.dot(ws)))
        
        R2 = 1-(loglike/loglike0)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break

-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )
-------------------------------------------------
Feature chosen:  DER_sum_pt (index : 9 )
-------------------------------------------------
Feature chosen:  PRI_jet_all_pt (index : 29 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_pt_h (index : 3 )
-------------------------------------------------
Feature chosen:  DER_lep_eta_centrality (index : 12 )
-------------------------------------------------
Feature chosen:  DER_mass_MMC (index : 0 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
-------------------------------------------------
Feature chosen:  DER_pt_ratio_lep_tau (index : 10 )
-------------------------

In [21]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.127029003781
0.162219139573
0.163016309637
0.16354554708
0.167617483869
0.172882697354
0.174062939737
0.174659663743
0.175157077078
0.176255853643
0.177589693328
0.17903001667
0.180289605671
0.180653688622
0.180673303474
0.18068436144
-------------------------------------------------------
Number of features chosen: 16


Indices of features chosen:  [1, 11, 9, 29, 19, 3, 12, 0, 7, 10, 2, 16, 23, 21, 15, 13]


# B) Ridge regression

I used only lambda as hyperparameter because I am not building a polynomial model. But we can add different features transformation (square or log or power), so maybe we can test the polynomial after the best features are selected 

## No cross validation (NC)

In [25]:
# set lambda
lambda_ = 0.05

### 1) (NC) Using the loss from ridge regression

In [26]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))


In [27]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = ridge_regression(Y,X,0)  # start with lambda = 0  
y = predict_labels(w0, X)

sse = loss
sst = np.sum((Y - Y.mean())**2)  #lack of information
R2 = np.abs((sst-sse)/sst)
R2adj_0 = R2 - (k/(n-k-1)*(1-R2))

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        
        ws, loss = ridge_regression(Y,X,lambda_)
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        SSE = loss
        SST = np.sum((Y- Y.mean())**2)   # it has no sense
        R2 = np.abs((SST-SSE)/SST)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break
        

-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_deltaeta_jet_jet (index : 4 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
-------------------------------------------------
Feature chosen:  DER_lep_eta_centrality (index : 12 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  DER_mass_jet_jet (index : 5 )
-------------------------------------------------
Feature chosen:  PRI_jet_subleading_pt (index : 26 )
----------

In [28]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.999998118035
0.999998183913
0.999998230611
0.999998262345
0.999998274162
0.999998283279
0.999998287416
0.999998291185
0.999998293376
0.999998296691
0.999998297975
0.999998299392
0.999998299686
0.999998299704
0.999998299707
-------------------------------------------------------
Number of features chosen: 15


Indices of features chosen:  [1, 13, 4, 11, 7, 12, 19, 2, 5, 26, 8, 3, 22, 18, 15]


### 2) (NC) Use of the probability of the 2 events (R2 Tjur)

In [47]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))

In [29]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = ridge_regression(Y,X,0)  
y = predict_labels(w0, X)
ind_back, ind_sig = idx_2labels(y, [0,1])

y_ = X.dot(w0)
R2 = 0
R2adj_0 = R2 - (k/(n-k-1)*(1-R2))

#fix the R2adj_max
R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        ws, loss = ridge_regression(Y,X,lambda_)
        
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        y = predict_labels(ws,X)          
        ind_back, ind_sig = idx_2labels(y, [-1,1])
        
        if len(ind_sig) == 0 or len(ind_back) ==0:
            print('No signal detected')
            R2_adj.append(0)
            
        else: 
            
            y_ = X.dot(ws)

            R2 = np.abs((np.mean(y_[ind_sig]) - np.mean(y_[ind_back])))
            R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
            
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)

    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen

        #idx_features.append(np.where(all_candidates[:,ind_max] == input_data))
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)

        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])

        del(X)

    else:
        break

No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected


In [30]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
-------------------------------------------------------
Number of features chosen: 0


Indices of features chosen:  []


### 3) (NC) Use of the likelihood (McFadden)

In [22]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))

In [23]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = ridge_regression(Y,X,0)  # lambda set to 0 
y = predict_labels(w0, X)
loglike0 = compute_loglikelihood_reg(y,X,w0) #np.sum(np.log(1+np.exp(X.dot(w0))) - y*(X.dot(w0)))

R2 = 0        # For the definition of McFadden 1-1 = 0
R2adj_0 = 0

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        
        ws, loss = ridge_regression(Y,X,lambda_)
        
        y = predict_labels(ws,X)   
        
        loglike = compute_loglikelihood_reg(y,X,ws) #np.sum(np.log(1+np.exp(X.dot(ws))) - y*(X.dot(ws)))
        
        R2 = 1-(loglike/loglike0)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break
        

-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  DER_deltaeta_jet_jet (index : 4 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )
-------------------------------------------------
Feature chosen:  DER_lep_eta_centrality (index : 12 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_pt_h (index : 3 )
-------------------------------------------------
Feature chosen:  PRI_lep_pt (index : 16 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
-----------------------------

In [24]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.144854047951
0.245180817163
0.352173598228
0.41170504767
0.424708632929
0.432288557592
0.443235724243
0.473466109895
0.477541148029
0.483139657023
0.490491175076
0.492657971875
0.494326119857
0.495663494828
0.496289473241
0.496334681561
0.496370986565
0.496388422301
0.49640087404
0.496411964442
0.496414840522
0.496418199262
0.496421128045
0.496423325284
0.496423537179
-------------------------------------------------------
Number of features chosen: 25


Indices of features chosen:  [13, 1, 4, 11, 12, 19, 3, 16, 2, 7, 23, 8, 22, 21, 5, 18, 0, 20, 17, 28, 27, 24, 14, 15, 25]


## With cross validation (C) (it's needed in order to choose the best hyperparameter lambda)

### 1) (C) Using the loss of ridge regression as the error of R2

In [9]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))


In [13]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = ridge_regression(Y,X,0)  # start with lambda = 0  
y = predict_labels(w0, X)

sse = loss
sst = np.sum((Y - Y.mean())**2)  #lack of information
R2 = np.abs((sst-sse)/sst)
R2adj_0 = R2 - (k/(n-k-1)*(1-R2))

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        #CROSS VALIDATION
        w_tr_tot, loss_tr_tot, loss_te_tot = cross_validation_rr(Y,X)
        
        ws = w_tr_tot[np.argmin(loss_te_tot)]
        loss = np.min(loss_te_tot)
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        SSE = loss
        SST = np.sum((Y- Y.mean())**2)
        R2 = np.abs((SST-SSE)/SST)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break
        

In [14]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
-------------------------------------------------------
Number of features chosen: 0


Indices of features chosen:  []


### 2) (C) Use of the probability of the 2 events (R2 Tjur)

In [21]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))

In [22]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = ridge_regression(Y,X,0)  
y = predict_labels(w0, X)
ind_back, ind_sig = idx_2labels(y, [0,1])

y_ = X.dot(w0)
R2 = 0
R2adj_0 = R2 - (k/(n-k-1)*(1-R2))

#fix the R2adj_max
R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        #CROSS VALIDATION
        w_tr_tot, loss_tr_tot, loss_te_tot = cross_validation_rr(Y,X)
        ws = w_tr_tot[np.argmin(loss_te_tot)]
        
        k = len(ws) -1 # k is the number of regressor I use -> -1 because I don't consider the offset
        
        y = predict_labels(ws,X)          
        ind_back, ind_sig = idx_2labels(y, [-1,1])
        
        if len(ind_sig) == 0 or len(ind_back) ==0:
            print('No signal detected')
            R2_adj.append(0)
            
        else: 
            
            y_ = X.dot(ws)

            R2 = np.abs((np.mean(y_[ind_sig]) - np.mean(y_[ind_back])))
            R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
            
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)

    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen

        #idx_features.append(np.where(all_candidates[:,ind_max] == input_data))
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)

        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])

        del(X)

    else:
        break

No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
No signal detected
-------------------------------------------------
Feature chosen:  DER_mass_jet_jet (index : 5 )
-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_pt_h (index : 3 )
-------------------------------------------------
Feature chosen:  PRI_jet_subleading_pt (index : 26 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
---------------

In [23]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.692583907083
0.728802362346
0.761239966883
0.779850696222
0.788063711181
0.792264545618
0.795889920268
0.796756354309
0.7969376439
0.797170043648
0.797259734087
0.797271889943
-------------------------------------------------------
Number of features chosen: 12


Indices of features chosen:  [5, 13, 3, 26, 2, 19, 7, 29, 8, 28, 27, 24]


### 3) (C) Use of the likelihood (McFadden)

In [24]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))

In [26]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = ridge_regression(Y,X,0)  # lambda set to 0 
y = predict_labels(w0, X)
loglike0 = np.sum(np.log(1+np.exp(X.dot(w0))) - y*(X.dot(w0)))

R2 = 0        # For the definition of McFadden 1-1 = 0
R2adj_0 = 0

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        #CROSS VALIDATION
        w_tr_tot, loss_tr_tot, loss_te_tot = cross_validation_rr(Y,X)
        ws = w_tr_tot[np.argmin(loss_te_tot)]
        
        y = predict_labels(ws,X)   
        
        loglike = np.sum(np.log(1+np.exp(X.dot(ws))) - y*(X.dot(ws)))
        
        R2 = 1-(loglike/loglike0)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break
        

-------------------------------------------------
Feature chosen:  PRI_tau_pt (index : 13 )
-------------------------------------------------
Feature chosen:  DER_mass_transverse_met_lep (index : 1 )
-------------------------------------------------
Feature chosen:  DER_deltaeta_jet_jet (index : 4 )
-------------------------------------------------
Feature chosen:  DER_met_phi_centrality (index : 11 )
-------------------------------------------------
Feature chosen:  DER_lep_eta_centrality (index : 12 )
-------------------------------------------------
Feature chosen:  PRI_met (index : 19 )
-------------------------------------------------
Feature chosen:  DER_pt_h (index : 3 )
-------------------------------------------------
Feature chosen:  PRI_lep_pt (index : 16 )
-------------------------------------------------
Feature chosen:  DER_mass_vis (index : 2 )
-------------------------------------------------
Feature chosen:  DER_deltar_tau_lep (index : 7 )
-----------------------------

In [27]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
0.144857765242
0.245184572292
0.352177585943
0.411708293799
0.424710904491
0.432292091945
0.443241438144
0.47347719231
0.477550694143
0.483149352784
0.4905044717
0.4926683267
0.494334962769
0.495673376908
0.496299873075
0.496344537618
0.496380190427
0.496397587194
0.496409721352
0.496420779729
0.496423639692
0.49642657291
0.496428791606
0.496430850605
0.496431354733
-------------------------------------------------------
Number of features chosen: 25


Indices of features chosen:  [13, 1, 4, 11, 12, 19, 3, 16, 2, 7, 23, 8, 22, 21, 5, 18, 0, 20, 17, 28, 27, 24, 15, 14, 25]


## Logistic regression

### No cross validation (hyperparameter fixed)

In [67]:
# Initialization
max_iters = 100
threshold = 1e-8
gamma = 0
method = 'gd'

#### Use of the likelihood (McFadden)

I used only this method because it's the most reasonable

In [26]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))

In [27]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

initial_w = np.zeros(X.shape[1])

w0, loss = logistic_regression(Y,X, initial_w, max_iters, gamma, method)
y = predict_labels(w0[-1], X)
loglike0 = np.sum(np.log(1+np.exp(X.dot(w0))) - y*(X.dot(w0)))

R2 = 0        # For the definition of McFadden 1-1 = 0
R2adj_0 = 0

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        
        initial_w = np.zeros(X.shape[1])
        ws, loss = logistic_regression(Y,X, initial_w, max_iters, gamma, method)
        y = predict_labels(ws[-1], X)
        loglike = compute_loglikelihood_reg(y, X, ws)
        
        R2 = 1-(loglike/loglike0)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break

Logistic Regression (99/99): loss logLikelihood=-62866720559.99994
Logistic Regression (99/99): loss logLikelihood=-62916782566.017204


  loglikelihood = np.sum(np.log(1+np.exp(tx.dot(w))) - y*(tx.dot(w))) + lambda_*w.T.dot(w)
  return np.exp(z)/(np.exp(z)+1)
  return np.exp(z)/(np.exp(z)+1)


Logistic Regression (99/99): loss logLikelihood=nan


  y_pred[np.where(y_pred <= 0)] = -1
  y_pred[np.where(y_pred > 0)] = 1


Logistic Regression (99/99): loss logLikelihood=-62977964651.30821
Logistic Regression (99/99): loss logLikelihood=nan
Logistic Regression (99/99): loss logLikelihood=nan


KeyboardInterrupt: 

In [37]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
-------------------------------------------------------
Number of features chosen: 0


Indices of features chosen:  []


### With cross correlation

#### Use of the likelihood (McFadden)

In [11]:
# Realloc FEATURES
features = []
for i in range(len(all_features)):
    features.append((i,all_features[i]))

In [None]:
# Start of STEP-WISE algorithm 
all_candidates = input_data

n = all_candidates.shape[0] #needed for the R^2 adjusted
num = all_candidates.shape[1]
H = np.ones((n,1)) #offset

#Initialization only with offsets (lack of info)
X = H
k = 0 #needed for the R^2 adjusted

w0, loss = logistic_regression(Y,X, initial_w, max_iters, gamma, method)
y = predict_labels(w0, X)
loglike0 = np.sum(np.log(1+np.exp(X.dot(w0))) - y*(X.dot(w0)))

R2 = 0        # For the definition of McFadden 1-1 = 0
R2adj_0 = 0

#fix the R2adj_max

R2adj_max = R2adj_0
ind_max = 0  # this index will show us which is the best feature chosen
del(X)
idx_features = []
best_R2adj = []

for j in range(num):
    R2_adj = []
    for i in range(all_candidates.shape[1]):
        
        X = np.concatenate((H,all_candidates[:,i].reshape(n,1)), axis=1)
        #CROSS VALIDATION
        w_tr_tot, loss_tr_tot, loss_te_tot = cross_validation_lr(Y,X)
        ws = w_tr_tot[np.argmin(loss_te_tot)]
        
        y = predict_labels(ws,X)   
        
        loglike = np.sum(np.log(1+np.exp(X.dot(ws))) - y*(X.dot(ws)))
        
        R2 = 1-(loglike/loglike0)
        R2_adj.append(R2 - (k/(n-k-1)*(1-R2)))
        
    R2adj_chosen = np.max(R2_adj)
    best_R2adj.append(R2adj_chosen)
    idx_chosen = np.argmax(R2_adj)
    
    if R2adj_chosen > R2adj_max:
        R2adj_max = R2adj_chosen
        ind_max = idx_chosen
        
        H = np.concatenate((H, all_candidates[:,ind_max].reshape(n,1)), axis = 1)
        
        all_candidates = np.delete(all_candidates,ind_max,1)
        print('-------------------------------------------------')
        print('Feature chosen: ', features[ind_max][1], '(index :', features[ind_max][0], ')')
        idx_features.append(features[ind_max][0])
        del(features[ind_max])
        
        del(X)
        
    else:
        break
        

In [38]:
results_r2_stepwise(best_R2adj[:len(best_R2adj)-1], idx_features)

R2 asjusted values:
-------------------------------------------------------
Number of features chosen: 0


Indices of features chosen:  []
