Importation of modules and functions
===

In [1]:
# Modules
import csv
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import sys
import datetime
import random
import warnings

# Functions
sys.path.insert(0, './implementations/')
from implementations import *
from preprocessing import *
from pca import *
from plot import *
from helpers import *

# Autoreload
%load_ext autoreload
%autoreload 2

# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
# NOTE(review): only NumPy is seeded here; the stdlib `random` module imported above is not.
np.random.seed(1)

Training data loading
===

In [2]:
# Read the training CSV; the loader's three return values are unpacked as labels, features and ids
labels_raw, data_raw, ids_raw = load_csv_data("data/train.csv")

Splitting data and jets
===

In [3]:
# Split the raw training data; split_data's four return values are unpacked as
# train features/labels and test features/labels
X_train, y_train, X_test, y_test = split_data(data_raw, labels_raw)

# Column 22 is read as the jet_num feature for both parts
jets, jets_t = X_train[:,22], X_test[:,22]

# Row indices of each jet group in the training part: exactly 0, exactly 1, and 2 or more.
# Samples with jet_num >= 2 share a single group; there is no separate group for jet_num == 3.
idx_jet0 = np.flatnonzero(jets == 0)
idx_jet1 = np.flatnonzero(jets == 1)
idx_jet2 = np.flatnonzero(jets >= 2)

# Same grouping for the test part
idx_jet0_t = np.flatnonzero(jets_t == 0)
idx_jet1_t = np.flatnonzero(jets_t == 1)
idx_jet2_t = np.flatnonzero(jets_t >= 2)


In [10]:
# Remove the jet_num column (index 22) from both parts. The result is stored under new
# names instead of being written back to X_train / X_test: with the in-place form,
# running this cell a second time silently deleted a different feature column.
X_train_nojet = np.delete(X_train, 22, axis=1)
X_test_nojet = np.delete(X_test, 22, axis=1)

# Split data relative to jets
data_tr_j0 = X_train_nojet[idx_jet0,:]
data_tr_j1 = X_train_nojet[idx_jet1,:]
data_tr_j2 = X_train_nojet[idx_jet2,:]

data_ts_j0 = X_test_nojet[idx_jet0_t,:]
data_ts_j1 = X_test_nojet[idx_jet1_t,:]
data_ts_j2 = X_test_nojet[idx_jet2_t,:]

# Split labels relative to jets (same row indices as the feature split above)
lab_j0 = y_train[idx_jet0]
lab_j1 = y_train[idx_jet1]
lab_j2 = y_train[idx_jet2]

lab_j0_t = y_test[idx_jet0_t]
lab_j1_t = y_test[idx_jet1_t]
lab_j2_t = y_test[idx_jet2_t]

Data filtering and normalization
===

In [13]:
# Filtering missing values and outliers, one jet group at a time.
# Each call receives the group's train and test arrays and returns the processed pair;
# process_data also prints the data-set dimensions before and after filtering.
data_j0, data_j0_t = process_data(data_tr_j0, data_ts_j0)
data_j1, data_j1_t = process_data(data_tr_j1, data_ts_j1)
data_j2, data_j2_t = process_data(data_tr_j2, data_ts_j2)
# data_j3, data_j3_t = process_data(data_tr_j3, data_ts_j3)

The original dimensions of the training data set was 79923 samples and 29 columns
 After feature and sample filtering, there are 79923 samples and 19 columns
The original dimensions of the training data set was 61985 samples and 29 columns
 After feature and sample filtering, there are 61985 samples and 22 columns
The original dimensions of the training data set was 58092 samples and 29 columns
 After feature and sample filtering, there are 58092 samples and 29 columns


In [17]:
# Transforming data using polynomials, log and interaction terms, one jet group at a time.
# The four return values are unpacked as train labels, train design matrix, test labels
# and test design matrix. Judging from the printed output, transform_data also applies
# PCA and reports the resulting number of features.
y_j0, tx_j0, y_j0_t, tx_j0_t = transform_data(data_j0, data_j0_t, lab_j0, lab_j0_t)
y_j1, tx_j1, y_j1_t, tx_j1_t = transform_data(data_j1, data_j1_t, lab_j1, lab_j1_t)
y_j2, tx_j2, y_j2_t, tx_j2_t = transform_data(data_j2, data_j2_t, lab_j2, lab_j2_t)
# y_j3, tx_j3, y_j3_t, tx_j3_t = transform_data(data_j3, data_j3_t, lab_j3, lab_j3_t)

we have reduce the number of feature with PCA to 380
we have reduce the number of feature with PCA to 545
we have reduce the number of feature with PCA to 957


Logistic regression using Newton's method
===

In [18]:
# Hyper-parameters shared by the three per-jet fits
best_gamma = 0.07
best_lambda = 0

# One model per jet group, each started from an all-zero weight vector sized to that
# group's design matrix. losses / losses_t / acc / acc_t are overwritten by every call;
# only the weight vectors w_0, w_1 and w_2 are kept under distinct names.
# The trailing 100 is presumably the iteration count (progress prints as "n/100") — confirm in logistic_hessian.
initial_w = np.zeros(tx_j0.shape[1])
losses, losses_t, acc, acc_t, w_0 = logistic_hessian(y_j0, tx_j0, y_j0_t, tx_j0_t, initial_w, best_gamma, best_lambda, 100) # fit model, retrieve parameters

initial_w = np.zeros(tx_j1.shape[1])
losses, losses_t, acc, acc_t, w_1 = logistic_hessian(y_j1, tx_j1, y_j1_t, tx_j1_t, initial_w, best_gamma, best_lambda, 100) # fit model, retrieve parameters

initial_w = np.zeros(tx_j2.shape[1])
losses, losses_t, acc, acc_t, w_2 = logistic_hessian(y_j2, tx_j2, y_j2_t, tx_j2_t, initial_w, best_gamma, best_lambda, 100) # fit model, retrieve parameters

# initial_w = np.zeros(tx_j3.shape[1])
# losses, losses_t, acc, acc_t, w_3 = logistic_hessian(y_j3, tx_j3, y_j3_t, tx_j3_t, initial_w, best_gamma, best_lambda, 100) # fit model, retrieve parameters


25/100	 train acc : 0.851306882874767 	 | test acc : 0.8461230615307653
50/100	 train acc : 0.8513444190032907 	 | test acc : 0.8463231615807904
75/100	 train acc : 0.8512943708319257 	 | test acc : 0.8462731365682842
25/100	 train acc : 0.8196660482374768 	 | test acc : 0.8167620026994022
50/100	 train acc : 0.820375897394531 	 | test acc : 0.8164406452856867
75/100	 train acc : 0.8203436315237558 	 | test acc : 0.8163763738029436
25/100	 train acc : 0.8515974660882738 	 | test acc : 0.8411182617119922
50/100	 train acc : 0.8524409557254011 	 | test acc : 0.8412566604387239


KeyboardInterrupt: 

Grid search for best parameters
===

In [None]:
# Fresh train/test split of the raw data (no per-jet grouping in this section)
X_train, y_train, X_test, y_test = split_data(data_raw, labels_raw)

# Filtering missing values and outliers (replacement strategy: 'median').
# NOTE(review): this call passes labels and ids and unpacks three values, whereas the
# per-jet cells call process_data(train, test) and unpack two — confirm against the
# current process_data signature.
X_train, X_test, y_train = process_data(X_train, X_test, y_train, ids_raw, sample_filtering = False, feature_filtering = False, replace = 'median',remove_outlier = True)

# Build interaction terms
data_tr_int = build_interact_terms(X_train)
data_ts_int = build_interact_terms(X_test)

# Build polynomial features; build_poly is called with 4 as its second argument
# (presumably the maximum degree — confirm in build_poly)
data_tr_poly = build_poly(X_train, 4)
data_ts_poly = build_poly(X_test, 4)

# Build log features: log(|x| + 1) element-wise, so the argument is always >= 1
data_tr_log = np.log(abs(X_train)+1)
data_ts_log = np.log(abs(X_test)+1)


# Stack polynomial, interaction and log features column-wise
data_train = np.c_[data_tr_poly, data_tr_int, data_tr_log]
data_test = np.c_[data_ts_poly, data_ts_int, data_ts_log]

# Perform PCA on the training matrix only, then project both train and test data
# onto the same eigenvectors
eigVal, eigVec, sumEigVal = PCA(data_train, threshold = 0.98)
data = data_train.dot(eigVec)
data_t = data_test.dot(eigVec)
print("we have reduce the number of feature with PCA to {0}".format(eigVec.shape[1]))

# Final labels and design matrices for the grid search
y, tx = build_model_data(data, y_train)
y_t, tx_t = build_model_data(data_t,y_test)

In [None]:
# Search space: candidate gamma values and candidate lambda values (1e-3, 1e-2, ..., 1e3)
num_iter = 150
lr = np.arange(0.05,0.1,0.005)
lamb = np.logspace(-3,3,7)

# Initial weights: all ones, one per column of tx
initial_w = np.ones(tx.shape[1])

# Running best and the full result table (rows follow lr, columns follow lamb)
best_gamma, best_lambda, best_acc = 0, 0, 0
grid_acc = np.zeros([len(lr),len(lamb)])

for g, gamma_val in enumerate(lr):
    for l, lambda_val in enumerate(lamb):
        # Store the value returned for this (gamma, lambda) pair, then read it back from the table
        grid_acc[g,l] = Grid_Search_logistic(y, tx, y_t, tx_t, initial_w, gamma=gamma_val, lam=lambda_val, max_iters = num_iter, momentum = 0)
        score = grid_acc[g,l]
        print("gamma: {0} \t| lambda: {1} \t| test acc: {2}".format(gamma_val, lambda_val, score))
        # Strictly-greater comparison: on ties the first pair encountered is kept
        if score > best_acc:
            best_acc, best_gamma, best_lambda = score, gamma_val, lambda_val

# Save the table so it can be reloaded without repeating the search
file = "grid_search_logistic_test_acc.npy"
np.save(file, grid_acc)


In [None]:
# Display both selected hyper-parameters. A notebook cell only echoes its last bare
# expression, so they are shown together as one (gamma, lambda) tuple.
best_gamma, best_lambda

In [None]:
# To reuse a saved table instead of re-running the search:
# grid_acc = np.load(file)
import seaborn as sns
sns.set()

# Heatmap of the grid-search table: lambda along x, gamma along y, colour range fixed to [0.6, 0.782]
ax = sns.heatmap(
    grid_acc,
    xticklabels=lamb,
    yticklabels=lr,
    vmin=0.6,
    vmax=0.782,
)
ax.set_xlabel('lambda')
ax.set_ylabel('gamma')


In [None]:
# Fresh train/test split of the raw data (no per-jet grouping in this section)
X_train, y_train, X_test, y_test = split_data(data_raw, labels_raw)

# Filtering missing values and outliers (replacement strategy: 'mean').
# NOTE(review): this call passes labels and ids and unpacks three values, whereas the
# per-jet cells call process_data(train, test) and unpack two — confirm against the
# current process_data signature.
X_train, X_test, y_train = process_data(X_train, X_test, y_train, ids_raw, sample_filtering = False, feature_filtering = False, replace = 'mean',remove_outlier = True)

# Build interaction terms
data_tr_int = build_interact_terms(X_train)
data_ts_int = build_interact_terms(X_test)

# Build polynomial features; build_poly is called with 4 as its second argument
# (presumably the maximum degree — confirm in build_poly)
data_tr_poly = build_poly(X_train, 4)
data_ts_poly = build_poly(X_test, 4)

# Build log features: log(|x| + 1) element-wise, so the argument is always >= 1
data_tr_log = np.log(abs(X_train)+1)
data_ts_log = np.log(abs(X_test)+1)


# Stack polynomial, interaction and log features column-wise
data_train = np.c_[data_tr_poly, data_tr_int, data_tr_log]
data_test = np.c_[data_ts_poly, data_ts_int, data_ts_log]

# Perform PCA on the training matrix only, then project both train and test data
# onto the same eigenvectors
eigVal, eigVec, sumEigVal = PCA(data_train, threshold = 0.98)
data = data_train.dot(eigVec)
data_t = data_test.dot(eigVec)
print("we have reduce the number of feature with PCA to {0}".format(eigVec.shape[1]))

# Final labels and design matrices for the grid search
y, tx = build_model_data(data, y_train)
y_t, tx_t = build_model_data(data_t,y_test)

In [None]:
# Show the candidate lambda values: 7 log-spaced points from 1e-3 to 1e3
print(np.logspace(-3,3,7))

In [None]:
# Search space for the 'mean'-replacement run.
# NOTE(review): np.arange(start=0.05, stop=0.06, step=0.07) yields the single value [0.05].
# If the three rates 0.05, 0.06 and 0.07 were intended, this should be
# np.array([0.05, 0.06, 0.07]) — confirm the intent before changing.
num_iter = 100
lr = np.arange(0.05,0.06, 0.07)
lamb = np.logspace(-3,3,7)

# Initial weights: all ones, one per column of tx
initial_w = np.ones(tx.shape[1])

# Running best and the full result table (rows follow lr, columns follow lamb)
best_gamma, best_lambda, best_acc = 0, 0, 0
grid_acc = np.zeros([len(lr),len(lamb)])

for g, gamma_val in enumerate(lr):
    for l, lambda_val in enumerate(lamb):
        # Store the value returned for this (gamma, lambda) pair, then read it back from the table
        grid_acc[g,l] = Grid_Search_logistic(y, tx, y_t, tx_t, initial_w, gamma=gamma_val, lam=lambda_val, max_iters = num_iter, momentum = 0)
        score = grid_acc[g,l]
        print("gamma: {0} \t| lambda: {1} \t| test acc: {2}".format(gamma_val, lambda_val, score))
        # Strictly-greater comparison: on ties the first pair encountered is kept
        if score > best_acc:
            best_acc, best_gamma, best_lambda = score, gamma_val, lambda_val

# Save the table so it can be reloaded without repeating the search
file = "grid_search_logistic_test_acc_mean.npy"
np.save(file, grid_acc)


In [None]:
# To reuse a saved table instead of re-running the search:
# grid_acc = np.load(file)
import seaborn as sns
sns.set()

# Heatmap of the grid-search table: lambda along x, gamma along y, colour range fixed to [0.6, 0.782]
ax = sns.heatmap(
    grid_acc,
    xticklabels=lamb,
    yticklabels=lr,
    vmin=0.6,
    vmax=0.782,
)
ax.set_xlabel('lambda')
ax.set_ylabel('gamma')


Kaggle 
====

Testing data loading
===

In [3]:
# Read the test CSV; the loader's three return values are unpacked as labels, features and ids
labels_t, data_raw_t, ids_t = load_csv_data("data/test.csv")

Get jet indexes
===

In [4]:
# Column 22 is read as the jet_num feature of the full training set and of the test set
jets, jets_t = data_raw[:,22], data_raw_t[:,22]

# Row indices of each jet group in the training set: exactly 0, exactly 1, and 2 or more.
# Samples with jet_num >= 2 share a single group; there is no separate group for jet_num == 3.
idx_jet0 = np.flatnonzero(jets == 0)
idx_jet1 = np.flatnonzero(jets == 1)
idx_jet2 = np.flatnonzero(jets >= 2)

# Same grouping for the test set
idx_jet0_t = np.flatnonzero(jets_t == 0)
idx_jet1_t = np.flatnonzero(jets_t == 1)
idx_jet2_t = np.flatnonzero(jets_t >= 2)

Separate data relative to jets
===

In [5]:
# Remove the jet_num column (index 22) from both data sets. The result is stored under
# new names instead of being written back to data_raw / data_raw_t: with the in-place
# form, re-running this cell deleted a different feature column, and re-running the
# jet-index cell afterwards read the wrong column as jet_num.
data_raw_nojet = np.delete(data_raw, 22, axis=1)
data_raw_t_nojet = np.delete(data_raw_t, 22, axis=1)

# Split data relative to jets
data_tr_j0 = data_raw_nojet[idx_jet0,:]
data_tr_j1 = data_raw_nojet[idx_jet1,:]
data_tr_j2 = data_raw_nojet[idx_jet2,:]

data_ts_j0 = data_raw_t_nojet[idx_jet0_t,:]
data_ts_j1 = data_raw_t_nojet[idx_jet1_t,:]
data_ts_j2 = data_raw_t_nojet[idx_jet2_t,:]

# Split labels relative to jets (same row indices as the feature split above)
lab_j0 = labels_raw[idx_jet0]
lab_j1 = labels_raw[idx_jet1]
lab_j2 = labels_raw[idx_jet2]

lab_j0_t = labels_t[idx_jet0_t]
lab_j1_t = labels_t[idx_jet1_t]
lab_j2_t = labels_t[idx_jet2_t]

Data filtering and transformation
===

In [6]:
# Filtering missing values and outliers, one jet group at a time.
# Each call receives the group's train and test arrays and returns the processed pair;
# process_data also prints the data-set dimensions before and after filtering.
data_j0, data_j0_t = process_data(data_tr_j0, data_ts_j0)
data_j1, data_j1_t = process_data(data_tr_j1, data_ts_j1)
data_j2, data_j2_t = process_data(data_tr_j2, data_ts_j2)
# data_j3, data_j3_t = process_data(data_tr_j3, data_ts_j3)

The original dimensions of the training data set was 99913 samples and 29 columns
 After feature and sample filtering, there are 99913 samples and 19 columns
The original dimensions of the training data set was 77544 samples and 29 columns
 After feature and sample filtering, there are 77544 samples and 22 columns
The original dimensions of the training data set was 72543 samples and 29 columns
 After feature and sample filtering, there are 72543 samples and 29 columns


In [13]:
# Transforming data using polynomials, log and interaction terms, one jet group at a time.
# All three groups must be transformed here: the training and submission cells below read
# y_j0/tx_j0/tx_j0_t and y_j1/tx_j1/tx_j1_t, and with those two calls disabled they would
# pick up stale arrays from the validation section (or be undefined on a fresh kernel).
# Each group keeps the pca_t value it was written with.
y_j0, tx_j0, y_j0_t, tx_j0_t = transform_data(data_j0, data_j0_t, lab_j0, lab_j0_t, long = True, pca_t = 0.9999)
y_j1, tx_j1, y_j1_t, tx_j1_t = transform_data(data_j1, data_j1_t, lab_j1, lab_j1_t, long = True, pca_t = 0.9999)
y_j2, tx_j2, y_j2_t, tx_j2_t = transform_data(data_j2, data_j2_t, lab_j2, lab_j2_t, long = True, pca_t = 0.999)

KeyboardInterrupt: 

Logistic regression using Newton's method
===

In [None]:
# Hyper-parameters shared by the three per-jet fits
best_gamma = 0.07
best_lambda = 0

# One model per jet group, each started from an all-zero weight vector sized to that
# group's design matrix. losses / losses_t / acc / acc_t are overwritten by every call;
# only the weight vectors w_0, w_1 and w_2 are kept under distinct names.
# The trailing 100 is presumably the iteration count — confirm in logistic_hessian.
initial_w = np.zeros(tx_j0.shape[1])
losses, losses_t, acc, acc_t, w_0 = logistic_hessian(y_j0, tx_j0, y_j0_t, tx_j0_t, initial_w, best_gamma, best_lambda, 100) # fit model, retrieve parameters

initial_w = np.zeros(tx_j1.shape[1])
losses, losses_t, acc, acc_t, w_1 = logistic_hessian(y_j1, tx_j1, y_j1_t, tx_j1_t, initial_w, best_gamma, best_lambda, 100) # fit model, retrieve parameters

initial_w = np.zeros(tx_j2.shape[1])
losses, losses_t, acc, acc_t, w_2 = logistic_hessian(y_j2, tx_j2, y_j2_t, tx_j2_t, initial_w, best_gamma, best_lambda, 100) # fit model, retrieve parameters

# initial_w = np.zeros(tx_j3.shape[1])
# losses, losses_t, acc, acc_t, w_3 = logistic_hessian(y_j3, tx_j3, y_j3_t, tx_j3_t, initial_w, best_gamma, best_lambda, 100) # fit model, retrieve parameters


Kaggle submission
===

In [10]:
# Prediction vector with the same shape as the test ids, initialised to 0
pred_t = np.zeros(ids_t.shape)

# Fill each jet group's positions with that group's model predictions. 0.5 is passed as
# the third argument, presumably the decision threshold — confirm in predict_labels_logistic.
# Any sample not covered by the three index sets keeps the initial value 0.
pred_t[idx_jet0_t] = predict_labels_logistic(w_0, tx_j0_t, 0.5)
pred_t[idx_jet1_t] = predict_labels_logistic(w_1, tx_j1_t, 0.5)
pred_t[idx_jet2_t] = predict_labels_logistic(w_2, tx_j2_t, 0.5)
# pred_t[idx_jet3_t] = predict_labels_logistic(w_3, tx_j3_t, 0.5)

# Write ids and predictions to the submission file
name = "jets_better.csv"
create_csv_submission(ids_t, pred_t, name)
