Import modules and functions
===

In [1]:
# Modules
import csv
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import sys
import datetime
import random
import warnings

# Functions
sys.path.insert(0, './implementations/')
from implementations import *
from preprocessing import *
from pca import *
from plot import *
from helpers import *

# Autoreload
%load_ext autoreload
%autoreload 2

# Set random seed
# Seeds NumPy's legacy global RNG (np.random.*) with a fixed value so that
# repeated runs draw the same random numbers.
# NOTE(review): the stdlib `random` module imported above is not seeded here;
# seed it as well if any helper draws from it — confirm.
np.random.seed(1)

Training data loading
===

In [None]:
# Read the training CSV into its three parts: labels, feature matrix, sample ids
labels_raw, data_raw, ids_raw = load_csv_data("data/train.csv")

Testing data loading
===

In [3]:
# Read the test CSV into its three parts: labels, feature matrix, sample ids
labels_t, data_raw_t, ids_t = load_csv_data("data/test.csv")

Get jet indexes
===

In [4]:
# Column 22 of the raw feature matrices is the jet_num feature
jets = data_raw[:, 22]
jets_t = data_raw_t[:, 22]

# Row positions of the training samples in each jet group:
# jet_num == 0, jet_num == 1, and jet_num >= 2 (pooled together)
idx_jet0 = np.nonzero(jets == 0)[0]
idx_jet1 = np.nonzero(jets == 1)[0]
idx_jet2 = np.nonzero(jets >= 2)[0]

# Same grouping for the test samples
idx_jet0_t = np.nonzero(jets_t == 0)[0]
idx_jet1_t = np.nonzero(jets_t == 1)[0]
idx_jet2_t = np.nonzero(jets_t >= 2)[0]

Separate data relative to jets
===

In [5]:
# Remove the jet_num column (index 22) now that the jet groups are known.
# The result is stored under NEW names rather than rebinding data_raw /
# data_raw_t: np.delete returns a copy, so overwriting the raw arrays made this
# cell non-idempotent (each re-run dropped one more feature column) and made a
# later re-run of the jet-index cell read the wrong column 22.
data_nojet = np.delete(data_raw, 22, axis=1)
data_nojet_t = np.delete(data_raw_t, 22, axis=1)

# Split data relative to jets
data_tr_j0 = data_nojet[idx_jet0, :]
data_tr_j1 = data_nojet[idx_jet1, :]
data_tr_j2 = data_nojet[idx_jet2, :]

data_ts_j0 = data_nojet_t[idx_jet0_t, :]
data_ts_j1 = data_nojet_t[idx_jet1_t, :]
data_ts_j2 = data_nojet_t[idx_jet2_t, :]

# Split labels relative to jets (same row positions as the data above)
lab_j0 = labels_raw[idx_jet0]
lab_j1 = labels_raw[idx_jet1]
lab_j2 = labels_raw[idx_jet2]

lab_j0_t = labels_t[idx_jet0_t]
lab_j1_t = labels_t[idx_jet1_t]
lab_j2_t = labels_t[idx_jet2_t]

Data filtering and transformation
===

In [12]:
# Filter missing values and outliers, one jet group at a time.
# Each call takes the (train, test) pair of a group and returns the processed pair.
(data_j0, data_j0_t), (data_j1, data_j1_t), (data_j2, data_j2_t) = [
    process_data(train_part, test_part)
    for train_part, test_part in (
        (data_tr_j0, data_ts_j0),
        (data_tr_j1, data_ts_j1),
        (data_tr_j2, data_ts_j2),
    )
]

The original dimensions of the training data set was 99913 samples and 29 columns
 After feature and sample filtering, there are 99913 samples and 19 columns
The original dimensions of the training data set was 77544 samples and 29 columns
 After feature and sample filtering, there are 77544 samples and 22 columns
The original dimensions of the training data set was 72543 samples and 29 columns
 After feature and sample filtering, there are 72543 samples and 29 columns


In [13]:
# Expand each jet group's features with polynomial, log and interaction terms.
# The three groups are transformed with different settings:
#   jet 0 : extra positional argument 6
#           NOTE(review): presumably the polynomial degree — confirm in transform_data
#   jet 1 : called with log=False
#   jet 2 : transform_data defaults
y_j0, tx_j0, y_j0_t, tx_j0_t = transform_data(
    data_j0, data_j0_t, lab_j0, lab_j0_t, 6
)
y_j1, tx_j1, y_j1_t, tx_j1_t = transform_data(
    data_j1, data_j1_t, lab_j1, lab_j1_t, log=False
)
y_j2, tx_j2, y_j2_t, tx_j2_t = transform_data(
    data_j2, data_j2_t, lab_j2, lab_j2_t
)

we have reduce the number of feature with PCA to 464
we have reduce the number of feature with PCA to 542
we have reduce the number of feature with PCA to 956


Logistic regression using Newton's method
===

In [14]:
# Fit one model per jet group, each starting from an all-zero weight vector
# sized to that group's feature count. Only the fitted weights (w_0, w_1, w_2)
# are kept per group: losses, losses_t, acc and acc_t are rebound by every
# call, so after this cell they hold the jet-2 values only.
# NOTE(review): the three trailing numeric arguments differ per group; their
# meaning (step size / regularisation / iteration count?) is not visible here —
# confirm against logistic_hessian.
# NOTE(review): the "test acc" printed while fitting is measured against
# y_j*_t, which come from data/test.csv; if that file carries no real labels
# the figure is not a meaningful accuracy — confirm.

# jet 0
initial_w = np.zeros(tx_j0.shape[1])
losses, losses_t, acc, acc_t, w_0 = logistic_hessian(
    y_j0, tx_j0, y_j0_t, tx_j0_t, initial_w, 0.075, 1, 100
)

# jet 1
initial_w = np.zeros(tx_j1.shape[1])
losses, losses_t, acc, acc_t, w_1 = logistic_hessian(
    y_j1, tx_j1, y_j1_t, tx_j1_t, initial_w, 0.075, 100, 100
)

# jet 2 (and above)
initial_w = np.zeros(tx_j2.shape[1])
losses, losses_t, acc, acc_t, w_2 = logistic_hessian(
    y_j2, tx_j2, y_j2_t, tx_j2_t, initial_w, 0.07, 0.001, 100
)

25/100	 train acc : 0.8496792209222023 	 | test acc : 0.20791970385741543
50/100	 train acc : 0.850690100387337 	 | test acc : 0.21410106481196528
75/100	 train acc : 0.8509303093691511 	 | test acc : 0.21614539827132923
100/100	 train acc : 0.8508602484161221 	 | test acc : 0.21643556173007764
25/100	 train acc : 0.8161817806664603 	 | test acc : 0.32772131540225163
50/100	 train acc : 0.8185159393376663 	 | test acc : 0.33149117704091524
75/100	 train acc : 0.8187738574228824 	 | test acc : 0.3323808872007209
100/100	 train acc : 0.8188641287527081 	 | test acc : 0.33254628203812064
25/100	 train acc : 0.8483106571274968 	 | test acc : 0.4489609651720845
50/100	 train acc : 0.8504611058268888 	 | test acc : 0.4458964470932411
75/100	 train acc : 0.850598955102491 	 | test acc : 0.44502000701152067
100/100	 train acc : 0.8508608687261349 	 | test acc : 0.44478427485160965


Kaggle submission
===

In [15]:
# Assemble predictions for the full test set: each jet group's model predicts
# its own rows, and the results are written back at those rows' original
# positions so pred_t lines up with ids_t.
pred_t = np.zeros(ids_t.shape)
for jet_rows, jet_weights, jet_features in (
    (idx_jet0_t, w_0, tx_j0_t),
    (idx_jet1_t, w_1, tx_j1_t),
    (idx_jet2_t, w_2, tx_j2_t),
):
    pred_t[jet_rows] = predict_labels_logistic(jet_weights, jet_features, 0.5)

# Write the submission file
name = "based_on_train.csv"
create_csv_submission(ids_t, pred_t, name)