Importing modules and functions
===

In [1]:
# Modules
import csv
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import sys
import datetime
import random
import warnings

# Functions
sys.path.insert(0, './implementations/')
from implementations import *
from preprocessing import *
from pca import *
from plot import *
from helpers import *

# Autoreload
# Mode 2 re-imports all modules before each cell executes, so edits made to
# the local helper files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Set random seed
# Seeds NumPy's global generator only; the stdlib `random` module imported
# above is not seeded here.
np.random.seed(1)

Training data loading
===

In [2]:
# Read the training CSV; the loader's three return values are unpacked into
# the labels, the 2-D data array and the sample ids.
labels_raw, data_raw, ids_raw = load_csv_data("data/train.csv")

Testing data loading
===

In [3]:
# Read the test CSV with the same loader; same (labels, data, ids) ordering.
# NOTE(review): labels_t are presumably placeholders for an unlabeled
# submission set -- confirm before trusting any metric computed against them.
labels_t, data_raw_t, ids_t = load_csv_data("data/test.csv")

Get jet indexes
===

In [4]:
# Column 22 of the raw arrays is the jet_num feature.
jets = data_raw[:, 22]
jets_t = data_raw_t[:, 22]

# Row positions of the training samples in each jet group:
# exactly 0 jets, exactly 1 jet, and 2 or more jets.
idx_jet0, idx_jet1, idx_jet2 = (
    np.nonzero(in_group)[0]
    for in_group in (jets == 0, jets == 1, jets >= 2)
)

# Same three groups for the test samples.
idx_jet0_t, idx_jet1_t, idx_jet2_t = (
    np.nonzero(in_group_t)[0]
    for in_group_t in (jets_t == 0, jets_t == 1, jets_t >= 2)
)

Separate data relative to jets
===

In [5]:
# Drop the jet_num column (index 22) now that it has been used to build the
# per-jet row indices. np.delete returns a new array, and the result is kept
# under new names so data_raw / data_raw_t are left untouched: re-running this
# cell can no longer strip a second column, and re-running the jet-index cell
# afterwards still reads jet_num from column 22.
JET_NUM_COL = 22
data_nojet = np.delete(data_raw, JET_NUM_COL, axis=1)
data_nojet_t = np.delete(data_raw_t, JET_NUM_COL, axis=1)

# Split data relative to jets
data_tr_j0 = data_nojet[idx_jet0, :]
data_tr_j1 = data_nojet[idx_jet1, :]
data_tr_j2 = data_nojet[idx_jet2, :]

data_ts_j0 = data_nojet_t[idx_jet0_t, :]
data_ts_j1 = data_nojet_t[idx_jet1_t, :]
data_ts_j2 = data_nojet_t[idx_jet2_t, :]

# Split labels relative to jets (same row indices as the data above, so rows
# and labels stay aligned within each group)
lab_j0 = labels_raw[idx_jet0]
lab_j1 = labels_raw[idx_jet1]
lab_j2 = labels_raw[idx_jet2]

lab_j0_t = labels_t[idx_jet0_t]
lab_j1_t = labels_t[idx_jet1_t]
lab_j2_t = labels_t[idx_jet2_t]

Data filtering and transformation
===

In [6]:
# Clean each jet group's train/test pair in turn (jet 0, then 1, then 2).
# Per the log it prints, process_data filters features, replaces outliers and
# NaN entries with column/feature means, and standardizes the data.
(data_j0, data_j0_t), (data_j1, data_j1_t), (data_j2, data_j2_t) = (
    process_data(train_part, test_part)
    for train_part, test_part in (
        (data_tr_j0, data_ts_j0),
        (data_tr_j1, data_ts_j1),
        (data_tr_j2, data_ts_j2),
    )
)

Filtering features
Finding and replacing outliers by column mean
Replacing NaN points with feature mean value
Standerizing the data
Filtering features
Finding and replacing outliers by column mean
Replacing NaN points with feature mean value
Standerizing the data
Filtering features
Finding and replacing outliers by column mean
Replacing NaN points with feature mean value
Standerizing the data


In [7]:
# Feature expansion (polynomials, log and interaction terms) per jet group.
# Each call returns four values, unpacked as y, tx, y_t, tx_t.

# Jet 0: fifth positional argument 6 -- the log reports a degree-6 polynomial.
y_j0, tx_j0, y_j0_t, tx_j0_t = transform_data(
    data_j0, data_j0_t, lab_j0, lab_j0_t, 6
)

# Jet 1: log transform switched off; the degree is left at its default
# (the log reports degree 4).
y_j1, tx_j1, y_j1_t, tx_j1_t = transform_data(
    data_j1, data_j1_t, lab_j1, lab_j1_t, log=False
)

# Jet 2: all defaults.
y_j2, tx_j2, y_j2_t, tx_j2_t = transform_data(
    data_j2, data_j2_t, lab_j2, lab_j2_t
)

Building polynomial of degree 6
Building the interactive terms
Taking the log value of the data
Building the interactive terms or order three
Performing PCA and keeping feature explaining 1 of the variance
Reducing the number of PCA to 464
Adding a columns of ones to the dataset
Building polynomial of degree 4
Building the interactive terms
Building the interactive terms or order three
Performing PCA and keeping feature explaining 1 of the variance
Reducing the number of PCA to 542
Adding a columns of ones to the dataset
Building polynomial of degree 4
Building the interactive terms
Taking the log value of the data
Building the interactive terms or order three
Performing PCA and keeping feature explaining 1 of the variance
Reducing the number of PCA to 956
Adding a columns of ones to the dataset


Logistic regression using Newton's method
===

In [8]:
# One logistic_hessian fit per jet group, run in the order jet 0, 1, 2.
# Each entry lists the four data arrays followed by the two numeric settings
# that differ between groups (presumably step size and regularisation
# strength -- TODO confirm against the logistic_hessian signature).
newton_runs = (
    (y_j0, tx_j0, y_j0_t, tx_j0_t, 0.075, 1),
    (y_j1, tx_j1, y_j1_t, tx_j1_t, 0.075, 100),
    (y_j2, tx_j2, y_j2_t, tx_j2_t, 0.07, 0.001),
)

fitted_weights = []
for y_fit, tx_fit, y_eval, tx_eval, setting_a, setting_b in newton_runs:
    # Start from the zero vector, one weight per column of the feature matrix.
    initial_w = np.zeros(tx_fit.shape[1])
    # The trailing 100 matches the "k/100" progress lines in the log.
    # Only the weights are collected; the loss/accuracy histories are
    # overwritten on every pass, so they end up holding the last group's values.
    losses, losses_t, acc, acc_t, w_fit = logistic_hessian(
        y_fit, tx_fit, y_eval, tx_eval, initial_w, setting_a, setting_b, 100
    )
    fitted_weights.append(w_fit)

# Fitted parameters for jet groups 0, 1 and 2 respectively.
w_0, w_1, w_2 = fitted_weights

25/100	 train acc : 0.8509803529070291 	 | test acc : 0.21779845070298692
50/100	 train acc : 0.8509603354918779 	 | test acc : 0.21563541401049865
75/100	 train acc : 0.850910291954 	 | test acc : 0.21601350579007994
100/100	 train acc : 0.8508302222933952 	 | test acc : 0.21638280473757793
25/100	 train acc : 0.8193154854018364 	 | test acc : 0.33300824692878894
50/100	 train acc : 0.8188125451356649 	 | test acc : 0.33203869098541106
75/100	 train acc : 0.8188125451356649 	 | test acc : 0.3322725250658728
100/100	 train acc : 0.8188383369441865 	 | test acc : 0.33246643625454836
25/100	 train acc : 0.8513019864080614 	 | test acc : 0.44363583612383795
50/100	 train acc : 0.8508470837985747 	 | test acc : 0.4444397432332781
75/100	 train acc : 0.8508332988710144 	 | test acc : 0.4447117418793293
100/100	 train acc : 0.8508332988710144 	 | test acc : 0.44476614160853956


Kaggle submission
===

In [9]:
# Prediction buffer with one slot per test id, in the test file's row order.
pred_t = np.zeros(ids_t.shape)

# Each jet group is predicted with its own weights and written back to the
# rows it was taken from. The third argument 0.5 is presumably the decision
# threshold -- confirm against predict_labels_logistic.
for rows_t, w_jet, tx_jet_t in (
    (idx_jet0_t, w_0, tx_j0_t),
    (idx_jet1_t, w_1, tx_j1_t),
    (idx_jet2_t, w_2, tx_j2_t),
):
    pred_t[rows_t] = predict_labels_logistic(w_jet, tx_jet_t, 0.5)

# Write the ids and predictions to the submission file.
name = "new.csv"
create_csv_submission(ids_t, pred_t, name)