## Import necessary libraries

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
import math

In [2]:
from costs import *
from models import *
from helpers import * 
from evaluation import *
from split_data import *

## Preprocessing
** Load the training data into feature matrix, class labels, and record ids**

We write our own `load_csv_data` function to import csv data, which gives us prediction column, feature matrix and each record ID.

In [3]:
from proj1_helpers import *
DATA_TRAIN_PATH = 'data/train.csv' # TODO: download train data and supply path here 
y, tx, ids = load_csv_data(DATA_TRAIN_PATH, sub_sample=False)

** Split into 6 distinct datasets**
According to our exploration, we can distinct 3 different dataset based on number of jets each experiments contains. Then each of them can be split again into 2 different datasets based on whether they have a measurable mass or not.

In [4]:
jet0, jet1, jet23, y0, y1, y23 = split_on_jets(y, tx)
jet0_nomass, jet0, y0_nomass, y0 = split_on_mass(y0, jet0)
jet1_nomass, jet1, y1_nomass, y1 = split_on_mass(y1, jet1)
jet23_nomass, jet23, y23_nomass, y23 = split_on_mass(y23, jet23)

### Only keep the features without NaN in each subset

In [5]:
jet0_nomass, selected_jet0_nomass = select_features_without_nan(jet0_nomass)
jet0, selected_jet0 = select_features_without_nan(jet0)
jet1_nomass, selected_jet1_nomass = select_features_without_nan(jet1_nomass)
jet1, selected_jet1 = select_features_without_nan(jet1)
jet23_nomass, selected_jet23_nomass = select_features_without_nan(jet23_nomass)
jet23, selected_jet23 = select_features_without_nan(jet23)


## to do: remove more features ! some are flat, some are even unique !!

** Standardization of values**
We use [feature scaling](https://en.wikipedia.org/wiki/Feature_scaling) method to standardize our feature matrix, i.e. to rescale tx down to [0, 1], so as to avoid complicated computation caused by large numbers.

In [6]:
tx = standardize(tx)
sdt_jet0_nomass, min0_nomass, range0_nomass = standardize(jet0_nomass)
sdt_jet0, min0, range0 = standardize(jet0)
sdt_jet1_nomass, min1_nomass, range1_nomass = standardize(jet1_nomass)
sdt_jet1, min1, range1 = standardize(jet1)
sdt_jet23_nomass, min23_nomass, range23_nomass = standardize(jet23_nomass)
sdt_jet23, min23, range23 = standardize(jet23)

## Model Selection

Let's begin with a simple linear regression with least_square using **normal equations**. Here we don't consider using least squares with gradient descent or stochastic gradient descent for the fact that **optimal w could be derived thoeritically**. We therefore don't bother to estimate the w.

We run cross validation 4 times on our train_data to see LS performance

### Logistic Regression

Choose intial parameters

In [7]:
n_iters = 10000
gamma = 0.000003

Train with logistic regression

In [8]:
print("w0_nomass")
loss0_nomass, w0_nomass = logistic_regression(y0_nomass, jet0_nomass, gamma, n_iters)
print("w0")
loss0, w0 = logistic_regression(y0, jet0, gamma, n_iters)

print("w1_nomass")
loss1_nomass, w1_nomass = logistic_regression(y1_nomass, jet1_nomass, gamma, n_iters)
print("w1")
loss1_nomass, w1 = logistic_regression(y1, jet1, gamma, n_iters)

print("w23_nomass")
loss23_nomass, w23_nomass = logistic_regression(y23_nomass, jet23_nomass, gamma, n_iters)
print("w23")
loss23, w23 = logistic_regression(y23, jet23, gamma, n_iters)


w0_nomass
Current iteration=0, the loss=18107.08379776745, gradient=14.979627418817616
Current iteration=100, the loss=18923.411273579743, gradient=1.077527815046155
Current iteration=200, the loss=18922.119312667215, gradient=0.918928181652207
Current iteration=300, the loss=18920.14834788276, gradient=0.7955475238607953
Current iteration=400, the loss=18917.775532430078, gradient=0.6967303614411418
Current iteration=500, the loss=18915.202810020794, gradient=0.6154599464281553
Current iteration=600, the loss=18912.573013530553, gradient=0.5471194813382345
Current iteration=700, the loss=18909.98309712258, gradient=0.48864242862344115
Current iteration=800, the loss=18907.495785800133, gradient=0.4379508011387841
Current iteration=900, the loss=18905.148996923162, gradient=0.3935946157851658
Current iteration=1000, the loss=18902.963106326482, gradient=0.35452515368357557
Current iteration=1100, the loss=18900.946384251096, gradient=0.3199540474469154
Current iteration=1200, the loss=

## Prediction

In [9]:
test_x = np.genfromtxt('data/test.csv', delimiter=',', skip_header=1)

In [27]:
ids = test_x[:, 0]
testset = test_x[:, 2:] # remove id and prediction columns

y = []

for x_t in testset:
    x = np.array([x_t])
    if isJet0_nomass(x):
        pred = x[:,selected_jet0_nomass]
        pred, _ , _ = standardize(pred, min0_nomass, range0_nomass)
        y.append(log_reg_predict(pred, w0_nomass))
    elif isJet0(x):
        pred = x[:, selected_jet0]
        pred, _ , _  = standardize(pred, min0, range0)
        y.append(log_reg_predict(pred, w0))
    elif isJet1_nomass(x):
        pred = x[:, selected_jet1_nomass]
        pred, _ , _  = standardize(pred,min1_nomass, range1_nomass)
        y.append(log_reg_predict(pred, w1_nomass))
    elif isJet1(x):
        pred = x[:, selected_jet1]
        pred, _ , _  = standardize(pred, min1, range1)
        y.append(log_reg_predict(pred, w1))
    elif isJet23_nomass(x):
        pred= x[:, selected_jet23_nomass]
        pred, _ , _  = standardize(pred, min23_nomass, range23_nomass)
        y.append(log_reg_predict(pred, w23_nomass))
    else:
        pred= x[:, selected_jet23]
        pred, _ , _  = standardize(pred, min23, range23)
        y.append(log_reg_predict(pred, w23))


In [29]:
create_csv_submission(ids, y, 'data/split_data.csv')

In [None]:
ids = test_x[:, 0]
testset = test_x[:, 2:] # remove id and prediction columns

jet0_nm_ids = np.where((testset[:,22] == 0) & (testset[:,0] == -999.))[0]
jet0_ids = np.where((testset[:,22] == 0) & (testset[:,0] != -999.))[0]
jet1_nm_ids = np.where((testset[:,22] == 1) & (testset[:,0] == -999.))[0]
jet1_ids = np.where((testset[:,22] == 1) & (testset[:,0] != -999.))[0]
jet23_nm_ids = np.where(((testset[:,22] == 2) | (testset[:,22] == 3)) \
                        & (testset[:,0] == -999.))[0]
jet23_ids = np.where(((testset[:,22] == 2) | (testset[:,22] == 3)) \
                        & (testset[:,0] != -999.))[0]

jet0_test_nm = testset[jet0_nm_ids][:, selected_jet0_nomass]
jet0_test = testset[jet0_ids][:, selected_jet0]
jet1_test_nm = testset[jet1_nm_ids][:, selected_jet1_nomass]
jet1_test = testset[jet1_ids][:, selected_jet1]
jet23_test_nm = testset[jet23_nm_ids][:, selected_jet23_nomass]
jet23_test = testset[jet23_ids][:, selected_jet23]

higgs_jet0_nm = np.where(log_reg_predict(jet0_test_nm, w0_nomass) == 1)[0]
higgs_jet0 = np.where(log_reg_predict(jet0_test, w0) == 1)[0]
higgs_jet1_nm = np.where(log_reg_predict(jet1_test_nm, w1_nomass) == 1)[0]
higgs_jet1 = np.where(log_reg_predict(jet1_test, w1) == 1)[0]
higgs_jet23_nm = np.where(log_reg_predict(jet23_test_nm, w23_nomass) == 1)[0]
higgs_jet23 = np.where(log_reg_predict(jet23_test, w23) == 1)[0]


### retrieve the id's of higgs boson

In [None]:
import itertools

higgs = itertools.chain(ids[jet0_nm_ids][higgs_jet0_nm], ids[jet0_ids][higgs_jet0] \
               , ids[jet1_nm_ids][higgs_jet1_nm], ids[jet1_ids][higgs_jet1]\
               , ids[jet23_nm_ids][higgs_jet23_nm], ids[jet23_ids][higgs_jet23])

higgs_arr = list([int(i) for i in higgs])

In [None]:
res = []
for i in range(350000,918238):
    if i in higgs_arr:
        res.append([i, 1])
    else:
        res.append([i, -1])
        
res

In [None]:
prediction = apply_right_model(testset, ids, w0_nomass, w0, w1_nomass, ...
                               w1, w23_nomass, w23, selected_jet0_nomass, ...
                              selected_jet0, selected_jet1_nomass, selected_jet1, ...
                              selected_jet23_nomass, selected_jet23)
prediction

In [None]:
print("higgs: ", np.count_nonzero(prediction == 1))
print("non-higgs: ", np.count_nonzero(prediction == -1))

In [None]:
test_x = np.genfromtxt('data/test.csv', delimiter=',', skip_header=1)
test_x = standardize(test_x[:, 2:])  # remove id and prediction columns
# could've used load_csv_data
create_csv_submission([i for i in range(350000,918238)], log_reg_predict(test_x, w), 'res.csv')

In [None]:
test_jet0

## Feature Engineering
TODO

## Prediction

**Generate predictions and save ouput in csv format for submission**