## Import necessary libraries

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
import math

In [2]:
from costs import *
from models import *
from helpers import * 
from evaluation import *
from split_data import *

## Preprocessing
**Load the training data into a feature matrix, class labels, and record ids**

We wrote our own `load_csv_data` function to import the CSV data; it returns the prediction column, the feature matrix, and the ID of each record.

In [3]:
# NOTE(review): this is another wildcard import, executed after the wildcard imports of the
# imports cell; any name it shares with those modules is rebound here, so its position matters.
from proj1_helpers import *
DATA_TRAIN_PATH = 'data/train.csv' # TODO: download train data and supply path here 
# Load the training CSV into labels (y), feature matrix (tx) and record ids.
# sub_sample=False presumably loads the full file rather than a subset -- confirm in load_csv_data.
y, tx, ids = load_csv_data(DATA_TRAIN_PATH, sub_sample=False)

**Split into 6 distinct datasets**
According to our exploration, we can distinguish 3 different datasets based on the number of jets each experiment contains. Each of them can then be split again into 2 datasets, depending on whether the mass is measurable or not.

In [4]:
# First split: partition the training features (and the matching labels) by jet count into
# three groups: 0 jets, 1 jet, and 2-3 jets.
jet0, jet1, jet23, y0, y1, y23 = split_on_jets(y, tx)
# Second split: divide each jet group by whether the mass is measurable.
# Note the asymmetry: split_on_mass takes (labels, features) but its result is unpacked as
# (features_nomass, features, labels_nomass, labels).
# Each call rebinds the plain names (jet0/y0, ...) so that afterwards they refer only to the
# subset that is not the *_nomass one; the jet-level groups are no longer reachable.
jet0_nomass, jet0, y0_nomass, y0 = split_on_mass(y0, jet0)
jet1_nomass, jet1, y1_nomass, y1 = split_on_mass(y1, jet1)
jet23_nomass, jet23, y23_nomass, y23 = split_on_mass(y23, jet23)

### Only keep the features without NaN in each subset

In [5]:
# For each of the six subsets, drop the feature columns that contain NaN.
# Each call is unpacked as (reduced matrix, selector); the selected_* values are reused by the
# prediction cell to index the same columns of the test rows (x[:, selected_*]).
# NOTE(review): input and output share a name, so re-running this cell on its own feeds the
# already-reduced matrices back in; the selected_* values would then presumably no longer refer
# to columns of the full feature matrix. Re-run the split cell first.
jet0_nomass, selected_jet0_nomass = select_features_without_nan(jet0_nomass)
jet0, selected_jet0 = select_features_without_nan(jet0)
jet1_nomass, selected_jet1_nomass = select_features_without_nan(jet1_nomass)
jet1, selected_jet1 = select_features_without_nan(jet1)
jet23_nomass, selected_jet23_nomass = select_features_without_nan(jet23_nomass)
jet23, selected_jet23 = select_features_without_nan(jet23)


## to do: remove more features ! some are flat, some are even unique !!

**Standardization of values**
We use [feature scaling](https://en.wikipedia.org/wiki/Feature_scaling) to standardize our feature matrices, i.e. to rescale the features down to [0, 1], so as to avoid numerical problems caused by large values.

In [6]:
# Rescale the full matrix and each of the six training subsets.
# standardize() is unpacked as three values everywhere in this notebook; judging by the names
# they are presumably (rescaled matrix, per-feature minimum, per-feature range) -- confirm in
# the helper. The min*/range* values are kept because the prediction cell passes them back to
# standardize() so that test rows are rescaled with the training statistics.

# The original `tx = standardize(tx)` rebound the raw matrix to the whole 3-item return value,
# which destroyed `tx` for any later re-run of the split cell. Unpack into separate names and
# leave the raw `tx` untouched.
sdt_tx, min_tx, range_tx = standardize(tx)

sdt_jet0_nomass, min0_nomass, range0_nomass = standardize(jet0_nomass)
sdt_jet0, min0, range0 = standardize(jet0)
sdt_jet1_nomass, min1_nomass, range1_nomass = standardize(jet1_nomass)
sdt_jet1, min1, range1 = standardize(jet1)
sdt_jet23_nomass, min23_nomass, range23_nomass = standardize(jet23_nomass)
sdt_jet23, min23, range23 = standardize(jet23)

## Model Selection

Let's begin with a simple linear regression, solving least squares with the **normal equations**. We do not consider least squares with gradient descent or stochastic gradient descent here, because **the optimal w can be derived theoretically** in closed form. We therefore do not need to estimate w iteratively.

We run cross validation 4 times on our train_data to see LS performance

### Ridge Regression

Choose initial parameters

In [7]:
# Hyperparameters for the training cells.
# Only lambda_ is passed to ridge_regression in the cells shown here; n_iters and gamma are
# presumably the iteration count and step size of an iterative optimiser -- they are kept
# because other cells may still read them.
n_iters = 10 ** 4
gamma = 3e-6
lambda_ = 1e-3

Train with ridge regression

In [9]:
# Fit one ridge-regression model per training subset (jet group x mass measurable or not).
# ridge_regression is called as (labels, features, lambda_) and unpacked as (loss, weights).
#
# The models are fitted on the rescaled matrices (sdt_*), not on the raw ones: the prediction
# cell rescales every test row with the matching min*/range* before calling predict(), so the
# weights must be learned on the same feature scale.
print("w0_nomass")
loss0_nomass, w0_nomass = ridge_regression(y0_nomass, sdt_jet0_nomass, lambda_)
print("w0")
loss0, w0 = ridge_regression(y0, sdt_jet0, lambda_)

print("w1_nomass")
loss1_nomass, w1_nomass = ridge_regression(y1_nomass, sdt_jet1_nomass, lambda_)
print("w1")
# Store this loss under its own name; it was previously written to loss1_nomass,
# overwriting the loss of the model fitted just above.
loss1, w1 = ridge_regression(y1, sdt_jet1, lambda_)

print("w23_nomass")
loss23_nomass, w23_nomass = ridge_regression(y23_nomass, sdt_jet23_nomass, lambda_)
print("w23")
loss23, w23 = ridge_regression(y23, sdt_jet23, lambda_)


# Bare expressions: only the last one (w23) is rendered as the cell output.
w0_nomass
w0
w1_nomass
w1
w23_nomass
w23


## Prediction

In [10]:
# Read the test CSV as a numeric array, skipping the header row.
DATA_TEST_PATH = 'data/test.csv'
test_x = np.genfromtxt(DATA_TEST_PATH, delimiter=',', skip_header=1)

In [11]:
# NOTE: `ids` and `y` are rebound here from the training values to the test ids / predictions.
ids = test_x[:, 0]
testset = test_x[:, 2:]  # remove id and prediction columns

# One entry per subset model, tried in this order:
# (membership test, selected feature columns, training min, training range, weights).
subset_models = [
    (isJet0_nomass, selected_jet0_nomass, min0_nomass, range0_nomass, w0_nomass),
    (isJet0, selected_jet0, min0, range0, w0),
    (isJet1_nomass, selected_jet1_nomass, min1_nomass, range1_nomass, w1_nomass),
    (isJet1, selected_jet1, min1, range1, w1),
    (isJet23_nomass, selected_jet23_nomass, min23_nomass, range23_nomass, w23_nomass),
]
# Rows matching none of the tests above fall through to the jet-2/3 model with mass.
fallback_model = (selected_jet23, min23, range23, w23)

y = []

for x_t in testset:
    x = np.array([x_t])  # single row as a 2-D array, as expected by the helpers
    for belongs_to_subset, columns, train_min, train_range, weights in subset_models:
        if belongs_to_subset(x):
            break
    else:
        columns, train_min, train_range, weights = fallback_model

    # Keep the subset's columns, rescale with the training statistics, then predict.
    pred = x[:, columns]
    pred, _, _ = standardize(pred, train_min, train_range)
    y.append(predict(pred, weights))


In [12]:
# Write the test ids and their predictions to the submission file.
SUBMISSION_PATH = 'data/split_data_ridge.csv'
create_csv_submission(ids, y, SUBMISSION_PATH)

## Feature Engineering
TODO

## Prediction

**Generate predictions and save output in csv format for submission**