In [6]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from bokeh.plotting import figure, output_notebook,show, ColumnDataSource
from bokeh.layouts import widgetbox
from bokeh.models.widgets import DataTable, NumberFormatter, TableColumn
output_notebook()

from k_fold import *
from proj1_helpers import *
from implementations import *
from helpers import *

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Load the training set through the project helper.
y, x, ids = load_csv_data('data/train.csv')
# Feature names come from the CSV header, skipping the first two header columns
# (presumably the id and the label -- confirm against the file).
# Only the header is needed, so parsing stops after one data row (max_rows=1)
# instead of converting the whole file a second time; name handling is unchanged.
labels = np.array(
    np.genfromtxt('data/train.csv', delimiter=",", names=True, max_rows=1).dtype.names[2:]
)

# Customized model

Our model divides the training dataset into six groups, based on observations made during the exploratory data analysis.

Our model uses Ridge Regression, and we compare plain Ridge Regression with Ridge Regression on a polynomial basis expansion of the input.

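Function to divide the training dataset into six groups: one for each combination of jet number (0, 1, or 2–3, read from column 22) and whether the mass (column 0) is defined, i.e. different from -999.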

In [18]:
def x_y_for_jet(x, y, n, mass=True):
    """Select one jet/mass subset of the data and drop its constant placeholder columns.

    Parameters
    ----------
    x : 2-D array whose column 22 holds the jet number and column 0 the mass.
    y : array indexed along its first axis like the rows of ``x``.
    n : jet number to select; ``n == 2`` also selects rows whose jet number is 3.
    mass : if truthy keep rows whose column 0 differs from -999,
        otherwise keep rows whose column 0 equals -999.

    Returns
    -------
    (x_jet, y_jet) : the selected rows of ``x`` restricted to the columns whose
        mean over those rows is none of -999, 0 or ``n``, and the matching
        entries of ``y``.
    """
    jet_column = x[:, 22]
    mass_column = x[:, 0]

    # Row selection: jet number first (2 and 3 are merged), then the mass flag.
    if n == 2:
        rows = (jet_column == n) | (jet_column == 3)
    else:
        rows = jet_column == n
    rows = rows & ((mass_column != -999) if mass else (mass_column == -999))

    selected_x = x[rows]
    selected_y = y[rows]

    # Column selection: discard columns whose mean over the subset is -999, 0 or n.
    column_means = np.mean(selected_x, axis=0)
    dropped = (column_means == -999) | (column_means == 0) | (column_means == n)
    return selected_x[:, ~dropped], selected_y

# tuning the model: group0

In [None]:
# Group 0 (n=0, mass=False): sweep 30 log-spaced ridge penalties with cross-validation
# (third argument 10, presumably the number of folds).
# The result lists keep their `rmse_*` names although the metric passed is `accuracy`.
x_jet, y_jet = x_y_for_jet(x, y, 0, False)
lambdas = np.logspace(-10, 0, 30)
rmse_tr, rmse_te = [], []
for lambda_ in lambdas:
    rmse_tr_, rmse_te_ = cross_validation(
        y_jet, x_jet, 10, ridge_regression, accuracy, {'lambda_': lambda_}, seed=5
    )
    rmse_tr.append(rmse_tr_)
    rmse_te.append(rmse_te_)
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

Version with polynomially expanded input.

In [None]:
# Group 0 (n=0, mass=False), polynomial variant: sweep 10 log-spaced ridge penalties.
# `degree` is forwarded to both the model and the metric.
# The result lists keep their `rmse_*` names although the metric is `accuracy_with_poly`.
x_jet, y_jet = x_y_for_jet(x, y, 0, False)
degree = 5
lambdas = np.logspace(-10, 0, 10)
rmse_tr, rmse_te = [], []
for lambda_ in lambdas:
    rmse_tr_, rmse_te_ = cross_validation(
        y_jet,
        x_jet,
        10,
        ridge_regression_with_poly,
        accuracy_with_poly,
        {'lambda_': lambda_, 'degree': degree},
        {'degree': degree},
        seed=5,
    )
    rmse_tr.append(rmse_tr_)
    rmse_te.append(rmse_te_)
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

In [None]:
# Print the smallest test score and the lambda that produced it, for whichever
# sweep last populated `rmse_te` and `lambdas`.
# NOTE(review): the sweeps pass a metric named `accuracy`/`accuracy_with_poly`; if a
# higher score is better, np.max / np.argmax are the right selectors -- confirm.
print(np.min(rmse_te))
print(lambdas[np.argmin(rmse_te)])

# tuning the model: group1

In [None]:
# Group 1 (n=0, mass=True): sweep 30 log-spaced ridge penalties with cross-validation
# (third argument 10, presumably the number of folds).
# The result lists keep their `rmse_*` names although the metric passed is `accuracy`.
x_jet, y_jet = x_y_for_jet(x, y, 0, True)
lambdas = np.logspace(-10, 0, 30)
rmse_tr, rmse_te = [], []
for lambda_ in lambdas:
    rmse_tr_, rmse_te_ = cross_validation(
        y_jet, x_jet, 10, ridge_regression, accuracy, {'lambda_': lambda_}, seed=5
    )
    rmse_tr.append(rmse_tr_)
    rmse_te.append(rmse_te_)
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

Version with polynomially expanded input.

In [None]:
# Group 1 (n=0, mass=True), polynomial variant: sweep 5 log-spaced ridge penalties.
# `degree` is forwarded to both the model and the metric.
# The result lists keep their `rmse_*` names although the metric is `accuracy_with_poly`.
x_jet, y_jet = x_y_for_jet(x, y, 0, True)
degree = 4
lambdas = np.logspace(-10, 0, 5)
rmse_tr, rmse_te = [], []
for lambda_ in lambdas:
    rmse_tr_, rmse_te_ = cross_validation(
        y_jet,
        x_jet,
        10,
        ridge_regression_with_poly,
        accuracy_with_poly,
        {'lambda_': lambda_, 'degree': degree},
        {'degree': degree},
        seed=5,
    )
    rmse_tr.append(rmse_tr_)
    rmse_te.append(rmse_te_)
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

In [None]:
# Print the smallest test score and the lambda that produced it, for whichever
# sweep last populated `rmse_te` and `lambdas`.
# NOTE(review): the sweeps pass a metric named `accuracy`/`accuracy_with_poly`; if a
# higher score is better, np.max / np.argmax are the right selectors -- confirm.
print(np.min(rmse_te))
print(lambdas[np.argmin(rmse_te)])

# tuning the model: group2

In [None]:
# Group 2 (n=1, mass=True): sweep 30 log-spaced ridge penalties with cross-validation
# (third argument 10, presumably the number of folds).
# The result lists keep their `rmse_*` names although the metric passed is `accuracy`.
x_jet, y_jet = x_y_for_jet(x, y, 1, True)
lambdas = np.logspace(-10, 0, 30)
rmse_tr, rmse_te = [], []
for lambda_ in lambdas:
    rmse_tr_, rmse_te_ = cross_validation(
        y_jet, x_jet, 10, ridge_regression, accuracy, {'lambda_': lambda_}, seed=5
    )
    rmse_tr.append(rmse_tr_)
    rmse_te.append(rmse_te_)
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

Version with polynomially expanded input.

In [None]:
# Group 2 (n=1, mass=True), polynomial variant: sweep 10 log-spaced ridge penalties.
# `degree` is forwarded to both the model and the metric.
# The result lists keep their `rmse_*` names although the metric is `accuracy_with_poly`.
x_jet, y_jet = x_y_for_jet(x, y, 1, True)
degree = 6
lambdas = np.logspace(-10, 0, 10)
rmse_tr, rmse_te = [], []
for lambda_ in lambdas:
    rmse_tr_, rmse_te_ = cross_validation(
        y_jet,
        x_jet,
        10,
        ridge_regression_with_poly,
        accuracy_with_poly,
        {'lambda_': lambda_, 'degree': degree},
        {'degree': degree},
        seed=5,
    )
    rmse_tr.append(rmse_tr_)
    rmse_te.append(rmse_te_)
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

In [None]:
# Print the smallest test score and the lambda that produced it, for whichever
# sweep last populated `rmse_te` and `lambdas`.
# NOTE(review): the sweeps pass a metric named `accuracy`/`accuracy_with_poly`; if a
# higher score is better, np.max / np.argmax are the right selectors -- confirm.
print(np.min(rmse_te))
print(lambdas[np.argmin(rmse_te)])

# tuning the model: group3

In [None]:
# Group 3 (n=1, mass=False): sweep 30 log-spaced ridge penalties with cross-validation
# (third argument 10, presumably the number of folds).
# The result lists keep their `rmse_*` names although the metric passed is `accuracy`.
x_jet, y_jet = x_y_for_jet(x, y, 1, False)
lambdas = np.logspace(-10, 0, 30)
rmse_tr, rmse_te = [], []
for lambda_ in lambdas:
    rmse_tr_, rmse_te_ = cross_validation(
        y_jet, x_jet, 10, ridge_regression, accuracy, {'lambda_': lambda_}, seed=5
    )
    rmse_tr.append(rmse_tr_)
    rmse_te.append(rmse_te_)
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

Version with polynomially expanded input.

In [None]:
# Group 3 (n=1, mass=False), polynomial variant: sweep 10 log-spaced ridge penalties.
# `degree` is forwarded to both the model and the metric.
# The result lists keep their `rmse_*` names although the metric is `accuracy_with_poly`.
x_jet, y_jet = x_y_for_jet(x, y, 1, False)
degree = 5
lambdas = np.logspace(-10, 0, 10)
rmse_tr, rmse_te = [], []
for lambda_ in lambdas:
    rmse_tr_, rmse_te_ = cross_validation(
        y_jet,
        x_jet,
        10,
        ridge_regression_with_poly,
        accuracy_with_poly,
        {'lambda_': lambda_, 'degree': degree},
        {'degree': degree},
        seed=5,
    )
    rmse_tr.append(rmse_tr_)
    rmse_te.append(rmse_te_)
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

In [None]:
# Print the smallest test score and the lambda that produced it, for whichever
# sweep last populated `rmse_te` and `lambdas`.
# NOTE(review): the sweeps pass a metric named `accuracy`/`accuracy_with_poly`; if a
# higher score is better, np.max / np.argmax are the right selectors -- confirm.
print(np.min(rmse_te))
print(lambdas[np.argmin(rmse_te)])

# tuning the model: group4

In [None]:
# Group 4 (n=2, which also selects jet number 3; mass=True): sweep 30 log-spaced ridge
# penalties with cross-validation (third argument 10, presumably the number of folds).
# The result lists keep their `rmse_*` names although the metric passed is `accuracy`.
x_jet, y_jet = x_y_for_jet(x, y, 2, True)
lambdas = np.logspace(-10, 0, 30)
rmse_tr, rmse_te = [], []
for lambda_ in lambdas:
    rmse_tr_, rmse_te_ = cross_validation(
        y_jet, x_jet, 10, ridge_regression, accuracy, {'lambda_': lambda_}, seed=5
    )
    rmse_tr.append(rmse_tr_)
    rmse_te.append(rmse_te_)
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

Version with polynomially expanded input.

In [None]:
# Group 4 (n=2, which also selects jet number 3; mass=True), polynomial variant:
# sweep 10 log-spaced ridge penalties.
# `degree` is forwarded to both the model and the metric.
# The result lists keep their `rmse_*` names although the metric is `accuracy_with_poly`.
x_jet, y_jet = x_y_for_jet(x, y, 2, True)
degree = 7
lambdas = np.logspace(-10, 0, 10)
rmse_tr, rmse_te = [], []
for lambda_ in lambdas:
    rmse_tr_, rmse_te_ = cross_validation(
        y_jet,
        x_jet,
        10,
        ridge_regression_with_poly,
        accuracy_with_poly,
        {'lambda_': lambda_, 'degree': degree},
        {'degree': degree},
        seed=5,
    )
    rmse_tr.append(rmse_tr_)
    rmse_te.append(rmse_te_)
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

In [None]:
# Print the smallest test score and the lambda that produced it, for whichever
# sweep last populated `rmse_te` and `lambdas`.
# NOTE(review): the sweeps pass a metric named `accuracy`/`accuracy_with_poly`; if a
# higher score is better, np.max / np.argmax are the right selectors -- confirm.
print(np.min(rmse_te))
print(lambdas[np.argmin(rmse_te)])

# tuning the model: group5

In [None]:
# Group 5 (n=2, which also selects jet number 3; mass=False): sweep 30 log-spaced ridge
# penalties with cross-validation (third argument 10, presumably the number of folds).
# The result lists keep their `rmse_*` names although the metric passed is `accuracy`.
x_jet, y_jet = x_y_for_jet(x, y, 2, False)
lambdas = np.logspace(-10, 0, 30)
rmse_tr, rmse_te = [], []
for lambda_ in lambdas:
    rmse_tr_, rmse_te_ = cross_validation(
        y_jet, x_jet, 10, ridge_regression, accuracy, {'lambda_': lambda_}, seed=5
    )
    rmse_tr.append(rmse_tr_)
    rmse_te.append(rmse_te_)
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

Version with polynomially expanded input.

In [None]:
# Group 5 (n=2, which also selects jet number 3; mass=False), polynomial variant:
# sweep 10 log-spaced ridge penalties.
# `degree` is forwarded to both the model and the metric.
# The result lists keep their `rmse_*` names although the metric is `accuracy_with_poly`.
x_jet, y_jet = x_y_for_jet(x, y, 2, False)
degree = 3
lambdas = np.logspace(-10, 0, 10)
rmse_tr, rmse_te = [], []
for lambda_ in lambdas:
    rmse_tr_, rmse_te_ = cross_validation(
        y_jet,
        x_jet,
        10,
        ridge_regression_with_poly,
        accuracy_with_poly,
        {'lambda_': lambda_, 'degree': degree},
        {'degree': degree},
        seed=5,
    )
    rmse_tr.append(rmse_tr_)
    rmse_te.append(rmse_te_)
cross_validation_visualization(lambdas, rmse_tr, rmse_te)

In [None]:
# Print the smallest test score and the lambda that produced it, for whichever
# sweep last populated `rmse_te` and `lambdas`.
# NOTE(review): the sweeps pass a metric named `accuracy`/`accuracy_with_poly`; if a
# higher score is better, np.max / np.argmax are the right selectors -- confirm.
print(np.min(rmse_te))
print(lambdas[np.argmin(rmse_te)])

# tuning the whole model

In [None]:
# Sweep the ridge penalty over all six subsets (n in 0..2, mass True/False),
# scoring with `compute_mse` and drawing one figure per subset.
lambdas = np.logspace(-5, 0, 10)
degree = 6  # not read in this cell (plain ridge only); kept so the global stays defined
for n in range(3):
    for mass in [True, False]:
        x_jet, y_jet = x_y_for_jet(x, y, n, mass)
        # Distinct figure number per (n, mass) pair: 0..5.
        plt.figure(n*2+int(mass))
        rmse_tr = []
        rmse_te = []
        for lambda_ in lambdas:
            rmse_tr_, rmse_te_ = cross_validation(y_jet, x_jet, 10, ridge_regression, compute_mse, {'lambda_': lambda_}, seed=5)
            rmse_tr.append(rmse_tr_)
            rmse_te.append(rmse_te_)
        cross_validation_visualization(lambdas, rmse_tr, rmse_te)
        plt.plot()
        # The original ended each iteration with `i += 1`; `i` was never defined, so the
        # cell raised NameError after the first subset. The counter was unused and is removed.

# applying feature transformation

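This is the function we used to log-transform the features whose distributions look exponential; only strictly positive values are transformed, all other values are left unchanged.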

In [1]:
def log_normalize(x):
    """Apply a natural log to strictly positive values and leave the rest untouched.

    Parameters
    ----------
    x : scalar or array-like of numbers.

    Returns
    -------
    For a scalar: ``np.log(x)`` if ``x > 0``, otherwise ``x`` itself.
    For an array-like: a new float array with the same rule applied
    element-wise; the input is not modified.
    """
    # Scalar path: identical to the original behaviour.
    if np.ndim(x) == 0:
        if x > 0:
            return np.log(x)
        return x

    # Array path: `if x > 0` is ambiguous for arrays, so use a boolean mask.
    # Taking the log only of the masked entries avoids warnings for values <= 0.
    values = np.array(x, dtype=float)
    positive = values > 0
    values[positive] = np.log(values[positive])
    return values
# Positions in `labels` of the named features (presumably the columns that
# log_normalize is applied to -- confirm at the call site).
# Requires `labels` from the data-loading cell; run that cell first, otherwise
# this line raises NameError.
# The original list named 'DER_mass_vis' twice; the duplicate had no effect on
# the membership test and is dropped.
normalize = [
    idx
    for idx, label in enumerate(labels)
    if label in (
        'DER_mass_vis', 'PRI_tau_pt', 'PRI_lep_pt',
        'PRI_met', 'PRI_jet_subleading_pt', 'DER_mass_MMC',
        'DER_pt_tot', 'DER_sum_pt',
        'DER_pt_ratio_lep_tau', 'PRI_met_sumet',
        'PRI_jet_leading_pt',
    )
]

NameError: name 'labels' is not defined