In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [126]:
from proj1_helpers import *
from implementations import *

In [3]:
def compute_accuracy(y, predictions):
    N = y.size
    accuracy = 1 - (np.count_nonzero(predictions-y)/N)
    print("Accuracy: {}".format(accuracy))

In [4]:
def save_results(weights, clean_features, parameters):
    np.save('all/weights.npy', weights)
    np.save('all/clean_features.npy', clean_features)
    np.save('all/parameters.npy', parameters)

In [19]:
test = np.array([[1,2,3,4],[1,5,6,7]])
build_poly(test[:,1:], 3)

array([[  1.,   2.,   3.,   4.,   4.,   9.,  16.,   8.,  27.,  64.],
       [  1.,   5.,   6.,   7.,  25.,  36.,  49., 125., 216., 343.]])

# Part 1: Without feature engineering

### Import Dataset

In [5]:
train_data = 'all/train.csv'
labels, input_data, ids, features = load_csv_data(train_data)

In [6]:
features

array(['DER_mass_MMC', 'DER_mass_transverse_met_lep', 'DER_mass_vis',
       'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet',
       'DER_prodeta_jet_jet', 'DER_deltar_tau_lep', 'DER_pt_tot',
       'DER_sum_pt', 'DER_pt_ratio_lep_tau', 'DER_met_phi_centrality',
       'DER_lep_eta_centrality', 'PRI_tau_pt', 'PRI_tau_eta',
       'PRI_tau_phi', 'PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi',
       'PRI_met', 'PRI_met_phi', 'PRI_met_sumet', 'PRI_jet_num',
       'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi',
       'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta',
       'PRI_jet_subleading_phi', 'PRI_jet_all_pt'], dtype='<U27')

In [27]:
training_ratio = 0.8

In [28]:
x_tr, x_te, y_tr, y_te = split_data(input_data, labels, training_ratio)

In [29]:
tx_tr, mean_tr, std_tr = extend_and_standardize(x_tr)

## Gradient descent

In [166]:
initial_w = np.zeros(x_tr.shape[1])
max_iters = 100
gamma = 0.00001

In [167]:
losses_GD, ws_GD = least_squares_GD(y_tr, x_tr, initial_w, max_iters, gamma)

  ret = umr_sum(arr, axis, dtype, out, keepdims)
  return 1/2*np.mean(e**2)


In [53]:
predictions_GD = predict_labels(w_GD,x_te)
compute_accuracy(y_te,predictions_GD)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


## Stochastic gradient descent

## Least squares

### No standardization

In [12]:
loss, w_LS = least_squares(y_tr,x_tr)
predictions = predict_labels(w_LS,x_te)

In [14]:
compute_accuracy(y_te,predictions)

Accuracy: 0.74402


### With standardization

In [30]:
tx_te, _, _ = extend_and_standardize(x_te, mean_tr, std_tr)

In [31]:
loss, w_LS = least_squares(y_tr,tx_tr)
predictions = predict_labels(w_LS,tx_te)

In [32]:
compute_accuracy(y_te,predictions)

Accuracy: 0.74468


## Ridge regression

### Use cross-validation to find good hyperparameter

In [92]:
seed = 1
k_fold = 4
k_indices = build_k_indices(y_tr, k_fold, seed)
lambda_ = 0.001

In [93]:
lambdas, tr_losses, te_losses = find_optimal_lambda(y_tr,x_tr)

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29


In [70]:
optimal_lambda = find_optimal_lambda(y_tr,x_tr)

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29


In [72]:
w_rr = ridge_regression(y_tr,x_tr,optimal_lambda)

In [88]:
optimal_lambda

0.004520353656360241

In [73]:
predictions = predict_labels(w_rr,x_te)

In [75]:
compute_accuracy(y_te,predictions)

Accuracy: 0.7453000000000001


## Logistic regression

In [160]:
initial_w = np.zeros(x_tr.shape[1])
max_iters = 1000
gamma = 0.01
y_tr_log = np.ones(y_tr.size)
y_tr_log[y_tr == -1] = 0

In [161]:
loss, w_logistic = logistic_regression(y_tr_log, x_tr, initial_w, max_iters, gamma)

  loss = (-y * np.log(pred) - (1 - y) * np.log(1 - pred)).mean()
  loss = (-y * np.log(pred) - (1 - y) * np.log(1 - pred)).mean()


Current iteration=0, loss=nan
Current iteration=100, loss=nan
Current iteration=200, loss=nan
Current iteration=300, loss=nan
Current iteration=400, loss=nan
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan
Current iteration=800, loss=nan
Current iteration=900, loss=nan


In [163]:
predictions = predict_labels(w_logistic, x_te)
compute_accuracy(y_te,predictions)

Accuracy: 0.6876599999999999


## Regularized logistic regression

In [156]:
loss, w_reg_logistic = reg_logistic_regression(y_tr_log, x_tr, optimal_lambda, initial_w, max_iters, gamma)

  loss = (-y * np.log(pred) - (1 - y) * np.log(1 - pred)).mean()
  loss = (-y * np.log(pred) - (1 - y) * np.log(1 - pred)).mean()


Current iteration=0, loss=nan
Current iteration=100, loss=nan
Current iteration=200, loss=nan
Current iteration=300, loss=nan
Current iteration=400, loss=nan
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan
Current iteration=800, loss=nan
Current iteration=900, loss=nan


In [158]:
predictions = predict_labels(w_reg_logistic, x_te)
compute_accuracy(y_te,predictions)

Accuracy: 0.7079599999999999


# Part 2: EDA and feature engineering

### Import Dataset

In [84]:
train_data = 'all/train.csv'
labels, input_data, ids, features = load_csv_data(train_data)

In [85]:
training_ratio = 0.8

In [127]:
x_tr, x_te, y_tr, y_te = split_data(input_data, labels, training_ratio)

In [128]:
X, y = x_tr, y_tr  # input_data, labels

i, = np.where(features == 'PRI_jet_num')
pri_jet_num_idx = np.squeeze(i)
cond_null = X[:, pri_jet_num_idx] == 0
cond_one = X[:, pri_jet_num_idx] == 1
cond_plural = X[:, pri_jet_num_idx] >= 2
conditions = [cond_null, cond_one, cond_plural]

dsets = [X[cond] for cond in conditions]
ybs = [y[cond] for cond in conditions]

For now, just remove any column with undefined -999 values. Also, before standardization, remove features with 0 variance. 
Second part: test how replacing -999 in DER_mass_MMC by defined mean affects the score. 

In [129]:
clean_dsets = []
clean_features = []

for dset in dsets:
    
    # Impute undefined DER_mass_MMC
    """
    DER_mass_MMC = dset[:,0]
    undefined_indices = (DER_mass_MMC == -999)
    filter_undefined = DER_mass_MMC[~undefined_indices]
    defined_mean = np.mean(filter_undefined)
    print(defined_mean)
    defined_median = np.median(filter_undefined)
    print(defined_median)
    DER_mass_MMC[undefined_indices] = defined_median
    """
    
    # Remove constant features and features with undefined samples
    no_undefined = np.all(dset != -999, axis = 0)
    no_constant = np.any(dset != dset[0], axis = 0)
    cleaned = no_undefined * no_constant
    clean_dset = dset[:,cleaned]
    clean_dsets.append(clean_dset)
    clean_features.append(cleaned)

Standardize and extend data, save mean and standard deviation of each dataset.

In [130]:
parameters = []
standardized_dsets = []

for clean_dset in clean_dsets:
    extended_dset = build_poly(clean_dset, 3)
    standardized_dset, mean_x, std_x = extend_and_standardize(extended_dset[:,1:])
    # standardized_dset, mean_x, std_x = extend_and_standardize(clean_dset)
    # Added for testing purposes, handles outliers
    """
    standardized_dset[standardized_dset > 3] = 3
    standardized_dset[standardized_dset < -3]  = -3
    """
    # Polynomial expansion
    # standardized_dset = build_poly(standardized_dset[:,1:], 3)
    standardized_dsets.append(standardized_dset)
    parameters.append((mean_x,std_x))

Only extend datasets

## Gradient descent

In [168]:
max_iters = 1000
gamma = 0.1

In [169]:
ws_GD = []
for jet_num, standardized_dset in enumerate(standardized_dsets):
    initial_w = np.zeros(standardized_dset.shape[1])
    losses_GD, w_GD = least_squares_GD(ybs[jet_num], standardized_dset, initial_w, max_iters, gamma)
    ws_GD.append(w_GD)

In [170]:
predictions = model_predictions(x_te, ws_GD, pri_jet_num_idx, clean_features, parameters)

In [171]:
compute_accuracy(y_te,predictions)

Accuracy: 0.75998


Second score, after replacing -999 in DER_mass_MMC by the defined mean.

In [44]:
compute_accuracy(y_te,predictions)

Accuracy: 0.72386


Conclusion: bit worse

Third score, after replacing -999 in DER_mass_MMC by the defined median.

In [70]:
compute_accuracy(y_te,predictions)

Accuracy: 0.72848


## Stochastic Gradient descent

In [130]:
max_iters = 1000
gamma = 0.1
batch_size = 100

In [132]:
ws_SGD = []
for jet_num, standardized_dset in enumerate(standardized_dsets):
    initial_w = np.zeros(standardized_dset.shape[1])
    loss_SGD, w_SGD = least_squares_SGD(ybs[jet_num], standardized_dset, initial_w, batch_size, max_iters, gamma)
    ws_SGD.append(w_SGD)

In [133]:
predictions = model_predictions(x_te, ws_SGD, pri_jet_num_idx, clean_features, parameters)

In [134]:
compute_accuracy(y_te,predictions)

Accuracy: 0.75512


## Least squares

In [131]:
ws_LS = []
for jet_num, standardized_dset in enumerate(standardized_dsets):
    loss, w = least_squares(ybs[jet_num],standardized_dset)
    ws_LS.append(w)

In [132]:
predictions = model_predictions(x_te, ws_LS, pri_jet_num_idx, clean_features, parameters)

In [133]:
compute_accuracy(y_te,predictions)

Accuracy: 0.79222


Second score, after replacing -999 in DER_mass_MMC by the defined mean.

In [69]:
compute_accuracy(y_te,predictions)

Accuracy: 0.75986


Third score, after replacing -999 in DER_mass_MMC by the defined median.

In [50]:
compute_accuracy(y_te,predictions)

Accuracy: 0.7598


Handle outliers

In [105]:
compute_accuracy(y_te,predictions)

Accuracy: 0.76336


In [90]:
ws_LS = []
for jet_num, extended_dset in enumerate(extended_dsets):
    loss, w = least_squares(ybs[jet_num],extended_dset)
    ws_LS.append(w)

In [91]:
predictions = model_predictions(x_te, ws_LS, pri_jet_num_idx, clean_features, parameters)
compute_accuracy(y_te, predictions)

Accuracy: 0.76336


## Ridge regression

In [61]:
lambdas = []
for jet_num, standardized_dset in enumerate(standardized_dsets):
    optimal_lambda = ridge_optimal_lambda(ybs[jet_num], standardized_dset)
    lambdas.append(optimal_lambda)

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Ite

In [62]:
lambdas

[1.0, 1.0, 1.0]

In [63]:
ws_RR = []
for jet_num, standardized_dset in enumerate(standardized_dsets):
    w_RR = ridge_regression(ybs[jet_num],standardized_dset,lambdas[jet_num])
    ws_RR.append(w_RR)

In [64]:
predictions = model_predictions(x_te, ws_RR, pri_jet_num_idx, clean_features, parameters)

In [65]:
compute_accuracy(y_te,predictions)

Accuracy: 0.6776


In [58]:
save_results(ws_RR, clean_features, parameters)

Second score, after replacing -999 in DER_mass_MMC by the defined mean.

In [140]:
compute_accuracy(y_te,predictions)

Accuracy: 0.75988


Third score, after replacing -999 in DER_mass_MMC by the defined median.

In [152]:
compute_accuracy(y_te,predictions)

Accuracy: 0.75992


## Logistic regression

In [134]:
ws_LR = []
for jet_num, standardized_dset in enumerate(standardized_dsets):
    initial_w = np.zeros(standardized_dset.shape[1])
    max_iters = 2000
    gamma = 0.7 # 0.01
    y_logistic = np.ones(ybs[jet_num].size)
    y_logistic[ybs[jet_num] == -1] = 0
    loss, w_LR = logistic_regression(y_logistic, standardized_dset, initial_w, max_iters, gamma)
    ws_LR.append(w_LR)

Current iteration=0, loss=0.5836174146430618


  loss = (-y * np.log(pred) - (1 - y) * np.log(1 - pred)).mean()


Current iteration=100, loss=nan
Current iteration=200, loss=nan
Current iteration=300, loss=nan
Current iteration=400, loss=nan


  loss = (-y * np.log(pred) - (1 - y) * np.log(1 - pred)).mean()


Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan
Current iteration=800, loss=nan
Current iteration=900, loss=nan
Current iteration=1000, loss=nan
Current iteration=1100, loss=nan
Current iteration=1200, loss=nan
Current iteration=1300, loss=nan
Current iteration=1400, loss=nan
Current iteration=1500, loss=nan
Current iteration=1600, loss=nan
Current iteration=1700, loss=nan
Current iteration=1800, loss=nan
Current iteration=1900, loss=nan
Current iteration=0, loss=0.6214271156124798
Current iteration=100, loss=0.4975096661324537
Current iteration=200, loss=0.4886285318645617
Current iteration=300, loss=0.48404018991788084
Current iteration=400, loss=0.48125344293360034
Current iteration=500, loss=0.47939747174224556
Current iteration=600, loss=0.47808464272880336
Current iteration=700, loss=0.4771075602050127
Current iteration=800, loss=0.4763478781915445
Current iteration=900, loss=0.4757347272316412
Current iteration=1000, loss=0.4752239

In [135]:
predictions = model_predictions(x_te, ws_LR, pri_jet_num_idx, clean_features, parameters)

In [136]:
compute_accuracy(y_te,predictions)

Accuracy: 0.80266


In [175]:
save_results(ws_RR, clean_features, parameters)

Second score, after replacing -999 in DER_mass_MMC by the defined mean.

In [235]:
compute_accuracy(y_te,predictions)

Accuracy: 0.76258


Third score, after replacing -999 in DER_mass_MMC by the defined median.

In [225]:
compute_accuracy(y_te,predictions)

Accuracy: 0.76196


Handle outliers

In [114]:
compute_accuracy(y_te,predictions)

Accuracy: 0.76416


## Regularized logistic regression

In [39]:
lambdas = []
for jet_num, standardized_dset in enumerate(standardized_dsets):
    initial_w = np.zeros(standardized_dset.shape[1])
    max_iters = 2000
    gamma = 0.7 # 0.01
    y_logistic = np.ones(ybs[jet_num].size)
    y_logistic[ybs[jet_num] == -1] = 0
    optimal_lambda = logistic_optimal_lambda(y_logistic, standardized_dset, initial_w, max_iters, gamma)
    lambdas.append(optimal_lambda)

Iteration 0


  loss = (-y * np.log(pred) - (1 - y) * np.log(1 - pred)).mean()
  loss = (-y * np.log(pred) - (1 - y) * np.log(1 - pred)).mean()


Current iteration=0, loss=nan
Current iteration=100, loss=nan
Current iteration=200, loss=nan
Current iteration=300, loss=nan
Current iteration=400, loss=nan
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan
Current iteration=800, loss=nan


KeyboardInterrupt: 

In [151]:
lambdas

[0.0001, 0.0001, 0.00013738237958832623]

Lambdas when replacing -999 by mean

In [263]:
lambdas

[0.0001, 0.0001, 0.0001]

In [152]:
ws_RLR = []
for jet_num, standardized_dset in enumerate(standardized_dsets):
    initial_w = np.zeros(standardized_dset.shape[1])
    max_iters = 2000
    gamma = 0.7 # 0.01
    y_logistic = np.ones(ybs[jet_num].size)
    y_logistic[ybs[jet_num] == -1] = 0
    loss, w_RLR = reg_logistic_regression(y_logistic, standardized_dset, lambdas[jet_num], initial_w, max_iters, gamma)
    ws_RLR.append(w_RLR)

Current iteration=0, loss=0.5981457873646676
Current iteration=100, loss=0.40570366954781983
Current iteration=200, loss=0.39811158731651086
Current iteration=300, loss=0.3951233762054761
Current iteration=400, loss=0.3937249484158812
Current iteration=500, loss=0.39302899842308897
Current iteration=600, loss=0.3926703056127306
Current iteration=700, loss=0.39248120630136046
Current iteration=800, loss=0.3923800033816537
Current iteration=900, loss=0.39232526283559516
Current iteration=1000, loss=0.3922954253301242
Current iteration=1100, loss=0.3922790701887242
Current iteration=1200, loss=0.39227006822843535
Current iteration=1300, loss=0.3922650984133083
Current iteration=1400, loss=0.3922623484964606
Current iteration=1500, loss=0.3922608243656276
Current iteration=0, loss=0.6370622906477821
Current iteration=100, loss=0.5440962786181257
Current iteration=200, loss=0.5421053931684284
Current iteration=300, loss=0.5416118282423613
Current iteration=400, loss=0.54146090795682
Current

In [265]:
predictions = model_predictions(x_te, ws_RLR, pri_jet_num_idx, clean_features, parameters)

In [153]:
compute_accuracy(y_te,predictions)

Accuracy: 0.7599400000000001


Second score, after replacing -999 in DER_mass_MMC by the defined mean.

In [255]:
compute_accuracy(y_te,predictions)

Accuracy: 0.76004


Third score, after replacing -999 in DER_mass_MMC by the defined median.

In [267]:
compute_accuracy(y_te,predictions)

Accuracy: 0.7593799999999999
