In [300]:
import itertools
import random

import matplotlib.pyplot as plt
import numpy as np

from implementations import *
from proj1_helpers import create_csv_submission, load_csv_data, predict_labels_kaggle

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [301]:
# The two bound values handed to the data-loading, accuracy and prediction helpers
# (the notes at the end of the notebook refer to them as "y = -1,1").
LOWER_BOUND, UPPER_BOUND = -1, 1

# Import data

In [302]:
# Relative location of the competition CSV files.
DATA_FOLDER, DATA_TEST, DATA_TRAIN = "competition-data/", "test.csv", "train.csv"

# Load the training CSV (presumably labels, features and sample ids, going by the names).
y_train, x_train, ids_train = load_csv_data(
    DATA_FOLDER + DATA_TRAIN,
    LOWER_BOUND,
    UPPER_BOUND,
)

# The rest of the notebook works on the transposed feature matrix.
tx_train = np.transpose(x_train)

In [303]:
# Code to plot correlations between features.
# Disabled on purpose: the code is wrapped in a triple-quoted string, so nothing is
# plotted and the cell merely evaluates (and echoes) that string.
# If re-enabled, it would draw one scatter plot for every pair (i, j) with i < j among
# rows 0..29 of tx_train, i.e. one figure per pair of rows.
'''
import matplotlib.pyplot as plt
import numpy as np

for i in range(29):
    for j in range(i+1, 30):
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        x = tx_train[i]
        y = tx_train[j]
        plt.xlabel('Feature %d' % i)
        plt.ylabel('Feature %d' % j)
        ax.scatter(x, y)
        plt.show()
'''


"\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfor i in range(29):\n    for j in range(i+1, 30):\n        fig = plt.figure()\n        ax = fig.add_subplot(1, 1, 1)\n        x = tx_train[i]\n        y = tx_train[j]\n        plt.xlabel('Feature %d' % i)\n        plt.ylabel('Feature %d' % j)\n        ax.scatter(x, y)\n        plt.show()\n"

# Cleaning data

In [304]:
# Separate into 4 sets depending on the feature 22: 'PRI_jet_num' (categorical feature)
# tx_train was defined as x_train.T, so tx_train.T hands get_jet_masks the same
# orientation as the get_jet_masks(x_test) call used for the test set.
# The masks are computed before NaN replacement / standardization touch tx_train.
masks_jet_train = get_jet_masks(tx_train.T)

In [293]:
# Balance the data set to have the same number of 1 as -1 (downsample).
# Disabled on purpose: the call is wrapped in a triple-quoted string, so the training
# set is NOT balanced when the notebook is run top to bottom.
'''
tx_train, y_train = balance(tx_train.T, y_train, LOWER_BOUND, UPPER_BOUND)
tx_train.shape
'''

'\ntx_train, y_train = balance(tx_train.T, y_train, LOWER_BOUND, UPPER_BOUND)\ntx_train.shape\n'

In [305]:
# Replace remaining NaN values with the median of the column + standardize the values of all features
# NOTE: tx_train is rebound from its own previous value, so this cell is not idempotent:
# running it a second time feeds the already-processed matrix back into standardize()
# and overwrites mean_train / std_train, which the test-set cell reuses through
# standardize_predef().
tx_train = replace_nan_by_median(tx_train)
mean_train, std_train, tx_train = standardize(tx_train)

In [306]:
### TEST SET ###

# Mirror the training-set preparation on the test set.
y_test, x_test, ids_test = load_csv_data(
    DATA_FOLDER + DATA_TEST,
    LOWER_BOUND,
    UPPER_BOUND,
)

# Jet masks are taken from the features as loaded, before any NaN replacement.
masks_jet_test = get_jet_masks(x_test)

# NaN replacement on the transposed features, then standardization with the
# mean / std obtained from the TRAINING set.
tx_test = standardize_predef(
    replace_nan_by_median(np.transpose(x_test)),
    mean_train,
    std_train,
)

# Cross Validation

In [307]:
# Cross validate the different models one by one (change manually)
# model_nb picks which entry of masks_jet_train (i.e. which jet group) is cross-validated.
model_nb = 0

# Keep only the samples of the selected jet group. tx_train is indexed through its
# transpose and transposed back, so the subset keeps tx_train's orientation.
# These two globals are the data that cross_validation_demo() reads when it is called.
current_tx_train = tx_train.T[masks_jet_train[model_nb]].T
current_y_train = y_train[masks_jet_train[model_nb]]

# Those should be done if we choose not to do them before separating the data set
# (disabled: the block below is a string literal and is never executed)
'''
# Remove columns full of NaN
current_tx_train = current_tx_train[~np.all(np.isnan(current_tx_train), axis=1)]

# Remove columns without standard deviation at all
current_tx_train = current_tx_train[np.nanstd(current_tx_train, axis=1) != 0]

# Balance the data set
current_tx_train, current_y_train = balance(current_tx_train.T, current_y_train, LOWER_BOUND, UPPER_BOUND)

# Replace remaining NaN by median
current_tx_train = replace_nan_by_median(current_tx_train)

# Standardize features
mean_train, std_train, current_tx_train = standardize(current_tx_train)
'''

# Sanity check of the selected subset (the recorded run shows (30, 99913) and (99913, 1)).
print(current_tx_train.shape)
print(current_y_train.shape)

(30, 99913)
(99913, 1)


In [308]:
from implementations import build_k_indices
#from tqdm import tqdm_notebook
from matplotlib.pyplot import figure, show
from matplotlib.ticker import MaxNLocator
import matplotlib.patches as mpatches

def _plot_fold_accuracies(accs, acc_mean, lambda_, gamma, degree):
    """Plot the accuracy of every fold of one configuration together with their mean."""
    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    ax.plot(range(1, len(accs) + 1), accs, marker=".", color='b', label='accuracy')
    ax.axhline(y=acc_mean, color='r', label='mean')
    # Fold numbers are integers: keep the x ticks on whole numbers.
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.set_xlabel("k")
    ax.set_ylabel("accuracy")
    ax.set_title("k-fold accuracy for lambda=%.5f, gamma=%.2f, degree=%d" % (lambda_, gamma, degree))
    ax.legend(loc=2)
    ax.grid(True)
    plt.show()


def cross_validation_demo(y=None, tx=None, model="ridge_regression", seed=3, k_fold=10,
                          degrees=None, lambdas=None, gammas=(0,), max_iters=(50,),
                          batch_sizes=(64,), verbose=True, plot_folds=False):
    """Grid-search hyper-parameters with k-fold cross validation.

    Every combination of degree, max_iter, batch_size, gamma and lambda_ is evaluated
    on the same k folds; the fold accuracies and fold weights are averaged per
    combination. Called without arguments the function behaves exactly as before:
    it reads the notebook globals ``current_y_train`` / ``current_tx_train`` and uses
    the original grid.

    Parameters
    ----------
    y : labels handed to ``build_k_indices`` and ``cross_validation``.
        Defaults to the global ``current_y_train``.
    tx : feature matrix handed to ``build_poly_tx``; the transpose of the expanded
        matrix is what ``cross_validation`` receives. Defaults to the global
        ``current_tx_train``.
    model : model name forwarded to ``cross_validation``.
    seed : seed forwarded to ``build_k_indices``.
    k_fold : number of folds.
    degrees : iterable of polynomial degrees (default ``np.arange(5, 14, 1)``).
    lambdas : iterable of lambda_ values (default ``np.logspace(-5, -2, 10)``).
    gammas, max_iters, batch_sizes : iterables forwarded element-wise to
        ``cross_validation`` (``max_iter`` is cast to ``int`` first).
    verbose : print one "Finished: ..." line per evaluated combination.
    plot_folds : additionally plot the per-fold accuracies of every combination.

    Returns
    -------
    list of tuples ``(degree, max_iter, batch_size, gamma, lambda_, acc_mean, w_mean)``
    in evaluation order (degree outermost, lambda_ innermost).
    """
    if y is None:
        y = current_y_train
    if tx is None:
        tx = current_tx_train
    if degrees is None:
        degrees = np.arange(5, 14, 1)
    if lambdas is None:
        lambdas = np.logspace(-5, -2, 10)

    k_indices = build_k_indices(y, k_fold, seed)
    results = []
    for degree in degrees:
        # The polynomial expansion and the initial weights depend on the degree only,
        # so they are computed once per degree rather than once per combination.
        tx_train_poly = build_poly_tx(tx, degree)
        initial_w = init_w(tx_train_poly)

        # Same iteration order as the former nested loops: lambda_ varies fastest.
        for max_iter, batch_size, gamma, lambda_ in itertools.product(
                max_iters, batch_sizes, gammas, lambdas):
            accs = []
            ws = []
            for k in range(k_fold):
                w_tr, acc = cross_validation(y, tx_train_poly.T, initial_w,
                                             int(max_iter), k_indices, k, gamma, lambda_,
                                             LOWER_BOUND, UPPER_BOUND, model, batch_size)
                ws.append(w_tr)
                accs.append(acc)
            w_mean = np.mean(ws, axis=0)
            acc_mean = np.mean(accs)
            results.append((degree, max_iter, batch_size, gamma, lambda_, acc_mean, w_mean))

            if verbose:
                print("Finished: " + str((degree, max_iter, batch_size, gamma, lambda_, acc_mean)))
            if plot_folds:
                _plot_fold_accuracies(accs, acc_mean, lambda_, gamma, degree)

    return results

# Run the grid search. Each entry of `results` is a tuple
# (degree, max_iter, batch_size, gamma, lambda_, mean accuracy, mean weights).
results = cross_validation_demo()

Finished: (5, 50, 64, 0, 1.0000000000000001e-05, 0.83649284355920328)
Finished: (5, 50, 64, 0, 2.1544346900318823e-05, 0.8365428885997398)
Finished: (5, 50, 64, 0, 4.6415888336127818e-05, 0.83653287959163247)
Finished: (5, 50, 64, 0, 0.0001, 0.83629266339705732)
Finished: (5, 50, 64, 0, 0.00021544346900318823, 0.83629266339705732)
Finished: (5, 50, 64, 0, 0.00046415888336127773, 0.83615253728355532)
Finished: (5, 50, 64, 0, 0.001, 0.83615253728355532)
Finished: (5, 50, 64, 0, 0.0021544346900318821, 0.83578220398358527)
Finished: (5, 50, 64, 0, 0.0046415888336127772, 0.83521169052146915)
Finished: (5, 50, 64, 0, 0.01, 0.83472124912421175)
Finished: (6, 50, 64, 0, 1.0000000000000001e-05, 0.83623260934841359)
Finished: (6, 50, 64, 0, 2.1544346900318823e-05, 0.83766389750775683)
Finished: (6, 50, 64, 0, 4.6415888336127818e-05, 0.80448403563206894)
Finished: (6, 50, 64, 0, 0.0001, 0.83708337503753383)
Finished: (6, 50, 64, 0, 0.00021544346900318823, 0.8031528375537984)
Finished: (6, 50, 64,

In [310]:
# Rank the configurations in place by mean accuracy (tuple slot 5), best first.
results.sort(key=lambda entry: entry[5], reverse=True)
# Polynomial degrees (tuple slot 0) of the four best configurations.
[entry[0] for entry in results[:4]]

[9, 9, 8, 10]

# Separating into multiple models

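Each tuple below is (degree, max_iter, batch_size, gamma, lambda_, mean cross-validation accuracy), in the order printed by the cross-validation cell.

With balance before everything (+ nan_to_median + standardize):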

* For 0: Best is Ridge (9, 0, 0, 0, 0.00046415888336127773, 0.81200248756218907)
* For 1: Best is Ridge (11, 0, 0, 0, 0.001291549665014884, 0.79104143337066068)
* For 2: Best is Ridge (11, 50, 64, 0, 4.6415888336127818e-05, 0.84399274987053341)
* For 3: Best is Ridge (11, 50, 64, 0, 1.0000000000000001e-05, 0.80668918918918919)

With balance before everything (+ nan_to_median + standardize BUT AFTER SEPARATING DATA):

* For 0: Best is Ridge (8, 50, 64, 0, 0.001, 0.81080816006276968)
* For 1: Best is Ridge (13, 50, 64, 0, 0.001, 0.79271021291952359)
* For 2: Best is Ridge (12, 50, 64, 0, 0.0021544346900318821, 0.83351592615134906)
* For 3: Best is Ridge (10, 50, 64, 0, 0.001, 0.80564635958395248)

Without balance (+ nan_to_median + standardize):

* For 0: Best is Ridge (9, 50, 64, 0, 4.6415888336127818e-05, 0.84248823941547391)
* For 1: Best is Ridge (12, 50, 64, 0, 0.001, 0.80530049006964144)
* For 2: Best is Ridge (12, 50, 64, 0, 0.0021544346900318821, 0.83388921977367492)
* For 3: Best is Ridge (11, 50, 64, 0, 2.1544346900318823e-05, 0.82996389891696754)

Without balance (+ nan_to_median + standardize BUT AFTER SEPARATING DATA):

* For 0: Best is Ridge (8, 50, 64, 0, 2.1544346900318823e-05, 0.84242818536683006)
* For 1: Best is Ridge (12, 50, 64, 0, 0.0021544346900318821, 0.80531338663915408)
* For 2: Best is Ridge (12, 50, 64, 0, 0.00046415888336127773, 0.83325392098471307)
* For 3: Best is Ridge (12, 50, 64, 0, 0.0001, 0.8336642599277978)

In [317]:
# Code to use all models on the test_set and have the final prediction in y_pred

# One (degree, lambda) pair per jet group, indexed like masks_jet_train. The values are
# the "Without balance (+ nan_to_median + standardize)" winners from the notes above.
degrees = [9, 12, 12, 11]
lambdas = [4.6415888336127818e-05, 0.001, 0.0021544346900318821, 2.1544346900318823e-05]

# Final prediction in here (one slot per test row, filled group by group)
y_pred = np.zeros(x_test.shape[0])

# NOTE: the loop rebinds current_tx_train / current_y_train, the globals that
# cross_validation_demo() reads; after this cell they hold the LAST jet group.
for i in range(len(masks_jet_train)):
    # Samples of jet group i, for the training features, test features and training labels.
    current_tx_train = tx_train.T[masks_jet_train[i]].T
    current_tx_test = tx_test.T[masks_jet_test[i]].T
    current_y_train = y_train[masks_jet_train[i]]
    
    # Preprocess here if not preprocessed before separating
    # (disabled: the block below is a string literal and is never executed)
    '''
    # Remove columns full of NaN
    current_tx_train = current_tx_train[~np.all(np.isnan(current_tx_train), axis=1)]
    current_tx_test = current_tx_test[~np.all(np.isnan(current_tx_test), axis=1)]

    # Remove columns without standard deviation at all
    current_tx_train = current_tx_train[np.nanstd(current_tx_train, axis=1) != 0]
    current_tx_test = current_tx_test[np.nanstd(current_tx_test, axis=1) != 0]
    
    # Balance the data set
    current_tx_train, current_y_train = balance(current_tx_train.T, current_y_train, LOWER_BOUND, UPPER_BOUND)

    # Replace remaining NaN by median
    current_tx_train = replace_nan_by_median(current_tx_train)
    current_tx_test = replace_nan_by_median(current_tx_test)

    # Standardize features
    mean_train, std_train, current_tx_train = standardize(current_tx_train)
    current_tx_test = standardize_predef(current_tx_test, mean_train, std_train)
    
    '''
    
    # Build poly (same degree for the train and test subsets of this group)
    current_tx_poly_train = build_poly_tx(current_tx_train, degrees[i])
    current_tx_poly_test = build_poly_tx(current_tx_test, degrees[i])
    
    # Compute best method (current_loss is not used afterwards)
    current_w, current_loss = ridge_regression(current_y_train, current_tx_poly_train, lambdas[i])
    
    # Accuracy on the very samples the weights were fitted on: a training accuracy,
    # not a held-out estimate.
    acc = accuracy(current_y_train, current_tx_poly_train.T, current_w, LOWER_BOUND, UPPER_BOUND)
    print("Accuracy:", acc)
    
    # Predict this group's test rows and write them back into their slots of y_pred
    y_test_pred = predict_labels_kaggle(current_w, current_tx_poly_test.T, LOWER_BOUND, UPPER_BOUND)
    y_pred[masks_jet_test[i]] = y_test_pred.flatten()

# Class counts of the final prediction; rows never written by the loop would still be 0
# and show up in neither count.
print("Number of %d:" % UPPER_BOUND, np.count_nonzero(y_pred == UPPER_BOUND))
print("Number of %d:" % LOWER_BOUND, np.count_nonzero(y_pred == LOWER_BOUND))

Accuracy: 0.842863291063
Accuracy: 0.806909625503
Accuracy: 0.836658131364
Accuracy: 0.835814834867
Number of 1: 177264
Number of -1: 390974


# Submission

In [31]:
# Hand the test ids and the final predictions to the submission helper.
# NOTE(review): the file name says "test20", but the run log below describes Test 20 as
# balanced / standardized after separating with degrees [8, 13, 12, 10], while the
# prediction cell above uses degrees [9, 12, 12, 11] (the Test 18 settings). This cell's
# execution count (31) also predates the other cells (300+), so confirm that y_pred is
# the prediction you intend to submit before re-running.
create_csv_submission(ids_test, y_pred, "test20.csv")

Test n : algorithm / features / y / w
- - - - - - - - - - - - - - - - - - - 
Test 1 : least_squares / all features standardized / y = -1,1 / random init_w

Test 2 : least_squares / corr > 0.1 features standardized / y = -1,1 / random init_w

Test 3 : least_squares_GD(10000,0.5) / all features standardized / y = -1,1 / random init_w

Test 4 : least_squares_GD(10000,0.5) / all features standardized / y = -1,1 / random init_w / poly, degree=1

Test 5 : least_squares / all features standardized / y = -1,1 / random init_w / median + categorical

Test 6 : logistic_regression / all features standardized / y = 0,1 / random init_w / median + categorical + balanced

Test 7 : least_squares / all features standardized / y = 0,1 / random init_w

Test 8 : Test 1

Test 9 : Test 1

Test 10 : Test 1 / standardized test_set with mean and std from train_set

Test 11 : Test 1 / standardized test_set with mean and std from train_set / balance

Test 12 : Ridge regression / non-balanced / standardized test_set with mean and std from train_set / y=-1,1 / mean of 4 best lambdas for degree 11

Test 13 : Ridge regression / balanced / standardized test_set with mean and std from train_set / y=-1,1 / mean of 4 best lambdas for degree 11

Test 14 : Ridge regression / balanced before doing anything / standardized test_set with mean and std from train_set / y=-1,1 / mean of 4 best lambdas for degree 11

Test 15 : Ridge regression / Removed all rows containing at least a NaN / balanced before doing anything / standardized test_set with mean and std from train_set / y=-1,1 / mean of 4 best lambdas for degree 11 / Replaced NaN values in test_set by median in test_set

Test 16 : balanced before doing anything / standardized test_set with mean and std from train_set / y=-1,1 / Ensembling with: ("least_squares_GD", 1, 150, 0, 0.01, 0), ("least_squares_GD", 1, 50, 0, 0.25, 0), ("least_squares_SGD", 1, 30, 256, 0.2, 0), ("least_squares_SGD", 1, 60, 64, 0.1, 0), ("ridge_regression", 7, 0, 0, 0, 0.001), ("ridge_regression", 9, 0, 0, 0, 0.001), ("ridge_regression", 11, 0, 0, 0, 0.001)

Test 17 : balanced before doing anything / standardized test_set with mean and std from train_set / y=-1,1 / Separating into 4 models based on feature 22, using Ridge everytime with degrees = [9, 11, 11, 11] and lambdas = [0.00046415888336127773, 0.001291549665014884, 4.6415888336127818e-05, 1.0000000000000001e-05]

Test 18 : non-balanced / standardized test_set with mean and std from train_set / y=-1,1 / Separating into 4 models based on feature 22, using Ridge everytime with degrees = [9, 12, 12, 11] and lambdas = [4.6415888336127818e-05, 0.001, 0.0021544346900318821, 2.1544346900318823e-05]

Test 19 : non-balanced / standardized test_set with mean and std from train_set BUT AFTER SEPARATING / y=-1,1 / Separating into 4 models based on feature 22, using Ridge everytime with degrees = [8, 12, 12, 11] and lambdas = [2.1544346900318823e-05, 0.0021544346900318821, 0.00046415888336127773, 0.0001]

Test 20 : balanced / standardized test_set with mean and std from train_set BUT AFTER SEPARATING / y=-1,1 / Separating into 4 models based on feature 22, using Ridge everytime with degrees = [8, 13, 12, 10] and lambdas = [0.001, 0.001, 0.0021544346900318821, 0.001]

## Further work

- balance output (batch numpy)
- median and category
- feature engineering: interaction features
- logistic regression 