In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

from proj1_helpers import *
from essential_functions import *


Load data
===

In [2]:
# Re-import of the helper module (the first cell already star-imports it).
# NOTE(review): this also re-binds any helper names that the later
# `essential_functions` star-import may have shadowed - confirm before removing.
from proj1_helpers import *

# Read the train and test CSV files without sub-sampling.
labels, raw_data, indices = load_csv_data('train.csv', sub_sample=False)
labels_te, raw_data_te, indices_te = load_csv_data('test.csv', sub_sample=False)

# Pass both raw feature matrices through `meaningless_to_nan`
# (presumably marks placeholder values as NaN - TODO confirm in the helper).
nan_data = meaningless_to_nan(raw_data)
nan_data_te = meaningless_to_nan(raw_data_te)


Estimate missing feature values
===

In [3]:
# Run column estimation on both the train and the test features.
# The tuple is evaluated left to right, so the train call still runs first.
# NOTE(review): the test call receives only the test matrix - confirm that
# estimating it independently of the train set is intended.
estimated_data, estimated_data_te = (
    column_estimation(nan_data),
    column_estimation(nan_data_te),
)

Columns containing NaN [ 0  4  5  6 12 23 24 25 26 27 28]
Columns containing NaN [ 0  4  5  6 12 23 24 25 26 27 28]


In [4]:
# Standardize the train features and keep the two statistics returned with
# them (presumably the per-column mean and standard deviation, going by the
# names - TODO confirm in the helper).
train_standardization = standardize_train(estimated_data)
std_data, mean_train, std_train = train_standardization

# Standardize the test features with the statistics obtained from train.
std_data_te = standardize_test(estimated_data_te, mean_train, std_train)

Find best parameters
===

In [5]:
# Hyper-parameter grid and cross-validation settings.
degrees = [11]
lambdas = np.logspace(-8, -10, 3)
k_fold = 10
seed = 42

k_idx = build_k_indices(labels, k_fold, seed)

# One entry per (degree, lambda) pair; every entry is overwritten in the loop.
grid_shape = (len(degrees), len(lambdas))
loss_te = np.ones(grid_shape)
scores = np.ones(grid_shape)

for degree_idx, degree in enumerate(degrees):
    for lambda_idx, lambda_ in enumerate(lambdas):
        grid_pos = (degree_idx, lambda_idx)
        # The first value returned by the cross-validation is discarded.
        _, loss_te[grid_pos], scores[grid_pos] = cross_validation_(
            labels, std_data, k_idx, k_fold, lambda_, degree
        )
        print('Degree:', degree, 'Lambda:', lambda_)
        print('Score:', scores[grid_pos])
        print('Loss:', loss_te[grid_pos])

Degree: 11 Lambda: 1e-08
Score: 0.81392
Loss: 0.09202800000000001
Degree: 11 Lambda: 1e-09
Score: 0.81396
Loss: 0.092016
Degree: 11 Lambda: 1e-10
Score: 0.81392
Loss: 0.09203800000000001


In [6]:
# Rank every (degree, lambda) pair by its score-to-loss ratio and keep the
# pair with the largest ratio.
ratio = scores / loss_te
best_HP_idx = np.unravel_index(ratio.argmax(), ratio.shape)
best_degree_idx, best_lambda_idx = best_HP_idx
best_degree = degrees[best_degree_idx]
best_lambda = lambdas[best_lambda_idx]

# Cross-validation results stored for the selected pair.
best_score = scores[best_HP_idx]
best_loss = loss_te[best_HP_idx]

print('Best degree:', best_degree, 'Best lambda:', best_lambda, 'Best score:', best_score, 'Best loss', best_loss)

(1, 3)
Best degree: 11 Best lambda: 1e-09 Best score: 0.81396 Best loss 0.092016


Prediction
===

In [7]:
# Fit the final ridge model on the train data with the selected degree and
# lambda; the second returned value is kept in `loss` but not used here.
poly_std_data = build_poly(std_data, best_degree)
ridge_fit = ridge_regression(labels, poly_std_data, best_lambda)
weights, loss = ridge_fit

# Expand the test features with the same degree and predict their labels.
poly_std_data_te = build_poly(std_data_te, best_degree)
y_pred_te = predict_labels(weights, poly_std_data_te)

# Write the test predictions to the submission file.
submission_path = 'csv_de_l_angoisse.csv'
create_csv_submission(indices_te, y_pred_te, submission_path)