In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

from proj1_helpers import *
from Hugues import *


Load data
===

In [2]:
from proj1_helpers import *

# Stage 1: read the raw train and test CSV files in full (no sub-sampling).
labels, raw_data, indices = load_csv_data('train.csv', sub_sample=False)
labels_te, raw_data_te, indices_te = load_csv_data('test.csv', sub_sample=False)

# Stage 2: run both raw matrices through `meaningless_to_nan`
# (presumably replaces placeholder values with NaN — confirm in the helper).
nan_data = meaningless_to_nan(raw_data)
nan_data_te = meaningless_to_nan(raw_data_te)


Estimate missing feature values
===

In [3]:
# Estimation of the columns of both the train and the test set
# (`column_estimation` reports which columns contain NaN when it runs).
#
# The settings below have cell-specific names on purpose: the model-selection
# cell further down defines its own `lambdas` / `k_fold` grid, and the cell
# that picks the best hyper-parameters indexes into that grid. Keeping the
# names distinct means re-running this cell cannot silently replace the grid.
estimation_lambdas = np.logspace(-4, 0, 5)  # 1e-4, 1e-3, 1e-2, 1e-1, 1
estimation_k_fold = 4

# NOTE(review): the test set is estimated from its own values, independently
# of the train set — confirm this is the intended behaviour.
estimated_data = column_estimation(nan_data, estimation_lambdas, estimation_k_fold)
estimated_data_te = column_estimation(nan_data_te, estimation_lambdas, estimation_k_fold)

Columns containing NaN [ 0  4  5  6 12 23 24 25 26 27 28]
Columns containing NaN [ 0  4  5  6 12 23 24 25 26 27 28]


Find best parameters
===

In [5]:
# Search grid (polynomial degree x ridge lambda) and cross-validation settings
degrees = range(15)
lambdas = np.logspace(-1, -10, 10)
k_fold = 10
seed = 42

k_idx = build_k_indices(labels, k_fold, seed)

# One slot per (degree, lambda) pair; every slot is overwritten by the sweep below.
grid_shape = (len(degrees), len(lambdas))
loss_te = np.ones(grid_shape)
scores = np.ones(grid_shape)

for degree_idx, degree in enumerate(degrees):
    for lambda_idx, lambda_ in enumerate(lambdas):
        pos = (degree_idx, lambda_idx)
        # The first value returned by `cross_validation_` is not used here.
        _, loss_te[pos], scores[pos] = cross_validation_(
            labels, estimated_data, k_idx, k_fold, lambda_, degree
        )
        print('Degree:', degree, 'Lambda:', lambda_)
        print('Score:', scores[pos])
        print('Loss:', loss_te[pos])

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Degree: 0 Lambda: 0.1
Score: 0.65752
Loss: 0.5
Degree: 0 Lambda: 0.01
Score: 0.65752
Loss: 0.5
Degree: 0 Lambda: 0.001
Score: 0.65752
Loss: 0.5
Degree: 0 Lambda: 0.0001
Score: 0.65752
Loss: 0.5
Degree: 0 Lambda: 1e-05
Score: 0.65752
Loss: 0.5
Degree: 0 Lambda: 1e-06
Score: 0.65752
Loss: 0.5
Degree: 0 Lambda: 1e-07
Score: 0.65752
Loss: 0.5
Degree: 0 Lambda: 1e-08
Score: 0.65752
Loss: 0.5
Degree: 0 Lambda: 1e-09
Score: 0.65752
Loss: 0.5
Degree: 0 Lambda: 1e-10
Score: 0.65752
Loss: 0.5
Degree: 1 Lambda: 0.1
Score: 0.7318
Loss: 0.5547739999999999
Degree: 1 Lambda: 0.01
Score: 0.74076
Loss: 0.5447379999999999
Degree: 1 Lambda: 0.001
Score: 0.74668
Loss: 0.538098
Degree: 1 Lambda: 0.0001
Score: 0.74728
Loss: 0.53725
Degree: 1 Lambda: 1e-05
Score: 0.74756
Loss: 0.537084
Degree: 1 Lambda: 1e-06
Score: 0.74764
Loss: 0.537036
Degree: 1 Lambda: 1e-07
Score: 0.74764
Loss: 0.5370820000000001
Degree: 1 Lambda: 1e-08
Score: 0.7476
Loss: 0.53712
Degree: 1 Lambda: 1e-09
Score: 0.74756
Loss: 0.537106000

In [6]:
# Pick the (degree, lambda) pair that maximises the score-to-loss ratio.
# NOTE(review): score/loss is a heuristic selection criterion — confirm it is
# preferred over selecting on the score (or the loss) alone.
ratio = scores / loss_te
print(ratio.shape)

# `nanargmax` rather than `argmax`: with `argmax`, a single NaN entry in the
# grid would be returned as the "best" position. NaN entries are skipped here;
# a grid made only of NaN raises ValueError instead of yielding a bogus pick.
best_HP_idx = np.unravel_index(np.nanargmax(ratio), ratio.shape)
best_degree = degrees[best_HP_idx[0]]
best_lambda = lambdas[best_HP_idx[1]]
best_score = scores[best_HP_idx]
best_loss = loss_te[best_HP_idx]

print('Best degree:', best_degree, 'Best lambda:', best_lambda, 'Best score:', best_score, 'Best loss', best_loss)

(15, 10)
Best degree: 10 Best lambda: 1e-05 Best score: 0.80968 Best loss 0.49618400000000007


Prediction
===

In [8]:
# Polynomial expansion of the train and test features with the selected degree
poly_estimated_data = build_poly(estimated_data, best_degree)
poly_estimated_data_te = build_poly(estimated_data_te, best_degree)

# Fit ridge regression on the training data with the selected lambda
# (`loss` is returned by the helper but not used afterwards in this cell)
weights, loss = ridge_regression(labels, poly_estimated_data, best_lambda)

# Predict the test labels and write them to the submission file
y_pred_te = predict_labels(weights, poly_estimated_data_te)
create_csv_submission(indices_te, y_pred_te, 'csv_de_l_angoisse.csv')