In [18]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

from used_functions import *
from functions import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Load data
===

In [3]:
from proj1_helpers import *

labels, raw_data, indices = load_csv_data('train.csv', sub_sample=False)
nan_data = meaningless_to_nan(raw_data)

labels_te, raw_data_te, indices_te = load_csv_data('test.csv', sub_sample=False)
nan_data_te = meaningless_to_nan(raw_data_te)


In [5]:
#Train
n_samples, n_features = np.shape(nan_data)
nan_columns = nan_find_columns(n_features, n_samples, nan_data)

#Test
n_samples_te, n_features_te = np.shape(raw_data_te)
nan_columns_te = nan_find_columns(n_features_te, n_samples_te, nan_data_te)

[ 0  4  5  6 12 23 24 25 26 27 28]
[ 0  4  5  6 12 23 24 25 26 27 28]


Estimate whole features
===

In [6]:
# Estimation of columns for train set
new_submatrix = np.delete(nan_data, nan_columns, axis = 1)
k_fold = 4
seed = 2
lambdas = np.logspace(-5,-1,5)
loss_te = np.ones(len(lambdas))
loss_tr = np.ones(len(lambdas))

for chosen_feature in nan_columns:
    samples = []
    for sample in range(n_samples):
        if np.isnan(nan_data[sample,chosen_feature]):
            samples.append(sample)
    nan_lines = samples
    new_submatrix0 = np.delete(new_submatrix,nan_lines, axis = 0)
    labels_0 = np.delete(nan_data[:,chosen_feature],nan_lines, axis = 0)
    k_indices = build_k_indices(labels_0, k_fold, seed)
    for idx, lambda_ in enumerate(lambdas):
        loss_tr[idx],loss_te[idx] = cross_validation(labels_0, new_submatrix0, k_indices, k_fold, lambda_, degree=1)
    best_lambda = lambdas[np.argmin(loss_te)]
    print((best_lambda))
    weights, loss = ridge_regression(labels_0, new_submatrix0, best_lambda)
    x_pred = np.dot(new_submatrix[nan_lines,:], weights)
    nan_data[nan_lines,chosen_feature] = x_pred
    #print(x_pred.shape)
    
estimated_data = nan_data

1e-05
0.0001
0.0001
0.0001
0.0001
1e-05
0.001
0.01
0.001
0.01
0.01


In [7]:
# Estimation of columns for test set
new_submatrix_te = np.delete(nan_data_te,nan_columns, axis = 1)
loss_te_ = np.ones(len(lambdas))
loss_tr_ = np.ones(len(lambdas))
for chosen_feature in nan_columns:
    samples_te = []
    for sample in range(n_samples_te):
        if np.isnan(nan_data_te[sample,chosen_feature]):
            samples_te.append(sample)
    nan_lines_te = samples_te
    new_submatrix0_te = np.delete(new_submatrix_te,nan_lines_te, axis = 0)
    labels_0_te = np.delete(nan_data_te[:,chosen_feature],nan_lines_te, axis = 0)
    k_indices_te = build_k_indices(labels_0_te, k_fold, seed)
    for idx, lambda_ in enumerate(lambdas):
        loss_tr_[idx],loss_te_[idx] = cross_validation(labels_0_te, new_submatrix0_te, k_indices_te, k_fold, lambda_, degree=1)
    best_lambda_te = lambdas[np.argmin(loss_te)]
    print((best_lambda_te))
    weights_te, loss_te = ridge_regression(labels_0, new_submatrix0, best_lambda)
    x_pred_te = np.dot(new_submatrix_te[nan_lines_te,:], weights_te)
    nan_data_te[nan_lines_te,chosen_feature] = x_pred_te
    #print(x_pred.shape)
estimated_data_te = nan_data_te

0.01
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05


Find best parameters
===

In [37]:
#Parameters
degrees = [9,10,11]
lambdas = np.logspace(-9, -5, 10)
k_fold = 4
seed = 27

k_idx = build_k_indices(labels, k_fold, seed)
#loss_tr = np.ones((len(degrees),len(lambdas)))
loss_te = np.ones((len(degrees),len(lambdas)))
scores = np.ones((len(degrees),len(lambdas)))
class_scores = np.ones((len(degrees),len(lambdas)))
#labels = np.expand_dims(labels, axis=1) 

for degree_idx, degree in enumerate(degrees):
    for lambda_idx, lambda_ in enumerate(lambdas):
        _ ,loss_te[degree_idx, lambda_idx], scores[degree_idx, lambda_idx], class_scores[degree_idx, lambda_idx] = cross_validation_(labels, estimated_data, k_idx, k_fold, lambda_, degree)
        print('Degree:', degrees[degree_idx], 'Lambda:', lambdas[lambda_idx])
        print('Score:', scores[degree_idx, lambda_idx])
        print('Class score:', class_scores[degree_idx, lambda_idx])
        
#Error        
best_HP_idx = np.unravel_index(np.argmax(scores), np.shape(scores))
best_degree = degrees[best_HP_idx[0]]
best_lambda = lambdas[best_HP_idx[1]]
best_score = scores[best_HP_idx[0], best_HP_idx[1]]
#Class error
best_HP_idx_class = np.unravel_index(np.argmax(class_scores), np.shape(class_scores))
best_degree_class = degrees[best_HP_idx_class[0]]
best_lambda_class = lambdas[best_HP_idx_class[1]]
best_class_score = class_scores[best_HP_idx_class[0], best_HP_idx_class[1]]


print('Best degree:', best_degree, 'Best lambda:', best_lambda, 'Best score:', best_score, 'Best class score:', best_class_score)


KeyboardInterrupt: 

Prediction
===

In [36]:
# Train model to get weights
poly_estimated_data = build_poly(estimated_data, best_degree_class)
weights, loss = ridge_regression(labels, poly_estimated_data, best_lambda_class)

#Predict on test
poly_estimated_data_te = build_poly(estimated_data_te, best_degree_class)
y_pred_te = predict_labels(weights, poly_estimated_data_te)

print(np.shape(y_pred_te), np.shape(indices_te))
create_csv_submission(indices_te, y_pred_te, 'submission_features_augmentation_ridgereg_2.csv')

(568238,) (568238,)
