In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Import the data

In [2]:
from utilities.proj1_helpers import *

In [3]:
# Read the training CSV and unpack the first three fields returned by the helper
train_data = load_csv_data("data/train.csv")
y_train = train_data[0]
x_train = train_data[1]
ids_train = train_data[2]

# N rows (samples) and D columns (features) of the training matrix
N, D = x_train.shape
print(y_train.shape, x_train.shape, ids_train.shape)

(250000,) (250000, 30) (250000,)


In [4]:
# Read the test CSV and unpack the first three fields returned by the helper
test_data = load_csv_data("data/test.csv")
y_test = test_data[0]
x_test = test_data[1]
ids_test = test_data[2]
print(y_test.shape, x_test.shape, ids_test.shape)

(568238,) (568238, 30) (568238,)


In [5]:
from utilities.implementations import *
from utilities.cross_validation import cross_validation, build_k_indices, split_data
#from utilities.pca import compute_pca
from utilities.preprocessing import standard_scaler

# Pre-process it

## Take care of missing values
Too many samples have at least one missing value (around 73%) to simply drop them, so we replace each missing value with the mean of its feature (which is 0 after normalization).

In [6]:
# Count the number of samples (rows) that contain at least one missing value.
# A missing value is encoded as -999.0. The row-wise membership test is done
# with a single vectorized comparison instead of a Python loop over every row.
rows_with_missing = np.any(x_train == -999.0, axis=1)
# Cast to a plain int so the printed ratio is formatted as with the loop counter
count = int(rows_with_missing.sum())

print(count, "/", len(x_train), "=", count/len(x_train)*100, "% of samples with missing values")

181886 / 250000 = 72.7544 % of samples with missing values


In [7]:
# Standardize each feature using statistics computed ONLY from its observed
# entries: values equal to -999.0 mark missing data and are left out.

# Boolean mask: True where a value is present, False where it is missing
x_train_mask = (x_train != -999.0)

x_train_mean = np.zeros(D)
x_train_std  = np.zeros(D)

# Per-feature mean/std over the observed entries of that column
for col in range(D):
    observed_values = x_train[:, col][x_train_mask[:, col]]
    x_train_mean[col] = np.mean(observed_values)
    x_train_std[col]  = np.std(observed_values)

# Center and scale the whole matrix with the per-feature statistics
x = (x_train - x_train_mean) / x_train_std
# Missing entries are overwritten with 0, i.e. the feature mean after centering
x[~x_train_mask] = 0.

# Kernel

# Learn a model

## Split the data
Split the data into a training set and a validation set. We first learn the model on the training set, and then evaluate it on the validation set.

The ratio gives the fraction of the data used for training (ratio = 0.8 means 80% for training and 20% for validation).

In [55]:
# Fraction of the samples sent to the training split, and the seed handed to split_data
ratio = 0.66
seed = 1

# The standardized features `x` are split here; to train on another
# representation (e.g. a PCA projection), pass that matrix instead of `x`.
x_tr, y_tr, x_te, y_te = split_data(x, y_train, ratio, seed)

# Report both shapes and the effective share of samples used for training
n_tr, n_te = x_tr.shape[0], x_te.shape[0]
print(x_tr.shape, x_te.shape, "\n", n_tr / (n_tr+n_te)*100, "% of training")

(165000, 30) (85000, 30) 
 66.0 % of training


## Weight and loss
Compute here the weight and the resulting loss of the chosen model.

In [56]:
## Logistic regression

# Design matrix: prepend a column of ones so the first weight acts as the bias
tx_tr = np.c_[np.ones(x_tr.shape[0]), x_tr]

# Remap the labels from {-1, +1} to {0, +1} for the logistic loss
y_tr_log = (y_tr + 1) / 2

# Settings handed to logistic_regression: iteration budget, step size, zero start
max_iters = 500
gamma     = 1E-5
initial_w = np.zeros(tx_tr.shape[1])

w, loss_tr = logistic_regression(y_tr_log, tx_tr, initial_w, max_iters, gamma)
print("Train logistic loss =", loss_tr)

# Accuracy against the original {-1, +1} labels.
# Assumes predict_labels also returns {-1, +1}, so each mistake adds 2 to the
# absolute difference -- TODO confirm against the helper.
y_tr_pred = predict_labels(w, tx_tr)
n_errors_tr = np.abs(y_tr - y_tr_pred).sum() / 2.
accuracy_tr = 1. - n_errors_tr / y_tr.shape[0]
print("Train Accuracy =", accuracy_tr)

Stochastic Gradient Descent(0/499): loss=114369.28479239097, w0=-0.26081000000000004, w1=0.008210225040278137
Stochastic Gradient Descent(100/499): loss=82538.99428367298, w0=-0.8819560951008333, w1=-0.01860962796328153
Stochastic Gradient Descent(200/499): loss=82390.58330193603, w0=-0.8959900131139303, w1=0.0561188624653027
Stochastic Gradient Descent(300/499): loss=82371.59638280456, w0=-0.8997458265699114, w1=0.08019818470151518
Stochastic Gradient Descent(400/499): loss=82367.77577816414, w0=-0.9007023034194223, w1=0.08775054966160964
Train logistic loss = 82366.7641886
Train Accuracy = 0.750139393939


In [39]:
## Least-squares

# NOTE(review): this cell rebinds `w`, `tx_tr`, `loss_tr`, `y_tr_pred` and
# `accuracy_tr`, the same names the logistic-regression cell sets. Any later
# cell reading `w` therefore uses the weights of whichever model cell ran last.

# Design matrix: prepend a column of ones so the first weight acts as the bias
n_samples_tr = x_tr.shape[0]
tx_tr = np.c_[np.ones(n_samples_tr), x_tr]

# Fit directly on the original {-1, +1} labels
w, loss_tr = least_squares(y_tr, tx_tr)
print("Train MSE loss =", loss_tr)

# Accuracy against the {-1, +1} labels.
# Assumes predict_labels also returns {-1, +1}, so each mistake adds 2 to the
# absolute difference -- TODO confirm against the helper.
y_tr_pred = predict_labels(w, tx_tr)
n_errors_tr = np.abs(y_tr - y_tr_pred).sum() / 2.
accuracy_tr = 1. - n_errors_tr / y_tr.shape[0]
print("Train Accuracy =", accuracy_tr)

Train MSE loss = 0.360422351629
Train Accuracy = 0.721884848485


## Test the model
Test the learned model on the validation set.

In [54]:
# Remap the validation labels from {-1, +1} to {0, +1} for the logistic loss
y_te_log = (y_te + 1) / 2

# Validation design matrix: prepend a column of ones for the bias
tx_te = np.c_[np.ones(x_te.shape[0]), x_te]

# Logistic loss of the current weights `w` on the validation split
loss_te = compute_logistic_loss(y_te_log, tx_te, w)
print("Test logistic loss =", loss_te)

# Accuracy against the original {-1, +1} labels.
# Assumes predict_labels also returns {-1, +1}, so each mistake adds 2 to the
# absolute difference -- TODO confirm against the helper.
y_te_pred = predict_labels(w, tx_te)
n_errors_te = np.abs(y_te - y_te_pred).sum() / 2.
accuracy_te = 1. - n_errors_te / y_te.shape[0]
print("Test Accuracy =", accuracy_te)

Test logistic loss = 42503.7955583
Test Accuracy = 0.750505882353


# Predict the Kaggle labels
Predict the labels of the test set, then create a submission file to be posted on Kaggle.

## Send the test set into the same space as the train set
In order to use the same weights, we need to map the test data into the same space as the training data, i.e. apply the same transformations (normalization, centering, PCA, etc.).

In [35]:
# Apply to the Kaggle test set the same preprocessing as the training set.

# Boolean mask with False at missing values (encoded as -999.0)
x_test_mask = (x_test != -999.0)

# Normalize and center with the TRAIN statistics, so that the learned weights
# see test features on the same scale as during training
x_kaggle = (x_test - x_train_mean) / x_train_std
# Set the missing values to 0, i.e. the train mean after centering
x_kaggle[~x_test_mask] = 0.

# NOTE: the former PCA projection `x_kaggle.dot(eigenvectors)` was removed.
# `eigenvectors` is never defined in this notebook (the PCA import is commented
# out), so that line raised a NameError on a fresh kernel, and its result
# `x_kaggle_pc` was not used by the prediction below.

# Add a "bias" column of ones to the input
tx_kaggle = np.c_[np.ones(x_kaggle.shape[0]), x_kaggle]

## Predict the labels, and create a submission

In [36]:
# Label every Kaggle test sample with the current weights `w`
y_pred = predict_labels(w, tx_kaggle)

# Write the submission file to be uploaded to the Kaggle competition
submission_path = "nico_log_subm.csv"
create_csv_submission(ids_test, y_pred, submission_path)