In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# imports
import numpy as np 
import matplotlib.pyplot as plt
from implementations import *
from costs import *
from optimize_hyperparams import *
from cross_validation import *
from step_wise import *

# Build training dataset

In [None]:
from proj1_helpers import load_csv_data 

# Read the full training file (sub_sample=False); the call yields the raw labels,
# the raw input matrix and the sample ids, which the following cells consume.
y_raw, input_data_raw, ids = load_csv_data('train.csv', sub_sample=False)

### Handle outliers and recode labels:

In [None]:
from outliers import handle_outliers

# Treat -999 as the missing-value marker and let handle_outliers replace those
# entries using the 'mean' strategy (mean taken from the rest of the sample).
X_raw, y = handle_outliers(input_data_raw, y_raw, -999, 'mean')

# Recode the labels from {-1, 1} to {0, 1}: every -1 becomes 0.
y[y == -1] = 0

### Features generation:

In [None]:
# Read only the header row of the training file and drop its first two entries,
# keeping the remaining column names as the feature names.
all_features_raw = list(
    np.genfromtxt('train.csv', dtype=str, delimiter=",", max_rows=1)[2:]
)

In [None]:
from extend_features import extend_features

# Build the candidate feature set from the cleaned inputs; log transformations
# are requested through is_add_log=True and the degree is kept at 1.
all_candidates, features = extend_features(
    X_raw,
    all_features_raw,
    degree=1,
    is_add_log=True,
)

# Show the dimensions of the extended candidate matrix.
print(all_candidates.shape)

# Training (time demanding - see below to only load the trained weights) 

### Selection of features from stepwise results:

In [None]:
# Column indices ranked by the stepwise selection with logistic regression;
# only the 17 best-ranked ones are kept to limit the number of features.
indx = [
    1, 13, 4, 46, 0, 11, 44, 43, 7, 2, 16, 48, 10, 6, 49, 22, 45,
    12, 19, 23, 32, 24, 17, 14, 39, 42, 30, 31, 47, 38, 20,
][:17]

# Training design matrix restricted to the selected columns.
X = all_candidates[:, indx]

### Raising features up to the tuned degree:

In [None]:
# Polynomial degree retained after the degree optimization, which used cross
# validation over degrees in [1, 10].
degree_opt = 5

# Expand the selected training features into a polynomial basis of that degree.
phi = build_poly(X, degree_opt)

### Standardize:

In [None]:
# Standardize every column except the first one and write the result back into phi.
# (presumably column 0 is the constant term produced by build_poly — TODO confirm)
# NOTE(review): the two extra values returned by standardize are discarded here,
# so they are not available to later cells.
phi_tmp,_,_ =  standardize(phi[:,1:]) 
phi[:,1:] = phi_tmp

### Compute the model (or skip this cell and load the pre-trained weights in the next section):

In [None]:
# Optimisation settings (tuned manually to ensure convergence).
initial_w = np.zeros(phi.shape[1])  # one zero weight per column of phi
max_iters = 10000
gamma = 1e-5
# NOTE(review): threshold is not passed to logistic_regression below — confirm
# whether it is still needed.
threshold = 1e-3

# Fit the logistic regression model on the training basis.
# The prediction cell further down uses w_opt loaded from disk, not w_tot.
w_tot, loss_tot = logistic_regression(y, phi, initial_w, max_iters, gamma)

print(loss_tot)

# Load the optimal trained weights 

In [None]:
# Load the previously saved weight vector from 'w_opt_lr.npy'; the prediction
# cell uses w_opt, so the training cell above does not have to be re-run.
w_opt = np.load('w_opt_lr.npy')

# Build testing dataset

### loading test dataset

In [None]:
from proj1_helpers import load_csv_data 

# Read the full test file (sub_sample=False): labels, raw inputs and sample ids.
y_raw_te, input_data_raw_te, ids_te = load_csv_data('test.csv', sub_sample=False)

# Apply the same outlier handling as for the training set: -999 entries are
# replaced using the 'mean' strategy. The second returned value is not needed
# here and is dropped.
input_data_raw_te, _ = handle_outliers(input_data_raw_te, y_raw_te, -999, 'mean')

### Features generation:

In [None]:
# Read only the header row of the test file and drop its first two entries,
# keeping the remaining column names as the feature names.
# This rebinds all_features_raw, which previously held the training header.
all_features_raw = list(
    np.genfromtxt('test.csv', dtype=str, delimiter=",", max_rows=1)[2:]
)

In [None]:
from extend_features import extend_features

# Degree handed to extend_features (same value, 1, as used for the training set).
degree = 1

# Build the extended test feature matrix with log transformations enabled;
# the second returned value (bound to `features` in the training cell) is dropped.
X_te, _ = extend_features(input_data_raw_te, all_features_raw, degree, is_add_log = True)

In [None]:
# Same ranked column indices as for the training set (stepwise selection with
# logistic regression), truncated to the 17 best-ranked ones.
indx = [
    1, 13, 4, 46, 0, 11, 44, 43, 7, 2, 16, 48, 10, 6, 49, 22, 45,
    12, 19, 23, 32, 24, 17, 14, 39, 42, 30, 31, 47, 38, 20,
][:17]

# Restrict the test matrix to the selected columns, printing its shape before
# and after the selection.
print(X_te.shape)
X_te = X_te[:, indx]
print(X_te.shape)

In [None]:
# Expand the selected test features into a polynomial basis, using the same
# degree (5) as for the training set. This rebinds phi, which previously held
# the training basis.
degree_opt = 5
phi = build_poly(X_te, degree_opt)

# Standardize every column except the first one and write the result back into phi.
# NOTE(review): standardize is called on the test columns alone and the two
# extra values it returns are discarded, so the test features appear to be
# scaled with their own statistics rather than those of the training set —
# confirm this matches how w_opt was trained.
phi_tmp,_,_ =  standardize(phi[:,1:]) 
phi[:,1:] = phi_tmp

# Predict labels in testing dataset

In [None]:
from proj1_helpers import predict_labels_log

# Predict the test labels from the loaded weights w_opt and the test basis phi.
# The submission cell below treats 0 as one of the predicted label values.
y_pred = predict_labels_log(w_opt,phi)

# Create a submission csv file

In [None]:
# Map the 0 labels back to -1 before writing the submission.
y_pred[y_pred == 0] = -1

# Write the submission for the test ids under the name "submission_30_10".
# NOTE(review): create_csv_submission is not imported by name anywhere in this
# notebook; presumably it is supplied by one of the star imports — confirm.
create_csv_submission(ids_te, y_pred, "submission_30_10")