In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# change path if necessary
import os
import sys
my_path = r'D:\Documents\etudes\epfl\MA1\cours\MachineLearning\Project1' # to be adapted
# Put the project's shared code directory first on the module search path.
# os.path.join is used instead of a hard-coded '\' separator: on Windows it
# yields exactly the same string as before, and it keeps working when my_path
# is adapted to a machine that uses '/' as separator.
sys.path.insert(0, os.path.join(my_path, 'code', 'COMMON'))

# imports
import numpy as np 
import matplotlib.pyplot as plt

from implementations import *
from labels import idx_2labels
from costs import *
from optimize_hyperparams import *
from cross_validation import *
from step_wise import *

# Build training dataset

In [2]:
from proj1_helpers import load_csv_data 

# load raw data
# The csv location is assembled with os.path.join rather than a hard-coded '\'
# separator; on Windows the resulting string is unchanged, elsewhere it stays valid.
y_raw, input_data_raw, ids = load_csv_data(
    os.path.join(my_path, 'data', 'train.csv'),
    sub_sample=False,
)

In [3]:
from outliers import handle_outliers

# treat -999 as the outlier marker and repair it with the 'mean' strategy
# (the cell output reports that such entries are replaced by the feature mean)
X_raw, y = handle_outliers(input_data_raw, y_raw, -999, 'mean')

# recode the labels from {-1, 1} to {0, 1} with a boolean mask
# NOTE(review): y is modified in place; if handle_outliers hands back y_raw
# itself rather than a copy, y_raw is recoded too — confirm before reusing it.
y[y == -1] = 0

-999 are replaced by the mean value of the feature


In [4]:
# read only the first row (the header) of the training csv as strings ...
header_row = np.genfromtxt(
    my_path + r'/data/train.csv',
    delimiter=",",
    dtype=str,
    max_rows=1,
)
# ... and keep everything after its first two entries as the feature names
# (presumably those two are the id and label columns — TODO confirm)
all_features_raw = list(header_row[2:])

In [5]:
from extend_features import extend_features

# power passed to extend_features (the cell output reports "power(s): [1]")
degree = 1

# build the candidate feature matrix; with is_add_log=True the cell output
# also reports added logarithmic features
all_candidates, features = extend_features(
    X_raw,
    all_features_raw,
    degree,
    is_add_log=True,
)
print(all_candidates.shape)

---------------------------
Features have been set to the power(s): [1]
16 Features of the momentum have been added
4 logarithmic features have been added.
(250000, 50)


# Training

In [6]:
# column indices into all_candidates; only the first 17 entries of the list are used
# (presumably a ranking from the step-wise selection — TODO confirm)
indx = [1, 13, 4, 46, 0, 11, 44, 43, 7, 2, 16, 48, 10, 6, 49, 22, 45, 12, 19, 23, 32, 24, 17, 14, 39, 42, 30, 31, 47, 38, 20][:17]

# training inputs restricted to the selected columns
X = all_candidates[:, indx]

# polynomial expansion of the selected columns
degree_opt = 5
phi = build_poly(X, degree_opt)

# standardise every column except the first one and write the result back in place
# NOTE(review): the two extra values returned by standardize are discarded here;
# if they are the training mean/std they would be needed to scale the test set
# consistently — confirm.
phi_tmp, _, _ = standardize(phi[:, 1:])
phi[:, 1:] = phi_tmp

In [9]:
# settings forwarded to logistic_regression
method = 'gd'
gamma = 1e-6
max_iters = 10000
threshold = 500

# start from an all-zero weight vector with one entry per column of phi
initial_w = np.zeros(phi.shape[1])

# fit the model; w_tot and loss_tot are indexed with [-1] later on, so they
# are sequences (presumably one entry per iteration — TODO confirm)
w_tot, loss_tot = logistic_regression(
    y, phi, initial_w, max_iters, gamma, method, threshold,
    debug_mode=0,
)

In [28]:
from tempfile import TemporaryFile

# save w_opt
# Store the last entry of w_tot in the current working directory. np.save
# appends the '.npy' extension itself, which is the name the loading cell uses.
# (The temporary file that used to be opened here was never written to or
# closed, so it has been dropped.)
np.save('w_opt_lr_17_features_degree_5', w_tot[-1])

# Build testing dataset

In [11]:
from proj1_helpers import load_csv_data 

# load the raw test set
# NOTE(review): absolute, machine-specific location — adapt it like my_path.
path_data_test = r'C:\Users\Tom\Desktop'
# os.path.join replaces the hard-coded '\' separator; the string is unchanged
# on Windows and stays valid on other systems.
y_raw_te, input_data_raw_te, ids_te = load_csv_data(
    os.path.join(path_data_test, 'test.csv'),
    sub_sample=False,
)

In [12]:
# feature names: header row of the training csv without its first two entries
# (this repeats the computation already done for the training section)
header_row = np.genfromtxt(
    my_path + r'/data/train.csv',
    delimiter=",",
    dtype=str,
    max_rows=1,
)
all_features_raw = list(header_row[2:])

In [13]:
# repair the -999 entries of the test inputs with the 'mean' strategy;
# the second return value is not needed here and is dropped
# NOTE(review): the raw test matrix is overwritten by its cleaned version.
input_data_raw_te, _ = handle_outliers(
    input_data_raw_te,
    y_raw_te,
    -999,
    'mean',
)

-999 are replaced by the mean value of the feature


In [14]:
from extend_features import extend_features

# same power as used for the training features
degree = 1

# build the candidate feature matrix of the test set; the second return
# value is discarded
X_te, _ = extend_features(
    input_data_raw_te,
    all_features_raw,
    degree,
    is_add_log=True,
)

---------------------------
Features have been set to the power(s): [1]
16 Features of the momentum have been added
4 logarithmic features have been added.


In [15]:
# same column indices as in the training section; only the first 17 are used
indx = [1, 13, 4, 46, 0, 11, 44, 43, 7, 2, 16, 48, 10, 6, 49, 22, 45, 12, 19, 23, 32, 24, 17, 14, 39, 42, 30, 31, 47, 38, 20][:17]

# testing set: show the shape before and after restricting to the selected columns
# NOTE: X_te is overwritten, so this cell can only be run once per kernel
# session — a second run would index the 17-column result with indices up to 49.
print(X_te.shape)
X_te = X_te[:, indx]
print(X_te.shape)

(568238, 50)
(568238, 17)


In [16]:
# polynomial expansion of the selected test columns, same degree as in training
# NOTE: this rebinds `phi`, which held the training matrix until now.
degree_opt = 5
phi = build_poly(X_te, degree_opt)

# standardise every column except the first one and write the result back in place
# NOTE(review): standardize is called on the test matrix alone, so presumably
# the test set is scaled with its own statistics rather than the training
# ones — confirm that this is intended.
phi_tmp, _, _ = standardize(phi[:, 1:])
phi[:, 1:] = phi_tmp

In [29]:
# read back the weight vector written by the training section
w_opt = np.load('w_opt_lr_17_features_degree_5.npy')
# sanity check: display how many weights were loaded
print(np.shape(w_opt))

(86,)
85


# Predict labels in testing dataset

In [30]:
from proj1_helpers import predict_labels_log

# apply the loaded weights to the test matrix `phi` built above
y_pred = predict_labels_log(w_opt, phi)

# Create a submission csv file

In [31]:
# map label 0 back to -1 (in place, via a boolean mask)
y_pred[y_pred == 0] = -1

# write the submission file from the test ids and the predicted labels
# NOTE(review): create_csv_submission is not imported explicitly in the visible
# cells; presumably it arrives through one of the star imports — confirm.
create_csv_submission(ids_te, y_pred, "sub29_10_3")