### Experiment pipeline

In [41]:
import sys
sys.path.append('../')
import numpy as np
# add ../ to path
from helpers import (
    load_csv_data
)  
from feature_processing import (
    drop_columns,
    drop_correlated_columns,
    drop_single_value_columns,
    median_imputation,
    mean_imputation,
    standardize,
    build_poly,
    build_k_indices
)
from cross_validation import (
    predict_mse,
    predict_logistic,
    accuracy,
    f1_score,
    print_results,
    cross_validation
)
from implementations import (
    mean_squared_error_gd,
    mean_squared_error_sgd,
    least_squares,
    ridge_regression,
    logistic_regression,
    reg_logistic_regression
)
from implementations_utils import (
    compute_loss_mse, 
    compute_loss_logistic)

In [2]:
print("Loading data...")
x_train, x_test, y_train, train_ids, test_ids = load_csv_data("../../data/")
print("Data loaded...")

Loading data...
Data loaded...


In [3]:
print("x_train.shape =", x_train.shape)
print("x_test.shape =", x_test.shape)
print("y_train.shape =", y_train.shape)
print("train_ids.shape =", train_ids.shape)
print("test_ids.shape =", test_ids.shape)

x_train.shape = (328135, 321)
x_test.shape = (109379, 321)
y_train.shape = (328135,)
train_ids.shape = (328135,)
test_ids.shape = (109379,)


In [4]:
# replace -1 with 0 in y_train
y_train[np.where(y_train == -1)] = 0

### Feature engineering

In [5]:
# threshold for drop_columns function
DROP_NAN_THRESHOLD = 0.2
# threshold for drop_correlated_columns function
DROP_CORR_THRESHOLD = 0.85
# flag for drop_single_value_columns function
DROP_SINGLE = True
# threshold for categorical/numerical (categorical are imputated with median, numerical with mean)
CAT_NUM_THRESHOLD = 30
# flag for build_poly function
BUILD_POLY = False
# degree for build_poly function
DEGREE = 2
# flag for standardize function
STANDARDIZE = True

In [6]:
# copy data
x_train_temp = x_train.copy()
x_test_temp = x_test.copy()

In [7]:
# drop_columns on x_train
x_train_temp, cols_to_keep_1 = drop_columns(x_train, DROP_NAN_THRESHOLD)
print(f"Dropping columns with DROP_NAN_THRESHOLD = {DROP_NAN_THRESHOLD}...")
# drop_correlated_columns on x_train
x_train_temp, cols_to_keep_2 = drop_correlated_columns(x_train_temp, DROP_CORR_THRESHOLD)
print(f"Dropping columns with DROP_CORR_THRESHOLD = {DROP_CORR_THRESHOLD}...")
# drop_single_value_columns on x_train
if DROP_SINGLE:
    x_train_temp, cols_to_keep_3 = drop_single_value_columns(x_train_temp)
    print("Dropping single valued columns...")
# apply the same column selection to x_test
x_test_temp = x_test[:, cols_to_keep_1]
x_test_temp = x_test_temp[:, cols_to_keep_2]
x_test_temp = x_test_temp[:, cols_to_keep_3]

In [8]:
# categorical and numerical features
categorical_features = []
numerical_features = []
# find categorical and numerical features
for i, feature in enumerate(x_train_temp.T):
    if (
        np.unique(feature).shape[0] < CAT_NUM_THRESHOLD
    ):
        categorical_features.append(i)
    else:
        numerical_features.append(i)
# fill in missing values on the train and test
x_train_temp[:, categorical_features] = median_imputation(
    x_train_temp[:, categorical_features]
)
x_test_temp[:, categorical_features] = median_imputation(
    x_test_temp[:, categorical_features]
)
x_train_temp[:, numerical_features] = mean_imputation(
    x_train_temp[:, numerical_features]
)
x_test_temp[:, numerical_features] = mean_imputation(x_test_temp[:, numerical_features])

In [9]:
# build polynomial features
if BUILD_POLY:
    print(f"Building polynomial with degree = {DEGREE}...")
    x_train_temp = build_poly(x_train_temp, DEGREE)
    x_test_temp = build_poly(x_test_temp, DEGREE)

In [10]:
# standardize
if STANDARDIZE:
    print("Standardizing...")
    x_train_temp = standardize(x_train_temp)
    x_test_temp = standardize(x_test_temp)

In [61]:
print("New x_train and x_test shapes")
print("x_train_temp.shape =", x_train_temp.shape)
print("x_test_temp.shape =", x_test_temp.shape)

x_train_temp.shape = (328135, 110)
x_test_temp.shape = (109379, 110)


### Models

In [11]:
# number of folds
NUM_FOLDS = 5
# build k_indices for cross validation
k_indices = build_k_indices(y_train, NUM_FOLDS, 42)

#### Mean squared error with GD

In [54]:
# Model parameters
MAX_ITERS = 100
GAMMA = 0.5
# Model
_mean_squared_error_gd = lambda y, tx, initial_w: mean_squared_error_gd(y, tx, initial_w, MAX_ITERS, GAMMA)

In [55]:
# Cross validation
eval_results_train, eval_results_test, losses_train, losses_test, w = cross_validation(y_train, 
                                                                                       x_train_temp, 
                                                                                       k_indices, 
                                                                                       model_fn=_mean_squared_error_gd, 
                                                                                       loss_fn=compute_loss_mse, 
                                                                                       pred_fn=predict_mse, 
                                                                                       eval_fns=dict(accuracy=accuracy, f1_score=f1_score))

Fold 1/5
Train loss: 9075749012592432689553414834817919664387578749276068991304647701161299673088.00000
Test loss: 9014456055067429320046017433763391753940360974223209709111805492850405146624.00000
Train accuracy: 0.46100
Train f1_score: 0.05258
Test accuracy: 0.46223
Test f1_score: 0.05160
------------------------------
Fold 2/5
Train loss: 10289978312293596914735020162589354104594649009051997685784312174501781569536.00000
Test loss: 10196345109396401368639023891561027377687163891438966408956748763857476911104.00000
Train accuracy: 0.46065
Train f1_score: 0.05224
Test accuracy: 0.46482
Test f1_score: 0.05372
------------------------------
Fold 3/5
Train loss: 1333029380650509738974151269237701118430882501818633651775734862992161898496.00000
Test loss: 1370668706698061270079232218156243344328197806987529686429901626070219620352.00000
Train accuracy: 0.45867
Train f1_score: 0.05176
Test accuracy: 0.45655
Test f1_score: 0.05139
------------------------------
Fold 4/5
Train loss: 76509052

In [57]:
print("Train:")
print_results(eval_results_train)
print("-"*20)
print("Test:")
print_results(eval_results_test)

Train:
accuracy : 0.460053 ± 0.000961
f1_score : 0.052117 ± 0.000287
--------------------
Test:
accuracy : 0.459863 ± 0.003185
f1_score : 0.051953 ± 0.000994


### Mean squared error with SGD

In [36]:
# Model parameters
MAX_ITERS = 100
GAMMA = 0.5
# Model
_mean_squared_error_sgd = lambda y, tx, initial_w: mean_squared_error_sgd(y, tx, initial_w, MAX_ITERS, GAMMA)

In [37]:
# Cross validation
eval_results_train, eval_results_test, losses_train, losses_test, w = cross_validation(y_train, 
                                                                                       x_train_temp, 
                                                                                       k_indices, 
                                                                                       model_fn=_mean_squared_error_sgd, 
                                                                                       loss_fn=compute_loss_mse, 
                                                                                       pred_fn=predict_mse, 
                                                                                       eval_fns=dict(accuracy=accuracy, f1_score=f1_score))

Fold 1/5
Train loss: 800524659423373871670270209025170209493176981214258735540328163903775848931590264685364243767356275323957627846656.00000
Test loss: 803580112432148322408097315627089307741227767058033588985410759340861889009562224202479958732521661171709461397504.00000
Train accuracy: 0.55595
Train f1_score: 0.21409
Test accuracy: 0.55701
Test f1_score: 0.21325
------------------------------
Fold 2/5
Train loss: 318846563870809459654359175039577103927818825211577372625530677825451102661762875392.00000
Test loss: 311940575846786832564010045358227196234971512037904863674548888790049698922048258048.00000
Train accuracy: 0.58829
Train f1_score: 0.22188
Test accuracy: 0.58583
Test f1_score: 0.21882
------------------------------
Fold 3/5
Train loss: 1460283621492348270917962639255513263335059130007938803117868446062249676065538006627385344.00000
Test loss: 1456537800402020210666206741206510879227013683233800811171649614545914445049982762419748864.00000
Train accuracy: 0.48210
Train f1_s

In [17]:
print("Train:")
print_results(eval_results_train)
print("-"*20)
print("Test:")
print_results(eval_results_test)

Train:
accuracy : 0.467852 ± 0.059187
f1_score : 0.140160 ± 0.060838
--------------------
Test:
accuracy : 0.469188 ± 0.057951
f1_score : 0.141326 ± 0.062081


### Least squares

In [38]:
_least_squares = lambda y, tx, initial_w: least_squares(y, tx)

In [39]:
# Cross validation
eval_results_train, eval_results_test, losses_train, losses_test, w = cross_validation(y_train, 
                                                                                       x_train_temp, 
                                                                                       k_indices, 
                                                                                       model_fn=_least_squares, 
                                                                                       loss_fn=compute_loss_mse, 
                                                                                       pred_fn=predict_mse, 
                                                                                       eval_fns=dict(accuracy=accuracy, f1_score=f1_score))

Fold 1/5
Train loss: 0.03854
Test loss: 0.03819
Train accuracy: 0.91139
Train f1_score: 0.00009
Test accuracy: 0.91292
Test f1_score: 0.00000
------------------------------
Fold 2/5
Train loss: 0.03861
Test loss: 0.03792
Train accuracy: 0.91131
Train f1_score: 0.00000
Test accuracy: 0.91324
Test f1_score: 0.00000
------------------------------
Fold 3/5
Train loss: 0.03843
Test loss: 0.03865
Train accuracy: 0.91183
Train f1_score: 0.00009
Test accuracy: 0.91118
Test f1_score: 0.00000
------------------------------
Fold 4/5
Train loss: 0.03842
Test loss: 0.03868
Train accuracy: 0.91185
Train f1_score: 0.00009
Test accuracy: 0.91104
Test f1_score: 0.00000
------------------------------
Fold 5/5
Train loss: 0.03832
Test loss: 0.03909
Train accuracy: 0.91210
Train f1_score: 0.00009
Test accuracy: 0.91008
Test f1_score: 0.00000
------------------------------


In [40]:
print("Train:")
print_results(eval_results_train)
print("-"*20)
print("Test:")
print_results(eval_results_test)

Train:
accuracy : 0.911694 ± 0.000299
f1_score : 0.000069 ± 0.000035
--------------------
Test:
accuracy : 0.911692 ± 0.001197
f1_score : 0.000000 ± 0.000000


### Ridge regression

In [21]:
LAMBDA_ = 0.1
_ridge_regression = lambda y, tx, initial_w: ridge_regression(y, tx, LAMBDA_)

In [22]:
# Cross validation
eval_results_train, eval_results_test, losses_train, losses_test, w = cross_validation(y_train, 
                                                                                       x_train_temp, 
                                                                                       k_indices, 
                                                                                       model_fn=_ridge_regression, 
                                                                                       loss_fn=compute_loss_mse, 
                                                                                       pred_fn=predict_mse, 
                                                                                       eval_fns=dict(accuracy=accuracy, f1_score=f1_score))

Fold 1/5
Train loss: 0.03860
Test loss: 0.03820
Train accuracy: 0.91139
Train f1_score: 0.00000
Test accuracy: 0.91292
Test f1_score: 0.00000
------------------------------
Fold 2/5
Train loss: 0.03867
Test loss: 0.03794
Train accuracy: 0.91131
Train f1_score: 0.00000
Test accuracy: 0.91324
Test f1_score: 0.00000
------------------------------
Fold 3/5
Train loss: 0.03848
Test loss: 0.03871
Train accuracy: 0.91182
Train f1_score: 0.00000
Test accuracy: 0.91118
Test f1_score: 0.00000
------------------------------
Fold 4/5
Train loss: 0.03847
Test loss: 0.03875
Train accuracy: 0.91185
Train f1_score: 0.00000
Test accuracy: 0.91104
Test f1_score: 0.00000
------------------------------
Fold 5/5
Train loss: 0.03837
Test loss: 0.03915
Train accuracy: 0.91209
Train f1_score: 0.00000
Test accuracy: 0.91008
Test f1_score: 0.00000
------------------------------


In [23]:
print("Train:")
print_results(eval_results_train)
print("-"*20)
print("Test:")
print_results(eval_results_test)

Train:
accuracy : 0.911692 ± 0.000299
f1_score : 0.000000 ± 0.000000
--------------------
Test:
accuracy : 0.911692 ± 0.001197
f1_score : 0.000000 ± 0.000000


### Logistic regression

In [58]:
# Model parameters
MAX_ITERS = 100
GAMMA = 0.5
# Model
_logistic_regression = lambda y, tx, initial_w: logistic_regression(y, tx, initial_w, MAX_ITERS, GAMMA)

In [59]:
# Cross validation
eval_results_train, eval_results_test, losses_train, losses_test, w = cross_validation(y_train, 
                                                                                       x_train_temp, 
                                                                                       k_indices, 
                                                                                       model_fn=_logistic_regression, 
                                                                                       loss_fn=compute_loss_logistic, 
                                                                                       pred_fn=predict_logistic, 
                                                                                       eval_fns=dict(accuracy=accuracy, f1_score=f1_score))

Fold 1/5
Train loss: 0.66972
Test loss: 0.67079
Train accuracy: 0.62406
Train f1_score: 0.29867
Test accuracy: 0.62368
Test f1_score: 0.29435
------------------------------
Fold 2/5
Train loss: 0.66984
Test loss: 0.67033
Train accuracy: 0.62283
Train f1_score: 0.29838
Test accuracy: 0.62261
Test f1_score: 0.29288
------------------------------
Fold 3/5
Train loss: 0.66984
Test loss: 0.67038
Train accuracy: 0.62364
Train f1_score: 0.29705
Test accuracy: 0.62141
Test f1_score: 0.29889
------------------------------
Fold 4/5
Train loss: 0.67023
Test loss: 0.66876
Train accuracy: 0.62330
Train f1_score: 0.29704
Test accuracy: 0.62618
Test f1_score: 0.30028
------------------------------
Fold 5/5
Train loss: 0.66964
Test loss: 0.67113
Train accuracy: 0.62422
Train f1_score: 0.29680
Test accuracy: 0.62186
Test f1_score: 0.30052
------------------------------


In [60]:
print("Train:")
print_results(eval_results_train)
print("-"*20)
print("Test:")
print_results(eval_results_test)

Train:
accuracy : 0.623610 ± 0.000506
f1_score : 0.297589 ± 0.000773
--------------------
Test:
accuracy : 0.623146 ± 0.001698
f1_score : 0.297383 ± 0.003162


### Regularized logistic regression

In [27]:
# Model parameters
MAX_ITERS = 100
GAMMA = 0.1
LAMBDA_ = 0.1
# Model
_reg_logistic_regression = lambda y, tx, initial_w: reg_logistic_regression(y, tx, initial_w, MAX_ITERS, GAMMA, LAMBDA_)

In [28]:
# Cross validation
eval_results_train, eval_results_test, losses_train, losses_test, w = cross_validation(y_train, 
                                                                                       x_train_temp, 
                                                                                       k_indices, 
                                                                                       model_fn=_reg_logistic_regression, 
                                                                                       loss_fn=compute_loss_logistic, 
                                                                                       pred_fn=predict_logistic, 
                                                                                       eval_fns=dict(accuracy=accuracy, f1_score=f1_score))

Fold 1/5
Train loss: 0.67132
Test loss: 0.67180
Train accuracy: 0.62037
Train f1_score: 0.29688
Test accuracy: 0.62048
Test f1_score: 0.29239
------------------------------
Fold 2/5
Train loss: 0.67141
Test loss: 0.67160
Train accuracy: 0.61943
Train f1_score: 0.29674
Test accuracy: 0.61878
Test f1_score: 0.29123
------------------------------
Fold 3/5
Train loss: 0.67138
Test loss: 0.67177
Train accuracy: 0.62026
Train f1_score: 0.29515
Test accuracy: 0.61819
Test f1_score: 0.29700
------------------------------
Fold 4/5
Train loss: 0.67172
Test loss: 0.67066
Train accuracy: 0.61926
Train f1_score: 0.29507
Test accuracy: 0.62188
Test f1_score: 0.29867
------------------------------
Fold 5/5
Train loss: 0.67122
Test loss: 0.67215
Train accuracy: 0.62004
Train f1_score: 0.29502
Test accuracy: 0.61816
Test f1_score: 0.29840
------------------------------
