### Experiment pipeline

In [5]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import sys

sys.path.append("../")
# add ../ to path

In [7]:
import numpy as np
from helpers import load_csv_data
from feature_processing import (
    drop_columns,
    drop_correlated_columns,
    drop_single_value_columns,
    median_imputation,
    mean_imputation,
    standardize,
    build_poly,
    build_k_indices,
    build_log,
    build_ratios,
)
from cross_validation import (
    predict_mse,
    predict_logistic,
    accuracy,
    f1_score,
    print_results,
    cross_validation,
)
from implementations import (
    mean_squared_error_gd,
    mean_squared_error_sgd,
    least_squares,
    ridge_regression,
    logistic_regression,
    reg_logistic_regression,
)
from implementations_utils import compute_loss_mse, compute_loss_logistic


In [8]:
print("Loading data...")
x_train, x_test, y_train, train_ids, test_ids = load_csv_data("../../data/")
print("Data loaded...")

Loading data...
Data loaded...


In [9]:
print("x_train.shape =", x_train.shape)
print("x_test.shape =", x_test.shape)
print("y_train.shape =", y_train.shape)
print("train_ids.shape =", train_ids.shape)
print("test_ids.shape =", test_ids.shape)

x_train.shape = (328135, 321)
x_test.shape = (109379, 321)
y_train.shape = (328135,)
train_ids.shape = (328135,)
test_ids.shape = (109379,)


In [10]:
# replace -1 with 0 in y_train
y_train[np.where(y_train == -1)] = 0

### Feature engineering

In [11]:
### Column dropping and imputation ###
# threshold for drop_columns function
DROP_NAN_THRESHOLD = 1  # 1 = keep everything, 0 = drop everything that contains nan
# threshold for categorical/numerical (categorical are imputated with median, numerical with mean)
CAT_NUM_THRESHOLD = 30
# flag for drop_single_value_columns function
# Should always be True, otherwise it is messing up with the correlation coefficient.
DROP_SINGLE = True
# threshold for drop_correlated_columns function
DROP_CORR_THRESHOLD = 0.9

### Feature processing ###
# flag for build_poly function
BUILD_POLY = False
# degree for build_poly function
DEGREE = 2
# flag for build_log function
BUILD_LOG = False
# flag for build_x
BUILD_RATIOS = False
# flag for standardize function
STANDARDIZE = True

In [12]:
# copy data
x_train_temp = x_train.copy()
x_test_temp = x_test.copy()

In [13]:
# drop_columns on x_train and x_test
x_train_temp, cols_to_keep_1 = drop_columns(x_train, DROP_NAN_THRESHOLD)
x_test_temp = x_test[:, cols_to_keep_1]
print(f"Dropping columns with DROP_NAN_THRESHOLD = {DROP_NAN_THRESHOLD}...")

Dropping columns with DROP_NAN_THRESHOLD = 1...


In [14]:
# categorical and numerical features
categorical_features = []
numerical_features = []
# find categorical and numerical features
for i, feature in enumerate(x_train_temp.T):
    if np.unique(feature).shape[0] < CAT_NUM_THRESHOLD:
        categorical_features.append(i)
    else:
        numerical_features.append(i)
# fill in missing values on the train and test
x_train_temp[:, categorical_features] = median_imputation(
    x_train_temp[:, categorical_features]
)
x_test_temp[:, categorical_features] = median_imputation(
    x_test_temp[:, categorical_features]
)
x_train_temp[:, numerical_features] = mean_imputation(
    x_train_temp[:, numerical_features]
)
x_test_temp[:, numerical_features] = mean_imputation(
    x_test_temp[:, numerical_features])


In [15]:
print("x_train_temp.shape =", x_train_temp.shape)
print("x_test_temp.shape =", x_test_temp.shape)

x_train_temp.shape = (328135, 321)
x_test_temp.shape = (109379, 321)


In [16]:
# drop_single_value_columns on x_train and x_test
if DROP_SINGLE:
    x_train_temp, cols_to_keep_3 = drop_single_value_columns(x_train_temp)
    x_test_temp = x_test_temp[:, cols_to_keep_3]
    print("Dropping single valued columns...")
# drop_correlated_columns on x_train and x_test
x_train_temp, cols_to_keep_2 = drop_correlated_columns(
    x_train_temp, DROP_CORR_THRESHOLD
)
x_test_temp = x_test_temp[:, cols_to_keep_2]
print(f"Dropping columns with DROP_CORR_THRESHOLD = {DROP_CORR_THRESHOLD}...")


Dropping single valued columns...
Dropping columns with DROP_CORR_THRESHOLD = 0.9...


In [17]:
print("x_train_temp.shape =", x_train_temp.shape)
print("x_test_temp.shape =", x_test_temp.shape)

x_train_temp.shape = (328135, 279)
x_test_temp.shape = (109379, 279)


In [18]:
# CHECK IF THERE ARE TWO EQUAL COLUMNS
# matrix = x_train_temp
# # Get the number of columns in the matrix
# num_columns = matrix.shape[1]
# # Initialize a flag to check for equal columns
# has_equal_columns = False

# # Loop through each pair of columns
# for i in range(num_columns):
#     for j in range(i + 1, num_columns):
#         if np.array_equal(matrix[:, i], matrix[:, j]):
#             has_equal_columns = True
#             break

# if has_equal_columns:
#     print("The matrix has two equal columns.")
# else:
#     print("The matrix does not have two equal columns.")

In [21]:
if BUILD_RATIOS:
    print("Building ratios...")
    # build ratios between columns
    x_train_temp = build_ratios(x_train_temp)
    x_test_temp = build_ratios(x_test_temp)


Building ratios...


In [48]:
if BUILD_LOG:
    print("Building log...")
    # build_log on x_train and x_test
    x_train_temp = build_log(x_train_temp)
    x_test_temp = build_log(x_test_temp)

Building log...


In [49]:
x_train_temp.shape

(328135, 558)

In [20]:
# build polynomial features
if BUILD_POLY:
    print(f"Building polynomial with degree = {DEGREE}...")
    x_train_temp = build_poly(x_train_temp, DEGREE)
    x_test_temp = build_poly(x_test_temp, DEGREE)

In [21]:
# standardize
if STANDARDIZE:
    print("Standardizing...")
    x_train_temp = standardize(x_train_temp)
    x_test_temp = standardize(x_test_temp)

Standardizing...


In [22]:
print("New x_train and x_test shapes")
print("x_train_temp.shape =", x_train_temp.shape)
print("x_test_temp.shape =", x_test_temp.shape)

New x_train and x_test shapes
x_train_temp.shape = (328135, 279)
x_test_temp.shape = (109379, 279)


### Models

In [23]:
# number of folds
NUM_FOLDS = 5
# build k_indices for cross validation
k_indices = build_k_indices(y_train, NUM_FOLDS, 42)

#### Mean squared error with GD

In [24]:
# Model parameters
MAX_ITERS = 100
GAMMA = 0.5
# Model


def _mean_squared_error_gd(y, tx, initial_w):
    return mean_squared_error_gd(y, tx, initial_w, MAX_ITERS, GAMMA)


In [25]:
# Cross validation
eval_results_train, eval_results_test, losses_train, losses_test, w = cross_validation(
    y_train,
    x_train_temp,
    k_indices,
    model_fn=_mean_squared_error_gd,
    loss_fn=compute_loss_mse,
    pred_fn=predict_mse,
    eval_fns=dict(accuracy=accuracy, f1_score=f1_score),
)


Fold 1/5
Train loss: 651794955608505568499454194535392247626259397682191373903288171631601609110389044154762601462899933184.00000
Test loss: 642335906831634450402607695751100210247787565533766105669829961746277498816226952775138766881044824064.00000
Train accuracy: 0.47004
Train f1_score: 0.05968
Test accuracy: 0.47241
Test f1_score: 0.05790
------------------------------
Fold 2/5
Train loss: 461123120233055180863264590874661119753652831487652529512794908038836671479508046539474013383724892160.00000
Test loss: 457505792770218050447818156297955373788713121068156704274467827418264776136015244639550229356814008320.00000
Train accuracy: 0.46956
Train f1_score: 0.05887
Test accuracy: 0.47421
Test f1_score: 0.06142
------------------------------
Fold 3/5
Train loss: 69281503798390583237474791008462924453966334216173687989312178010027281870787241502970590722053373952.00000
Test loss: 71336845659704819147472183344618916371681566921483207376711651018978672624103529805506559731455492096.00000
Tr

In [26]:
print("Train:")
print_results(eval_results_train)
print("-" * 20)
print("Test:")
print_results(eval_results_test)


Train:
accuracy : 0.468921 ± 0.001159
f1_score : 0.059065 ± 0.000408
--------------------
Test:
accuracy : 0.469115 ± 0.003562
f1_score : 0.059113 ± 0.001238


### Mean squared error with SGD

In [27]:
# Model parameters
MAX_ITERS = 100
GAMMA = 0.5
# Model


def _mean_squared_error_sgd(y, tx, initial_w):
    return mean_squared_error_sgd(y, tx, initial_w, MAX_ITERS, GAMMA)


In [28]:
# Cross validation
eval_results_train, eval_results_test, losses_train, losses_test, w = cross_validation(
    y_train,
    x_train_temp,
    k_indices,
    model_fn=_mean_squared_error_sgd,
    loss_fn=compute_loss_mse,
    pred_fn=predict_mse,
    eval_fns=dict(accuracy=accuracy, f1_score=f1_score),
)


Fold 1/5
Train loss: 298392628334554811106612086784714317913029029821275735968480291543142540352509479079571681052075875190603339962010797880670350393743507456.00000
Test loss: 302107941856087520772057709559050688911895288790913960911823884850021798577119849577038845324416924124781840805494727183060674014764597248.00000
Train accuracy: 0.57662
Train f1_score: 0.18236
Test accuracy: 0.57402
Test f1_score: 0.17699
------------------------------
Fold 2/5
Train loss: 1600415535684864923278207750390391608289506490693086688985438181682513381218446972356679132300443648.00000
Test loss: 1594087747759648393512609787932585748481883162677021456658391892124971084164241640787964762920058880.00000
Train accuracy: 0.39296
Train f1_score: 0.05541
Test accuracy: 0.39659
Test f1_score: 0.05498
------------------------------
Fold 3/5
Train loss: 2153389575252531809476713058590078961381510636305428116959410349093698619769732560421102017908278003483445647588809141443618141829805053978290946048.00000
Test l

In [29]:
print("Train:")
print_results(eval_results_train)
print("-" * 20)
print("Test:")
print_results(eval_results_test)


Train:
accuracy : 0.508958 ± 0.063296
f1_score : 0.140837 ± 0.047567
--------------------
Test:
accuracy : 0.508781 ± 0.061187
f1_score : 0.139217 ± 0.047377


### Least squares

In [30]:
def _least_squares(y, tx, initial_w):
    return least_squares(y, tx)

In [31]:
# Cross validation
eval_results_train, eval_results_test, losses_train, losses_test, w = cross_validation(
    y_train,
    x_train_temp,
    k_indices,
    model_fn=_least_squares,
    loss_fn=compute_loss_mse,
    pred_fn=predict_mse,
    eval_fns=dict(accuracy=accuracy, f1_score=f1_score),
)


Fold 1/5
Train loss: 0.03825
Test loss: 0.03796
Train accuracy: 0.91156
Train f1_score: 0.00429
Test accuracy: 0.91301
Test f1_score: 0.00279
------------------------------
Fold 2/5
Train loss: 0.03831
Test loss: 0.03769
Train accuracy: 0.91146
Train f1_score: 0.00386
Test accuracy: 0.91334
Test f1_score: 0.00316
------------------------------
Fold 3/5
Train loss: 0.03814
Test loss: 0.03837
Train accuracy: 0.91192
Train f1_score: 0.00259
Test accuracy: 0.91129
Test f1_score: 0.00240
------------------------------
Fold 4/5
Train loss: 0.03812
Test loss: 0.03849
Train accuracy: 0.91202
Train f1_score: 0.00423
Test accuracy: 0.91121
Test f1_score: 0.00376
------------------------------
Fold 5/5
Train loss: 0.03802
Test loss: 0.03888
Train accuracy: 0.91225
Train f1_score: 0.00389
Test accuracy: 0.91022
Test f1_score: 0.00406
------------------------------


In [32]:
print("Train:")
print_results(eval_results_train)
print("-" * 20)
print("Test:")
print_results(eval_results_test)


Train:
accuracy : 0.911843 ± 0.000294
f1_score : 0.003770 ± 0.000616
--------------------
Test:
accuracy : 0.911814 ± 0.001179
f1_score : 0.003233 ± 0.000609


### Ridge regression

In [33]:
LAMBDA_ = 0.1


def _ridge_regression(y, tx, initial_w):
    return ridge_regression(y, tx, LAMBDA_)

In [34]:
# Cross validation
eval_results_train, eval_results_test, losses_train, losses_test, w = cross_validation(
    y_train,
    x_train_temp,
    k_indices,
    model_fn=_ridge_regression,
    loss_fn=compute_loss_mse,
    pred_fn=predict_mse,
    eval_fns=dict(accuracy=accuracy, f1_score=f1_score),
)


Fold 1/5
Train loss: 0.03834
Test loss: 0.03801
Train accuracy: 0.91144
Train f1_score: 0.00129
Test accuracy: 0.91293
Test f1_score: 0.00070
------------------------------
Fold 2/5
Train loss: 0.03841
Test loss: 0.03773
Train accuracy: 0.91135
Train f1_score: 0.00112
Test accuracy: 0.91328
Test f1_score: 0.00105
------------------------------
Fold 3/5
Train loss: 0.03824
Test loss: 0.03845
Train accuracy: 0.91186
Train f1_score: 0.00095
Test accuracy: 0.91121
Test f1_score: 0.00069
------------------------------
Fold 4/5
Train loss: 0.03821
Test loss: 0.03857
Train accuracy: 0.91189
Train f1_score: 0.00086
Test accuracy: 0.91112
Test f1_score: 0.00171
------------------------------
Fold 5/5
Train loss: 0.03811
Test loss: 0.03895
Train accuracy: 0.91216
Train f1_score: 0.00156
Test accuracy: 0.91017
Test f1_score: 0.00203
------------------------------


In [35]:
print("Train:")
print_results(eval_results_train)
print("-" * 20)
print("Test:")
print_results(eval_results_test)


Train:
accuracy : 0.911740 ± 0.000303
f1_score : 0.001155 ± 0.000249
--------------------
Test:
accuracy : 0.911744 ± 0.001176
f1_score : 0.001236 ± 0.000545


### Logistic regression

In [36]:
# Model parameters
MAX_ITERS = 100
GAMMA = 0.5
# Model


def _logistic_regression(y, tx, initial_w):
    return logistic_regression(y, tx, initial_w, MAX_ITERS, GAMMA)


In [37]:
# Cross validation
eval_results_train, eval_results_test, losses_train, losses_test, w = cross_validation(
    y_train,
    x_train_temp,
    k_indices,
    model_fn=_logistic_regression,
    loss_fn=compute_loss_logistic,
    pred_fn=predict_logistic,
    eval_fns=dict(accuracy=accuracy, f1_score=f1_score),
)


Fold 1/5
Train loss: 0.66826
Test loss: 0.67004
Train accuracy: 0.63552
Train f1_score: 0.30401
Test accuracy: 0.63413
Test f1_score: 0.29815
------------------------------
Fold 2/5
Train loss: 0.66845
Test loss: 0.66926
Train accuracy: 0.63366
Train f1_score: 0.30324
Test accuracy: 0.63367
Test f1_score: 0.29818
------------------------------
Fold 3/5
Train loss: 0.66837
Test loss: 0.66965
Train accuracy: 0.63484
Train f1_score: 0.30214
Test accuracy: 0.63207
Test f1_score: 0.30347
------------------------------
Fold 4/5
Train loss: 0.66866
Test loss: 0.66845
Train accuracy: 0.63530
Train f1_score: 0.30267
Test accuracy: 0.63555
Test f1_score: 0.30443
------------------------------
Fold 5/5
Train loss: 0.66818
Test loss: 0.67042
Train accuracy: 0.63515
Train f1_score: 0.30178
Test accuracy: 0.63172
Test f1_score: 0.30455
------------------------------


In [38]:
print("Train:")
print_results(eval_results_train)
print("-" * 20)
print("Test:")
print_results(eval_results_test)


Train:
accuracy : 0.634893 ± 0.000653
f1_score : 0.302769 ± 0.000793
--------------------
Test:
accuracy : 0.633428 ± 0.001399
f1_score : 0.301754 ± 0.002956


### Regularized logistic regression

In [39]:
# Model parameters
MAX_ITERS = 100
GAMMA = 0.5
LAMBDA_ = 0.1
# Model


def _reg_logistic_regression(y, tx, initial_w):
    return reg_logistic_regression(y, tx, initial_w, MAX_ITERS, GAMMA, LAMBDA_)


In [40]:
# Cross validation
eval_results_train, eval_results_test, losses_train, losses_test, w = cross_validation(
    y_train,
    x_train_temp,
    k_indices,
    model_fn=_reg_logistic_regression,
    loss_fn=compute_loss_logistic,
    pred_fn=predict_logistic,
    eval_fns=dict(accuracy=accuracy, f1_score=f1_score),
)


Fold 1/5
Train loss: 0.67025
Test loss: 0.67115
Train accuracy: 0.62914
Train f1_score: 0.30135
Test accuracy: 0.62901
Test f1_score: 0.29692
------------------------------
Fold 2/5
Train loss: 0.67041
Test loss: 0.67072
Train accuracy: 0.62756
Train f1_score: 0.30086
Test accuracy: 0.62778
Test f1_score: 0.29586
------------------------------
Fold 3/5
Train loss: 0.67032
Test loss: 0.67110
Train accuracy: 0.62904
Train f1_score: 0.29990
Test accuracy: 0.62593
Test f1_score: 0.30042
------------------------------
Fold 4/5
Train loss: 0.67062
Test loss: 0.67005
Train accuracy: 0.62811
Train f1_score: 0.29965
Test accuracy: 0.63020
Test f1_score: 0.30231
------------------------------
Fold 5/5
Train loss: 0.67016
Test loss: 0.67147
Train accuracy: 0.62884
Train f1_score: 0.29942
Test accuracy: 0.62639
Test f1_score: 0.30294
------------------------------


In [41]:
print("Train:")
print_results(eval_results_train)
print("-" * 20)
print("Test:")
print_results(eval_results_test)


Train:
accuracy : 0.628539 ± 0.000608
f1_score : 0.300235 ± 0.000741
--------------------
Test:
accuracy : 0.627860 ± 0.001592
f1_score : 0.299691 ± 0.002840
