### Experiment pipeline

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# add ../ to path so that the project modules imported below can be found
sys.path.append('../')
import numpy as np
from helpers import (
    load_csv_data
)  
from feature_processing import (
    drop_columns,
    drop_correlated_columns,
    drop_single_value_columns,
    median_imputation,
    mean_imputation,
    standardize,
    build_poly,
    build_k_indices
)
from cross_validation import (
    predict_mse,
    predict_logistic,
    accuracy,
    f1_score,
    print_results,
    cross_validation
)
from implementations import (
    mean_squared_error_gd,
    mean_squared_error_sgd,
    least_squares,
    ridge_regression,
    logistic_regression,
    reg_logistic_regression
)
from implementations_utils import (
    compute_loss_mse, 
    compute_loss_logistic)

In [3]:
# Read the train/test feature matrices, the training labels and both id vectors
# from the data directory (path is relative to this notebook).
print("Loading data...")
(
    x_train,
    x_test,
    y_train,
    train_ids,
    test_ids,
) = load_csv_data("../../data/")
print("Data loaded...")

Loading data...
Data loaded...


In [4]:
# Report the shape of every array returned by load_csv_data, one per line.
_raw_shapes = [
    ("x_train.shape =", x_train.shape),
    ("x_test.shape =", x_test.shape),
    ("y_train.shape =", y_train.shape),
    ("train_ids.shape =", train_ids.shape),
    ("test_ids.shape =", test_ids.shape),
]
for _label, _shape in _raw_shapes:
    print(_label, _shape)

x_train.shape = (328135, 321)
x_test.shape = (109379, 321)
y_train.shape = (328135,)
train_ids.shape = (328135,)
test_ids.shape = (109379,)


In [5]:
# Map the negative label -1 to 0 in place; all other labels are left untouched.
_negative_mask = y_train == -1
y_train[_negative_mask] = 0

### Feature engineering

In [6]:
### Column dropping and imputation ###
# threshold for the drop_columns function
DROP_NAN_THRESHOLD = 1 # 1 = keep everything, 0 = drop every column that contains NaN
# columns with fewer distinct values than this are treated as categorical and imputed with the
# median; all other columns are treated as numerical and imputed with the mean
CAT_NUM_THRESHOLD = 30
# flag for the drop_single_value_columns function
DROP_SINGLE = True # Should always be True; otherwise single-valued columns interfere with the correlation coefficient.
# threshold for the drop_correlated_columns function
DROP_CORR_THRESHOLD = 0.9

### Feature processing ###
# flag for the build_poly function
BUILD_POLY = False
# degree for the build_poly function
DEGREE = 2
# flag for a build_log step
BUILD_LOG = False #TODO: not implemented yet; no cell visible in this notebook reads this flag
# flag for a build_x step
BUILD_X = False #TODO: not implemented yet; no cell visible in this notebook reads this flag
# flag for the standardize function
STANDARDIZE = True

In [7]:
# copy data so that the raw x_train / x_test arrays stay available unchanged
# NOTE(review): the next cell rebinds x_train_temp and x_test_temp from x_train / x_test
# directly, so these two copies are discarded without being used — confirm whether this
# cell is still needed.
x_train_temp = x_train.copy()
x_test_temp = x_test.copy()

In [8]:
# drop_columns on x_train and x_test
# The column selection is obtained from the raw training matrix and then applied unchanged
# to the raw test matrix, so both matrices keep the same columns.
x_train_temp, cols_to_keep_1 = drop_columns(x_train, DROP_NAN_THRESHOLD)
x_test_temp = x_test[:, cols_to_keep_1]
# note: this message is printed after the columns have already been dropped
print(f"Dropping columns with DROP_NAN_THRESHOLD = {DROP_NAN_THRESHOLD}...")

Dropping columns with DROP_NAN_THRESHOLD = 1...


In [9]:
# Split the columns into categorical and numerical features.
# A column is categorical when it has fewer than CAT_NUM_THRESHOLD distinct observed values.
categorical_features = []
numerical_features = []
for i, feature in enumerate(x_train_temp.T):
    # Count distinct *observed* values only. np.unique treats NaN as a value, and whether
    # repeated NaNs are collapsed into one depends on the NumPy version, so leaving the
    # missing entries in would make the split depend on the amount of missing data.
    n_distinct = np.unique(feature[~np.isnan(feature)]).shape[0]
    if n_distinct < CAT_NUM_THRESHOLD:
        categorical_features.append(i)
    else:
        numerical_features.append(i)

# Fill in missing values on the train and test sets: median for categorical columns,
# mean for numerical columns. Empty groups are skipped so the helpers never receive
# a matrix with zero columns.
# NOTE(review): the helpers are called on the test matrix on its own, so whatever statistics
# they compute for the test set come from the test rows rather than from the training rows.
# Confirm this is intended; reusing the training statistics would avoid train/test mismatch.
if categorical_features:
    x_train_temp[:, categorical_features] = median_imputation(
        x_train_temp[:, categorical_features]
    )
    x_test_temp[:, categorical_features] = median_imputation(
        x_test_temp[:, categorical_features]
    )
if numerical_features:
    x_train_temp[:, numerical_features] = mean_imputation(
        x_train_temp[:, numerical_features]
    )
    x_test_temp[:, numerical_features] = mean_imputation(
        x_test_temp[:, numerical_features]
    )

In [10]:
# Show the matrix shapes after NaN-column dropping and imputation.
for _label, _shape in (
    ("x_train_temp.shape =", x_train_temp.shape),
    ("x_test_temp.shape =", x_test_temp.shape),
):
    print(_label, _shape)

x_train_temp.shape = (328135, 321)
x_test_temp.shape = (109379, 321)


In [11]:
# drop_single_value_columns on x_train and x_test
# (the selection returned for the training matrix is applied unchanged to the test matrix)
if DROP_SINGLE:
    x_train_temp, cols_to_keep_3 = drop_single_value_columns(x_train_temp)
    x_test_temp = x_test_temp[:, cols_to_keep_3]
    print("Dropping single valued columns...")
# drop_correlated_columns on x_train and x_test
# This runs on the output of the single-value filter above; again the selection comes from
# the training matrix and is reused for the test matrix.
x_train_temp, cols_to_keep_2 = drop_correlated_columns(x_train_temp, DROP_CORR_THRESHOLD)
x_test_temp = x_test_temp[:, cols_to_keep_2]
print(f"Dropping columns with DROP_CORR_THRESHOLD = {DROP_CORR_THRESHOLD}...")

Dropping single valued columns...
Dropping columns with DROP_CORR_THRESHOLD = 0.9...


In [12]:
# Show the matrix shapes after removing single-valued and correlated columns.
for _label, _shape in (
    ("x_train_temp.shape =", x_train_temp.shape),
    ("x_test_temp.shape =", x_test_temp.shape),
):
    print(_label, _shape)

x_train_temp.shape = (328135, 279)
x_test_temp.shape = (109379, 279)


In [13]:
# CHECK IF THERE ARE TWO EQUAL COLUMNS
# (Disabled sanity check, kept commented out. It compares every pair of columns of
#  x_train_temp and reports whether any two are element-wise identical.)
# NOTE: if re-enabled, the `break` only leaves the inner loop, so the outer loop keeps
# scanning after a match is found; the reported result is still correct.
# matrix = x_train_temp
# # Get the number of columns in the matrix
# num_columns = matrix.shape[1]
# # Initialize a flag to check for equal columns
# has_equal_columns = False

# # Loop through each pair of columns
# for i in range(num_columns):
#     for j in range(i + 1, num_columns):
#         if np.array_equal(matrix[:, i], matrix[:, j]):
#             has_equal_columns = True
#             break

# if has_equal_columns:
#     print("The matrix has two equal columns.")
# else:
#     print("The matrix does not have two equal columns.")

In [14]:
# build polynomial features (only when BUILD_POLY is set)
# NOTE: this cell reassigns x_train_temp / x_test_temp from their own current values, so
# running it twice applies build_poly twice; re-run the earlier cells first when iterating.
if BUILD_POLY:
    print(f"Building polynomial with degree = {DEGREE}...")
    x_train_temp = build_poly(x_train_temp, DEGREE)
    x_test_temp = build_poly(x_test_temp, DEGREE)

In [15]:
# standardize (only when STANDARDIZE is set)
# NOTE(review): standardize is called separately on the train and on the test matrix, each
# time with that matrix as its only argument, so the test set is presumably scaled with its
# own statistics rather than with the training statistics — confirm this is intended.
if STANDARDIZE:
    print("Standardizing...")
    x_train_temp = standardize(x_train_temp)
    x_test_temp = standardize(x_test_temp)

Standardizing...


In [16]:
# Show the final feature-matrix shapes that are fed to the models.
print("New x_train and x_test shapes")
for _label, _shape in (
    ("x_train_temp.shape =", x_train_temp.shape),
    ("x_test_temp.shape =", x_test_temp.shape),
):
    print(_label, _shape)

New x_train and x_test shapes
x_train_temp.shape = (328135, 279)
x_test_temp.shape = (109379, 279)


### Models

In [17]:
# number of folds
NUM_FOLDS = 5
# build k_indices for cross validation; the same k_indices are reused by every model below
# (42 is the third positional argument of build_k_indices — presumably the random seed used
#  to shuffle the rows; confirm in feature_processing)
k_indices = build_k_indices(y_train, NUM_FOLDS, 42)

#### Mean squared error with GD

In [18]:
# Model parameters
MAX_ITERS = 100
# NOTE(review): the recorded output of the cross-validation cell below shows train/test
# losses of the order of 1e101 with this step size, i.e. gradient descent appears to
# diverge — a smaller GAMMA is probably needed; confirm by re-running.
GAMMA = 0.5


# Model
def _mean_squared_error_gd(y, tx, initial_w, max_iters=MAX_ITERS, gamma=GAMMA):
    """Adapter giving mean_squared_error_gd the (y, tx, initial_w) signature used as model_fn.

    The hyperparameters are bound as default arguments when this cell runs. MAX_ITERS and
    GAMMA are reassigned by later model cells, and a closure over the globals would silently
    pick up those later values if the cross-validation cell were re-run out of order.

    Args:
        y: target values.
        tx: feature matrix.
        initial_w: initial weight vector.
        max_iters: number of iterations (defaults to this cell's MAX_ITERS).
        gamma: step size (defaults to this cell's GAMMA).

    Returns:
        Whatever mean_squared_error_gd returns.
    """
    return mean_squared_error_gd(y, tx, initial_w, max_iters, gamma)

In [19]:
# Cross validation of the MSE gradient-descent model on the prepared folds.
(
    eval_results_train,
    eval_results_test,
    losses_train,
    losses_test,
    w,
) = cross_validation(
    y_train,
    x_train_temp,
    k_indices,
    model_fn=_mean_squared_error_gd,
    loss_fn=compute_loss_mse,
    pred_fn=predict_mse,
    eval_fns={"accuracy": accuracy, "f1_score": f1_score},
)

Fold 1/5
Train loss: 651794955608432959306938365685637561818029961839536707305271758864855301369762642868198771590072107008.00000
Test loss: 642335906831563084518182891367950912895178496743211690212190542348824792235542784387317468341655830528.00000
Train accuracy: 0.47004
Train f1_score: 0.05968
Test accuracy: 0.47241
Test f1_score: 0.05790
------------------------------
Fold 2/5
Train loss: 461123120233004018735318934073849384804874727293727195086281630892404983747885162756218849012716929024.00000
Test loss: 457505792770167509973918011730446333067745200400281495418143046956479888984363477305666330652525461504.00000
Train accuracy: 0.46956
Train f1_score: 0.05887
Test accuracy: 0.47421
Test f1_score: 0.06142
------------------------------
Fold 3/5
Train loss: 69281503798379129261686228109860783306563702748700124358589126595409981184819420923305020812748128256.00000
Test loss: 71336845659693007720607450911867726043288079926530821543130214010689461604652317267452512063786582016.00000
Tr

In [20]:
# Summarise the cross-validation metrics: train split first, then a separator, then test.
for _position, (_title, _results) in enumerate(
    [("Train:", eval_results_train), ("Test:", eval_results_test)]
):
    if _position:
        print("-" * 20)
    print(_title)
    print_results(_results)

Train:
accuracy : 0.468921 ± 0.001159
f1_score : 0.059065 ± 0.000408
--------------------
Test:
accuracy : 0.469115 ± 0.003562
f1_score : 0.059113 ± 0.001238


#### Mean squared error with SGD

In [21]:
# Model parameters
MAX_ITERS = 100
# NOTE(review): the recorded output of the cross-validation cell below shows losses of the
# order of 1e137 and above with this step size, i.e. SGD appears to diverge — a smaller
# GAMMA is probably needed; confirm by re-running.
GAMMA = 0.5


# Model
def _mean_squared_error_sgd(y, tx, initial_w, max_iters=MAX_ITERS, gamma=GAMMA):
    """Adapter giving mean_squared_error_sgd the (y, tx, initial_w) signature used as model_fn.

    The hyperparameters are bound as default arguments when this cell runs. MAX_ITERS and
    GAMMA are reassigned by other model cells, and a closure over the globals would silently
    pick up those values if the cross-validation cell were re-run out of order.

    Args:
        y: target values.
        tx: feature matrix.
        initial_w: initial weight vector.
        max_iters: number of iterations (defaults to this cell's MAX_ITERS).
        gamma: step size (defaults to this cell's GAMMA).

    Returns:
        Whatever mean_squared_error_sgd returns.
    """
    return mean_squared_error_sgd(y, tx, initial_w, max_iters, gamma)

In [22]:
# Cross validation of the MSE stochastic-gradient-descent model on the prepared folds.
(
    eval_results_train,
    eval_results_test,
    losses_train,
    losses_test,
    w,
) = cross_validation(
    y_train,
    x_train_temp,
    k_indices,
    model_fn=_mean_squared_error_sgd,
    loss_fn=compute_loss_mse,
    pred_fn=predict_mse,
    eval_fns={"accuracy": accuracy, "f1_score": f1_score},
)

Fold 1/5
Train loss: 298392628334543573155142652558532135352792472713598653088869898762414701322989814181120663759196846974025864802168388830065924996652335104.00000
Test loss: 302107941856076158872594127161256202867538475538666910206335436266565343675951070727501971223160553685227408199036409209103993925793742848.00000
Train accuracy: 0.57662
Train f1_score: 0.18236
Test accuracy: 0.57402
Test f1_score: 0.17699
------------------------------
Fold 2/5
Train loss: 1600415535684866623113488447903328662818674961272130001091422352304594476554305493897928686857879552.00000
Test loss: 1594087747759650093347890485445522803011051633256064768764376062747052179500100162329214317477494784.00000
Train accuracy: 0.39296
Train f1_score: 0.05541
Test accuracy: 0.39659
Test f1_score: 0.05498
------------------------------
Fold 3/5
Train loss: 2153389575252662413309984925453224543003240512120194786891408851557655450039897831543184894097625678437870776979495840959219267783652485977021087744.00000
Test l

In [23]:
# Summarise the cross-validation metrics: train split first, then a separator, then test.
for _position, (_title, _results) in enumerate(
    [("Train:", eval_results_train), ("Test:", eval_results_test)]
):
    if _position:
        print("-" * 20)
    print(_title)
    print_results(_results)

Train:
accuracy : 0.508958 ± 0.063296
f1_score : 0.140837 ± 0.047567
--------------------
Test:
accuracy : 0.508781 ± 0.061187
f1_score : 0.139217 ± 0.047377


#### Least squares

In [24]:
def _least_squares(y, tx, initial_w):
    """Adapter giving least_squares the (y, tx, initial_w) signature used as model_fn.

    initial_w is accepted only so that this wrapper can be called like the other model
    wrappers; it is not passed on to least_squares.

    Returns:
        Whatever least_squares returns.
    """
    return least_squares(y, tx)

In [26]:
# Cross validation of the least-squares model on the prepared folds.
(
    eval_results_train,
    eval_results_test,
    losses_train,
    losses_test,
    w,
) = cross_validation(
    y_train,
    x_train_temp,
    k_indices,
    model_fn=_least_squares,
    loss_fn=compute_loss_mse,
    pred_fn=predict_mse,
    eval_fns={"accuracy": accuracy, "f1_score": f1_score},
)

Fold 1/5
Train loss: 0.03825
Test loss: 0.03796
Train accuracy: 0.91156
Train f1_score: 0.00429
Test accuracy: 0.91301
Test f1_score: 0.00279
------------------------------
Fold 2/5
Train loss: 0.03831
Test loss: 0.03769
Train accuracy: 0.91146
Train f1_score: 0.00386
Test accuracy: 0.91334
Test f1_score: 0.00316
------------------------------
Fold 3/5
Train loss: 0.03814
Test loss: 0.03837
Train accuracy: 0.91192
Train f1_score: 0.00259
Test accuracy: 0.91129
Test f1_score: 0.00240
------------------------------
Fold 4/5
Train loss: 0.03812
Test loss: 0.03849
Train accuracy: 0.91202
Train f1_score: 0.00423
Test accuracy: 0.91121
Test f1_score: 0.00376
------------------------------
Fold 5/5
Train loss: 0.03802
Test loss: 0.03888
Train accuracy: 0.91225
Train f1_score: 0.00389
Test accuracy: 0.91022
Test f1_score: 0.00406
------------------------------


In [27]:
# Summarise the cross-validation metrics: train split first, then a separator, then test.
for _position, (_title, _results) in enumerate(
    [("Train:", eval_results_train), ("Test:", eval_results_test)]
):
    if _position:
        print("-" * 20)
    print(_title)
    print_results(_results)

Train:
accuracy : 0.911843 ± 0.000294
f1_score : 0.003770 ± 0.000616
--------------------
Test:
accuracy : 0.911814 ± 0.001179
f1_score : 0.003233 ± 0.000609


#### Ridge regression

In [28]:
# Model parameters
LAMBDA_ = 0.1


# Model
def _ridge_regression(y, tx, initial_w, lambda_=LAMBDA_):
    """Adapter giving ridge_regression the (y, tx, initial_w) signature used as model_fn.

    The regularisation strength is bound as a default argument when this cell runs.
    LAMBDA_ is reassigned by a later model cell, and a closure over the global would pick
    up that later value if the cross-validation cell were re-run out of order.

    Args:
        y: target values.
        tx: feature matrix.
        initial_w: accepted for signature compatibility only; not passed to ridge_regression.
        lambda_: regularisation strength (defaults to this cell's LAMBDA_).

    Returns:
        Whatever ridge_regression returns.
    """
    return ridge_regression(y, tx, lambda_)

In [29]:
# Cross validation of the ridge-regression model on the prepared folds.
(
    eval_results_train,
    eval_results_test,
    losses_train,
    losses_test,
    w,
) = cross_validation(
    y_train,
    x_train_temp,
    k_indices,
    model_fn=_ridge_regression,
    loss_fn=compute_loss_mse,
    pred_fn=predict_mse,
    eval_fns={"accuracy": accuracy, "f1_score": f1_score},
)

Fold 1/5
Train loss: 0.03834
Test loss: 0.03801
Train accuracy: 0.91144
Train f1_score: 0.00129
Test accuracy: 0.91293
Test f1_score: 0.00070
------------------------------
Fold 2/5
Train loss: 0.03841
Test loss: 0.03773
Train accuracy: 0.91135
Train f1_score: 0.00112
Test accuracy: 0.91328
Test f1_score: 0.00105
------------------------------
Fold 3/5
Train loss: 0.03824
Test loss: 0.03845
Train accuracy: 0.91186
Train f1_score: 0.00095
Test accuracy: 0.91121
Test f1_score: 0.00069
------------------------------
Fold 4/5
Train loss: 0.03821
Test loss: 0.03857
Train accuracy: 0.91189
Train f1_score: 0.00086
Test accuracy: 0.91112
Test f1_score: 0.00171
------------------------------
Fold 5/5
Train loss: 0.03811
Test loss: 0.03895
Train accuracy: 0.91216
Train f1_score: 0.00156
Test accuracy: 0.91017
Test f1_score: 0.00203
------------------------------


In [30]:
# Summarise the cross-validation metrics: train split first, then a separator, then test.
for _position, (_title, _results) in enumerate(
    [("Train:", eval_results_train), ("Test:", eval_results_test)]
):
    if _position:
        print("-" * 20)
    print(_title)
    print_results(_results)

Train:
accuracy : 0.911740 ± 0.000303
f1_score : 0.001155 ± 0.000249
--------------------
Test:
accuracy : 0.911744 ± 0.001176
f1_score : 0.001236 ± 0.000545


#### Logistic regression

In [31]:
# Model parameters
MAX_ITERS = 100
GAMMA = 0.5


# Model
def _logistic_regression(y, tx, initial_w, max_iters=MAX_ITERS, gamma=GAMMA):
    """Adapter giving logistic_regression the (y, tx, initial_w) signature used as model_fn.

    The hyperparameters are bound as default arguments when this cell runs. MAX_ITERS and
    GAMMA are reassigned by other model cells, and a closure over the globals would silently
    pick up those values if the cross-validation cell were re-run out of order.

    Args:
        y: target values.
        tx: feature matrix.
        initial_w: initial weight vector.
        max_iters: number of iterations (defaults to this cell's MAX_ITERS).
        gamma: step size (defaults to this cell's GAMMA).

    Returns:
        Whatever logistic_regression returns.
    """
    return logistic_regression(y, tx, initial_w, max_iters, gamma)

In [32]:
# Cross validation of the logistic-regression model on the prepared folds.
(
    eval_results_train,
    eval_results_test,
    losses_train,
    losses_test,
    w,
) = cross_validation(
    y_train,
    x_train_temp,
    k_indices,
    model_fn=_logistic_regression,
    loss_fn=compute_loss_logistic,
    pred_fn=predict_logistic,
    eval_fns={"accuracy": accuracy, "f1_score": f1_score},
)

Fold 1/5
Train loss: 0.66826
Test loss: 0.67004
Train accuracy: 0.63552
Train f1_score: 0.30401
Test accuracy: 0.63413
Test f1_score: 0.29815
------------------------------
Fold 2/5
Train loss: 0.66845
Test loss: 0.66926
Train accuracy: 0.63366
Train f1_score: 0.30324
Test accuracy: 0.63367
Test f1_score: 0.29818
------------------------------
Fold 3/5
Train loss: 0.66837
Test loss: 0.66965
Train accuracy: 0.63484
Train f1_score: 0.30214
Test accuracy: 0.63207
Test f1_score: 0.30347
------------------------------
Fold 4/5
Train loss: 0.66866
Test loss: 0.66845
Train accuracy: 0.63530
Train f1_score: 0.30267
Test accuracy: 0.63555
Test f1_score: 0.30443
------------------------------
Fold 5/5
Train loss: 0.66818
Test loss: 0.67042
Train accuracy: 0.63515
Train f1_score: 0.30178
Test accuracy: 0.63172
Test f1_score: 0.30455
------------------------------


In [33]:
# Summarise the cross-validation metrics: train split first, then a separator, then test.
for _position, (_title, _results) in enumerate(
    [("Train:", eval_results_train), ("Test:", eval_results_test)]
):
    if _position:
        print("-" * 20)
    print(_title)
    print_results(_results)

Train:
accuracy : 0.634893 ± 0.000653
f1_score : 0.302769 ± 0.000793
--------------------
Test:
accuracy : 0.633428 ± 0.001399
f1_score : 0.301754 ± 0.002956


#### Regularized logistic regression

In [34]:
# Model parameters
MAX_ITERS = 100
GAMMA = 0.1
LAMBDA_ = 0.1


# Model
def _reg_logistic_regression(
    y, tx, initial_w, max_iters=MAX_ITERS, gamma=GAMMA, lambda_=LAMBDA_
):
    """Adapter giving reg_logistic_regression the (y, tx, initial_w) signature used as model_fn.

    The hyperparameters are bound as default arguments when this cell runs. MAX_ITERS,
    GAMMA and LAMBDA_ are also assigned by other model cells, and a closure over the globals
    would silently pick up those values if cells were re-run out of order.

    Args:
        y: target values.
        tx: feature matrix.
        initial_w: initial weight vector.
        max_iters: number of iterations (defaults to this cell's MAX_ITERS).
        gamma: step size (defaults to this cell's GAMMA).
        lambda_: regularisation strength (defaults to this cell's LAMBDA_).

    Returns:
        Whatever reg_logistic_regression returns.
    """
    # Positional order is kept exactly as in the original call.
    return reg_logistic_regression(y, tx, initial_w, max_iters, gamma, lambda_)

In [35]:
# Cross validation of the regularized logistic-regression model on the prepared folds.
(
    eval_results_train,
    eval_results_test,
    losses_train,
    losses_test,
    w,
) = cross_validation(
    y_train,
    x_train_temp,
    k_indices,
    model_fn=_reg_logistic_regression,
    loss_fn=compute_loss_logistic,
    pred_fn=predict_logistic,
    eval_fns={"accuracy": accuracy, "f1_score": f1_score},
)

Fold 1/5
Train loss: 0.67029
Test loss: 0.67118
Train accuracy: 0.62906
Train f1_score: 0.30135
Test accuracy: 0.62893
Test f1_score: 0.29675
------------------------------
Fold 2/5
Train loss: 0.67045
Test loss: 0.67075
Train accuracy: 0.62752
Train f1_score: 0.30080
Test accuracy: 0.62784
Test f1_score: 0.29594
------------------------------
Fold 3/5
Train loss: 0.67037
Test loss: 0.67114
Train accuracy: 0.62898
Train f1_score: 0.29984
Test accuracy: 0.62579
Test f1_score: 0.30046
------------------------------
Fold 4/5
Train loss: 0.67066
Test loss: 0.67009
Train accuracy: 0.62806
Train f1_score: 0.29959
Test accuracy: 0.63015
Test f1_score: 0.30233
------------------------------
Fold 5/5
Train loss: 0.67020
Test loss: 0.67150
Train accuracy: 0.62879
Train f1_score: 0.29943
Test accuracy: 0.62628
Test f1_score: 0.30292
------------------------------


In [37]:
# Summarise the cross-validation metrics: train split first, then a separator, then test.
for _position, (_title, _results) in enumerate(
    [("Train:", eval_results_train), ("Test:", eval_results_test)]
):
    if _position:
        print("-" * 20)
    print(_title)
    print_results(_results)

Train:
accuracy : 0.628483 ± 0.000596
f1_score : 0.300203 ± 0.000744
--------------------
Test:
accuracy : 0.627800 ± 0.001621
f1_score : 0.299680 ± 0.002854
