### Experiment pipeline

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed, so edits to the project files are picked up.
%load_ext autoreload
%autoreload 2


In [2]:
import sys

# Add the parent directory to the import path (presumably where the project modules imported below live).
sys.path.append("../")

In [3]:
import numpy as np
from helpers import load_csv_data
from feature_processing import (
    drop_columns,
    drop_correlated_columns,
    drop_single_value_columns,
    median_imputation,
    mean_imputation,
    standardize,
    build_poly,
    build_k_indices,
    build_log,
    build_ratios,
)
from cross_validation import (
    predict_mse,
    predict_logistic,
    accuracy,
    f1_score,
    print_results,
    cross_validation,
)
from implementations import (
    mean_squared_error_gd,
    mean_squared_error_sgd,
    least_squares,
    ridge_regression,
    logistic_regression,
    reg_logistic_regression,
)
from implementations_utils import compute_loss_mse, compute_loss_logistic
import csv
from pipeline import execute_pipeline

In [4]:
# Read the five arrays returned by load_csv_data for the "../../data/" directory
# and bind them to their notebook-level names.
print("Loading data...")
loaded_arrays = load_csv_data("../../data/")
x_train, x_test, y_train, train_ids, test_ids = loaded_arrays
print("Data loaded...")

Loading data...
Data loaded...


In [5]:
# Report the shape of every loaded array, one line per array.
for caption, array in (
    ("x_train.shape =", x_train),
    ("x_test.shape =", x_test),
    ("y_train.shape =", y_train),
    ("train_ids.shape =", train_ids),
    ("test_ids.shape =", test_ids),
):
    print(caption, array.shape)

x_train.shape = (328135, 321)
x_test.shape = (109379, 321)
y_train.shape = (328135,)
train_ids.shape = (328135,)
test_ids.shape = (109379,)


In [6]:
# Re-encode the labels in place: every -1 in y_train becomes 0.
y_train[y_train == -1] = 0

In [7]:
# Accumulator for experiment results: a list of dicts (presumably one per evaluated model/configuration).
results = []
# Example of a result row (keys mirror the CSV columns written further down; values are illustrative):
# new_result = {
#     "Drop Threshold": 0.5,
#     "Drop corr thresh.": 0.6,
#     "Imputation": "mean",
#     "Standardization": "z-score",
#     "Build Poly": True,
#     "Degree": 2,
#     "Build Log": False,
#     "Build Ratios": True,
#     "Model": "Logistic",
#     "Initial W": 0.5,
#     "Max Iters": 100,
#     "Gamma": 0.01,
#     "Lambda": 0.1,
#     "CV F1 std": 0.003,
#     "CV Accuracy std": 0.001,
#     "CV F1": 0.9,
#     "CV Accuracy": 0.95,
# }
# results.append(new_result)

### Experiments using different drop NaN threshold

In [8]:
#### BASELINE ####
# Configuration constants for this experiment; they are passed positionally to execute_pipeline.

### Column dropping and imputation ###
# threshold for drop_columns function
DROP_NAN_THRESHOLD = 0.9  # 1 = keep everything, 0 = drop everything that contains nan
# threshold for categorical/numerical (categorical are imputed with median, numerical with mean)
CAT_NUM_THRESHOLD = 90
# flag for drop_single_value_columns function
# Should always be True; otherwise it interferes with the correlation coefficient.
DROP_SINGLE = True
# threshold for drop_correlated_columns function
DROP_CORR_THRESHOLD = 0.8

### Feature processing ###
# flag for build_poly function
BUILD_POLY = False
# degree for build_poly function
DEGREE = 2
# flag for build_log function
BUILD_LOG = True
# flag for build_ratios function
BUILD_RATIOS = True
# flag for standardize function
STANDARDIZE = True

# number of cross-validation folds (the pipeline log prints "Fold k/5")
NUM_FOLDS = 5

# training hyper-parameters forwarded to execute_pipeline
# (presumably step size, iteration budget and regularization strength — confirm in pipeline.py)
GAMMA = 0.5
MAX_ITERS = 300
LAMBDA = 0.1

In [9]:
# Start the drop-threshold experiment with an empty list of result rows.
results = []

In [10]:
# Test different drop thresholds, 0 to 1 with 0.1 step.
# np.linspace with an explicit number of points replaces np.arange with a
# float step (whose length can vary with floating-point rounding); rounding to
# one decimal keeps the recorded thresholds clean (0.3, not 0.30000000000000004).
thresholds = np.linspace(0.0, 1.0, 11).round(1)

for threshold in thresholds:
    # The module-level constant is updated so it always reflects the value being evaluated.
    DROP_NAN_THRESHOLD = threshold
    # execute_pipeline receives the current results list and returns the updated one.
    results = execute_pipeline(
        x_train,
        x_test,
        y_train,
        DROP_NAN_THRESHOLD,
        DROP_CORR_THRESHOLD,
        CAT_NUM_THRESHOLD,
        DROP_SINGLE,
        BUILD_RATIOS,
        BUILD_LOG,
        BUILD_POLY,
        DEGREE,
        STANDARDIZE,
        NUM_FOLDS,
        GAMMA,
        MAX_ITERS,
        LAMBDA,
        results,
    )

Dropping columns with DROP_NAN_THRESHOLD = 0.0...
Dropping single valued columns...
Standardizing...
Fold 1/5
Train loss: 0.67452
Test loss: 0.67480
Train accuracy: 0.61024
Train f1_score: 0.28764
Test accuracy: 0.61060
Test f1_score: 0.28343
------------------------------
Fold 2/5
Train loss: 0.67456
Test loss: 0.67457
Train accuracy: 0.60903
Train f1_score: 0.28751
Test accuracy: 0.60900
Test f1_score: 0.28268
------------------------------
Fold 3/5
Train loss: 0.67455
Test loss: 0.67467
Train accuracy: 0.60862
Train f1_score: 0.28531
Test accuracy: 0.60663
Test f1_score: 0.28803
------------------------------
Fold 4/5
Train loss: 0.67468
Test loss: 0.67413
Train accuracy: 0.60989
Train f1_score: 0.28649
Test accuracy: 0.61136
Test f1_score: 0.28854
------------------------------
Fold 5/5
Train loss: 0.67434
Test loss: 0.67551
Train accuracy: 0.60960
Train f1_score: 0.28579
Test accuracy: 0.60722
Test f1_score: 0.28838
------------------------------
Train:
accuracy : 0.609476 ± 0.000

In [11]:
print("Saving to csv...")

# Persist every result row of the drop-threshold experiment.
# newline="" is required by the csv module: without it, platforms that
# translate "\n" on write produce an extra blank line after every row.
with open("../results/results_drop_nan.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "Drop Threshold",
            "Drop corr thresh.",
            "Imputation",
            "Standardization",
            "Build Poly",
            "Degree",
            "Build Log",
            "Build Ratios",
            "Model",
            "Initial W",
            "Max Iters",
            "Gamma",
            "Lambda",
            "CV F1 std",
            "CV Accuracy std",
            "CV F1",
            "CV Accuracy",
        ],
    )

    writer.writeheader()
    writer.writerows(results)

Saving to csv...


In [12]:
# Load the saved rows back as dictionaries keyed by the CSV header (all values are strings).
# newline="" lets the csv module handle line endings itself, as its documentation requires.
with open("../results/results_drop_nan.csv", "r", newline="") as csvfile:
    results_drop_nan = list(csv.DictReader(csvfile))

In [15]:
# Determine, for each model, the drop-NaN-threshold run with the highest CV F1.
# Rows are filtered by model before taking the maximum, so a row belonging to
# the other model can never be returned (max() raises ValueError if a model
# has no rows at all instead of silently reporting the wrong row).
best_result_logistic = max(
    (row for row in results_drop_nan if row["Model"] == "Logistic_Regression"),
    key=lambda row: float(row["CV F1"]),
)
best_result_reg_logistic = max(
    (row for row in results_drop_nan if row["Model"] == "Reg_Logistic_Regression"),
    key=lambda row: float(row["CV F1"]),
)

# The CSV values are strings, so they are concatenated as-is for display.
print("Best result for Logistic Regression: ")
print(best_result_logistic["CV F1"] + "+/-" + best_result_logistic["CV F1 std"])
print(
    best_result_logistic["CV Accuracy"]
    + "+/-"
    + best_result_logistic["CV Accuracy std"]
)
print("With Drop Threshold: ")
print(best_result_logistic["Drop Threshold"])


print("Best result for Reg Logistic Regression:")
print(best_result_reg_logistic["CV F1"] + "+/-" + best_result_reg_logistic["CV F1 std"])
print(
    best_result_reg_logistic["CV Accuracy"]
    + "+/-"
    + best_result_reg_logistic["CV Accuracy std"]
)
print("With Drop Threshold: ")
print(best_result_reg_logistic["Drop Threshold"])

Best result for Logistic Regression: 
0.3017536017318743+/-0.0029558990866196816
0.6334283145656514+/-0.001399176317764829
With Drop Threshold: 
1.0
Best result for Reg Logistic Regression:
0.299691008421669+/-0.002840396345990438
0.6278604842519085+/-0.0015924191815075736
With Drop Threshold: 
1.0


### Experiments using different CAT_NUM_THRESHOLD values (we choose 200, a choice motivated by the graph from the EDA)

In [16]:
# Start the CAT_NUM_THRESHOLD experiment with an empty list of result rows.
results = []

In [17]:
#### BASELINE ####
# Configuration constants for this experiment; they are passed positionally to execute_pipeline.

### Column dropping and imputation ###
# threshold for drop_columns function
DROP_NAN_THRESHOLD = 1  # 1 = keep everything, 0 = drop everything that contains nan
# threshold for categorical/numerical (categorical are imputed with median, numerical with mean)
CAT_NUM_THRESHOLD = 30
# flag for drop_single_value_columns function
# Should always be True; otherwise it interferes with the correlation coefficient.
DROP_SINGLE = True
# threshold for drop_correlated_columns function
DROP_CORR_THRESHOLD = 0.9

### Feature processing ###
# flag for build_poly function
BUILD_POLY = False
# degree for build_poly function
DEGREE = 2
# flag for build_log function
BUILD_LOG = False
# flag for build_ratios function
BUILD_RATIOS = False
# flag for standardize function
STANDARDIZE = True

# number of cross-validation folds (the pipeline log prints "Fold k/5")
NUM_FOLDS = 5

# training hyper-parameters forwarded to execute_pipeline
# (presumably step size, iteration budget and regularization strength — confirm in pipeline.py)
GAMMA = 0.5
MAX_ITERS = 100
LAMBDA = 0.1

In [18]:
# Sweep the categorical/numerical threshold and save all result rows to CSV.
results = []

cat_thresholds = [30, 50, 100, 200, 300]

for cat_threshold in cat_thresholds:
    # The module-level constant is updated so it always reflects the value being evaluated.
    CAT_NUM_THRESHOLD = cat_threshold
    # execute_pipeline receives the current results list and returns the updated one.
    results = execute_pipeline(
        x_train,
        x_test,
        y_train,
        DROP_NAN_THRESHOLD,
        DROP_CORR_THRESHOLD,
        CAT_NUM_THRESHOLD,
        DROP_SINGLE,
        BUILD_RATIOS,
        BUILD_LOG,
        BUILD_POLY,
        DEGREE,
        STANDARDIZE,
        NUM_FOLDS,
        GAMMA,
        MAX_ITERS,
        LAMBDA,
        results,
    )

# newline="" is required by the csv module: without it, platforms that
# translate "\n" on write produce an extra blank line after every row.
with open("../results/results_cat_threshold.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "Drop Threshold",
            "Drop corr thresh.",
            "Imputation",
            "Standardization",
            "Build Poly",
            "Degree",
            "Build Log",
            "Build Ratios",
            "Model",
            "Initial W",
            "Max Iters",
            "Gamma",
            "Lambda",
            "CV F1 std",
            "CV Accuracy std",
            "CV F1",
            "CV Accuracy",
        ],
    )

    writer.writeheader()
    writer.writerows(results)

Dropping columns with DROP_NAN_THRESHOLD = 1...
Dropping single valued columns...
Standardizing...
Fold 1/5
Train loss: 0.66826
Test loss: 0.67004
Train accuracy: 0.63552
Train f1_score: 0.30401
Test accuracy: 0.63413
Test f1_score: 0.29815
------------------------------
Fold 2/5
Train loss: 0.66845
Test loss: 0.66926
Train accuracy: 0.63366
Train f1_score: 0.30324
Test accuracy: 0.63367
Test f1_score: 0.29818
------------------------------
Fold 3/5
Train loss: 0.66837
Test loss: 0.66965
Train accuracy: 0.63484
Train f1_score: 0.30214
Test accuracy: 0.63207
Test f1_score: 0.30347
------------------------------
Fold 4/5
Train loss: 0.66866
Test loss: 0.66845
Train accuracy: 0.63530
Train f1_score: 0.30267
Test accuracy: 0.63555
Test f1_score: 0.30443
------------------------------
Fold 5/5
Train loss: 0.66818
Test loss: 0.67042
Train accuracy: 0.63515
Train f1_score: 0.30178
Test accuracy: 0.63172
Test f1_score: 0.30455
------------------------------
Train:
accuracy : 0.634893 ± 0.00065

In [19]:
# Load the saved rows back as dictionaries keyed by the CSV header (all values are strings).
# newline="" lets the csv module handle line endings itself, as its documentation requires.
with open("../results/results_cat_threshold.csv", "r", newline="") as csvfile:
    results_cat_threshold = list(csv.DictReader(csvfile))

In [20]:
# Determine, for each model, the CAT_NUM_THRESHOLD run with the highest CV F1.
# Rows are filtered by model before taking the maximum, so a row belonging to
# the other model can never be returned (max() raises ValueError if a model
# has no rows at all instead of silently reporting the wrong row).
best_result_logistic = max(
    (row for row in results_cat_threshold if row["Model"] == "Logistic_Regression"),
    key=lambda row: float(row["CV F1"]),
)
best_result_reg_logistic = max(
    (
        row
        for row in results_cat_threshold
        if row["Model"] == "Reg_Logistic_Regression"
    ),
    key=lambda row: float(row["CV F1"]),
)

# NOTE(review): the "Imputation" column appears to hold the CAT_NUM_THRESHOLD
# value (the recorded output prints 200) — confirm against execute_pipeline.
print("Best result for Logistic Regression: ")
print(best_result_logistic["CV F1"] + "+/-" + best_result_logistic["CV F1 std"])
print(
    best_result_logistic["CV Accuracy"]
    + "+/-"
    + best_result_logistic["CV Accuracy std"]
)
print("With CAT Threshold: ")
print(best_result_logistic["Imputation"])


print("Best result for Reg Logistic Regression:")
print(best_result_reg_logistic["CV F1"] + "+/-" + best_result_reg_logistic["CV F1 std"])
print(
    best_result_reg_logistic["CV Accuracy"]
    + "+/-"
    + best_result_reg_logistic["CV Accuracy std"]
)
print("With CAT Threshold: ")
print(best_result_reg_logistic["Imputation"])

Best result for Logistic Regression: 
0.30186279074049305+/-0.0031723432697801472
0.6334770749843814+/-0.0014811541535213306
With CAT Threshold: 
200
Best result for Reg Logistic Regression:
0.2998218405540042+/-0.0029338138186970606
0.6282048547091897+/-0.0018794220153454323
With CAT Threshold: 
200


### Experiment using different drop correlation values

In [21]:
# Start the correlation-threshold experiment with an empty list of result rows.
results = []

In [22]:
#### BASELINE ####
# Configuration constants for this experiment; they are passed positionally to execute_pipeline.

### Column dropping and imputation ###
# threshold for drop_columns function
DROP_NAN_THRESHOLD = 1  # 1 = keep everything, 0 = drop everything that contains nan
# threshold for categorical/numerical (categorical are imputed with median, numerical with mean)
CAT_NUM_THRESHOLD = 30
# flag for drop_single_value_columns function
# Should always be True; otherwise it interferes with the correlation coefficient.
DROP_SINGLE = True
# threshold for drop_correlated_columns function
DROP_CORR_THRESHOLD = 0.9

### Feature processing ###
# flag for build_poly function
BUILD_POLY = False
# degree for build_poly function
DEGREE = 2
# flag for build_log function
BUILD_LOG = False
# flag for build_ratios function
BUILD_RATIOS = False
# flag for standardize function
STANDARDIZE = True

# number of cross-validation folds (the pipeline log prints "Fold k/5")
NUM_FOLDS = 5

# training hyper-parameters forwarded to execute_pipeline
# (presumably step size, iteration budget and regularization strength — confirm in pipeline.py)
GAMMA = 0.5
MAX_ITERS = 100
LAMBDA = 0.1

In [23]:
# Sweep the correlated-column drop threshold and save all result rows to CSV.
results = []

DROP_NAN_THRESHOLD = 1
# NOTE(review): CAT_NUM_THRESHOLD stays at 30 here although the previous
# section reported 200 as the best value — confirm this is intended.
CAT_NUM_THRESHOLD = 30

corr_thresholds = [0.85, 0.9, 0.95, 0.99]

for corr_threshold in corr_thresholds:
    # The module-level constant is updated so it always reflects the value being evaluated.
    DROP_CORR_THRESHOLD = corr_threshold
    # execute_pipeline receives the current results list and returns the updated one.
    results = execute_pipeline(
        x_train,
        x_test,
        y_train,
        DROP_NAN_THRESHOLD,
        DROP_CORR_THRESHOLD,
        CAT_NUM_THRESHOLD,
        DROP_SINGLE,
        BUILD_RATIOS,
        BUILD_LOG,
        BUILD_POLY,
        DEGREE,
        STANDARDIZE,
        NUM_FOLDS,
        GAMMA,
        MAX_ITERS,
        LAMBDA,
        results,
    )

# newline="" is required by the csv module: without it, platforms that
# translate "\n" on write produce an extra blank line after every row.
with open("../results/results_corr_threshold.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "Drop Threshold",
            "Drop corr thresh.",
            "Imputation",
            "Standardization",
            "Build Poly",
            "Degree",
            "Build Log",
            "Build Ratios",
            "Model",
            "Initial W",
            "Max Iters",
            "Gamma",
            "Lambda",
            "CV F1 std",
            "CV Accuracy std",
            "CV F1",
            "CV Accuracy",
        ],
    )

    writer.writeheader()
    writer.writerows(results)

Dropping columns with DROP_NAN_THRESHOLD = 1...
Dropping single valued columns...
Standardizing...
Fold 1/5
Train loss: 0.66828
Test loss: 0.67003
Train accuracy: 0.63549
Train f1_score: 0.30404
Test accuracy: 0.63385
Test f1_score: 0.29812
------------------------------
Fold 2/5
Train loss: 0.66848
Test loss: 0.66923
Train accuracy: 0.63343
Train f1_score: 0.30323
Test accuracy: 0.63321
Test f1_score: 0.29808
------------------------------
Fold 3/5
Train loss: 0.66840
Test loss: 0.66962
Train accuracy: 0.63441
Train f1_score: 0.30191
Test accuracy: 0.63209
Test f1_score: 0.30368
------------------------------
Fold 4/5
Train loss: 0.66868
Test loss: 0.66847
Train accuracy: 0.63519
Train f1_score: 0.30269
Test accuracy: 0.63532
Test f1_score: 0.30425
------------------------------
Fold 5/5
Train loss: 0.66819
Test loss: 0.67045
Train accuracy: 0.63496
Train f1_score: 0.30181
Test accuracy: 0.63137
Test f1_score: 0.30423
------------------------------
Train:
accuracy : 0.634695 ± 0.00072

In [24]:
# Load the saved rows back as dictionaries keyed by the CSV header (all values are strings).
# newline="" lets the csv module handle line endings itself, as its documentation requires.
with open("../results/results_corr_threshold.csv", "r", newline="") as csvfile:
    results_corr_threshold = list(csv.DictReader(csvfile))

In [26]:
# Determine, for each model, the correlation-threshold run with the highest CV F1.
# Rows are filtered by model before taking the maximum, so a row belonging to
# the other model can never be returned (max() raises ValueError if a model
# has no rows at all instead of silently reporting the wrong row).
best_result_logistic = max(
    (row for row in results_corr_threshold if row["Model"] == "Logistic_Regression"),
    key=lambda row: float(row["CV F1"]),
)
best_result_reg_logistic = max(
    (
        row
        for row in results_corr_threshold
        if row["Model"] == "Reg_Logistic_Regression"
    ),
    key=lambda row: float(row["CV F1"]),
)

# The CSV values are strings, so they are concatenated as-is for display.
print("Best result for Logistic Regression: ")
print(best_result_logistic["CV F1"] + "+/-" + best_result_logistic["CV F1 std"])
print(
    best_result_logistic["CV Accuracy"]
    + "+/-"
    + best_result_logistic["CV Accuracy std"]
)
print("With corr Threshold: ")
print(best_result_logistic["Drop corr thresh."])


print("Best result for Reg Logistic Regression:")
print(best_result_reg_logistic["CV F1"] + "+/-" + best_result_reg_logistic["CV F1 std"])
print(
    best_result_reg_logistic["CV Accuracy"]
    + "+/-"
    + best_result_reg_logistic["CV Accuracy std"]
)
print("With corr Threshold: ")
print(best_result_reg_logistic["Drop corr thresh."])

Best result for Logistic Regression: 
0.3017536017318743+/-0.0029558990866196816
0.6334283145656514+/-0.001399176317764829
With corr Threshold: 
0.9
Best result for Reg Logistic Regression:
0.2997655939733219+/-0.002876811700852166
0.6278635317780792+/-0.001484817801258279
With corr Threshold: 
0.85


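We keep the correlation threshold at 0.9: it gives the best CV F1 for Logistic Regression, while Reg Logistic Regression is only marginally better at 0.85.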

### Experiment using BUILD POLY

In [27]:
# Start the polynomial-features experiment with an empty list of result rows.
results = []

In [28]:
#### BASELINE ####
# Configuration constants for this experiment; they are passed positionally to execute_pipeline.

### Column dropping and imputation ###
# threshold for drop_columns function
DROP_NAN_THRESHOLD = 1  # 1 = keep everything, 0 = drop everything that contains nan
# threshold for categorical/numerical (categorical are imputed with median, numerical with mean)
CAT_NUM_THRESHOLD = 30
# flag for drop_single_value_columns function
# Should always be True; otherwise it interferes with the correlation coefficient.
DROP_SINGLE = True
# threshold for drop_correlated_columns function
DROP_CORR_THRESHOLD = 0.9

### Feature processing ###
# flag for build_poly function
BUILD_POLY = False
# degree for build_poly function
DEGREE = 2
# flag for build_log function
BUILD_LOG = False
# flag for build_ratios function
BUILD_RATIOS = False
# flag for standardize function
STANDARDIZE = True

# number of cross-validation folds (the pipeline log prints "Fold k/5")
NUM_FOLDS = 5

# training hyper-parameters forwarded to execute_pipeline
# (presumably step size, iteration budget and regularization strength — confirm in pipeline.py)
GAMMA = 0.5
MAX_ITERS = 100
LAMBDA = 0.1

In [29]:
# Run the pipeline once with polynomial features enabled and save the result rows to CSV.
results = []

# Overrides on top of the baseline configuration for this experiment.
DROP_NAN_THRESHOLD = 1
CAT_NUM_THRESHOLD = 200
BUILD_POLY = True

DEGREE = 2  # when we use degree 3 the kernel crashes

# execute_pipeline receives the current results list and returns the updated one.
results = execute_pipeline(
    x_train,
    x_test,
    y_train,
    DROP_NAN_THRESHOLD,
    DROP_CORR_THRESHOLD,
    CAT_NUM_THRESHOLD,
    DROP_SINGLE,
    BUILD_RATIOS,
    BUILD_LOG,
    BUILD_POLY,
    DEGREE,
    STANDARDIZE,
    NUM_FOLDS,
    GAMMA,
    MAX_ITERS,
    LAMBDA,
    results,
)

# newline="" is required by the csv module: without it, platforms that
# translate "\n" on write produce an extra blank line after every row.
with open("../results/results_poly.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "Drop Threshold",
            "Drop corr thresh.",
            "Imputation",
            "Standardization",
            "Build Poly",
            "Degree",
            "Build Log",
            "Build Ratios",
            "Model",
            "Initial W",
            "Max Iters",
            "Gamma",
            "Lambda",
            "CV F1 std",
            "CV Accuracy std",
            "CV F1",
            "CV Accuracy",
        ],
    )

    writer.writeheader()
    writer.writerows(results)

Dropping columns with DROP_NAN_THRESHOLD = 1...
Building polynomial with degree = 2...
Dropping single valued columns...
Standardizing...
Fold 1/5
Train loss: 0.66710
Test loss: 0.66911
Train accuracy: 0.64386
Train f1_score: 0.30857
Test accuracy: 0.64210
Test f1_score: 0.30232
------------------------------
Fold 2/5
Train loss: 0.66715
Test loss: 0.66880
Train accuracy: 0.64264
Train f1_score: 0.30804
Test accuracy: 0.64195
Test f1_score: 0.30232
------------------------------
Fold 3/5
Train loss: 0.66714
Test loss: 0.66919
Train accuracy: 0.64349
Train f1_score: 0.30690
Test accuracy: 0.64099
Test f1_score: 0.30847
------------------------------
Fold 4/5
Train loss: 0.66750
Test loss: 0.66749
Train accuracy: 0.64270
Train f1_score: 0.30686
Test accuracy: 0.64280
Test f1_score: 0.30784
------------------------------
Fold 5/5
Train loss: 0.66704
Test loss: 0.66981
Train accuracy: 0.64296
Train f1_score: 0.30593
Test accuracy: 0.63928
Test f1_score: 0.30872
----------------------------

In [30]:
# Load the saved rows back as dictionaries keyed by the CSV header (all values are strings).
# newline="" lets the csv module handle line endings itself, as its documentation requires.
with open("../results/results_poly.csv", "r", newline="") as csvfile:
    results_poly = list(csv.DictReader(csvfile))

In [31]:
# Print "mean+/-std" for CV F1 and CV accuracy of the first two CSV rows.
# Row 0 is labelled Logistic Regression and row 1 Reg Logistic Regression
# (row order assumed from the saved file — TODO confirm).
for header, row_index in (
    ("Results for build poly with degree 2, Logistic Regression: ", 0),
    ("Results for build poly with degree 2, Reg Logistic Regression: ", 1),
):
    print(header)
    row = results_poly[row_index]
    print("+/-".join((row["CV F1"], row["CV F1 std"])))
    print("+/-".join((row["CV Accuracy"], row["CV Accuracy std"])))

Results for build poly with degree 2, Logistic Regression: 
0.3059342053019788+/-0.002965275068657712
0.6414219757112164+/-0.0012173560626491924
Results for build poly with degree 2, Reg Logistic Regression: 
0.3000286653491342+/-0.0027655394418345802
0.6282292349185548+/-0.0018229047245199273


### Experiment using BUILD LOG

In [52]:
# Start the log-features experiment with an empty list of result rows.
results = []

In [53]:
#### BASELINE ####
# Configuration constants for this experiment; they are passed positionally to execute_pipeline.

### Column dropping and imputation ###
# threshold for drop_columns function
DROP_NAN_THRESHOLD = 1  # 1 = keep everything, 0 = drop everything that contains nan
# threshold for categorical/numerical (categorical are imputed with median, numerical with mean)
CAT_NUM_THRESHOLD = 30
# flag for drop_single_value_columns function
# Should always be True; otherwise it interferes with the correlation coefficient.
DROP_SINGLE = True
# threshold for drop_correlated_columns function
DROP_CORR_THRESHOLD = 0.9

### Feature processing ###
# flag for build_poly function
BUILD_POLY = False
# degree for build_poly function
DEGREE = 2
# flag for build_log function
BUILD_LOG = False
# flag for build_ratios function
BUILD_RATIOS = False
# flag for standardize function
STANDARDIZE = True

# number of cross-validation folds (the pipeline log prints "Fold k/5")
NUM_FOLDS = 5

# training hyper-parameters forwarded to execute_pipeline
# (presumably step size, iteration budget and regularization strength — confirm in pipeline.py)
GAMMA = 0.5
MAX_ITERS = 100
LAMBDA = 0.1

In [54]:
# Run the pipeline once with log features enabled and save the result rows to CSV.
results = []

# Overrides on top of the baseline configuration for this experiment.
BUILD_LOG = True
CAT_NUM_THRESHOLD = 200
DROP_CORR_THRESHOLD = 0.9
DROP_NAN_THRESHOLD = 1

# execute_pipeline receives the current results list and returns the updated one.
results = execute_pipeline(
    x_train,
    x_test,
    y_train,
    DROP_NAN_THRESHOLD,
    DROP_CORR_THRESHOLD,
    CAT_NUM_THRESHOLD,
    DROP_SINGLE,
    BUILD_RATIOS,
    BUILD_LOG,
    BUILD_POLY,
    DEGREE,
    STANDARDIZE,
    NUM_FOLDS,
    GAMMA,
    MAX_ITERS,
    LAMBDA,
    results,
)

# newline="" is required by the csv module: without it, platforms that
# translate "\n" on write produce an extra blank line after every row.
with open("../results/results_log.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "Drop Threshold",
            "Drop corr thresh.",
            "Imputation",
            "Standardization",
            "Build Poly",
            "Degree",
            "Build Log",
            "Build Ratios",
            "Model",
            "Initial W",
            "Max Iters",
            "Gamma",
            "Lambda",
            "CV F1 std",
            "CV Accuracy std",
            "CV F1",
            "CV Accuracy",
        ],
    )

    writer.writeheader()
    writer.writerows(results)

Dropping columns with DROP_NAN_THRESHOLD = 1...
Building log...
Dropping single valued columns...
Standardizing...
Fold 1/5
Train loss: 0.66824
Test loss: 0.67002
Train accuracy: 0.63585
Train f1_score: 0.30419
Test accuracy: 0.63410
Test f1_score: 0.29813
------------------------------
Fold 2/5
Train loss: 0.66843
Test loss: 0.66928
Train accuracy: 0.63384
Train f1_score: 0.30349
Test accuracy: 0.63321
Test f1_score: 0.29779
------------------------------
Fold 3/5
Train loss: 0.66835
Test loss: 0.66968
Train accuracy: 0.63496
Train f1_score: 0.30205
Test accuracy: 0.63227
Test f1_score: 0.30382
------------------------------
Fold 4/5
Train loss: 0.66864
Test loss: 0.66845
Train accuracy: 0.63529
Train f1_score: 0.30263
Test accuracy: 0.63582
Test f1_score: 0.30463
------------------------------
Fold 5/5
Train loss: 0.66815
Test loss: 0.67046
Train accuracy: 0.63506
Train f1_score: 0.30179
Test accuracy: 0.63154
Test f1_score: 0.30476
------------------------------
Train:
accuracy : 0.

In [55]:
# Load the saved rows back as dictionaries keyed by the CSV header (all values are strings).
# newline="" lets the csv module handle line endings itself, as its documentation requires.
with open("../results/results_log.csv", "r", newline="") as csvfile:
    results_log = list(csv.DictReader(csvfile))

In [56]:
# Print "mean+/-std" for CV F1 and CV accuracy of the first two CSV rows.
# Row 0 is labelled Logistic Regression and row 1 Reg Logistic Regression
# (row order assumed from the saved file — TODO confirm).
for header, row_index in (
    ("Results for build log, Logistic Regression: ", 0),
    ("Results for build log, Reg Logistic Regression: ", 1),
):
    print(header)
    row = results_log[row_index]
    print("+/-".join((row["CV F1"], row["CV F1 std"])))
    print("+/-".join((row["CV Accuracy"], row["CV Accuracy std"])))

Results for build log, Logistic Regression: 
0.30182729233722394+/-0.0031742020883959066
0.6333886967254332+/-0.0014914083696359283
Results for build log, Reg Logistic Regression: 
0.29963755266459335+/-0.0031854647106852167
0.6277507733097658+/-0.001660233409478766


### Experiment using BUILD RATIOS

In [46]:
# Start the ratio-features experiment with an empty list of result rows.
results = []

In [48]:
#### BASELINE ####
# Configuration constants for this experiment; they are passed positionally to execute_pipeline.

### Column dropping and imputation ###
# threshold for drop_columns function
DROP_NAN_THRESHOLD = 1  # 1 = keep everything, 0 = drop everything that contains nan
# threshold for categorical/numerical (categorical are imputed with median, numerical with mean)
CAT_NUM_THRESHOLD = 30
# flag for drop_single_value_columns function
# Should always be True; otherwise it interferes with the correlation coefficient.
DROP_SINGLE = True
# threshold for drop_correlated_columns function
DROP_CORR_THRESHOLD = 0.9

### Feature processing ###
# flag for build_poly function
BUILD_POLY = False
# degree for build_poly function
DEGREE = 2
# flag for build_log function
BUILD_LOG = False
# flag for build_ratios function
BUILD_RATIOS = False
# flag for standardize function
STANDARDIZE = True

# number of cross-validation folds (the pipeline log prints "Fold k/5")
NUM_FOLDS = 5

# training hyper-parameters forwarded to execute_pipeline
# (presumably step size, iteration budget and regularization strength — confirm in pipeline.py)
GAMMA = 0.5
MAX_ITERS = 100
LAMBDA = 0.1

In [49]:
# Run the pipeline once with ratio features enabled and save the result rows to CSV.
results = []

# Overrides on top of the baseline configuration for this experiment.
BUILD_RATIOS = True
CAT_NUM_THRESHOLD = 200
DROP_NAN_THRESHOLD = 1
DROP_CORR_THRESHOLD = 0.9
# execute_pipeline receives the current results list and returns the updated one.
results = execute_pipeline(
    x_train,
    x_test,
    y_train,
    DROP_NAN_THRESHOLD,
    DROP_CORR_THRESHOLD,
    CAT_NUM_THRESHOLD,
    DROP_SINGLE,
    BUILD_RATIOS,
    BUILD_LOG,
    BUILD_POLY,
    DEGREE,
    STANDARDIZE,
    NUM_FOLDS,
    GAMMA,
    MAX_ITERS,
    LAMBDA,
    results,
)

# newline="" is required by the csv module: without it, platforms that
# translate "\n" on write produce an extra blank line after every row.
with open("../results/results_ratios.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "Drop Threshold",
            "Drop corr thresh.",
            "Imputation",
            "Standardization",
            "Build Poly",
            "Degree",
            "Build Log",
            "Build Ratios",
            "Model",
            "Initial W",
            "Max Iters",
            "Gamma",
            "Lambda",
            "CV F1 std",
            "CV Accuracy std",
            "CV F1",
            "CV Accuracy",
        ],
    )

    writer.writeheader()
    writer.writerows(results)

Dropping columns with DROP_NAN_THRESHOLD = 1...
Building ratios...
Dropping single valued columns...
Standardizing...
Fold 1/5
Train loss: 0.66824
Test loss: 0.67008
Train accuracy: 0.63566
Train f1_score: 0.30393
Test accuracy: 0.63413
Test f1_score: 0.29811
------------------------------
Fold 2/5
Train loss: 0.66843
Test loss: 0.66934
Train accuracy: 0.63372
Train f1_score: 0.30332
Test accuracy: 0.63325
Test f1_score: 0.29777
------------------------------
Fold 3/5
Train loss: 0.66837
Test loss: 0.66966
Train accuracy: 0.63484
Train f1_score: 0.30214
Test accuracy: 0.63219
Test f1_score: 0.30358
------------------------------
Fold 4/5
Train loss: 0.66865
Test loss: 0.66846
Train accuracy: 0.63533
Train f1_score: 0.30263
Test accuracy: 0.63634
Test f1_score: 0.30505
------------------------------
Fold 5/5
Train loss: 0.66817
Test loss: 0.67043
Train accuracy: 0.63515
Train f1_score: 0.30182
Test accuracy: 0.63155
Test f1_score: 0.30449
------------------------------
Train:
accuracy :

In [50]:
# Load the saved rows back as dictionaries keyed by the CSV header (all values are strings).
# newline="" lets the csv module handle line endings itself, as its documentation requires.
with open("../results/results_ratios.csv", "r", newline="") as csvfile:
    results_ratios = list(csv.DictReader(csvfile))

In [51]:
# Print "mean+/-std" for CV F1 and CV accuracy of the first two CSV rows.
# Row 0 is labelled Logistic Regression and row 1 Reg Logistic Regression
# (row order assumed from the saved file — TODO confirm).
for header, row_index in (
    ("Results for build ratios, Logistic Regression: ", 0),
    ("Results for build ratios, Reg Logistic Regression: ", 1),
):
    print(header)
    row = results_ratios[row_index]
    print("+/-".join((row["CV F1"], row["CV F1 std"])))
    print("+/-".join((row["CV Accuracy"], row["CV Accuracy std"])))

Results for build ratios, Logistic Regression: 
0.30179929093616437+/-0.0031890864971147763
0.6334923126152345+/-0.0016739603814959311
Results for build ratios, Reg Logistic Regression: 
0.2997544266217278+/-0.0027273148251637743
0.6281256190287533+/-0.001813751784724359


### Ratio + Poly

In [7]:
# Start the ratios + polynomial experiment with an empty list of result rows.
results = []

In [11]:
#### BASELINE ####
# Configuration constants for this experiment; they are passed positionally to execute_pipeline.

### Column dropping and imputation ###
# threshold for drop_columns function
DROP_NAN_THRESHOLD = 1  # 1 = keep everything, 0 = drop everything that contains nan
# threshold for categorical/numerical (categorical are imputed with median, numerical with mean)
CAT_NUM_THRESHOLD = 30
# flag for drop_single_value_columns function
# Should always be True; otherwise it interferes with the correlation coefficient.
DROP_SINGLE = True
# threshold for drop_correlated_columns function
DROP_CORR_THRESHOLD = 0.9

### Feature processing ###
# flag for build_poly function
BUILD_POLY = False
# degree for build_poly function
DEGREE = 2
# flag for build_log function
BUILD_LOG = False
# flag for build_ratios function
BUILD_RATIOS = False
# flag for standardize function
STANDARDIZE = True

# number of cross-validation folds (the pipeline log prints "Fold k/5")
NUM_FOLDS = 5

# training hyper-parameters forwarded to execute_pipeline
# (presumably step size, iteration budget and regularization strength — confirm in pipeline.py)
GAMMA = 0.5
MAX_ITERS = 100
LAMBDA = 0.1

In [12]:
# Run the pipeline once with both ratio and polynomial features enabled
# and save the result rows to CSV.
results = []

# Overrides on top of the baseline configuration for this experiment.
BUILD_RATIOS = True
BUILD_LOG = False
BUILD_POLY = True
CAT_NUM_THRESHOLD = 200
DROP_NAN_THRESHOLD = 1
DROP_CORR_THRESHOLD = 0.9
# execute_pipeline receives the current results list and returns the updated one.
results = execute_pipeline(
    x_train,
    x_test,
    y_train,
    DROP_NAN_THRESHOLD,
    DROP_CORR_THRESHOLD,
    CAT_NUM_THRESHOLD,
    DROP_SINGLE,
    BUILD_RATIOS,
    BUILD_LOG,
    BUILD_POLY,
    DEGREE,
    STANDARDIZE,
    NUM_FOLDS,
    GAMMA,
    MAX_ITERS,
    LAMBDA,
    results,
)

# newline="" is required by the csv module: without it, platforms that
# translate "\n" on write produce an extra blank line after every row.
with open("../results/results_ratio_poly.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "Drop Threshold",
            "Drop corr thresh.",
            "Imputation",
            "Standardization",
            "Build Poly",
            "Degree",
            "Build Log",
            "Build Ratios",
            "Model",
            "Initial W",
            "Max Iters",
            "Gamma",
            "Lambda",
            "CV F1 std",
            "CV Accuracy std",
            "CV F1",
            "CV Accuracy",
        ],
    )

    writer.writeheader()
    writer.writerows(results)

Dropping columns with DROP_NAN_THRESHOLD = 1...
Building ratios...
Building polynomial with degree = 2...
Dropping single valued columns...
Standardizing...
Fold 1/5
Train loss: 0.66708
Test loss: 0.66919
Train accuracy: 0.64393
Train f1_score: 0.30851
Test accuracy: 0.64198
Test f1_score: 0.30238
------------------------------
Fold 2/5
Train loss: 0.66712
Test loss: 0.66889
Train accuracy: 0.64263
Train f1_score: 0.30809
Test accuracy: 0.64141
Test f1_score: 0.30167
------------------------------
Fold 3/5
Train loss: 0.66712
Test loss: 0.66929
Train accuracy: 0.64343
Train f1_score: 0.30688
Test accuracy: 0.64112
Test f1_score: 0.30860
------------------------------
Fold 4/5
Train loss: 0.66749
Test loss: 0.66753
Train accuracy: 0.64262
Train f1_score: 0.30659
Test accuracy: 0.64256
Test f1_score: 0.30790
------------------------------
Fold 5/5
Train loss: 0.66703
Test loss: 0.66984
Train accuracy: 0.64303
Train f1_score: 0.30601
Test accuracy: 0.63940
Test f1_score: 0.30863
---------

In [13]:
# Load the saved rows back as dictionaries keyed by the CSV header (all values are strings).
# newline="" lets the csv module handle line endings itself, as its documentation requires.
with open("../results/results_ratio_poly.csv", "r", newline="") as csvfile:
    results_ratio_poly = list(csv.DictReader(csvfile))

In [14]:
# Print "mean+/-std" for CV F1 and CV accuracy of the first two CSV rows.
# The labels now name the ratios + poly experiment; they previously said
# "build ratios" only, which mislabelled these rows.
# Row 0 is labelled Logistic Regression and row 1 Reg Logistic Regression
# (row order assumed from the saved file — TODO confirm).
for header, row_index in (
    ("Results for build ratios + poly, Logistic Regression: ", 0),
    ("Results for build ratios + poly, Reg Logistic Regression: ", 1),
):
    print(header)
    row = results_ratio_poly[row_index]
    print(row["CV F1"] + "+/-" + row["CV F1 std"])
    print(row["CV Accuracy"] + "+/-" + row["CV Accuracy std"])

Results for build ratios, Logistic Regression: 
0.30583396906534166+/-0.0031304020342082855
0.6412939796120499+/-0.0010660854638806165
Results for build ratios, Reg Logistic Regression: 
0.3001154604383235+/-0.0025379873578594316
0.6283237082298444+/-0.0019650046062295026


Not an improvement, so we will use only poly from now on.

### Experiments using different gammas

In [10]:
# Empty accumulator for the rows produced by the gamma experiments.
results = list()

In [11]:
#### BASELINE ####
# Settings handed to execute_pipeline by the gamma experiment cell.
# NOTE(review): the conclusion just above says only poly will be used from now
# on, yet this baseline turns ratios and log on and poly off. Confirm that
# this is the intended configuration for the gamma sweep.

### Cross-validation and training values ###
NUM_FOLDS = 5
GAMMA = 0.5
MAX_ITERS = 300
LAMBDA = 0.1

### Feature processing ###
# Switch for the standardize function.
STANDARDIZE = True
# Switch for the ratio features (build_ratios).
BUILD_RATIOS = True
# Switch for the build_log function.
BUILD_LOG = True
# Switch for the build_poly function, and the degree it is called with.
BUILD_POLY = False
DEGREE = 2

### Column dropping and imputation ###
# Threshold for the drop_columns function.
DROP_NAN_THRESHOLD = 0.8  # 1 = keep everything, 0 = drop everything that contains nan
# Threshold for the drop_correlated_columns function.
DROP_CORR_THRESHOLD = 0.8
# Switch for the drop_single_value_columns function.
# Keep this True: single-valued columns otherwise mess up the correlation
# coefficient.
DROP_SINGLE = True
# Threshold separating categorical from numerical columns (categorical columns
# are imputed with the median, numerical ones with the mean).
CAT_NUM_THRESHOLD = 50

In [12]:
# Gamma sweep: run the pipeline once per candidate gamma and save every result
# row to ../results/results_gammas.csv.
results = []

NUM_FOLDS = 5
# NOTE: this replaces the MAX_ITERS = 300 set in the baseline cell above.
MAX_ITERS = 100
LAMBDA = 0.1

gammas = [0.3, 0.4, 0.5, 0.6, 0.7]

for gamma in gammas:
    # GAMMA is assigned on every pass, so no separate initial value is needed.
    GAMMA = gamma
    # The running list is passed in and the returned list replaces it
    # (presumably execute_pipeline appends its rows; verify in pipeline.py).
    results = execute_pipeline(
        x_train,
        x_test,
        y_train,
        DROP_NAN_THRESHOLD,
        DROP_CORR_THRESHOLD,
        CAT_NUM_THRESHOLD,
        DROP_SINGLE,
        BUILD_RATIOS,
        BUILD_LOG,
        BUILD_POLY,
        DEGREE,
        STANDARDIZE,
        NUM_FOLDS,
        GAMMA,
        MAX_ITERS,
        LAMBDA,
        results,
    )

# newline="" lets the csv writer control line endings itself; without it an
# extra "\r" is written on platforms that use "\r\n" line endings.
with open("../results/results_gammas.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "Drop Threshold",
            "Drop corr thresh.",
            "Imputation",
            "Standardization",
            "Build Poly",
            "Degree",
            "Build Log",
            "Build Ratios",
            "Model",
            "Initial W",
            "Max Iters",
            "Gamma",
            "Lambda",
            "CV F1 std",
            "CV Accuracy std",
            "CV F1",
            "CV Accuracy",
        ],
    )

    writer.writeheader()
    writer.writerows(results)

Dropping columns with DROP_NAN_THRESHOLD = 0.8...
Building ratios...
Building log...
Dropping single valued columns...
Standardizing...
Fold 1/5
Train loss: 0.66929
Test loss: 0.67051
Train accuracy: 0.63034
Train f1_score: 0.30123
Test accuracy: 0.62959
Test f1_score: 0.29623
------------------------------
Fold 2/5
Train loss: 0.66942
Test loss: 0.67002
Train accuracy: 0.62918
Train f1_score: 0.30135
Test accuracy: 0.62877
Test f1_score: 0.29568
------------------------------
Fold 3/5
Train loss: 0.66939
Test loss: 0.67023
Train accuracy: 0.63021
Train f1_score: 0.29990
Test accuracy: 0.62761
Test f1_score: 0.30132
------------------------------
Fold 4/5
Train loss: 0.66978
Test loss: 0.66862
Train accuracy: 0.62987
Train f1_score: 0.29998
Test accuracy: 0.63218
Test f1_score: 0.30329
------------------------------
Fold 5/5
Train loss: 0.66921
Test loss: 0.67087
Train accuracy: 0.63076
Train f1_score: 0.29996
Test accuracy: 0.62810
Test f1_score: 0.30315
------------------------------

In [13]:
# Reload the saved gamma-sweep rows from disk, one mapping per CSV row (all
# values come back as strings).
# newline="" is what the csv module asks for so that it can handle line
# endings, including newlines embedded in quoted fields, by itself.
with open("../results/results_gammas.csv", "r", newline="") as csvfile:
    results_gammas = list(csv.DictReader(csvfile))

In [14]:
# Determine, for each model, the gamma-sweep row with the highest CV F1.
# Rows are filtered by model *before* taking the maximum, so a row belonging
# to the other model can never be picked (a 0.0 fallback key would return such
# a row whenever no row matched or every matching score was <= 0).
# Values read back from the CSV are strings, hence the float() cast.
best_by_model = {}
for model_name in ("Logistic_Regression", "Reg_Logistic_Regression"):
    model_rows = [row for row in results_gammas if row["Model"] == model_name]
    if not model_rows:
        # Fail loudly instead of reporting numbers for the wrong model.
        raise ValueError(f"no rows with Model == {model_name!r} in results_gammas")
    best_by_model[model_name] = max(model_rows, key=lambda row: float(row["CV F1"]))

best_result_logistic = best_by_model["Logistic_Regression"]
best_result_reg_logistic = best_by_model["Reg_Logistic_Regression"]

print("Best result for Logistic Regression: ")
print(best_result_logistic["CV F1"] + "+/-" + best_result_logistic["CV F1 std"])
print(
    best_result_logistic["CV Accuracy"]
    + "+/-"
    + best_result_logistic["CV Accuracy std"]
)
print("With Gamma: ")
print(best_result_logistic["Gamma"])


print("Best result for Reg Logistic Regression:")
print(best_result_reg_logistic["CV F1"] + "+/-" + best_result_reg_logistic["CV F1 std"])
print(
    best_result_reg_logistic["CV Accuracy"]
    + "+/-"
    + best_result_reg_logistic["CV Accuracy std"]
)
print("With Gamma: ")
print(best_result_reg_logistic["Gamma"])

Best result for Logistic Regression: 
0.2999355004950664+/-0.0033272054457835097
0.6292471086595456+/-0.0016098785955818435
With Gamma: 
0.3
Best result for Reg Logistic Regression:
0.2970012121823249+/-0.003178373686757062
0.6218903804836424+/-0.0016646970488154891
With Gamma: 
0.3


Gamma does not change

### Different MAX ITERS

In [15]:
# Empty accumulator for the rows produced by the max-iterations experiments.
results = list()

In [16]:
#### BASELINE ####
# Settings handed to execute_pipeline by the max-iterations experiment cell.
# NOTE: that cell reassigns BUILD_RATIOS, BUILD_LOG, BUILD_POLY,
# CAT_NUM_THRESHOLD, DROP_NAN_THRESHOLD and DROP_CORR_THRESHOLD before running,
# and sweeps MAX_ITERS, so the values below are only the starting point.

### Cross-validation and training values ###
NUM_FOLDS = 5
GAMMA = 0.5
MAX_ITERS = 300
LAMBDA = 0.1

### Feature processing ###
# Switch for the standardize function.
STANDARDIZE = True
# Switch for the ratio features (build_ratios).
BUILD_RATIOS = True
# Switch for the build_log function.
BUILD_LOG = True
# Switch for the build_poly function, and the degree it is called with.
BUILD_POLY = False
DEGREE = 2

### Column dropping and imputation ###
# Threshold for the drop_columns function.
DROP_NAN_THRESHOLD = 0.8  # 1 = keep everything, 0 = drop everything that contains nan
# Threshold for the drop_correlated_columns function.
DROP_CORR_THRESHOLD = 0.8
# Switch for the drop_single_value_columns function.
# Keep this True: single-valued columns otherwise mess up the correlation
# coefficient.
DROP_SINGLE = True
# Threshold separating categorical from numerical columns (categorical columns
# are imputed with the median, numerical ones with the mean).
CAT_NUM_THRESHOLD = 50

In [17]:
# Max-iterations sweep: run the pipeline once per candidate iteration budget
# and save every result row to ../results/results_max_iters.csv.
results = []

# Overrides of the baseline cell: polynomial features only, no ratios or log.
BUILD_RATIOS = False
BUILD_LOG = False
BUILD_POLY = True
CAT_NUM_THRESHOLD = 200
DROP_NAN_THRESHOLD = 1
DROP_CORR_THRESHOLD = 0.9

iters = [10, 50, 100, 200, 500, 1000]

# The loop variable is deliberately not called `iter`, which would shadow the
# Python builtin for the rest of the notebook session.
for max_iters in iters:
    MAX_ITERS = max_iters
    # The running list is passed in and the returned list replaces it
    # (presumably execute_pipeline appends its rows; verify in pipeline.py).
    results = execute_pipeline(
        x_train,
        x_test,
        y_train,
        DROP_NAN_THRESHOLD,
        DROP_CORR_THRESHOLD,
        CAT_NUM_THRESHOLD,
        DROP_SINGLE,
        BUILD_RATIOS,
        BUILD_LOG,
        BUILD_POLY,
        DEGREE,
        STANDARDIZE,
        NUM_FOLDS,
        GAMMA,
        MAX_ITERS,
        LAMBDA,
        results,
    )

# newline="" lets the csv writer control line endings itself; without it an
# extra "\r" is written on platforms that use "\r\n" line endings.
with open("../results/results_max_iters.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[
            "Drop Threshold",
            "Drop corr thresh.",
            "Imputation",
            "Standardization",
            "Build Poly",
            "Degree",
            "Build Log",
            "Build Ratios",
            "Model",
            "Initial W",
            "Max Iters",
            "Gamma",
            "Lambda",
            "CV F1 std",
            "CV Accuracy std",
            "CV F1",
            "CV Accuracy",
        ],
    )

    writer.writeheader()
    writer.writerows(results)

Dropping columns with DROP_NAN_THRESHOLD = 1...
Building polynomial with degree = 2...
Dropping single valued columns...
Standardizing...
Fold 1/5
Train loss: 0.66903
Test loss: 0.67029
Train accuracy: 0.63177
Train f1_score: 0.30253
Test accuracy: 0.63152
Test f1_score: 0.29789
------------------------------
Fold 2/5
Train loss: 0.66918
Test loss: 0.66970
Train accuracy: 0.63027
Train f1_score: 0.30215
Test accuracy: 0.63052
Test f1_score: 0.29708
------------------------------
Fold 3/5
Train loss: 0.66912
Test loss: 0.67021
Train accuracy: 0.63123
Train f1_score: 0.30093
Test accuracy: 0.62823
Test f1_score: 0.30164
------------------------------
Fold 4/5
Train loss: 0.66945
Test loss: 0.66898
Train accuracy: 0.63102
Train f1_score: 0.30093
Test accuracy: 0.63180
Test f1_score: 0.30303
------------------------------
Fold 5/5
Train loss: 0.66895
Test loss: 0.67080
Train accuracy: 0.63167
Train f1_score: 0.30090
Test accuracy: 0.62866
Test f1_score: 0.30411
----------------------------

  loss = np.mean(-y * np.log(p) - (1 - y) * np.log(1 - p))


Fold 2/5
Train loss: 0.66681
Test loss: inf
Train accuracy: 0.64814
Train f1_score: 0.31011
Test accuracy: 0.64748
Test f1_score: 0.30457
------------------------------
Fold 3/5
Train loss: 0.66679
Test loss: inf
Train accuracy: 0.64879
Train f1_score: 0.30879
Test accuracy: 0.64569
Test f1_score: 0.30999
------------------------------
Fold 4/5
Train loss: 0.66712
Test loss: 0.66800
Train accuracy: 0.64793
Train f1_score: 0.30841
Test accuracy: 0.64859
Test f1_score: 0.31043
------------------------------
Fold 5/5
Train loss: 0.66678
Test loss: inf
Train accuracy: 0.64750
Train f1_score: 0.30736
Test accuracy: 0.64376
Test f1_score: 0.31025
------------------------------
Train:
accuracy : 0.648284 ± 0.000570
f1_score : 0.309047 ± 0.001164
--------------------
Test:
accuracy : 0.646402 ± 0.001639
f1_score : 0.307741 ± 0.003063
Fold 1/5
Train loss: 0.67007
Test loss: 0.67105
Train accuracy: 0.62950
Train f1_score: 0.30165
Test accuracy: 0.62919
Test f1_score: 0.29722
--------------------

  loss = np.mean(-y * np.log(p) - (1 - y) * np.log(1 - p))


Fold 2/5
Train loss: nan
Test loss: inf
Train accuracy: 0.64954
Train f1_score: 0.31064
Test accuracy: 0.64868
Test f1_score: 0.30508
------------------------------
Fold 3/5
Train loss: 0.66664
Test loss: inf
Train accuracy: 0.65012
Train f1_score: 0.30936
Test accuracy: 0.64690
Test f1_score: 0.31022
------------------------------
Fold 4/5
Train loss: 0.66694
Test loss: 0.66843
Train accuracy: 0.64957
Train f1_score: 0.30908
Test accuracy: 0.64999
Test f1_score: 0.31087
------------------------------
Fold 5/5
Train loss: 0.66669
Test loss: inf
Train accuracy: 0.64805
Train f1_score: 0.30769
Test accuracy: 0.64429
Test f1_score: 0.31029
------------------------------
Train:
accuracy : 0.649495 ± 0.000770
f1_score : 0.309553 ± 0.001180
--------------------
Test:
accuracy : 0.647517 ± 0.001913
f1_score : 0.308119 ± 0.002894
Fold 1/5
Train loss: 0.67007
Test loss: 0.67105
Train accuracy: 0.62950
Train f1_score: 0.30165
Test accuracy: 0.62919
Test f1_score: 0.29722
------------------------

In [11]:
# Reload the saved max-iterations sweep rows from disk, one mapping per CSV
# row (all values come back as strings).
# newline="" is what the csv module asks for so that it can handle line
# endings, including newlines embedded in quoted fields, by itself.
with open("../results/results_max_iters.csv", "r", newline="") as csvfile:
    results_max_iters = list(csv.DictReader(csvfile))

In [12]:
# determine the best results for Logistic Regression drop nan threshold and for Reg Logistic Regression drop nan threshold
best_result_logistic = max(
    results_max_iters,
    key=lambda x: float(x["CV F1"]) if x["Model"] == "Logistic_Regression" else 0.0,
)
best_result_reg_logistic = max(
    results_max_iters,
    key=lambda x: float(x["CV F1"]) if x["Model"] == "Reg_Logistic_Regression" else 0.0,
)

print("Best result for Logistic Regression: ")
print(best_result_logistic["CV F1"] + "+/-" + best_result_logistic["CV F1 std"])
print(
    best_result_logistic["CV Accuracy"]
    + "+/-"
    + best_result_logistic["CV Accuracy std"]
)
print("With Max Iters: ")
print(best_result_logistic["Max Iters"])


print("Best result for Reg Logistic Regression:")
print(best_result_reg_logistic["CV F1"] + "+/-" + best_result_reg_logistic["CV F1 std"])
print(
    best_result_reg_logistic["CV Accuracy"]
    + "+/-"
    + best_result_reg_logistic["CV Accuracy std"]
)
print("With Max Iters: ")
print(best_result_reg_logistic["Max Iters"])

Best result for Logistic Regression: 
0.3081186569260702+/-0.0028942425950032307
0.6475170280524783+/-0.001913074692516863
With Max Iters: 
1000
Best result for Reg Logistic Regression:
0.3000367444953675+/-0.0027472772246543036
0.6282292349185548+/-0.0018224461308472368
With Max Iters: 
50
