### Experiment pipeline

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [2]:
import sys

# Add the parent directory to the module search path (presumably where the
# project modules imported below live - verify against the repository layout).
# The membership guard keeps the cell idempotent: re-running it no longer
# appends a duplicate "../" entry each time.
if "../" not in sys.path:
    sys.path.append("../")

In [3]:
import numpy as np
from helpers import load_csv_data
from feature_processing import (
    drop_columns,
    drop_correlated_columns,
    drop_single_value_columns,
    median_imputation,
    mean_imputation,
    standardize,
    build_poly,
    build_k_indices,
    build_log,
    build_ratios,
)
from cross_validation import (
    predict_mse,
    predict_logistic,
    accuracy,
    f1_score,
    print_results,
    cross_validation,
)
from implementations import (
    mean_squared_error_gd,
    mean_squared_error_sgd,
    least_squares,
    ridge_regression,
    logistic_regression,
    reg_logistic_regression,
)
from implementations_utils import compute_loss_mse, compute_loss_logistic
import csv
from pipeline import execute_pipeline


In [4]:
# Load the five arrays returned by load_csv_data (train/test features, training
# labels, and the train/test ids). The path is relative, so this cell depends
# on the notebook's working directory.
print("Loading data...")
x_train, x_test, y_train, train_ids, test_ids = load_csv_data("../../data/")
print("Data loaded...")

Loading data...
Data loaded...


In [7]:
# Report the dimensions of every array returned by the loader.
for caption, array in (
    ("x_train.shape =", x_train),
    ("x_test.shape =", x_test),
    ("y_train.shape =", y_train),
    ("train_ids.shape =", train_ids),
    ("test_ids.shape =", test_ids),
):
    print(caption, array.shape)

x_train.shape = (328135, 321)
x_test.shape = (109379, 321)
y_train.shape = (328135,)
train_ids.shape = (328135,)
test_ids.shape = (109379,)


In [5]:
# Remap the -1 labels to 0 in place, selecting them with a boolean mask.
y_train[y_train == -1] = 0

In [6]:
# Accumulator for experiment records: a list of dicts, one per evaluated
# configuration, later written row by row with csv.DictWriter.
results = []
# Example record (keys mirror the CSV columns written later in this notebook;
# the values are illustrative only):
# new_result = {
#     "Drop Threshold": 0.5,
#     "Drop corr thresh.": 0.6,
#     "Imputation": "mean",
#     "Standardization": "z-score",
#     "Build Poly": True,
#     "Degree": 2,
#     "Build Log": False,
#     "Build Ratios": True,
#     "Model": "Logistic_Regression",
#     "Initial W": 0.5,
#     "Max Iters": 100,
#     "Gamma": 0.01,
#     "Lambda": 0.1,
#     "CV F1": 0.9,
#     "CV Accuracy": 0.95
# }
# results.append(new_result)


### Experiments using different drop-NaN thresholds

In [58]:
#### BASELINE ####
# Default configuration for the experiments; individual experiments override
# single entries before calling execute_pipeline.

### Cross-validation ###
NUM_FOLDS = 5

### Column dropping and imputation ###
# drop_columns threshold: 1 = keep everything, 0 = drop every column that contains NaN.
DROP_NAN_THRESHOLD = 1
# Cut-off separating categorical from numerical columns (categorical columns
# are imputed with the median, numerical ones with the mean).
CAT_NUM_THRESHOLD = 30
# Switch for drop_single_value_columns. Keep it True: single-valued columns
# otherwise interfere with the correlation coefficient.
DROP_SINGLE = True
# drop_correlated_columns threshold.
DROP_CORR_THRESHOLD = 0.9

### Feature processing ###
BUILD_POLY = False  # switch for build_poly
DEGREE = 2  # polynomial degree handed to build_poly
BUILD_LOG = False  # switch for build_log
BUILD_RATIOS = False  # switch for the ratio features (presumably build_ratios)
STANDARDIZE = True  # switch for standardize


In [59]:
# Start the drop-NaN threshold sweep with an empty record list.
results = []

In [60]:
# Sweep the NaN-drop threshold over 0.0, 0.1, ..., 1.0 (11 values).
# The grid is built from integer steps divided by 10 rather than
# np.arange(0, 1.1, 0.1): with a float step the number of elements depends on
# rounding, and the values carry noise such as 0.30000000000000004.
thresholds = np.arange(0, 11) / 10

for threshold in thresholds:
    # The swept value is passed directly instead of overwriting the baseline
    # DROP_NAN_THRESHOLD constant, so the baseline stays intact after the loop.
    # execute_pipeline receives the accumulated records and its return value
    # replaces them.
    results = execute_pipeline(
        x_train,
        x_test,
        y_train,
        threshold,
        DROP_CORR_THRESHOLD,
        CAT_NUM_THRESHOLD,
        DROP_SINGLE,
        BUILD_RATIOS,
        BUILD_LOG,
        BUILD_POLY,
        DEGREE,
        STANDARDIZE,
        NUM_FOLDS,
        results,
    )


Dropping columns with DROP_NAN_THRESHOLD = 0.0...
Dropping single valued columns...
Standardizing...
Fold 1/5
Train loss: 0.67452
Test loss: 0.67480
Train accuracy: 0.61024
Train f1_score: 0.28764
Test accuracy: 0.61060
Test f1_score: 0.28343
------------------------------
Fold 2/5
Train loss: 0.67456
Test loss: 0.67457
Train accuracy: 0.60903
Train f1_score: 0.28751
Test accuracy: 0.60900
Test f1_score: 0.28268
------------------------------
Fold 3/5
Train loss: 0.67455
Test loss: 0.67467
Train accuracy: 0.60862
Train f1_score: 0.28531
Test accuracy: 0.60663
Test f1_score: 0.28803
------------------------------
Fold 4/5
Train loss: 0.67468
Test loss: 0.67413
Train accuracy: 0.60989
Train f1_score: 0.28649
Test accuracy: 0.61136
Test f1_score: 0.28854
------------------------------
Fold 5/5
Train loss: 0.67434
Test loss: 0.67551
Train accuracy: 0.60960
Train f1_score: 0.28579
Test accuracy: 0.60722
Test f1_score: 0.28838
------------------------------
Train:
accuracy : 0.609476 ± 0.000

In [61]:
print("Saving to csv...")

# Persist the sweep records, one CSV row per record.
# newline="" hands line-ending control to the csv module, as its documentation
# requires; without it an extra "\r" is written per row on platforms that
# translate "\n" to "\r\n".
with open("../results/results_drop_nan.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile,
        # Column order of the file. DictWriter raises ValueError if a record
        # contains a key that is not listed here.
        fieldnames=[
            "Drop Threshold",
            "Drop corr thresh.",
            "Imputation",
            "Standardization",
            "Build Poly",
            "Degree",
            "Build Log",
            "Build Ratios",
            "Model",
            "Initial W",
            "Max Iters",
            "Gamma",
            "Lambda",
            "CV F1",
            "CV Accuracy",
        ],
    )

    writer.writeheader()
    writer.writerows(results)

Saving to csv...


In [62]:
# Read the saved sweep back as a list of dicts, one per CSV row. DictReader
# returns every value as a string, so numeric columns must be converted before
# they are compared. newline="" is the mode the csv module documents for files
# handed to its readers.
with open("../results/results_drop_nan.csv", "r", newline="") as csvfile:
    results_drop_nan = list(csv.DictReader(csvfile))


In [63]:
# Determine, for Logistic Regression and for Reg Logistic Regression, the
# drop-NaN threshold with the highest CV F1.
def _best_by_f1(rows, model_name):
    """Return the row of ``model_name`` with the highest "CV F1".

    Rows are filtered by model before taking the maximum. Scoring the other
    model's rows as 0.0 inside a single ``max`` key would silently return a row
    of the wrong model when no row matches, or when every matching F1 is 0.

    Args:
        rows: iterable of dict rows as read by csv.DictReader (string values).
        model_name: value of the "Model" column to select.

    Returns:
        The first row of ``model_name`` whose float("CV F1") is maximal.

    Raises:
        ValueError: if no row belongs to ``model_name``.
    """
    candidates = [row for row in rows if row["Model"] == model_name]
    if not candidates:
        raise ValueError(f"No results found for model {model_name!r}")
    return max(candidates, key=lambda row: float(row["CV F1"]))


best_result_logistic = _best_by_f1(results_drop_nan, "Logistic_Regression")
best_result_reg_logistic = _best_by_f1(results_drop_nan, "Reg_Logistic_Regression")

print("Best result for Logistic Regression: ")
print(best_result_logistic["CV F1"])
print("With Drop Threshold: ")
print(best_result_logistic["Drop Threshold"])


print("Best result for Reg Logistic Regression:")
print(best_result_reg_logistic["CV F1"])
print("With Drop Threshold: ")
print(best_result_reg_logistic["Drop Threshold"])


Best result for Logistic Regression: 
0.3017536017318743
With Drop Threshold: 
1.0
Best result for Reg Logistic Regression:
0.299691008421669
With Drop Threshold: 
1.0


### Experiment using BUILD POLY

In [7]:
# Empty the record list before the polynomial-feature experiment.
results = []


In [8]:
#### BASELINE ####
# Restore the default configuration before the polynomial-feature experiment.

### Cross-validation ###
NUM_FOLDS = 5

### Column dropping and imputation ###
# drop_columns threshold: 1 = keep everything, 0 = drop every column that contains NaN.
DROP_NAN_THRESHOLD = 1
# Cut-off separating categorical from numerical columns (categorical columns
# are imputed with the median, numerical ones with the mean).
CAT_NUM_THRESHOLD = 30
# Switch for drop_single_value_columns. Keep it True: single-valued columns
# otherwise interfere with the correlation coefficient.
DROP_SINGLE = True
# drop_correlated_columns threshold.
DROP_CORR_THRESHOLD = 0.9

### Feature processing ###
BUILD_POLY = False  # switch for build_poly
DEGREE = 2  # polynomial degree handed to build_poly
BUILD_LOG = False  # switch for build_log
BUILD_RATIOS = False  # switch for the ratio features (presumably build_ratios)
STANDARDIZE = True  # switch for standardize


In [9]:
# Polynomial-feature experiment: baseline configuration with BUILD_POLY enabled.
results = []

BUILD_POLY = True
DEGREE = 2  # when we use degree 3 the kernel crashes

results = execute_pipeline(
    x_train,
    x_test,
    y_train,
    DROP_NAN_THRESHOLD,
    DROP_CORR_THRESHOLD,
    CAT_NUM_THRESHOLD,
    DROP_SINGLE,
    BUILD_RATIOS,
    BUILD_LOG,
    BUILD_POLY,
    DEGREE,
    STANDARDIZE,
    NUM_FOLDS,
    results,
)

# Persist the records, one CSV row per record.
# newline="" hands line-ending control to the csv module, as its documentation
# requires; without it an extra "\r" is written per row on platforms that
# translate "\n" to "\r\n".
with open("../results/results_poly.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile,
        # Column order of the file. DictWriter raises ValueError if a record
        # contains a key that is not listed here.
        fieldnames=[
            "Drop Threshold",
            "Drop corr thresh.",
            "Imputation",
            "Standardization",
            "Build Poly",
            "Degree",
            "Build Log",
            "Build Ratios",
            "Model",
            "Initial W",
            "Max Iters",
            "Gamma",
            "Lambda",
            "CV F1",
            "CV Accuracy",
        ],
    )

    writer.writeheader()
    writer.writerows(results)

Dropping columns with DROP_NAN_THRESHOLD = 1...
Dropping single valued columns...
Building polynomial with degree = 2...
Standardizing...
Fold 1/5
Train loss: 0.66351
Test loss: 0.66577
Train accuracy: 0.65875
Train f1_score: 0.31755
Test accuracy: 0.65763
Test f1_score: 0.31277
------------------------------
Fold 2/5
Train loss: 0.66351
Test loss: 0.66536
Train accuracy: 0.65733
Train f1_score: 0.31729
Test accuracy: 0.65514
Test f1_score: 0.31042
------------------------------
Fold 3/5
Train loss: 0.66330
Test loss: 0.66553
Train accuracy: 0.65877
Train f1_score: 0.31655
Test accuracy: 0.65548
Test f1_score: 0.31712
------------------------------
Fold 4/5
Train loss: 0.66383
Test loss: 0.66392
Train accuracy: 0.65740
Train f1_score: 0.31571
Test accuracy: 0.65822
Test f1_score: 0.31724
------------------------------
Fold 5/5
Train loss: 0.66330
Test loss: 0.66640
Train accuracy: 0.65775
Train f1_score: 0.31512
Test accuracy: 0.65446
Test f1_score: 0.31739
----------------------------

In [11]:
# Read the polynomial-experiment records back as a list of dicts, one per CSV
# row (DictReader returns every value as a string). newline="" is the mode the
# csv module documents for files handed to its readers.
with open("../results/results_poly.csv", "r", newline="") as csvfile:
    results_poly = list(csv.DictReader(csvfile))


In [12]:
# Print the CV F1 of the first two rows of the polynomial experiment.
# NOTE(review): row 0 is labelled Logistic and row 1 Reg Logistic purely by
# position - this relies on the order in which the rows were written; confirm.
for header, row_index in (
    ("Results for build poly with degree 2, Logistic Regression: ", 0),
    ("Results for build poly with degree 2, Reg Logistic Regression: ", 1),
):
    print(header)
    print(results_poly[row_index]["CV F1"])

Results for build poly with degree 2, Logistic Regression: 
0.3149890620184429
Results for build poly with degree 2, Reg Logistic Regression: 
0.29639980107081965


### Experiment using BUILD LOG

In [13]:
# Empty the record list before the log-feature experiment.
results = []


In [14]:
#### BASELINE ####
# Restore the default configuration before the log-feature experiment.

### Cross-validation ###
NUM_FOLDS = 5

### Column dropping and imputation ###
# drop_columns threshold: 1 = keep everything, 0 = drop every column that contains NaN.
DROP_NAN_THRESHOLD = 1
# Cut-off separating categorical from numerical columns (categorical columns
# are imputed with the median, numerical ones with the mean).
CAT_NUM_THRESHOLD = 30
# Switch for drop_single_value_columns. Keep it True: single-valued columns
# otherwise interfere with the correlation coefficient.
DROP_SINGLE = True
# drop_correlated_columns threshold.
DROP_CORR_THRESHOLD = 0.9

### Feature processing ###
BUILD_POLY = False  # switch for build_poly
DEGREE = 2  # polynomial degree handed to build_poly
BUILD_LOG = False  # switch for build_log
BUILD_RATIOS = False  # switch for the ratio features (presumably build_ratios)
STANDARDIZE = True  # switch for standardize


In [16]:
# Log-feature experiment: baseline configuration with BUILD_LOG enabled.
results = []

BUILD_LOG = True

results = execute_pipeline(
    x_train,
    x_test,
    y_train,
    DROP_NAN_THRESHOLD,
    DROP_CORR_THRESHOLD,
    CAT_NUM_THRESHOLD,
    DROP_SINGLE,
    BUILD_RATIOS,
    BUILD_LOG,
    BUILD_POLY,
    DEGREE,
    STANDARDIZE,
    NUM_FOLDS,
    results,
)

# Persist the records, one CSV row per record.
# newline="" hands line-ending control to the csv module, as its documentation
# requires; without it an extra "\r" is written per row on platforms that
# translate "\n" to "\r\n".
with open("../results/results_log.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile,
        # Column order of the file. DictWriter raises ValueError if a record
        # contains a key that is not listed here.
        fieldnames=[
            "Drop Threshold",
            "Drop corr thresh.",
            "Imputation",
            "Standardization",
            "Build Poly",
            "Degree",
            "Build Log",
            "Build Ratios",
            "Model",
            "Initial W",
            "Max Iters",
            "Gamma",
            "Lambda",
            "CV F1",
            "CV Accuracy",
        ],
    )

    writer.writeheader()
    writer.writerows(results)

Dropping columns with DROP_NAN_THRESHOLD = 1...
Dropping single valued columns...
Building log...
Standardizing...
Fold 1/5
Train loss: 0.67492
Test loss: 0.67719
Train accuracy: 0.61004
Train f1_score: 0.25374
Test accuracy: 0.60530
Test f1_score: 0.24461
------------------------------
Fold 2/5
Train loss: 0.67424
Test loss: 0.67542
Train accuracy: 0.61505
Train f1_score: 0.25763
Test accuracy: 0.61638
Test f1_score: 0.25294
------------------------------
Fold 3/5
Train loss: 0.67443
Test loss: 0.67566
Train accuracy: 0.61277
Train f1_score: 0.25554
Test accuracy: 0.60867
Test f1_score: 0.25339
------------------------------
Fold 4/5
Train loss: 0.67706
Test loss: 0.67873
Train accuracy: 0.59908
Train f1_score: 0.24234
Test accuracy: 0.59448
Test f1_score: 0.24190
------------------------------
Fold 5/5
Train loss: 0.67514
Test loss: 0.67742
Train accuracy: 0.61023
Train f1_score: 0.25063
Test accuracy: 0.60509
Test f1_score: 0.25537
------------------------------
Train:
accuracy : 0.

In [17]:
# Read the log-experiment records back as a list of dicts, one per CSV row
# (DictReader returns every value as a string). newline="" is the mode the csv
# module documents for files handed to its readers.
with open("../results/results_log.csv", "r", newline="") as csvfile:
    results_log = list(csv.DictReader(csvfile))

In [18]:
# Print the CV F1 of the first two rows of the log experiment.
# NOTE(review): row 0 is labelled Logistic and row 1 Reg Logistic purely by
# position - this relies on the order in which the rows were written; confirm.
for header, row_index in (
    ("Results for build log, Logistic Regression: ", 0),
    ("Results for build log, Reg Logistic Regression: ", 1),
):
    print(header)
    print(results_log[row_index]["CV F1"])


Results for build log, Logistic Regression: 
0.24964105408875517
Results for build log, Reg Logistic Regression: 
0.14405154926770522


### Experiment using BUILD RATIOS

In [5]:
# Empty the record list before the ratio-feature experiment.
results = []

In [6]:
#### BASELINE ####
# Restore the default configuration before the ratio-feature experiment.

### Cross-validation ###
NUM_FOLDS = 5

### Column dropping and imputation ###
# drop_columns threshold: 1 = keep everything, 0 = drop every column that contains NaN.
DROP_NAN_THRESHOLD = 1
# Cut-off separating categorical from numerical columns (categorical columns
# are imputed with the median, numerical ones with the mean).
CAT_NUM_THRESHOLD = 30
# Switch for drop_single_value_columns. Keep it True: single-valued columns
# otherwise interfere with the correlation coefficient.
DROP_SINGLE = True
# drop_correlated_columns threshold.
DROP_CORR_THRESHOLD = 0.9

### Feature processing ###
BUILD_POLY = False  # switch for build_poly
DEGREE = 2  # polynomial degree handed to build_poly
BUILD_LOG = False  # switch for build_log
BUILD_RATIOS = False  # switch for the ratio features (presumably build_ratios)
STANDARDIZE = True  # switch for standardize


In [7]:
# Ratio-feature experiment: baseline configuration with BUILD_RATIOS enabled.
results = []

BUILD_RATIOS = True

results = execute_pipeline(
    x_train,
    x_test,
    y_train,
    DROP_NAN_THRESHOLD,
    DROP_CORR_THRESHOLD,
    CAT_NUM_THRESHOLD,
    DROP_SINGLE,
    BUILD_RATIOS,
    BUILD_LOG,
    BUILD_POLY,
    DEGREE,
    STANDARDIZE,
    NUM_FOLDS,
    results,
)

# Persist the records, one CSV row per record.
# newline="" hands line-ending control to the csv module, as its documentation
# requires; without it an extra "\r" is written per row on platforms that
# translate "\n" to "\r\n".
with open("../results/results_ratios.csv", "w", newline="") as csvfile:
    writer = csv.DictWriter(
        csvfile,
        # Column order of the file. DictWriter raises ValueError if a record
        # contains a key that is not listed here.
        fieldnames=[
            "Drop Threshold",
            "Drop corr thresh.",
            "Imputation",
            "Standardization",
            "Build Poly",
            "Degree",
            "Build Log",
            "Build Ratios",
            "Model",
            "Initial W",
            "Max Iters",
            "Gamma",
            "Lambda",
            "CV F1",
            "CV Accuracy",
        ],
    )

    writer.writeheader()
    writer.writerows(results)

Dropping columns with DROP_NAN_THRESHOLD = 1...
Dropping single valued columns...
Building ratios...
Standardizing...
Fold 1/5
Train loss: 1.14464
Test loss: 1.15645
Train accuracy: 0.06569
Train f1_score: 0.33307
Test accuracy: 0.06453
Test f1_score: 0.32926
------------------------------
Fold 2/5
Train loss: 1.15013
Test loss: 1.15796
Train accuracy: 0.06594
Train f1_score: 0.33468
Test accuracy: 0.06453
Test f1_score: 0.32911
------------------------------
Fold 3/5
Train loss: 1.16565
Test loss: 1.25769
Train accuracy: 0.06453
Train f1_score: 0.33589
Test accuracy: 0.06435
Test f1_score: 0.33094
------------------------------
Fold 4/5
Train loss: 1.12160
Test loss: 1.09560
Train accuracy: 0.06606
Train f1_score: 0.33055
Test accuracy: 0.06671
Test f1_score: 0.33491
------------------------------
Fold 5/5
Train loss: 1.11553
Test loss: 1.14193
Train accuracy: 0.06618
Train f1_score: 0.33135
Test accuracy: 0.06755
Test f1_score: 0.33153
------------------------------
Train:
accuracy :

In [9]:
# Read the ratio-experiment records back as a list of dicts, one per CSV row
# (DictReader returns every value as a string). newline="" is the mode the csv
# module documents for files handed to its readers.
with open("../results/results_ratios.csv", "r", newline="") as csvfile:
    results_ratios = list(csv.DictReader(csvfile))

In [10]:
# Print CV F1 followed by CV accuracy for the first two rows of the ratio
# experiment.
# NOTE(review): row 0 is labelled Logistic and row 1 Reg Logistic purely by
# position - this relies on the order in which the rows were written; confirm.
# NOTE(review): the accuracies recorded for this experiment (about 0.06) are
# roughly ten times lower than the fold accuracies of the other experiments
# (about 0.6) and the losses exceed 1 - verify the ratio features and the
# metric computation before relying on these F1 values.
for header, row_index in (
    ("Results for build ratios, Logistic Regression: ", 0),
    ("Results for build ratios, Reg Logistic Regression: ", 1),
):
    print(header)
    for metric in ("CV F1", "CV Accuracy"):
        print(results_ratios[row_index][metric])

Results for build ratios, Logistic Regression: 
0.33115178907719345
0.0655340027732488
Results for build ratios, Reg Logistic Regression: 
0.33549740021554086
0.05608667164429274
