In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [2]:
import sys

# Extend the module search path *before* the project-local imports below;
# appending it afterwards (as this cell used to) cannot influence them.
sys.path.append("../")

import numpy as np

# Project-local feature-engineering helpers.
from feature_processing import (
    build_k_indices,
    mean_imputation,
    standardize,
    drop_columns,
    drop_single_value_columns,
    drop_correlated_columns,
    median_imputation,
)

# Project-local model implementations and the cross-validation driver.
from implementations import (
    mean_squared_error_gd,
    mean_squared_error_sgd,
    cross_validation,
    logistic_regression,
    ridge_regression,
    least_squares,
    reg_logistic_regression,
)

# NOTE(review): the wildcard import hides where names such as load_csv_data
# come from; prefer explicit names once the full set used by the notebook is known.
from helpers import *


In [3]:
# Registry of the available models: maps a human-readable model name to the
# training function imported from `implementations`.
models = dict(
    least_squares=least_squares,
    ridge_regression=ridge_regression,
    logistic_regression=logistic_regression,
    reg_logistic_regression=reg_logistic_regression,
    linear_regression_gradient_descent=mean_squared_error_gd,
    linear_regression_stochastic_gradient_descent=mean_squared_error_sgd,
)


In [4]:
# Fix NumPy's global (legacy) RNG so that stochastic steps are repeatable.
np.random.seed(seed=42)

In [5]:
# Read the dataset from the data directory; the loader hands back the train and
# test feature matrices, the train labels, and the row ids of both splits.
(
    x_train,
    x_test,
    y_train,
    train_ids,
    test_ids,
) = load_csv_data("../../data/")


In [15]:
# Recode the -1 labels as 0 in place (every other label value is untouched).
y_train[y_train == -1] = 0

In [6]:
# if a column has 80% missing values and above, we drop it
# drop_columns is unpacked as (reduced matrix, kept-column selection) elsewhere
# in this notebook, so unpack it here as well instead of binding the whole
# return value to x_train.
x_train, cols_to_keep_0 = drop_columns(x_train, 0.8)
# Reuse the selection computed on the training set: running drop_columns
# separately on x_test could drop a different set of columns and leave the two
# matrices misaligned.
x_test = x_test[:, cols_to_keep_0]


In [7]:
# Sanity check: report the dimensions of both feature matrices.
for _label, _matrix in (("x_train shape: ", x_train), ("x_test shape: ", x_test)):
    print(_label, _matrix.shape)

x_train shape:  (328135, 205)
x_test shape:  (109379, 205)


In [None]:
# Work on copies so the experiments in this cell cannot modify x_train / x_test.
x_train_temp = x_train.copy()
x_test_temp = x_test.copy()

# Each helper returns the reduced matrix together with the column selection it
# kept. The copies made above are what gets filtered (previously they were
# created and then immediately overwritten by results computed from x_train).
# Threshold 0.2: by analogy with the 0.8 call earlier, presumably the
# missing-value ratio at or above which a column is dropped — TODO confirm.
x_train_temp, cols_to_keep_1 = drop_columns(x_train_temp, 0.2)

# Threshold 0.85: presumably a correlation cut-off — TODO confirm in feature_processing.
x_train_temp, cols_to_keep_2 = drop_correlated_columns(x_train_temp, 0.85)

x_train_temp, cols_to_keep_3 = drop_single_value_columns(x_train_temp)

# apply the same column selection to x_test, in the same order as on the train set
for _cols in (cols_to_keep_1, cols_to_keep_2, cols_to_keep_3):
    x_test_temp = x_test_temp[:, _cols]

# Partition the column indices of x_train_temp by cardinality: a column with
# fewer than 30 distinct values is treated as categorical, any other as numerical.
# NOTE(review): if missing values are still NaN at this point, how np.unique
# counts them depends on the NumPy version — confirm the threshold behaves as intended.
categorical_features, numerical_features = [], []

for col_idx in range(x_train_temp.shape[1]):
    n_distinct = np.unique(x_train_temp[:, col_idx]).shape[0]
    bucket = categorical_features if n_distinct < 30 else numerical_features
    bucket.append(col_idx)

# Fill in missing values: median imputation for the categorical columns, mean
# imputation for the numerical ones, on the train matrix and then the test matrix.
# NOTE(review): the test matrix is passed to the imputers on its own, so it is
# presumably filled with its own statistics rather than the training ones — confirm
# this is intended.
for _cols, _impute in (
    (categorical_features, median_imputation),
    (numerical_features, mean_imputation),
):
    x_train_temp[:, _cols] = _impute(x_train_temp[:, _cols])
    x_test_temp[:, _cols] = _impute(x_test_temp[:, _cols])

In [10]:
# Standardize both feature matrices.
# NOTE(review): each matrix is passed to standardize() separately, so the test
# set is presumably scaled with its own statistics — confirm this is intended.
x_train_temp, x_test_temp = standardize(x_train_temp), standardize(x_test_temp)

In [11]:
# x_train = build_poly(x_train, 2)
# x_test = build_poly(x_test, 2)

In [16]:
# Hyper-parameters forwarded to the selected model.
model_args = {
    "max_iters": 50,
    "gamma": 0.1,
    # One weight per column of the matrix that is actually trained on
    # (x_train_temp); sizing it from x_train gives the wrong length once
    # further columns have been removed from x_train_temp.
    "initial_w": np.zeros(x_train_temp.shape[1]),
}  # CHANGE THIS IN ACCORDANCE TO THE MODEL

# Build the fold indices from the labels (arguments 5 and 42 are presumably the
# number of folds and the shuffling seed — TODO confirm in feature_processing).
k_indices = build_k_indices(y_train, 5, 42)

# Cross-validate logistic regression on the preprocessed training matrix; the
# call returns train/test losses, train/test accuracies and the weights.
train_loss, test_loss, acc_train, acc_test, weights = cross_validation(
    y_train, x_train_temp, k_indices, 5, models["logistic_regression"], model_args
)

kth=0
fold=0, accuracy_train=0.627, accuracy_test=0.626,
loss_train=0.088, loss_test=0.089,
kth=1
fold=1, accuracy_train=0.626, accuracy_test=0.626,
loss_train=0.088, loss_test=0.088,
kth=2
fold=2, accuracy_train=0.627, accuracy_test=0.624,
loss_train=0.089, loss_test=0.088,
kth=3
fold=3, accuracy_train=0.626, accuracy_test=0.628,
loss_train=0.087, loss_test=0.087,
kth=4
fold=4, accuracy_train=0.627, accuracy_test=0.625,
loss_train=0.089, loss_test=0.089,
