In [48]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
import sys
sys.path.append('../')

import numpy as np
from helpers import *
from implementations import (
    mean_squared_error_gd,
    mean_squared_error_sgd,
    cross_validation,
    logistic_regression,
    ridge_regression,
    least_squares,
    reg_logistic_regression,
)
from feature_processing import (
    build_k_indices,
    mean_imputation,
    standardize,
    drop_columns,
    build_poly
)

from implementations_utils import sigmoid

In [3]:
# dict with models
models = {
    "least_squares": least_squares,
    "ridge_regression": ridge_regression,
    "logistic_regression": logistic_regression,
    "reg_logistic_regression": reg_logistic_regression,
    "linear_regression_gradient_descent": mean_squared_error_gd,
    "linear_regression_stochastic_gradient_descent": mean_squared_error_sgd,
}


In [4]:
np.random.seed(42)

In [12]:
x_train, x_test, y_train, train_ids, test_ids = load_csv_data('../../data/')

In [13]:
y_train[np.where(y_train == -1)] = 0

In [14]:
x_train = drop_columns(x_train, 0.8)
x_test = drop_columns(x_test, 0.8)  # if column has 80% missing and above, we drop it
# TODO: maybe thiss drops different columns in x_train and x_test. Check it out. 

In [15]:
print("x_train shape: ", x_train.shape)
print("x_test shape: ", x_test.shape)

x_train shape:  (328135, 205)
x_test shape:  (109379, 205)


In [16]:
x_train = mean_imputation(x_train)  # median_imputation(x_train)
x_test = mean_imputation(x_test)  # median_imputation(x_test)

In [17]:
x_train = standardize(x_train)
x_test = standardize(x_test)

In [18]:
# x_train = build_poly(x_train, 2)
# x_test = build_poly(x_test, 2)

In [31]:
num_folds = 5
mod='reg_logistic'
max_iters=50
gamma=0.1
lambda_=0.01

k_indices = build_k_indices(y_train, num_folds, 42)
train_loss, test_loss, train_accuracy, test_accuracy, train_f1, test_f1, weights = cross_validation(
    y_train, x_train, k_indices, num_folds, lambda_=lambda_, max_iters=max_iters, gamma=gamma, mod=mod)

print("________________________________________________________")
print("Overall train accuracy: ", np.mean(train_accuracy))
print("Overall test accuracy: ", np.mean(test_accuracy))
print("Overall train f1: ", np.mean(train_f1))
print("Overall test f1: ", np.mean(test_f1))


Cross validation: 1/5
Training loss: 0.08609884828981772
Testing loss: 0.08630576980178555
Training f1 score: 0.2994228148655494
Testing f1 score: 0.2951280360164024

Cross validation: 2/5
Training loss: 0.08612242491566459
Testing loss: 0.08622474741297086
Training f1 score: 0.29933676809655296
Testing f1 score: 0.2944873303297271

Cross validation: 3/5
Training loss: 0.08638833309210706
Testing loss: 0.08611272219594807
Training f1 score: 0.29800074364328005
Testing f1 score: 0.29968257567169254

Cross validation: 4/5
Training loss: 0.08519104285162
Testing loss: 0.085077084658862
Training f1 score: 0.29810908935283925
Testing f1 score: 0.3006891030223315

Cross validation: 5/5
Training loss: 0.08655264442691356
Testing loss: 0.08708844563292753
Training f1 score: 0.29781692509140867
Testing f1 score: 0.3017148678476428

________________________________________________________
Overall train accuracy:  0.6256350282658053
Overall test accuracy:  0.6251268532768525
Overall train f1:  0.

In [41]:
#make predictions
y_pred = np.dot(x_test, weights)
if mod=='reg_logistic' or mod=='logistic':
    y_pred=sigmoid(y_pred)

y_pred=np.where(y_pred>0.5, 1, 0)
y_pred[np.where(y_pred == 0)] = -1
np.unique(y_pred, return_counts=True)

(array([-1,  1]), array([60806, 48573]))

In [42]:
create_csv_submission(test_ids, y_pred, "reg_logistic_1.csv")