In [None]:
# Useful starting lines
%matplotlib inline
import os

import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
from proj1_helpers import *
# Relative path to the training CSV.
DATA_TRAIN_PATH = '../data/train.csv'  # TODO: download the training data and point this path at it
# Load the class labels (y), the feature matrix (tx) and the event ids (ids).
y, tx, ids = load_csv_data(DATA_TRAIN_PATH)

## Clean the data

In [None]:
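# NOTE: the cleaning step is currently disabled; the call below is commented out,
# so the raw `tx` loaded above is used unchanged by the following cells.
#tx, _, _ = modify_missing_data(tx, -999, 0.9, tx)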

## Selection of model parameters

In [None]:
from param_selection import gamma_grid_search, lambda_grid_search

# Prefix for saved figures. The plotting helpers build the output path as
# `folder_path + <name>`, so the trailing slash is required.
folder_path = '../figs/'
# Passed to every grid-search / comparison call below;
# presumably the number of cross-validation folds (confirm in param_selection).
k_fold = 4

In [5]:
# Ten candidate step sizes, log-spaced from 1e-5 to 1e-1.
gammas = np.logspace(-5, -1, 10)

# Run the step-size grid search; one loss collection is returned per model
# (logistic regression, least squares GD, least squares SGD).
loss_logreg, loss_ls_GD, loss_ls_SGD = gamma_grid_search(y, tx, gammas, k_fold)

In [None]:
def plot_gamma_search(gammas, losses, folder_path):
    """Plot train/test loss against the step size for three models and save the figure.

    One semilog-x panel is drawn per model, in the order logistic regression,
    least squares GD, least squares SGD. The figure is saved to
    ``folder_path + 'gammas'`` (the directory part is created if it does not
    exist yet) and then shown.

    Parameters
    ----------
    gammas : sequence of float
        Step sizes used as x-values in every panel.
    losses : sequence of three array-likes
        One entry per model, in the panel order above. Each entry is converted
        with ``np.array`` and indexed as ``[:, 0]`` (plotted as train loss) and
        ``[:, 1]`` (plotted as test loss), so it needs one row per gamma and at
        least two columns.
    folder_path : str
        Prefix prepended verbatim to the file name ``'gammas'``; include the
        trailing separator when it names a directory.
    """
    model_names = ['logistic_regression', 'least_squares_GD', 'least_squares_SGD']

    fig, axes = plt.subplots(1, 3, figsize=(12, 5))

    for idx, (ax, model_name) in enumerate(zip(axes, model_names)):
        loss = np.array(losses[idx])
        ax.semilogx(gammas, loss[:, 0], marker=".", color='b', label='train loss')
        ax.semilogx(gammas, loss[:, 1], marker="x", color='r', label='test loss')

        if idx == 0:
            # The y-label is only written on the left-most panel.
            ax.set_ylabel("loss")
        if idx == 2:
            # Hard-coded y-range applied to the last (SGD) panel only.
            ax.set_ylim([0.125, 0.2])

        # The x-label carries the model name and acts as the panel caption.
        ax.set_xlabel(model_name)
        ax.grid(True)
        ax.legend(loc="upper right")

    fig.suptitle('Gamma grid search')
    plt.tight_layout()

    out_path = folder_path + 'gammas'
    out_dir = os.path.dirname(out_path)
    if out_dir:
        # savefig does not create missing directories; without this a fresh
        # checkout that lacks the figure folder fails with FileNotFoundError.
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(out_path)
    plt.show()

In [None]:
# Draw and save the step-size search figure; the losses are passed in the panel
# order the helper expects (logistic regression, least squares GD, least squares SGD).
plot_gamma_search(gammas, [loss_logreg, loss_ls_GD, loss_ls_SGD], folder_path)

In [None]:
# Print the step sizes held in `gammas` so they can be matched against the plot.
print(gammas)

In [None]:
# Candidate regularisation strengths: 15 linearly spaced values in [0, 100] for
# regularised logistic regression, 15 log-spaced values in [1e-5, 1] for ridge.
lambdas_log_reg = np.linspace(0, 100, 15)
lambdas_ridge = np.logspace(-5, 0, 15)
# Both grids are passed together: index 0 is the logistic grid, index 1 the ridge grid.
lambdas = [lambdas_log_reg, lambdas_ridge]
# Run the regularisation grid search; one loss collection is returned per model.
loss_logreg_reg, loss_ridge = lambda_grid_search(y, tx, lambdas, k_fold)

In [None]:
def plot_lambda_search(lambdas, losses, folder_path):
    """Plot train/test loss against the regularisation strength and save the figure.

    Two panels are drawn: regularised logistic regression on a linear x-axis
    and ridge regression on a logarithmic x-axis. The figure is saved to
    ``folder_path + 'lambdas'`` (the directory part is created if it does not
    exist yet) and then shown.

    Parameters
    ----------
    lambdas : sequence of two sequences of float
        ``lambdas[0]`` holds the x-values of the logistic panel and
        ``lambdas[1]`` those of the ridge panel.
    losses : sequence of two array-likes
        One entry per panel, in the same order as ``lambdas``. Each entry is
        converted with ``np.array`` and indexed as ``[:, 0]`` (plotted as train
        loss) and ``[:, 1]`` (plotted as test loss).
    folder_path : str
        Prefix prepended verbatim to the file name ``'lambdas'``; include the
        trailing separator when it names a directory.
    """
    model_names = ['reg_logistic_regression', 'ridge_regression']

    fig, axes = plt.subplots(1, 2, figsize=(8, 5))

    for idx, (ax, model_name) in enumerate(zip(axes, model_names)):
        loss = np.array(losses[idx])

        # The first panel uses a linear x-axis, the second a logarithmic one.
        draw = ax.plot if idx == 0 else ax.semilogx
        draw(lambdas[idx], loss[:, 0], marker=".", color='b', label='train loss')
        draw(lambdas[idx], loss[:, 1], marker=".", color='r', label='test loss')

        if idx == 0:
            # The y-label is only written on the left-most panel.
            ax.set_ylabel("loss")

        # The x-label carries the model name and acts as the panel caption.
        ax.set_xlabel(model_name)
        ax.grid(True)
        ax.legend(loc="upper right")

    fig.suptitle('Lambda grid search')
    plt.tight_layout()

    out_path = folder_path + 'lambdas'
    out_dir = os.path.dirname(out_path)
    if out_dir:
        # savefig does not create missing directories; without this a fresh
        # checkout that lacks the figure folder fails with FileNotFoundError.
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(out_path)
    plt.show()

In [None]:
# Draw and save the regularisation search figure; the losses are passed in the
# same order as the grids in `lambdas` (regularised logistic regression, ridge).
plot_lambda_search(lambdas, [loss_logreg_reg, loss_ridge], folder_path)

## Compare the performance of the models (accuracy)

In [None]:
from param_selection import compare_models

In [None]:
# Hard-coded step sizes, one per iterative model. The regularised logistic
# regression reuses the step size of the plain logistic regression.
gamma_logreg = gamma_logreg_reg = 1e-5
gamma_ls_GD, gamma_ls_SGD = 1e-4, 4.6e-3

# Hard-coded regularisation strengths for the two penalised models.
lambda_logreg_reg, lambda_ridge = 1e-3, 1e-4

# NOTE: from here on `gammas` and `lambdas` no longer hold the search grids
# defined in the earlier cells but the lists of selected values below, so the
# grid-search plotting cells must not be re-run after this cell.
gammas = [gamma_logreg, gamma_logreg_reg, gamma_ls_GD, gamma_ls_SGD]
lambdas = [lambda_logreg_reg, lambda_ridge]

In [None]:
# Evaluate the six models with the selected step sizes and regularisation strengths.
# The results are unpacked in the order: least squares GD, least squares SGD,
# least squares, ridge, logistic regression, regularised logistic regression.
# degree=11 is forwarded to compare_models (presumably a polynomial expansion
# degree — confirm in param_selection).
acc_ls_GD, acc_ls_SGD, acc_ls, acc_ridge, acc_logreg, acc_logreg_reg = compare_models(y, tx, gammas, lambdas, k_fold, degree=11)

In [None]:
# Print the six results on one line, in the order: least squares GD, least squares SGD,
# least squares, ridge, logistic regression, regularised logistic regression.
print(acc_ls_GD, acc_ls_SGD, acc_ls, acc_ridge, acc_logreg, acc_logreg_reg)