In [1]:
import os
os.sys.path.append('./scripts')

# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = 'data/train.csv'
y_train, X_train, ids = load_csv_data(DATA_TRAIN_PATH, sub_sample=True)

In [3]:
def accuracy(y_pred, y_true):
    return np.sum(y_pred == y_true) / len(y_true)

In [4]:
def standardize(X):
    X = (X - np.mean(X, axis=0)) / (np.std(X, axis=0) + 1e-20)
    return X

In [5]:
def build_poly(X, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree."""
    d = X.shape[1]
    X_poly = np.zeros((X.shape[0], d * (degree + 1)))
    X_poly[:, 0] = np.zeros(X.shape[0])
    for i in range(0, degree):
        X_poly[:, (1 + i * d):((i + 1) * d + 1)] = X ** (i + 1)
    return X_poly

In [6]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold."""
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval]
                 for k in range(k_fold)]
    return np.array(k_indices)


def cross_validation(y, X, k_indices, k, degree, gamma, lambda_, max_iters, batch_size):
    """return the loss of ridge regression."""
    X_test = X[k_indices[k]]
    y_test = y[k_indices[k]]
    X_train = np.vstack([X[k_indices[i]] for i in range(k_indices.shape[0]) if not i == k])
    y_train = np.hstack([y[k_indices[i]] for i in range(k_indices.shape[0]) if not i == k])
        
    X_test = build_poly(X_test, degree)
    X_train = build_poly(X_train, degree)
     
    X_train = standardize(X_train)
    X_test = standardize(X_test)
    
    w0 = np.zeros(X_train.shape[1])
    w, loss = reg_logistic_regression(y=y_train, tx=X_train, lambda_=lambda_, initial_w=w0, max_iters=max_iters, gamma=gamma, batch_size=batch_size)
    
    y_train_pred = predict_labels(w, X_train)
    y_test_pred = predict_labels(w, X_test)
    
    acc_train = accuracy(y_train_pred, y_train)
    acc_test = accuracy(y_test_pred, y_test)
    return acc_train, acc_test

In [7]:
# Define hyperparameters
seed = 44
k_fold = 10

max_iters = 200
batch_size = 100
gamma = 1e-2                    # learning rate
    
degrees = [2, 3, 4, 5]            # polynomial expansion degree
lambdas = np.logspace(-4, 0, 5) # regularization constant



# split data in k fold
k_indices = build_k_indices(y_train, k_fold, seed)

# define lists to store the accuracies of training data and test data
accs_train = np.zeros((len(degrees), len(lambdas)))
accs_test = np.zeros((len(degrees), len(lambdas)))

for id_degree, degree in enumerate(degrees):
    for id_lambda, lambda_ in enumerate(lambdas):
        cur_acc_train = np.zeros(k_fold)
        cur_acc_test = np.zeros(k_fold)

        for k in range(k_fold):
            acc_train, acc_test = cross_validation(y=y_train, X=X_train, k_indices=k_indices, k=k, 
                                                   degree=degree, gamma=gamma, lambda_=lambda_, 
                                                   max_iters=max_iters, batch_size=batch_size)

            cur_acc_train[k] = acc_train
            cur_acc_test[k] = acc_test

        accs_train[id_degree, id_lambda] = cur_acc_train.mean()
        accs_test[id_degree, id_lambda] = cur_acc_test.mean()
        print(f"{id_degree} {id_lambda} Train: {cur_acc_train.mean()}, Test: {cur_acc_test.mean()}")

0 0 Train: 0.7297333333333333, Test: 0.7268000000000001
0 1 Train: 0.7251999999999998, Test: 0.7192
0 2 Train: 0.7254666666666665, Test: 0.7206
0 3 Train: 0.7261111111111112, Test: 0.7218
0 4 Train: 0.7086444444444445, Test: 0.7114
1 0 Train: 0.7298444444444445, Test: 0.7282
1 1 Train: 0.7300888888888888, Test: 0.7296
1 2 Train: 0.7289111111111112, Test: 0.7162000000000001
1 3 Train: 0.7251777777777778, Test: 0.7256
1 4 Train: 0.708288888888889, Test: 0.7106
2 0 Train: 0.7253777777777778, Test: 0.7232


  return 1.0 / (1 + np.exp(-z))


2 1 Train: 0.7270666666666666, Test: 0.7238000000000001
2 2 Train: 0.725488888888889, Test: 0.7226
2 3 Train: 0.7253111111111111, Test: 0.7246
2 4 Train: 0.7056444444444444, Test: 0.7070000000000001
3 0 Train: 0.7238444444444444, Test: 0.7251999999999998
3 1 Train: 0.7189555555555556, Test: 0.7121999999999999
3 2 Train: 0.7240666666666667, Test: 0.722
3 3 Train: 0.7253777777777778, Test: 0.7314
3 4 Train: 0.7041333333333333, Test: 0.7041999999999999


In [8]:
from proj1_helpers import *
DATA_TRAIN_PATH = 'data/train.csv'
y_train_whole, X_train_whole, ids = load_csv_data(DATA_TRAIN_PATH, sub_sample=False)

id_degree, id_lambda = np.unravel_index(np.argmax(accs_test), accs_test.shape)
degree, lambda_ = degrees[id_degree], lambdas[id_lambda]

X_train_whole = build_poly(X_train_whole, degree)
X_train_whole = standardize(X_train_whole)

w0 = np.zeros(X_train_whole.shape[1])
w, loss = reg_logistic_regression(y=y_train_whole, tx=X_train_whole, lambda_=lambda_, initial_w=w0, max_iters=max_iters, gamma=gamma, batch_size=batch_size)

y_train_pred = predict_labels(w, X_train_whole)
acc_train = accuracy(y_train_pred, y_train_whole)
print(f"Train accuracy: {acc_train}")

Train accuracy: 0.703648


## Generate predictions and save ouput in csv format for submission:

In [9]:
DATA_TEST_PATH = './data/test.csv'
_, X_test, ids_test = load_csv_data(DATA_TEST_PATH)
X_test = build_poly(X_test, degree)
X_test = standardize(X_test)

In [10]:
OUTPUT_PATH = './predictions/predictions.csv'
y_test_pred = predict_labels(w, X_test)
create_csv_submission(ids_test, y_test_pred, OUTPUT_PATH)