In [1]:
import numpy as np
from helpers import *
from implementations import *

In [2]:
# Load the dataset from 'data/dataset/' with `load_csv_data` (brought in by the
# star imports above). Judging by the names unpacked here, the five results are
# train features, test features, train labels, train ids and test ids
# -- NOTE(review): confirm the order against the helper's definition.
x_train, x_test, y_train, train_ids, test_ids = load_csv_data('data/dataset/')

In [3]:
# Maximum fraction of missing values a feature may have in x_train before it is dropped.
NAN_PERCENTAGE = 0.4

# --- 1. Drop sparse features -------------------------------------------------
# Every decision below is taken on x_train only; the same column indices are then
# removed from x_test so that both matrices keep identical feature layouts.
nan_cols_train = np.where(np.sum(np.isnan(x_train), axis=0) > NAN_PERCENTAGE * x_train.shape[0])[0]
x_train = np.delete(x_train, nan_cols_train, axis=1)
x_test = np.delete(x_test, nan_cols_train, axis=1)

# --- 2. Drop constant features -----------------------------------------------
# A column is constant when its smallest and largest observed (non-NaN) values
# coincide. This is NaN-aware on purpose: comparing every row to the first row
# with `==` never flags a column that contains a NaN (NaN != NaN), and such a
# column would only be caught later if the imputed mean happened to be bit-exact.
same_cols_train = np.where(np.nanmin(x_train, axis=0) == np.nanmax(x_train, axis=0))[0]
x_train = np.delete(x_train, same_cols_train, axis=1)
x_test = np.delete(x_test, same_cols_train, axis=1)

# --- 3. Impute missing values ------------------------------------------------
# NaNs in both sets are replaced by the *training* column mean (no test leakage).
# np.where(mask)[1] gives the column index of every NaN cell, which selects the
# matching mean for each of them.
col_means = np.nanmean(x_train, axis=0)
train_nans = np.isnan(x_train)
x_train[train_nans] = np.take(col_means, np.where(train_nans)[1])
test_nans = np.isnan(x_test)
x_test[test_nans] = np.take(col_means, np.where(test_nans)[1])

# --- 4. Standardise ----------------------------------------------------------
# Statistics come from the imputed training data only.
means = np.mean(x_train, axis=0)
stds = np.std(x_train, axis=0)

# Safety net: a zero standard deviation would cause a division by zero below,
# so drop any such column from the data and from the statistics arrays.
zero_std_cols = np.where(stds == 0)[0]
x_train = np.delete(x_train, zero_std_cols, axis=1)
x_test = np.delete(x_test, zero_std_cols, axis=1)
means = np.delete(means, zero_std_cols)
stds = np.delete(stds, zero_std_cols)

x_train = (x_train - means) / stds
x_test = (x_test - means) / stds

# --- 5. Re-encode the labels -------------------------------------------------
# Replace label -1 with 0 in place (presumably the labels are {-1, 1} and the
# training loss expects {0, 1} -- TODO confirm against the loss helper).
y_train[y_train == -1] = 0

In [4]:
# Sanity check: both matrices must end up with the same number of columns.
print(x_train.shape, x_test.shape, sep="\n")

(328135, 163)
(109379, 163)


In [14]:
# Train our model
# Train our model: gradient descent over the whole training set, with a grid
# search over the step size (gamma) and the L2 penalty weight (lambda).
NBR_ITER = 1000

# Try different values for gamma and lambda
gammas = [0.4]
lambdas = [0]

best_gamma = None
best_lambda = None
best_loss = np.inf
best_w = np.ones(x_train.shape[1])
initial_w = -1e-3 * np.ones(x_train.shape[1])
losses = []

for gamma in gammas:
    for lambda_ in lambdas:

        # Every run starts from the same point. `w` is rebound (never modified
        # in place) in the update below, so `initial_w` stays untouched.
        w = initial_w

        # Fresh history for this (gamma, lambda) pair. Sharing one list across
        # runs would make each plotted curve the concatenation of all previous
        # runs instead of the loss trajectory of the current one.
        losses = []

        for _ in range(NBR_ITER):
            # Gradient of the penalised objective; 2 * lambda_ * w is the
            # derivative of the lambda_ * ||w||^2 term added to the loss below.
            grad = compute_gradient_neg_log(y_train, x_train, w) + 2 * lambda_ * w
            # update w by gradient
            w = w - gamma * grad
            # compute loss (data term + L2 penalty) after the step
            loss = compute_loss_neg_log(y_train, x_train, w) + lambda_ * np.linalg.norm(w) ** 2
            losses.append(loss)

        # Keep the combination with the lowest final training loss.
        # NOTE(review): the comparison includes the penalty term and uses training
        # data only, so it is not a validation-based model selection.
        if loss < best_loss:
            best_loss = loss
            best_gamma = gamma
            best_lambda = lambda_
            best_w = w

        print("gamma = ", gamma)
        print("lambda = ", lambda_)
        print("loss = ", loss)

        # Plot the loss curve of this run, labelled so runs can be told apart
        plt.plot(losses, label=f"gamma={gamma}, lambda={lambda_}")

plt.xlabel("iteration")
plt.ylabel("training loss")
plt.legend()

print("Best gamma = ", best_gamma)
print("Best lambda = ", best_lambda)
print("Best loss = ", best_loss)

# Note kept from what looks like an earlier run:
#   gamma = 0.3  ->  loss = 0.5887492781783168

NameError: name 'max_iters' is not defined

In [6]:
# Function to predict the labels for the test data
def predict_labels(w, data, threshold=0.5):
    """Generates class predictions given weights, and a test data matrix.

    Args:
        w: weight vector applied to the columns of ``data``.
        data: feature matrix; ``data.dot(w)`` must yield one score per row.
        threshold: cut-off applied to ``sigmoid(data.dot(w))``. Values strictly
            above it are labelled 1, values at or below it are labelled 0.
            Defaults to 0.5, which reproduces the previous fixed behaviour.

    Returns:
        The array produced by ``sigmoid`` with every entry overwritten by 0 or 1.
        Entries that compare neither above nor at/below the threshold (NaN) are
        left unchanged.
    """
    probabilities = sigmoid(data.dot(w))
    # Build both masks before writing so the labels written below can never be
    # re-interpreted as probabilities, whatever threshold is chosen.
    is_positive = probabilities > threshold
    is_negative = probabilities <= threshold
    probabilities[is_negative] = 0
    probabilities[is_positive] = 1
    return probabilities

In [7]:
# Generate predictions and save ouput in csv format for submission:
# Destination of the submission file
OUTPUT_PATH = 'data/submission.csv'

# Predict on the test set, then map label 0 back to -1 before writing the
# submission; every other value is passed through unchanged.
y_pred = predict_labels(best_w, x_test)
y_pred = np.where(y_pred == 0, -1, y_pred)
create_csv_submission(test_ids, y_pred, OUTPUT_PATH)