In [8]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load data

### Imports

In [12]:
from costs import *
from least_squares import *
from gradient_descent import *
from stochastic_gradient_descent import *
from ridge_regression import *
from logistic_regression import *
from newton import *
from cross_validation import *

from helpers import *
from proj1_helpers import *

### Load train set

In [13]:
DATA_TRAIN_PATH = 'data/train.csv' 
y, x, ids = load_csv_data(DATA_TRAIN_PATH)
tx, mean, std = standardize(x)

In [14]:
x.shape, tx.shape

((250000, 30), (250000, 31))

### Load test set

In [15]:
y_test, x_test, id_test = load_csv_data('data/test.csv')
tx_test, mean_test, std_test = standardize(x_test)
tx_test.shape

(568238, 31)

## Algorithms

### Least squares

In [16]:
# Training
ls_loss, ls_w = least_squares(y, tx)

ls_w0 = ls_w[0]
ls_w1 = ls_w[1]

print("Least Squares: loss*={l}, w0*={w0}, w1*={w1}".format(
        l=ls_loss, w0=ls_w0, w1=ls_w1))

Least Squares: loss*=0.3394455984940183, w0*=-0.31466399999964345, w1*=0.02937894984909547


In [17]:
# Self check
ls_y_check = predict_labels(ls_w, tx)
sum(ls_y_check == y)

186243

In [18]:
# Predict 0.74760
LS_OUTPUT_PATH = 'data/ls_submission.csv'
ls_y_pred = predict_labels(ls_w, tx_test)
create_csv_submission(id_test, ls_y_pred, LS_OUTPUT_PATH)

### Gradient Descent

In [19]:
# Training
max_iters = 300
gamma = 0.15
gd_loss, gd_w = gradient_descent(y, tx, max_iters, gamma)

Gradient Descent(0/299): loss=0.5, w0=-0.0471996, w1=0.03405015486157466
Gradient Descent(10/299): loss=0.3649841296757523, w0=-0.2620071055676516, w1=0.0812202751318934
Gradient Descent(20/299): loss=0.3550763049590692, w0=-0.30429720527420195, w1=0.06898193617063272
Gradient Descent(30/299): loss=0.35048628668581794, w0=-0.3126230434634405, w1=0.058971823442580486
Gradient Descent(40/299): loss=0.3474756325054262, w0=-0.3142621878975853, w1=0.05202445848547232
Gradient Descent(50/299): loss=0.34539464652489404, w0=-0.3145848934816871, w1=0.047082545929858804
Gradient Descent(60/299): loss=0.34392784649434605, w0=-0.3146484259513354, w1=0.04342652316324255
Gradient Descent(70/299): loss=0.3428808998038696, w0=-0.31466093386845445, w1=0.04063318268296978
Gradient Descent(80/299): loss=0.3421265445364248, w0=-0.31466339635718743, w1=0.03844935235840481
Gradient Descent(90/299): loss=0.3415788792923234, w0=-0.31466388115819044, w1=0.036715290258110135
Gradient Descent(100/299): loss=0.34

In [20]:
# Predict
# No need to generate predict file
# GD is strictly worse than LS if no data filtering or feature weights are implemented

### Ridge regression

In [21]:
# Training
rr_lambdas = np.array([8e-15])
degree = 2

rr_phi = build_poly(x, degree)

for lamb in rr_lambdas:
    rr_loss, rr_w = ridge_regression(y, rr_phi, lamb)
    rr_w0 = rr_w[0]
    rr_w1 = rr_w[1]
    print("Ridge Regression: lambda={lam:3f}, loss*={l}, w0*={w0}, w1*={w1}".format(
        lam=lamb, l=rr_loss, w0=rr_w0, w1=rr_w1))

Ridge Regression: lambda=0.000000, loss*=0.31786522157389413, w0*=-59626.77200386367, w1*=0.00012348494288154849


In [22]:
# Self check
rr_x_check = build_poly(x, degree)
rr_y_check = predict_labels(rr_w, rr_x_check)
sum(rr_y_check == y)

192591

In [23]:
# Predict 0.76996
rr_test_phi = build_poly(x_test, degree)
rr_y_pred = predict_labels(rr_w, rr_test_phi)
RR_OUTPUT_PATH = 'data/rr_submission.csv'
create_csv_submission(id_test, rr_y_pred, RR_OUTPUT_PATH)

### Ridge with cross validation

In [24]:
# Training
seed = 1
k_fold = 10
lamb = 8e-15
degree = 2


# split data in k fold
k_indices = build_k_indices(y, k_fold, seed)

# define lists to store the loss of training data and test data
loss_tr = []
loss_te = []
weight_tr = 0

for k in range(k_fold):
    l_tr, l_te, w_tr = cross_validation(y, x, k_indices, k, lamb, degree)
    loss_tr.append(l_tr)
    loss_te.append(l_te)
    weight_tr += w_tr
    
rcv_w = weight_tr / k_fold

In [25]:
# Self check
rcv_x_check = build_poly(x, degree)
rcv_y_check = predict_labels(rcv_w, rcv_x_check)
sum(rcv_y_check == y)

192651

In [26]:
# Predict 0.77010
rcv_test_phi = build_poly(x_test, degree)
rcv_y_pred = predict_labels(rcv_w, rcv_test_phi)
RCV_OUTPUT_PATH = 'data/rcv_submission.csv'
create_csv_submission(id_test, rcv_y_pred, RCV_OUTPUT_PATH)

### Logistic regression with GD

In [27]:
# Turn the (-1, 1) classification problem into (0, 1)
log_y = y.copy()
for i in range(len(log_y)):
    if log_y[i] == -1:
        log_y[i] = 0
        
log_y[:50,]

array([ 1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        1.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,
        1.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  1.])

In [152]:
max_iters = 200
gamma = 1e-6

log_loss, log_w = logistic_regression(log_y, tx, max_iters, gamma)

logistic regression with GD(0/199): loss=5.251673835930225, w0=0.9820152363517101, w1=1.000917742730888
logistic regression with GD(10/199): loss=2.2464721097614415, w0=0.7008801683495484, w1=0.973306923063839
logistic regression with GD(20/199): loss=1.1385378968121571, w0=0.22506262004311559, w1=0.8681563954008141
logistic regression with GD(30/199): loss=0.7881992718141739, w0=-0.18323352984869065, w1=0.7414252132235548
logistic regression with GD(40/199): loss=0.6275812378110788, w0=-0.45832495764845205, w1=0.6276787651478991
logistic regression with GD(50/199): loss=0.5643941109355801, w0=-0.6232198981951527, w1=0.5497686454509927
logistic regression with GD(60/199): loss=0.5388019607990212, w0=-0.7184900091077283, w1=0.5001154985611691
logistic regression with GD(70/199): loss=0.527196103172945, w0=-0.7740306692828949, w1=0.46678497989916806
logistic regression with GD(80/199): loss=0.5211854177245719, w0=-0.8073551774810406, w1=0.4423275245940156
logistic regression with GD(90/1

In [153]:
# Self check
log_y_train = predict_logistic_labels(log_w, tx)
sum(log_y_train == y)

180164

In [132]:
# Predict 0.72849
LOG_OUTPUT_PATH = 'data/log_submission.csv'
log_y_pred = predict_logistic_labels(log_w, tx_test)
create_csv_submission(id_test, log_y_pred, LOG_OUTPUT_PATH)

### Regularized logistic regression 

In [None]:
# Training

### Logistic regression with Newton method

In [60]:
nt_y = y.copy()
for i in range(len(nt_y)):
    if nt_y[i] == -1:
        nt_y[i] = 0
        
nt_y[:50,]

array([ 1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        1.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,
        1.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  1.])

In [61]:
max_iters = 5
gamma = 1e-6

# DON'T RUN! COMPUTING COST IS TOO HIGH. WILL KILL KERNEL!
nt_loss, nt_w = newton_method(log_y, tx, max_iters, gamma)

logistic regression with Newton(0/4): loss=5.251673835930225, w0=0.8694523517477522, w1=0.9999391698936346
logistic regression with Newton(1/4): loss=5.16994722901324, w0=0.790285808782625, w1=0.9998791203845311
logistic regression with Newton(2/4): loss=5.1222550873283055, w0=0.7292318748242601, w1=0.9998196343266107
logistic regression with Newton(3/4): loss=5.085943382542923, w0=0.6792816984810927, w1=0.9997605954004771
logistic regression with Newton(4/4): loss=5.056448324915354, w0=0.637075214983694, w1=0.9997019250261237


In [62]:
initial_w = np.array([1.0] * tx.shape[1])
grad = compute_logistic_gradient(nt_y, tx, initial_w)
diaga = sigma(tx.dot(initial_w))
diagb = 1 - sigma(tx.dot(initial_w))
diag = diaga * diagb
s = np.diag(diag)

MemoryError: 

## Generate predictions and save ouput in csv format for submission:

In [10]:
DATA_TEST_PATH = 'data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
OUTPUT_PATH = 'data/submission.csv' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)