In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load data

### Imports

In [12]:
from costs import *
from least_squares import *
from gradient_descent import *
from stochastic_gradient_descent import *
from ridge_regression import *
from logistic_regression import *
from newton import *
from cross_validation import *

from helpers import *
from proj1_helpers import *

### Load train set

In [3]:
# Location of the labelled training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = 'data/train.csv' 
# load_csv_data returns three values, unpacked here as labels `y`, raw features `x` and row ids `ids`.
y, x, ids = load_csv_data(DATA_TRAIN_PATH)
# standardize returns the transformed matrix plus two more values, presumably the per-feature
# mean/std it applied (TODO confirm). `tx` has one more column than `x` (see the shapes printed
# in the next cell) — presumably a bias column.
tx, mean, std = standardize(x)

In [4]:
# Sanity check: display the raw and standardized feature-matrix shapes side by side.
x.shape, tx.shape

((250000, 30), (250000, 31))

### Load test set

In [5]:
# Load the test CSV with the same loader as the training set. `y_test` is whatever the loader
# returns for the label column (presumably placeholders for an unlabelled set — TODO confirm).
y_test, x_test, id_test = load_csv_data('data/test.csv')
# NOTE(review): the test features are standardized by a separate call, independent of the
# training set, so `mean_test`/`std_test` may differ from the training `mean`/`std` that the
# models are fitted with. Consider reusing the training statistics if `standardize` supports
# it — TODO confirm against the helper's signature.
tx_test, mean_test, std_test = standardize(x_test)
# Display the resulting shape as the cell output.
tx_test.shape

(568238, 31)

## Algorithms

### Least squares

In [6]:
# Training: least_squares hands back the loss and the fitted weight vector
ls_loss, ls_w = least_squares(y, tx)

# First two weights, kept only for the summary line
ls_w0, ls_w1 = ls_w[0], ls_w[1]

print(
    "Least Squares: loss*={l}, w0*={w0}, w1*={w1}".format(l=ls_loss, w0=ls_w0, w1=ls_w1)
)

Least Squares: loss*=0.3394455984940183, w0*=-0.31466399999964345, w1*=0.02937894984909547


In [7]:
# Self check: how many training rows get a predicted label equal to the true label `y`
ls_y_check = predict_labels(ls_w, tx)
np.sum(ls_y_check == y)

186243

In [10]:
# Predict 0.74760
# (the figure above is presumably the score this submission obtained — TODO confirm)
# Destination of the least-squares submission file.
LS_OUTPUT_PATH = 'data/ls_submission.csv'
# Label the standardized test set with the least-squares weights.
ls_y_pred = predict_labels(ls_w, tx_test)
# Write the (id, prediction) pairs to LS_OUTPUT_PATH.
create_csv_submission(id_test, ls_y_pred, LS_OUTPUT_PATH)

### Gradient Descent

In [12]:
# Training
# Number of iterations and step size handed to gradient_descent.
# NOTE: `max_iters` and `gamma` are notebook-wide names that later cells reassign.
max_iters = 300
gamma = 0.15
# gradient_descent returns two values, unpacked as the loss and the weight vector; the
# progress lines shown in the output below are printed during this call.
gd_loss, gd_w = gradient_descent(y, tx, max_iters, gamma)

Gradient Descent(0/299): loss=0.5, w0=-0.0471996, w1=0.03405015486157466
Gradient Descent(10/299): loss=0.3649841296757523, w0=-0.2620071055676516, w1=0.0812202751318934
Gradient Descent(20/299): loss=0.3550763049590692, w0=-0.30429720527420195, w1=0.06898193617063272
Gradient Descent(30/299): loss=0.35048628668581794, w0=-0.3126230434634405, w1=0.058971823442580486
Gradient Descent(40/299): loss=0.3474756325054262, w0=-0.3142621878975853, w1=0.05202445848547232
Gradient Descent(50/299): loss=0.34539464652489404, w0=-0.3145848934816871, w1=0.047082545929858804
Gradient Descent(60/299): loss=0.34392784649434605, w0=-0.3146484259513354, w1=0.04342652316324255
Gradient Descent(70/299): loss=0.3428808998038696, w0=-0.31466093386845445, w1=0.04063318268296978
Gradient Descent(80/299): loss=0.3421265445364248, w0=-0.31466339635718743, w1=0.03844935235840481
Gradient Descent(90/299): loss=0.3415788792923234, w0=-0.31466388115819044, w1=0.036715290258110135
Gradient Descent(100/299): loss=0.34

In [None]:
# Predict
# No submission file is generated for gradient descent.
# Without additional data filtering or feature weighting, GD does not do better than LS here (compare the losses above).

### Ridge regression

In [72]:
# Training
# Candidate regularisation strengths. Each value is fitted in turn, and the
# weights from the LAST value remain in `rr_w` for the cells that follow.
rr_lambdas = np.array([8e-15])
degree = 2

# Feature matrix built from the raw (non-standardized) `x` via build_poly,
# presumably a polynomial expansion up to `degree` (TODO confirm).
rr_phi = build_poly(x, degree)

for lamb in rr_lambdas:
    rr_loss, rr_w = ridge_regression(y, rr_phi, lamb)
    rr_w0 = rr_w[0]
    rr_w1 = rr_w[1]
    # Scientific notation keeps tiny lambdas readable: a fixed-point spec
    # such as "3f" renders 8e-15 as 0.000000.
    print("Ridge Regression: lambda={lam:.3e}, loss*={l}, w0*={w0}, w1*={w1}".format(
        lam=lamb, l=rr_loss, w0=rr_w0, w1=rr_w1))

Ridge Regression: lambda=0.000000, loss*=0.31786522157389413, w0*=-59626.77200386367, w1*=0.00012348494288154849


In [149]:
# Self check: rebuild the training features with the current `degree`, predict,
# and count the rows whose predicted label matches `y`
rr_x_check = build_poly(x, degree)
rr_y_check = predict_labels(rr_w, rr_x_check)
np.sum(rr_y_check == y)

192591

In [73]:
# Predict 0.76996
# (the figure above is presumably the score this submission obtained — TODO confirm)
# Apply the same build_poly transform to the raw test features, using the current `degree`.
rr_test_phi = build_poly(x_test, degree)
# Label the test set with the ridge-regression weights.
rr_y_pred = predict_labels(rr_w, rr_test_phi)
# Destination of the ridge-regression submission file.
RR_OUTPUT_PATH = 'data/rr_submission.csv'
create_csv_submission(id_test, rr_y_pred, RR_OUTPUT_PATH)

### Ridge with cross validation

In [143]:
# Training
# Cross-validation settings (these names are shared with other cells)
seed, k_fold = 1, 10
lamb, degree = 8e-15, 2

# Fold membership for every sample
k_indices = build_k_indices(y, k_fold, seed)

# Per-fold train / test losses, plus a running sum of the per-fold weights
loss_tr, loss_te = [], []
weight_tr = 0

for k in range(k_fold):
    l_tr, l_te, w_tr = cross_validation(y, x, k_indices, k, lamb, degree)
    loss_tr += [l_tr]
    loss_te += [l_te]
    weight_tr = weight_tr + w_tr

# The final model is the mean of the weights fitted on each fold
rcv_w = weight_tr / k_fold

In [148]:
# Self check: predict on the training features with the fold-averaged weights
# and count the rows whose predicted label matches `y`
rcv_x_check = build_poly(x, degree)
rcv_y_check = predict_labels(rcv_w, rcv_x_check)
np.sum(rcv_y_check == y)

192651

In [78]:
# Predict 0.77010
# (the figure above is presumably the score this submission obtained — TODO confirm)
# Apply the same build_poly transform to the raw test features, using the current `degree`.
rcv_test_phi = build_poly(x_test, degree)
# Label the test set with the fold-averaged ridge weights.
rcv_y_pred = predict_labels(rcv_w, rcv_test_phi)
# Destination of the cross-validated ridge submission file.
RCV_OUTPUT_PATH = 'data/rcv_submission.csv'
create_csv_submission(id_test, rcv_y_pred, RCV_OUTPUT_PATH)

### Logistic regression with GD

In [38]:
# Turn the (-1, 1) classification problem into (0, 1).
# Work on a copy so the original `y` keeps its -1/1 labels for the other models.
log_y = y.copy()
# Vectorised remap: one boolean-mask assignment instead of a Python-level loop
# over every sample.
log_y[log_y == -1] = 0

# Show the first 50 remapped labels as the cell output.
log_y[:50]

array([ 1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        1.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,
        1.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  1.])

In [41]:
# Iteration count and step size for logistic regression (these overwrite the notebook-wide
# `max_iters` / `gamma` set by earlier cells).
max_iters = 300
gamma = 1e-6

# Fit on the 0/1 labels `log_y`; the call returns two values, unpacked as loss and weights.
log_loss, log_w = logistic_regression(log_y, tx, max_iters, gamma)

logistic regression with GD(0/299): loss=2.682837515207593, w0=0.4792886650862448, w1=0.5007768283788979
logistic regression with GD(20/299): loss=0.616510027467686, w0=-0.26331699128599506, w1=0.40816506142346515
logistic regression with GD(40/299): loss=0.5363931283545257, w0=-0.6044983635043047, w1=0.35009208349049126
logistic regression with GD(60/299): loss=0.5227242801507874, w0=-0.7353249119065798, w1=0.33364478788152807
logistic regression with GD(80/299): loss=0.5174966086324174, w0=-0.7939061989167525, w1=0.3246734252858627
logistic regression with GD(100/299): loss=0.5143808703287661, w0=-0.8235871357260085, w1=0.3166407784012507
logistic regression with GD(120/299): loss=0.5121055242239606, w0=-0.8400849139099502, w1=0.3088243175380369
logistic regression with GD(140/299): loss=0.5102904050169369, w0=-0.8499670399130937, w1=0.3014324525381244
logistic regression with GD(160/299): loss=0.5087807188227276, w0=-0.8563078533973341, w1=0.29468723527678725
logistic regression wit

In [42]:
# Self check: count the training rows whose predicted label matches the original
# labels `y` (the comparison is against `y`, not the 0/1 `log_y`)
log_y_train = predict_logistic_labels(log_w, tx)
np.sum(log_y_train == y)

180533

In [132]:
# Predict 0.72849
# (the figure above is presumably the score this submission obtained — TODO confirm)
# Destination of the logistic-regression submission file.
LOG_OUTPUT_PATH = 'data/log_submission.csv'
# Label the standardized test set with the logistic-regression weights.
log_y_pred = predict_logistic_labels(log_w, tx_test)
create_csv_submission(id_test, log_y_pred, LOG_OUTPUT_PATH)

### Regularized logistic regression 

In [43]:
# Remap the (-1, 1) labels to (0, 1) for regularized logistic regression.
# Work on a copy so the original `y` keeps its -1/1 labels.
reg_y = y.copy()
# Vectorised remap: one boolean-mask assignment instead of a Python-level loop
# over every sample.
reg_y[reg_y == -1] = 0

# Show the first 50 remapped labels as the cell output.
reg_y[:50]

array([ 1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        1.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,
        1.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  1.])

In [45]:
# Training
# Iteration count, step size and penalty strength (these overwrite the notebook-wide
# `max_iters`, `gamma` and `lamb` set by earlier cells).
max_iters = 300
gamma = 1e-6
lamb = 10

# Same helper as the plain logistic fit, with the `regularized` flag on and `lamb` passed as `lambda_`.
reg_loss, reg_w = logistic_regression(reg_y, tx, max_iters, gamma, regularized=True, lambda_=lamb)

logistic regression with GD(0/299): loss=2.682837515207593, w0=0.47927866508624484, w1=0.5007668283788979
logistic regression with GD(20/299): loss=0.6164594109036251, w0=-0.2633436864032931, w1=0.4080176433706233
logistic regression with GD(40/299): loss=0.5363948889402543, w0=-0.6043458056205644, w1=0.34990603938341813
logistic regression with GD(60/299): loss=0.5227374563936514, w0=-0.7350279707320829, w1=0.33341832163501584
logistic regression with GD(80/299): loss=0.5175140525579578, w0=-0.793503447021783, w1=0.32440879592875677
logistic regression with GD(100/299): loss=0.5144011575215567, w0=-0.8231058494265406, w1=0.31634699360386176
logistic regression with GD(120/299): loss=0.5121283884441394, w0=-0.8395445316432651, w1=0.3085116005782197
logistic regression with GD(140/299): loss=0.5103158021464339, w0=-0.849381615628723, w1=0.3011097897572627
logistic regression with GD(160/299): loss=0.5088085881819281, w0=-0.8556873466936473, w1=0.294361752929418
logistic regression with 

In [46]:
# Self check: count the training rows whose predicted label matches the original
# labels `y` (the comparison is against `y`, not the 0/1 `reg_y`)
reg_y_train = predict_logistic_labels(reg_w, tx)
np.sum(reg_y_train == y)

180525

In [None]:
# Predict
# NOTE: this cell has no execution count in the saved notebook, i.e. it was not run in that session.
# Destination of the regularized logistic-regression submission file.
REG_OUTPUT_PATH = 'data/reg_submission.csv'
# Label the standardized test set with the regularized logistic weights.
reg_y_pred = predict_logistic_labels(reg_w, tx_test)
create_csv_submission(id_test, reg_y_pred, REG_OUTPUT_PATH)

### Logistic regression with Newton method

In [47]:
# Remap the (-1, 1) labels to (0, 1) for the Newton-method logistic model.
# Work on a copy so the original `y` keeps its -1/1 labels.
nt_y = y.copy()
# Vectorised remap: one boolean-mask assignment instead of a Python-level loop
# over every sample.
nt_y[nt_y == -1] = 0

# Show the first 50 remapped labels as the cell output.
nt_y[:50]

array([ 1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        1.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,
        1.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  1.])

In [71]:
# Settings for the Newton-method fit (these overwrite the notebook-wide `max_iters`, `gamma`
# and `lamb` set by earlier cells).
max_iters = 1000
gamma = 1e-2
# Forwarded to newton_method — presumably the number of samples drawn per iteration (TODO confirm).
# NOTE(review): no random seed is set in this cell; if batches are sampled randomly the run
# is not reproducible — verify inside newton_method.
batch_size = 10000
lamb = 1

# Regularized fit on the 0/1 labels `nt_y`; the call returns two values, unpacked as loss and weights.
nt_loss, nt_w = newton_method(nt_y, tx, batch_size, max_iters, gamma, regularized=True, lambda_=lamb)

logistic with Newton(0/999): loss=2.737002412413841, w0=-1.7181550200522229, w1=0.44802379524322206
logistic with Newton(50/999): loss=0.6228125940984359, w0=-0.9049012751400399, w1=0.23052569657523556
logistic with Newton(100/999): loss=0.5467339034684046, w0=-0.838494723154859, w1=0.21739184709948728
logistic with Newton(150/999): loss=0.5190068124265335, w0=-0.8366759109704975, w1=0.21330372558586713
logistic with Newton(200/999): loss=0.5032983304543842, w0=-0.8514850303479574, w1=0.21514267436104093
logistic with Newton(250/999): loss=0.4968322585801632, w0=-0.8675866737267524, w1=0.21782875518396022
logistic with Newton(300/999): loss=0.4963691684124786, w0=-0.8796742269076965, w1=0.21899354200223928
logistic with Newton(350/999): loss=0.5076311803961472, w0=-0.8918754092639407, w1=0.22417528404527762
logistic with Newton(400/999): loss=0.49828422563838365, w0=-0.898974643138658, w1=0.22128517619590768
logistic with Newton(450/999): loss=0.5004126275011205, w0=-0.9037515613346028

In [72]:
# Self check: count the training rows whose predicted label matches the original
# labels `y` (the comparison is against `y`, not the 0/1 `nt_y`)
nt_y_train = predict_logistic_labels(nt_w, tx)
np.sum(nt_y_train == y)

181964

## Generate predictions and save output in CSV format for submission

In [10]:
# Reload the test CSV for the final submission; the first returned value (labels) is discarded.
# NOTE(review): this re-reads the same file that the "Load test set" cell already loaded into
# `x_test` / `id_test`. `tX_test` is the raw loader output — neither standardize nor
# build_poly is applied in this cell.
DATA_TEST_PATH = 'data/test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
# Destination of the final submission file.
OUTPUT_PATH = 'data/submission.csv'

# `weights` is not assigned anywhere else in this notebook, so on a fresh kernel
# the prediction below would raise NameError. Use the fold-averaged ridge weights,
# which carry the highest score annotated in this notebook (0.77010).
weights = rcv_w

# The ridge weights were fitted on build_poly features of the raw inputs, so the
# raw test features need the same transform (with the same `degree`) before
# prediction.
final_test_phi = build_poly(tX_test, degree)

y_pred = predict_labels(weights, final_test_phi)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)