In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [13]:
# Folder holding every dataset file; the trailing slash is part of the prefix.
DATA_FOLDER = '../data/'


def _data_path(file_name):
    """Return the location of `file_name` inside DATA_FOLDER."""
    return DATA_FOLDER + file_name


# Archive that bundles the CSV files.
DATA_ZIP = _data_path('datasets.zip')

# Individual CSV files read by the cells below.
DATA_TRAIN_PATH = _data_path('train.csv')
DATA_TRAIN_PATH_CLEAN = _data_path('train_clean.csv')
DATA_TEST_PATH = _data_path('test.csv')

## Load the training data into feature matrix, class labels, and event ids:

In [14]:
from zipfile import ZipFile
import os.path

# Uncompress the archive when either the train or the test CSV is missing.
# NOTE(review): train_clean.csv is not part of this check — presumably it is
# produced by a separate cleaning step; confirm.
if not (os.path.isfile(DATA_TRAIN_PATH) and os.path.isfile(DATA_TEST_PATH)):
    # The handle is deliberately not called `zip`: that would rebind the
    # builtin zip() in the notebook's global namespace for every later cell.
    with ZipFile(DATA_ZIP, 'r') as archive:
        archive.printdir()  # list the archive contents before extracting
        print("Extracting datasets.zip ...")
        archive.extractall(DATA_FOLDER)
        print("Extraction done!")


In [15]:
from proj1_helpers import *
from split_data import *

# Ratio passed to split_data for both datasets (the recorded shapes show an
# 80% training / 20% validation split).
split_ratio = 0.8

# Read the raw and the cleaned training files; each call is unpacked into
# labels, feature matrix and event ids.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
y_clean, tX_clean, ids_clean = load_csv_data(DATA_TRAIN_PATH_CLEAN)

# Split each dataset into training and validation parts.
(
    tX_train,
    tX_validation,
    y_train,
    y_validation,
) = split_data(tX, y, split_ratio)
(
    tX_train_clean,
    tX_validation_clean,
    y_train_clean,
    y_validation_clean,
) = split_data(tX_clean, y_clean, split_ratio)

In [16]:
# Three left-aligned columns of width 15: row label, training, validation.
row_format = "{:<15}" * 3


def _print_shape_table(title, train_features, validation_features,
                       train_labels, validation_labels):
    """Print `title` followed by a table of the four arrays' shapes."""
    print(title)
    print(row_format.format("", "Training", "Validation"))
    print(row_format.format("Features",
                            str(train_features.shape),
                            str(validation_features.shape)))
    print(row_format.format("Labels",
                            str(train_labels.shape),
                            str(validation_labels.shape)))


_print_shape_table("Raw set : ",
                   tX_train, tX_validation, y_train, y_validation)
_print_shape_table("\nClean set : ",
                   tX_train_clean, tX_validation_clean,
                   y_train_clean, y_validation_clean)

Raw set : 
               Training       Validation     
Features       (200000, 30)   (50000, 30)    
Labels         (200000,)      (50000,)       

Clean set : 
               Training       Validation     
Features       (60122, 23)    (15031, 23)    
Labels         (60122,)       (15031,)       


## Do your crazy machine learning thing here :) ...

In [17]:
from stochastic_gradient_descent import *
from costs import *

# Earlier closed-form variant, kept for reference:
# weights_clean, t_loss_clean = least_squares(y_train_clean, tX_train_clean)
# weights, t_loss = least_squares(y_train, tX_train)

# Hyper-parameters shared by both SGD runs below.
max_iters = 100
gamma = 0.1
batch_size = 10
# NOTE(review): the output recorded for this cell shows the printed loss
# growing from ~5e2 to inf during the run, i.e. the optimisation diverges
# with these settings on the raw features. Consider scaling the features
# and/or a much smaller gamma before trusting the resulting weights.

# Raw dataset: start from an all-zero vector with one weight per feature column.
initial_w = np.zeros(tX_train.shape[1])
losses, weights = stochastic_gradient_descent(
    y_train, tX_train, initial_w, batch_size, max_iters, gamma)

# Clean dataset: the loss-report cell further down reads `weights_clean`, so
# it has to be fitted here as well; otherwise that cell raises NameError on a
# fresh kernel. The clean set has a different number of columns, hence its
# own initial vector.
initial_w_clean = np.zeros(tX_train_clean.shape[1])
losses_clean, weights_clean = stochastic_gradient_descent(
    y_train_clean, tX_train_clean, initial_w_clean, batch_size, max_iters, gamma)

[-4.156900e+00  1.788400e+01 -1.140000e-01 -3.559080e+01 -1.006290e+02
 -2.371849e+02 -9.932910e+01  2.497000e-01  9.132000e-01 -4.184250e+01
  1.800000e-01 -5.126000e-01 -1.000027e+02 -8.394400e+00  1.362000e-01
 -5.772000e-01 -1.465300e+00  2.362000e-01  2.800000e-03 -1.301280e+01
 -7.400000e-02 -3.672970e+01 -3.000000e-01 -2.240407e+02 -2.003350e+02
 -2.002625e+02 -1.076424e+02 -9.979010e+01 -9.932920e+01 -3.198310e+01]
504.04821133935195
[ 3.66158049e+07 -6.25993481e+06 -6.87381525e+06 -1.37905687e+06
  1.18423418e+08  1.18496458e+08  1.18423117e+08 -2.62747715e+05
 -8.94070685e+05 -8.36103187e+06 -1.44153945e+05  1.46479153e+05
  1.18422831e+08 -3.37766397e+06  9.99376701e+03  2.12049775e+04
 -4.05875372e+06  1.71268679e+04 -6.00636906e+04 -3.87733602e+06
 -1.08133347e+05 -1.41245189e+07 -1.38116142e+04  1.01294157e+08
  1.02286783e+08  1.02246803e+08  1.18449332e+08  1.18423082e+08
  1.18423553e+08 -9.24599509e+05]
362094723.9749791
[-1.44305173e+13  3.60700354e+12  4.77123898e+1

[ 7.99628323e+98 -2.41688062e+98 -2.46623615e+98 -2.56714424e+98
  3.57650301e+99  3.57830938e+99  3.57650993e+99 -6.93436514e+96
 -5.67092722e+97 -5.28501420e+98 -4.77451514e+96  2.60120884e+95
  3.57649275e+99 -1.46772608e+98 -9.91142046e+95 -3.33600561e+96
 -1.57831951e+98 -1.01125883e+96  6.24687791e+95 -2.06117145e+98
  2.14502034e+96 -6.83743773e+98 -2.96077927e+96  3.36370774e+98
  5.62420681e+98  5.64356493e+98  3.57817425e+99  3.57648113e+99
  3.57644920e+99 -2.23897913e+98]
9.59299942523281e+99
[-6.59259150e+104  1.53500268e+104  1.63902496e+104  4.96285642e+103
 -2.12331854e+105 -2.16084048e+105 -2.12300441e+105  5.19638237e+102
  2.49660297e+103  1.82081408e+104  3.27404565e+102 -1.52068669e+102
 -2.12316696e+105  6.93403006e+103  1.04583330e+102  7.42941756e+101
  8.78631532e+103  1.14032715e+102  8.85402083e+101  1.00553740e+104
 -2.16540905e+101  2.99185726e+104  4.10480207e+101 -1.58973387e+105
 -1.61815836e+105 -1.61797185e+105 -2.12661079e+105 -2.12321647e+105
 -2.123

[ 1.40845686e+202 -3.11880758e+201 -3.72514159e+201 -2.32881276e+201
  4.89196709e+202  4.96963059e+202  4.89117469e+202 -1.20921174e+200
 -6.08222268e+200 -6.03436401e+201 -8.33013366e+199  2.86422352e+199
  4.89165853e+202 -1.52328140e+201  1.06969495e+199  3.25910000e+198
 -2.55547512e+201 -9.93212629e+198  5.62567043e+199 -2.74868292e+201
  1.42816134e+199 -9.25970902e+201 -1.49505511e+199  2.91451061e+202
  3.12071327e+202  3.12066426e+202  4.89798006e+202  4.89178265e+202
  4.89184897e+202 -1.95559627e+201]
inf
[-4.99676192e+207  1.31494627e+207  2.97585399e+207  1.37972671e+207
 -3.39208376e+208 -3.45378543e+208 -3.39129300e+208  8.31369572e+205
  5.95306146e+206  3.54299356e+207  3.49037624e+205 -5.48515449e+204
 -3.39178253e+208  1.33104341e+207  1.31414693e+205  5.39639552e+204
  1.27598851e+207  1.53499445e+205 -4.78219080e+204  8.83866143e+206
 -1.45302432e+205  5.10947535e+207  1.96174828e+205 -1.22451815e+208
 -1.32172248e+208 -1.31935501e+208 -3.39354017e+208 -3.39149599

[ 1.60169779e+293 -1.57255545e+293 -2.50951088e+293 -1.24668255e+293
  2.72832063e+294  2.98231358e+294  2.72643880e+294 -7.60836061e+291
 -2.02218367e+292 -3.05862495e+293 -4.88745688e+291  3.36448357e+289
  2.72767951e+294 -6.83534104e+292 -2.45426721e+290 -4.04112034e+290
 -1.22396365e+293 -2.16448039e+291 -2.23701092e+291 -9.77904870e+292
  4.01670654e+288 -4.52151067e+293 -1.98652016e+291  3.67522791e+293
  4.89693688e+293  4.90998143e+293  2.73443097e+294  2.72711265e+294
  2.72788265e+294 -1.15112341e+293]
inf
[ 2.42647958e+299  9.34524994e+298  1.76604213e+299  7.24999127e+298
 -1.80432804e+300 -1.80467083e+300 -1.80432763e+300  4.94532128e+297
  3.06538680e+298  2.21187659e+299  3.02204148e+297 -1.41341311e+296
 -1.80432569e+300  7.29667645e+298  1.26074393e+296 -1.52070685e+296
  9.07793527e+298  8.41486996e+296 -1.47416851e+295  4.96665323e+298
 -2.06654694e+296  3.01762370e+299  9.67316602e+296 -7.72818842e+299
 -8.31981008e+299 -8.31529654e+299 -1.80446761e+300 -1.80432694

In [8]:
# Report the loss of the fitted weights on every split, one split at a time.
# `weights` and `weights_clean` must already have been defined by an earlier cell.
loss_train = compute_loss(y_train, tX_train, weights)
print("Training loss : {}".format(loss_train))

loss_validation = compute_loss(y_validation, tX_validation, weights)
print("Validation loss : {}".format(loss_validation))

loss_train_clean = compute_loss(y_train_clean, tX_train_clean, weights_clean)
print("Clean training loss : {}".format(loss_train_clean))

loss_validation_clean = compute_loss(y_validation_clean, tX_validation_clean, weights_clean)
print("Clean validation loss : {}".format(loss_validation_clean))

NameError: name 'weights' is not defined

## Generate predictions and save output in csv format for submission:

In [None]:
# Load the test set. The first returned value (the label slot) is not used
# below. It is bound to a named throwaway instead of `_`, because assigning
# `_` in an IPython kernel shadows the "last output" variable and stops
# IPython from refreshing it.
_unused_test_labels, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# The submission file is written into the data folder.
OUTPUT_PATH = "".join([DATA_FOLDER, 'submission.csv'])

# Predict labels for the test features with the fitted weights, then save
# them together with the test event ids.
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)