In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
# Import only the helpers this notebook actually calls instead of `import *`,
# so it is clear where each name comes from and nothing is silently shadowed.
from proj1_helpers import create_csv_submission, load_csv_data, predict_labels
DATA_TRAIN_PATH = 'train.csv' # TODO: download train data and supply path here 
# y: class labels, tX: feature matrix, ids: event ids.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [3]:
# Preview the labels, the feature matrix and the event ids, with a dashed
# rule between consecutive previews.
print(y, tX, ids, sep='\n' + '----------------------------------' + '\n')

[ 1. -1. -1. ...  1. -1. -1.]
----------------------------------
[[ 138.47    51.655   97.827 ...    1.24    -2.475  113.497]
 [ 160.937   68.768  103.235 ... -999.    -999.      46.226]
 [-999.     162.172  125.953 ... -999.    -999.      44.251]
 ...
 [ 105.457   60.526   75.839 ... -999.    -999.      41.992]
 [  94.951   19.362   68.812 ... -999.    -999.       0.   ]
 [-999.      72.756   70.831 ... -999.    -999.       0.   ]]
----------------------------------
[100000 100001 100002 ... 349997 349998 349999]


In [4]:
# Prepend a constant column of ones to the feature matrix so that the first
# weight of the linear model acts as the intercept term.
tx = np.column_stack((np.ones(tX.shape[0], dtype=tX.dtype), tX))

## Do your crazy machine learning thing here :) ...

In [5]:
# There seem to be quite a few invalid values (-999); for now we'll replace each of them with the mean of its respective column.

In [6]:
# Build a copy of `tx` in which every -999 placeholder is replaced by NaN,
# so that NaN-aware statistics can ignore the missing entries.
tx_ = np.where(tx != -999, tx, np.nan)
tx_

array([[  1.   , 138.47 ,  51.655, ...,   1.24 ,  -2.475, 113.497],
       [  1.   , 160.937,  68.768, ...,     nan,     nan,  46.226],
       [  1.   ,     nan, 162.172, ...,     nan,     nan,  44.251],
       ...,
       [  1.   , 105.457,  60.526, ...,     nan,     nan,  41.992],
       [  1.   ,  94.951,  19.362, ...,     nan,     nan,   0.   ],
       [  1.   ,     nan,  72.756, ...,     nan,     nan,   0.   ]])

In [7]:
# Replace every missing entry with the mean of the valid entries of its column.
#
# Two fixes over the previous per-column loop:
#   * all columns are processed; the hard-coded range(0, 30) skipped the last
#     of the 31 columns (30 features plus the prepended bias column);
#   * the mask is taken from the NaNs in `tx_` rather than from `tx == -999`,
#     so re-running this cell no longer copies NaNs from `tx_` back into `tx`.
column_means = np.nanmean(tx_, axis=0)
# Assign in place so that `tx` remains the same array object as before.
tx[:] = np.where(np.isnan(tx_), column_means, tx_)

In [8]:
# Display the imputed feature matrix rounded to one decimal place.
np.round(tx, decimals=1)

array([[  1. , 138.5,  51.7, ...,   1.2,  -2.5, 113.5],
       [  1. , 160.9,  68.8, ...,  -0. ,  -0. ,  46.2],
       [  1. , 121.9, 162.2, ...,  -0. ,  -0. ,  44.3],
       ...,
       [  1. , 105.5,  60.5, ...,  -0. ,  -0. ,  42. ],
       [  1. ,  95. ,  19.4, ...,  -0. ,  -0. ,   0. ],
       [  1. , 121.9,  72.8, ...,  -0. ,  -0. ,   0. ]])

In [9]:
def compute_loss(y, tx, w):
    """Compute the halved mean-squared-error loss of a linear model.

    Evaluates L(w) = 1 / (2N) * sum_n (y_n - tx_n . w)^2 with N = y.shape[0].

    Args:
        y: NumPy array of targets; its first dimension is the sample count N.
        tx: feature matrix that is multiplied with `w` to form the predictions.
        w: weight vector (array or list).

    Returns:
        The loss. A NumPy scalar when the residual is 1-D; when the residual
        is 2-D the squared errors are summed along the first axis only.

    Raises:
        ZeroDivisionError: if `y` contains no samples.
    """
    N = y.shape[0]
    # Evaluated first so that an empty `y` still raises ZeroDivisionError.
    scale = 1 / (2 * N)
    residual = y - np.matmul(tx, w)
    # np.sum is vectorised, whereas the builtin sum() iterates over the array
    # one element at a time in Python. axis=0 reproduces the shape the builtin
    # would return for a 2-D residual.
    return scale * np.sum(residual ** 2, axis=0)

In [10]:
def compute_gradient(y, tx, w):
    """Return the full-batch gradient of the halved MSE loss.

    Computes -tx^T (y - tx w) / N, where N is the number of elements of `y`.

    Args:
        y: NumPy array of targets.
        tx: feature matrix.
        w: weight vector (array or list).

    Returns:
        The gradient with respect to `w`.
    """
    n_samples = y.size
    residual = y - np.matmul(tx, w)
    return -(np.transpose(tx) @ residual) / n_samples

In [11]:
def compute_st_gradient(y_n, tx_n, w):
    """Return the squared-error gradient for a single sample (no mini-batch).

    Computes -tx_n * (y_n - tx_n . w).

    Args:
        y_n: target of the sample.
        tx_n: feature vector of the sample.
        w: weight vector (array or list).

    Returns:
        The gradient with respect to `w` for this one sample.
    """
    error = y_n - np.matmul(tx_n, w)
    return -np.transpose(tx_n) * error

In [12]:
def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    """Minimise the halved MSE loss with full-batch gradient descent.

    Args:
        y: NumPy array of targets.
        tx: feature matrix.
        initial_w: starting weights (array or list).
        max_iters: number of gradient steps to take.
        gamma: step size.

    Returns:
        A tuple (loss, ws): the loss at the final weights and the final
        weights as a NumPy array.
    """
    # Coerce up front so that a plain list is accepted and the returned
    # weights are always an ndarray, even when max_iters is 0.
    ws = np.asarray(initial_w, dtype=float)

    for _ in range(max_iters):
        ws = ws - gamma * compute_gradient(y, tx, ws)

    loss = compute_loss(y, tx, ws)

    return loss, ws

In [13]:
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    """Minimise the halved MSE loss with single-sample gradient steps.

    Each of the `max_iters` passes visits the samples in index order
    0 .. N-1 (no shuffling) and takes one step per sample.

    Args:
        y: NumPy array of targets; its first dimension is the sample count N.
        tx: feature matrix, indexed by sample along its first axis.
        initial_w: starting weights.
        max_iters: number of passes over the data.
        gamma: step size.

    Returns:
        A tuple (loss, ws): the loss over the whole data set at the final
        weights, and the final weights.
    """
    n_samples = y.shape[0]
    weights = initial_w

    for _pass in range(max_iters):
        for idx in range(n_samples):
            step = gamma * compute_st_gradient(y[idx], tx[idx], weights)
            weights = weights - step

    return compute_loss(y, tx, weights), weights

In [14]:
def least_squares(y, tx):
    """Solve the least-squares problem in closed form via the normal equations.

    Finds ws such that (tx^T tx) ws = tx^T y.

    Args:
        y: NumPy array of targets.
        tx: feature matrix.

    Returns:
        A tuple (loss, ws): the halved MSE loss at the solution and the
        solution weights.

    Raises:
        numpy.linalg.LinAlgError: if the Gram matrix tx^T tx is singular.
    """
    tx_t = np.transpose(tx)
    gram = np.dot(tx_t, tx)

    # Solve the linear system directly instead of forming the explicit inverse
    # of the Gram matrix: this is cheaper and numerically more accurate.
    ws = np.linalg.solve(gram, np.dot(tx_t, y))

    loss = compute_loss(y, tx, ws)

    return loss, ws

In [15]:
# Tests

In [16]:
# Closed-form fit on the imputed training data; displays (loss, weights).
least_squares(y, tx)

(0.34040945217523216,
 array([-1.15430619e+00,  1.82637143e-04, -7.20672107e-03, -6.45384640e-03,
        -1.73123839e-05,  2.32665195e-02,  4.20381986e-04,  2.50304362e-03,
         3.60203545e-01, -1.26385429e-03, -2.84568774e+00, -2.22719927e-01,
         9.89163555e-02,  3.56752661e-01,  2.85396180e+00, -6.42020723e-04,
        -4.57219107e-04,  2.85879539e+00, -6.80776737e-04,  1.38605239e-03,
         3.15125355e-03,  5.15272243e-04, -3.71558734e-04,  4.27220740e-02,
        -1.01225645e-03,  4.70620275e-04,  1.34341503e-04, -2.12423006e-03,
         1.42389540e-03, -1.78105047e-03,  2.84577828e+00]))

In [17]:
# Gradient descent from the all-zero weight vector: 100 steps with step size 1e-6.
least_squares_GD(y, tx, np.zeros(31), 100, 0.000001)

# NOTE: the least-squares loss is convex, so this is not a local minimum. With
# such a small step size and only 100 iterations the weights have simply not
# converged yet, and the loss is still well above the one reached by the
# closed-form `least_squares` solution. Possible remedies (to be tried): more
# iterations, an adaptive step size, or standardized features, which would
# likely allow a larger step size.

(0.42762024204943844,
 array([-9.62870376e-06, -8.53651159e-04, -1.50111309e-03, -6.36774819e-04,
         5.46515024e-04, -6.60541850e-06,  1.50986077e-04, -2.28034909e-05,
        -1.99323578e-05, -2.03017754e-04,  1.82396766e-04, -2.81108003e-05,
         2.98359738e-05, -8.86668785e-07,  1.49124208e-04, -1.16576897e-08,
        -6.96956963e-07, -4.49068267e-04,  3.39987182e-07,  2.41478887e-07,
        -3.03244905e-04,  1.26971335e-06, -3.35812581e-04,  1.71133406e-06,
        -4.13086154e-04,  7.26136261e-08,  1.66055103e-07, -5.63962759e-04,
         1.99886767e-07, -3.09761717e-07,  4.82340961e-04]))

In [18]:
# Sanity check: start gradient descent from the weights displayed by the
# `least_squares(y, tx)` run and take 100 small steps.
least_squares_GD(y, tx, [-1.15430619e+00,  1.82637143e-04, -7.20672107e-03, -6.45384640e-03,
        -1.73123839e-05,  2.32665195e-02,  4.20381986e-04,  2.50304362e-03,
         3.60203545e-01, -1.26385429e-03, -2.84568774e+00, -2.22719927e-01,
         9.89163555e-02,  3.56752661e-01,  2.85396180e+00, -6.42020723e-04,
        -4.57219107e-04,  2.85879539e+00, -6.80776737e-04,  1.38605239e-03,
         3.15125355e-03,  5.15272243e-04, -3.71558734e-04,  4.27220740e-02,
        -1.01225645e-03,  4.70620275e-04,  1.34341503e-04, -2.12423006e-03,
         1.42389540e-03, -1.78105047e-03,  2.84577828e+00], 100, 0.000001)

# When started from the closed-form solution, gradient descent stays at the global minimum.

(0.3404094521712253,
 array([-1.15430619e+00,  1.82642973e-04, -7.20671861e-03, -6.45384287e-03,
        -1.73132964e-05,  2.32665198e-02,  4.20385733e-04,  2.50304348e-03,
         3.60203545e-01, -1.26385382e-03, -2.84568774e+00, -2.22719927e-01,
         9.89163555e-02,  3.56752661e-01,  2.85396180e+00, -6.42020722e-04,
        -4.57219112e-04,  2.85879539e+00, -6.80776738e-04,  1.38605239e-03,
         3.15125411e-03,  5.15272240e-04, -3.71557781e-04,  4.27220741e-02,
        -1.01225793e-03,  4.70620273e-04,  1.34341503e-04, -2.12423048e-03,
         1.42389540e-03, -1.78105047e-03,  2.84577828e+00]))

In [19]:
# Single-sample gradient descent from the all-zero weight vector:
# 100 passes over the data with step size 1e-6; displays (loss, weights).
least_squares_SGD(y, tx, np.zeros(31), 100, 0.000001)

(0.35331736694878296,
 array([-1.87271515e-01, -3.18249509e-05, -7.86553702e-03, -6.12747878e-03,
        -2.79314299e-04, -1.00144970e-02,  9.66471915e-05, -1.78655467e-02,
         2.94121690e-01, -9.52512471e-04,  6.71011715e-03, -2.29125293e-01,
         9.86471103e-02,  1.66238227e-01,  2.22108976e-03, -2.40704264e-03,
         2.39504581e-04,  6.78697853e-03, -2.84137894e-04,  1.44738922e-03,
         2.56756367e-03,  1.23142637e-03, -8.18848408e-05, -2.17112860e-01,
        -4.47434614e-03, -3.55307447e-04,  1.57568937e-03, -8.47702975e-03,
         4.10466181e-03, -2.18287587e-03, -2.29300926e-03]))

In [20]:
# yay !

## Generate predictions and save output in CSV format for submission:

In [None]:
DATA_TEST_PATH = '' # TODO: download test data and supply path here 
# The first returned value is discarded; only the test features and ids are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
# `weights` was never assigned anywhere in this notebook, so this cell raised a
# NameError on a fresh run. Use the closed-form least-squares weights, which
# reached the lowest training loss of the three methods tried.
_, weights = least_squares(y, tx)
# The model was fitted on `tx` (bias column prepended, -999 placeholders
# imputed), so the test features need the same treatment to match the 31
# weights. Missing test entries are filled with the TRAINING column means.
tx_test = np.insert(tX_test, 0, 1, axis=1)
tx_test = np.where(tx_test == -999, np.nanmean(tx_, axis=0), tx_test)
# NOTE(review): presumably predict_labels combines the features with the
# weights linearly -- confirm against proj1_helpers.
y_pred = predict_labels(weights, tx_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)