In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import datetime
%load_ext autoreload
%autoreload 2


## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = 'train.csv' # TODO: download train data and supply path here 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [3]:
print(y)
print('----------------------------------')
print(tX)
print('----------------------------------')
print(ids)

[ 1. -1. -1. ...  1. -1. -1.]
----------------------------------
[[ 138.47    51.655   97.827 ...    1.24    -2.475  113.497]
 [ 160.937   68.768  103.235 ... -999.    -999.      46.226]
 [-999.     162.172  125.953 ... -999.    -999.      44.251]
 ...
 [ 105.457   60.526   75.839 ... -999.    -999.      41.992]
 [  94.951   19.362   68.812 ... -999.    -999.       0.   ]
 [-999.      72.756   70.831 ... -999.    -999.       0.   ]]
----------------------------------
[100000 100001 100002 ... 349997 349998 349999]


In [4]:
tx = np.insert(tX, 0, 1, axis=1)

In [5]:
tx

array([[   1.   ,  138.47 ,   51.655, ...,    1.24 ,   -2.475,  113.497],
       [   1.   ,  160.937,   68.768, ..., -999.   , -999.   ,   46.226],
       [   1.   , -999.   ,  162.172, ..., -999.   , -999.   ,   44.251],
       ...,
       [   1.   ,  105.457,   60.526, ..., -999.   , -999.   ,   41.992],
       [   1.   ,   94.951,   19.362, ..., -999.   , -999.   ,    0.   ],
       [   1.   , -999.   ,   72.756, ..., -999.   , -999.   ,    0.   ]])

## Cleaning the data
There seems to be quiet a few incorrect values "-999", for now we'll set these equal to the average of their respective column.

In [56]:
tx = np.where(tx==-999, np.nan, tx) # replace -999 value with nan
col_mean = np.nanmean(tx, axis=0)
inds_nan = np.where(np.isnan(tx))
tx[inds_nan] = np.take(col_mean, inds_nan[1])

In [57]:
tx

array([[  1. , 138.5,  51.7, ...,   1.2,  -2.5, 113.5],
       [  1. , 160.9,  68.8, ...,   0. ,   0. ,  46.2],
       [  1. ,   0. , 162.2, ...,   0. ,   0. ,  44.3],
       ...,
       [  1. , 105.5,  60.5, ...,   0. ,   0. ,  42. ],
       [  1. ,  95. ,  19.4, ...,   0. ,   0. ,   0. ],
       [  1. ,   0. ,  72.8, ...,   0. ,   0. ,   0. ]])

## Splitting the data into train and test datasets

In [58]:
ratio = 0.8 # 100ratio% of the data will be used to train the model
x_train, y_train, x_test, y_test = split_data(tx, y, ratio, seed=1)

## Testing the functions

In [59]:
weights_LS = least_squares(y_train, x_train)[1]
cat_accuracy_train, f1_score_train = metrics(weights_LS,y_train,x_train)
cat_accuracy_test, f1_score_test = metrics(weights_LS,y_test,x_test)


print("Categorical accuracy train : ",cat_accuracy_train," || f1_score train: ",f1_score_train)
print("Categorical accuracy test : ",cat_accuracy_test," || f1_score test: ",f1_score_test)

Categorical accuracy train :  0.746285  || f1_score train:  0.2892969865966419
Categorical accuracy test :  0.747645431826469  || f1_score test:  0.28738319536360796


In [60]:
weights_LSGD1 = least_squares_GD(y_train, x_train, np.zeros(31), 100, 0.000001)[1]
cat_accuracy_train, f1_score_train = metrics(weights_LSGD1,y_train,x_train)
cat_accuracy_test, f1_score_test = metrics(weights_LSGD1,y_test,x_test)


print("Categorical accuracy train : ",cat_accuracy_train," || f1_score train: ",f1_score_train)
print("Categorical accuracy test : ",cat_accuracy_test," || f1_score test: ",f1_score_test)

# THIS CONVERGES INTO A LOCAL MINIMUM, solution: Dynamic Step Size ?

Categorical accuracy train :  0.693025  || f1_score train:  0.10416656792534208
Categorical accuracy test :  0.693083483782606  || f1_score test:  0.10352591942924126


In [11]:
weights_LSGD2 = least_squares_GD(y, tx, [-1.15430619e+00,  1.82637143e-04, -7.20672107e-03, -6.45384640e-03,
        -1.73123839e-05,  2.32665195e-02,  4.20381986e-04,  2.50304362e-03,
         3.60203545e-01, -1.26385429e-03, -2.84568774e+00, -2.22719927e-01,
         9.89163555e-02,  3.56752661e-01,  2.85396180e+00, -6.42020723e-04,
        -4.57219107e-04,  2.85879539e+00, -6.80776737e-04,  1.38605239e-03,
         3.15125355e-03,  5.15272243e-04, -3.71558734e-04,  4.27220740e-02,
        -1.01225645e-03,  4.70620275e-04,  1.34341503e-04, -2.12423006e-03,
         1.42389540e-03, -1.78105047e-03,  2.84577828e+00], 100, 0.000001)[1]

cat_accuracy_train, f1_score_train = metrics(weights_LSGD2,y_train,x_train)
cat_accuracy_test, f1_score_test = metrics(weights_LSGD2,y_test,x_test)


print("Categorical accuracy train : ",cat_accuracy_train," || f1_score train: ",f1_score_train)
print("Categorical accuracy test : ",cat_accuracy_test," || f1_score test: ",f1_score_test)

# IT STAYS IN THE ABSOLUTE MINIMUM WHEN USING THAT AS A STARTING POINT

Categorical accuracy train :  0.73289  || f1_score train:  0.2737637991006309
Categorical accuracy test :  0.7327931982106172  || f1_score test:  0.2720790496826641


In [92]:
weights_LSSGD = least_squares_SGD(y, tx, np.zeros(31), 100, 0.000001)[1]

cat_accuracy_train, f1_score_train = metrics(weights_LSSGD,y_train,x_train)
cat_accuracy_test, f1_score_test = metrics(weights_LSSGD,y_test,x_test)


print("Categorical accuracy train : ",cat_accuracy_train," || f1_score train: ",f1_score_train)
print("Categorical accuracy test : ",cat_accuracy_test," || f1_score train: ",f1_score_test)

Categorical accuracy train :  0.721595  || f1_score train:  0.375502578889656
Categorical accuracy test :  0.7212315792282175  || f1_score train:  0.37321411778322733


In [61]:
lambda_ = 0.005

weights_RR = ridge_regression(y_train, x_train, lambda_)[1]
cat_acc_test, f1_score_test = metrics(weights_RR, y_test, x_test)

print(cat_acc_test, f1_score_test)

0.747645431826469 0.28738319536360796


In [65]:
num_iterations = 10000
lr = 0.0001
w = np.zeros(31)
%run implementations.py
weights_log_reg = logistic_regression(x_train, y_train, w, lr, num_iterations)

Step:  0  loss:  -1.7506894657049745  accuracy:  0.65668  f1_score:  0.0


  loss = -np.mean(y*np.log(sigmoid_prediction)+(1-y)*np.log(1-sigmoid_prediction))
  loss = -np.mean(y*np.log(sigmoid_prediction)+(1-y)*np.log(1-sigmoid_prediction))


Step:  10  loss:  nan  accuracy:  0.658485  f1_score:  0.0037927936919852283
Step:  20  loss:  nan  accuracy:  0.659315  f1_score:  0.005663914482868653
Step:  30  loss:  nan  accuracy:  0.659555  f1_score:  0.006151024090680251
Step:  40  loss:  nan  accuracy:  0.65964  f1_score:  0.0063199657087034105
Step:  50  loss:  nan  accuracy:  0.659685  f1_score:  0.00640939379893642
Step:  60  loss:  nan  accuracy:  0.659695  f1_score:  0.0064292656183807225
Step:  70  loss:  nan  accuracy:  0.659705  f1_score:  0.0064491370416701966
Step:  80  loss:  nan  accuracy:  0.659705  f1_score:  0.0064491370416701966


KeyboardInterrupt: 

## Generate predictions and save ouput in csv format for submission:

In [21]:
DATA_TEST_PATH = 'test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [18]:
tX_test = np.insert(tX_test, 0, 1, axis=1)

NameError: name 'tX_test' is not defined

In [42]:
time_day = datetime.datetime.now().day
time_hour = datetime.datetime.now().hour
time_min = datetime.datetime.now().minute
time_second = datetime.datetime.now().second

time = str(time_day)+"-"+str(time_hour)+"-"+str(time_min)+"-"+str(time_second)

OUTPUT_PATH = 'submission'+"_"+str(time)+".csv"
print(weights_LS.shape)
y_pred = predict_labels(weights_LS, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

(30,)
