In [3]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import datetime
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [61]:
%run proj1_helpers.py
DATA_TRAIN_PATH = 'train.csv' # TODO: download train data and supply path here 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [62]:
print(y)
print('----------------------------------')
print(tX)
print('----------------------------------')
print(ids)

[ 1. -1. -1. ...  1. -1. -1.]
----------------------------------
[[ 138.47    51.655   97.827 ...    1.24    -2.475  113.497]
 [ 160.937   68.768  103.235 ... -999.    -999.      46.226]
 [-999.     162.172  125.953 ... -999.    -999.      44.251]
 ...
 [ 105.457   60.526   75.839 ... -999.    -999.      41.992]
 [  94.951   19.362   68.812 ... -999.    -999.       0.   ]
 [-999.      72.756   70.831 ... -999.    -999.       0.   ]]
----------------------------------
[100000 100001 100002 ... 349997 349998 349999]


In [63]:
tx = np.insert(tX, 0, 1, axis=1)

In [64]:
tx

array([[   1.   ,  138.47 ,   51.655, ...,    1.24 ,   -2.475,  113.497],
       [   1.   ,  160.937,   68.768, ..., -999.   , -999.   ,   46.226],
       [   1.   , -999.   ,  162.172, ..., -999.   , -999.   ,   44.251],
       ...,
       [   1.   ,  105.457,   60.526, ..., -999.   , -999.   ,   41.992],
       [   1.   ,   94.951,   19.362, ..., -999.   , -999.   ,    0.   ],
       [   1.   , -999.   ,   72.756, ..., -999.   , -999.   ,    0.   ]])

## Cleaning the data
There seems to be quiet a few incorrect values "-999", for now we'll set these equal to the average of their respective column.

In [65]:
tx = np.where(tx==-999, np.nan, tx) # replace -999 value with nan
col_mean = np.nanmean(tx, axis=0)
inds_nan = np.where(np.isnan(tx))
tx[inds_nan] = np.take(col_mean, inds_nan[1])

In [66]:
tx

array([[ 1.00000000e+00,  1.38470000e+02,  5.16550000e+01, ...,
         1.24000000e+00, -2.47500000e+00,  1.13497000e+02],
       [ 1.00000000e+00,  1.60937000e+02,  6.87680000e+01, ...,
        -1.18452642e-02, -1.58228913e-03,  4.62260000e+01],
       [ 1.00000000e+00,  1.21858528e+02,  1.62172000e+02, ...,
        -1.18452642e-02, -1.58228913e-03,  4.42510000e+01],
       ...,
       [ 1.00000000e+00,  1.05457000e+02,  6.05260000e+01, ...,
        -1.18452642e-02, -1.58228913e-03,  4.19920000e+01],
       [ 1.00000000e+00,  9.49510000e+01,  1.93620000e+01, ...,
        -1.18452642e-02, -1.58228913e-03,  0.00000000e+00],
       [ 1.00000000e+00,  1.21858528e+02,  7.27560000e+01, ...,
        -1.18452642e-02, -1.58228913e-03,  0.00000000e+00]])

Changing predictions {-1;1} --> {0;1}

In [67]:
y_log = np.copy(y)
y_log = np.where(y_log==-1, 0, y_log)

In [68]:
y_log

array([1., 0., 0., ..., 1., 0., 0.])

Getting the min and max values for each columns of tx

In [75]:
from implementations import *
minmax = dataset_minmax(tx)

In [76]:
minmax

[[1.0, 1.0],
 [9.044, 1192.026],
 [0.0, 690.075],
 [6.329, 1349.351],
 [0.0, 2834.999],
 [0.0, 8.503],
 [13.602, 4974.979],
 [-18.066, 16.69],
 [0.208, 5.684],
 [0.0, 2834.999],
 [46.104, 1852.462],
 [0.047, 19.773],
 [-1.414, 1.414],
 [0.0, 1.0],
 [20.0, 764.408],
 [-2.499, 2.497],
 [-3.142, 3.142],
 [26.0, 560.271],
 [-2.505, 2.503],
 [-3.142, 3.142],
 [0.109, 2842.617],
 [-3.142, 3.142],
 [13.678, 2003.976],
 [0.0, 3.0],
 [30.0, 1120.573],
 [-4.499, 4.499],
 [-3.142, 3.141],
 [30.0, 721.456],
 [-4.5, 4.5],
 [-3.142, 3.142],
 [0.0, 1633.433]]

Normalized dataset

In [77]:
tx_normalized = np.copy(tx)
normalize_dataset(tx_normalized, minmax)

In [78]:
print(tx_normalized)

[[1.         0.10940657 0.07485418 ... 0.63777778 0.10614258 0.06948372]
 [1.         0.1283984  0.09965294 ... 0.49868386 0.4997482  0.02829991]
 [1.         0.09536454 0.23500634 ... 0.49868386 0.4997482  0.0270908 ]
 ...
 [1.         0.08149997 0.08770931 ... 0.49868386 0.4997482  0.02570782]
 [1.         0.07261903 0.02805782 ... 0.49868386 0.4997482  0.        ]
 [1.         0.09536454 0.10543202 ... 0.49868386 0.4997482  0.        ]]


Standardized dataset (works well if features are distributed according to a gaussian curve)

In [82]:
tx_standardized = np.copy(tx)
means = column_means(tx_standardized)
stdevs = column_stdevs(tx_standardized, means)
standardize_dataset(tx_standardized, means, stdevs)

In [83]:
print(tx_standardized)

[[ 1.00000000e+00  3.14910026e-01  6.83318303e-02 ...  1.14381645e+00
  -2.52713783e+00  4.12509672e-01]
 [ 1.00000000e+00  7.40825545e-01  5.52503718e-01 ...  2.27720450e-14
  -9.07313667e-15 -2.73819417e-01]
 [ 1.00000000e+00 -1.00190088e-12  3.19514914e+00 ...  2.27720450e-14
  -9.07313667e-15 -2.93969258e-01]
 ...
 [ 1.00000000e+00 -3.10930051e-01  3.19315808e-01 ...  2.27720450e-14
  -9.07313667e-15 -3.17016595e-01]
 [ 1.00000000e+00 -5.10096314e-01 -8.45322280e-01 ...  2.27720450e-14
  -9.07313667e-15 -7.45437922e-01]
 [ 1.00000000e+00 -1.00190088e-12  6.65334753e-01 ...  2.27720450e-14
  -9.07313667e-15 -7.45437922e-01]]


## Splitting the data into train and test datasets

In [25]:
ratio = 0.8 # 100ratio% of the data will be used to train the model

In [26]:
x_train, y_train, x_test, y_test = split_data(tx, y, ratio, seed=1)

In [27]:
x_train_log, y_train_log, x_test_log, y_test_log = split_data(tx, y_log, ratio, seed=1)

In [84]:
x_train_log_norm, y_train_log_norm, x_test_log_norm, y_test_log_norm = split_data(tx_normalized, y_log, ratio, seed=1)

In [85]:
x_train_log_stand, y_train_log_stand, x_test_log_stand, y_test_log_stand = split_data(tx_standardized, y_log, ratio, seed=1)

## Testing the functions

In [59]:
weights_LS = least_squares(y_train, x_train)[1]
cat_accuracy_train, f1_score_train = metrics(weights_LS,y_train,x_train)
cat_accuracy_test, f1_score_test = metrics(weights_LS,y_test,x_test)


print("Categorical accuracy train : ",cat_accuracy_train," || f1_score train: ",f1_score_train)
print("Categorical accuracy test : ",cat_accuracy_test," || f1_score test: ",f1_score_test)

Categorical accuracy train :  0.746285  || f1_score train:  0.2892969865966419
Categorical accuracy test :  0.747645431826469  || f1_score test:  0.28738319536360796


In [60]:
weights_LSGD1 = least_squares_GD(y_train, x_train, np.zeros(31), 100, 0.000001)[1]
cat_accuracy_train, f1_score_train = metrics(weights_LSGD1,y_train,x_train)
cat_accuracy_test, f1_score_test = metrics(weights_LSGD1,y_test,x_test)


print("Categorical accuracy train : ",cat_accuracy_train," || f1_score train: ",f1_score_train)
print("Categorical accuracy test : ",cat_accuracy_test," || f1_score test: ",f1_score_test)

# THIS CONVERGES INTO A LOCAL MINIMUM, solution: Dynamic Step Size ?

Categorical accuracy train :  0.693025  || f1_score train:  0.10416656792534208
Categorical accuracy test :  0.693083483782606  || f1_score test:  0.10352591942924126


In [11]:
weights_LSGD2 = least_squares_GD(y, tx, [-1.15430619e+00,  1.82637143e-04, -7.20672107e-03, -6.45384640e-03,
        -1.73123839e-05,  2.32665195e-02,  4.20381986e-04,  2.50304362e-03,
         3.60203545e-01, -1.26385429e-03, -2.84568774e+00, -2.22719927e-01,
         9.89163555e-02,  3.56752661e-01,  2.85396180e+00, -6.42020723e-04,
        -4.57219107e-04,  2.85879539e+00, -6.80776737e-04,  1.38605239e-03,
         3.15125355e-03,  5.15272243e-04, -3.71558734e-04,  4.27220740e-02,
        -1.01225645e-03,  4.70620275e-04,  1.34341503e-04, -2.12423006e-03,
         1.42389540e-03, -1.78105047e-03,  2.84577828e+00], 100, 0.000001)[1]

cat_accuracy_train, f1_score_train = metrics(weights_LSGD2,y_train,x_train)
cat_accuracy_test, f1_score_test = metrics(weights_LSGD2,y_test,x_test)


print("Categorical accuracy train : ",cat_accuracy_train," || f1_score train: ",f1_score_train)
print("Categorical accuracy test : ",cat_accuracy_test," || f1_score test: ",f1_score_test)

# IT STAYS IN THE ABSOLUTE MINIMUM WHEN USING THAT AS A STARTING POINT

Categorical accuracy train :  0.73289  || f1_score train:  0.2737637991006309
Categorical accuracy test :  0.7327931982106172  || f1_score test:  0.2720790496826641


In [92]:
weights_LSSGD = least_squares_SGD(y, tx, np.zeros(31), 100, 0.000001)[1]

cat_accuracy_train, f1_score_train = metrics(weights_LSSGD,y_train,x_train)
cat_accuracy_test, f1_score_test = metrics(weights_LSSGD,y_test,x_test)


print("Categorical accuracy train : ",cat_accuracy_train," || f1_score train: ",f1_score_train)
print("Categorical accuracy test : ",cat_accuracy_test," || f1_score train: ",f1_score_test)

Categorical accuracy train :  0.721595  || f1_score train:  0.375502578889656
Categorical accuracy test :  0.7212315792282175  || f1_score train:  0.37321411778322733


In [61]:
lambda_ = 0.005

weights_RR = ridge_regression(y_train, x_train, lambda_)[1]
cat_acc_test, f1_score_test = metrics(weights_RR, y_test, x_test)

print(cat_acc_test, f1_score_test)

0.747645431826469 0.28738319536360796


In [87]:
num_iterations = 10000
lr = 0.005
w = np.zeros(31)
%run implementations.py
weights_log_reg = logistic_regression(x_train_log, y_train_log, w, lr, num_iterations)

  def compute_log_gradient(tx, y, ws, lr):
  def compute_log_gradient(tx, y, ws, lr):


Step:  0  loss:  nan  accuracy:  0.65668  f1_score:  0.0


  N = y.shape[0]
  N = y.shape[0]


Step:  100  loss:  nan  accuracy:  0.65698  f1_score:  0.0005998200539838049
Step:  200  loss:  nan  accuracy:  0.71535  f1_score:  0.37105693970467263
Step:  300  loss:  nan  accuracy:  0.578615  f1_score:  0.47623111002754215
Step:  400  loss:  nan  accuracy:  0.59427  f1_score:  0.47406870505655485
Step:  500  loss:  nan  accuracy:  0.600145  f1_score:  0.4731017792181518
Step:  600  loss:  nan  accuracy:  0.603315  f1_score:  0.4723495264283532
Step:  700  loss:  nan  accuracy:  0.60513  f1_score:  0.4717892291704872
Step:  800  loss:  nan  accuracy:  0.60655  f1_score:  0.4715089263878699
Step:  900  loss:  nan  accuracy:  0.60778  f1_score:  0.47129278947026476
Step:  1000  loss:  nan  accuracy:  0.608445  f1_score:  0.47099477080211616
Step:  1100  loss:  nan  accuracy:  0.6091  f1_score:  0.4707375632732333
Step:  1200  loss:  nan  accuracy:  0.60997  f1_score:  0.47069663591492483
Step:  1300  loss:  nan  accuracy:  0.610485  f1_score:  0.4706557063658956
Step:  1400  loss:  n

KeyboardInterrupt: 

In [93]:
num_iterations = 100000
lr = 0.005
w = np.zeros(31)
%run implementations.py
weights_log_reg = logistic_regression(x_train_log_stand, y_train, x_test_log_stand, y_test, w, lr, num_iterations)

Step:  0  loss:  0.6874130944712561  accuracy_train:  0.70014  f1_score_train:  0.21663434316413652  accuracy_test:  0.699122206312644  f1_score_test:  0.2138897272568424


  * np.log(1-sigmoid_prediction))
  * np.log(1-sigmoid_prediction))


Step:  10000  loss:  nan  accuracy_train:  0.713885  f1_score_train:  0.1746911805641117  accuracy_test:  0.7124358552485304  f1_score_test:  0.17108141738303012
Step:  20000  loss:  nan  accuracy_train:  0.71391  f1_score_train:  0.17559863169897377  accuracy_test:  0.7126759811812418  f1_score_test:  0.1722408706324114


  loss = -np.mean(y*np.log(sigmoid_prediction)+(1-y)
  ret = umr_sum(arr, axis, dtype, out, keepdims)


Step:  30000  loss:  nan  accuracy_train:  0.71392  f1_score_train:  0.17576503853696357  accuracy_test:  0.712569258544481  f1_score_test:  0.1722854356306892
Step:  40000  loss:  nan  accuracy_train:  0.71396  f1_score_train:  0.17587318670393967  accuracy_test:  0.7126404069689882  f1_score_test:  0.17238941217583525
Step:  50000  loss:  nan  accuracy_train:  0.713985  f1_score_train:  0.17596468667633405  accuracy_test:  0.7126848747343051  f1_score_test:  0.17244882203314074
Step:  60000  loss:  nan  accuracy_train:  0.713945  f1_score_train:  0.1759314150212048  accuracy_test:  0.7126670876281783  f1_score_test:  0.17246367389396353
Step:  70000  loss:  nan  accuracy_train:  0.713915  f1_score_train:  0.17592309691776606  accuracy_test:  0.7126670876281783  f1_score_test:  0.17246367389396353
Step:  80000  loss:  nan  accuracy_train:  0.7139  f1_score_train:  0.17591477873846267  accuracy_test:  0.7126404069689882  f1_score_test:  0.17247852551339668
Step:  90000  loss:  nan  acc

## Generate predictions and save ouput in csv format for submission:

In [21]:
DATA_TEST_PATH = 'test.csv'
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [18]:
tX_test = np.insert(tX_test, 0, 1, axis=1)

NameError: name 'tX_test' is not defined

In [42]:
time_day = datetime.datetime.now().day
time_hour = datetime.datetime.now().hour
time_min = datetime.datetime.now().minute
time_second = datetime.datetime.now().second

time = str(time_day)+"-"+str(time_hour)+"-"+str(time_min)+"-"+str(time_second)

OUTPUT_PATH = 'submission'+"_"+str(time)+".csv"
print(weights_LS.shape)
y_pred = predict_labels(weights_LS, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

(30,)
