In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import datetime
import copy
%load_ext autoreload
%autoreload 2

In [2]:
%run proj1_helpers.py
%run implementations.py

In [3]:
# Location of the training CSV; change it to wherever the file was downloaded.
DATA_TRAIN_PATH = 'train.csv' # TODO: download train data and supply path here 
# load_csv_data is not defined in this notebook -- presumably it comes from
# proj1_helpers.py, which is %run in an earlier cell (TODO confirm).
# The result is unpacked into y (used below as the labels), tX (the feature
# matrix) and ids (not used in the visible cells).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Preprocessing

In [4]:
# Prepend the labels as column 0 of the feature matrix so that rows can be
# shuffled without keeping the indices in a separate variable.
# As a consequence every feature column of tX sits one index higher in X.
# reshape(-1, 1) lets NumPy infer the number of rows from y instead of
# hard-coding 250000, so the cell also works on a dataset of another size.
Y = np.reshape(y, (-1, 1))
X = np.append(Y, tX, axis=1)
print(X)

[[   1.     138.47    51.655 ...    1.24    -2.475  113.497]
 [  -1.     160.937   68.768 ... -999.    -999.      46.226]
 [  -1.    -999.     162.172 ... -999.    -999.      44.251]
 ...
 [   1.     105.457   60.526 ... -999.    -999.      41.992]
 [  -1.      94.951   19.362 ... -999.    -999.       0.   ]
 [  -1.    -999.      72.756 ... -999.    -999.       0.   ]]


In [5]:
# Partition the rows into four subsets according to the value stored in
# column 23 of X (0, 1, 2 or 3), then report the size of each subset.
# NOTE(review): column 23 of X is column 22 of tX -- presumably a categorical
# "number of jets" feature; confirm against the dataset description.
group_key = X[:, 23]
X_0, X_1, X_2, X_3 = (X[group_key == value] for value in (0, 1, 2, 3))
for subset in (X_0, X_1, X_2, X_3):
    print(subset.shape)

(99913, 31)
(77544, 31)
(50379, 31)
(22164, 31)


In [6]:
# Stack subsets 2 and 3 on top of each other so they are handled as one group
X_2_3 = np.vstack((X_2, X_3))
print(X_2_3.shape)

(72543, 31)


In [7]:
# Column 23 is the value the subsets were split on, so it is constant inside
# X_0 and inside X_1. Drop it there to keep the design matrix from becoming
# singular. (X_2_3 keeps the column because it mixes the values 2 and 3.)
X_0, X_1 = (np.delete(subset, 23, axis=1) for subset in (X_0, X_1))

In [8]:
# Flag the columns of X_0 in which every entry is -999 (the value that the
# later cells treat as missing). These columns are removed further down.
# Reducing over axis 0 covers however many columns X_0 currently has, instead
# of relying on a hard-coded count of 30.
where = np.all(X_0 == -999, axis=0)
print(where)
print(where.shape)

[False False False False False  True  True  True False False False False
 False  True False False False False False False False False False  True
  True  True  True  True  True False]
(30,)


In [9]:
# Turn the boolean mask into the list of column indices to delete.
# flatnonzero follows the length of `where`, so no column count is hard-coded,
# and .tolist() keeps `remove` a plain list of Python ints as before.
remove = np.flatnonzero(where).tolist()
print(remove)

[5, 6, 7, 13, 23, 24, 25, 26, 27, 28]


In [10]:
# Drop from X_0 the columns listed in `remove` (those made up only of -999)
X_0 = np.delete(X_0, remove, axis=1)

In [11]:
# The last remaining column of X_0 (index 19) contains only zeros, so it
# carries no information; remove it as well.
# NOTE(review): presumably this is an aggregate over jets that is identically
# zero for the subset whose split value is 0 -- confirm against the dataset.
X_0 = np.delete(X_0, 19, axis=1)

In [12]:
# Same clean-up for X_1: find the columns that consist only of -999 and drop
# them. The mask is computed over all columns of X_1 rather than a hard-coded
# range of 30, so it stays correct if the column count changes.
where = np.all(X_1 == -999, axis=0)
remove = np.flatnonzero(where).tolist()
X_1 = np.delete(X_1, remove, axis=1)

# Shapes of the three groups after the column clean-up
print(X_0.shape)
print(X_1.shape)
print(X_2_3.shape)

(99913, 19)
(77544, 23)
(72543, 31)


In [13]:
# Mark every remaining -999 as NaN in all three groups, so that the median
# of the first feature column can be computed while ignoring those entries.
X_0, X_1, X_2_3 = (
    np.where(subset == -999, np.nan, subset) for subset in (X_0, X_1, X_2_3)
)

In [14]:
# Fill the NaN entries of column 1 (the first feature; column 0 holds the
# labels) with that column's median, separately for each group.
# NOTE(review): the median is taken over all rows of a group, i.e. before the
# train/validation split performed further down.
for subset in (X_0, X_1, X_2_3):
    median = np.nanmedian(subset[:, 1])
    print(median)
    missing = np.isnan(subset[:, 1])
    subset[missing, 1] = median  # in-place write into X_0 / X_1 / X_2_3

111.452
112.4055
113.23


In [15]:
# Simple hold-out split (no cross-validation): shuffle the rows of each group
# in place, then keep the last 100 rows for validation and the rest for
# training. Column 0 (the labels) is excluded from the feature matrices.
# NOTE(review): the shuffle is in place, so re-running this cell permutes the
# already shuffled arrays again.
np.random.seed(1)

for subset in (X_0, X_1, X_2_3):
    np.random.shuffle(subset)

X_0_t, X_0_v = X_0[:-100, 1:], X_0[-100:, 1:]
X_1_t, X_1_v = X_1[:-100, 1:], X_1[-100:, 1:]
X_2_3_t, X_2_3_v = X_2_3[:-100, 1:], X_2_3[-100:, 1:]

In [16]:
# Separate the labels (column 0) from the shuffled matrices, using the same
# train / validation row ranges as for the features.
Y_0_t, Y_0_v = X_0[:-100, 0], X_0[-100:, 0]
Y_1_t, Y_1_v = X_1[:-100, 0], X_1[-100:, 0]
Y_2_3_t, Y_2_3_v = X_2_3[:-100, 0], X_2_3[-100:, 0]

## Helper functions

In [17]:
def build_3D_poly(x, degree):
    """Stack the element-wise powers x**1 ... x**degree along a third axis.

    x is a 2-D array of shape (n_rows, n_cols). The result is a float array
    of shape (n_rows, n_cols, degree) whose slice [:, :, k] equals x**(k + 1).
    No constant (power 0) term is included.
    """
    n_rows, n_cols = x.shape[0], x.shape[1]
    powers = np.zeros((n_rows, n_cols, degree))
    for k in range(degree):
        powers[:, :, k] = x ** (k + 1)
    return powers

In [18]:
def build_poly(x, degree):
    """Build the polynomial feature matrix of x up to the given degree.

    The powers produced by build_3D_poly are laid out side by side, giving
    the column order [x**1 | x**2 | ... | x**degree]; for a 2-D x of shape
    (n_rows, n_cols) the result has shape (n_rows, n_cols * degree).
    """
    powers = build_3D_poly(x, degree)
    first = powers[:, :, 0]
    higher = [powers[:, :, k] for k in range(1, degree)]
    if not higher:
        # degree 1: nothing to append, hand back the first-power slice
        return first
    return np.concatenate([first] + higher, axis=1)

In [19]:
def least_squares(y, x):
    """Calculate the least squares solution via the normal equations.

    Solves (x^T x) w = x^T y for w.

    y: targets, shape (n_rows,) or (n_rows, k).
    x: design matrix, shape (n_rows, n_features).
    Returns the weights, shape (n_features,) or (n_features, k).
    Raises numpy.linalg.LinAlgError if x^T x is singular.
    """
    x_t = np.transpose(x)
    gram = np.dot(x_t, x)
    # Solve the linear system directly instead of forming the explicit
    # inverse of the Gram matrix: same solution, but cheaper and numerically
    # more accurate.
    return np.linalg.solve(gram, np.dot(x_t, y))

In [20]:
def predict_labels(weights, data):
    """Turn the linear scores data . weights into class labels.

    Scores that are <= 0 become -1 and scores that are > 0 become 1; the
    labels are written into the score array, which is then returned.
    """
    scores = np.dot(data, weights)
    # Both masks are taken from the raw scores before anything is overwritten
    is_negative = scores <= 0
    is_positive = scores > 0
    scores[is_negative] = -1
    scores[is_positive] = 1
    return scores

In [21]:
def accuracy(y_pred, y_real):
    """Return the share of matching predictions as a string such as '79.0 %'.

    Entries are compared position by position over the length of y_pred.
    """
    total = len(y_pred)
    hits = 0
    for idx in range(total):
        if y_pred[idx] == y_real[idx]:
            hits += 1
    return str(hits / total * 100) + ' %'

## Results

In [22]:
# Expand the training features of group 0 with the powers 1..3, fit the
# weights by least squares, and report the accuracy on those same training rows.
X_0_t_3 = build_poly(X_0_t, 3)
ws = least_squares(Y_0_t, X_0_t_3)
y_pred = predict_labels(ws, X_0_t_3)
print('training accuracy: '+accuracy(y_pred, Y_0_t))

training accuracy: 82.14360854798474 %


In [23]:
# Apply the same degree-3 expansion to the 100 held-out rows of group 0 and
# score them with the weights `ws` fitted on the training rows.
X_0_v_3 = build_poly(X_0_v, 3)
y_pred = predict_labels(ws, X_0_v_3)
print('validation accuracy: '+accuracy(y_pred, Y_0_v))

validation accuracy: 79.0 %
