In [33]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

import pandas as pd
from implementations import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
#TODO move this to a helpers folder and also why was a helper not provided anywhere?
def load_csv_data(data_path):
    """Loads data and returns y (class labels), tX (features) and ids (event ids)"""
    y = np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    x = np.genfromtxt(data_path, delimiter=",", skip_header=1)
    ids = x[:, 0].astype(np.int64) #check if int 64 precision was actually needed
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1,1)
    yb = np.ones(len(y))
    yb[np.where(y=='b')] = -1

    return yb, input_data, ids

# Code start 

In [35]:
import datetime

DATA_TRAIN_PATH = 'train.csv'
DATA_TEST_PATH = 'test.csv'
y_train, tX_train, ids_train = load_csv_data(DATA_TRAIN_PATH)
y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)


### Useful  functions 

In [36]:
def clean_data(data):
    data_cleaned = data
    
    data_cleaned[data_cleaned == -999] = np.NaN
    
    #replace NaN's by mean of columns
    medians = np.nanmedian(data_cleaned, axis=0)
    sq_std = np.std(data_cleaned, axis=0) **2
    inds = np.where(np.isnan(data_cleaned))
    data_cleaned[inds] = np.take(medians, inds[1])
    
    #standardize the columns 
    data_cleaned = (data_cleaned - medians)  / sq_std 
    
    
    #augment the data 
    
    
    
    return data 

In [37]:
def split_data(x, y, ratio, seed=1):
    """split the dataset based on the split ratio."""
    # set seed
    np.random.seed(seed)
    # generate random indices
    num_row = len(y)
    indices = np.random.permutation(num_row)
    index_split = int(np.floor(ratio * num_row))
    index_tr = indices[: index_split]
    index_te = indices[index_split:]
    # create split
    x_tr = x[index_tr]
    x_te = x[index_te]
    y_tr = y[index_tr]
    y_te = y[index_te]
    return x_tr, x_te, y_tr, y_te

In [38]:
def real_positives(pred, act):
    tot = 0
    good = 0
    for p,a in zip(pred,act):
        if(a == 1.0):
            tot += 1
            if(p == 1.0):
                good += 1
    return good / tot
                
            
def real_negatives(pred,act):
    tot = 0
    good = 0
    for p,a in zip(pred,act):
        if(a == 0.0):
            tot += 1
            if(p == 0.0):
                good += 1
    return good/tot
        

In [39]:
def build_poly(x, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree.
    Args:
        x: numpy array of shape (N,), N is the number of samples.
        degree: integer.
        
    Returns:
        poly: numpy array of shape (N,d+1)
    """
    degrees = x
    for i in range(degree):
        degree_matrix = x**(i+2)
        degrees = np.c_[degrees, degree_matrix]
        
    return degrees

def pairwise_column(x):
    x_aug = x 
    for i in range(x.shape[1]):
        if i!=1:
            x_aug = np.c_[x_aug, np.multiply(x[:, i], x[:,1])]
    return x_aug
        

In [40]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold.
    
    Args:
        y:      shape=(N,)
        k_fold: K in K-fold, i.e. the fold num
        seed:   the random seed

    Returns:
        A 2D array of shape=(k_fold, N/k_fold) that indicates the data indices for each fold

    >>> build_k_indices(np.array([1., 2., 3., 4.]), 2, 1)
    array([[3, 2],
           [0, 1]])
    """
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval] for k in range(k_fold)]
    return np.array(k_indices)




In [48]:
def cross_validation(y, x, k_indices):
    """return the loss of ridge regression for a fold corresponding to k_indices
    
    Args:
        y:          shape=(N,)
        x:          shape=(N,)
        k_indices:  2D array returned by build_k_indices()
        k:          scalar, the k-th fold (N.B.: not to confused with k_fold which is the fold nums)
        

    Returns:
            ws - the parameters for each fold 
            precsions - the precision of each fold 
            best k - the k-th fold that leads to the best score 
    >>> cross_validation(np.array([1.,2.,3.,4.]), np.array([6.,7.,8.,9.]), np.array([[3,2], [0,1]]), 1, 2, 3)
    (0.019866645527597114, 0.33555914361295175)
    """


    precisions = []
    ws = []
    
    for k in range(k_indices.shape[0]):
        train_folds = np.delete(k_indices, k, axis=0)

        train = x[train_folds][0]
        y_train = y[train_folds][0]
        
        w, loss_train = ridge_regression(y_train, train, 0.000001)

        test = x[k_indices[k]]
        y_test = y[k_indices[k]]
        
        #do the prediction on the k-th fold 
        pred = test.dot(w)
        
        #formating the prediction 
        
        pred = (pred )/(pred.max()- pred.min())
        pred[pred > 0] = 1
        pred[pred < 0] = -1 

        #precision 
        errors = np.sum(np.abs((y_test - pred)))/2
        precision = 1- (errors / len(y_test))
        
        ws.append(w)
        
        precisions = np.append(precisions, precision)
    
    return ws, precisions, np.argmax(precisions)


#### Here we do a cross validation (it returns us various w's, losses and the fold that generates the most precise model )

In [42]:
# cleaning the training and test set
tX_train_clean = clean_data(tX_train)
tX_test_clean = clean_data(tX_test)
# adding features with polynomial basis function 
tX_train_clean_poly = build_poly(tX_train_clean, 3)
tX_test_clean_poly = build_poly(tX_test_clean, 3)
# adding the pairwise multiplication
tX_train_clean_pc = pairwise_column(tX_train_clean)
tX_test_clean_pc = pairwise_column(tX_test_clean)
# mixing it all together 
tX_test_clean_aug = np.c_[tX_test_clean_pc[:,30:], tX_test_clean_poly]
tX_train_clean_aug = np.c_[tX_train_clean_pc[:,30:], tX_train_clean_poly]

In [43]:
print(tX_test_clean.shape)
print(tX_train_clean.shape)
print(tX_test_clean_poly.shape)
print(tX_test_clean_aug.shape)

(568238, 30)
(250000, 30)
(568238, 120)
(568238, 149)


In [51]:
#trying the cross validation 
k_indices = build_k_indices(y_train, 10, 10)
ws, precisions, best_k = cross_validation(y_train, tX_train_clean_aug, k_indices)
print(precisions)
cross_val_std = np.std(precisions)
cross_val_mean = np.mean(precisions)

print("the mean is" + str(cross_val_mean) + "the std is " + str(cross_val_std))

[0.79552 0.79348 0.80048 0.79672 0.79828 0.79504 0.79672 0.80044 0.8008
 0.80064]
the mean is0.7978120000000001the std is 0.002554559844669927


In [122]:
w = ws[best_k]
print(precisions[best_k])
print(w.shape)


0.82216
(299,)


#### Once all the cells above have been run got to the part prediction of the test set 

In [78]:
print(w)

#tX_train_aug = build_poly(tX_train, 3)

0.0017057552688037572


In [40]:
#tX_train[0]

In [41]:
#tX_train_aug[0]

##### Here we do not do the cross validation 

In [55]:
#cleaning the training and test set
tX_train_clean = clean_data(tX_train)
tX_test_clean = clean_data(tX_test)

In [56]:
#adding features with polynomial basis function 
tX_train_clean_poly = build_poly(tX_train_clean, 5)
tX_test_clean_poly = build_poly(tX_test_clean, 5)
#adding the pairwise multiplication
tX_train_clean_pc = pairwise_column(tX_train_clean)
tX_test_clean_pc = pairwise_column(tX_test_clean)

In [29]:
tX_test_clean_aug = np.c_[tX_test_clean_pc[:,30:], tX_test_clean_poly]
tX_train_clean_aug = np.c_[tX_train_clean_pc[:,30:], tX_train_clean_poly]

In [30]:
tX_train_clean_pc.shape

(250000, 59)

In [31]:
print(y_train.shape)
print(tX_train_clean_aug.shape)


(250000,)
(250000, 209)


In [32]:
#splitting the dataset into train and validation 
xTr, xVal, yTr, yVal = split_data(tX_train_clean_aug, y_train, ratio=0.75, seed= 1)

In [33]:
#train the model on the training set 
yTr.shape

(187500,)

In [48]:
w, loss = ridge_regression(yTr, xTr, 0.00001)
print(loss)

0.28611428959152935


In [109]:
#predict on the validation set 
pred = xVal.dot(w)
print(pred)
#format the predictions  
pred = (pred )/(pred.max()- pred.min())
print(pred)
pred[pred > 0] = 1
pred[pred < 0] = -1 

#precision 
errors = np.sum(np.abs((yVal - pred)))/2
print("the number of errors : "+str(errors))

#real_positives = real_positives(pred, yVal) 
#real_negatives = real_negatives(pred, yVal)
#TODO: add the F1 score calculation as well
print("the precision is :")
print(1- (errors / len(yVal)))
print("the real positives are :")
#print(real_positives)
print("the real negatives are :")
#print(real_negatives)


NameError: name 'xVal' is not defined

## Prediction on the test set 

In [126]:
pred_test = tX_test_clean_aug.dot(w)
print(pred_test)
#format the predictions  
pred_test = pred_test /(pred_test.max()- pred_test.min())
print(pred_test)
pred_test[pred_test > 0] = 1
pred_test[pred_test < 0] = -1 
print("we set everything to 1 or -1")
print(pred_test)

[-0.9715329  -0.71737571 -0.02043206 ...  0.15877099 -0.14964181
 -0.93231669]
[-4.34137012e-08 -3.20564901e-08 -9.13022449e-10 ...  7.09480477e-09
 -6.68686035e-09 -4.16612944e-08]
we set everything to 1 or -1
[-1. -1. -1. ...  1. -1. -1.]


In [127]:
import csv 
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in .csv format for submission to Kaggle or AIcrowd
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    with open(name, "w") as csvfile:
        fieldnames = ["Id", "Prediction"]
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({"Id": int(r1), "Prediction": int(r2)})


In [128]:
create_csv_submission(ids_test, pred_test, "pred1.csv")