In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

import pandas as pd
from implementations import *

In [2]:
#TODO move this to a helpers folder and also why was a helper not provided anywhere?
def load_csv_data(data_path):
    """Loads data and returns y (class labels), tX (features) and ids (event ids)"""
    y = np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    x = np.genfromtxt(data_path, delimiter=",", skip_header=1)
    ids = x[:, 0].astype(np.int64) #check if int 64 precision was actually needed
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1,1)
    yb = np.ones(len(y))
    yb[np.where(y=='b')] = -1

    return yb, input_data, ids

In [3]:
df = pd.read_csv('train.csv')


In [4]:
print(df.columns[0])

Id


In [5]:
df.columns

Index(['Id', 'Prediction', 'DER_mass_MMC', 'DER_mass_transverse_met_lep',
       'DER_mass_vis', 'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet',
       'DER_prodeta_jet_jet', 'DER_deltar_tau_lep', 'DER_pt_tot', 'DER_sum_pt',
       'DER_pt_ratio_lep_tau', 'DER_met_phi_centrality',
       'DER_lep_eta_centrality', 'PRI_tau_pt', 'PRI_tau_eta', 'PRI_tau_phi',
       'PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_met', 'PRI_met_phi',
       'PRI_met_sumet', 'PRI_jet_num', 'PRI_jet_leading_pt',
       'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 'PRI_jet_subleading_pt',
       'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'PRI_jet_all_pt'],
      dtype='object')

In [6]:
df['PRI_jet_num']

0         2
1         1
2         1
3         0
4         0
         ..
249995    0
249996    0
249997    1
249998    0
249999    0
Name: PRI_jet_num, Length: 250000, dtype: int64

# Code start 

In [7]:
import datetime

DATA_TRAIN_PATH = 'train.csv'
DATA_TEST_PATH = 'test.csv'
y_train, tX_train, ids_train = load_csv_data(DATA_TRAIN_PATH)
y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)


### Useful  functions 

In [8]:
def clean_data(data):
    data_cleaned = data
    
    data_cleaned[data_cleaned == -999] = np.NaN
    
    #replace NaN's by mean of columns
    medians = np.nanmedian(data_cleaned, axis=0)
    sq_std = np.std(data_cleaned, axis=0) **2
    inds = np.where(np.isnan(data_cleaned))
    data_cleaned[inds] = np.take(medians, inds[1])
    
    #standardize the columns 
    data_cleaned = (data_cleaned - medians)  / sq_std 
    
    
    #augment the data 
    
    
    
    return data 

In [9]:
def split_data(x, y, ratio, seed=1):
    """split the dataset based on the split ratio."""
    # set seed
    np.random.seed(seed)
    # generate random indices
    num_row = len(y)
    indices = np.random.permutation(num_row)
    index_split = int(np.floor(ratio * num_row))
    index_tr = indices[: index_split]
    index_te = indices[index_split:]
    # create split
    x_tr = x[index_tr]
    x_te = x[index_te]
    y_tr = y[index_tr]
    y_te = y[index_te]
    return x_tr, x_te, y_tr, y_te

In [10]:
def real_positives(pred, act):
    tot = 0
    good = 0
    for p,a in zip(pred,act):
        if(a == 1.0):
            tot += 1
            if(p == 1.0):
                good += 1
    return good / tot
                
            
def real_negatives(pred,act):
    tot = 0
    good = 0
    for p,a in zip(pred,act):
        if(a == 0.0):
            tot += 1
            if(p == 0.0):
                good += 1
    return good/tot
        

In [11]:
def build_poly(x, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree.
    Args:
        x: numpy array of shape (N,), N is the number of samples.
        degree: integer.
        
    Returns:
        poly: numpy array of shape (N,d+1)
    """
    degrees = x
    for i in range(degree):
        degree_matrix = x**(i+2)
        degrees = np.c_[degrees, degree_matrix]
        
    return degrees

def pairwise_column(x):
    x_aug = x 
    for i in range(x.shape[1]):
        if i!=1:
            x_aug = np.c_[x_aug, np.multiply(x[:, i], x[:,1])]
    return x_aug
        

In [12]:
tX_train_aug = build_poly(tX_train, 3)

In [13]:
tX_train[0]

array([ 1.38470e+02,  5.16550e+01,  9.78270e+01,  2.79800e+01,
        9.10000e-01,  1.24711e+02,  2.66600e+00,  3.06400e+00,
        4.19280e+01,  1.97760e+02,  1.58200e+00,  1.39600e+00,
        2.00000e-01,  3.26380e+01,  1.01700e+00,  3.81000e-01,
        5.16260e+01,  2.27300e+00, -2.41400e+00,  1.68240e+01,
       -2.77000e-01,  2.58733e+02,  2.00000e+00,  6.74350e+01,
        2.15000e+00,  4.44000e-01,  4.60620e+01,  1.24000e+00,
       -2.47500e+00,  1.13497e+02])

In [14]:
tX_train_aug[0]

array([ 1.38470000e+02,  5.16550000e+01,  9.78270000e+01,  2.79800000e+01,
        9.10000000e-01,  1.24711000e+02,  2.66600000e+00,  3.06400000e+00,
        4.19280000e+01,  1.97760000e+02,  1.58200000e+00,  1.39600000e+00,
        2.00000000e-01,  3.26380000e+01,  1.01700000e+00,  3.81000000e-01,
        5.16260000e+01,  2.27300000e+00, -2.41400000e+00,  1.68240000e+01,
       -2.77000000e-01,  2.58733000e+02,  2.00000000e+00,  6.74350000e+01,
        2.15000000e+00,  4.44000000e-01,  4.60620000e+01,  1.24000000e+00,
       -2.47500000e+00,  1.13497000e+02,  1.91739409e+04,  2.66823903e+03,
        9.57012193e+03,  7.82880400e+02,  8.28100000e-01,  1.55528335e+04,
        7.10755600e+00,  9.38809600e+00,  1.75795718e+03,  3.91090176e+04,
        2.50272400e+00,  1.94881600e+00,  4.00000000e-02,  1.06523904e+03,
        1.03428900e+00,  1.45161000e-01,  2.66524388e+03,  5.16652900e+00,
        5.82739600e+00,  2.83046976e+02,  7.67290000e-02,  6.69427653e+04,
        4.00000000e+00,  

In [15]:
#cleaning the training and test set
tX_train_clean = clean_data(tX_train)
tX_test_clean = clean_data(tX_test)

In [16]:
#adding features with polynomial basis function 
tX_train_clean_poly = build_poly(tX_train_clean, 4)
tX_test_clean_poly = build_poly(tX_test_clean, 4)
#adding the pairwise multiplication
tX_train_clean_pc = pairwise_column(tX_train_clean)
tX_test_clean_pc = pairwise_column(tX_test_clean)

In [17]:
tX_test_clean_aug = np.c_[tX_test_clean_pc[:,30:], tX_test_clean_poly]
tX_train_clean_aug = np.c_[tX_train_clean_pc[:,30:], tX_train_clean_poly]

In [18]:
tX_train_clean_pc.shape

(250000, 59)

In [19]:
print(y_train.shape)
print(tX_train_clean_aug.shape)


(250000,)
(250000, 179)


In [20]:
#splitting the dataset into train and validation 
xTr, xVal, yTr, yVal = split_data(tX_train_clean_aug, y_train, ratio=0.75, seed= 1)

In [21]:
#train the model on the training set 
yTr.shape

(187500,)

In [22]:
w, loss = ridge_regression(yTr, xTr, 0)
print(loss)

0.2889176205734921


In [23]:
#predict on the validation set 
pred = xVal.dot(w)
print(pred)
#format the predictions  
pred = (pred )/(pred.max()- pred.min())
print(pred)
pred[pred > 0] = 1
pred[pred < 0] = -1 

#precision 
errors = np.sum(np.abs((yVal - pred)))/2
print("the number of errors : "+str(errors))

#real_positives = real_positives(pred, yVal) 
#real_negatives = real_negatives(pred, yVal)
#TODO: add the F1 score calculation as well
print("the precision is :")
print(1- (errors / len(yVal)))
print("the real positives are :")
#print(real_positives)
print("the real negatives are :")
#print(real_negatives)


[-0.10836343  0.35903445  0.96014503 ... -1.1217302   0.27259588
 -0.6263442 ]
[-0.01137542  0.03768953  0.10079092 ... -0.11775327  0.02861567
 -0.06575028]
the number of errors : 12396.0
the precision is :
0.8016639999999999
the real positives are :
the real negatives are :


## Prediction on the test set 

In [24]:
pred_test = tX_test_clean_aug.dot(w)
print(pred_test)
#format the predictions  
pred_test = pred_test /(pred_test.max()- pred_test.min())
print(pred_test)
pred_test[pred_test > 0] = 1
pred_test[pred_test < 0] = -1 
print("we set everything to 1 or -1")
print(pred_test)

[-1.06741461 -0.67981738 -0.1094449  ...  0.29242037 -0.03996507
 -0.79447477]
[-1.33381938e-03 -8.49485846e-04 -1.36760095e-04 ...  3.65402489e-04
 -4.99395314e-05 -9.92759360e-04]
we set everything to 1 or -1
[-1. -1. -1. ...  1. -1. -1.]


In [25]:
import csv 
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in .csv format for submission to Kaggle or AIcrowd
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    with open(name, "w") as csvfile:
        fieldnames = ["Id", "Prediction"]
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({"Id": int(r1), "Prediction": int(r2)})


In [26]:
create_csv_submission(ids_test, pred_test, "pred1.csv")