In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, roc_auc_score

In [2]:
train_df = pd.read_csv("./PreprocessedTrain.csv")
test_df = pd.read_csv("./PreprocessedTest.csv")

In [3]:
len(train_df)

128220

In [4]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,isFraud
0,0,0.279217,0.001118,4,0.040297,0.566,0.381679,4,0.919708,3,...,0.75,260,0,1,1,1,1,0,829,0
1,1,0.341831,0.000961,4,0.025523,0.78,0.381679,4,0.481752,3,...,0.75,260,0,1,1,1,1,0,829,0
2,2,0.329605,0.000828,0,0.276615,0.616,0.648855,3,0.905109,3,...,0.75,260,0,0,0,2,0,2,280,0
3,3,0.655649,0.009229,4,0.99155,0.52,0.381679,3,0.189781,3,...,0.75,260,0,1,1,1,1,0,829,0
4,4,0.241131,0.000462,1,0.288515,0.966,0.381679,4,0.919708,2,...,0.75,221,4,2,0,2,2,1,723,0


In [5]:
test_df.drop('Unnamed: 0', inplace=True, axis =1)

In [6]:
train_df.drop('Unnamed: 0', inplace=True, axis =1)

In [7]:
Ytrain = train_df['isFraud']

In [8]:
train_df.drop('isFraud', inplace=True, axis =1)

In [9]:
train_df.head()

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,0.279217,0.001118,4,0.040297,0.566,0.381679,4,0.919708,3,0.904545,...,11,0.75,260,0,1,1,1,1,0,829
1,0.341831,0.000961,4,0.025523,0.78,0.381679,4,0.481752,3,0.452273,...,11,0.75,260,0,1,1,1,1,0,829
2,0.329605,0.000828,0,0.276615,0.616,0.648855,3,0.905109,3,0.452273,...,49,0.75,260,0,0,0,2,0,2,280
3,0.655649,0.009229,4,0.99155,0.52,0.381679,3,0.189781,3,0.452273,...,11,0.75,260,0,1,1,1,1,0,829
4,0.241131,0.000462,1,0.288515,0.966,0.381679,4,0.919708,2,0.236364,...,48,0.75,221,4,2,0,2,2,1,723


### Parameter's Optimal Value

KD_Tree/Ball_Tree/BruteForce - For small data sets, Brute Force is justifiable, however, for increasing data the KD or Ball Tree is better alternatives due to their speed and efficiency. For low-dimensional data, the KD Tree Algorithm might be the best solution, while for the high dimensional dataset ball tree algorithm would be better.

n_neighbors - As found by numerous, data scientists the optimal value of k is approximately equal to sqroot of N.

weights - Obviously distance based are better because it is inversely proportional to the distance between the data points.
Priority will be given to the data point that is closest. 

Leaf-size - In any case, it was coming to be 20. It basically gives us the min number of samples that should be at each node.

p - Can take value as 1 or 2 based on the type of distance. 1 for Manhattan Distance and 2 for euclidean Distance. Manhattan Distance is preferred over Euclidean distance as the dimension of the data increases.

Sqroot(128000) = 359

In [11]:
param_grid = {
    'n_neighbors':[5, 100, 359], 
    'algorithm':['ball_tree'],
    'leaf_size': [20, 40],
    'p' : [1, 2],
    'weights':['distance'],
    }

In [12]:
knnModel = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid, verbose=1, cv=3, n_jobs=-1)
knnModel.fit(train_df, Ytrain)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [13]:
print(knnModel.best_params_)

{'algorithm': 'ball_tree', 'leaf_size': 20, 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}


In [14]:
print(knnModel.best_score_)

0.9150132584620184


In [15]:
ypred = knnModel.predict(test_df)

In [16]:
for i in ypred:
    print(i)

0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
1
0
0
0
0
1
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
1
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
1
0
0
0
1
1
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1
1
1
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0


In [17]:
CSV4 = pd.DataFrame(ypred)
file = CSV4.to_csv("PredictionsKNN.csv")