In [0]:
# Import some libraries we will need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling

In [0]:
#Load data as Pandas Dataframe
train = pd.read_csv('training.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

In [0]:
to_23 = ['TransactionId', 'AccountId', 'BatchId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'Amount', 'TransactionStartTime']
to_17 = ['TransactionId', 'SubscriptionId', 'AccountId', 'BatchId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'Amount', 'TransactionStartTime']

In [0]:
train_mod = train.drop(to_17, axis=1)
test_mod = test.drop(to_17, axis=1)

In [0]:
train_mod = train_mod.drop_duplicates()
#test_uni = test_mod.drop_duplicates()

In [0]:
#plugging the data into the library
pandas_profiling.ProfileReport(train_mod)

In [0]:
#Features
X = train_mod.drop('FraudResult', axis=1)
#Labels
y = train_mod['FraudResult']

# Predict Features
X_pred_f = test_mod

In [0]:
# Transforming the Features
X_transformed = pd.get_dummies(X, drop_first=True)
X_pred_f_trans = pd.get_dummies(X_pred_f, drop_first=True)

In [9]:
print(X_transformed.shape,'Train')
print(X_pred_f_trans.shape,'Test')

(3638, 40) Train
(45019, 44) Test


In [10]:
missing_pred_col = []
for x in X_pred_f_trans.columns:
    if x not in X_transformed.columns:
        missing_pred_col.append(x)
print(missing_pred_col, 'test')

missing_train_col = []
for x in X_transformed.columns:
    if x not in X_pred_f_trans.columns:
        missing_train_col.append(x)
print(missing_train_col, 'train')


['ProductId_ProductId_17', 'ProductId_ProductId_18', 'ProductId_ProductId_25', 'ProductId_ProductId_26', 'ProductCategory_retail', 'ChannelId_ChannelId_4'] test
['ProductId_ProductId_12', 'ProductCategory_other'] train


In [0]:
X_pred_f_trans = X_pred_f_trans.drop(missing_pred_col, axis=1)
X_transformed = X_transformed.drop(missing_train_col, axis=1)

In [12]:
print(X_transformed.shape,'Train')
print(X_pred_f_trans.shape,'Test')

(3638, 38) Train
(45019, 38) Test


In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=50)

In [0]:
from sklearn.neighbors import KNeighborsClassifier

In [0]:
KN = KNeighborsClassifier()

In [17]:
KN.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [0]:
pred_KN = KN.predict(X_test)

![title](https://github.com/Explore-AI/Public-Data/blob/master/Data/matrix2.png?raw=true)



* true positives (TP) : These are cases in which we predicted a Fraud, and is indeed Fraud.
* true negatives (TN) : We predicted no Fraud, and is not Fraud.
* false positives (FP): We predicted Fraud, but actually not Fraud (Also known as a **Type I error**).
* false negatives (FN): We predicted no Fraud, but actually Fraud. (Also known as a **Type II error**).

In [0]:
from sklearn.metrics import confusion_matrix

In [0]:
KN_cm = confusion_matrix(y_test, pred_KN)

In [0]:
labels = ['No Fraud', 'Fraud']

In [22]:
pd.DataFrame(data=KN_cm, index=labels, columns=labels)

Unnamed: 0,No Fraud,Fraud
No Fraud,706,3
Fraud,2,17


In [0]:
from sklearn.metrics import classification_report

In [24]:
print('Classification Report')
print(classification_report(y_test, pred_KN, target_names=['No Fraud', 'Fraud']))

Classification Report
              precision    recall  f1-score   support

    No Fraud       1.00      1.00      1.00       709
       Fraud       0.85      0.89      0.87        19

    accuracy                           0.99       728
   macro avg       0.92      0.95      0.93       728
weighted avg       0.99      0.99      0.99       728



In [0]:
predFinal_preTune_KN = KN.predict(X_pred_f_trans)

In [26]:
#Creating a submission file
my_submission = pd.DataFrame({'TransactionId': test.TransactionId, 'FraudResult': predFinal_preTune_KN}) 
my_submission.to_csv('submission_Untuned_KNN.csv', index=False)
my_submission.head()

Unnamed: 0,TransactionId,FraudResult
0,TransactionId_50600,0
1,TransactionId_95109,0
2,TransactionId_47357,0
3,TransactionId_28185,0
4,TransactionId_22140,0


In [0]:
#X_transformed
#X_pred_f_trans
from sklearn.model_selection import RandomizedSearchCV
X = X_transformed

y = train_mod['FraudResult']

algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
n_jobs=-1, 
n_neighbors=[3,5,7] 
p = [1,2]
weights=['uniform', 'distance']

hyperparameters = dict(algorithm=algorithm,
                       n_jobs=n_jobs,
                       n_neighbors=n_neighbors,
                       p=p,
                       weights=weights)

clf = RandomizedSearchCV(KN, hyperparameters, random_state=12, n_iter=10, cv=5, verbose=0, n_jobs=-1)
best_model = clf.fit(X, y)

In [28]:
print("Tuned KNN Parameters: {}".format(best_model.best_params_))

Tuned KNN Parameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 7, 'n_jobs': -1, 'algorithm': 'brute'}


In [0]:
predFinal_preTuned_KN = best_model.predict(X_pred_f_trans)

In [30]:
#Creating a submission file
my_submission = pd.DataFrame({'TransactionId': test.TransactionId, 'FraudResult': predFinal_preTuned_KN}) 
my_submission.to_csv('submission_Tuned_KNN.csv', index=False)
my_submission.head()

Unnamed: 0,TransactionId,FraudResult
0,TransactionId_50600,0
1,TransactionId_95109,0
2,TransactionId_47357,0
3,TransactionId_28185,0
4,TransactionId_22140,0
