In [73]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import utils
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [74]:
df = pd.read_csv('Fraud.csv')

In [75]:
types = df['type'].unique()

In [76]:
types

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

In [77]:
#One hot the dataset
onehot = OneHotEncoder(handle_unknown='ignore').fit(types.reshape(-1,1))

In [78]:
df_onehot = pd.DataFrame(onehot.transform(np.array(df['type']).reshape(-1,1)).toarray())
df = df.join(df_onehot)
df = df.rename(columns={0:'type_0',1:'type_1',2:'type_2',3:'type_3',4:'type_4'})

In [79]:
features = ['amount', 'oldbalanceOrg', 'newbalanceOrig','oldbalanceDest', 'newbalanceDest','isFlaggedFraud','type_0','type_1','type_2','type_3','type_4']

In [80]:
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_0,type_1,type_2,type_3,type_4
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0,0.0,0.0,0.0,1.0,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0,0.0,0.0,0.0,1.0,0.0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0,0.0,0.0,0.0,0.0,1.0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0,0.0,1.0,0.0,0.0,0.0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0,0.0,1.0,0.0,0.0,0.0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0,0.0,0.0,0.0,0.0,1.0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0,0.0,1.0,0.0,0.0,0.0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0,0.0,0.0,0.0,0.0,1.0


In [81]:
X = np.array(df[features])
y = df['isFraud']
## Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=3000)

In [83]:
X_train

array([[-0.62106392,  0.09920155, -0.28871645, ..., -0.08095965,
        -0.71477788, -0.30234516],
       [ 0.43983624, -0.06585787, -0.28871645, ..., -0.08095965,
        -0.71477788, -0.30234516],
       [ 0.7700502 , -0.27578098, -0.28078592, ..., -0.08095965,
         1.39903602, -0.30234516],
       ...,
       [-0.56485729, -0.00151553, -0.28871645, ..., -0.08095965,
        -0.71477788, -0.30234516],
       [ 0.41173293, -0.23929869, -0.28871645, ..., -0.08095965,
         1.39903602, -0.30234516],
       [ 0.5522495 , -0.2410463 , -0.28871645, ..., -0.08095965,
        -0.71477788, -0.30234516]])

In [84]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

treeModel = DecisionTreeClassifier(min_samples_split=3).fit(X_train, y_train)
treePred = treeModel.predict(X_test)

cm = confusion_matrix(y_test, treePred)
print("Confusion Matrix")
print(cm)
print()
print("TN (True Negative) predictions:", cm[0][0])
print("FP (False Positive) predictions:", cm[0][1])
print("FN (False Negative) predictions:", cm[1][0])
print("TP (True Positive) predictions:", cm[1][1])

Confusion Matrix
[[1588440     198]
 [    246    1771]]

TN (True Negative) predictions: 1588440
FP (False Positive) predictions: 198
FN (False Negative) predictions: 246
TP (True Positive) predictions: 1771


In [85]:
classification_report = classification_report(y_test, treePred)
print("classification Report")
print(classification_report)
print()

classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1588638
           1       0.90      0.88      0.89      2017

    accuracy                           1.00   1590655
   macro avg       0.95      0.94      0.94   1590655
weighted avg       1.00      1.00      1.00   1590655




In [86]:
#accuracy and error of testing set
accuracy = accuracy_score(y_test, treePred)
error = 1-accuracy
print()
print("Accuracy=",accuracy)
print("Error=",error)

#Precsion, Recall, and F1_score
precision = precision_score(y_test, treePred)
recall = recall_score(y_test, treePred)
f1 = f1_score(y_test, treePred)
print()
print("Precision=", precision)
print("Recall=", recall)
print("F1_score=", f1)


Accuracy= 0.9997208697046185
Error= 0.00027913029538151335

Precision= 0.8994413407821229
Recall= 0.8780366881507189
F1_score= 0.8886101354741596


## User inputs: When user inputs a new row, make prediction on it

In [93]:
import pymongo
import pprint

In [94]:
# connect to the mongoclient
client = pymongo.MongoClient('mongodb://localhost:27017')

In [95]:
# get the database
database = client['Fraud']

In [96]:
# create weekly demand collection
database.create_collection("FraudData")

ServerSelectionTimeoutError: localhost:27017: [Errno 61] Connection refused, Timeout: 30s, Topology Description: <TopologyDescription id: 624a068826289a0f9c668396, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 61] Connection refused')>]>

In [None]:
fraud_collection = database.get_collection("FraudData")

In [72]:
df.iloc[0]

step                        1
type                  PAYMENT
amount                9839.64
nameOrig          C1231006815
oldbalanceOrg        170136.0
newbalanceOrig      160296.36
nameDest          M1979787155
oldbalanceDest            0.0
newbalanceDest            0.0
isFraud                     0
isFlaggedFraud              0
type_0                    0.0
type_1                    0.0
type_2                    0.0
type_3                    1.0
type_4                    0.0
type_0                    0.0
type_1                    0.0
type_2                    0.0
type_3                    1.0
type_4                    0.0
Name: 0, dtype: object

In [61]:
test = pd.DataFrame(df.iloc[0]).T[features]

In [62]:
test = np.array(test)

In [65]:
test

array([[9839.64, 170136.0, 160296.36, 0.0, 0.0, 0, 0.0, 0.0, 0.0, 1.0,
        0.0]], dtype=object)

In [63]:
#Takes in an np.array of features, of size 11
def make_pred(X):
    pred = treeModel.predict(X)
    return pred[0]

In [64]:
pred_test = make_pred(test)

0