In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import utils
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [10]:
df = pd.read_csv('Fraud.csv')

In [11]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [12]:
#One hot the dataset
onehot = OneHotEncoder(handle_unknown='ignore')
df_onehot = pd.DataFrame(onehot.fit_transform(np.array(df['type']).reshape(-1,1)).toarray())
df = df.join(df_onehot)
features = ['amount', 'oldbalanceOrg', 'newbalanceOrig','oldbalanceDest', 'newbalanceDest','isFlaggedFraud',0,1,2,3,4]

In [13]:
X = np.array(df[features])
y = df['isFraud']
## Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=3000)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(X.shape)

(4771965, 11)
(1590655, 11)
(4771965,)
(1590655,)
(6362620, 11)


In [10]:
#The whole dataset
logistic_model = LogisticRegression(max_iter=10000).fit(X_train, y_train)
pred = logistic_model.predict(X_test)

In [12]:
confusion_matrix = confusion_matrix(y_test, pred)
print("Confusion Matrix")
print(confusion_matrix)
print()
print("TP (True Positive) predictions:", confusion_matrix[0][0])
print("FN (False Negative) predictions:", confusion_matrix[0][1])
print("FP (False Positive) predictions:", confusion_matrix[1][0])
print("TN (True Negative) predictions:", confusion_matrix[1][1])

#accuracy and error of testing set
accuracy = accuracy_score(y_test, pred)
error = 1-accuracy
print()
print("Accuracy=",accuracy)
print("Error=",error)

#Precsion, Recall, and F1_score
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)
print()
print("Precision=", precision)
print("Recall=", recall)
print("F1_score=", f1)

Confusion Matrix
[[1588576      62]
 [   1185     832]]

TP (True Positive) predictions: 1588576
FN (False Negative) predictions: 62
FP (False Positive) predictions: 1185
TN (True Negative) predictions: 832

Accuracy= 0.9992160462199534
Error= 0.0007839537800465868

Precision= 0.930648769574944
Recall= 0.41249380267724345
F1_score= 0.5716248711782892


In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

treeModel = DecisionTreeClassifier().fit(X_train, y_train)
treePred = treeModel.predict(X_test)

confusion_matrix = confusion_matrix(y_test, treePred)
print("Confusion Matrix")
print(confusion_matrix)
print()
print("TP (True Positive) predictions:", confusion_matrix[0][0])
print("FN (False Negative) predictions:", confusion_matrix[0][1])
print("FP (False Positive) predictions:", confusion_matrix[1][0])
print("TN (True Negative) predictions:", confusion_matrix[1][1])


Confusion Matrix
[[1588438     200]
 [    205    1812]]

TP (True Positive) predictions: 1588438
FN (False Negative) predictions: 200
FP (False Positive) predictions: 205
TN (True Negative) predictions: 1812


In [16]:
classification_report = classification_report(y_test, treePred)
print("classification Report")
print(classification_report)
print()

classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1588638
           1       0.90      0.90      0.90      2017

    accuracy                           1.00   1590655
   macro avg       0.95      0.95      0.95   1590655
weighted avg       1.00      1.00      1.00   1590655




In [17]:
#accuracy and error of testing set
accuracy = accuracy_score(y_test, treePred)
error = 1-accuracy
print()
print("Accuracy=",accuracy)
print("Error=",error)

#Precsion, Recall, and F1_score
precision = precision_score(y_test, treePred)
recall = recall_score(y_test, treePred)
f1 = f1_score(y_test, treePred)
print()
print("Precision=", precision)
print("Recall=", recall)
print("F1_score=", f1)


Accuracy= 0.9997453879062399
Error= 0.0002546120937600982

Precision= 0.9005964214711729
Recall= 0.8983639067922657
F1_score= 0.8994787788533134


In [18]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
regressor = DecisionTreeRegressor().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 0.0002652995149796782
MSE: 0.0002652995149796782
Root Mean Squared Error: 0.016288017527608393
