# Prepare Data

## Import

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv(r"C:\Users\jason\Documents\Jerry\Studies\Jerry DSc\IS\INSAID\Fraud.csv")
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## Explore

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [4]:
df.shape

(6362620, 11)

In [5]:
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


In [7]:
df.corr()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
step,1.0,0.022373,-0.010058,-0.010299,0.027665,0.025888,0.031578,0.003277
amount,0.022373,1.0,-0.002762,-0.007861,0.294137,0.459304,0.076688,0.012295
oldbalanceOrg,-0.010058,-0.002762,1.0,0.998803,0.066243,0.042029,0.010154,0.003835
newbalanceOrig,-0.010299,-0.007861,0.998803,1.0,0.067812,0.041837,-0.008148,0.003776
oldbalanceDest,0.027665,0.294137,0.066243,0.067812,1.0,0.976569,-0.005885,-0.000513
newbalanceDest,0.025888,0.459304,0.042029,0.041837,0.976569,1.0,0.000535,-0.000529
isFraud,0.031578,0.076688,0.010154,-0.008148,-0.005885,0.000535,1.0,0.044109
isFlaggedFraud,0.003277,0.012295,0.003835,0.003776,-0.000513,-0.000529,0.044109,1.0


# Setting up variables

In [8]:
df.drop(columns= ["nameOrig", "nameDest"], axis=1, inplace=True)
y = df["isFraud"]
X = df.drop(columns= "isFraud", axis =1)

In [9]:
label = LabelEncoder()
df.type = label.fit_transform(df.type)

In [10]:
x = df.drop('isFraud', axis=1)
y = df['isFraud']
X = StandardScaler().fit_transform(x)

# Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [12]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X_train, y_train = sm.fit_resample(X_train, y_train.ravel())

# Building, iterating and evaluating

### Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression
clr = LogisticRegression()
clr.fit(X_train, y_train)
y_pred = clr.predict(X_test)

In [14]:
print(confusion_matrix(y_test, y_pred))
accuracy = metrics.accuracy_score(y_test, y_pred)
print(accuracy)
print(metrics.classification_report(y_test, y_pred))

[[2035186   61758]
 [    242    2479]]
0.9704714799741864
              precision    recall  f1-score   support

           0       1.00      0.97      0.98   2096944
           1       0.04      0.91      0.07      2721

    accuracy                           0.97   2099665
   macro avg       0.52      0.94      0.53   2099665
weighted avg       1.00      0.97      0.98   2099665



### Decision Tree Classifier

In [15]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred3 = dtc.predict(X_test)

In [16]:
print(confusion_matrix(y_test, y_pred3))
accuracy = metrics.accuracy_score(y_test, y_pred3)
print(accuracy)
print(metrics.classification_report(y_test, y_pred3))

[[2095729    1215]
 [    102    2619]]
0.9993727570826775
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2096944
           1       0.68      0.96      0.80      2721

    accuracy                           1.00   2099665
   macro avg       0.84      0.98      0.90   2099665
weighted avg       1.00      1.00      1.00   2099665



### Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=100)
classifier_rf.fit(X_train, y_train)
y_pred4 = classifier_rf.predict(X_test)

In [18]:
print(confusion_matrix(y_test, y_pred4))
accuracy = metrics.accuracy_score(y_test, y_pred4)
print(accuracy)
print(metrics.classification_report(y_test, y_pred4))

[[2095421    1523]
 [    114    2607]]
0.9992203518180282
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2096944
           1       0.63      0.96      0.76      2721

    accuracy                           1.00   2099665
   macro avg       0.82      0.98      0.88   2099665
weighted avg       1.00      1.00      1.00   2099665



### Gaussian NB

In [19]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred5 = gnb.fit(X_train, y_train).predict(X_test)

In [20]:
print(confusion_matrix(y_test, y_pred5))
accuracy = metrics.accuracy_score(y_test, y_pred5)
print(accuracy)
print(metrics.classification_report(y_test, y_pred5))

[[2091214    5730]
 [   2346     375]]
0.9961536721334118
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2096944
           1       0.06      0.14      0.08      2721

    accuracy                           1.00   2099665
   macro avg       0.53      0.57      0.54   2099665
weighted avg       1.00      1.00      1.00   2099665



In [21]:
import eli5
eli5.show_weights(dtc)
columns = ["step","type", "amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest", "isFlaggedFraud"]
feature_names = columns
eli5.show_weights(dtc, feature_names = feature_names)

Weight,Feature
0.4235,newbalanceOrig
0.3756,oldbalanceOrg
0.1597,amount
0.0281,type
0.0069,step
0.0032,newbalanceDest
0.0028,oldbalanceDest
0.0003,isFlaggedFraud


In [22]:
eli5.explain_prediction(dtc , np.array(X_test)[1], feature_names = feature_names)

Contribution?,Feature
0.5,<BIAS>
0.494,oldbalanceOrg
0.005,amount
0.001,step


## Communicating results

#### Exploring the data shows no missing values. Certain scenarios like credit card fraud, healthcare, etc. outliers are a "good to have".
#### We were able to run Logistic Regression, Decision Tree Classifier, Random Forest Classfier and Naive Bayes.
#### The originating and destination account were dropped from columns as they didnt provide any usable information.
#### The model was preprocessed with StandardScaler and LabelEncoder and then split. Accuracy of each model is taken into consideration.
#### The factors are sufficient as we achieved a 99.93% accuracy with one of our models.
#### The company can collect location data to further narrow down the possibility of fradulant transactions. Furthermore, transactions outside a specific geography must be yellow flagged to the customer and subsequently labled as red or green.
#### Location data will aid in alerting to sudden, non standard transactions occuring. Customer participation is paramount to training the model to predict accurately. If a customer shifts to a new locality, interacting with th eprompt will share insight on their profile. 