In [1]:
# Import models and utility functions
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import roc_auc_score, accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

In [2]:
# ignore warning
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# **Data Cleaning**

---


Columns are renamed for better readability.
Irrelevant columns like 'Name_Orig', 'Name_Dest', 'isFlaggedFraud', and 'type' were dropped from the dataset.

In [3]:
# import data
file = 'Fraud.csv'
df = pd.read_csv(file)
print(df.columns)
df = df.rename(columns={'oldbalanceOrg': 'Old_Balance_Orig',
                        'newbalanceOrig': 'New_Balance_Orig',
                        'oldbalanceDest': 'Old_Balance_Dest',
                        'newbalanceDest': 'New_Balance_Dest',
                        'nameOrig': 'Name_Orig',
                        'nameDest': 'Name_Dest'})

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')


In [4]:
print(df.head())

   step      type    amount    Name_Orig  Old_Balance_Orig  New_Balance_Orig  \
0     1   PAYMENT   9839.64  C1231006815          170136.0         160296.36   
1     1   PAYMENT   1864.28  C1666544295           21249.0          19384.72   
2     1  TRANSFER    181.00  C1305486145             181.0              0.00   
3     1  CASH_OUT    181.00   C840083671             181.0              0.00   
4     1   PAYMENT  11668.14  C2048537720           41554.0          29885.86   

     Name_Dest  Old_Balance_Dest  New_Balance_Dest  isFraud  isFlaggedFraud  
0  M1979787155               0.0               0.0      0.0             0.0  
1  M2044282225               0.0               0.0      0.0             0.0  
2   C553264065               0.0               0.0      1.0             0.0  
3    C38997010           21182.0               0.0      1.0             0.0  
4  M1230701703               0.0               0.0      0.0             0.0  


In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14247 entries, 0 to 14246
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   step              14247 non-null  int64  
 1   type              14247 non-null  object 
 2   amount            14247 non-null  float64
 3   Name_Orig         14247 non-null  object 
 4   Old_Balance_Orig  14246 non-null  float64
 5   New_Balance_Orig  14246 non-null  float64
 6   Name_Dest         14246 non-null  object 
 7   Old_Balance_Dest  14246 non-null  float64
 8   New_Balance_Dest  14246 non-null  float64
 9   isFraud           14246 non-null  float64
 10  isFlaggedFraud    14246 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 1.2+ MB
None


In [6]:
# EDA
print('\n The types of fraudulent transactions are {}'.format(list(df.loc[df.isFraud == 1].type.drop_duplicates().values)))


 The types of fraudulent transactions are ['TRANSFER', 'CASH_OUT']


In [7]:
dfFraudTransfer = df.loc[(df.isFraud == 1) & (df.type == 'TRANSFER')]
dfFraudCashout = df.loc[(df.isFraud == 1) & (df.type == 'CASH_OUT')]

In [8]:
print('\n No.fraudulent in TRANSFERs = {}'.
      format(len(dfFraudTransfer)))


 No.fraudulent in TRANSFERs = 38


In [9]:
print('\n No.fraudulent in CASH_OUTs = {}'.
      format(len(dfFraudCashout)))


 No.fraudulent in CASH_OUTs = 40


# **Variable Selection**

---


In this code, the variable selection process involves excluding certain columns that are either irrelevant or are unlikely to contribute significantly to the fraud detection task. For example, 'Name_Orig' and 'Name_Dest' columns are excluded. The 'isFlaggedFraud' column is excluded as well, as it's likely not useful for predictive modeling.

In [10]:
# data import and cleaning
X = df.loc[(df.type == 'TRANSFER')]
y = X['isFraud']
del X['isFraud']

In [11]:
# Eliminate columns shown to be irrelevant for analysis in the EDA
X = X.drop(['Name_Orig', 'Name_Dest', 'isFlaggedFraud', 'type'], axis=1)
print(X.head())

    step     amount  Old_Balance_Orig  New_Balance_Orig  Old_Balance_Dest  \
2      1     181.00            181.00               0.0              0.00   
19     1  215310.30            705.00               0.0          22425.00   
24     1  311685.89          10835.00               0.0           6267.00   
58     1   62610.80          79114.00           16503.2            517.00   
78     1   42712.39          10363.39               0.0          57901.66   

    New_Balance_Dest  
2               0.00  
19              0.00  
24        2719172.89  
58           8383.29  
78          24044.18  


# **Fraud Detection Model**


---




A Decision Tree classifier is used as the fraud detection model. A Decision Tree is a supervised machine learning algorithm that's used for classification tasks. It creates a tree-like model of decisions and their possible consequences.

In [12]:
# decision tree model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)

In [13]:
dt = DecisionTreeClassifier(max_depth=2, random_state=1)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
acc = accuracy_score(y_test, y_pred)

In [14]:
print('size of X_train, X_test, y_train, y_test')
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

size of X_train, X_test, y_train, y_test
(1150, 6) (288, 6) (1150,) (288,)


# **Model Performance Evaluation**


---


The performance of the Decision Tree classifier is evaluated using various metrics:

1.Accuracy Score: The percentage of correctly predicted
instances out of all instances.

2.F1 Macro Score: The F1-score calculated for each class and then averaged.

3.F1 Micro Score: The F1-score calculated globally by considering the total number of true positives, false negatives, and false positives.

In [15]:
acc = accuracy_score(y_test, y_pred)
print('accuracy of decision tree result',acc)

accuracy of decision tree result 0.9965277777777778


In [16]:
from sklearn.metrics import f1_score
#f1 score is good to evaluate unbalanced data
print('F1 macro score')
print(f1_score(y_test, y_pred, average='macro')  )
print('F1 micro score')
print(f1_score(y_test, y_pred, average='micro')  )


F1 macro score
0.9727865444580932
F1 micro score
0.9965277777777778


# Confusion Matrix:
A matrix showing the count of true positive, true negative, false positive, and false negative predictions.

In [17]:
#confusion matrix of decision tree result with .2 random test dataset
from sklearn.metrics import confusion_matrix
print('confusion matrix of decision tree with .2 random test data:')
print(confusion_matrix(y_test, y_pred))

confusion matrix of decision tree with .2 random test data:
[[278   1]
 [  0   9]]


In [18]:
#Cross validation accuracy score wiht cv = 5
from sklearn.model_selection import cross_val_score
clf = DecisionTreeClassifier(max_depth=2, random_state=1)
scores = cross_val_score(clf, X_train, y_train, cv=5)
print('Cross validation confisuion matrix wiht cv = 5')
print([s for s in scores])

Cross validation confisuion matrix wiht cv = 5
[1.0, 0.9956521739130435, 1.0, 0.9956521739130435, 0.9956521739130435]


In [19]:
#Cross validation confisuion matrix wiht cv = 5
from sklearn.model_selection import cross_val_predict
y_pred = cross_val_predict(clf, X_train, y_train,cv=5)
conf_mat = confusion_matrix(y_train,y_pred)
print('Cross validation confisuion matrix wiht cv = 5')
print(conf_mat)

Cross validation confisuion matrix wiht cv = 5
[[1120    1]
 [   2   27]]


# Key Factors for Fraud Prediction
The Decision Tree classifier doesn't inherently provide "key factors" like other models, but you can interpret the splits made by the tree to understand the features that contribute to the decision-making process.

# Interpretation of Factors
Unfortunately, the code provided doesn't contain explicit code for feature interpretation. However, you can interpret a Decision Tree by analyzing the splits it makes. Each split indicates a decision point based on certain features. By tracing the splits that lead to a particular outcome (fraud or not fraud), you can understand which features are influential in making that decision.

# Prevention Measures
The provided code focuses on building and evaluating a fraud detection model but doesn't include specific steps for prevention. In general, prevention measures could involve improving authentication mechanisms, monitoring for unusual patterns, setting up alerts for large transactions, etc.

# Determining Effectiveness
To determine the effectiveness of the model and prevention measures, you would need to:

Collect data after implementing prevention measures.
Monitor relevant metrics such as fraud incidents, accuracy, precision, recall, etc.
Compare these metrics to the period before implementing the measures.

This comparison would help you assess whether the model and preventive actions are achieving their intended goals.


# Interpretation:
You would need to compare the performance metrics (accuracy, F1-score, confusion matrix) between the test set evaluation and cross-validation results. If the model performs well on both the test set and cross-validation, it suggests that the model generalizes well and the implemented actions are effective.

## Monitoring over Time:
To determine the long-term effectiveness of the implemented actions, you should continuously monitor the model's performance and relevant metrics. If the model's performance starts to deteriorate over time, it might indicate that the fraud patterns have changed, and you need to adapt your prevention measures accordingly.

Remember that achieving effective fraud prevention is an ongoing process, and you should regularly revisit and refine your strategies based on new data and insights.