# Detect financial fraud with decision tree in Python
07-detect-financial-fraud-w-dt-in-python

In [1]:
# Setup libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [2]:
# Read data into dataframe
df = pd.read_csv('../00-Datasets/finance.csv')

  df = pd.read_csv('../00-Datasets/finance.csv')


In [3]:
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,24999.0,24999.0,24999.0,24999.0,24999.0,24999.0,24999.0,24999.0
mean,6.311452,132244.6,800493.6,817291.0,849717.0,1205839.0,0.00332,0.0
std,2.368394,298180.3,2197915.0,2239935.0,2523119.0,3148265.0,0.057526,0.0
min,1.0,2.39,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,5771.545,0.0,0.0,0.0,0.0,0.0,0.0
50%,7.0,18314.92,19591.0,4259.59,0.0,0.0,0.0,0.0
75%,8.0,154917.1,138258.5,140832.7,342850.1,606878.5,0.0,0.0
max,8.0,10000000.0,22400000.0,22500000.0,25000000.0,28800000.0,1.0,0.0


In [4]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1.0,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0.0,0.0
1,1.0,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0.0,0.0
2,1.0,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1.0,0.0
3,1.0,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1.0,0.0
4,1.0,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0.0,0.0


In [5]:
df.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
1048557,,,,,,,,,,,
1048558,,,,,,,,,,,
1048559,,,,,,,,,,,
1048560,,,,,,,,,,,
1048561,,,,,,,,,,,


In [6]:
df.sample(6)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
5494,5.0,CASH_OUT,89569.75,C719518227,0.0,0.0,C380242442,132427.3,19525.79,0.0,0.0
571642,,,,,,,,,,,
752746,,,,,,,,,,,
754578,,,,,,,,,,,
534243,,,,,,,,,,,
430029,,,,,,,,,,,


In [7]:
df.isna().sum()

step              1023563
type              1023563
amount            1023563
nameOrig          1023563
oldbalanceOrg     1023563
newbalanceOrig    1023563
nameDest          1023563
oldbalanceDest    1023563
newbalanceDest    1023563
isFraud           1023563
isFlaggedFraud    1023563
dtype: int64

In [8]:
df.dropna(inplace=True)

In [9]:
df.duplicated().sum()

0

In [10]:
df['isFlaggedFraud'].value_counts()

0.0    24999
Name: isFlaggedFraud, dtype: int64

In [11]:
df[df['isFlaggedFraud']!=0.0]

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud


In [23]:
# Though this isn't good quality data, 
# we'll continue with the exercise for 
# the purpose of practicing the algorithm. 

In [12]:
# Explore some features
df['type'].value_counts()

PAYMENT     12482
CASH_OUT     5091
CASH_IN      4505
TRANSFER     2425
DEBIT         496
Name: type, dtype: int64

In [13]:
# Explore tabular summary
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,24999.0,24999.0,24999.0,24999.0,24999.0,24999.0,24999.0,24999.0
mean,6.311452,132244.6,800493.6,817291.0,849717.0,1205839.0,0.00332,0.0
std,2.368394,298180.3,2197915.0,2239935.0,2523119.0,3148265.0,0.057526,0.0
min,1.0,2.39,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,5771.545,0.0,0.0,0.0,0.0,0.0,0.0
50%,7.0,18314.92,19591.0,4259.59,0.0,0.0,0.0,0.0
75%,8.0,154917.1,138258.5,140832.7,342850.1,606878.5,0.0,0.0
max,8.0,10000000.0,22400000.0,22500000.0,25000000.0,28800000.0,1.0,0.0


In [15]:
# Prepare the data
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

In [17]:
# Split data into 80% train and 20% set
X = df.drop(columns=['isFlaggedFraud', 'nameDest', 'nameOrig'], axis=1)
y = df['isFlaggedFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [19]:
# Build model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [21]:
# Feed test data into model
y_predict = model.predict(X_test)

actual_vs_predict = pd.DataFrame({'Actual': y_test,
                                 'Prediction': y_predict})
actual_vs_predict.sample(12)

Unnamed: 0,Actual,Prediction
2549,0.0,0.0
7926,0.0,0.0
8152,0.0,0.0
4100,0.0,0.0
17640,0.0,0.0
8705,0.0,0.0
22315,0.0,0.0
24375,0.0,0.0
13000,0.0,0.0
8039,0.0,0.0


In [22]:
# Evaluate model
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      5000

    accuracy                           1.00      5000
   macro avg       1.00      1.00      1.00      5000
weighted avg       1.00      1.00      1.00      5000



## Notes
* We wanted to detect fraudulent activities. 
* Our dataset had 1048562 records and 11 features, 8 of which are of data type float and 3 is of type object/string.
* There were a lot of missing values. All 11 columns of a row would be empty. After dropping these records, we are left with 24999 samples.
* A quick inspection showed that there are no duplicate records.
* A closer look at the label showed total imbalance. All records being labeled as not fraud.
* Though this isn't good quality data, we'll continue with the exercise for the purpose of practicing the algorithm. 
* The dataset was split into 80% train and 20% test sets. No validation set was created or used. No hyperparameter tuning occurred.
* We built, trained, and tested a decision tree classifier.
* We evaluated our model using ground truth and several metrics including precision, recall, and f1-score.