In [55]:
import numpy as np
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics 
from sklearn.metrics import classification_report

In [2]:
# Load the transaction log.
# isFlaggedFraud is discarded right away: the dataset gives no explanation of
# it, so its meaning and usage are unknown.
df = (
    pd.read_csv('../input/online-payments-fraud-detection-dataset/PS_20174392719_1491204439457_log.csv')
    .drop(columns='isFlaggedFraud')
)

In [3]:
# Preview the first rows of the dataset (head() defaults to five rows)
df.head()

In [4]:
# Concise summary of the DataFrame: column names, dtypes and memory usage
df.info()

In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max);
# by default describe() reports only the numeric columns of a mixed frame
df.describe()

In [7]:
# Number of missing values in each column
df.isna().sum()

The data has no missing values

Rename the column oldbalanceOrg to oldbalanceOrig so that its spelling matches newbalanceOrig

In [8]:
# Align the spelling of the origin "old balance" column with 'newbalanceOrig'
df = df.rename(columns={'oldbalanceOrg': 'oldbalanceOrig'})

## Exploratory Data Analysis

1. step

Frequency of step where 1 step equals 1 hour

In [9]:
# Number of transactions recorded in each step, most frequent first
df['step'].value_counts()

There are 743 distinct steps, and every step occurs more than once.

2. type

Frequency of online transaction type

In [11]:
# Bar chart of the number of transactions per transaction type.
fig, ax = plt.subplots()
sns.countplot(x='type', data=df, ax=ax)
# Label every bar with its count. fmt='%d' prints the full integer; the
# default '%g' falls back to scientific notation for counts of a million or more.
for container in ax.containers:
    ax.bar_label(container, fmt='%d')
ax.set_title('Count plot of transaction type')
# Run the layout only after the title exists so the title is accounted for.
fig.tight_layout()
# plt.show() does not take a figure; its only parameter is the keyword `block`.
plt.show()

The most frequent transaction type is CASH_OUT, only slightly ahead of PAYMENT

3. amount

Distribution of transaction amount

In [12]:
# Histogram with a KDE overlay of the transaction amount.
fig, ax = plt.subplots()
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# replacement suggested by seaborn. kde_kws=dict(cut=3) mirrors distplot's KDE
# extent and bins=50 keeps the number of bins bounded.
sns.histplot(df['amount'], kde=True, stat='density', bins=50,
             kde_kws=dict(cut=3), ax=ax)
ax.set_title(' Distribution of transaction amount')
# plt.show() does not take a figure; its only parameter is the keyword `block`.
plt.show()

The distribution of amount is right-skewed: values are concentrated at the low end, with a long tail toward large amounts

4. nameOrig

Frequency of customer name starting the transaction

In [13]:
# How many transactions each originating customer (nameOrig) started
df['nameOrig'].value_counts()

5. oldbalanceOrig

Distribution of balance before the transaction

In [14]:
# Histogram with a KDE overlay of the origin balance before the transaction.
fig, ax = plt.subplots()
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# replacement suggested by seaborn. kde_kws=dict(cut=3) mirrors distplot's KDE
# extent and bins=50 keeps the number of bins bounded.
sns.histplot(df['oldbalanceOrig'], kde=True, stat='density', bins=50,
             kde_kws=dict(cut=3), ax=ax)
ax.set_title(' Distribution of balance before transaction')
# plt.show() does not take a figure; its only parameter is the keyword `block`.
plt.show()

The distribution of oldbalanceOrig is right-skewed: values are concentrated at the low end, with a long tail toward large balances

6. newbalanceOrig

Distribution of balance after the transaction

In [15]:
# Histogram with a KDE overlay of the origin balance after the transaction.
fig, ax = plt.subplots()
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# replacement suggested by seaborn. kde_kws=dict(cut=3) mirrors distplot's KDE
# extent and bins=50 keeps the number of bins bounded.
sns.histplot(df['newbalanceOrig'], kde=True, stat='density', bins=50,
             kde_kws=dict(cut=3), ax=ax)
ax.set_title(' Distribution of balance after transaction')
# plt.show() does not take a figure; its only parameter is the keyword `block`.
plt.show()

The distribution of newbalanceOrig is right-skewed: values are concentrated at the low end, with a long tail toward large balances

7. nameDest

Frequency of transaction recipient 

In [16]:
# How many transactions each recipient (nameDest) received
df['nameDest'].value_counts()

There are more than 2 million recipients. Some recipients appear more than once and some recipients appear only once

8. oldbalanceDest

Distribution of initial balance of recipient before the transaction

In [17]:
# Histogram with a KDE overlay of the recipient balance before the transaction.
fig, ax = plt.subplots()
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# replacement suggested by seaborn. kde_kws=dict(cut=3) mirrors distplot's KDE
# extent and bins=50 keeps the number of bins bounded.
sns.histplot(df['oldbalanceDest'], kde=True, stat='density', bins=50,
             kde_kws=dict(cut=3), ax=ax)
ax.set_title(' Distribution of initial balance of \n recipient before the transaction')
# plt.show() does not take a figure; its only parameter is the keyword `block`.
plt.show()

The distribution of oldbalanceDest is right-skewed: values are concentrated at the low end, with a long tail toward large balances

9. newbalanceDest

Distribution of the new balance of recipient after the transaction

In [18]:
# Histogram with a KDE overlay of the recipient balance after the transaction.
fig, ax = plt.subplots()
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# replacement suggested by seaborn. kde_kws=dict(cut=3) mirrors distplot's KDE
# extent and bins=50 keeps the number of bins bounded.
sns.histplot(df['newbalanceDest'], kde=True, stat='density', bins=50,
             kde_kws=dict(cut=3), ax=ax)
ax.set_title(' Distribution of the new balance of \n recipient after the transaction')
# plt.show() does not take a figure; its only parameter is the keyword `block`.
plt.show()

The distribution of newbalanceDest is right-skewed: values are concentrated at the low end, with a long tail toward large balances

10. isFraud

Frequency of isFraud variable

In [19]:
# Bar chart of the number of transactions per isFraud value.
fig, ax = plt.subplots()
sns.countplot(x='isFraud', data=df, ax=ax)
# Label every bar with its count. fmt='%d' prints the full integer; the
# default '%g' falls back to scientific notation for counts of a million or more.
for container in ax.containers:
    ax.bar_label(container, fmt='%d')
ax.set_title('Frequency of isFraud')
# Run the layout only after the title exists so the title is accounted for.
fig.tight_layout()
# plt.show() does not take a figure; its only parameter is the keyword `block`.
plt.show()

There is a huge difference between the counts of "0" and "1", which means the dataset is heavily imbalanced

Correlation 

Correlation of variables in df

In [21]:
# Correlation heatmap of the numeric feature columns (target excluded).
# Only numeric columns are selected first: the frame also holds string columns
# (e.g. 'type'), and newer pandas versions raise in corr() on non-numeric data
# instead of silently dropping it.
feature_corr = df.select_dtypes(include='number').drop(columns='isFraud').corr()
fig, ax = plt.subplots()
# annot/fmt print the coefficient inside every cell so values can be read off.
sns.heatmap(feature_corr, annot=True, fmt='.2f', ax=ax)
ax.set_title('Correlation of numeric features')
plt.show()

oldbalanceOrig and newbalanceOrig have a high correlation

oldbalanceDest and newbalanceDest have a high correlation


Frequency of transaction type and isFraud

In [37]:
# Transaction counts per type, with a separate bar for each isFraud value
sns.countplot(x='type', data=df, hue='isFraud')

Every transaction type is dominated by "0" (non-fraud)

## Modelling

In this modelling I will use step, type, amount, oldbalanceOrig, newbalanceOrig, oldbalanceDest, newbalanceDest

In [39]:
# Columns used as model inputs
feature_columns = [
    'step',
    'type',
    'amount',
    'oldbalanceOrig',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]
X = df.loc[:, feature_columns]
# Target: the fraud label
y = df.loc[:, 'isFraud']

In [44]:
# One-hot encode the categorical columns of X; the dummy columns get the 'type' prefix.
# NOTE(review): with no `columns=` argument get_dummies encodes every
# object/string/category column - presumably only 'type' here; confirm.
X = pd.get_dummies(data=X, prefix='type')
# Preview the encoded feature matrix
X.head()

In [47]:
# Split into a train set (80%) and a test set (20%).
# stratify=y keeps the fraud / non-fraud proportion the same in both splits;
# without it the rare fraud class can be unevenly represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=0, stratify=y
)

In [50]:
# Create the decision tree classifier. A fixed random_state makes the fitted
# tree reproducible: scikit-learn permutes the features randomly at each split,
# so repeated runs can otherwise produce different trees.
clf = DecisionTreeClassifier(random_state=0)

# Train the classifier on the training split
clf = clf.fit(X_train, y_train)

# Predict the labels of the held-out test split
y_pred = clf.predict(X_test)

In [51]:
# Share of test rows whose predicted label equals the true label
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [56]:
# Per-class precision, recall and F1-score on the test split
report = classification_report(y_test, y_pred)
print(report)

The model seems to be good at classifying fraudulent transactions