# 0.0 Imports

## 0.1 Imports

In [22]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

## 0.2 Starting Spark Session

In [21]:
# Start a Spark session for the 'Fraud' application, or reuse the one that is
# already active in this kernel.
spark = SparkSession.builder.appName('Fraud').getOrCreate()

# Last expression of the cell: render the session summary.
spark

## 0.3 Helper Functions



## 0.4 Data Loading

In [28]:
%%time
data = spark.read.csv('../data/raw/fraud.csv', header = True, inferSchema=True)



CPU times: user 10.4 ms, sys: 1.01 ms, total: 11.4 ms
Wall time: 18.3 s


                                                                                

In [32]:
# Preview the first five transactions.
data.show(n=5)

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|      

In [35]:
# Check which DataFrame implementation `data` is (Spark, not pandas).
type(data)

pyspark.sql.dataframe.DataFrame

In [30]:
# List every column together with its Spark data type.
data.dtypes

[('step', 'int'),
 ('type', 'string'),
 ('amount', 'double'),
 ('nameOrig', 'string'),
 ('oldbalanceOrg', 'double'),
 ('newbalanceOrig', 'double'),
 ('nameDest', 'string'),
 ('oldbalanceDest', 'double'),
 ('newbalanceDest', 'double'),
 ('isFraud', 'int'),
 ('isFlaggedFraud', 'int')]

## 0.5 Columns Description



**step** - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

**type** - type of the transaction: CASH_IN, CASH_OUT, DEBIT, PAYMENT or TRANSFER.

**amount** - amount of the transaction in local currency.

**nameOrig** - customer who started the transaction

**oldbalanceOrg** - initial balance before the transaction

**newbalanceOrig** - new balance after the transaction

**nameDest** - customer who is the recipient of the transaction

**oldbalanceDest** - initial balance of the recipient before the transaction. Note that there is no information for customers whose names start with M (Merchants).

**newbalanceDest** - new balance of the recipient after the transaction. Note that there is no information for customers whose names start with M (Merchants).

**isFraud** - Marks the transactions made by the fraudulent agents inside the simulation. In this specific dataset, the fraudulent behavior of the agents aims to profit by taking control of customers' accounts and trying to empty the funds by transferring them to another account and then cashing out of the system.

**isFlaggedFraud** - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200,000 in a single transaction.

## 0.6 Solution Method

# 1.0 Data Description


## 1.1 Rename Columns

## 1.2 Data Dimensions

## 1.3 Data Types

## 1.4 NA Check

In [49]:
# Column names of the loaded dataset.
data.columns

['step',
 'type',
 'amount',
 'nameOrig',
 'oldbalanceOrg',
 'newbalanceOrig',
 'nameDest',
 'oldbalanceDest',
 'newbalanceDest',
 'isFraud',
 'isFlaggedFraud']

In [55]:
columns = data.columns

# Count the nulls of every column in ONE aggregation (a single Spark job).
# The previous version ran a separate filter + count job per column, i.e. one
# full scan of the dataset for each of the columns.
# when(...) yields 1 only for null cells and null otherwise, and count()
# ignores nulls, so each aggregate is the number of null cells in that column.
null_counts = data.select(
    [F.count(F.when(data[name].isNull(), F.lit(1))).alias(name) for name in columns]
).first()

# Same shape as before: {column name: number of nulls}.
NAs = null_counts.asDict()

NAs

                                                                                

{'step': 0,
 'type': 0,
 'amount': 0,
 'nameOrig': 0,
 'oldbalanceOrg': 0,
 'newbalanceOrig': 0,
 'nameDest': 0,
 'oldbalanceDest': 0,
 'newbalanceDest': 0,
 'isFraud': 0,
 'isFlaggedFraud': 0}

## 1.5 NA Fillout

## 1.6 Changing Data Types


## 1.7 Descriptive Statistics

### 1.7.1 Numerical Attributes

In [62]:
# Spark computes the summary statistics; only that small result table is
# converted to a pandas DataFrame.
desc = (
    data
    .describe()
    .toPandas()
)

                                                                                

In [75]:
# Reshape the describe() output so that each dataset column is a row and each
# statistic (the values of the 'summary' column) is a column.
#
# The guard keeps the cell idempotent: once reshaped, `desc` no longer has a
# 'summary' column, so re-running the cell leaves it untouched instead of
# transposing it back and failing on the missing 'summary' row.
if 'summary' in desc.columns:
    desc = (
        desc
        .set_index('summary')               # statistic names become the index ...
        .T                                  # ... and, after transposing, the column labels
        .rename_axis(None, axis='columns')  # drop the leftover 'summary' axis label
    )

In [101]:
# Display the reshaped summary table (one row per dataset column).
desc

Unnamed: 0,count,mean,stddev,min,max
step,6362620,243.39724563151657,142.33197104912588,1,743
type,6362620,,,CASH_IN,TRANSFER
amount,6362620,179861.90354913412,603858.2314629498,0.0,9.244551664E7
nameOrig,6362620,,,C1000000639,C999999784
oldbalanceOrg,6362620,833883.1040744719,2888242.673037545,0.0,5.958504037E7
newbalanceOrig,6362620,855113.6685785714,2924048.502954253,0.0,4.958504037E7
nameDest,6362620,,,C1000004082,M999999784
oldbalanceDest,6362620,1100701.6665196654,3399180.112994485,0.0,3.5601588935E8
newbalanceDest,6362620,1224996.3982019408,3674128.9421195714,0.0,3.5617927892E8
isFraud,6362620,0.0012908204481801,0.0359047968016044,0,1


In [107]:
# Count the transactions of each class (isFraud = 1 / 0).
# groupBy() gives no ordering guarantee, so sort explicitly (fraud row first)
# to keep the row order of `prop` stable across runs for anything that reads
# it by position.
prop = (
    data
    .groupBy('isFraud')
    .count()
    .orderBy('isFraud', ascending=False)
    .toPandas()
)
prop

                                                                                

Unnamed: 0,isFraud,count
0,1,8213
1,0,6354407


In [113]:
# Look the counts up by class label rather than by row position: the row order
# of a Spark groupBy result is not guaranteed, so positional indexing could
# silently swap the two classes and report the wrong percentage.
frauds = prop.loc[prop['isFraud'] == 1, 'count'].sum()
non_frauds = prop.loc[prop['isFraud'] == 0, 'count'].sum()

print('fraud percentage: {:.4f}%'.format(100 * frauds / (frauds + non_frauds)))


fraud percentage: 0.1291%


### 1.7.2 Categorical Attributes

# 2.0 Feature Engineering

## 2.1 Response Variable Mind Map

## 2.2 Hypothesis List

## 2.3 Selected Hypothesis:

**H1:** 

**H2:** 

**H3:** 

**H4:** 

**H5:** 

**H6:** 

**H7:** 

**H8:** 

**H9:** 

**H10:** 

**H11:** 

## 2.4 Feature Engineering

# 3.0 Variable Selection

# 4.0 Exploratory Data Analysis

## 4.1 Univariate Analysis

### 4.1.1 Response Variable

### 4.1.2 Numerical Variables

### 4.1.3 Categorical Variables

## 4.2 Bivariate Analysis

### **H1.**

### **H2.**


### **H3.**

### **H4.**

### **H5.**

### **H6.**

### **H7.**

### **H8.**

### **H9.**

### **H10.**

## 4.3 Multivariate Analysis

### 4.3.1 Numerical Attributes

### 4.3.2 Categorical Attributes

# 5.0 Data Preparation

## 5.1 Numerical Variables Preparation

### 5.1.1 Rescaling

## 5.2 Categorical Data Encoding

## 5.3 Data Transformation

# 6.0 Feature Selection

## 6.1 Splitting Dataframe into Train Test

## 6.2 Boruta Feature Selection

## 6.3 Results and Considerations

# 7.0 Model Testing

## 7.6 Comparing Model's Performances

## 7.7 Cross Validation

# 8.0 Hyperparameter Fine Tuning

## 8.X Final Model

# 9.0 Error Interpretation

## 9.1 Total Performance - Business Perspective

## 9.2 Machine Learning Performance

# 10 Model Deployment