# Détection de fraude bancaire

## Import des bibliothèques

In [1]:
import findspark
findspark.init()
import pyspark

# import pyspark.sql module
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session, running locally with the 'local[*]' master
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('FraudDetectionApp')
    .getOrCreate()
)

In [2]:
import os
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Move one directory up before importing the `utils` helpers (presumably the
# package lives in the parent directory of this notebook — TODO confirm).
# Guarded so that re-running this cell does not climb one more level each time.
if not os.path.isdir('utils'):
    os.chdir('..')

# NOTE(review): star imports hide which helper comes from which module;
# prefer explicit names once the list of helpers actually used is confirmed.
from utils.preprocessing import *
from utils.model import *

# Default figure size for every matplotlib/seaborn plot in this notebook.
plt.rcParams['figure.figsize'] = (18, 8)

## Import des données

In [3]:
# Location of the raw transactions CSV. Defaults to the original author's local
# path; set the FRAUD_DATA_CSV environment variable to load it from elsewhere
# without editing the notebook.
data_path = os.environ.get(
    'FRAUD_DATA_CSV',
    r'C:\Users\User\Documents\M2big data\ProjetMLib\DetectionDeFraudeBancaire\input_data\creditcardsdata.csv',
)

# Read the comma-separated file, taking column names from its first line.
# No schema is supplied here; the needed columns are cast explicitly later on.
df = spark.read.csv(data_path,
                    sep=',',
                    header=True)

# View the first five records
df.show(5)

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|      

In [4]:
# Keep a random sample of roughly half the rows (fraction 0.5, without
# replacement); the fixed seed makes the sample repeatable.
df = df.sample(withReplacement=False, fraction=0.5, seed=42)

## Prétraitement des données

In [5]:
from pyspark.sql.types import DoubleType

# Target type for each column that needs an explicit cast, applied in this order.
# withColumn on an existing name replaces the column in place, so the column
# order of the DataFrame is unchanged.
column_casts = (
    ("step", DoubleType()),
    ("amount", DoubleType()),
    ("newbalanceOrig", DoubleType()),
    ("oldbalanceDest", DoubleType()),
    ("isFraud", 'int'),
)

for column_name, target_type in column_casts:
    df = df.withColumn(column_name, df[column_name].cast(target_type))

In [6]:
# Remove the three columns that are not kept for the rest of the pipeline.
df = df.drop('isFlaggedFraud', 'newbalanceDest', 'oldbalanceOrg')

In [7]:
# Alias the cast/pruned DataFrame as `data`; later cells work on this name.
data = df

In [8]:
# Check the column types after the casts and drops above.
data.printSchema()

root
 |-- step: double (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)



In [9]:
# Preview the first five rows of the sampled, cast and pruned data.
data.show(5)

+----+--------+-------+-----------+--------------+-----------+--------------+-------+
|step|    type| amount|   nameOrig|newbalanceOrig|   nameDest|oldbalanceDest|isFraud|
+----+--------+-------+-----------+--------------+-----------+--------------+-------+
| 1.0|CASH_OUT|  181.0| C840083671|           0.0|  C38997010|       21182.0|      1|
| 1.0| PAYMENT|7861.64|C1912850431|     168225.59| M633326333|           0.0|      0|
| 1.0|   DEBIT|9644.94|C1900366749|           0.0| C997608398|       10845.0|      0|
| 1.0| PAYMENT|2560.74|C1648232591|       2509.26| M972865270|           0.0|      0|
| 1.0| PAYMENT|1563.82| C761750706|           0.0|M1731217984|           0.0|      0|
+----+--------+-------+-----------+--------------+-----------+--------------+-------+
only showing top 5 rows



In [10]:
# Preview only the three string-typed columns.
data.select('type', 'nameOrig', 'nameDest').show(5)

+--------+-----------+-----------+
|    type|   nameOrig|   nameDest|
+--------+-----------+-----------+
|CASH_OUT| C840083671|  C38997010|
| PAYMENT|C1912850431| M633326333|
|   DEBIT|C1900366749| C997608398|
| PAYMENT|C1648232591| M972865270|
| PAYMENT| C761750706|M1731217984|
+--------+-----------+-----------+
only showing top 5 rows



### Séparer nameOrig et nameDest en préfixe (lettre) et identifiant numérique

In [11]:
# Apply the two project helpers in sequence: origin name first, then
# destination name.
data = sepNameDest(sepNameOrig(data))

# Preview the result of the split
data.show(5)

+----+--------+-------+--------------+--------------+-------+--------+----------+--------+----------+
|step|    type| amount|newbalanceOrig|oldbalanceDest|isFraud|str_orig|  num_orig|str_dest|  num_dest|
+----+--------+-------+--------------+--------------+-------+--------+----------+--------+----------+
| 1.0|CASH_OUT|  181.0|           0.0|       21182.0|      1|       C| 840083671|       C|  38997010|
| 1.0| PAYMENT|7861.64|     168225.59|           0.0|      0|       C|1912850431|       M| 633326333|
| 1.0|   DEBIT|9644.94|           0.0|       10845.0|      0|       C|1900366749|       C| 997608398|
| 1.0| PAYMENT|2560.74|       2509.26|           0.0|      0|       C|1648232591|       M| 972865270|
| 1.0| PAYMENT|1563.82|           0.0|           0.0|      0|       C| 761750706|       M|1731217984|
+----+--------+-------+--------------+--------------+-------+--------+----------+--------+----------+
only showing top 5 rows



In [12]:
# Convert the "num_orig" and "num_dest" columns to numeric variables
# NOTE(review): in the output displayed after the indexing step further down,
# num_dest shows the same values as num_orig on every row, whereas the two
# columns differ in the preview just above this cell. One of the helpers applied
# from here on (create_numDf or the *_indexer functions) appears to overwrite
# num_dest — verify in utils.
data = create_numDf(data)


In [14]:
# List (column, type) pairs to confirm num_orig / num_dest are now numeric.
data.dtypes

[('step', 'double'),
 ('type', 'string'),
 ('amount', 'double'),
 ('newbalanceOrig', 'double'),
 ('oldbalanceDest', 'double'),
 ('isFraud', 'int'),
 ('str_orig', 'string'),
 ('num_orig', 'double'),
 ('str_dest', 'string'),
 ('num_dest', 'double')]

### Indexer les variables catégorielles type, str_orig et str_dest (indexation numérique, pas d'encodage one-hot à ce stade)

In [15]:
# Run the three project indexers in sequence: type, then origin name, then
# destination name.
data = nameDest_indexer(nameOrig_indexer(type_indexer(data)))

# `indexed_data` is another name for the same DataFrame object as `data`.
indexed_data = data

In [16]:
# List (column, type) pairs after indexing.
indexed_data.dtypes

[('step', 'double'),
 ('amount', 'double'),
 ('newbalanceOrig', 'double'),
 ('oldbalanceDest', 'double'),
 ('isFraud', 'int'),
 ('num_orig', 'double'),
 ('num_dest', 'double'),
 ('typeIndexed', 'double'),
 ('nameOrigIndexed', 'double'),
 ('nameDestIndexed', 'double')]

In [17]:
# Preview the first five rows of the indexed data.
indexed_data.show(5)

+----+-------+--------------+--------------+-------+-------------+-------------+-----------+---------------+---------------+
|step| amount|newbalanceOrig|oldbalanceDest|isFraud|     num_orig|     num_dest|typeIndexed|nameOrigIndexed|nameDestIndexed|
+----+-------+--------------+--------------+-------+-------------+-------------+-----------+---------------+---------------+
| 1.0|  181.0|           0.0|       21182.0|      1| 8.40083671E8| 8.40083671E8|        0.0|            0.0|            0.0|
| 1.0|7861.64|     168225.59|           0.0|      0|1.912850431E9|1.912850431E9|        1.0|            0.0|            1.0|
| 1.0|9644.94|           0.0|       10845.0|      0|1.900366749E9|1.900366749E9|        4.0|            0.0|            0.0|
| 1.0|2560.74|       2509.26|           0.0|      0|1.648232591E9|1.648232591E9|        1.0|            0.0|            1.0|
| 1.0|1563.82|           0.0|           0.0|      0| 7.61750706E8| 7.61750706E8|        1.0|            0.0|            1.0|


In [19]:
# Print the schema (types and nullability) of the indexed data.
indexed_data.printSchema()

root
 |-- step: double (nullable = true)
 |-- amount: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- num_orig: double (nullable = true)
 |-- num_dest: double (nullable = true)
 |-- typeIndexed: double (nullable = false)
 |-- nameOrigIndexed: double (nullable = false)
 |-- nameDestIndexed: double (nullable = false)



In [18]:
# Count the rows for each value of the indexed transaction type.
indexed_data.groupBy("typeIndexed").count().show()

+-----------+-------+
|typeIndexed|  count|
+-----------+-------+
|        0.0|1119671|
|        1.0|1075718|
|        4.0|  20830|
|        3.0| 265771|
|        2.0| 699817|
+-----------+-------+



In [None]:
# Persist the preprocessed DataFrame as CSV.
# header=True writes the column names into the output, mirroring how the raw
# file is read (header=True); without it the saved data loses its column names.
# NOTE(review): "/input_data" is an absolute path at the filesystem root —
# confirm this is the intended destination and not the project's input_data
# folder.
# NOTE(review): with Spark's default save mode the write fails if the path
# already exists, so this cell cannot be re-run as-is. No overwrite mode is set
# here because it would delete whatever the target directory already contains.
data.write.csv("/input_data", header=True)