# PROJET

# Import des bibliothèques

In [1]:
import findspark
findspark.init()
import pyspark

# import pyspark.sql module
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) the SparkSession for this notebook,
# using the 'local[*]' master and the application name 'FraudDetectionApp'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('FraudDetectionApp')
    .getOrCreate()
)

In [4]:
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

import os
# Move the working directory one level up.
# NOTE(review): presumably this makes the project root the cwd so that the
# `utils` package imported below resolves -- confirm against the repo layout.
os.chdir('..')

# Star imports: the helpers called later in this notebook (e.g.
# vectorize_fraud_data, lr_train, lr_eval_test) are not defined here, so they
# presumably come from these two modules -- TODO confirm.
from utils.preprocessing import *
from utils.model import *

import matplotlib.pyplot as plt
import seaborn as sns
# Default size applied to every matplotlib figure created afterwards.
plt.rcParams['figure.figsize'] = (18, 8)

# Import des données

In [None]:
# Load the pre-indexed transactions from CSV (comma-separated, first row is
# the header).
# inferSchema=True asks Spark to detect the column types. Without it every
# column is read as a string, which does not match the double/integer schema
# the rest of this notebook works with (feature vector and isFraud label).
indexed_data = spark.read.csv(r'C:\Users\User\Documents\M2big data\ProjetMLib\DetectionDeFraudeBancaire\input_data\newdata.csv',
                         sep=',',
                         header=True,
                         inferSchema=True)

In [None]:
# Preview the first five rows of the loaded DataFrame.
indexed_data.show(n=5)

+----+-------+--------------+--------------+-------+-------------+-------------+-----------+---------------+---------------+
|step| amount|newbalanceOrig|oldbalanceDest|isFraud|     num_orig|     num_dest|typeIndexed|nameOrigIndexed|nameDestIndexed|
+----+-------+--------------+--------------+-------+-------------+-------------+-----------+---------------+---------------+
| 1.0|  181.0|           0.0|       21182.0|      1| 8.40083671E8| 8.40083671E8|        0.0|            0.0|            0.0|
| 1.0|7861.64|     168225.59|           0.0|      0|1.912850431E9|1.912850431E9|        1.0|            0.0|            1.0|
| 1.0|9644.94|           0.0|       10845.0|      0|1.900366749E9|1.900366749E9|        4.0|            0.0|            0.0|
| 1.0|2560.74|       2509.26|           0.0|      0|1.648232591E9|1.648232591E9|        1.0|            0.0|            1.0|
| 1.0|1563.82|           0.0|           0.0|      0| 7.61750706E8| 7.61750706E8|        1.0|            0.0|            1.0|


In [None]:
# Print the column names, types and nullability of the loaded DataFrame.
indexed_data.printSchema()

root
 |-- step: double (nullable = true)
 |-- amount: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- num_orig: double (nullable = true)
 |-- num_dest: double (nullable = true)
 |-- typeIndexed: double (nullable = false)
 |-- nameOrigIndexed: double (nullable = false)
 |-- nameDestIndexed: double (nullable = false)



## Transformer les données à l’aide de VectorAssembler :

In [None]:
# Derive the DataFrame that carries the `features` vector column used below.
# NOTE(review): vectorize_fraud_data is not defined in this notebook; it
# presumably comes from utils.preprocessing and wraps a VectorAssembler
# (per the section heading) -- confirm.
vectorized_data = vectorize_fraud_data(indexed_data)

In [None]:
# Plain alias: new_df is the same DataFrame object as vectorized_data.
new_df = vectorized_data

In [None]:
# Preview the assembled feature vector next to the fraud label (first 5 rows).
new_df.select('features', 'isFraud').show(5)

+--------------------+-------+
|            features|isFraud|
+--------------------+-------+
|(10,[0,1,3,5,6],[...|      1|
|[7861.64,176087.2...|      0|
|[9644.94,4465.0,0...|      0|
|[2560.74,5070.0,2...|      0|
|[1563.82,450.0,0....|      0|
+--------------------+-------+
only showing top 5 rows



### Choix du meilleur modèle de classification pour catégoriser les transactions bancaires et classifier les nouvelles transactions

In [None]:
# Randomly split the data into a training set and a test set using weights
# 0.8 / 0.2 and seed 23.
train, test = new_df.randomSplit(weights=[0.8, 0.2], seed=23)
# Cell output: row counts of both splits, training set first.
[split.count() for split in (train, test)]

[2546215, 635592]

### Modèle de classification :  Logistic Regression 

In [None]:
# Train on the training split.
# NOTE(review): lr_train is defined outside this notebook; presumably it fits
# a logistic regression (per the section heading) -- confirm.
lr_model = lr_train(train)

In [None]:
# Evaluate the trained model on the test split. Per the recorded cell output,
# the helper prints the isFraud/prediction count table plus recall,
# precision, F1, area under ROC and area under PR.
# NOTE(review): lr_eval_test is defined outside this notebook; what it
# returns is not visible here.
lr_eval = lr_eval_test(lr_model, test)

+-------+----------+------+
|isFraud|prediction| count|
+-------+----------+------+
|      1|       0.0|   415|
|      0|       0.0|634751|
|      1|       1.0|   380|
|      0|       1.0|    46|
+-------+----------+------+

Recall :  0.4779874213836478
Precision :  0.892018779342723
F1 Score :  0.6224406224406225
Area under ROC = 0.9909929508193294
Area under PR = 0.5130515281003208


### Modèle de classification :  Decision Tree Classifier

In [None]:
# Train on the training split.
# NOTE(review): Dt_train is defined outside this notebook; presumably it fits
# a decision tree classifier (per the section heading) -- confirm.
dt_model = Dt_train(train)

In [None]:
# Evaluate the trained model on the test split. Per the recorded cell output,
# the helper shows the first 5 prediction rows, then the isFraud/prediction
# count table plus recall, precision, F1, area under ROC and area under PR.
# NOTE(review): Dt_eval_test is defined outside this notebook; what it
# returns is not visible here.
dt_eval = Dt_eval_test(dt_model, test)

+--------------------+---------------+-----------+----------+
|            features|  rawPrediction|probability|prediction|
+--------------------+---------------+-----------+----------+
|[23.31,45360.0,45...|[1437028.0,0.0]|  [1.0,0.0]|       0.0|
|(10,[0,5,6,7,9],[...|[1437028.0,0.0]|  [1.0,0.0]|       0.0|
|[112.56,609035.85...|[1437028.0,0.0]|  [1.0,0.0]|       0.0|
|[154.87,9339.0,91...|[1437028.0,0.0]|  [1.0,0.0]|       0.0|
|[339.82,12076.0,1...|[1437028.0,0.0]|  [1.0,0.0]|       0.0|
+--------------------+---------------+-----------+----------+
only showing top 5 rows

+-------+----------+------+
|isFraud|prediction| count|
+-------+----------+------+
|      1|       0.0|   244|
|      0|       0.0|634761|
|      1|       1.0|   551|
|      0|       1.0|    36|
+-------+----------+------+

Recall :  0.6930817610062893
Precision :  0.938671209540034
F1 Score :  0.7973950795947901
Area under ROC = 0.7798224050687704
Area under PR = 0.4142278715874976


### Modèle de classification :  Random Forest Classifier

In [None]:
# Train on the training split.
# NOTE(review): rf_train is defined outside this notebook; presumably it fits
# a random forest classifier (per the section heading) -- confirm.
rf_model = rf_train(train)

In [None]:
# Evaluate the trained model on the test split. Per the recorded cell output,
# the helper prints the isFraud/prediction count table plus recall,
# precision, F1, area under ROC and area under PR.
# NOTE(review): rf_eval_test is defined outside this notebook; what it
# returns is not visible here.
rf_eval = rf_eval_test(rf_model, test)

+-------+----------+------+
|isFraud|prediction| count|
+-------+----------+------+
|      1|       0.0|   479|
|      0|       0.0|634796|
|      1|       1.0|   316|
|      0|       1.0|     1|
+-------+----------+------+

Recall :  0.39748427672955977
Precision :  0.9968454258675079
F1 Score :  0.5683453237410073
Area under ROC = 0.9692023953420934
Area under PR = 0.7132785942938621


### Modèle de classification :  Gradient-Boosted Tree Classifier 

In [None]:
# Train on the training split.
# NOTE(review): GBT_train is defined outside this notebook; presumably it fits
# a gradient-boosted tree classifier (per the section heading) -- confirm.
GBT_model = GBT_train(train)

In [None]:
# Evaluate the trained model on the test split. Per the recorded cell output,
# the helper prints the isFraud/prediction count table plus recall,
# precision, F1, area under ROC and area under PR.
# NOTE(review): GBT_eval_test is defined outside this notebook; what it
# returns is not visible here.
GBT_eval = GBT_eval_test(GBT_model, test)

+-------+----------+------+
|isFraud|prediction| count|
+-------+----------+------+
|      1|       0.0|   237|
|      0|       0.0|634783|
|      1|       1.0|   558|
|      0|       1.0|    14|
+-------+----------+------+

Recall :  0.7018867924528301
Precision :  0.9755244755244755
F1 Score :  0.8163862472567667
Area under ROC = 0.9829058015208803
Area under PR = 0.7807439732858391


### Modèle de classification :  Naive Bayes

In [None]:
# Train on the training split.
# NOTE(review): NB_train is defined outside this notebook; presumably it fits
# a naive Bayes classifier (per the section heading) -- confirm.
NB_model = NB_train(train)

In [None]:
# Evaluate the trained model on the test split. Per the recorded cell output,
# the helper prints the isFraud/prediction count table plus recall,
# precision, F1, area under ROC and area under PR.
# NOTE(review): NB_eval_test is defined outside this notebook; what it
# returns is not visible here.
NB_eval = NB_eval_test(NB_model, test)

+-------+----------+------+
|isFraud|prediction| count|
+-------+----------+------+
|      1|       0.0|   355|
|      0|       0.0|596227|
|      1|       1.0|   440|
|      0|       1.0| 38570|
+-------+----------+------+

Recall :  0.5534591194968553
Precision :  0.011279159189951295
F1 Score :  0.022107775405099866
Area under ROC = 0.49412195289727784
Area under PR = 0.0012358315240476004


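**Observation :** Le Gradient-Boosted Tree Classifier offre le meilleur compromis global : il obtient le meilleur rappel (0,70), le meilleur F1 (0,82) et la meilleure aire sous la courbe PR (0,78). Seules la régression logistique (aire sous la courbe ROC : 0,99 contre 0,98) et le Random Forest (précision : 0,997 contre 0,976) le devancent légèrement sur une métrique isolée. C'est donc ce modèle qui sera utilisé pour classer les nouvelles transactions.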