# Usando o Spark para fazer uma breve EDA e depois aplicar Undersampling

- Chamando o PySpark e criando uma sessão

In [1]:
from pyspark import SparkContext

In [2]:
# Reuse the running SparkContext when this cell is re-executed: calling the
# SparkContext constructor a second time in the same kernel raises ValueError.
from pyspark import SparkConf

sc = SparkContext.getOrCreate(SparkConf().setMaster('local[2]'))

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/08 20:41:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Echo the context object so the notebook displays it as the cell output.
sc

In [6]:
import warnings

from pyspark.sql import SparkSession

# NOTE: this silences every Python warning for the rest of the kernel session,
# including deprecation warnings raised by pyspark and pandas.
warnings.filterwarnings('ignore')

# Use the same master as the SparkContext created earlier ('local[2]').
# getOrCreate() returns the already-running session/context when one exists,
# so a different master here would be silently ignored on a top-to-bottom run
# and would only take effect (inconsistently) if this cell were run first.
spark = SparkSession.builder.master("local[2]").appName("MyApp").getOrCreate()

- Lendo o dataset utilizando o spark

In [7]:
path = '1. Dados/fraud_detection_dataset.csv'

# header=True     -> take the column names from the first line of the file.
# inferSchema     -> let Spark infer each column's type from the data
#                    (otherwise every column is read as a string).
df = (
    spark.read
    .options(inferSchema="true", header=True)
    .csv(path)
)
df.show(n=5)

                                                                                

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|      

- Analisando o tipo dos dados e se cada coluna aceita nulos (nullable), similar ao pandas.info()

In [8]:
# Print each column's name, inferred type and nullable flag as a tree.
df.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)



- Similar ao pandas.describe()

In [9]:
# Summary statistics (count, mean, stddev, ...) for every column of the full
# dataset, printed as a table.
df_stats = df.summary()
df_stats.show()

[Stage 5:>                                                          (0 + 1) / 1]

+-------+------------------+--------+-----------------+-----------+-----------------+------------------+-----------+------------------+------------------+--------------------+--------------------+
|summary|              step|    type|           amount|   nameOrig|    oldbalanceOrg|    newbalanceOrig|   nameDest|    oldbalanceDest|    newbalanceDest|             isFraud|      isFlaggedFraud|
+-------+------------------+--------+-----------------+-----------+-----------------+------------------+-----------+------------------+------------------+--------------------+--------------------+
|  count|           6362620| 6362620|          6362620|    6362620|          6362620|           6362620|    6362620|           6362620|           6362620|             6362620|             6362620|
|   mean|243.39724563151657|    null|179861.9035491287|       null|833883.1040744764| 855113.6685785812|       null|1100701.6665196533|1224996.3982019224|0.001290820448180152| 2.51468734577894E-6|
| stddev|142.33

                                                                                

- Agrupando por isFraud para ver os dados

In [10]:
# Mean of every numeric column, computed separately for each value of the
# isFraud label.
fraud_groups = df.groupBy('isFraud')
fraud_groups.avg().show()



+-------+------------------+------------------+------------------+-------------------+-------------------+-------------------+------------+--------------------+
|isFraud|         avg(step)|       avg(amount)|avg(oldbalanceOrg)|avg(newbalanceOrig)|avg(oldbalanceDest)|avg(newbalanceDest)|avg(isFraud)| avg(isFlaggedFraud)|
+-------+------------------+------------------+------------------+-------------------+-------------------+-------------------+------------+--------------------+
|      1| 368.4138560818215|1467967.2991403872|1649667.6057116736| 192392.63183611355|   544249.619074638| 1279707.6171459882|         1.0|0.001948131011810...|
|      0|243.23566306029815|178197.04172740472| 832828.7117272523|   855970.228108804| 1101420.8745693846| 1224925.6845631544|         0.0|                 0.0|
+-------+------------------+------------------+------------------+-------------------+-------------------+-------------------+------------+--------------------+



                                                                                

- Separando em treino e teste (70/30)

In [13]:
# Fix the seed so the 70/30 train/test split is the same on every run;
# without it randomSplit draws a new random seed each time the cell executes.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

- Contabilizando a razão (ratio) entre isFraud = 0 e isFraud = 1 para iniciar o Undersampling

In [14]:
# Split the training set by class label.
major_df = train_df.filter("isFraud==0")
minor_df = train_df.filter("isFraud== 1")

# How many majority-class rows exist per minority-class row, truncated to a
# whole number. Each count() triggers its own Spark job.
n_major = major_df.count()
n_minor = minor_df.count()
ratio = int(n_major / n_minor)
print("ratio: {}".format(ratio))



ratio: 773


                                                                                

- Criando um dataset de treino com aproximadamente 50% de fraudes e 50% de não fraudes

In [15]:
# Downsample the majority class to roughly the size of the minority class.
# sample() keeps each row independently with probability `fraction`, so the
# result is only approximately balanced; the fixed seed makes the draw
# repeatable across runs.
sampled_majority_df = major_df.sample(withReplacement=False, fraction=1 / ratio, seed=42)

# union() appends rows by column position without de-duplicating; it replaces
# the legacy unionAll() alias with identical semantics.
combined_df_2 = sampled_majority_df.union(minor_df)
combined_df_2.show()

[Stage 15:>                                                         (0 + 1) / 1]

+----+--------+----------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|    amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+----------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT|    634.41|  C91220956|     18396.16|      17761.75| M999221400|           0.0|           0.0|      0|             0|
|   1| PAYMENT|   6623.21| C380616082|       1036.0|           0.0| M744316958|           0.0|           0.0|      0|             0|
|   2| PAYMENT|    8068.5|C1973281540|      19934.0|       11865.5|M1447678234|           0.0|           0.0|      0|             0|
|   3| PAYMENT|  22415.68| C856937591|      31164.0|       8748.32| M741095751|           0.0|           0.0|      0|             0|
|   3|TRANSFER| 521678.46| C954742840|          0.0|           0.0| C

                                                                                

- Visualizando a descrição do novo dataset

In [16]:
# Summary statistics of the undersampled training set; the mean of isFraud
# shows how close the classes are to a 50/50 balance.
balanced_stats = combined_df_2.summary()
balanced_stats.show()



+-------+------------------+--------+-----------------+-----------+------------------+------------------+-----------+------------------+------------------+------------------+--------------------+
|summary|              step|    type|           amount|   nameOrig|     oldbalanceOrg|    newbalanceOrig|   nameDest|    oldbalanceDest|    newbalanceDest|           isFraud|      isFlaggedFraud|
+-------+------------------+--------+-----------------+-----------+------------------+------------------+-----------+------------------+------------------+------------------+--------------------+
|  count|             11541|   11541|            11541|      11541|             11541|             11541|      11541|             11541|             11541|             11541|               11541|
|   mean| 305.6164110562343|    null| 830593.909984404|       null|1241769.1279663814| 523593.4608855386|       null| 790294.0138211594|1226811.8251633309| 0.498310371718222|0.001039771250324...|
| stddev|193.9448873

                                                                                

- Criando um novo arquivo .csv com o undersampling dataset

In [17]:
# Collect the undersampled training set to the driver as a pandas DataFrame
# and write it out as a single CSV file.
# NOTE(review): to_csv is left at its default index=True, so the file gets an
# extra unnamed index column; confirm downstream readers expect it.
balanced_train_pdf = combined_df_2.toPandas()
balanced_train_pdf.to_csv('balanceado_train.csv')

                                                                                

In [None]:
# Collect the held-out test split to the driver and write it as a single CSV.
# NOTE(review): test_df is the raw 30% split and is NOT undersampled, despite
# the "balanceado" file name; confirm the name is intentional.
# NOTE(review): toPandas() loads the whole split into driver memory.
test_df.toPandas().to_csv(path_or_buf='balanceado_test.csv')