## Classificação Binária

Uso do algoritmo RandomForest para construir um modelo preditivo capaz de prever se um cliente pagará ou não um empréstimo bancário.

In [3]:
# Importa o findspark e inicializa
import findspark
findspark.init()

In [5]:
# Imports
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import PCA
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### Carregando os Dados

In [6]:
# Criando o Spark Context
sc = SparkContext(appName = 'Classificacao Binaria')

23/03/21 08:57:11 WARN Utils: Your hostname, ingo-Vostro-3583 resolves to a loopback address: 127.0.1.1; using 192.168.1.10 instead (on interface wlo1)
23/03/21 08:57:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/21 08:57:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
sc.setLogLevel('ERROR')

In [8]:
# Spark Session - usada quando se trabalha com Dataframes no Spark
spSession = SparkSession.builder.master('local').getOrCreate()

In [9]:
# Carregando os dados e gerando um RDD
bankRDD = sc.textFile('dados/dataset3.csv')

In [10]:
bankRDD.cache()

dados/dataset3.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [11]:
bankRDD.count()

                                                                                

542

In [12]:
bankRDD.take(5)

['"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"',
 '30;"unemployed";"married";"primary";"no";1787;"no";"no";"cellular";19;"oct";79;1;-1;0;"unknown";"no"',
 '33;"services";"married";"secondary";"no";4789;"yes";"yes";"cellular";11;"may";220;1;339;4;"failure";"yes"',
 '35;"management";"single";"tertiary";"no";1350;"yes";"no";"cellular";16;"apr";185;1;330;1;"failure";"yes"',
 '30;"management";"married";"tertiary";"no";1476;"yes";"yes";"unknown";3;"jun";199;4;-1;0;"unknown";"yes"']

In [15]:
# Removendo a primeira linha do arquivo (cabeçalho)
firstLine = bankRDD.first()
bankRDD2 = bankRDD.filter(lambda x: x!= firstLine)
bankRDD2.count()

541

### Limpeza e Transofmração dos Dados

Vou usar somente algumas variáveis originais e então criar novas variáveis. Essa decisão pode ser baseada no conhecimento da área de negócio e a fim de evitar possíveis preconceitos. Podemos usar outras técnicas de seleção de atributos.

In [33]:
# Transformando os dados para valores numéricos
def transformToNumeric(inputStr):
    
    # Em cada linha faz substituição de caracteres e separa as colunas
    attList = inputStr.replace('\"','').split(';')
    
    # Converte de int para float a fim de aumentar a precisão de cálculos
    age = float(attList[0])
    balance = float(attList[5])
    
    # Aplica One-Hot Enconding criando variáveis dummy
    single = 1.0 if attList[2] == 'single' else 0.0
    married = 1.0 if attList[2] == 'married' else 0.0
    divorced = 1.0 if attList[2] == 'divorced' else 0.0
    primary = 1.0 if attList[3] == 'primary' else 0.0
    secondary = 1.0 if attList[3] == 'secondary' else 0.0
    tertiary = 1.0 if attList[3] == 'tertiary' else 0.0
    
    # Aplica labal encoding convertendo a variável categórica para sua representação numérica
    default = 0.0 if attList[4] == 'no' else 1.0
    loan = 0.0 if attList[7] == 'no' else 1.0
    outcome = 0.0 if attList[16] == 'no' else 1.0
    
    # Cria as linhas com os atributos transformados
    linhas = Row(OUTCOME = outcome,
                 AGE = age,
                 SINGLE = single,
                 MARRIED = married,
                 DIVORCED = divorced,
                 PRIMARY = primary,
                 SECONDARY = secondary,
                 TERTIARY = tertiary,
                 DEFAULT = default,
                 BALANCE = balance,
                 LOAN = loan)
    return linhas
     

In [34]:
# Aplicando a função de limpeza ao conjunto de dados
bankRDD3 = bankRDD2.map(transformToNumeric)

In [35]:
bankRDD3.take(10)

[Row(OUTCOME=0.0, AGE=30.0, SINGLE=0.0, MARRIED=1.0, DIVORCED=0.0, PRIMARY=1.0, SECONDARY=0.0, TERTIARY=0.0, DEFAULT=0.0, BALANCE=1787.0, LOAN=0.0),
 Row(OUTCOME=1.0, AGE=33.0, SINGLE=0.0, MARRIED=1.0, DIVORCED=0.0, PRIMARY=0.0, SECONDARY=1.0, TERTIARY=0.0, DEFAULT=0.0, BALANCE=4789.0, LOAN=1.0),
 Row(OUTCOME=1.0, AGE=35.0, SINGLE=1.0, MARRIED=0.0, DIVORCED=0.0, PRIMARY=0.0, SECONDARY=0.0, TERTIARY=1.0, DEFAULT=0.0, BALANCE=1350.0, LOAN=0.0),
 Row(OUTCOME=1.0, AGE=30.0, SINGLE=0.0, MARRIED=1.0, DIVORCED=0.0, PRIMARY=0.0, SECONDARY=0.0, TERTIARY=1.0, DEFAULT=0.0, BALANCE=1476.0, LOAN=1.0),
 Row(OUTCOME=0.0, AGE=59.0, SINGLE=0.0, MARRIED=1.0, DIVORCED=0.0, PRIMARY=0.0, SECONDARY=1.0, TERTIARY=0.0, DEFAULT=0.0, BALANCE=0.0, LOAN=0.0),
 Row(OUTCOME=1.0, AGE=35.0, SINGLE=1.0, MARRIED=0.0, DIVORCED=0.0, PRIMARY=0.0, SECONDARY=0.0, TERTIARY=1.0, DEFAULT=0.0, BALANCE=747.0, LOAN=0.0),
 Row(OUTCOME=1.0, AGE=36.0, SINGLE=0.0, MARRIED=1.0, DIVORCED=0.0, PRIMARY=0.0, SECONDARY=0.0, TERTIARY=1.0, D

### Análise Exploratória de Dados

In [36]:
# Transforma para DataFrame
bankDF = spSession.createDataFrame(bankRDD3)

In [37]:
# Estatística Descritiva
bankDF.describe().show()

+-------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+--------------------+------------------+-------------------+
|summary|            OUTCOME|               AGE|            SINGLE|           MARRIED|           DIVORCED|           PRIMARY|         SECONDARY|          TERTIARY|             DEFAULT|           BALANCE|               LOAN|
+-------+-------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+--------------------+------------------+-------------------+
|  count|                541|               541|               541|               541|                541|               541|               541|               541|                 541|               541|                541|
|   mean| 0.3974121996303142| 41.26987060998152|0.2754158964879852|0.6155268022181146|0.1090573012939001

In [38]:
# Correlação entre as variáveis
for i in bankDF.columns:
    if not ( isinstance(bankDF.select(i).take(1)[0][0], str)) :
        print( 'Correlação da variável OUTCOME com:', i, bankDF.corr('OUTCOME', i))

Correlação da variável OUTCOME com: OUTCOME 1.0
Correlação da variável OUTCOME com: AGE -0.1823210432736525
Correlação da variável OUTCOME com: SINGLE 0.46323284934360515
Correlação da variável OUTCOME com: MARRIED -0.37532412991335623
Correlação da variável OUTCOME com: DIVORCED -0.07812659940926987
Correlação da variável OUTCOME com: PRIMARY -0.12561548832677982
Correlação da variável OUTCOME com: SECONDARY 0.026392774894072973
Correlação da variável OUTCOME com: TERTIARY 0.08494840766635618
Correlação da variável OUTCOME com: DEFAULT -0.04536965206737378
Correlação da variável OUTCOME com: BALANCE 0.03657486611997681
Correlação da variável OUTCOME com: LOAN -0.030420586112717318


Como ja foi removido muitas variáveis do dataset, levaremos para o modelo todas as variáveis, mesmo as que possuem uma baixa correlação com a variável target

### Pré-Processamento dos Dados

In [40]:
# Criando um LabeledPoint (target, Vector[features])
def transformaVar(row):
    obj = (row['OUTCOME'], Vectors.dense([row['AGE'],
                                          row['BALANCE'],
                                          row['DEFAULT'],
                                          row['DIVORCED'],
                                          row['LOAN'],
                                          row['MARRIED'],
                                          row['PRIMARY'],
                                          row['SECONDARY'],
                                          row['SINGLE'],
                                          row['SINGLE'],
                                          row['TERTIARY']]))
    return obj

In [42]:
# Aplica a função
bankRDD4 = bankDF.rdd.map(transformaVar)

In [43]:
bankRDD4.collect()

[(0.0,
  DenseVector([30.0, 1787.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0])),
 (1.0,
  DenseVector([33.0, 4789.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0])),
 (1.0,
  DenseVector([35.0, 1350.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0])),
 (1.0,
  DenseVector([30.0, 1476.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0])),
 (0.0, DenseVector([59.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0])),
 (1.0,
  DenseVector([35.0, 747.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0])),
 (1.0,
  DenseVector([36.0, 307.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0])),
 (0.0,
  DenseVector([39.0, 147.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0])),
 (0.0,
  DenseVector([41.0, 221.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0])),
 (1.0,
  DenseVector([43.0, -88.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0])),
 (0.0,
  DenseVector([39.0, 9374.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0])),
 (0.0,
  DenseVector([43.0, 264.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0])),
 (0

In [45]:
# Converte o RDD em DataFrame
bankDF = spSession.createDataFrame(bankRDD4, ['label', 'features'])

In [46]:
bankDF.select('features', 'label').show(10)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[30.0,1787.0,0.0,...|  0.0|
|[33.0,4789.0,0.0,...|  1.0|
|[35.0,1350.0,0.0,...|  1.0|
|[30.0,1476.0,0.0,...|  1.0|
|[59.0,0.0,0.0,0.0...|  0.0|
|[35.0,747.0,0.0,0...|  1.0|
|[36.0,307.0,0.0,0...|  1.0|
|[39.0,147.0,0.0,0...|  0.0|
|[41.0,221.0,0.0,0...|  0.0|
|[43.0,-88.0,0.0,0...|  1.0|
+--------------------+-----+
only showing top 10 rows



### Redução de Dimensionalidade com PCA
A redução de dimensionalidade deve ser aplicada quando o númeoro de variáveis preditoras for muito alta.

In [47]:
# Cria o objeto PCA com 3 componentes
bankPCA = PCA(k = 3, inputCol = 'features', outputCol = 'pcaFeatures')

In [48]:
# Treina o modelo
pcaModel = bankPCA.fit(bankDF)

[Stage 46:>                                                         (0 + 2) / 2]                                                                                

In [49]:
# Aplica o modelo PCA para reduzir a dimensionalidade
pcaResult = pcaModel.transform(bankDF).select('label', 'pcaFeatures')

In [50]:
# A informação contida nas variáveis preditoras está agora consolidada em 3 componentes, pra cada linha.
pcaResult.show(truncate = False)

+-----+------------------------------------------------------------+
|label|pcaFeatures                                                 |
+-----+------------------------------------------------------------+
|0.0  |[-1787.0188971972239,28.857529701336283,0.1266291312972141] |
|1.0  |[-4789.020177138077,29.917847882598316,0.053292610767770504]|
|1.0  |[-1350.022213576835,34.077829984525,2.076966626087616]      |
|1.0  |[-1476.0189517183253,29.046732264823223,0.22896055408606056]|
|0.0  |[-0.03788918535837401,58.980330709829545,0.8347761558607867]|
|1.0  |[-747.0223381771169,34.46494761811958,2.0889980872297267]   |
|1.0  |[-307.023069102228,35.794164025123,0.41477695315073215]     |
|0.0  |[-147.02501216174548,38.89489745628777,0.308404639262873]   |
|0.0  |[-221.02629853485416,40.847140155182636,0.5473525013901758] |
|1.0  |[87.97238687688544,43.05581193510255,0.49881521556787134]   |
|0.0  |[-9374.023105550134,32.97129147755279,0.12430133584951678]  |
|0.0  |[-264.0275573152553,42.8179

In [52]:
# Indexação do label é pré-requisito para Decision Trees
stringIndexer = StringIndexer(inputCol = 'label', outputCol = 'label_indexed')
si_model = stringIndexer.fit(pcaResult)
obj_final = si_model.transform(pcaResult)
obj_final.collect()

[Stage 48:>                                                         (0 + 2) / 2]                                                                                

[Row(label=0.0, pcaFeatures=DenseVector([-1787.0189, 28.8575, 0.1266]), label_indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([-4789.0202, 29.9178, 0.0533]), label_indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-1350.0222, 34.0778, 2.077]), label_indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-1476.019, 29.0467, 0.229]), label_indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-0.0379, 58.9803, 0.8348]), label_indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([-747.0223, 34.4649, 2.089]), label_indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-307.0231, 35.7942, 0.4148]), label_indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-147.025, 38.8949, 0.3084]), label_indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-221.0263, 40.8471, 0.5474]), label_indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([87.9724, 43.0558, 0.4988]), label_indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-9374.0231, 32.9713, 0.1243]), label_indexed=0.0),
 Ro

### Machine Learning

In [53]:
# Dados de Treino e de Teste
(dados_treino, dados_teste) = obj_final.randomSplit([0.7, 0.3])

In [54]:
dados_treino.count()

[Stage 52:>                                                         (0 + 2) / 2]                                                                                

383

In [55]:
dados_teste.count()

158

In [58]:
# Criando o objeto
rfClassifier = RandomForestClassifier(labelCol = 'label_indexed', featuresCol = 'pcaFeatures')

In [59]:
# Treinando o objeto e criando o modelo
modelo = rfClassifier.fit(dados_treino)

In [60]:
# Previsões com dados de teste
predictions = modelo.transform(dados_teste)

In [61]:
predictions

DataFrame[label: double, pcaFeatures: vector, label_indexed: double, rawPrediction: vector, probability: vector, prediction: double]

In [62]:
predictions.select('label', 'label_indexed', 'pcaFeatures', 'prediction').collect()

[Stage 72:>                                                         (0 + 2) / 2]                                                                                

[Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-16873.0325, 45.1494, 0.4196]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-11494.0342, 49.6022, 0.5531]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-8104.0336, 49.7795, 0.5684]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-7082.0351, 52.4461, 0.7276]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-5996.0302, 45.1355, 0.4534]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-5426.0252, 37.5056, 0.4435]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-4030.0229, 34.403, 0.1786]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-3762.0275, 41.5725, 0.5552]), prediction=0.0),
 Row(label=0.0, label_indexed=0.0, pcaFeatures=DenseVector([-3571.025, 37.6969, 0.4543]), prediction=0.0),
 Row(label=0.0, label_indexe

In [64]:
# Avaliando a acurácia
evaluator = MulticlassClassificationEvaluator(predictionCol = 'prediction',
                                              labelCol = 'label_indexed',
                                              metricName = 'accuracy')

In [66]:
evaluator.evaluate(predictions)

0.759493670886076

In [67]:
# Confusion Matrix
predictions.groupBy('label_indexed', 'prediction').count().show()

+-------------+----------+-----+
|label_indexed|prediction|count|
+-------------+----------+-----+
|          1.0|       1.0|   39|
|          0.0|       1.0|   14|
|          1.0|       0.0|   24|
|          0.0|       0.0|   81|
+-------------+----------+-----+

