## Análise de Crédito

In [None]:
# Bootstrap via the findspark package; this cell runs before the pyspark imports in the next cell.
# NOTE(review): presumably this makes the local Spark installation importable — confirm against the findspark docs.
import findspark
findspark.init()

In [None]:
import math
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import PCA
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Build the SparkSession
# SparkSession is imported in this cell because no earlier cell brings it into
# scope; without it the builder expression below raises NameError.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Nome do Projeto")
    .config("spark.executor.memory", "6gb")
    .config('spark.sql.debug.maxToStringFields', 2000)
    .config('spark.debug.maxToStringFields', 2000)
    .config("spark.sql.caseSensitive", "false")
    .getOrCreate()
)

# SparkContext handle, used by the following cells for the RDD API (sc.textFile).
sc = spark.sparkContext

In [None]:
# Load the data file as an RDD of raw text lines (the header line is still present at this point).
bankRDD = sc.textFile("data/bank.csv")

In [None]:
# Mark the raw RDD for caching; the next cells call count/take/first/filter on it.
bankRDD.cache()

In [None]:
# Number of lines in the raw RDD (the header has not been removed yet).
bankRDD.count()

In [None]:
# Preview the first five raw lines.
bankRDD.take(5)

In [None]:
# Remove the first line of the file (the header): keep only the lines that
# differ from it, then count what is left.
firstLine = bankRDD.first()
bankRDD2 = bankRDD.filter(lambda x: x != firstLine)
bankRDD2.count()

## Limpeza dos Dados

In [None]:
# Convert the raw records to numeric values
def transformToNumeric(inputStr):
    """Parse one semicolon-separated text line into a Row of floats.

    Double quotes are stripped from the line before it is split on ";".
    Fields are read by position: 0 (age), 2 (marital status), 3 (education),
    4 (default), 5 (balance), 7 (loan) and 16 (outcome).

    Yes/no style fields map "no" to 0.0 and any other value to 1.0.
    Marital status and education are expanded into one 0.0/1.0 indicator per
    category; a value matching none of the listed categories yields all zeros.

    Returns:
        A Row with the fields OUTCOME, AGE, SINGLE, MARRIED, DIVORCED,
        PRIMARY, SECONDARY, TERTIARY, DEFAULT, BALANCE and LOAN.
    """
    fields = inputStr.replace("\"", "").split(";")

    # Scalars are parsed in the original order (age, outcome, ..., balance) so
    # a malformed line fails on the same field as before.
    age = float(fields[0])
    outcome = float(fields[16] != "no")
    marital = fields[2]
    education = fields[3]
    default = float(fields[4] != "no")
    balance = float(fields[5])
    loan = float(fields[7] != "no")

    # Build the row with the transformed values
    return Row(
        OUTCOME=outcome,
        AGE=age,
        SINGLE=float(marital == "single"),
        MARRIED=float(marital == "married"),
        DIVORCED=float(marital == "divorced"),
        PRIMARY=float(education == "primary"),
        SECONDARY=float(education == "secondary"),
        TERTIARY=float(education == "tertiary"),
        DEFAULT=default,
        BALANCE=balance,
        LOAN=loan,
    )

In [None]:
# Apply the cleaning function to every data line
bankRDD3 = bankRDD2.map(transformToNumeric)
# Preview the first 15 rows. take(15) fetches only those rows, whereas
# collect()[:15] ships the entire RDD to the driver before slicing.
bankRDD3.take(15)

## Análise Exploratória de Dados

In [None]:
# Convert the RDD of Row objects into a DataFrame
bankDF = spark.createDataFrame(bankRDD3)

In [None]:
# Descriptive statistics for the DataFrame columns
bankDF.describe().show()

In [None]:
# Correlation between OUTCOME and every non-string column (OUTCOME itself included).
# Column types come from the DataFrame schema (bankDF.dtypes) instead of
# fetching one row per column, so no extra Spark job runs per column and an
# empty DataFrame no longer fails with IndexError on the type check.
for nome_coluna, tipo_coluna in bankDF.dtypes:
    if tipo_coluna != "string":
        print("Correlação da variável OUTCOME com", nome_coluna, bankDF.stat.corr('OUTCOME', nome_coluna))

## Pré-Processamento dos Dados

In [None]:
# Build a (target, feature vector) pair from one row
def transformaVar(row):
    """Return a ``(label, features)`` tuple for one row.

    The label is the row's OUTCOME value. The features are a dense vector
    holding, in this order: AGE, BALANCE, DEFAULT, DIVORCED, LOAN, MARRIED,
    PRIMARY, SECONDARY, SINGLE, TERTIARY.
    """
    feature_columns = (
        "AGE", "BALANCE", "DEFAULT", "DIVORCED", "LOAN",
        "MARRIED", "PRIMARY", "SECONDARY", "SINGLE", "TERTIARY",
    )
    label = row["OUTCOME"]
    features = Vectors.dense([row[column] for column in feature_columns])
    return (label, features)

In [None]:
# Map every DataFrame row to a (label, features) pair via transformaVar.
bankRDD4 = bankDF.rdd.map(transformaVar)

In [None]:
# Display every (label, features) pair.
# NOTE(review): collect() returns the whole RDD to the driver; consider take(n) if only a preview is needed.
bankRDD4.collect()

In [None]:
# Rebind bankDF to a two-column DataFrame ("label", "features") built from the pairs;
# from this cell on, bankDF no longer holds the original per-attribute columns.
bankDF = spark.createDataFrame(bankRDD4,["label", "features"])
bankDF.select("label","features").show(10)

## Machine Learning

In [None]:
# Dimensionality reduction with PCA: fit on the "features" column with k = 3
# and write the result to "pcaFeatures"; only "label" and "pcaFeatures" are kept.
bankPCA = PCA(k = 3, inputCol = "features", outputCol = "pcaFeatures")
pcaModel = bankPCA.fit(bankDF)
pcaResult = pcaModel.transform(bankDF).select("label","pcaFeatures")
pcaResult.show(truncate = False)

In [None]:
# Index the "label" column into "indexed"; per the original author's note,
# indexing is a prerequisite for decision-tree models.
stringIndexer = StringIndexer(inputCol = "label", outputCol = "indexed")
si_model = stringIndexer.fit(pcaResult)
obj_final = si_model.transform(pcaResult)
# NOTE(review): collect() returns every row to the driver just for display.
obj_final.collect()

In [None]:
# Training and test data (70% / 30% split).
# A fixed seed is passed so re-running the notebook does not draw a different
# random split each time; without it the row counts and the accuracy reported
# further down change from run to run.
(dados_treino, dados_teste) = obj_final.randomSplit([0.7, 0.3], seed = 42)

In [None]:
# Number of rows in the training split.
dados_treino.count()

In [None]:
# Number of rows in the test split.
dados_teste.count()

In [None]:
# Create the model: a random forest trained on the training split, using the
# "pcaFeatures" column as input and the "indexed" column as the label.
rfClassifer = RandomForestClassifier(labelCol = "indexed", featuresCol = "pcaFeatures")
modelo = rfClassifer.fit(dados_treino)

In [None]:
# Predictions on the test data; the selected columns are collected for display.
predictions = modelo.transform(dados_teste)
predictions.select("prediction", "indexed", "label", "pcaFeatures").collect()

In [None]:
# Evaluate accuracy: compares the "prediction" column against the "indexed" label column.
evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", labelCol = "indexed", metricName = "accuracy")
evaluator.evaluate(predictions)

In [None]:
# Confusion matrix: row counts for each (indexed label, prediction) combination.
predictions.groupBy("indexed", "prediction").count().show()

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()