### Obtenção dos dados

In [1]:
# Imports
import math
from pyspark.ml.linalg         import Vectors
from pyspark.sql               import Row
from pyspark.ml.feature        import StringIndexer
from pyspark.ml.feature        import PCA
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation     import MulticlassClassificationEvaluator

In [2]:
# Spark Session - usada quando se trabalha com Dataframes no Spark
spSession = SparkSession.builder.master("local").appName("DSA-Mini_Pr4").getOrCreate()

In [3]:
# Carregando os dados e gerando um RDD via textFile
# RDD treino
treino_RDD = sc.textFile('projeto4_telecom_treino.csv')
# RDD validacao
teste_RDD = sc.textFile('projeto4_telecom_teste.csv')

# colocar os RDD's em cache para melhorar performance
treino_RDD.cache()
teste_RDD.cache

<bound method RDD.cache of projeto4_telecom_teste.csv MapPartitionsRDD[3] at textFile at NativeMethodAccessorImpl.java:0>

In [4]:
# contagem de registros em treino:
treino_RDD.count()

3334

In [5]:
# contagem de registros em teste:
teste_RDD.count()

1668

In [6]:
# visualizar os primeiros registros em treino:
treino_RDD.take(3)

['"","state","account_length","area_code","international_plan","voice_mail_plan","number_vmail_messages","total_day_minutes","total_day_calls","total_day_charge","total_eve_minutes","total_eve_calls","total_eve_charge","total_night_minutes","total_night_calls","total_night_charge","total_intl_minutes","total_intl_calls","total_intl_charge","number_customer_service_calls","churn"',
 '"1","KS",128,"area_code_415","no","yes",25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10,3,2.7,1,"no"',
 '"2","OH",107,"area_code_415","no","yes",26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,"no"']

In [7]:
# visualizar os primeiros registros em teste:
teste_RDD.take(3)

['"","state","account_length","area_code","international_plan","voice_mail_plan","number_vmail_messages","total_day_minutes","total_day_calls","total_day_charge","total_eve_minutes","total_eve_calls","total_eve_charge","total_night_minutes","total_night_calls","total_night_charge","total_intl_minutes","total_intl_calls","total_intl_charge","number_customer_service_calls","churn"',
 '"1","HI",101,"area_code_510","no","no",0,70.9,123,12.05,211.9,73,18.01,236,73,10.62,10.6,3,2.86,3,"no"',
 '"2","MT",137,"area_code_510","no","no",0,223.6,86,38.01,244.8,139,20.81,94.2,81,4.24,9.5,7,2.57,0,"no"']

### Limpeza dos dados

In [8]:
# Removendo a primeira linha do arquivo (cabeçalho)
firstLine_treino = treino_RDD.first()
firstLine_teste  = teste_RDD.first()
treino_RDD2 = treino_RDD.filter(lambda x: x != firstLine_treino)
teste_RDD2  = teste_RDD.filter(lambda x: x != firstLine_teste)

In [9]:
# Transformando os dados para valores numéricos
# com um Pandas get_dummies manual (tem que ter outra forma, não é possível!!!)
def transformToNumeric(inputStr) :
    
    attList = inputStr.replace("\"","").split(',')
    
    account_length        = float(attList[2])
    area_code_408         = 1.0 if attList[3]  == 'area_code_408' else 0.0   
    area_code_415         = 1.0 if attList[3]  == 'area_code_415' else 0.0   
    area_code_510         = 1.0 if attList[3]  == 'area_code_510' else 0.0   
    international_plan_y  = 1.0 if attList[4]  == 'yes' else 0.0
    international_plan_n  = 1.0 if attList[4]  == 'no' else 0.0
    voice_mail_plan_y     = 1.0 if attList[5]  == 'yes' else 0.0
    voice_mail_plan_n     = 1.0 if attList[5]  == 'no' else 0.0    
    number_vmail_messages = float(attList[6])
    total_day_minutes     = float(attList[7])
    total_day_calls       = float(attList[8])
    total_day_charge      = float(attList[9])
    total_eve_minutes     = float(attList[10])
    total_eve_calls       = float(attList[11])
    total_eve_charge      = float(attList[12])
    total_night_minutes   = float(attList[13])
    total_night_calls     = float(attList[14])
    total_night_charge    = float(attList[15])
    total_intl_minutes    = float(attList[16])
    total_intl_calls      = float(attList[17])
    total_intl_charge     = float(attList[18])
    number_customer_service_calls = float(attList[19])
    churn                 = 1.0 if attList[20] == 'yes' else 0.0
    
    # Cria as linhas com os objetos transformados
    linhas = Row(ACCOUNT_LENGTH = account_length, AREA_CODE_408 = area_code_408,
                 AREA_CODE_415 = area_code_415, AREA_CODE_510 = area_code_510,
                 INTERNATIONAL_PLAN_Y = international_plan_y, INTERNATIONAL_PLAN_N = international_plan_n,
                 VOICE_MAIL_PLAN_Y = voice_mail_plan_y, VOICE_MAIL_PLAN_N = voice_mail_plan_n,
                 NUMBER_VMAIL_MESSAGES = number_vmail_messages, TOTAL_DAY_MINUTES = total_day_minutes,
                 TOTAL_DAY_CALLS = total_day_calls, TOTAL_DAY_CHARGE = total_day_charge,
                 TOTAL_EVE_MINUTES = total_eve_minutes, TOTAL_EVE_CALLS = total_eve_calls,
                 TOTAL_EVE_CHARGE = total_eve_charge, TOTAL_NIGHT_MINUTES = total_night_minutes,
                 TOTAL_NIGHT_CALLS = total_night_calls, TOTAL_NIGHT_CHARGE = total_night_charge,
                 TOTAL_INTL_MINUTES = total_intl_minutes, TOTAL_INTL_CALLS = total_intl_calls,
                 TOTAL_INTL_CHARGE = total_intl_charge, 
                 NUMBER_CUSTOMER_SERVICE_CALLS = number_customer_service_calls, CHURN = churn) 
    return linhas

In [10]:
# Aplicando a função de limpeza ao conjunto de dados
treino_RDD3 = treino_RDD2.map(transformToNumeric)
teste_RDD3  = teste_RDD2.map(transformToNumeric)

In [11]:
# Verificar resultado no dataset de treino
treino_RDD3.collect()[:13]

[Row(ACCOUNT_LENGTH=128.0, AREA_CODE_408=0.0, AREA_CODE_415=1.0, AREA_CODE_510=0.0, CHURN=0.0, INTERNATIONAL_PLAN_N=1.0, INTERNATIONAL_PLAN_Y=0.0, NUMBER_CUSTOMER_SERVICE_CALLS=1.0, NUMBER_VMAIL_MESSAGES=25.0, TOTAL_DAY_CALLS=110.0, TOTAL_DAY_CHARGE=45.07, TOTAL_DAY_MINUTES=265.1, TOTAL_EVE_CALLS=99.0, TOTAL_EVE_CHARGE=16.78, TOTAL_EVE_MINUTES=197.4, TOTAL_INTL_CALLS=3.0, TOTAL_INTL_CHARGE=2.7, TOTAL_INTL_MINUTES=10.0, TOTAL_NIGHT_CALLS=91.0, TOTAL_NIGHT_CHARGE=11.01, TOTAL_NIGHT_MINUTES=244.7, VOICE_MAIL_PLAN_N=0.0, VOICE_MAIL_PLAN_Y=1.0),
 Row(ACCOUNT_LENGTH=107.0, AREA_CODE_408=0.0, AREA_CODE_415=1.0, AREA_CODE_510=0.0, CHURN=0.0, INTERNATIONAL_PLAN_N=1.0, INTERNATIONAL_PLAN_Y=0.0, NUMBER_CUSTOMER_SERVICE_CALLS=1.0, NUMBER_VMAIL_MESSAGES=26.0, TOTAL_DAY_CALLS=123.0, TOTAL_DAY_CHARGE=27.47, TOTAL_DAY_MINUTES=161.6, TOTAL_EVE_CALLS=103.0, TOTAL_EVE_CHARGE=16.62, TOTAL_EVE_MINUTES=195.5, TOTAL_INTL_CALLS=3.0, TOTAL_INTL_CHARGE=3.7, TOTAL_INTL_MINUTES=13.7, TOTAL_NIGHT_CALLS=103.0, TOTA

In [12]:
# Verificar resultado no dataset de treino
teste_RDD3.collect()[:13]

[Row(ACCOUNT_LENGTH=101.0, AREA_CODE_408=0.0, AREA_CODE_415=0.0, AREA_CODE_510=1.0, CHURN=0.0, INTERNATIONAL_PLAN_N=1.0, INTERNATIONAL_PLAN_Y=0.0, NUMBER_CUSTOMER_SERVICE_CALLS=3.0, NUMBER_VMAIL_MESSAGES=0.0, TOTAL_DAY_CALLS=123.0, TOTAL_DAY_CHARGE=12.05, TOTAL_DAY_MINUTES=70.9, TOTAL_EVE_CALLS=73.0, TOTAL_EVE_CHARGE=18.01, TOTAL_EVE_MINUTES=211.9, TOTAL_INTL_CALLS=3.0, TOTAL_INTL_CHARGE=2.86, TOTAL_INTL_MINUTES=10.6, TOTAL_NIGHT_CALLS=73.0, TOTAL_NIGHT_CHARGE=10.62, TOTAL_NIGHT_MINUTES=236.0, VOICE_MAIL_PLAN_N=1.0, VOICE_MAIL_PLAN_Y=0.0),
 Row(ACCOUNT_LENGTH=137.0, AREA_CODE_408=0.0, AREA_CODE_415=0.0, AREA_CODE_510=1.0, CHURN=0.0, INTERNATIONAL_PLAN_N=1.0, INTERNATIONAL_PLAN_Y=0.0, NUMBER_CUSTOMER_SERVICE_CALLS=0.0, NUMBER_VMAIL_MESSAGES=0.0, TOTAL_DAY_CALLS=86.0, TOTAL_DAY_CHARGE=38.01, TOTAL_DAY_MINUTES=223.6, TOTAL_EVE_CALLS=139.0, TOTAL_EVE_CHARGE=20.81, TOTAL_EVE_MINUTES=244.8, TOTAL_INTL_CALLS=7.0, TOTAL_INTL_CHARGE=2.57, TOTAL_INTL_MINUTES=9.5, TOTAL_NIGHT_CALLS=81.0, TOTAL_NI

### Análise Exploratória de Dados

In [13]:
# Transforma para Dataframe
df_treino = spSession.createDataFrame(treino_RDD3)
df_teste  = spSession.createDataFrame(teste_RDD3)

In [14]:
# Estatísticas do dataset de treino:
df_treino.describe().show()

+-------+------------------+-------------------+------------------+-------------------+-------------------+--------------------+--------------------+-----------------------------+---------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+
|summary|    ACCOUNT_LENGTH|      AREA_CODE_408|     AREA_CODE_415|      AREA_CODE_510|              CHURN|INTERNATIONAL_PLAN_N|INTERNATIONAL_PLAN_Y|NUMBER_CUSTOMER_SERVICE_CALLS|NUMBER_VMAIL_MESSAGES|   TOTAL_DAY_CALLS|  TOTAL_DAY_CHARGE| TOTAL_DAY_MINUTES|   TOTAL_EVE_CALLS|  TOTAL_EVE_CHARGE|TOTAL_EVE_MINUTES|  TOTAL_INTL_CALLS| TOTAL_INTL_CHARGE|TOTAL_INTL_MINUTES| TOTAL_NIGHT_CALLS|TOTAL_NIGHT_CHARGE|TOTAL_NIGHT_MINUTES|  VOICE_MAIL_PLAN_N|  VOICE_MAIL_PLAN_Y|
+-------+------------------+-------------------+------------

In [15]:
# correlação entre as variáveis:
for col in df_treino.columns:
    print( "Correlação da variável CHURN com", col, df_treino.stat.corr('CHURN', col))

Correlação da variável CHURN com ACCOUNT_LENGTH 0.016540742243674286
Correlação da variável CHURN com AREA_CODE_408 0.0011034497604588199
Correlação da variável CHURN com AREA_CODE_415 -0.006534884590439005
Correlação da variável CHURN com AREA_CODE_510 0.006422866455683383
Correlação da variável CHURN com CHURN 1.0
Correlação da variável CHURN com INTERNATIONAL_PLAN_N -0.2598518473454816
Correlação da variável CHURN com INTERNATIONAL_PLAN_Y 0.2598518473454819
Correlação da variável CHURN com NUMBER_CUSTOMER_SERVICE_CALLS 0.20874999878379408
Correlação da variável CHURN com NUMBER_VMAIL_MESSAGES -0.08972796983506418
Correlação da variável CHURN com TOTAL_DAY_CALLS 0.018459311608577066
Correlação da variável CHURN com TOTAL_DAY_CHARGE 0.20515074317015397
Correlação da variável CHURN com TOTAL_DAY_MINUTES 0.2051508292613899
Correlação da variável CHURN com TOTAL_EVE_CALLS 0.00923313191307792
Correlação da variável CHURN com TOTAL_EVE_CHARGE 0.09278603942871391
Correlação da variável CHUR

### Pré-processamento dos Dados

In [16]:
# Criando um LabeledPoint (target, Vector[features])
def transformaVar(row) :
    obj = (row["CHURN"], Vectors.dense([row["ACCOUNT_LENGTH"], row["AREA_CODE_408"], row["AREA_CODE_415"], row["AREA_CODE_510"], row["INTERNATIONAL_PLAN_N"], 
                                        row["INTERNATIONAL_PLAN_Y"], row["NUMBER_CUSTOMER_SERVICE_CALLS"], row["NUMBER_VMAIL_MESSAGES"], row["TOTAL_DAY_CALLS"], 
                                        row["TOTAL_DAY_CHARGE"], row["TOTAL_DAY_MINUTES"], row["TOTAL_EVE_CALLS"], row["TOTAL_EVE_CHARGE"], 
                                        row["TOTAL_EVE_MINUTES"], row["TOTAL_INTL_CALLS"], row["TOTAL_INTL_CHARGE"], row["TOTAL_INTL_MINUTES"], 
                                        row["TOTAL_NIGHT_CALLS"], row["TOTAL_NIGHT_CHARGE"], row["TOTAL_NIGHT_MINUTES"], row["VOICE_MAIL_PLAN_N"], 
                                        row["VOICE_MAIL_PLAN_Y"]]))
    return obj

In [17]:
# Converte os dataframes para RDD para aplicar a função via map
treino_RDD4 = df_treino.rdd.map(transformaVar)
teste_RDD4  = df_teste.rdd.map(transformaVar)

In [18]:
treino_RDD4.take(13)

[(0.0,
  DenseVector([128.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 25.0, 110.0, 45.07, 265.1, 99.0, 16.78, 197.4, 3.0, 2.7, 10.0, 91.0, 11.01, 244.7, 0.0, 1.0])),
 (0.0,
  DenseVector([107.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 26.0, 123.0, 27.47, 161.6, 103.0, 16.62, 195.5, 3.0, 3.7, 13.7, 103.0, 11.45, 254.4, 0.0, 1.0])),
 (0.0,
  DenseVector([137.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 114.0, 41.38, 243.4, 110.0, 10.3, 121.2, 5.0, 3.29, 12.2, 104.0, 7.32, 162.6, 1.0, 0.0])),
 (0.0,
  DenseVector([84.0, 1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, 71.0, 50.9, 299.4, 88.0, 5.26, 61.9, 7.0, 1.78, 6.6, 89.0, 8.86, 196.9, 1.0, 0.0])),
 (0.0,
  DenseVector([75.0, 0.0, 1.0, 0.0, 0.0, 1.0, 3.0, 0.0, 113.0, 28.34, 166.7, 122.0, 12.61, 148.3, 3.0, 2.73, 10.1, 121.0, 8.41, 186.9, 1.0, 0.0])),
 (0.0,
  DenseVector([118.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 98.0, 37.98, 223.4, 101.0, 18.75, 220.6, 6.0, 1.7, 6.3, 118.0, 9.18, 203.9, 1.0, 0.0])),
 (0.0,
  DenseVector([121.0, 0.0, 0.0, 1.0, 1.0, 0.0, 3.0, 24.0, 88.0, 37.09,

In [19]:
teste_RDD4.take(13)

[(0.0,
  DenseVector([101.0, 0.0, 0.0, 1.0, 1.0, 0.0, 3.0, 0.0, 123.0, 12.05, 70.9, 73.0, 18.01, 211.9, 3.0, 2.86, 10.6, 73.0, 10.62, 236.0, 1.0, 0.0])),
 (0.0,
  DenseVector([137.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 86.0, 38.01, 223.6, 139.0, 20.81, 244.8, 7.0, 2.57, 9.5, 81.0, 4.24, 94.2, 1.0, 0.0])),
 (0.0,
  DenseVector([103.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 29.0, 95.0, 50.1, 294.7, 105.0, 20.17, 237.3, 6.0, 3.7, 13.7, 127.0, 13.51, 300.3, 0.0, 1.0])),
 (0.0,
  DenseVector([99.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 123.0, 36.86, 216.8, 88.0, 10.74, 126.4, 2.0, 4.24, 15.7, 82.0, 9.93, 220.6, 1.0, 0.0])),
 (0.0,
  DenseVector([108.0, 0.0, 1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 78.0, 33.56, 197.4, 101.0, 10.54, 124.0, 4.0, 2.08, 7.7, 107.0, 9.2, 204.5, 1.0, 0.0])),
 (0.0,
  DenseVector([117.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 85.0, 38.51, 226.5, 68.0, 12.04, 141.6, 5.0, 1.86, 6.9, 90.0, 10.04, 223.0, 1.0, 0.0])),
 (0.0,
  DenseVector([63.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 32.0, 124.0, 37.21, 21

In [20]:
# converte os RDD's para dataframe novamente
df_treino = spSession.createDataFrame(treino_RDD4, ["label", "features"])
df_treino.select("features", "label").show(13)
df_teste  = spSession.createDataFrame(teste_RDD4, ['label', 'features'])
df_teste.select('features', 'label').show(13)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[128.0,0.0,1.0,0....|  0.0|
|[107.0,0.0,1.0,0....|  0.0|
|[137.0,0.0,1.0,0....|  0.0|
|[84.0,1.0,0.0,0.0...|  0.0|
|[75.0,0.0,1.0,0.0...|  0.0|
|[118.0,0.0,0.0,1....|  0.0|
|[121.0,0.0,0.0,1....|  0.0|
|[147.0,0.0,1.0,0....|  0.0|
|[117.0,1.0,0.0,0....|  0.0|
|[141.0,0.0,1.0,0....|  0.0|
|[65.0,0.0,1.0,0.0...|  1.0|
|[74.0,0.0,1.0,0.0...|  0.0|
|[168.0,1.0,0.0,0....|  0.0|
+--------------------+-----+
only showing top 13 rows

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[101.0,0.0,0.0,1....|  0.0|
|[137.0,0.0,0.0,1....|  0.0|
|[103.0,1.0,0.0,0....|  0.0|
|[99.0,0.0,1.0,0.0...|  0.0|
|[108.0,0.0,1.0,0....|  0.0|
|[117.0,0.0,1.0,0....|  0.0|
|[63.0,0.0,1.0,0.0...|  0.0|
|[94.0,1.0,0.0,0.0...|  0.0|
|[138.0,0.0,0.0,1....|  0.0|
|[128.0,0.0,1.0,0....|  0.0|
|[113.0,0.0,0.0,1....|  0.0|
|[140.0,0.0,1.0,0....|  0.0|
|[102.0,0.0,1.0,0....|  0.0|
+----------------

In [21]:
# Aplicar PCA para reduzir as características do vetor
treino_pca = PCA(k = 3, inputCol = 'features', outputCol = 'pcaFeatures')
pcaModelo  = treino_pca.fit(df_treino)
treino_res = pcaModelo.transform(df_treino).select("label","pcaFeatures")
treino_res.show(truncate = False)

+-----+-------------------------------------------------------------+
|label|pcaFeatures                                                  |
+-----+-------------------------------------------------------------+
|0.0  |[-284.9452776474054,-20.157289854202656,-299.855298265654]   |
|0.0  |[-180.2084568461734,-15.279103237416633,-311.7301028794292]  |
|0.0  |[-258.31433333219644,-2.692200674246979,-189.7469293265914]  |
|0.0  |[-312.517613863277,66.24962419593346,-184.05441696267064]    |
|0.0  |[-181.941656174862,-13.503883987373728,-229.76645208482387]  |
|0.0  |[-242.8926121022391,-63.24568901652336,-280.84837847988786]  |
|0.0  |[-243.09764883764464,-165.2677369984866,-359.57614400953764] |
|0.0  |[-170.93271916000646,37.37219011957097,-223.79201955210036]  |
|0.0  |[-208.68790354634794,-166.73021124988378,-365.32007851324875]|
|0.0  |[-281.33776466464775,4.39965191892087,-381.3770760944855]    |
|1.0  |[-147.24240965694008,-68.79023962531346,-294.64330201448064] |
|0.0  |[-203.9775284

In [22]:
# repetir o processo no dataset de teste
teste_pca = PCA(k = 3, inputCol = 'features', outputCol = 'pcaFeatures')
pcaModelo = teste_pca.fit(df_teste)
teste_res = pcaModelo.transform(df_teste).select("label","pcaFeatures")
teste_res.show(truncate = False)

+-----+------------------------------------------------------------+
|label|pcaFeatures                                                 |
+-----+------------------------------------------------------------+
|0.0  |[45.32738009407737,-73.52353324157008,-316.71449294307905]  |
|0.0  |[141.11843460874294,111.52971791670255,-294.72464174792793] |
|0.0  |[257.9197833641402,-30.303788227506686,-412.7430932412296]  |
|0.0  |[205.39406550266378,-43.645470247177684,-263.29619799676726]|
|0.0  |[184.64724114875668,-41.145293166229465,-249.67710278174673]|
|0.0  |[210.094466859633,-37.11399345673627,-278.3116337961707]    |
|0.0  |[186.27013271865326,-34.13768918855857,-359.88744667958906] |
|0.0  |[138.3547293864501,-96.13703541940149,-383.5751473000109]   |
|0.0  |[80.52660214042481,-70.12433735173279,-223.9583031183828]   |
|0.0  |[156.08057381698663,-29.971551532891326,-259.69397206266757]|
|0.0  |[171.96125679840674,17.88623564672551,-260.8667664233172]   |
|0.0  |[66.39288723761629,-61.2193

In [23]:
# Indexação é pré-requisito para Decision Trees
# dados de treino:
stringIndexer = StringIndexer(inputCol = "label", outputCol = "indexed")
si_model = stringIndexer.fit(treino_res)
obj_treino_final = si_model.transform(treino_res)
obj_treino_final.take(13)

[Row(label=0.0, pcaFeatures=DenseVector([-284.9453, -20.1573, -299.8553]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-180.2085, -15.2791, -311.7301]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-258.3143, -2.6922, -189.7469]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-312.5176, 66.2496, -184.0544]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-181.9417, -13.5039, -229.7665]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-242.8926, -63.2457, -280.8484]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-243.0976, -165.2677, -359.5761]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-170.9327, 37.3722, -223.792]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-208.6879, -166.7302, -365.3201]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-281.3378, 4.3997, -381.3771]), indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([-147.2424, -68.7902, -294.6433]), indexed=1.0),
 Row(label=0.0, pcaFeatur

In [24]:
# dados de teste
si_model = stringIndexer.fit(teste_res)
obj_teste_final = si_model.transform(teste_res)
obj_teste_final.take(13)

[Row(label=0.0, pcaFeatures=DenseVector([45.3274, -73.5235, -316.7145]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([141.1184, 111.5297, -294.7246]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([257.9198, -30.3038, -412.7431]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([205.3941, -43.6455, -263.2962]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([184.6472, -41.1453, -249.6771]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([210.0945, -37.114, -278.3116]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([186.2701, -34.1377, -359.8874]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([138.3547, -96.137, -383.5751]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([80.5266, -70.1243, -223.9583]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([156.0806, -29.9716, -259.694]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([171.9613, 17.8862, -260.8668]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVecto

### Machine Learning

In [25]:
# Criando o modelo
rfClassifer = RandomForestClassifier(labelCol = "indexed", featuresCol = "pcaFeatures")
modelo = rfClassifer.fit(obj_treino_final)

In [26]:
# Previsões com dados de teste
predictions = modelo.transform(obj_teste_final)
predictions.select("prediction", "indexed", "label", "pcaFeatures").collect()

[Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([45.3274, -73.5235, -316.7145])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([141.1184, 111.5297, -294.7246])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([257.9198, -30.3038, -412.7431])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([205.3941, -43.6455, -263.2962])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([184.6472, -41.1453, -249.6771])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([210.0945, -37.114, -278.3116])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([186.2701, -34.1377, -359.8874])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([138.3547, -96.137, -383.5751])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([80.5266, -70.1243, -223.9583])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector

In [27]:
# Avaliando a acurácia
evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction", labelCol = "indexed", metricName = "accuracy")
evaluator.evaluate(predictions)

0.865626874625075

In [28]:
# Confusion Matrix
predictions.groupBy("indexed", "prediction").count().show()

+-------+----------+-----+
|indexed|prediction|count|
+-------+----------+-----+
|    1.0|       0.0|  224|
|    0.0|       0.0| 1443|
+-------+----------+-----+

