In [1]:
from pyspark.sql.types import *
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LinearSVC
from pyspark.mllib.classification import SVMWithSGD
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [2]:
# Display the pre-existing `sc` object. It is not created in this notebook;
# presumably it is the SparkContext injected by the PySpark kernel -- confirm.
sc

# RDD

In [3]:
# Read the CSV as an RDD of raw text lines and keep its first line,
# which holds the comma-separated column names.
csv_path = 'table_models.csv'
data = sc.textFile(csv_path)
header = data.first()

In [4]:
# Display the header line (the comma-separated column names of the CSV).
header

u'IDADE,QTD_DEPENDENTES,RENDA_TITULAR,SEXO_M,ESTADO_CIVIL_CO,ESTADO_CIVIL_SE,ESTADO_CIVIL_SO,ESTADO_CIVIL_VI,CATEGORIAL_PROFISSAO_APOSENTADO,CATEGORIAL_PROFISSAO_ASSALARIADO,CATEGORIAL_PROFISSAO_AUT+NOMO,CATEGORIAL_PROFISSAO_LIBERAL,TIPO_RESIDENCIA_FA,TIPO_RESIDENCIA_FI,TIPO_RESIDENCIA_OU,TIPO_RESIDENCIA_PR,TIPO_RESIDENCIA_SM,CLASS'

In [5]:
# Drop every line equal to the header so only data records remain.
data = data.filter(lambda record: record != header)

In [6]:
# Split each CSV line into a list of string fields.
data = data.map(lambda line: line.split(","))

# Data Frame

In [7]:
# Build a DataFrame whose column names come from the header line.
column_names = header.split(",")
df = data.toDF(column_names)

In [8]:
# Preview the first five rows of the freshly built DataFrame.
df.show(n=5)

+-----+---------------+-------------+------+---------------+---------------+---------------+---------------+-------------------------------+--------------------------------+-----------------------------+----------------------------+------------------+------------------+------------------+------------------+------------------+-----+
|IDADE|QTD_DEPENDENTES|RENDA_TITULAR|SEXO_M|ESTADO_CIVIL_CO|ESTADO_CIVIL_SE|ESTADO_CIVIL_SO|ESTADO_CIVIL_VI|CATEGORIAL_PROFISSAO_APOSENTADO|CATEGORIAL_PROFISSAO_ASSALARIADO|CATEGORIAL_PROFISSAO_AUT+NOMO|CATEGORIAL_PROFISSAO_LIBERAL|TIPO_RESIDENCIA_FA|TIPO_RESIDENCIA_FI|TIPO_RESIDENCIA_OU|TIPO_RESIDENCIA_PR|TIPO_RESIDENCIA_SM|CLASS|
+-----+---------------+-------------+------+---------------+---------------+---------------+---------------+-------------------------------+--------------------------------+-----------------------------+----------------------------+------------------+------------------+------------------+------------------+------------------+-----

# Converter as colunas

In [9]:
# Helper that casts a set of DataFrame columns to a single data type.
def convertColumn(df, names, newType):
    """Return `df` with every column listed in `names` cast to `newType`.

    Each column is replaced under its own name via `withColumn`; columns
    not listed in `names` are passed through untouched. When `names` is
    empty the input object is returned as is.
    """
    converted = df
    for column_name in names:
        casted = converted[column_name].cast(newType)
        converted = converted.withColumn(column_name, casted)
    return converted

In [10]:
# Columns to cast to integer: every column except RENDA_TITULAR.
columns_int = [
    'IDADE',
    'QTD_DEPENDENTES',
    'SEXO_M',
    'ESTADO_CIVIL_CO',
    'ESTADO_CIVIL_SE',
    'ESTADO_CIVIL_SO',
    'ESTADO_CIVIL_VI',
    'CATEGORIAL_PROFISSAO_APOSENTADO',
    'CATEGORIAL_PROFISSAO_ASSALARIADO',
    'CATEGORIAL_PROFISSAO_AUT+NOMO',
    'CATEGORIAL_PROFISSAO_LIBERAL',
    'TIPO_RESIDENCIA_FA',
    'TIPO_RESIDENCIA_FI',
    'TIPO_RESIDENCIA_OU',
    'TIPO_RESIDENCIA_PR',
    'TIPO_RESIDENCIA_SM',
    'CLASS',
]
# Column to cast to float.
columns_float = ['RENDA_TITULAR']

# Convert the string columns of `df`: the float column first, then the integer ones.
df = convertColumn(df, columns_float, FloatType())
df = convertColumn(df, columns_int, IntegerType())

In [11]:
# Print the schema to confirm the column casts took effect.
df.printSchema()

root
 |-- IDADE: integer (nullable = true)
 |-- QTD_DEPENDENTES: integer (nullable = true)
 |-- RENDA_TITULAR: float (nullable = true)
 |-- SEXO_M: integer (nullable = true)
 |-- ESTADO_CIVIL_CO: integer (nullable = true)
 |-- ESTADO_CIVIL_SE: integer (nullable = true)
 |-- ESTADO_CIVIL_SO: integer (nullable = true)
 |-- ESTADO_CIVIL_VI: integer (nullable = true)
 |-- CATEGORIAL_PROFISSAO_APOSENTADO: integer (nullable = true)
 |-- CATEGORIAL_PROFISSAO_ASSALARIADO: integer (nullable = true)
 |-- CATEGORIAL_PROFISSAO_AUT+NOMO: integer (nullable = true)
 |-- CATEGORIAL_PROFISSAO_LIBERAL: integer (nullable = true)
 |-- TIPO_RESIDENCIA_FA: integer (nullable = true)
 |-- TIPO_RESIDENCIA_FI: integer (nullable = true)
 |-- TIPO_RESIDENCIA_OU: integer (nullable = true)
 |-- TIPO_RESIDENCIA_PR: integer (nullable = true)
 |-- TIPO_RESIDENCIA_SM: integer (nullable = true)
 |-- CLASS: integer (nullable = true)



# Criar o DenseVector (Machine learning)

In [12]:
# Positions of the feature columns: every column except the last one (CLASS),
# which is used as the label. The count is derived from the header line rather
# than hard-coded as 17, so it follows the CSV if columns are added or removed.
ind = range(len(header.split(",")) - 1)

In [13]:
# Turn each row into a (label, features) pair: the label is the row's last
# field, the features are the fields at the positions listed in `ind`.
input_data = df.rdd.map(
    lambda row: (row[-1], DenseVector([row[pos] for pos in ind]))
)

# Rebind `df` to the two-column DataFrame built from those pairs.
df = spark.createDataFrame(input_data, schema=["label", "features"])

In [14]:
# Preview the first five (label, features) rows.
df.show(n=5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[34.0,0.0,950.0,1...|
|    0|[43.0,0.0,1400.0,...|
|    0|[67.0,0.0,1050.0,...|
|    0|[40.0,0.0,1500.0,...|
|    0|[57.0,0.0,2000.0,...|
+-----+--------------------+
only showing top 5 rows



# Modelos

In [15]:
# 85% / 15% train/test split, seeded so the split is repeatable.
split_weights = [0.85, 0.15]
train_data, test_data = df.randomSplit(split_weights, seed=42)

## Regressão Logística

In [16]:
# Fit a logistic regression (at most 100 iterations) on the training split.
lr = LogisticRegression(
    maxIter=100,
    featuresCol='features',
    labelCol='label',
)
lrModel = lr.fit(train_data)

In [17]:
# Score the test split with the logistic regression model and preview the result.
predictions_lr = lrModel.transform(test_data)
predictions_lr.select(['label', 'rawPrediction', 'prediction', 'probability']).show(n=10)

+-----+--------------------+----------+--------------------+
|label|       rawPrediction|prediction|         probability|
+-----+--------------------+----------+--------------------+
|    0|[-0.3044090621128...|       1.0|[0.42448000670474...|
|    0|[-0.3044090621128...|       1.0|[0.42448000670474...|
|    0|[-0.3161524815673...|       1.0|[0.42161370268506...|
|    0|[-0.3172579426913...|       1.0|[0.42134415319695...|
|    0|[-0.3059079924504...|       1.0|[0.42411386443941...|
|    0|[-0.8033979141888...|       1.0|[0.30929914166715...|
|    0|[-0.5383497638300...|       1.0|[0.36857155269056...|
|    0|[-0.3129717016663...|       1.0|[0.42238954642602...|
|    0|[-0.0869098988293...|       1.0|[0.47828619120359...|
|    0|[-0.3137773767228...|       1.0|[0.42219299285184...|
+-----+--------------------+----------+--------------------+
only showing top 10 rows



In [18]:
# Area under the ROC curve of the logistic regression on the test split.
# The message is built as a single string (like the other evaluation cells):
# passing two arguments to print makes a Python 2 kernel print a tuple repr.
# The metric is named explicitly instead of relying on the evaluator default.
evaluator = BinaryClassificationEvaluator()
print("Log Regression - Test Area Under ROC: " + str(evaluator.evaluate(predictions_lr, {evaluator.metricName: "areaUnderROC"})))

('Log Regression - Test Area Under ROC', 0.6163578606792683)


## Random Forest

In [19]:
# Train a random forest on the training split, score the test split
# and preview the first ten predictions.
rf = RandomForestClassifier(labelCol='label', featuresCol='features')
rfModel = rf.fit(train_data)
predictions_rf = rfModel.transform(test_data)
predictions_rf.select(['label', 'rawPrediction', 'prediction', 'probability']).show(n=10)

+-----+--------------------+----------+--------------------+
|label|       rawPrediction|prediction|         probability|
+-----+--------------------+----------+--------------------+
|    0|[9.61475730090719...|       1.0|[0.48073786504535...|
|    0|[9.61475730090719...|       1.0|[0.48073786504535...|
|    0|[9.60963998784871...|       1.0|[0.48048199939243...|
|    0|[9.60963998784871...|       1.0|[0.48048199939243...|
|    0|[9.61475730090719...|       1.0|[0.48073786504535...|
|    0|[7.56308396233749...|       1.0|[0.37815419811687...|
|    0|[8.02926896292282...|       1.0|[0.40146344814614...|
|    0|[9.17091494977226...|       1.0|[0.45854574748861...|
|    0|[8.79055398946703...|       1.0|[0.43952769947335...|
|    0|[9.17091494977226...|       1.0|[0.45854574748861...|
+-----+--------------------+----------+--------------------+
only showing top 10 rows



In [20]:
# Area under the ROC curve of the random forest on the test split.
evaluator = BinaryClassificationEvaluator()
auc_rf = evaluator.evaluate(predictions_rf, {evaluator.metricName: "areaUnderROC"})
print("Test Area Under ROC: " + str(auc_rf))

Test Area Under ROC: 0.610862813012


## Gradient Boost

In [21]:
# Train gradient-boosted trees (10 iterations) on the training split,
# score the test split and preview the first ten predictions.
# The column names are spelled out; they match the estimator defaults.
gbt = GBTClassifier(featuresCol='features', labelCol='label', maxIter=10)
gbtModel = gbt.fit(train_data)
predictions_gbt = gbtModel.transform(test_data)
predictions_gbt.select(['label', 'rawPrediction', 'prediction', 'probability']).show(n=10)

+-----+--------------------+----------+--------------------+
|label|       rawPrediction|prediction|         probability|
+-----+--------------------+----------+--------------------+
|    0|[0.23785631555708...|       0.0|[0.61673496524547...|
|    0|[0.23785631555708...|       0.0|[0.61673496524547...|
|    0|[0.22669817421378...|       0.0|[0.61144644211879...|
|    0|[0.22669817421378...|       0.0|[0.61144644211879...|
|    0|[0.23785631555708...|       0.0|[0.61673496524547...|
|    0|[-0.4784474057578...|       1.0|[0.27750033588419...|
|    0|[-0.4507416213201...|       1.0|[0.28874578595333...|
|    0|[-0.0749533992602...|       1.0|[0.46259332453229...|
|    0|[-0.1987638869445...|       1.0|[0.40190646342719...|
|    0|[-0.0749533992602...|       1.0|[0.46259332453229...|
+-----+--------------------+----------+--------------------+
only showing top 10 rows



In [22]:
# Area under the ROC curve of the gradient-boosted trees on the test split.
evaluator = BinaryClassificationEvaluator()
auc_gbt = evaluator.evaluate(predictions_gbt, {evaluator.metricName: "areaUnderROC"})
print("Test Area Under ROC: " + str(auc_gbt))

Test Area Under ROC: 0.604979853001


## Support Vector Machine with SGD

In [23]:
svm = SVMWithSGD()
svmModel = gbt.fit(train_data)

In [24]:
predictions_svm = svmModel.transform(test_data)
predictions_svm.select('label', 'rawPrediction', 'prediction', 'probability').show(10)

+-----+--------------------+----------+--------------------+
|label|       rawPrediction|prediction|         probability|
+-----+--------------------+----------+--------------------+
|    0|[0.23785631555708...|       0.0|[0.61673496524547...|
|    0|[0.23785631555708...|       0.0|[0.61673496524547...|
|    0|[0.22669817421378...|       0.0|[0.61144644211879...|
|    0|[0.22669817421378...|       0.0|[0.61144644211879...|
|    0|[0.23785631555708...|       0.0|[0.61673496524547...|
|    0|[-0.4784474057578...|       1.0|[0.27750033588419...|
|    0|[-0.4507416213201...|       1.0|[0.28874578595333...|
|    0|[-0.0749533992602...|       1.0|[0.46259332453229...|
|    0|[-0.1987638869445...|       1.0|[0.40190646342719...|
|    0|[-0.0749533992602...|       1.0|[0.46259332453229...|
+-----+--------------------+----------+--------------------+
only showing top 10 rows



In [25]:
# Area under the ROC curve computed from `predictions_svm` on the test split.
evaluator = BinaryClassificationEvaluator()
auc_svm = evaluator.evaluate(predictions_svm, {evaluator.metricName: "areaUnderROC"})
print("Test Area Under ROC: " + str(auc_svm))

Test Area Under ROC: 0.604979853001


In [26]:
# spark.stop()