In [0]:
!java -version

In [0]:
sc.version

In [0]:
#import math
#import pandas as pd
#import matplotlib.pylab as plt
#%matplotlib inline

In [0]:
# carregar tabela de dados
df = table("cadastro_nomes_sexo")
#df = spark.sql("select * from cadastro_nomes_sexo")

In [0]:
df.printSchema()

In [0]:
df.show(10)

In [0]:
rdd = df.rdd.filter(lambda x: x['SEXO'] in ['M', 'F'])
#rdd = df.rdd
#rdd = df.rdd.map(list)
#rdd = df.rdd.map(tuple)
rdd

In [0]:
type(rdd)

In [0]:
rdd.take(5)

In [0]:
from pyspark.sql import Row

rdd2 = rdd.map(lambda x: Row(
  nome=x['PNOME'],
  sexo={'F': 0, 'M': 1, 'X': 9}[x['SEXO']]
))
rdd2

In [0]:
rdd2.take(5)

In [0]:
MAIOR_QTDE_LETRAS = 16 # fixado para o maior arquivo

def incluir_letras(row):
  row_dict = row.asDict()
  rev = row_dict['nome'][::-1]
  tam = len(rev)
  letras = [0] * MAIOR_QTDE_LETRAS
  
  for i in range(tam):
    letras[i] = ord(rev[i]) - 64
  row_dict['letras'] = letras
  
  new_row = Row(**row_dict)
  return new_row

In [0]:
rdd3 = rdd2.map(incluir_letras)
rdd3

In [0]:
rdd3.take(5)

In [0]:
df3 = rdd3.toDF()
df3

In [0]:
df3.select("nome", "sexo").describe().show()

In [0]:
from pyspark.ml.linalg import Vectors

def converter_para_ponto_rotulado(row):
    obj = (row["nome"], row["sexo"], Vectors.dense(row["letras"]))
    return obj

In [0]:
rdd4 = rdd3.map(converter_para_ponto_rotulado)
rdd4

In [0]:
rdd4.take(5)

In [0]:
df4 = rdd4.toDF(["nome", "real", "features"])
df4

In [0]:
df4.show(5, truncate=False)

In [0]:
from pyspark.ml.feature import PCA

pca = PCA(
  k = 10,
  inputCol = "features",
  outputCol = "pcaFeatures"
)

#pcaModel = pca.fit(df4)
#pcaResult = pcaModel.transform(df4).select("nome", "real", "pcaFeatures")
#pcaResult.show(5)

In [0]:
# usar redução de dimensionalidade PCA
#df5 = pcaResult.withColumnRenamed("pcaFeatures", "features")

# não usar PCA
df5 = df4

df5.show(5, truncate=False)

In [0]:
# dividir dados entre treino e teste
(dados_treino, dados_teste) = df5.randomSplit([0.7, 0.3], seed=42)

print("dados de treino:", dados_treino.count())
print("dados de teste: ", dados_teste.count())

In [0]:
from datetime import datetime

models = {}

def evaluate_model(name, classifier, evaluator, train_data, test_data):

  start = datetime.now()
  model = classifier.fit(train_data)
  #kfold = KFold(n_splits=NUMBER_KFOLD_SPLITS, shuffle=True, random_state=42)
  #results = cross_val_score(model, X, y, cv=kfold,
  #                          scoring=SCORING_METRIC, verbose=1, n_jobs=-1)
  end = datetime.now()

  elapsed = int((end - start).total_seconds() * 1000)
  #score = results.mean() * 100
  #stddev = results.std() * 100

  predictions = model.transform(test_data)
  score = evaluator.evaluate(predictions) * 100

  models[name] = (model, score, elapsed)
  print(model, '\nScore: %.2f [%5s ms]' % (score, elapsed))
  return predictions

In [0]:
# TODO: implementar GridSearchCV e Cross-Validation
# import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
# https://spark.apache.org/docs/latest/ml-tuning.html
# https://www.programcreek.com/scala/org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

avaliador = MulticlassClassificationEvaluator(
    predictionCol = "previsto",
    labelCol = "real",
    metricName = "accuracy"
)

In [0]:
# Logistic Regression
from pyspark.ml.classification import LogisticRegression

estimador = LogisticRegression(
  #regParam=1.0,
  #maxIter=1000,
# ...
  labelCol="real",
  featuresCol="features",
  predictionCol="previsto"
)

evaluate_model('LR', estimador, avaliador, dados_treino, dados_teste)

In [0]:
# Decision Tree
from pyspark.ml.classification import DecisionTreeClassifier

estimador = DecisionTreeClassifier(
  maxDepth=10,
  labelCol="real",
  featuresCol="features",
  predictionCol="previsto"
)

evaluate_model('DT', estimador, avaliador, dados_treino, dados_teste)

In [0]:
# Random Forest
from pyspark.ml.classification import RandomForestClassifier

estimador = RandomForestClassifier(
  maxDepth=10,
  labelCol="real",
  featuresCol="features",
  predictionCol="previsto"
)

evaluate_model('RF', estimador, avaliador, dados_treino, dados_teste)

In [0]:
# Gradient-Boosted Trees (GBTs)
from pyspark.ml.classification import GBTClassifier

estimador = GBTClassifier(
  maxDepth=10,
  labelCol="real",
  featuresCol="features",
  predictionCol="previsto"
)

# só classificação binária!
evaluate_model('GB', estimador, avaliador, dados_treino, dados_teste)

In [0]:
# Multilayer Perceptron (MLP)
from pyspark.ml.classification import MultilayerPerceptronClassifier

estimador = MultilayerPerceptronClassifier(
  maxIter=100,
  layers=[4, 5, 3],
  #blockSize=128,
  seed=42,
  labelCol="real",
  featuresCol="features",
  predictionCol="previsto"
)

# está dando erro...
#evaluate_model('MLP', estimador, avaliador, dados_treino, dados_teste)

In [0]:
# Linear Support Vector Machine (SVM)
from pyspark.ml.classification import LinearSVC

estimador = LinearSVC(
  #maxIter=10,
  #regParam=0.1,
  labelCol="real",
  featuresCol="features",
  predictionCol="previsto"
)

# só classificação binária!
evaluate_model('LSVM', estimador, avaliador, dados_treino, dados_teste)

In [0]:
# Naïve Bayes
from pyspark.ml.classification import NaiveBayes

estimador = NaiveBayes(
  #smoothing=1.0,
  #modelType="multinomial",
  labelCol="real",
  featuresCol="features",
  predictionCol="previsto"
)

evaluate_model('NB', estimador, avaliador, dados_treino, dados_teste)

In [0]:
# Factorization Machines (FM)
from pyspark.ml.classification import FMClassifier

estimador = FMClassifier(
  #stepSize=0.001,
  labelCol="real",
  featuresCol="features",
  predictionCol="previsto"
)

# só classificação binária!
evaluate_model('FM', estimador, avaliador, dados_treino, dados_teste)

In [0]:
#results = []
#for key, value in models.items():
#  tup = (key,) + value
#  results.append(tup)
#results

In [0]:
#from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
#schema = StructType([ \
#  StructField("Model", StringType(), True), \
#  StructField("Estimator", StringType(), True), \
#  StructField("Score", FloatType(), True), \
#  StructField("Time (ms)", IntegerType(), True) \
#])

In [0]:
#results_df = spark.createDataFrame(data=results, schema=schema)
#results_df.printSchema()
#results_df.show(truncate=False)

In [0]:
names = []
estimators = []
scores = []
times = []

for key, value in models.items():
  (model, score, elapsed) = value
  names.append(key)
  estimators.append(model)
  scores.append(score)
  times.append(elapsed)

In [0]:
import pandas as pd

results_df = pd.DataFrame({
  'Model': names,
  'Score': scores,
  #'Std Dev': stdevs,
  'Time (ms)': times,
  'Estimator': estimators})

results_df.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score,Time (ms),Estimator
3,GB,99.592003,85960,GBTClassificationModel: uid = GBTClassifier_4d...
1,DT,99.534884,14358,DecisionTreeClassificationModel: uid=DecisionT...
2,RF,98.433293,19592,RandomForestClassificationModel: uid=RandomFor...
0,LR,94.875561,7944,LogisticRegressionModel: uid=LogisticRegressio...
4,LSVM,94.614443,16090,"LinearSVCModel: uid=LinearSVC_35c53ba40f43, nu..."
5,NB,91.701346,3498,"NaiveBayesModel: uid=NaiveBayes_eea399634b13, ..."
6,FM,91.170951,39531,FMClassificationModel: uid=FMClassifier_5caf52...


In [0]:
modelo = models['GB'][0]
modelo

In [0]:
# gerar previsões e mostrar exemplos
previsoes = modelo.transform(dados_teste)
previsoes.select("nome", "real", "previsto").show(10)

In [0]:
# matriz de confusão
previsoes.groupBy("real", "previsto").count().show()

In [0]:
previsoes.filter("real = 1").filter("real != previsto").select("nome", "real", "previsto").show(10)

In [0]:
previsoes.filter("real = 0").filter("real != previsto").select("nome", "real", "previsto").show(10)

In [0]:
NUMBER_KFOLD_SPLITS = 5 # number of splits in cross-validation
NUMBER_GRID_ITERATIONS = 10 # number of grid iterations to parameters testing
SCORING_METRIC = 'accuracy' # the scoring metric to be used

In [0]:
# faz o ajuste fino do modelo, calculando os melhores hiperparâmetros
def fine_tune_model(model, params, X=X, y=y):

  print('\nFine Tuning Model:')
  print(model, "\nparams:", params)
  
  kfold = KFold(n_splits=NUMBER_KFOLD_SPLITS, shuffle=True, random_state=42)

  search = RandomizedSearchCV(model, param_distributions=params, 
                              n_iter=NUMBER_GRID_ITERATIONS, 
                              scoring=SCORING_METRIC, cv=kfold, 
                              verbose=1, n_jobs=-1)
  #search = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=kfold, verbose=1)

  search.fit(X, y)
  print('\nBest Score: %.2f %%' % (search.best_score_ * 100))
  print('Best Params:', search.best_params_)
  return search

In [0]:
# K-Nearest Neighbours (KNN)
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=11) # 87.45
evaluate_model('KNN', model)

params = dict(
    n_neighbors=[1,3,5,7,9,11]
)
#fine_tune_model(model, params)

In [0]:
y_pred = model.predict(X)
y_pred

In [0]:
dados = pd.DataFrame({'REAL': y, 'PREV': y_pred}, index=X.index)
for col in dados.columns:
    dados[col] = dados[col].map({0: 'X', 1: 'F', 2: 'M'})
dados.head()

In [0]:
from sklearn.metrics import accuracy_score

accuracy_score(y, y_pred)

In [0]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y, y_pred)

In [0]:
dados[dados['REAL'] != dados['PREV']].head(20)