## Classificação de Spam

https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [None]:
# findspark is initialised before any pyspark module is imported in this
# notebook; presumably this is what makes the local Spark installation
# importable — confirm against the environment setup.
import findspark
findspark.init()

In [None]:
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Build the SparkSession.
# SparkSession is imported in this cell because no earlier cell brings it
# into scope; without it the builder expression raises NameError.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Nome do Projeto")
    .config("spark.executor.memory", "6gb")
    .config("spark.sql.debug.maxToStringFields", 2000)
    .config("spark.debug.maxToStringFields", 2000)
    .config("spark.sql.caseSensitive", "false")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext for the RDD-based loading
# done later in the notebook.
sc = spark.sparkContext

In [None]:
# Load the data file and build an RDD from it
spamRDD = sc.textFile("SMSSpamCollection.csv", 2)

In [None]:
# Mark the RDD for caching, since it is used again by the following cells
spamRDD.cache()

In [None]:
# Display the contents of the RDD in the notebook.
# NOTE(review): collect() brings the whole dataset back to the driver; a
# bounded preview such as take(n) may be enough here — confirm intent.
spamRDD.collect()

## Pré-Processamento dos Dados

In [None]:
def TransformToVector(inputStr):
    """Convert one raw ``label,message`` line into ``[label, message]``.

    Args:
        inputStr: A line of the input file. The text before the first comma
            is the class name; everything after it is the SMS text.

    Returns:
        A two-element list: ``0.0`` when the class name is ``"ham"`` and
        ``1.0`` otherwise, followed by the complete message text.

    Raises:
        IndexError: If the line contains no comma at all.
    """
    # Split on the first comma only: the SMS text may itself contain commas
    # and must not be cut off at the first one.
    attList = inputStr.split(",", 1)
    smsType = 0.0 if attList[0] == "ham" else 1.0
    return [smsType, attList[1]]

In [None]:
# Turn each raw line into a [label, message] pair and build a DataFrame
# with those two columns.
spamRDD2 = spamRDD.map(TransformToVector)
# The session created earlier in this notebook is named `spark`; the name
# `spSession` is never defined and raised NameError here.
spamDF = spark.createDataFrame(spamRDD2, ["label", "message"])
spamDF.cache()
# Preview the resulting rows in the notebook.
spamDF.select("label", "message").show()

## Machine Learning

In [None]:
# Training and test data: random split of the DataFrame with weights 0.7 / 0.3
dados_treino, dados_teste = spamDF.randomSplit([0.7, 0.3])

In [None]:
# Number of rows in the training split
dados_treino.count()

In [None]:
# Number of rows in the test split
dados_teste.count()

In [None]:
# Split each message into words, then apply TF-IDF.
# Each stage reads the output column of the stage before it.
tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="tempfeatures",
)
idf = IDF(
    inputCol=hashingTF.getOutputCol(),
    outputCol="features",
)
nbClassifier = NaiveBayes()

In [None]:
# Build the pipeline: tokenizer -> hashing TF -> IDF -> Naive Bayes
pipeline = Pipeline(
    stages=[tokenizer, hashingTF, idf, nbClassifier],
)

In [None]:
# Create the model by fitting the pipeline on the training data
modelo = pipeline.fit(dados_treino)

In [None]:
# Predictions on the test data
previsoes = modelo.transform(dados_teste)
# Display the predicted value next to the true label for every test row
previsoes.select("prediction", "label").collect()

In [None]:
# Evaluate accuracy: compare the "prediction" column against "label"
avaliador = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)
avaliador.evaluate(previsoes)

In [None]:
# Summarise the predictions - confusion matrix
# (row count for each label / prediction combination)
previsoes.groupBy("label","prediction").count().show()

In [None]:
# Stop the Spark session; this is the last cell of the notebook
spark.stop()