In [1]:
#bibliotecas
import unidecode
from pyspark.sql.functions import udf,col,concat_ws, explode,length,abs, lower,lit
from pyspark.sql.types import StringType, IntegerType

#bibliotecas ML
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler,RegexTokenizer, StopWordsRemover, CountVectorizer, NGram
from pyspark.ml.classification import NaiveBayes,RandomForestClassifier,LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline,PipelineModel

#general notebook variables
# The tweet text is accent-stripped with unidecode and split on non-word
# characters before stop-word removal, so every stop word is normalized the
# same way: accents removed, lowercased, and multi-word entries split into
# single tokens (a phrase such as "felipe calderon" can never equal one token,
# and an accented stop word can never equal an accent-stripped token).
# dict.fromkeys removes duplicates while keeping the original order.
stopWordsCustomizados = list(dict.fromkeys(
    token
    for entrada in ["amlo","felipe calderon","EPN","calderon"] + StopWordsRemover.loadDefaultStopWords("spanish")
    for token in unidecode.unidecode(entrada).lower().split()
))
# Minimum token length kept by the tokenizer
minTokenSize = 2
# Size n of the n-grams built from the filtered tokens
cantidadNGrams = 1

In [2]:
# Read the ';'-separated tweets CSV, taking column names from the header row
tweetData = (
    spark.read
    .options(sep=";", header="True")
    .csv('dbfs:/mnt/jglake/tweetsSentiment.csv')
)
# Make the DataFrame queryable from SQL cells under the name "tweetData"
tweetData.createOrReplaceTempView("tweetData")
# Preview the first 10 rows
tweetData.show(10)

In [3]:
%sql
-- Number of tweets per category
SELECT
  categoria,
  COUNT(*) AS cantidad
FROM tweetData
GROUP BY categoria

categoria,cantidad
informativo,114
negativo,107
positivo,38


In [4]:
#Strip accents and convert to lowercase
def remove_accents(input_str):
  """Return ``input_str`` transliterated by ``unidecode`` and lowercased.

  Args:
    input_str: Text to normalize, or ``None``.

  Returns:
    The transliterated, lowercased string, or ``None`` when ``input_str``
    is ``None``.
  """
  # Null column values reach a Python UDF as None; pass them through instead
  # of letting unidecode raise on a non-string argument.
  if input_str is None:
    return None
  return unidecode.unidecode(input_str).lower()

# Wrap the normalizer as a string-returning Spark UDF and store its result for
# the "texto" column in a new "textoSinAcentos" column
remove_accents_udf = udf(remove_accents, StringType())
tweetData = tweetData.withColumn(
    'textoSinAcentos',
    remove_accents_udf(col("texto")),
)

# Text feature pipeline: tokenize -> drop stop words -> n-grams -> count vectors,
# plus a numeric index for the category label.
# NOTE(review): the pipeline is fitted on the full dataset here, while the
# train/test split happens in a later cell; confirm whether the vocabulary and
# label index should be learned from the training portion only.

# Split on non-word characters, dropping tokens shorter than minTokenSize
regexTokenizer = RegexTokenizer(
    inputCol="textoSinAcentos",
    outputCol="words",
    pattern="\\W",
    minTokenLength=minTokenSize,
)
# Remove the customized stop words from the token list
stopwordsRemover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=stopWordsCustomizados,
)
# Build n-grams of size cantidadNGrams from the filtered tokens
ngram = NGram(n=cantidadNGrams, inputCol="filtered", outputCol="ngrams")
# Turn the n-grams into term-count vectors
countVectors = CountVectorizer(
    inputCol="ngrams",
    outputCol="features",
    vocabSize=150,
    minDF=5,
)
# Map the text category to a numeric label
label_stringIdx = StringIndexer(inputCol="categoria", outputCol="label")

# Fit the complete pipeline and apply it to the tweets
etapas = [regexTokenizer, stopwordsRemover, ngram, countVectors, label_stringIdx]
pipeline = Pipeline(stages=etapas)
pipelineFit = pipeline.fit(tweetData)
dfTweets = pipelineFit.transform(tweetData)
dfTweets.show(10)

In [5]:
# Split the featurized tweets into training (70%) and test (30%) sets;
# the fixed seed keeps the split repeatable
(trainingData, testData) = dfTweets.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

# Naive Bayes classifier
nb = NaiveBayes(smoothing=1)
# Train on the training set
modelNB = nb.fit(trainingData)
# Generate predictions for the test set
predictions = modelNB.transform(testData)
predictions.select("texto","categoria","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 20)

# Evaluate the model.
# The evaluator's default metric is F1, but the value is stored and printed as
# precision, so the metric is requested explicitly.
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="weightedPrecision")
precision = evaluator.evaluate(predictions)
print("Precisión del modelo: "+ str(precision))

In [6]:
# Random forest classifier.
# A fixed seed makes the randomized tree building repeatable across runs.
rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            numTrees = 100,
                            maxDepth = 4,
                            maxBins = 32,
                            seed = 100)
# Train model with Training Data
rfModel = rf.fit(trainingData)
# Generate predictions for the test set
predictions = rfModel.transform(testData)
predictions.select("texto","categoria","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 30)

# Evaluate the model.
# The evaluator's default metric is F1, but the value is stored and printed as
# precision, so the metric is requested explicitly.
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="weightedPrecision")
precision = evaluator.evaluate(predictions)
print("Precisión del modelo: "+ str(precision))


In [7]:
# Logistic regression on the count-vector features
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
# Train on the training set
lrModel = lr.fit(trainingData)
# Generate predictions for the test set
predictions = lrModel.transform(testData)
predictions.select("texto","categoria","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 20)

# Evaluate the model.
# The evaluator's default metric is F1, but the value is stored and printed as
# precision, so the metric is requested explicitly.
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="weightedPrecision")
precision = evaluator.evaluate(predictions)
print("Precisión del modelo: "+ str(precision))