In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark import SparkContext
from pyspark import SparkFiles
from pyspark.sql.functions import length

import pandas as pd

import os
os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/jdk1.8.0_181.jdk/Contents/Home/'


In [2]:
spark = SparkSession.builder.appName('twitter').getOrCreate()

In [3]:
url ="CSV_cleaned/tweets_sample2.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("tweets_sample2.csv"), sep=",", header=True)
df.show()


+--------------------+--------+--------+-------+--------+--------+---------+
|               tweet|Compound|Negative|Neutral|Positive|original|sentiment|
+--------------------+--------+--------+-------+--------+--------+---------+
|Happy Monday twit...|   0.778|   0.564|   0.09|   0.346|positive|        4|
|C'MON STEWARDESS!...|     0.0|     1.0|    0.0|     0.0|negative|        0|
|@saraeden That so...|   0.806|   0.452|    0.0|   0.548|positive|        4|
|@mitchelmusso I R...|   0.684|   0.771|    0.0|   0.229|negative|        0|
|I don't know what...|     0.0|     1.0|    0.0|     0.0|negative|        0|
|Changeling. So fa...|    0.03|   0.369|   0.31|   0.321|negative|        0|
|@kateadams will t...|     0.0|     1.0|    0.0|     0.0|positive|        4|
|cuddling with mys...|     0.0|     1.0|    0.0|     0.0|negative|        0|
|@mcttron Rip it u...|     0.0|     1.0|    0.0|     0.0|positive|        4|
|@eatsomemore hell...|     0.0|     1.0|    0.0|     0.0|positive|        4|

In [4]:
# Create a length column to be used as a future feature
data = df.withColumn('length', length(df['tweet']))
data.show()

+--------------------+--------+--------+-------+--------+--------+---------+------+
|               tweet|Compound|Negative|Neutral|Positive|original|sentiment|length|
+--------------------+--------+--------+-------+--------+--------+---------+------+
|Happy Monday twit...|   0.778|   0.564|   0.09|   0.346|positive|        4|   116|
|C'MON STEWARDESS!...|     0.0|     1.0|    0.0|     0.0|negative|        0|   136|
|@saraeden That so...|   0.806|   0.452|    0.0|   0.548|positive|        4|    54|
|@mitchelmusso I R...|   0.684|   0.771|    0.0|   0.229|negative|        0|   132|
|I don't know what...|     0.0|     1.0|    0.0|     0.0|negative|        0|    27|
|Changeling. So fa...|    0.03|   0.369|   0.31|   0.321|negative|        0|    38|
|@kateadams will t...|     0.0|     1.0|    0.0|     0.0|positive|        4|    35|
|cuddling with mys...|     0.0|     1.0|    0.0|     0.0|negative|        0|    21|
|@mcttron Rip it u...|     0.0|     1.0|    0.0|     0.0|positive|        4|

In [5]:
# Create all the features to the data set

pos_neg_to_num = StringIndexer(inputCol='original',outputCol='label')
pos_neg_to_num2 = StringIndexer(inputCol='Compound',outputCol='compound2')
pos_neg_to_num3 = StringIndexer(inputCol='Positive',outputCol='positive2')
pos_neg_to_num4 = StringIndexer(inputCol='Negative',outputCol='negative2')
pos_neg_to_num5 = StringIndexer(inputCol='Neutral',outputCol='neutral2')

tokenizer = Tokenizer(inputCol="tweet", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [6]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
# clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')
# clean_up = VectorAssembler(inputCols=['idf_token', 'compound2'], outputCol='features')
# clean_up = VectorAssembler(inputCols=['idf_token', 'compound2','negative2','positive2','neutral2'], outputCol='features')
clean_up = VectorAssembler(inputCols=['idf_token', 'length','compound2','negative2','positive2','neutral2'], outputCol='features')

In [7]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num,pos_neg_to_num2,pos_neg_to_num3,pos_neg_to_num4,pos_neg_to_num5,tokenizer, stopremove, hashingTF, idf, clean_up])

In [8]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data)
cleaned = cleaner.transform(data)

In [9]:
# cleaned.show()

cleaned.select(['label','stop_tokens', 'features']).show()

+-----+--------------------+--------------------+
|label|         stop_tokens|            features|
+-----+--------------------+--------------------+
|  1.0|[happy, monday, t...|(262149,[21872,37...|
|  0.0|[c'mon, stewardes...|(262149,[304,3091...|
|  1.0|[@saraeden, sound...|(262149,[113432,1...|
|  0.0|[@mitchelmusso, r...|(262149,[14,33053...|
|  0.0|       [know, watch]|(262149,[140931,2...|
|  0.0|[changeling., far...|(262149,[155321,1...|
|  1.0|[@kateadams, frui...|(262149,[88244,15...|
|  0.0|          [cuddling]|(262149,[38629,26...|
|  1.0|[@mcttron, rip, u...|(262149,[5381,266...|
|  1.0|[@eatsomemore, he...|(262149,[19140,28...|
|  0.0|[, cant, tomorrow...|(262149,[29129,12...|
|  1.0|[@jesscorrie, aww...|(262149,[55724,82...|
|  1.0|[@daddys_pet, yes...|(262149,[39276,54...|
|  0.0|[@clarencehill, r...|(262149,[87348,15...|
|  1.0|[opps, meant, was...|(262149,[2054,844...|
|  0.0|[@joannahang, bad...|(262149,[16143,57...|
|  1.0|[gonna, get, bett...|(262149,[99895,12...|


In [10]:
from pyspark.ml.classification import NaiveBayes

# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])


In [11]:
# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [12]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+--------------------+--------+--------+-------+--------+--------+---------+------+-----+---------+---------+---------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|               tweet|Compound|Negative|Neutral|Positive|original|sentiment|length|label|compound2|positive2|negative2|neutral2|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------------------+--------+--------+-------+--------+--------+---------+------+-----+---------+---------+---------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| OMG! i hear ever...|     0.0|     1.0|    0.0|     0.0|negative|        0|    47|  0.0|      0.0|      0.0|      0.0|     0.0|[, omg!, i,

In [13]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print(f"Accuracy of model at predicting reviews was: {acc}")

Accuracy of model at predicting reviews was: 0.671703374583188


In [14]:
# clean_up = VectorAssembler(inputCols=['idf_token', 'length','compound2'], outputCol='features') 0.5919176454727069
 
# clean_up = VectorAssembler(inputCols=['idf_token', 'compound2'], outputCol='features') 0.5707940034594462

# clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features') 0.6008312043512044

# clean_up = VectorAssembler(inputCols=['idf_token', 'length','compound2','negative2','positive2','neutral2'], outputCol='features') 0.6785473212243401

In [15]:
# Save the model
predictor.save("sentiment_model.h5")

# write.overwrite().save(path)