In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark import SparkContext
from pyspark import SparkFiles
from pyspark.sql.functions import length

import pandas as pd

import os
os.environ['JAVA_HOME'] = '/Library/Java/JavaVirtualMachines/jdk1.8.0_181.jdk/Contents/Home/'


In [2]:
spark = SparkSession.builder.appName('twitter').getOrCreate()

In [3]:
url ="CSV_cleaned/tweets_sample2.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("tweets_sample2.csv"), sep=",", header=True)
df.show()


+--------------------+--------+--------+-------+--------+--------+---------+
|               tweet|Compound|Negative|Neutral|Positive|original|sentiment|
+--------------------+--------+--------+-------+--------+--------+---------+
|Happy Monday twit...|   0.778|   0.564|   0.09|   0.346|positive|        4|
|C'MON STEWARDESS!...|     0.0|     1.0|    0.0|     0.0|negative|        0|
|@saraeden That so...|   0.806|   0.452|    0.0|   0.548|positive|        4|
|@mitchelmusso I R...|   0.684|   0.771|    0.0|   0.229|negative|        0|
|I don't know what...|     0.0|     1.0|    0.0|     0.0|negative|        0|
|Changeling. So fa...|    0.03|   0.369|   0.31|   0.321|negative|        0|
|@kateadams will t...|     0.0|     1.0|    0.0|     0.0|positive|        4|
|cuddling with mys...|     0.0|     1.0|    0.0|     0.0|negative|        0|
|@mcttron Rip it u...|     0.0|     1.0|    0.0|     0.0|positive|        4|
|@eatsomemore hell...|     0.0|     1.0|    0.0|     0.0|positive|        4|

In [4]:
# Create a length column to be used as a future feature
data = df.withColumn('length', length(df['tweet']))
data.show()

+--------------------+--------+--------+-------+--------+--------+---------+------+
|               tweet|Compound|Negative|Neutral|Positive|original|sentiment|length|
+--------------------+--------+--------+-------+--------+--------+---------+------+
|Happy Monday twit...|   0.778|   0.564|   0.09|   0.346|positive|        4|   116|
|C'MON STEWARDESS!...|     0.0|     1.0|    0.0|     0.0|negative|        0|   136|
|@saraeden That so...|   0.806|   0.452|    0.0|   0.548|positive|        4|    54|
|@mitchelmusso I R...|   0.684|   0.771|    0.0|   0.229|negative|        0|   132|
|I don't know what...|     0.0|     1.0|    0.0|     0.0|negative|        0|    27|
|Changeling. So fa...|    0.03|   0.369|   0.31|   0.321|negative|        0|    38|
|@kateadams will t...|     0.0|     1.0|    0.0|     0.0|positive|        4|    35|
|cuddling with mys...|     0.0|     1.0|    0.0|     0.0|negative|        0|    21|
|@mcttron Rip it u...|     0.0|     1.0|    0.0|     0.0|positive|        4|

In [5]:
# Create all the features to the data set

pos_neg_to_num = StringIndexer(inputCol='original',outputCol='label')
pos_neg_to_num2 = StringIndexer(inputCol='Compound',outputCol='compound2')
pos_neg_to_num3 = StringIndexer(inputCol='Positive',outputCol='positive2')
pos_neg_to_num4 = StringIndexer(inputCol='Negative',outputCol='negative2')
pos_neg_to_num5 = StringIndexer(inputCol='Neutral',outputCol='neutral2')

tokenizer = Tokenizer(inputCol="tweet", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [6]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length','compound2','negative2','positive2','neutral2'], outputCol='features')

In [7]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num,pos_neg_to_num2,pos_neg_to_num3,pos_neg_to_num4,pos_neg_to_num5,tokenizer, stopremove, hashingTF, idf, clean_up])

In [8]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data)
cleaned = cleaner.transform(data)

In [9]:
# cleaned.show()

# cleaned.select(['label','stop_tokens', 'features']).show()

cleaned= cleaned.select(['tweet','idf_token', 'length','compound2','negative2','positive2','neutral2','label', 'features'])
cleaned.show()

+--------------------+--------------------+------+---------+---------+---------+--------+-----+--------------------+
|               tweet|           idf_token|length|compound2|negative2|positive2|neutral2|label|            features|
+--------------------+--------------------+------+---------+---------+---------+--------+-----+--------------------+
|Happy Monday twit...|(262144,[21872,37...|   116|     43.0|    167.0|    139.0|    82.0|  1.0|(262149,[21872,37...|
|C'MON STEWARDESS!...|(262144,[304,3091...|   136|      0.0|      0.0|      0.0|     0.0|  0.0|(262149,[304,3091...|
|@saraeden That so...|(262144,[113432,1...|    54|    610.0|    614.0|    403.0|     0.0|  1.0|(262149,[113432,1...|
|@mitchelmusso I R...|(262144,[14,33053...|   132|    151.0|    331.0|    155.0|     0.0|  0.0|(262149,[14,33053...|
|I don't know what...|(262144,[140931,2...|    27|      0.0|      0.0|      0.0|     0.0|  0.0|(262149,[140931,2...|
|Changeling. So fa...|(262144,[155321,1...|    38|    669.0|    

In [10]:

# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])


In [11]:
# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [12]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
# test_results.show(5)


test_results.select(['tweet','prediction', 'probability']).show(5)

+--------------------+----------+--------------------+
|               tweet|prediction|         probability|
+--------------------+----------+--------------------+
| OMG! i hear ever...|       1.0|[0.00165477759632...|
| youtwitface  #yo...|       1.0|[0.12865392665973...|
|#jaredleto he loo...|       1.0|[7.86315677143086...|
|#musicmonday Brit...|       1.0|[1.80540529830780...|
|#squarespace my f...|       1.0|[6.97034232321681...|
+--------------------+----------+--------------------+
only showing top 5 rows



In [13]:
test_results.show(5)

+--------------------+--------------------+------+---------+---------+---------+--------+-----+--------------------+--------------------+--------------------+----------+
|               tweet|           idf_token|length|compound2|negative2|positive2|neutral2|label|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+------+---------+---------+---------+--------+-----+--------------------+--------------------+--------------------+----------+
| OMG! i hear ever...|(262144,[37101,10...|    47|      0.0|      0.0|      0.0|     0.0|  0.0|(262149,[37101,10...|[-523.79292034120...|[0.00165477759632...|       1.0|
| youtwitface  #yo...|(262144,[167348,1...|    39|      0.0|      0.0|      0.0|     0.0|  0.0|(262149,[167348,1...|[-411.51875849300...|[0.12865392665973...|       1.0|
|#jaredleto he loo...|(262144,[27582,29...|   134|    402.0|    263.0|    260.0|     0.0|  1.0|(262149,[27582,29...|[-3416.0056328282...|[7.8631567714

In [14]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print(f"Accuracy of model at predicting reviews was: {acc}")

Accuracy of model at predicting reviews was: 0.6704865512565825


In [15]:
# clean_up = VectorAssembler(inputCols=['idf_token', 'length','compound2'], outputCol='features') 0.5919176454727069
 
# clean_up = VectorAssembler(inputCols=['idf_token', 'compound2'], outputCol='features') 0.5707940034594462

# clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features') 0.6008312043512044

# clean_up = VectorAssembler(inputCols=['idf_token', 'length','compound2','negative2','positive2','neutral2'], outputCol='features') 0.6785473212243401

In [16]:
##Save Model
predictor.save("sentiment_model.h5")


Py4JJavaError: An error occurred while calling o480.save.
: java.io.IOException: Path sentiment_model.h5 already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:503)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:102)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [17]:

##Load model
new_predictor = NaiveBayesModel.load("sentiment_model.h5")

In [18]:
test_results = new_predictor.transform(testing)
test_results.show(5)

+--------------------+--------------------+------+---------+---------+---------+--------+-----+--------------------+--------------------+--------------------+----------+
|               tweet|           idf_token|length|compound2|negative2|positive2|neutral2|label|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+------+---------+---------+---------+--------+-----+--------------------+--------------------+--------------------+----------+
| OMG! i hear ever...|(262144,[37101,10...|    47|      0.0|      0.0|      0.0|     0.0|  0.0|(262149,[37101,10...|[-485.37778564159...|[0.99999997583999...|       0.0|
| youtwitface  #yo...|(262144,[167348,1...|    39|      0.0|      0.0|      0.0|     0.0|  0.0|(262149,[167348,1...|[-411.16899338807...|[0.15216455109871...|       1.0|
|#jaredleto he loo...|(262144,[27582,29...|   134|    402.0|    263.0|    260.0|     0.0|  1.0|(262149,[27582,29...|[-3420.5557424115...|[3.3594475472

In [19]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print(f"Accuracy of model at predicting reviews was: {acc}")

Accuracy of model at predicting reviews was: 0.8243621616156596


In [25]:
# testdata= spark.createDataFrame([    
#     ("Happy Monday twitter", 0.778,0.564,0.09,0.346)
# ], ["tweet", "Compound","Negative","Neutral","Positive"])

testdata2= spark.createDataFrame([    
    ("Happy Monday twitter", 0.778,0.564,0.09,0.346, 100,-90)
], ["tweet", "Compound","Negative","Neutral","Positive", "lat","long"])

data = testdata2.withColumn('length', length(testdata2['tweet']))
data.show(1)

+--------------------+--------+--------+-------+--------+---+----+------+
|               tweet|Compound|Negative|Neutral|Positive|lat|long|length|
+--------------------+--------+--------+-------+--------+---+----+------+
|Happy Monday twitter|   0.778|   0.564|   0.09|   0.346|100| -90|    20|
+--------------------+--------+--------+-------+--------+---+----+------+



In [26]:
# Create all the features to the data set
pos_neg_to_num2 = StringIndexer(inputCol='Compound',outputCol='compound2')
pos_neg_to_num3 = StringIndexer(inputCol='Positive',outputCol='positive2')
pos_neg_to_num4 = StringIndexer(inputCol='Negative',outputCol='negative2')
pos_neg_to_num5 = StringIndexer(inputCol='Neutral',outputCol='neutral2')

tokenizer = Tokenizer(inputCol="tweet", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')


# Create feature vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

clean_up = VectorAssembler(inputCols=['idf_token', 'length','compound2','negative2','positive2','neutral2'], outputCol='features')

# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num2,pos_neg_to_num3,pos_neg_to_num4,pos_neg_to_num5,tokenizer, stopremove, hashingTF, idf, clean_up])


In [27]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data)
cleaned = cleaner.transform(data)
cleaned.show(2)

+--------------------+--------+--------+-------+--------+---+----+------+---------+---------+---------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|               tweet|Compound|Negative|Neutral|Positive|lat|long|length|compound2|positive2|negative2|neutral2|          token_text|         stop_tokens|          hash_token|           idf_token|            features|
+--------------------+--------+--------+-------+--------+---+----+------+---------+---------+---------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Happy Monday twitter|   0.778|   0.564|   0.09|   0.346|100| -90|    20|      0.0|      0.0|      0.0|     0.0|[happy, monday, t...|[happy, monday, t...|(262144,[64274,86...|(262144,[64274,86...|(262149,[262144],...|
+--------------------+--------+--------+-------+--------+---+----+------+---------+---------+---------+--------+----------------

In [28]:
#apply model

test_results = new_predictor.transform(cleaned)
test_results.select(["prediction"]).show(5)


# cleaned.select(['tweet','idf_token', 'length','compound2','negative2','positive2','neutral2','label', 'features'])
# cleaned.show()

+----------+
|prediction|
+----------+
|       1.0|
+----------+

