In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark import SparkContext
from pyspark import SparkFiles
from pyspark.sql.functions import length

import pandas as pd

import os
# Point PySpark at the Java 8 JDK this notebook was developed against.
# The path is machine-specific (it looks like a macOS JDK install location), so it is
# applied only when that directory actually exists; on any other machine the existing
# JAVA_HOME (if any) is left untouched instead of being replaced by an invalid path.
JAVA_HOME_CANDIDATE = '/Library/Java/JavaVirtualMachines/jdk1.8.0_181.jdk/Contents/Home/'
if os.path.isdir(JAVA_HOME_CANDIDATE):
    os.environ['JAVA_HOME'] = JAVA_HOME_CANDIDATE


In [None]:
# Start the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('twitter')
    .getOrCreate()
)

In [None]:
# Register the sample CSV with the Spark context, then read it back through the
# local path that SparkFiles resolves for the registered file name.
url = "CSV_cleaned/tweets_sample2.csv"
spark.sparkContext.addFile(url)

local_csv_path = SparkFiles.get("tweets_sample2.csv")
# Only the header and separator are configured; no schema or inferSchema option is passed.
df = spark.read.csv(local_csv_path, header=True, sep=",")
df.show()


In [None]:
# Add a 'length' column computed from the 'tweet' column; it is used later as a feature
tweet_length = length(df['tweet'])
data = df.withColumn('length', tweet_length)
data.show()

In [None]:
# Define the feature-engineering stages. Nothing is fitted in this cell; the stages
# are only constructed so they can be chained into a Pipeline later.

# One StringIndexer per source column, given as (input column, indexed output column).
# 'original' is indexed into 'label', the column the classifier is trained against.
(
    pos_neg_to_num,
    pos_neg_to_num2,
    pos_neg_to_num3,
    pos_neg_to_num4,
    pos_neg_to_num5,
) = (
    StringIndexer(inputCol=source_col, outputCol=indexed_col)
    for source_col, indexed_col in (
        ('original', 'label'),
        ('Compound', 'compound2'),
        ('Positive', 'positive2'),
        ('Negative', 'negative2'),
        ('Neutral', 'neutral2'),
    )
)

# Text stages: tokenize -> drop stop words -> hashed term frequencies -> IDF weighting
tokenizer = Tokenizer(outputCol="token_text", inputCol="tweet")
stopremove = StopWordsRemover(outputCol='stop_tokens', inputCol='token_text')
hashingTF = HashingTF(outputCol='hash_token', inputCol="stop_tokens")
idf = IDF(outputCol='idf_token', inputCol='hash_token')

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Assemble the final 'features' vector from the IDF text vector plus the scalar columns.
# Smaller input sets tried previously: ['idf_token', 'length'], ['idf_token', 'compound2'],
# and ['idf_token', 'compound2', 'negative2', 'positive2', 'neutral2'].
feature_columns = [
    'idf_token',
    'length',
    'compound2',
    'negative2',
    'positive2',
    'neutral2',
]
clean_up = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [None]:
# Build the data-processing Pipeline (it is only constructed here, not fitted).
from pyspark.ml import Pipeline

# Stage order: the five StringIndexers, then the text stages, then the assembler
# that combines their output columns.
prep_stages = [
    pos_neg_to_num,
    pos_neg_to_num2,
    pos_neg_to_num3,
    pos_neg_to_num4,
    pos_neg_to_num5,
    tokenizer,
    stopremove,
    hashingTF,
    idf,
    clean_up,
]
data_prep_pipeline = Pipeline(stages=prep_stages)

In [None]:
# Fit every pipeline stage on the full dataset, then apply the fitted pipeline to
# that same dataset to produce the label and feature columns.
# NOTE(review): the stages are fitted before the train/test split, so statistics such
# as the IDF weights are learned from rows that later land in the testing set.
cleaner = data_prep_pipeline.fit(dataset=data)
cleaned = cleaner.transform(dataset=data)

In [None]:
# Preview only the label, the stop-word-filtered tokens and the assembled features
# (cleaned.show() would print every column instead).
preview_columns = ['label', 'stop_tokens', 'features']
cleaned.select(preview_columns).show()

In [None]:
from pyspark.ml.classification import NaiveBayes

# Break data down into a training set (weight 0.7) and a testing set (weight 0.3).
# The split is random, so the row counts are only approximately 70/30.
# A fixed seed keeps the split, and therefore the scores reported below, comparable
# between runs; without it every run trains and evaluates on different rows.
SPLIT_SEED = 42
training, testing = cleaned.randomSplit([0.7, 0.3], seed=SPLIT_SEED)


In [None]:
# Train a Naive Bayes classifier (default settings) on the training split
nb = NaiveBayes()
predictor = nb.fit(dataset=training)

In [None]:
# Apply the fitted model to the testing split and preview the first five rows
test_results = predictor.transform(dataset=testing)
test_results.show(n=5)

In [None]:
# Score the test-set predictions with the multiclass evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# metricName is set explicitly because the evaluator's default metric is 'f1';
# a default-constructed evaluator would report an F1 score under the "Accuracy" label.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
acc = acc_eval.evaluate(test_results)
print(f"Accuracy of model at predicting reviews was: {acc}")

In [None]:
# Scores recorded from earlier runs, by VectorAssembler inputCols:
#
#   ['idf_token', 'length', 'compound2']                                        -> 0.5919176454727069
#   ['idf_token', 'compound2']                                                  -> 0.5707940034594462
#   ['idf_token', 'length']                                                     -> 0.6008312043512044
#   ['idf_token', 'length', 'compound2', 'negative2', 'positive2', 'neutral2']  -> 0.6785473212243401
#
# NOTE(review): presumably these came from the evaluator cell above; if it was run with
# MulticlassClassificationEvaluator() defaults, the values are F1 scores - confirm.

In [None]:
# Save the fitted Naive Bayes model.
# write().overwrite() replaces an existing save at this path, so re-running the
# notebook does not fail because the output path already exists (plain save() does).
# NOTE(review): only the classifier is saved here; the fitted preprocessing pipeline
# (`cleaner`) is not persisted. Despite the ".h5" suffix the output is written by
# Spark's own ML writer - confirm the name is what downstream loaders expect.
predictor.write().overwrite().save("sentiment_model.h5")