In [None]:
# Initialize Spark: findspark.init() is called here, ahead of the cell that
# imports pyspark.
# NOTE(review): presumably this is what makes `pyspark` importable in this
# kernel — confirm against the local environment.
import findspark
findspark.init()

In [None]:
from pyspark.sql.session import SparkSession
import pyspark.sql.types as tp
from pyspark.ml import Pipeline
#from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, Tokenizer
#from pyspark.ml.feature import RegexTokenizer,StopWordsRemover,CountVectorizer,IDF, HashingTF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from  pyspark.ml.pipeline import PipelineModel  # For saving the model
from pyspark.sql.functions import col
import pyspark.sql.functions as F
# import structType
from pyspark.sql.types import StructType, StringType
from pyspark.sql.functions import from_json
import numpy as np
import requests
import pyspark

### Running Spark

In [None]:
# Start the Spark master by running start-master.sh from a Spark 3.2.1 install.
# NOTE(review): the absolute path is machine-specific — adjust it to the local install.
!/Users/JoeKifle/spark-3.2.1-bin-hadoop3.2/sbin/start-master.sh

In [None]:
# Start a worker and point it at the master URL spark://joetelila.lan:7077.
# NOTE(review): both the install path and the host name are machine-specific.
!/Users/JoeKifle/spark-3.2.1-bin-hadoop3.2/sbin/start-worker.sh spark://joetelila.lan:7077

In [None]:
#!/Users/JoeKifle/spark-3.2.1-bin-hadoop3.2/sbin/stop-worker.sh

In [None]:
#!/Users/JoeKifle/spark-3.2.1-bin-hadoop3.2/sbin/stop-master.sh

### Spark Session

In [None]:
# Request the spark-sql-kafka connector package through spark.jars.packages.
conf = pyspark.SparkConf().set(
    "spark.jars.packages",
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1",
)

# URL of the Spark master this notebook attaches to.
# Alternative master kept for reference: "spark://131.114.50.200:7077"
spark_master = "spark://joetelila.lan:7077"

# Build the session (or reuse an existing one) with the configuration above.
spark = (
    SparkSession.builder
    .master(spark_master)
    .appName("sentimentAnalysis")
    .config(conf=conf)
    .getOrCreate()
)
# Optional, currently disabled: spark._sc.setLogLevel("ERROR")

### Reading Data

In [None]:
# Schema with two nullable columns: `text` (string) and `polarity` (integer).
# NOTE(review): the visible read cell uses inferSchema=True and does not pass
# this schema — confirm whether it is meant to be used.
my_schema = tp.StructType(
    [
        tp.StructField('text', tp.StringType(), True),
        tp.StructField('polarity', tp.IntegerType(), True),
    ]
)

In [None]:
# Load the tweets CSV: the first row is treated as the header and column
# types are inferred from the data.
tweet_data = spark.read.csv(
    'data/tweets_dataset_may6_no_comma.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Strip Twitter handles and links from the tweet text.
# Handles: "@" followed by letters, digits or underscores.
tweet_data = tweet_data.withColumn('text', F.regexp_replace('text', r'@[A-Za-z0-9_]+', ''))
# http:// and https:// links, up to the next space.
tweet_data = tweet_data.withColumn('text', F.regexp_replace('text', r'https?://[^ ]+', ''))
# Bare "www." links. The dot is escaped so only a literal "www." matches;
# unescaped, it matched any character and could eat ordinary words
# (e.g. the "wwwesome" part of "awwwesome").
tweet_data = tweet_data.withColumn('text', F.regexp_replace('text', r'www\.[^ ]+', ''))

In [None]:
# Print the schema (column names and types) of the loaded DataFrame.
tweet_data.printSchema()

In [None]:
# Drop rows that contain null values (na.drop() removes rows, not columns).
tweet_data=tweet_data.na.drop()

In [None]:
# Class balance: number of tweets per polarity value, largest group first.
(
    tweet_data
    .groupBy("polarity")
    .count()
    .orderBy(col("count").desc())
    .show()
)

### Building pipeline

In [None]:
# Feature-extraction stages; each one reads the column written by the previous one.

# text -> mytokens
tokenizer = Tokenizer(
    inputCol='text',
    outputCol='mytokens',
)

# mytokens -> filtered_tokens (stop words removed)
stopwords_remover = StopWordsRemover(
    inputCol='mytokens',
    outputCol='filtered_tokens',
)

# filtered_tokens -> w2v (200-dimensional Word2Vec vector)
word_2_vec = Word2Vec(
    vectorSize=200,
    inputCol='filtered_tokens',
    outputCol='w2v',
)

In [None]:
# Logistic-regression classifier: features come from the `w2v` column,
# labels from `polarity`.
model = LogisticRegression(
    featuresCol='w2v',
    labelCol='polarity',
    maxIter=10000,
    regParam=0.008,
)

In [None]:
# Assemble the pipeline in execution order:
# tokenize -> remove stop words -> Word2Vec -> logistic regression.
pipeline_stages = [
    tokenizer,
    stopwords_remover,
    word_2_vec,
    model,
]
pipeline = Pipeline(stages=pipeline_stages)

### Training model

In [None]:
# 80/20 train/test split with a fixed seed (42), then fit the whole pipeline
# on the training portion.
train_tweet, test_tweet = tweet_data.randomSplit([0.8, 0.2], seed=42)
pipelineFit = pipeline.fit(train_tweet)

In [None]:
# Disabled alternative to the training cell: load a saved pipeline model from
# 'pipeline_model' when that directory exists, otherwise train and save one.
# The code sits inside a string literal, so none of it executes.
# NOTE(review): before re-enabling, `os` must be imported (it is not among the
# imports visible in this notebook), and the load branch binds `pipeline_model`
# while the train branch binds `pipelineFit` — pick one name.
'''
# fit the pipeline model with the training data
if os.path.isdir('pipeline_model'):
    # Loading the model
    print("Loading the model from a file . . .")
    pipeline_model = PipelineModel.load('pipeline_model')
else:
    print("Training the model model . . .")
    pipelineFit = pipeline.fit(train_tweet)
    # Persist the model, ref: https://spark.apache.org/docs/latest/ml-pipeline.html
    pipelineFit.write().overwrite().save('pipeline_model')
'''

### Evaluating model

In [23]:
# Score the held-out test split with the fitted pipeline.
predictions = pipelineFit.transform(test_tweet)

# The evaluator is configured for F1, so that value is reported as F1.
# Previously it was stored in `accuracy` and printed as "Accuracy:".
evaluator = MulticlassClassificationEvaluator(labelCol='polarity',predictionCol='prediction',metricName='f1')
f1_score = evaluator.evaluate(predictions)

# Accuracy comes from the same evaluator by overriding metricName for this
# call only; the evaluator itself stays configured for F1.
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: 'accuracy'})

print("F1 score:", f1_score)
print("Accuracy:", accuracy)

In [None]:
# Persist the fitted pipeline model to 'pipeline_lr_model'; overwrite()
# replaces a model already saved at that path.
pipelineFit.write().overwrite().save('pipeline_lr_model')