## Import Libraries

In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, LongType, DoubleType
from pyspark.ml.feature import CountVectorizer , IDF, VectorAssembler
from pyspark.mllib.linalg import Vector, Vectors, VectorUDT
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import udf, lit, when, col

## Initialize Spark Session

In [3]:
# Build (or reuse, via getOrCreate) a Spark session on the 'local' master
# with 15g of driver memory.
spark = (
    SparkSession.builder
    .master('local')
    .appName('local')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

## Paths to Covid Tweets sanitized for LDA Purposes

In [4]:
# Directory (relative to this notebook) holding the tweets sanitized for LDA.
COVID_PROCESSED_PATH = "../data/processed/full-tweets-sanitized"
# Name of the CSV file inside that directory which this notebook reads.
COVID_PROCESSED_FILENAME = "Tweets_fully_processed_LDA.csv"

## Read in the data

In [5]:
# Explicit two-column schema: a long `id` and a string `full_text`, both nullable.
covid_schema = (
    StructType()
    .add('id', LongType(), True)
    .add('full_text', StringType(), True)
)

# Full path of the input CSV: "<directory>/<file name>".
covid_csv_path = "/".join((COVID_PROCESSED_PATH, COVID_PROCESSED_FILENAME))

# header=True: the first line of the file holds column names, not data.
df_covid = spark.read.csv(covid_csv_path, header=True, schema=covid_schema)

## Clean-up and Tokenize

There could be rows where no words are left after cleaning, so we need to impute something there for Tokenizer to work.

Tokenizer will convert a String into a List where each entry is a word.

In [6]:
# Null tweets must be filled before tokenizing. An empty string is used rather
# than a placeholder sentence: the placeholder's own words ("null", "imputed")
# are longer than 3 characters, so they would survive the stop-word filtering
# below and end up in the vocabulary the topic model is trained on.
# `subset` limits the fill to the text column.
df_covid_no_null = df_covid.na.fill("", subset=["full_text"])

# Turn every tweet string into a list of words in a new "words" column.
tokenizer = Tokenizer(inputCol="full_text", outputCol="words")
tokenized = tokenizer.transform(df_covid_no_null)

## Model Prep

Use the TF-IDF method to extract features of our tweets. Read more about it here: https://spark.apache.org/docs/1.4.1/mllib-feature-extraction.html

After some initial runs, the results were not ideal, so I decided to remove any words that were 3 characters long or less. In addition, a few other words showed up in topics where they made little sense or appeared too often, e.g. "coronavirus". The StopWordsRemover function accepts a custom list of words to remove, which is how I remove them from our tweets. Then I refit cvmodel on the filtered words.

Example to see the 20 most common vocab words left from the model: list(cvmodel.vocabulary[0:20])

In [7]:
# --- TF, pass 1: fit on the raw tokens only to learn the corpus vocabulary ---
vocab_cv = CountVectorizer(inputCol="words", outputCol="raw_features")
vocab_model = vocab_cv.fit(tokenized)

# Custom stop-word list. NOTE: despite its name, this list holds the words to
# REMOVE: every vocabulary entry of 3 characters or fewer, followed by a few
# hand-picked terms.
hand_picked_noise = ['coronavirus', 'corona', 'tandem', 'skin', 'tone']
more_then_3_charachters = [
    term for term in vocab_model.vocabulary if not len(term) > 3
] + hand_picked_noise

remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=more_then_3_charachters,
)
wordsDataFrame = remover.transform(tokenized)

# --- TF, pass 2: refit on the filtered tokens; this is the model used from here on ---
cv = CountVectorizer(inputCol="filtered", outputCol="raw_features")
cvmodel = cv.fit(wordsDataFrame)
result_cv = cvmodel.transform(wordsDataFrame)

# --- IDF: turn the "raw_features" counts into the "features" column ---
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)

## Train Model

The k parameter sets how many topics you want; maxIter is the maximum number of iterations the algorithm will run. I found 4 topics to be the sweet spot where you get reasonable output. You can run fewer iterations, but 200 or 300 worked best.

In [None]:
# Number of topics and maximum number of training iterations.
NUM_TOPICS = 4
LDA_MAX_ITER = 300
# Fixed seed so that re-running the notebook on the same data and setup yields
# the same topics in the same order. Later cells attach hand-written labels to
# topics by position, so re-check those labels against describeTopics()
# whenever the seed, the data or the hyper-parameters change.
LDA_SEED = 42

lda = LDA(k=NUM_TOPICS, maxIter=LDA_MAX_ITER, seed=LDA_SEED)
model = lda.fit(result_tfidf)

## Find Topics

In [None]:
# Top-weighted terms per topic; the term indices point into cvmodel.vocabulary.
# 10 terms per topic is the default, spelled out here for the reader.
topics = model.describeTopics(maxTermsPerTopic=10)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)


## Find Topic words rather than indices

In [None]:
# Take the term indices straight from the fitted model instead of pasting them
# in by hand: pasted indices silently go stale whenever the model or the
# vocabulary is refit.
topic_rows = topics.orderBy("topic").select("termIndices").collect()
topics_list = [list(row["termIndices"]) for row in topic_rows]

# Translate every index into the word it stands for in the CountVectorizer vocabulary.
vocabulary = cvmodel.vocabulary
results = [
    [vocabulary[term_index] for term_index in term_indices]
    for term_indices in topics_list
]
results

## Use our model to get predictions

Ideally this would be done on unseen text, but we want to determine which topic each tweet best falls under. This returns a column which gives the probabilities of the tweet falling under each topic.

In [None]:
# Score the same TF-IDF frame the model was fitted on. The transformed frame
# carries the "topicDistribution" column that the following cells read.
test = result_tfidf
test2 = model.transform(result_tfidf)

## Pull out Probabilities

The model call above returns its results in the Vector format used by PySpark's ml library, which isn't suitable for export. I used this UDF to pull out the probabilities so they can be used.

In [None]:
def ith_(v, i):
    """Return element ``i`` of ``v`` as a Python float, or None if it cannot be read.

    Args:
        v: An indexable vector (anything supporting ``v[i]``); may be None.
        i: Position of the element to extract.

    Returns:
        ``float(v[i])``, or None when ``v`` is None / not indexable, ``i`` is
        out of range, or the element cannot be converted to a float.
    """
    try:
        return float(v[i])
    except (IndexError, TypeError, ValueError):
        # IndexError: i is out of range.
        # TypeError:  v is None / not indexable, or the element is not float-convertible.
        # ValueError: the element is a value float() rejects (e.g. a non-numeric string).
        return None

# Wrap ith_ as a Spark UDF that returns a double column.
ith = udf(f=ith_, returnType=DoubleType())

## Create Final Output

In [None]:
# One double column per entry of "topicDistribution": a, b, c, d <-> positions 0..3.
prob_names = ['a', 'b', 'c', 'd']
prob_columns = [
    ith("topicDistribution", lit(position)).alias(name)
    for position, name in enumerate(prob_names)
]
p_a, p_b, p_c, p_d = [col(name) for name in prob_names]

# Most probable topic per tweet. The conditions are tried in order, so on a tie
# the lower-numbered topic wins; "Topic 4" is whatever is left.
topic_label = (
    when((p_a >= p_b) & (p_a >= p_c) & (p_a >= p_d), "Topic 1")
    .when((p_b >= p_a) & (p_b >= p_c) & (p_b >= p_d), "Topic 2")
    .when((p_c >= p_a) & (p_c >= p_b) & (p_c >= p_d), "Topic 3")
    .otherwise("Topic 4")
)

# Hand-assigned, human-readable name for each topic label.
topic_description = (
    when(col("Topic") == "Topic 1", "Other")
    .when(col("Topic") == "Topic 2", "Social Distance")
    .when(col("Topic") == "Topic 3", "Masks")
    .otherwise("Lockdown")
)

test3 = (
    test2.select('id', *prob_columns)
    .withColumn("Topic", topic_label)
    .withColumn("Topic Description", topic_description)
    .select("id", "Topic", "Topic Description")
)
# Export is left disabled; uncomment to write the result out as CSV.
#test3.write.csv("../data/processed/LDA_topics")