In [56]:
import pandas as pd
import ijson
from pyspark.sql import SparkSession
import warnings
# Suppress every Python warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

In [66]:
import glob
# Collect original (non-retweet) tweets from every formatted 7-day dump,
# keeping each tweet id only once.
tweet_id = []      # ids of the tweets that were kept, in the same order as the rows of `data`
time = []
text = []
location = []
seen_ids = set()   # O(1) duplicate check; the old test looked the id up in `location`, which never matched

# Sorted so the row order of `data` is reproducible (glob order is unspecified).
read_files = sorted(glob.glob("7day/7dayTweetData_formatted*.json"))
for file in read_files:
    # Binary mode: ijson decodes the bytes itself, so the result does not
    # depend on the platform's default text encoding.
    with open(str(file), 'rb') as f:
        # Stream the elements of the top-level "results" array one at a time.
        for row in ijson.items(f, 'results.item'):
            if row['text'].startswith("RT") or row['id'] in seen_ids:
                continue
            seen_ids.add(row['id'])
            tweet_id.append(row['id'])
            location.append(row['user']['location'])
            time.append(row['created_at'])
            text.append(row['text'])

data = pd.DataFrame({"text":text,"time":time,"location":location},columns=["text","time","location"])
# Keep only the calendar date of each tweet's timestamp.
data['time'] = pd.to_datetime(data['time']).dt.date

In [61]:
import json  # not imported anywhere else in the notebook

# Write the collected tweet ids to tweet_id.json as {"id": [...]}.
# `tweet_id` must already hold the ids in the kernel. It is deliberately NOT
# rebound to the wrapper dict, so re-running this cell does not nest the
# payload as {"id": {"id": [...]}}.
with open('tweet_id.json', 'w') as outfile:
    json.dump({'id': tweet_id}, outfile)

In [None]:
from SentimentAnalysis import *

In [None]:
# Obtain the Spark session (application name 'NLP') used by the sentiment cells.
spark = (
    SparkSession.builder
    .appName('NLP')
    .getOrCreate()
)

In [None]:
# Text-cleaning helpers wrapped as string-returning Spark UDFs.
remove_features_udf = udf(remove_features, StringType())
fix_abbreviation_udf = udf(fix_abbreviation, StringType())
strip_non_ascii_udf = udf(strip_non_ascii, StringType())

# Scoring UDF yields a float; the labelling UDF passes that score through
# condition() and yields a string.
sentiment_analysis_udf = udf(sentiment_analysis, FloatType())
sentiment_udf = udf(lambda score: condition(score), StringType())

In [None]:
# Move the tweets into Spark and add the first cleaning stage as a new column.
df = spark.createDataFrame(data)
df = df.withColumn(
    'text_non_asci',
    strip_non_ascii_udf('text'),
)

In [None]:
# Second cleaning stage: run fix_abbreviation over the previous stage's output.
df = df.withColumn(
    'fixed_abbrev',
    fix_abbreviation_udf('text_non_asci'),
)

In [None]:
# Third cleaning stage: run remove_features over the abbreviation-fixed text.
df = df.withColumn(
    'removed',
    remove_features_udf('fixed_abbrev'),
)

In [None]:
# Score the fully cleaned text; the UDF is declared to return a float.
df = df.withColumn(
    "sentiment_score",
    sentiment_analysis_udf('removed'),
)

In [None]:
# Turn the numeric score into a string label, then keep only the columns that
# are reported on.
df = df.withColumn(
    "sentiment",
    sentiment_udf('sentiment_score'),
)
SA_results = df.select(['text', 'time', 'sentiment_score', 'sentiment'])

In [None]:
# Preview the per-tweet sentiment table (text, time, score, label).
SA_results.show()

In [None]:
# Count tweets per (day, sentiment) and list the negative ones in date order.
# The intermediate results get their own names: the previous version assigned
# the return value of .show() (which is None) to SA_results and overwrote the
# per-tweet frame, so the cell could not be re-run.
sentiment_counts = SA_results.groupBy(['time','sentiment']).count()
negative_by_day = sentiment_counts.where(sentiment_counts.sentiment=='negative').orderBy('time')
negative_by_day.show()

# Topic Model: Latent Dirichlet Allocation

In [3]:
from TextAnalytics import *

In [4]:
from pyspark.sql.types import ArrayType
def to_word(text):
    """Split a document string into a list of word tokens.

    Args:
        text: The document as a single string, or None.

    Returns:
        list[str]: The whitespace-separated tokens. Returns an empty list for
        None or for a string containing no tokens.

    Splitting on runs of whitespace (rather than on a single space) means
    repeated, leading or trailing spaces no longer produce empty-string
    tokens, which would otherwise be counted as vocabulary terms downstream.
    """
    if text is None:
        # Null cells reach a UDF as None; treat them as empty documents.
        return []
    return text.split()

In [5]:
# Obtain the Spark session (application name 'NLP') for the topic-model cells.
spark = (
    SparkSession.builder
    .appName('NLP')
    .getOrCreate()
)

In [6]:
# String-returning UDF wrappers, listed in the order the cleaning cells below
# apply them.
strip_non_ascii_udf = udf(strip_non_ascii, StringType())
fix_abbreviation_udf = udf(fix_abbreviation, StringType())
remove_stops_udf = udf(remove_stops, StringType())
remove_features_udf = udf(remove_features, StringType())
tag_and_remove_udf = udf(tag_and_remove, StringType())
lemmatize_udf = udf(lemmatize, StringType())
check_blanks_udf = udf(check_blanks, StringType())

# Not applied by any of the cleaning cells in this notebook section.
check_lang_udf = udf(check_lang, StringType())

# Tokeniser: the only wrapper here that returns an array of strings.
to_word_udf = udf(to_word, ArrayType(StringType()))

In [7]:
# Build the Spark DataFrame that the topic-model cleaning stages start from.
# NOTE(review): a later cell rebinds `data` to a filtered Spark DataFrame, so
# this cell presumably has to run while `data` is still the pandas frame — confirm.
rawdata = spark.createDataFrame(data)

In [8]:
# Snapshot the column names before any derived column is added; the cleaning
# cells use select(raw_cols + [...]) to keep these plus one stage output.
raw_cols =  rawdata.columns

In [9]:
# Stage 1: apply strip_non_ascii to the raw tweet text.
rawdata = rawdata.withColumn(
    'non_asci',
    strip_non_ascii_udf('text'),
)

In [10]:
# Stage 2: apply fix_abbreviation to the stage-1 column, carrying forward only
# the original columns plus that input column.
rawdata = (
    rawdata
    .select(raw_cols + ['non_asci'])
    .withColumn('fixed_abbrev', fix_abbreviation_udf('non_asci'))
)

In [11]:
# Stage 3: apply remove_stops to the stage-2 column; the stage-1 column is
# dropped by the select.
rawdata = (
    rawdata
    .select(raw_cols + ['fixed_abbrev'])
    .withColumn('stop_texts', remove_stops_udf('fixed_abbrev'))
)

In [12]:
# Stage 4: apply remove_features to the stage-3 column.
rawdata = (
    rawdata
    .select(raw_cols + ['stop_texts'])
    .withColumn('removed', remove_features_udf('stop_texts'))
)

In [13]:
# Stage 5: apply tag_and_remove to the stage-4 column.
rawdata = (
    rawdata
    .select(raw_cols + ['removed'])
    .withColumn('tagged_text', tag_and_remove_udf('removed'))
)

In [14]:
# Stage 6: apply lemmatize to the stage-5 column.
rawdata = (
    rawdata
    .select(raw_cols + ['tagged_text'])
    .withColumn('lemm_text', lemmatize_udf('tagged_text'))
)

In [15]:
# Stage 7: flag each lemmatised document via check_blanks. The flag is stored
# as a string column (the UDF is declared with StringType).
rawdata = (
    rawdata
    .select(raw_cols + ['lemm_text'])
    .withColumn("is_blank", check_blanks_udf("lemm_text"))
)

In [16]:
# Stage 8: tokenise the lemmatised text into an array-of-strings column,
# keeping the blank flag alongside it.
rawdata = (
    rawdata
    .select(raw_cols + ['lemm_text', 'is_blank'])
    .withColumn("word", to_word_udf("lemm_text"))
)

In [17]:
from pyspark.sql.functions import monotonically_increasing_id
# Give every document a unique id, then keep only the rows whose blank flag is
# the string "False".
# NOTE(review): this rebinds `data`, which earlier held the pandas frame of raw
# tweets, to a Spark DataFrame.
rawdata = rawdata.withColumn(
    "uid",
    monotonically_increasing_id(),
)
data = rawdata.where(rawdata["is_blank"] == "False")

In [18]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier
from pyspark.ml.clustering import LDA
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.feature import CountVectorizer

# Declare estimator objects: a tokenizer, a count vectorizer, an IDF rescaler
# and a 20-topic EM-optimised LDA. Nothing is fitted or transformed here.
# NOTE(review): `idf` and `lda` are reassigned by later cells, and `tokenizer`
# and `vectorizer` are not referenced again in the visible notebook.
tokenizer = Tokenizer(
    inputCol="lemm_text",
    outputCol="words",
)
vectorizer = CountVectorizer(
    inputCol="words",
    outputCol="rawFeatures",
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

lda = LDA(
    k=20,
    seed=1,
    optimizer="em",
)

In [19]:
from pyspark.ml.feature import CountVectorizer

# Term counts over the token arrays, with the vocabulary capped at 1000 terms.
cv = CountVectorizer(
    inputCol="word",
    outputCol="rawFeatures",
    vocabSize=1000,
)
cvmodel = cv.fit(data)
featurizedData = cvmodel.transform(data)

# Rescale the raw counts with inverse document frequency (TF-IDF).
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [20]:
# Fit a 5-topic LDA on the TF-IDF features using the expectation-maximization
# ("em") optimizer with a fixed seed, then pull out the per-topic term summary.
lda = LDA(
    k=5,
    seed=123,
    optimizer="em",
    featuresCol="features",
)
ldamodel = lda.fit(rescaledData)
ldatopics = ldamodel.describeTopics()

In [21]:
def indices_to_terms(vocabulary):
    """Build a UDF that maps a sequence of vocabulary indices to their terms.

    Args:
        vocabulary: Indexable collection of terms; position i holds term i.

    Returns:
        A UDF (declared as returning an array of strings) that looks up each
        index of its input in ``vocabulary``.
    """
    # The inner function keeps the outer name on purpose: it is the callable
    # handed to udf(), so renaming it could change how the UDF is labelled.
    def indices_to_terms(xs):
        terms = []
        for index in xs:
            terms.append(vocabulary[int(index)])
        return terms

    return_type = ArrayType(StringType())
    return udf(indices_to_terms, return_type)

In [22]:
# Translate each topic's term indices into the actual vocabulary words.
ldatopics = ldatopics.withColumn(
    colName="topics_words",
    col=indices_to_terms(cvmodel.vocabulary)("termIndices"),
)

In [23]:
# Display each topic id next to its word list.
ldatopics.select('topic', 'topics_words').show()

+-----+--------------------+
|topic|        topics_words|
+-----+--------------------+
|    0|[covid, melbourne...|
|    1|[covid, get, peop...|
|    2|[covid, vaccine, ...|
|    3|[covid, get, vacc...|
|    4|[covid, test, cas...|
+-----+--------------------+



In [33]:
from pyspark.ml.feature import StopWordsRemover,Tokenizer, RegexTokenizer, CountVectorizer, IDF
from pyspark.sql.functions import udf, col, size, explode, regexp_replace, trim, lower, lit
from pyspark.sql.types import ArrayType, StringType, DoubleType, IntegerType, LongType
# from pyspark.ml.clustering import LDA
import pyLDAvis

import numpy as np
def format_data_to_pyldavis(df_filtered, count_vectorizer, transformed, lda_model):
    """Assemble the five inputs that are later passed to pyLDAvis.prepare.

    Args:
        df_filtered: Spark DataFrame with an array column named ``word``.
        count_vectorizer: Fitted vectorizer exposing ``vocabulary``.
        transformed: Spark DataFrame with a ``topicDistribution`` column.
        lda_model: Fitted LDA model exposing ``topicsMatrix()``.

    Returns:
        dict with keys ``topic_term_dists``, ``doc_topic_dists``,
        ``doc_lengths``, ``vocab`` and ``term_frequency``.

    NOTE(review): document lengths come from ``df_filtered`` while topic
    distributions come from ``transformed``; this assumes both yield rows in
    the same order — confirm.
    """
    vocabulary = count_vectorizer.vocabulary

    # Corpus-wide occurrence count of every token, reordered to follow the
    # vocabulary order.
    exploded = df_filtered.select(explode(df_filtered.word).alias("words"))
    counts_by_word = {}
    for record in exploded.groupby("words").count().collect():
        counts_by_word[record['words']] = record['count']
    term_frequency = [counts_by_word[term] for term in vocabulary]

    # The model's topics matrix, transposed.
    topic_term = np.array(lda_model.topicsMatrix().toArray()).T

    # One dense topic vector per document.
    topic_vectors = transformed.select(["topicDistribution"]).toPandas()['topicDistribution']
    doc_topic = np.array([vector.toArray() for vector in topic_vectors])

    # Number of tokens in each document.
    length_rows = df_filtered.select(size(df_filtered.word)).collect()
    doc_lengths = [row[0] for row in length_rows]

    return {
        'topic_term_dists': topic_term,
        'doc_topic_dists': doc_topic,
        'doc_lengths': doc_lengths,
        'vocab': vocabulary,
        'term_frequency': term_frequency,
    }

def filter_bad_docs(data, tol=1e-6):
    """Drop documents whose topic distribution is unusable, in place.

    A document is kept only if its topic distribution contains no NaN and its
    entries sum to 1 within ``tol``. The previous exact comparison
    (``np.sum(x) != 1``) also discarded valid distributions whose sum differed
    from 1 only by floating-point rounding.

    Args:
        data: dict with parallel sequences under ``'doc_topic_dists'`` and
            ``'doc_lengths'``. Both entries are replaced by filtered lists.
        tol: Absolute tolerance allowed between a distribution's sum and 1.

    Returns:
        None. ``data`` is modified in place.
    """
    kept_dists = []
    kept_lengths = []

    for dist, length in zip(data['doc_topic_dists'], data['doc_lengths']):
        if np.isnan(dist).any():
            continue
        # All-zero vectors and any other non-normalised vectors fail this check.
        if not np.isclose(np.sum(dist), 1.0, rtol=0.0, atol=tol):
            continue
        kept_dists.append(dist)
        kept_lengths.append(length)

    data['doc_topic_dists'] = kept_dists
    data['doc_lengths'] = kept_lengths

# Attach the per-document topic distribution to every row.
transformed = ldamodel.transform(rescaledData)

# Build the pyLDAvis input dict, then prune unusable documents from it in place.
formatted = format_data_to_pyldavis(
    df_filtered=data,
    count_vectorizer=cvmodel,
    transformed=transformed,
    lda_model=ldamodel,
)
filter_bad_docs(formatted)

  and should_run_async(code)


In [38]:
# Hand the five prepared inputs to pyLDAvis and render the interactive view
# as this cell's output.
py_lda_prepared_data = pyLDAvis.prepare(
    topic_term_dists=formatted['topic_term_dists'],
    doc_topic_dists=formatted['doc_topic_dists'],
    doc_lengths=formatted['doc_lengths'],
    vocab=formatted['vocab'],
    term_frequency=formatted['term_frequency'],
)
pyLDAvis.display(py_lda_prepared_data)