In [1]:
1) Create term cluster for each of 4 weeks
2) Create weekly / daily stats for each term
https://docs.databricks.com/notebooks/notebooks-use.html

In [2]:
import sparknlp

# Report the library and engine versions this notebook ran against.
print("Spark NLP version")
print(sparknlp.version())
print("Apache Spark version")
# `spark` is the SparkSession provided by the notebook runtime; previously the
# "Apache Spark version" label was printed with no value after it.
print(spark.version)

In [3]:
# Install the wordcloud package; WordCloud is imported from it in the imports cell.
!pip install wordcloud

In [4]:
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.functions import from_unixtime, to_date, asc, year, udf, explode, split, col, desc, length, rank, dense_rank, avg, sum
from pyspark.sql.window import Window
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.stat import Correlation
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.functions import col, to_timestamp,date_format
from collections import Counter
from wordcloud import WordCloud

#from textblob import TextBlob

In [5]:
# Read the tweet date and body (exposed as `text`) from the tweets table.
tweets_query = """select event_date, full_text as text from march_covid_tweets"""
df = spark.sql(tweets_query)


In [6]:
# Add a "week" column by formatting event_date with the "W" pattern letter.
# NOTE(review): "W" is a week-based pattern; Spark 3.0+ documents week-based
# patterns as unsupported in date_format — confirm against the cluster's Spark version.
data = df.withColumn("week", date_format(col("event_date"), "W"))

In [7]:
# Preview the text of the two tweets with the earliest event_date.
data.orderBy(asc("event_date")).select("text").show(2)

In [8]:
# Load the pretrained English 'explain_document_dl' Spark NLP pipeline;
# resolveEntities reads its `entities` and `pos` outputs.
documentExplainerPipeline = PretrainedPipeline('explain_document_dl', 'en')

In [9]:
# Text of the tweets whose week label equals 4.
is_week4 = col("week") == 4
week4DF = data.filter(is_week4).select("text")

In [10]:
# Text of the tweets whose week label equals 5.
is_week5 = col("week") == 5
week5DF = data.filter(is_week5).select("text")

In [11]:
# Sanity check: show two rows of the week-4 subset.
week4DF.show(2)

In [12]:
# Install NLTK, download its stopword corpus, and keep the English list;
# the list is reused by the StopWordsCleaner stage in Approach 3.
!pip install nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
eng_stopwords = stopwords.words('english')
# Optional debugging aids: add a custom stopword, or print the first entries.
#eng_stopwords.append('xxxx')
#for item in eng_stopwords[:10]:
#  print(item)

In [13]:
def wordcloud(corpus_sdf, ax=None, max_terms=500):
    """Draw a word cloud from a Spark DataFrame of term frequencies.

    Args:
        corpus_sdf: Spark DataFrame with a ``text`` column (the term) and a
            ``count`` column (its frequency).
        ax: Matplotlib axes to draw on. Defaults to the current axes, which
            is what the previous ``plt.imshow`` call targeted.
        max_terms: Number of rows pulled to the driver (default 500).

    Returns:
        The axes the cloud was drawn on. The function used to return None,
        so callers written as ``ax = wordcloud(...)`` ended up with ``ax = None``.
    """
    corpus_pdf = corpus_sdf.limit(max_terms).toPandas()

    # Build {term: frequency} in one pass; for repeated terms the last row
    # wins, exactly as with the earlier row-by-row loop.
    frequencies = dict(zip(corpus_pdf['text'], corpus_pdf['count']))

    # Named `cloud` so the local no longer shadows this function's own name.
    cloud = WordCloud().generate_from_frequencies(frequencies)
    if ax is None:
        ax = plt.gca()
    ax.imshow(cloud)
    return ax

### Approach 1

In [15]:
def remove_stopwords(x, stop_words=None):
    """Return ``x`` with stopwords removed, tokens separated by single spaces.

    Args:
        x: Text to clean. ``None`` (a null cell when this runs as a Spark
            UDF) is returned unchanged instead of raising.
        stop_words: Optional iterable of lowercase stopwords. Defaults to
            NLTK's English stopword list.

    Returns:
        The kept tokens joined by one space, with no trailing space and no
        empty tokens, or ``None`` when ``x`` is ``None``.
    """
    if x is None:
        return None
    if stop_words is None:
        stop_words = stopwords.words("english")
    # Set membership is O(1) per token instead of scanning the list each time.
    sw = frozenset(stop_words)
    # split(' ') yields '' for leading/trailing/repeated spaces; drop those so
    # downstream word counts are not polluted by an empty "word".
    kept = [tok for tok in x.split(' ') if tok and tok.lower() not in sw]
    return ' '.join(kept)

# Wrap remove_stopwords as a Spark UDF, register it for SQL under the name
# "nosw", and add its output as a text_nosw column on the week-4 tweets.
nosw = udf(remove_stopwords)
spark.udf.register("nosw", nosw)
week4DF = week4DF.withColumn('text_nosw',nosw('text'))

In [16]:
def corpus_creator(text_col):
    """Count token occurrences across the rows of ``text_col``.

    Each row of the Spark DataFrame ``text_col`` is expanded into tokens by
    ``flat_list`` and the tokens are tallied. The returned DataFrame has the
    columns ``text`` (token) and ``count`` (occurrences) and is built from an
    RDD sorted by descending count.
    """
    tokens = text_col.rdd.flatMap(flat_list)
    token_counts = tokens.map(lambda tok: (tok, 1)).reduceByKey(lambda a, b: a + b)
    ranked = token_counts.sortBy(lambda pair: pair[1], ascending=False)
    return ranked.toDF().withColumnRenamed('_1', 'text').withColumnRenamed('_2', 'count')

In [17]:
def flat_list(column):
    """Split every string in ``column`` on spaces and return all tokens.

    Args:
        column: Iterable of strings (for example a Spark ``Row``). ``None``
            entries are skipped instead of raising ``AttributeError``.

    Returns:
        A flat list of the non-empty tokens, in input order.
    """
    corpus = []
    for row in column:
        if row is None:
            continue
        # split(' ') emits '' for leading/trailing/repeated spaces; keeping
        # those would count the empty string as a word.
        corpus.extend(w for w in row.split(' ') if w)
    return corpus

In [18]:
# Term frequencies for the stopword-filtered week-4 tweets.
week4_text_nosw = week4DF.select("text_nosw")
corpus_tweet = corpus_creator(week4_text_nosw)

In [19]:
import matplotlib.pyplot as plt
import seaborn as sns
# Word cloud of the week-4 corpus on a fresh figure.
week_label = "2020-Jan-W4"
fig, ax = plt.subplots()
ax = wordcloud(corpus_tweet)
fig.suptitle('{} wordcloud'.format(week_label))

In [20]:
# Week 5, Approach 1: strip stopwords, count terms, then draw the cloud.
week5DF = week5DF.withColumn('text_nosw', nosw('text'))
week5_text_nosw = week5DF.select("text_nosw")
corpus_tweet_janw5 = corpus_creator(week5_text_nosw)

week_label = "2020-Jan-W5"
fig, ax = plt.subplots()
ax = wordcloud(corpus_tweet_janw5)
fig.suptitle('{} wordcloud'.format(week_label))

### Approach 2

In [22]:
import matplotlib.pyplot as plt
import seaborn as sns

def make_string(x):
  """Concatenate the items of ``x``, each followed by a single space.

  An empty ``x`` gives ``''``; any other input yields a string that ends
  with a space.
  """
  return ''.join(item + ' ' for item in x)

# Rebind make_string to its Spark UDF wrapper and register it for SQL use.
# From here on the name refers to the UDF, not the plain Python function.
make_string = udf(make_string)
spark.udf.register("make_string", make_string)   

In [23]:
def resolveEntities(input_corpus, week_list):
  """Annotate a term corpus with named entities and part-of-speech tags.

  Args:
    input_corpus: Spark DataFrame with ``text`` and ``count`` columns.
    week_list: Week identifiers; each becomes a (stringified) key of the result.

  Returns:
    Dict mapping ``str(week)`` to a DataFrame with the columns ``text``,
    ``count``, ``entities`` and ``pos`` (the last two flattened to
    space-separated strings), keeping only rows whose ``entities`` is non-empty.

  NOTE(review): ``input_corpus`` is not filtered by week here, so every key
  maps to the same annotated corpus; pass a corpus already restricted to the
  week of interest.
  """
  weekly_entities = {}
  print(week_list)
  input_corpus.show(5)

  # Nothing to key the result by: skip the expensive pipeline entirely.
  if not week_list:
    return weekly_entities

  # The annotation does not depend on the week, so run it once instead of
  # once per entry of week_list.
  print("starting entity resolution")
  entities_filtered = documentExplainerPipeline.transform(input_corpus) \
      .select('text', 'count',
              col('entities.result').alias('entities'),
              col('pos.result').alias('pos'))

  entities_filtered.show(2)

  # Flatten the annotation arrays to strings and drop rows without entities.
  entities_filtered = entities_filtered.withColumn('entities', make_string('entities')) \
      .withColumn('pos', make_string('pos')) \
      .filter('entities <> ""')

  entities_filtered.show(2)

  for weeknum in week_list:
    weekly_entities[str(weeknum)] = entities_filtered
  return weekly_entities

In [24]:
# Resolve entities for the Approach-1 week-4 term corpus; the result is keyed by '4'.
week_list = [4]
jan_weekly_entities = resolveEntities(corpus_tweet,week_list)

In [25]:
# One word-cloud figure per resolved week, titled with the week key.
for week_key in jan_weekly_entities:
    week_entities = jan_weekly_entities[week_key]
    fig, ax = plt.subplots()
    ax = wordcloud(week_entities)
    fig.suptitle('{} wordcloud'.format(week_key))

## Approach 3

In [27]:
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline
# Stage 1: wrap the raw `text` column as a Spark NLP document.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Stage 3: normalize tokens; lowercasing is switched on explicitly here.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

# Stage 4: lemmatize with the pretrained model loaded with default arguments.
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemma')
)

# Stage 5: drop the NLTK English stopwords, ignoring case.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemma'])
    .setOutputCol('clean_lemma')
    .setCaseSensitive(False)
    .setStopWords(eng_stopwords)
)

# Stage 6: the finisher converts annotations to human-readable output;
# the annotation columns are kept (cleanAnnotations=False).
finisher = (
    Finisher()
    .setInputCols(['clean_lemma'])
    .setCleanAnnotations(False)
)

# Assemble the stages in processing order.
pipeline_stages = [
    documentAssembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
]
pipeline = Pipeline().setStages(pipeline_stages)

In [28]:
# Fit the Approach-3 pipeline on the week-4 tweets and apply it to the same frame.
week4CleanDF = pipeline.fit(week4DF).transform(week4DF) # output has stopwords removed by the clean_lemma stage

In [29]:
# Fit the Approach-3 pipeline on the week-5 tweets and apply it to the same frame.
week5CleanDF = pipeline.fit(week5DF).transform(week5DF)

In [30]:
# Same fit/transform expression as week4CleanDF, kept under a second name;
# the first 10 rows are displayed for inspection.
week4CleanDFV2 = pipeline.fit(week4DF).transform(week4DF)
week4CleanDFV2.show(10)

In [31]:
from pyspark.sql.functions import explode, col
# One row per cleaned lemma: explode the finisher's array column into exploded_text.
week4TokenDF = week4CleanDF.withColumn("exploded_text", explode(col("finished_clean_lemma")))

In [32]:
# One row per cleaned lemma for week 5, exploded into exploded_text.
week5TokenDF = week5CleanDF.withColumn("exploded_text", explode(col("finished_clean_lemma")))

In [33]:
# Occurrences of each cleaned week-4 token, with the token column named "text".
countWeek4DF = (
    week4TokenDF
    .groupby('exploded_text')
    .count()
    .withColumnRenamed("exploded_text", "text")
)
#countWeek4DF.show()

In [34]:
# Occurrences of each cleaned week-5 token, with the token column named "text".
countWeek5DF = (
    week5TokenDF
    .groupby('exploded_text')
    .count()
    .withColumnRenamed("exploded_text", "text")
)
#countWeek5DF.show()

In [35]:
# Persist the per-week term counts as ORC tables for the SQL queries that follow.
# Week 4 is saved as well: the top-terms query reads countJanWeek4DF, which no
# visible cell wrote, so a fresh "Run All" depended on a table left over from
# an earlier session.
for counts_df, table_name in (
    (countWeek4DF, "countJanWeek4DF"),
    (countWeek5DF, "countJanWeek5DF"),
):
    counts_df.repartition(150).write.format("orc").mode('overwrite').saveAsTable(table_name)

In [36]:
# Up to 100 week-4 terms that occur more than 1000 times, most frequent first.
topTermsWeek4DF = spark.sql("""select text, count from countJanWeek4DF where count > 1000 order by count desc limit 100""")
# Equivalent query as a %sql cell (top 50), kept for reference:
#%sql
#select text, count from countJanWeek4DF where count > 1000 order by count desc limit 50

In [37]:
# Up to 100 week-5 terms that occur more than 1000 times, most frequent first.
topTermsWeek5DF = spark.sql("""select text, count from countJanWeek5DF where count > 1000 order by count desc limit 100""")
#topTermsDF.show(100)


In [38]:
# Word cloud of the top week-4 terms from the Approach-3 pipeline.
week_label = "2020-01-W4"
fig, ax = plt.subplots()
ax = wordcloud(topTermsWeek4DF)
fig.suptitle('{} wordcloud'.format(week_label))

In [39]:
# Compare side by side: two of the top week-4 terms, then the week-4 entity
# table produced in Approach 2 (stored under key '4').
topTermsWeek4DF.show(2)
jan_weekly_entities['4'].show()

In [40]:
# Word cloud of the top week-5 terms from the Approach-3 pipeline.
week_label = "2020-01-W5"
fig, ax = plt.subplots()
ax = wordcloud(topTermsWeek5DF)
fig.suptitle('{} wordcloud'.format(week_label))

## Approach 4

In [42]:
# Resolve entities again, this time on the Approach-3 top week-4 terms.
week_list = [4]
jan_weekly_entities_v2 = resolveEntities(topTermsWeek4DF,week_list)

In [43]:

# One word-cloud figure per resolved week, titled with the week key.
for week_key in jan_weekly_entities_v2:
    week_entities = jan_weekly_entities_v2[week_key]
    fig, ax = plt.subplots()
    ax = wordcloud(week_entities)
    fig.suptitle('{} wordcloud'.format(week_key))

In [44]:
# Show the first 20 of the top week-4 terms.
topTermsWeek4DF.show(20)

In [45]:
from sparknlp.pretrained import PretrainedPipeline
# Load two pretrained English NER pipelines to compare their output on the tweets.
pipeline1 = PretrainedPipeline('recognize_entities_dl', 'en')

pipeline2 = PretrainedPipeline("onto_recognize_entities_sm", "en")

# Pipeline 1: annotate the full dataset (all weeks) and inspect the entity chunks.
predictions1 = pipeline1.transform(data)
predictions1EntityResult=predictions1.select("entities.result")
predictions1EntityResult.show(10)
#predictions1EntityResult.printSchema()

# Pipeline 1: inspect the `ner` output and its schema.
predictions1NerResult=predictions1.select("ner.result")
predictions1NerResult.show(1)
predictions1NerResult.printSchema()

# Pipeline 2: same inspection of the `entities` and `ner` outputs.
predictions2 = pipeline2.transform(data)
predictions2.select("entities.result").show(5)
predictions2.select("ner.result").show(2)