In [83]:
folder_path = "../Weibo_Data/"

visuals_output_path = "./visuals/"
output_path = "./output/"

file1 = "weibo_封城+疫情.csv"
file2 = "weibo_封城.csv"
file3 = "weibo_疫情.csv"

In [84]:
!pip install pyLDAvis



In [85]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

import nltk
nltk.download('stopwords') #download if don't have yet
from nltk.tokenize import word_tokenize, RegexpTokenizer # tokenize words
from nltk.corpus import stopwords

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lindy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [86]:
# Read in files
df1 = pd.read_csv(folder_path + file1)
df2 = pd.read_csv(folder_path + file2)
df3 = pd.read_csv(folder_path + file3)

<h1>Cleaning Data</h1>

In [87]:
list1 = df1["text"].tolist()
list2 = df2["text"].tolist()
list3 = df3["text"].tolist()

In [88]:
def remove_user_mentions(text):
    return re.sub("@[A-Za-z0-9_]+","", text)

def remove_links(text):
    return re.sub(r'http\S+', '', text)

def remove_digit_strings(text):
    return re.sub(r'\d+', '', text)

def remove_special_chars(text):
    remove_chars = '[0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
    return re.sub(remove_chars, ' ', text)

def clean_text(text):
    result_text = text
    result_text = remove_user_mentions(result_text)
    result_text = remove_links(result_text)
    result_text = remove_digit_strings(result_text)
    result_text = remove_special_chars(result_text)
    result_text = result_text.lower()
    return result_text

def remove_cn_chars(text):
    result_text = re.sub(r'([\u4e00-\u9fff]+', '', text)
    return result_text

In [89]:
for i in range(len(list1)):
#     list1[i] = remove_cn_chars(list1[i])
    list1[i] = clean_text(list1[i])

for i in range(len(list2)):
#     list2[i] = remove_cn_chars(list2[i])
    list2[i] = clean_text(list2[i])

for i in range(len(list3)):
#     list3[i] = remove_cn_chars(list3[i])
    list3[i] = clean_text(list3[i])

In [90]:
df1["cleaned_text"] = list1
df2["cleaned_text"] = list2
df3["cleaned_text"] = list3

In [91]:
# Create dictionary of file name to associated data

weibo = {}
keys = ["weibo_封城+疫情", "weibo_封城", "weibo_疫情"]
for key in keys:
    weibo[key] = {}
    
weibo["weibo_封城+疫情"]["text"] = df1["cleaned_text"].tolist()
weibo["weibo_封城"]["text"] = df2["cleaned_text"].tolist()
weibo["weibo_疫情"]["text"] = df3["cleaned_text"].tolist()

<h1>Tokenizing Data</h1>

In [92]:
# Get custom stopwords
content = []
f = open("./stopwords.txt", encoding = 'utf-8')
# perform file operations
for line in f:
    content.append(line)
f.close()

custom_stopwords = []
for line in content:
    wordlist = line.split(",")
    for word in wordlist:
        custom_stopwords.append(word)

In [93]:
# Stopwords
stop_words = stopwords.words('english')
exclude_words = stop_words

exclude_words_extra = ["wuhan","covid","coronavirus","lockdown","lockdo","pandemic","let","get","ago","go","im","ive","would","one","also","to","tag"]

# Exclude custom stopwords
exclude_words.extend(custom_stopwords)
exclude_words.extend(exclude_words_extra)

In [94]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

for w in weibo.keys():
    weibo[w]["data_words"] = list(sent_to_words(weibo[w]["text"]))

In [95]:
def bigram_trigram_models(w):
    data_words = weibo[w]["data_words"]
    
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
    
    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    weibo[w]["bigram"] = bigram
    weibo[w]["trigram"] = trigram
    weibo[w]["bigram_mod"] = bigram_mod
    weibo[w]["trigram_mod"] = trigram_mod

for w in weibo.keys():
    bigram_trigram_models(w)

In [96]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in exclude_words] for doc in texts]

def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts, bigram_mod, trigram_mod):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [97]:
def combined(w):
    data_words = weibo[w]["data_words"]

    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)
    weibo[w]["data_words_nostops"] = data_words_nostops 

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops, weibo[w]["bigram_mod"])
    weibo[w]["data_words_bigrams"] = data_words_bigrams

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    weibo[w]["data_lemmatized"] = data_lemmatized

for w in weibo.keys():
    combined(w)

In [98]:
print(weibo[list(weibo.keys())[0]].keys())

dict_keys(['text', 'data_words', 'bigram', 'trigram', 'bigram_mod', 'trigram_mod', 'data_words_nostops', 'data_words_bigrams', 'data_lemmatized'])


In [99]:
def tokenize(w):
    data_lemmatized = weibo[w]["data_lemmatized"]

    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)
    weibo[w]["id2word"] = id2word

    # Create Corpus
    texts = data_lemmatized
    weibo[w]["texts"] = texts

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    weibo[w]["corpus"] = corpus

    # Human readable format of corpus (term-frequency)
    weibo[w]["corpus_readable"] = [[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

for w in weibo.keys():
    tokenize(w)

In [100]:
index = 1   # change this to see diff output
print(weibo[list(weibo.keys())[index]]["corpus_readable"])

[[('advance', 1), ('city', 3), ('discipline', 1), ('dog', 1), ('event', 1), ('example', 1), ('fall', 1), ('go', 1), ('grow', 1), ('honest', 1), ('maybe', 1), ('myth', 1), ('prepare', 1), ('speak', 1), ('strange', 1), ('student', 1), ('subject', 1), ('today', 1), ('yesterday', 1)]]


<h1>LDA Model</h1>

In [101]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

# supporting function 2
def compute_perplexity_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    perplexity_score = lda_model.log_perplexity(corpus_sets[i])
    
    return perplexity_score

In [102]:
def lda_model(w):
    corpus = weibo[w]["corpus"]
    id2word = weibo[w]["id2word"]

    # Build LDA model
    num_topics = 4
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=num_topics, 
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

    weibo[w]["lda_model"] = lda_model

for w in weibo.keys():
    lda_model(w)

In [103]:
index = 1   # change this to see diff output
print(weibo[list(weibo.keys())[index]]["lda_model"])

LdaModel<num_terms=2768, num_topics=4, decay=0.5, chunksize=100>


In [104]:
def keyword_in_topics(w):
    corpus = weibo[w]["corpus"]
    lda_model = weibo[w]["lda_model"]

    # Print the Keyword in the 10 topics
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]
    weibo[w]["doc_lda"] = doc_lda

for w in weibo.keys():
    keyword_in_topics(w)

[(0,
  '0.025*"city" + 0.016*"text" + 0.015*"full" + 0.013*"people" + 0.012*"many" '
  '+ 0.011*"number" + 0.011*"impact" + 0.010*"spread" + 0.010*"outbreak" + '
  '0.009*"close"'),
 (1,
  '0.034*"day" + 0.031*"year" + 0.029*"city" + 0.026*"full" + 0.025*"time" + '
  '0.021*"close" + 0.014*"go" + 0.014*"text" + 0.012*"home" + 0.012*"today"'),
 (2,
  '0.042*"people" + 0.018*"country" + 0.016*"full" + 0.015*"text" + '
  '0.014*"city" + 0.013*"medical" + 0.013*"make" + 0.013*"control" + '
  '0.013*"fight" + 0.011*"life"'),
 (3,
  '0.029*"new" + 0.020*"country" + 0.016*"director" + 0.015*"express" + '
  '0.013*"confirm" + 0.010*"accord" + 0.010*"week" + 0.009*"quarantine" + '
  '0.009*"later" + 0.009*"text"')]
[(0,
  '0.014*"soon" + 0.012*"sell" + 0.011*"wave" + 0.009*"lie" + 0.008*"hand" + '
  '0.008*"strain" + 0.008*"everywhere" + 0.008*"die" + 0.008*"know" + '
  '0.008*"spread"'),
 (1,
  '0.016*"people" + 0.014*"high" + 0.013*"country" + 0.013*"less" + '
  '0.010*"text" + 0.009*"past" +

In [105]:
index = 1   # change this to see diff output
print(weibo[list(weibo.keys())[index]]["doc_lda"])

<gensim.interfaces.TransformedCorpus object at 0x000001C23BEBFCD0>


In [106]:
def perplexity_coherence(w):
    corpus = weibo[w]["corpus"]
    data_lemmatized = weibo[w]["data_lemmatized"]
    id2word = weibo[w]["id2word"]
    lda_model = weibo[w]["lda_model"]

    # Compute Perplexity
    weibo[w]["perplexity"] = lda_model.log_perplexity(corpus)
    # print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    weibo[w]["coherence_model_lda"] = coherence_model_lda
    weibo[w]["coherence_lda"] = coherence_lda

    # print('\nCoherence Score: ', coherence_lda)

for w in weibo.keys():
    perplexity_coherence(w)

In [107]:
index = 1   # change this to see diff output
print(weibo[list(weibo.keys())[index]]["perplexity"])
print(weibo[list(weibo.keys())[index]]["coherence_lda"])

-6.996089070469202
0.43052963952756484


In [108]:
def optimal_model(w): 
    lda_model = weibo[w]["lda_model"]

    optimal_model = lda_model
    model_topics = optimal_model.show_topics(formatted=False)

    weibo[w]["model_topics"] = model_topics

for w in weibo.keys():
    optimal_model(w)

In [109]:
index = 1   # change this to see diff output
print(weibo[list(weibo.keys())[index]]["model_topics"])

[(0, [('soon', 0.013631528), ('sell', 0.011673953), ('wave', 0.010919605), ('lie', 0.009306657), ('hand', 0.008355316), ('strain', 0.008177674), ('everywhere', 0.007935251), ('die', 0.0077032903), ('know', 0.007525805), ('spread', 0.00751747)]), (1, [('people', 0.01639532), ('high', 0.013906687), ('country', 0.013197607), ('less', 0.012678805), ('text', 0.010279735), ('past', 0.009227668), ('number', 0.0072286665), ('policy', 0.006835826), ('medium', 0.0067634718), ('much', 0.006626914)]), (2, [('outbreak', 0.012870387), ('lot', 0.010422628), ('hard', 0.0101859765), ('need', 0.008714052), ('read', 0.008118482), ('write', 0.007911781), ('kind', 0.007873071), ('little', 0.00784374), ('text', 0.007790091), ('afraid', 0.0073796595)]), (3, [('city', 0.041282814), ('year', 0.038655777), ('full', 0.028891843), ('day', 0.0234722), ('time', 0.022343297), ('last', 0.021189958), ('close', 0.019520547), ('people', 0.016406493), ('text', 0.014261148), ('go', 0.012534313)])]


In [110]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """
    accept a ldamodel, atopic number and topn vocabs of interest
    prints a formatted list of the topn terms
    """
    terms = []
    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        terms += [term]
        if output:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))
    
    return terms

In [111]:
def explore_topic_by_title(w):
    lda_model = weibo[w]["lda_model"]
    topic_summaries = []

    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
    print(w)
    for i in range(4):
        print('Topic '+str(i)+' |---------------------\n')
        tmp = explore_topic(lda_model,topic_number=i, topn=10, output=True )
        #     print tmp[:5]
        topic_summaries += [tmp[:5]]
        # print

    weibo[w]["topic_summaries"] = topic_summaries

for w in weibo.keys():
    explore_topic_by_title(w)

term                 frequency

weibo_封城+疫情
Topic 0 |---------------------

city                 0.025
text                 0.016
full                 0.015
people               0.013
many                 0.012
number               0.011
impact               0.011
spread               0.010
outbreak             0.010
close                0.009
Topic 1 |---------------------

day                  0.034
year                 0.031
city                 0.029
full                 0.026
time                 0.025
close                0.021
go                   0.014
text                 0.014
home                 0.012
today                0.012
Topic 2 |---------------------

people               0.042
country              0.018
full                 0.016
text                 0.015
city                 0.014
medical              0.013
make                 0.013
control              0.013
fight                0.013
life                 0.011
Topic 3 |---------------------

new               

<h1>Data Output - Dominant Topic</h1>

In [112]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

for w in weibo.keys():
    lda_model = weibo[w]["lda_model"]
    corpus = weibo[w]["corpus"]
    texts = weibo[w]["texts"]
    
    df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, texts)
    
    # Format
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
    
    weibo[w]["df_dominant_topic"] = df_dominant_topic
    
    df_dominant_topic.to_csv(output_path + w + ".csv")

<h1>Data Visualization</h1>

<h3>PyLDAVis</h3>

In [113]:
import pyLDAvis.gensim_models as gensimvis
import pickle
import string

# Visualize the topics
for w in weibo.keys():
    lda_model = weibo[w]["lda_model"]
    corpus = weibo[w]["corpus"]
    id2word = weibo[w]["id2word"]

    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    
    file_path = visuals_output_path + w + '.html'
    pyLDAvis.save_html(LDAvis_prepared, file_path)
    
    with open(file_path, 'r+') as f:
        content = f.read()
        f.seek(0, 0)
        
        if (w == "weibo_封城+疫情"):
            f.write("<h1>" + "weibo_closure+epidemic" + "</h1>" + '\n' + content)
        elif (w == "weibo_封城"):
            f.write("<h1>" + "weibo_closure" + "</h1>" + '\n' + content)
        elif (w == "weibo_疫情"):
            f.write("<h1>" + "weibo_epidemic" + "</h1>" + '\n' + content)
        f.close()

  default_term_info = default_term_info.sort_values(
