In [1]:
import nltk
# nltk.download('stopwords')

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

In [3]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [4]:
# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [6]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [7]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [8]:
df = pd.read_csv("usnewspaper_china.csv", header=None,)

In [9]:
data = df[0].dropna().tolist()

In [10]:
# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

print(data[:1])

['In the new Sino-Russian defense relationship, China does what it wants, and there isn’t a whole lot Russia can do about it. Xi Jingping’s recent visit to Moscow—his first since Vladimir Putin invaded Ukraine last year—was summed up by historian Sergey Radchenko in this way: “The summit can be summarised by the Chinese saying 雷声大雨点小 (Loud thunder but few raindrops). Scratch that, even the thunder wasn’t all that loud.” The meeting, which apparently produced no major policy shifts nor even notable statements of support, did further illustrate a tectonic shift in the supposed "no limits" relationship: China is taking the lead in nearly every aspect, including in the defense-industrial sphere once dominated by Russia. Russia’s modern defense ties to China go back to the 1920s. when the new Communist regime in Moscow initially supported the Kuomintang, rather than Mao’s forces, during the Chinese civil war and in the battle against imperial Japan. But Stalin eventually came to back the Ch

In [11]:
data[0]

'In the new Sino-Russian defense relationship, China does what it wants, and there isn’t a whole lot Russia can do about it. Xi Jingping’s recent visit to Moscow—his first since Vladimir Putin invaded Ukraine last year—was summed up by historian Sergey Radchenko in this way: “The summit can be summarised by the Chinese saying 雷声大雨点小 (Loud thunder but few raindrops). Scratch that, even the thunder wasn’t all that loud.” The meeting, which apparently produced no major policy shifts nor even notable statements of support, did further illustrate a tectonic shift in the supposed "no limits" relationship: China is taking the lead in nearly every aspect, including in the defense-industrial sphere once dominated by Russia. Russia’s modern defense ties to China go back to the 1920s. when the new Communist regime in Moscow initially supported the Kuomintang, rather than Mao’s forces, during the Chinese civil war and in the battle against imperial Japan. But Stalin eventually came to back the Chi

In [12]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[0])

['in', 'the', 'new', 'sino', 'russian', 'defense', 'relationship', 'china', 'does', 'what', 'it', 'wants', 'and', 'there', 'isn', 'whole', 'lot', 'russia', 'can', 'do', 'about', 'it', 'xi', 'jingping', 'recent', 'visit', 'to', 'moscow', 'his', 'first', 'since', 'vladimir', 'putin', 'invaded', 'ukraine', 'last', 'year', 'was', 'summed', 'up', 'by', 'historian', 'sergey', 'radchenko', 'in', 'this', 'way', 'the', 'summit', 'can', 'be', 'summarised', 'by', 'the', 'chinese', 'saying', '雷声大雨点小', 'loud', 'thunder', 'but', 'few', 'raindrops', 'scratch', 'that', 'even', 'the', 'thunder', 'wasn', 'all', 'that', 'loud', 'the', 'meeting', 'which', 'apparently', 'produced', 'no', 'major', 'policy', 'shifts', 'nor', 'even', 'notable', 'statements', 'of', 'support', 'did', 'further', 'illustrate', 'tectonic', 'shift', 'in', 'the', 'supposed', 'no', 'limits', 'relationship', 'china', 'is', 'taking', 'the', 'lead', 'in', 'nearly', 'every', 'aspect', 'including', 'in', 'the', 'defense', 'industrial', 's

In [13]:
# Define functions for stopwords and lemmatization
def remove_stopwords(texts):
    return [[word for word in doc if word not in stop_words] for doc in texts]


def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://urldefense.com/v3/__https://spacy.io/api/annotation__;!!Mih3wA!FnSIs50mlo_onRmsKNLcZh_4BRVE4khPD04RKVvif4OzRBesCZPampBVvipydU-_4YE6D9zZSxx8IU3qbQ$ """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [14]:
print(data_words[0])

['in', 'the', 'new', 'sino', 'russian', 'defense', 'relationship', 'china', 'does', 'what', 'it', 'wants', 'and', 'there', 'isn', 'whole', 'lot', 'russia', 'can', 'do', 'about', 'it', 'xi', 'jingping', 'recent', 'visit', 'to', 'moscow', 'his', 'first', 'since', 'vladimir', 'putin', 'invaded', 'ukraine', 'last', 'year', 'was', 'summed', 'up', 'by', 'historian', 'sergey', 'radchenko', 'in', 'this', 'way', 'the', 'summit', 'can', 'be', 'summarised', 'by', 'the', 'chinese', 'saying', '雷声大雨点小', 'loud', 'thunder', 'but', 'few', 'raindrops', 'scratch', 'that', 'even', 'the', 'thunder', 'wasn', 'all', 'that', 'loud', 'the', 'meeting', 'which', 'apparently', 'produced', 'no', 'major', 'policy', 'shifts', 'nor', 'even', 'notable', 'statements', 'of', 'support', 'did', 'further', 'illustrate', 'tectonic', 'shift', 'in', 'the', 'supposed', 'no', 'limits', 'relationship', 'china', 'is', 'taking', 'the', 'lead', 'in', 'nearly', 'every', 'aspect', 'including', 'in', 'the', 'defense', 'industrial', 's

In [15]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[0])

['new', 'sino', 'russian', 'defense', 'want', 'whole', 'lot', 'jingpe', 'recent', 'visit', 'first', 'invade', 'last', 'year', 'sum', 'radchenko', 'way', 'summit', 'summarise', 'say', '雷声大雨点小', 'loud', 'thunder', 'raindrop', 'even', 'thunder', 'loud', 'meeting', 'apparently', 'produce', 'major', 'policy', 'shift', 'even', 'notable', 'statement', 'support', 'illustrate', 'tectonic', 'shift', 'suppose', 'limit', 'relationship', 'take', 'lead', 'nearly', 'aspect', 'include', 'defense', 'industrial', 'sphere', 'dominate', 'modern', 'defense', 'tie', 'go', 'back', 'new', 'communist', 'regime', 'initially', 'support', 'rather', 'force', 'chinese', 'civil', 'war', 'battle', 'imperial', 'eventually', 'come', 'back', 'chinese', 'handing', 'heavy', 'industry', 'supply', 'economic', 'aid', 'help', 'establish', 'nascent', 'defense', 'manufacturing', 'sector', 'period', 'willingly', 'transfer', 'various', 'military', 'technology', 'include', 'system', 'eventually', 'recast', 'fighter', 'aircraft', '

In [16]:
bigram = gensim.models.Phrases(data_words, min_count=20, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
#print (bigram)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['in', 'the', 'new', 'sino', 'russian', 'defense', 'relationship', 'china', 'does', 'what', 'it', 'wants', 'and', 'there', 'isn', 'whole', 'lot', 'russia', 'can', 'do', 'about', 'it', 'xi', 'jingping', 'recent', 'visit', 'to', 'moscow', 'his', 'first', 'since', 'vladimir_putin', 'invaded_ukraine', 'last', 'year', 'was', 'summed', 'up', 'by', 'historian', 'sergey_radchenko', 'in', 'this', 'way', 'the', 'summit', 'can', 'be', 'summarised', 'by', 'the', 'chinese', 'saying', '雷声大雨点小', 'loud', 'thunder', 'but', 'few', 'raindrops', 'scratch', 'that', 'even', 'the', 'thunder', 'wasn', 'all', 'that', 'loud', 'the', 'meeting', 'which', 'apparently', 'produced', 'no', 'major', 'policy', 'shifts', 'nor', 'even', 'notable', 'statements', 'of', 'support', 'did', 'further', 'illustrate', 'tectonic', 'shift', 'in', 'the', 'supposed', 'no', 'limits', 'relationship', 'china', 'is', 'taking', 'the', 'lead', 'in', 'nearly', 'every_aspect', 'including', 'in', 'the', 'defense', 'industrial', 'sphere', 'onc

In [17]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Frequency list   
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[0])

[(0, 1), (1, 1), (2, 2), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 4), (14, 3), (15, 2), (16, 1), (17, 1), (18, 1), (19, 6), (20, 1), (21, 1), (22, 2), (23, 2), (24, 3), (25, 4), (26, 5), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 3), (34, 1), (35, 1), (36, 1), (37, 6), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 3), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 7), (53, 1), (54, 2), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 2), (63, 1), (64, 2), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 4), (74, 1), (75, 6), (76, 1), (77, 1), (78, 1), (79, 2), (80, 2), (81, 1), (82, 1), (83, 2), (84, 1), (85, 1), (86, 1), (87, 1), (88, 14), (89, 2), (90, 1), (91, 1), (92, 2), (93, 2), (94, 1), (95, 2), (96, 1), (97, 1), (98, 1), (99, 2), (100, 2), (101, 1), (102, 1), (103, 2), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1)

In [18]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('able', 1),
  ('absorb', 1),
  ('accelerate', 2),
  ('acquiescence', 1),
  ('acquire', 2),
  ('action', 1),
  ('adapt', 1),
  ('advance', 1),
  ('advanced', 1),
  ('advantage', 1),
  ('aerospace', 1),
  ('agreement', 2),
  ('aid', 1),
  ('air', 4),
  ('aircraft', 3),
  ('allow', 2),
  ('ally', 1),
  ('almaz', 1),
  ('already', 1),
  ('also', 6),
  ('ambition', 1),
  ('analyst', 1),
  ('announce', 2),
  ('apparently', 2),
  ('appear', 3),
  ('area', 4),
  ('arm', 5),
  ('arrangement', 1),
  ('art', 1),
  ('aspect', 1),
  ('aviation', 1),
  ('aware', 1),
  ('away', 1),
  ('back', 3),
  ('base', 1),
  ('battle', 1),
  ('become', 1),
  ('begin', 6),
  ('beneficial', 1),
  ('benefit', 1),
  ('bomber', 1),
  ('broad', 1),
  ('build', 1),
  ('business', 3),
  ('buy', 1),
  ('cancel', 1),
  ('cannibalize', 1),
  ('capacity', 1),
  ('cash', 1),
  ('catch', 1),
  ('center', 1),
  ('change', 1),
  ('chinese', 7),
  ('civil', 1),
  ('close', 2),
  ('cold', 1),
  ('collaboration', 1),
  ('collap

In [19]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [20]:
# Print the Keyword in the 10 topics
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0, '0.024*"defense" + 0.022*"nuclear" + 0.016*"new" + 0.016*"military" + 0.016*"technical" + 0.016*"weapon" + 0.015*"capability" + 0.015*"system" + 0.013*"missile" + 0.012*"include"'), (1, '0.015*"make" + 0.014*"chinese" + 0.014*"go" + 0.012*"say" + 0.011*"time" + 0.011*"get" + 0.009*"see" + 0.009*"even" + 0.008*"come" + 0.008*"think"'), (2, '0.026*"say" + 0.018*"year" + 0.015*"event" + 0.014*"start" + 0.014*"apple" + 0.013*"day" + 0.013*"chinatown" + 0.011*"new" + 0.010*"store" + 0.009*"first"'), (3, '0.029*"year" + 0.019*"market" + 0.017*"growth" + 0.015*"economy" + 0.013*"price" + 0.011*"rate" + 0.010*"month" + 0.010*"last" + 0.010*"demand" + 0.009*"fall"'), (4, '0.036*"say" + 0.035*"covid" + 0.025*"people" + 0.016*"report" + 0.014*"year" + 0.013*"city" + 0.012*"death" + 0.011*"pandemic" + 0.011*"health" + 0.010*"new"'), (5, '0.061*"share" + 0.044*"stock" + 0.038*"company" + 0.025*"quarter" + 0.023*"get" + 0.021*"report" + 0.019*"earning" + 0.017*"fund" + 0.017*"investor" + 0.017*

In [21]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [22]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [23]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
# a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -10.644161786545126

Coherence Score:  0.537153889856644
