### Topic Modeling using HDP and LDA

- Text Processing
- Generating dictionary of vocabulary
- Mapping corpus using dictionary
- Training the Topic Model

In [2]:
import matplotlib.pyplot as plt
import gensim
import numpy as np
import spacy
import pandas as pd
import re

from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from spacy.lang.en.stop_words import STOP_WORDS
import pyLDAvis.gensim
#Import nltk stopwords and add custom stopwords that are likely to appear in news articles.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(["mrs","ms","say","he","mr","she","they","company"])

import os, re, operator, warnings
warnings.filterwarnings('ignore')
%matplotlib inline

  """
  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  name, n = re.sub('\d+$', '', var.name), 2
  RANGE_RE = re.compile('(-?\d+):(-?\d+)')
  """
  ENT = re.compile("&(\w+?);")
  """
  assert re.match('^[exps]\d+$', var), var
  + [' \  ' + blank + line for line in term_lines[1:2]]
  + [' /\ ' + var_string + line for line in term_lines[2:3]]
  """
  _STANDARD_NONTERM_RE = re.compile('( [\w/][\w/^<>-]* ) \s*', re.VERBOSE)
  _CONTEXT_RE = re.compile('\w+|[\.\!\?]')
  """
  if re.search('\s', brackets):
  node_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
  leaf_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
  '%s\s*(%s)?|%s|(%s)'
  reserved_chars = re.compile('([#\$%&~_\{\}])')
  """
  """
  """
  """
  """
  """
  """
  """
  elif re.match('\w+$', word):
  elif re.match('\W+$', word):
  if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word):
  tokenizer = RegexpTokenizer('[\w.@:/]+|\w+|\$[\d.]+')
  pat = '\s*'.join(re.escape(c) for c in tok)
  """


  '(?P<left>%s)\{(?P<right>%s)'
  RegexpChunkRule.__init__(self, regexp, '{\g<left>\g<right>', descr)
  '(?P<left>%s)\}(?P<right>%s)'
  RegexpChunkRule.__init__(self, regexp, '\g<left>\g<right>}', descr)
  r'^((%s|<%s>)*)$' % ('([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+', '[^\{\}<>]+')
  r'^((%s|<%s>)*)$' % ('([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+', '[^\{\}<>]+')
  """
  """
  valid_rule = re.compile("^[a-z]+\*?\d[a-z]*[>\.]?$")
  valid_rule = re.compile("^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")
  """
  '.*\.(test|train).*',
  '.*\.(test|train).*',
  crubadan = LazyCorpusLoader('crubadan', CrubadanCorpusReader, '.*\.txt')
  'dependency_treebank', DependencyCorpusReader, '.*\.dp', encoding='ascii'
  'timit', TimitTaggedCorpusReader, '.+\.tags', tagset='wsj', encoding='ascii'
  twitter_samples = LazyCorpusLoader('twitter_samples', TwitterCorpusReader, '.*\.json')
  wordnet_ic = LazyCorpusLoader('wordnet_ic', WordNetICCorpusReader, '.*\.dat')
  'frames/.*\.xml',
  'frames/.*\.xml',
  'frames/.*\.xml',
 

### Text Processing
- Clean the article - Remove punctuation marks, special characters
- Tokenize each article
- Stem each token
- Remove numberical tokens

In [3]:
df=pd.read_csv("NewsArticles.csv", encoding='unicode_escape',index_col=0)
#drop all the unnamed columns
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
df.head()

Unnamed: 0_level_0,publish_date,article_source_link,title,subtitle,text
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2017/2/7,http://abcnews.go.com/Politics/pence-break-tie...,"Betsy DeVos Confirmed as Education Secretary, ...",,Michigan billionaire education activist Betsy ...
2,2017/2/7,http://abcnews.go.com/Politics/wireStory/melan...,Melania Trump Says White House Could Mean Mill...,,First lady Melania Trump has said little about...
3,2017/2/7,http://abcnews.go.com/Politics/wireStory/trump...,"As Trump Fears Fraud, GOP Eliminates Election ...",,A House committee voted on Tuesday to eliminat...
4,2017/2/7,http://abcnews.go.com/Politics/appeals-court-d...,Appeals Court to Decide on Challenge to Trump'...,,"This afternoon, three federal judges from the ..."
5,2017/2/7,http://abcnews.go.com/US/23-states-winter-weat...,At Least 4 Tornadoes Reported in Southeast Lou...,,At least four tornadoes touched down in Louisi...


I'm going to use SPACY in this notebook for all the text processing related tasks. It is very powerful than NLTK. [Click here to learn more](https://spacy.io/usage/spacy-101)

In [4]:
# before loading the language you have to download it first. Go to your command prompt and execute this statement and 
# restart the kernel:
# python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [5]:
data = df['text'].values.tolist()

In [5]:
#removing punctuations and others characters
def preprocess(string):
    return re.sub('[^\w_\s-]', ' ',str(string))

data = list(map(preprocess,data))    

In [6]:
#data cleaning and lemmatization
lemma_doc = []
for datum in data:
    sent = nlp(str(datum).lower())
    text = []
    for w in sent:
        if not w.is_stop and not w.is_punct and not w.like_num and str(w) not in stop_words and (len(str(w)) > 4):
            #adding the lematized version of the words
            text.append(w.lemma_)
    lemma_doc.append(text)
    

In [7]:
lemma_doc[0]

['michigan',
 'billionaire',
 'education',
 'activist',
 'betsy',
 'devos',
 'confirm',
 'today',
 'serve',
 'secretary',
 'education',
 'president',
 'trump',
 'administration',
 'president',
 'pence',
 'break',
 'senate',
 'senate',
 'vote',
 'devos"?highly',
 'contentious',
 'nomination',
 'afternoon',
 'tally',
 'split',
 'evenly',
 'require',
 'penny',
 'authority',
 'president',
 'upper',
 'chamber',
 'congress',
 'break',
 'impasse',
 'president',
 'break',
 'confirm',
 'cabinet',
 'nominee',
 'pence',
 'count',
 'vote',
 'render',
 'tally',
 'democrats',
 'stage',
 '24-hour',
 'marathon',
 'speech',
 'lawmaker',
 'take',
 'floor',
 'additional',
 'republican',
 'devos',
 'block',
 'confirmation',
 'imagine',
 'bad',
 'choice,"?sen',
 'elizabeth',
 'warren',
 'letter',
 'constituent',
 'urge',
 'devos',
 'stir',
 'vehement',
 'opposition',
 'teachers"?union',
 'senate',
 'democrats',
 'cite',
 'concern',
 'support',
 'school',
 'voucher',
 'critic',
 'believe',
 'weaken',
 'publ

In [8]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(lemma_doc, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[lemma_doc], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[lemma_doc[0]]])

['michigan', 'billionaire', 'education', 'activist', 'betsy_devos', 'confirm', 'today', 'serve', 'secretary', 'education', 'president', 'trump', 'administration', 'president', 'pence', 'break', 'senate', 'senate', 'vote', 'devos"?highly', 'contentious', 'nomination', 'afternoon', 'tally', 'split', 'evenly', 'require', 'penny', 'authority', 'president', 'upper', 'chamber', 'congress', 'break', 'impasse', 'president', 'break', 'confirm', 'cabinet_nominee', 'pence', 'count', 'vote', 'render', 'tally', 'democrats', 'stage', '24-hour', 'marathon', 'speech', 'lawmaker', 'take', 'floor', 'additional', 'republican', 'devos', 'block', 'confirmation', 'imagine', 'bad', 'choice,"?sen', 'elizabeth_warren', 'letter', 'constituent', 'urge', 'devos', 'stir', 'vehement', 'opposition', 'teachers"?union', 'senate', 'democrats', 'cite', 'concern', 'support', 'school', 'voucher', 'critic', 'believe', 'weaken', 'public', 'school', 'experience', 'attend', 'work', 'public', 'education', 'system', 'cite', 'fa

#### Create the Dictionary and Corpus needed for Topic Modeling
- Word to IDs mapping
- Bag of words of each document
- corpus (cluster of Bag of words of all the documents)

In [20]:
#Creates Word to IDs mapping
word2id = corpora.Dictionary(lemma_doc)

In [10]:
# Creates bag of words and a corpus
documents = lemma_doc
corpus = [word2id.doc2bow(doc) for doc in documents]

print('Corpus sample')
sample = corpus[0]
for i in range(len(sample)):
    print('Word', sample[i][0], ':', word2id[sample[i][0]], ' || Number of occurences:', sample[i][1])

Corpus sample
Word 0 : "?before  || Number of occurences: 1
Word 1 : "?devos  || Number of occurences: 1
Word 2 : 24-hour  || Number of occurences: 1
Word 3 : account  || Number of occurences: 1
Word 4 : activist  || Number of occurences: 1
Word 5 : addition  || Number of occurences: 1
Word 6 : additional  || Number of occurences: 1
Word 7 : administration  || Number of occurences: 1
Word 8 : administrator  || Number of occurences: 1
Word 9 : afternoon  || Number of occurences: 1
Word 10 : alaska  || Number of occurences: 1
Word 11 : announce  || Number of occurences: 2
Word 12 : answer  || Number of occurences: 1
Word 13 : appreciate  || Number of occurences: 1
Word 14 : attend  || Number of occurences: 1
Word 15 : authority  || Number of occurences: 1
Word 16 : average  || Number of occurences: 1
Word 17 : bad  || Number of occurences: 1
Word 18 : believe  || Number of occurences: 1
Word 19 : betsy  || Number of occurences: 1
Word 20 : billionaire  || Number of occurences: 1
Word 21 

#### Hierarchical Dirichlet Processing
This is kind of an unsupervised technique (Topic modeling is a unsupervised technique. Here the context is we don't decide the # of topics. In concept this is similar to Hierarchical cluster as don't choose the number of cluster before hand) as the model will identify the number of topics. Let's see what it will produce.

In [11]:
hdp = models.HdpModel(corpus,word2id)

In [12]:
hdp_topics = hdp.print_topics()
for topic in hdp_topics:
    print(topic)

(0, '0.008*trump + 0.006*people + 0.005*president + 0.005*country + 0.004*china + 0.004*government + 0.004*state + 0.003*house + 0.003*report + 0.003*year')
(1, '0.006*trump + 0.005*people + 0.005*china + 0.004*country + 0.004*president + 0.003*year + 0.003*government + 0.003*state + 0.003*include + 0.003*party')
(2, '0.007*target + 0.006*china + 0.006*attacker + 0.005*chinese + 0.004*wound + 0.004*attack + 0.004*story + 0.004*kill + 0.003*country + 0.003*police')
(3, '0.006*china + 0.003*government + 0.002*investment + 0.002*president + 0.002*people + 0.002*percent + 0.002*house + 0.002*state + 0.002*daily + 0.002*market')
(4, '0.004*china + 0.002*country + 0.002*people + 0.002*germany + 0.002*world + 0.001*growth + 0.001*price + 0.001*european + 0.001*russian + 0.001*german')
(5, '0.003*child + 0.003*china + 0.002*country + 0.002*percent + 0.002*year + 0.002*people + 0.001*province + 0.001*tsingtao + 0.001*accord + 0.001*report')
(6, '0.004*china + 0.003*chinese + 0.002*jiaozi + 0.00

In [13]:
print('HDP model created: '+str(len(hdp_topics))+' Topics')

HDP model created: 20 Topics


####  Latent Dirichlet Allocation Model


In [14]:
lda_model = LdaModel(corpus=corpus, id2word=word2id, num_topics=5, random_state=42, update_every=1, chunksize=100, 
                     passes=10, alpha='auto', per_word_topics=True)

In [19]:
#Article - Topic Distribution for first Article
def get_article_topic_distribution(article):
    return lda.get_document_topics(article)
#Returns a list containing a list of tuple
#Each inner list corresponds to an article and each tuple refers to topicID and its corresponding probability  
map(get_article_topic_distribution, corpus)

<map at 0x24d66dddac8>

In [15]:
lda_model.print_topics()

[(0,
  '0.009*"russian" + 0.009*"russia" + 0.009*"country" + 0.008*"government" + 0.007*"north" + 0.006*"state" + 0.006*"force" + 0.005*"minister" + 0.005*"attack" + 0.005*"security"'),
 (1,
  '0.010*"china" + 0.009*"percent" + 0.008*"european" + 0.007*"brexit" + 0.007*"trade" + 0.006*"country" + 0.006*"britain" + 0.006*"business" + 0.005*"market" + 0.005*"government"'),
 (2,
  '0.010*"player" + 0.008*"football" + 0.008*"sport" + 0.007*"berry" + 0.007*"kathrada" + 0.005*"ruffin" + 0.005*"world" + 0.005*"play" + 0.005*"award" + 0.004*"grenoble"'),
 (3,
  '0.013*"people" + 0.010*"woman" + 0.008*"police" + 0.008*"family" + 0.007*"child" + 0.006*"school" + 0.005*"find" + 0.005*"year" + 0.005*"young" + 0.004*"water"'),
 (4,
  '0.019*"trump" + 0.011*"house" + 0.010*"president" + 0.007*"committee" + 0.006*"court" + 0.006*"state" + 0.005*"white" + 0.005*"report" + 0.005*"people" + 0.005*"issue"')]

### How to interpret this?
The top 10 keywords that contribute to the topic are showcased with their respective weight.

Let's try to interpret the 5 topics:

- Topic 1: key words like "Russia", "Country", "Government", "Minister" suggest **Politics in Russia**
- Topic 2: key words like "China", "Brexit","Trade", "Business", "Market" suggest **Inter country trade news**
- Topic 3: key words like "Player","Sport","World" suggest **Sports news (football)**
- Topic 4: key words like "People","Woman","Police", "Family,"Child" suggest **Domestic news**
- Topic 5: key words like "Trump", "State", "White", "Committee" suggest **Polictics in USA**

### Compute Model Perplexity and Coherence Score

In [16]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = models.CoherenceModel(model=lda_model, texts=lemma_doc, dictionary=word2id, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.588727660028226

Coherence Score:  0.4507504883751433


Coherence measures the relative distance between words within a topic. There are two major types C_V typically 0 < x < 1 and uMass -14 < x < 14.
Coherence score of 0.4 is low. I want to explore what would have been the ideal number of topics. Will explore the elbow method below. 

In [17]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model=LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42, update_every=1, chunksize=100, 
                     passes=10, alpha='auto', per_word_topics=True)
        model_list.append(model)
        coherencemodel = models.CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
#Use this to get the graph of optimal # of topics
model_list, coherence_values = compute_coherence_values(dictionary=word2id, corpus=corpus, texts=lemma_doc, start=2, limit=100, step=10)
# Show graph
import matplotlib.pyplot as plt
limit=100; start=2; step=10;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

You can decide on the number of topics based on this analysis. Note that the Customization used for 5 topic model (lda_model) and the optimization models is difference therefore the Coherence score for 5 topics LDA model differ.

### Vizualize the topics

In [18]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model,corpus,word2id)