# Topic Modeling: BBC News 

In [None]:
# 
# We study the topic modeling using BBC News, where
# the dataset can be downloaded from kaggle.com 
# (see https://www.kaggle.com/yufengdev/bbc-fulltext-and-category)
#
# We adopt the latent Dirichlet allocation (LDA) from 
# sklearn and gensim libraries for topic modeling. We
# also exploit the non-negative matrix factorization (NMF)
# to do the task. 
#
# In this dataset, there are five different categories which 
# are tech, business, sport entertainment and politics, respectively.
# We made some tests on the three methods. Apart from tuning the 
# parameters in the package, it is useful to adjust the list of 
# stopwords for getting a better result. 
#
# By inspection, the LDA methods cannot appropriately group the 
# top words into these five categories. The sklearn LDA consumes
# a relatively long computing time compared to gensim LDA method.
# NMF method can group the relevant words of these five categories
# and computes within a second for this dataset. Therefore, NMF 
# shows a better performance in our case.
#
# Reference:
# [1] Dipanjan Sarkar, "Text Analytics with Python", APRESS Media, Springer (2016).
# [2] http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py
# [3] https://radimrehurek.com/gensim/models/ldamodel.html

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("bbc-text.csv")

In [3]:
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [39]:
df['category'].unique()

array(['tech', 'business', 'sport', 'entertainment', 'politics'],
      dtype=object)

In [4]:
from nltk.corpus import stopwords

In [5]:
stop_words = stopwords.words('english')

In [6]:
stop_words.extend(['should','could','also','us','would','last','one','two','three','four','five','six','seven','eight','nine'\
                   ,'ten','said','say','new','first','second','mr'])

In [7]:
%%time
df['Clean'] = df['text'].str.replace(r'(\W|\d)',' ')
df['Clean'] = df['Clean'].str.replace(r'\b(' + r'|'.join(stop_words) + r')\b\s*',' ')

Wall time: 1.38 s


In [8]:
df['Clean'].head()

0    tv future   hands  viewers  home theatre syste...
1    worldcom boss  left books alone  former worldc...
2    tigers wary  farrell  gamble  leicester      r...
3    yeading face newcastle  fa cup premiership sid...
4    ocean  twelve raids box office ocean  twelve  ...
Name: Clean, dtype: object

# Methods 1: Sklearn - Latent Dirichlet Allocation (LDA)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [10]:
n_features = 1000

In [11]:
tfidf_vectorizer = TfidfVectorizer(max_features=n_features)

In [12]:
tf = tfidf_vectorizer.fit_transform(df['Clean'])

In [13]:
from sklearn.decomposition import LatentDirichletAllocation

In [14]:
n_components=5

In [15]:
lda = LatentDirichletAllocation(n_components=n_components, max_iter=500,
                                learning_method='online',
                                learning_offset=50.,
                                batch_size=150,
                                random_state=21)

In [16]:
%time lda.fit(tf)

Wall time: 2min 29s


LatentDirichletAllocation(batch_size=150, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=500, mean_change_tol=0.001,
             n_components=5, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=21, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [17]:
tf_feature_names = tfidf_vectorizer.get_feature_names()

In [18]:
num_top_words = 20

In [19]:
# function for printing top words
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)

In [20]:
print_top_words(lda, tf_feature_names, num_top_words)

Topic #0: bn company market growth year bank economy firm sales shares oil government economic china prices deal financial business profits companies
Topic #1: film best show people music year awards uk government award bbc british band star tv told lord police album children
Topic #2: labour election party brown blair howard chancellor tories tory tax lib prime kennedy leader campaign dems liberal ukip gordon minister
Topic #3: game games people players world england win year time play club mobile team cup match technology back injury like get
Topic #4: sector computer category civil goal players cabinet mini sites round messages titles jobs winning policy council defeat card energy fraud


# Method 2: Gensim LDA

In [21]:
from gensim import corpora, models



In [22]:
t = list(df['Clean'].str.split(' '))

In [23]:
dictionary = corpora.Dictionary(t)

In [24]:
# dictionary.token2id

In [25]:
corpus = [dictionary.doc2bow(text) for text in t]

In [26]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [27]:
#%time lda = models.LdaModel(corpus_tfidf,id2word=dictionary,num_topics=n_components)
%time lda = models.LdaModel(corpus_tfidf,id2word=dictionary,iterations=1000,num_topics=n_components,random_state=21,update_every=1,alpha=.1,eval_every=5,passes=5)

Wall time: 15.1 s


In [28]:
import re

In [29]:
# function for printing top words
def print_gensim_top_words(model,num_topics,num_top_words):
    for i in range(num_topics):
        message = "Topic #%d: " % i
        message += " ".join(re.findall(r'[a-z]+',lda.print_topics(5,num_top_words)[i][1]))
        print(message)

In [30]:
print_gensim_top_words(lda,n_components,num_top_words)

Topic #0: film england players music game best awards sales wales year uk win british award world show ireland band oil cup
Topic #1: kilroy silk ukip uwb veritas miliband jaynes libya johansson dent hockney gatlin hsdpa jesse tickets owens adelaide houston rated lockerbie
Topic #2: blair labour party election brown prime tax minister kennedy government lib howard public tony tories tory holmes asylum plans leader
Topic #3: mobile search phones phone technology google bn data dollar growth sony china tv games people high linux video deficit digital
Topic #4: newry wal cantona mart marvel masks palestinian spider domain herlihy bpi fightstar robocop busted balfour beatty abbas dyer occ prints


# Method 3: Non-Negative Matrix Factorization (NMF)

In [31]:
n_features = 1200

In [32]:
tfidf_vectorizer = TfidfVectorizer(max_features=n_features)

In [33]:
tfidf = tfidf_vectorizer.fit_transform(df['Clean'])

In [34]:
from sklearn.decomposition import NMF

In [35]:
nmf = NMF(n_components=5,alpha=1.0,tol=1e-6,max_iter=500,random_state=21)

In [36]:
%time nmf.fit(tfidf)

Wall time: 121 ms


NMF(alpha=1.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=500,
  n_components=5, random_state=21, shuffle=False, solver='cd', tol=1e-06,
  verbose=0)

In [37]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [38]:
print_top_words(nmf, tfidf_feature_names, num_top_words)

Topic #0: bn growth year economy sales company market bank oil shares firm economic china prices profits deal government rise yukos analysts
Topic #1: game england win cup wales match play ireland team players side injury club back time final half season france coach
Topic #2: labour blair election party brown government minister howard prime chancellor tory tax tories leader people campaign public britain tony plans
Topic #3: film best awards award actor oscar films actress festival director star year comedy aviator movie show prize nominated nominations british
Topic #4: people mobile music technology users digital phone software use net computer microsoft tv service broadband phones video online internet uk
