In [1]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

seed = 100

In [2]:
df = pd.read_csv('bbc_data.csv')

In [3]:
df['type'].unique()

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [4]:
df['news'][df['type']=='entertainment'].sample(10)

876    b'UK TV channel rapped for CSI ad\n\nTV channe...
579    b'Lee to create new film superhero\n\nComic bo...
695    b"Bennett play takes theatre prizes\n\nThe His...
764    b"Soul sensation ready for awards\n\nSouth Wes...
637    b'U2\'s desire to be number one\n\nU2, who hav...
542    b"Housewives lift Channel 4 ratings\n\nThe deb...
877    b'Surprise win for anti-Bush film\n\nMichael M...
828    b'Controversial film tops festival\n\nA contro...
658    b"Celebrities get their skates on\n\nFormer En...
821    b'Britney attacks \'false tabloids\'\n\nPop st...
Name: news, dtype: object

In [5]:
df['news'][638]

'b\'Ocean\\\'s Twelve raids box office\\n\\nOcean\\\'s Twelve, the crime caper sequel starring George Clooney, Brad Pitt and Julia Roberts, has gone straight to number one in the US box office chart.\\n\\nIt took $40.8m (\\xc2\\xa321m) in weekend ticket sales, according to studio estimates. The sequel follows the master criminals as they try to pull off three major heists across Europe. It knocked last week\\\'s number one, National Treasure, into third place. Wesley Snipes\\\' Blade: Trinity was in second, taking $16.1m (\\xc2\\xa38.4m). Rounding out the top five was animated fable The Polar Express, starring Tom Hanks, and festive comedy Christmas with the Kranks.\\n\\nOcean\\\'s Twelve box office triumph marks the fourth-biggest opening for a December release in the US, after the three films in the Lord of the Rings trilogy. The sequel narrowly beat its 2001 predecessor, Ocean\\\'s Eleven which took $38.1m (\\xc2\\xa319.8m) on its opening weekend and $184m (\\xc2\\xa395.8m) in total

In [6]:
data = df.news.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

  data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
  data = [re.sub('\s+', ' ', sent) for sent in data]


In [7]:
def sent_to_words(sentences, deacc=True): 
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
data_words = list(sent_to_words(data))     

print(data_words[:1])

[['bus', 'consumer', 'confidence', 'up', 'nconsumers', 'confidence', 'in', 'the', 'state', 'of', 'the', 'us', 'economy', 'is', 'at', 'its', 'highest', 'for', 'five', 'months', 'and', 'they', 'are', 'optimistic', 'about', 'an', 'influential', 'survey', 'says', 'nthe', 'feel', 'good', 'factor', 'among', 'us', 'consumers', 'rose', 'in', 'december', 'for', 'the', 'first', 'time', 'since', 'july', 'according', 'to', 'new', 'data', 'the', 'conference', 'board', 'survey', 'of', 'households', 'pointed', 'to', 'renewed', 'optimism', 'about', 'job', 'creation', 'and', 'economic', 'growth', 'us', 'retailers', 'have', 'reported', 'strong', 'sales', 'over', 'the', 'past', 'days', 'after', 'slow', 'start', 'to', 'the', 'crucial', 'festive', 'season', 'naccording', 'to', 'figures', 'also', 'released', 'on', 'tuesday', 'sales', 'in', 'shopping', 'malls', 'in', 'the', 'week', 'to', 'december', 'were', 'higher', 'than', 'in', 'following', 'last', 'minute', 'rush', 'wal', 'mart', 'the', 'largest', 'us', 

In [8]:
#lemmatize
import en_core_web_sm
#nlp = en_core_web_sm.load()
def lemmatization(texts, allowed_postags=['NOUN','ADJ','VERB','ADV']):
    texts_out=[]
    for sent in texts:
        doc=nlp(' '.join(sent))
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:2])

['bus consumer confidence nconsumer confidence state economy be high month be optimistic influential survey say feel good factor consumer rise december first time accord new datum conference board survey household point renew optimism job creation economic growth retailer have report strong sale past day slow start crucial festive season naccorde figure also release tuesday sale shopping mall week december be high follow last minute rush wal mart large retailer have say december sale be expect be well previously forecast strong post christmas sale nit be expect annual sale growth month consumer confidence figure be consider key economic indicator consumer spending account about third economic activity united state continue economic expansion combine job growth have consumer end year high note say lynn franco director conference board consumer research centre consumer outlook suggest economy continue expand first half next year overall economy have perform strongly recent month prompt f

In [9]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum read occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [10]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=5,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=seed,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every=-1,       # compute perplexity every n iters, default: Don't
                                      n_jobs=-1,               # Use all available CPUs
                                      learning_decay=0.9
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(learning_decay=0.9, learning_method='online',
                          n_components=5, n_jobs=-1, random_state=100)


In [11]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -2616563.631168631
Perplexity:  1369.4588046576564
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.9,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 5,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [12]:
# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
Doc0,0.0,0.0,0.0,0.0,0.99,4
Doc1,0.3,0.01,0.0,0.0,0.69,4
Doc2,0.0,0.14,0.0,0.0,0.86,4
Doc3,0.1,0.0,0.55,0.0,0.35,2
Doc4,0.47,0.0,0.18,0.0,0.35,0
Doc5,0.18,0.02,0.0,0.0,0.79,4
Doc6,0.0,0.51,0.0,0.0,0.48,1
Doc7,0.0,0.0,0.05,0.0,0.95,4
Doc8,0.0,0.03,0.0,0.15,0.81,4
Doc9,0.0,0.0,0.0,0.0,0.99,4


In [13]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

  default_term_info = default_term_info.sort_values(


In [14]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=15)        

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,say,government,party,election,labour,people,minister,blair,plan,tory,make,brown,tell,new,public
Topic 1,say,court,law,case,company,legal,claim,charge,year,action,tell,firm,use,trial,ban
Topic 2,say,use,people,make,technology,mobile,phone,service,new,user,computer,firm,year,network,company
Topic 3,say,win,year,game,good,film,play,time,make,award,player,world,star,new,come
Topic 4,say,year,rise,market,price,growth,sale,company,economy,month,new,rate,bank,country,high


In [20]:
def predict_topic(text, nlp=nlp):
    global sent_to_words
    global lemmatization

    # Step 1: Clean with simple_preprocess
    mytext_2 = list(sent_to_words(text))

    # Step 2: Lemmatize
    mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Step 3: Vectorize transform
    mytext_4 = vectorizer.transform(mytext_3)

    # Step 4: LDA Transform
    topic_probability_scores = lda_model.transform(mytext_4)
    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), :].values.tolist()
    return topic, topic_probability_scores

# Predict the topic
mytext = ["b'Oscars race enters final furlong\n\nThe race for the Oscars entered its final stages as the deadline for voters to choose their winners passed.\n\nThe 5,808"]
topic, prob_scores = predict_topic(text = mytext)
print(prob_scores)

[[0.01446302 0.01343473 0.01342541 0.94528934 0.0133875 ]]


In [21]:
{1: 'politics', 2: 'crime', 3: 'tech', 4: 'entertaiment', 5: 'business'}

{1: 'politics', 2: 'crime', 3: 'tech', 4: 'entertaiment', 5: 'business'}

In [22]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=6)