In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')
from itertools import chain
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from tqdm.auto import tqdm

In [2]:
# Process Data for LDA
# Load the raw statements and parse the 'Year' column into datetimes.
df = pd.read_csv("./doc.csv")
df['Year'] = pd.to_datetime(df['Year'])
# 'Year' stays a regular column and the default integer index is kept:
# DataFrame.set_index returns a new frame, so calling it without assigning
# the result never changed `df`.
def strip_intro(text):
    """Drop everything that precedes the first recognised opening phrase.

    The phrases are tried in priority order: "By unanimous vote," first and
    "At the start" only when the former does not occur anywhere in `text`.
    The returned string begins at the first occurrence of the phrase that
    matched; when neither phrase is present, `text` is returned unchanged.
    """
    for marker in ("By unanimous vote,", "At the start"):
        start = text.find(marker)
        if start != -1:
            return text[start:]
    return text
# Shared resources read by clean():
# English stop-words to discard.
stop = set(stopwords.words('english'))
# Punctuation characters to strip from tokens.
exclude = set(string.punctuation)
# WordNet lemmatizer used to normalise each surviving token.
lemma = WordNetLemmatizer()

def clean(text):
    """Tokenise `text` into lower-cased, stop-word-free, lemmatised words.

    Uses the module-level `stop` (stop-word set), `exclude` (punctuation
    characters) and `lemma` (lemmatizer).

    Steps:
      1. lower-case and split on whitespace;
      2. drop stop-words while punctuation is still attached, so contractions
         present in the stop list in their apostrophe form are recognised;
      3. strip every punctuation character from each remaining token;
      4. drop tokens that are now empty or that only became recognisable as
         stop-words once punctuation was removed (e.g. "the," -> "the");
      5. lemmatise what is left.

    Returns a list of tokens.
    """
    # Pass 1: stop-words in their raw, whitespace-delimited form.
    candidates = [word for word in text.lower().split() if word not in stop]
    # Remove punctuation character by character within each token.
    stripped = (''.join(ch for ch in word if ch not in exclude) for word in candidates)
    # Pass 2: filtering only before punctuation removal lets tokens such as
    # "the," or "(and" through as "the" / "and".
    tokens = [word for word in stripped if word and word not in stop]
    return [lemma.lemmatize(word) for word in tokens]

# Trim each statement to its opening phrase, then keep only the rows whose
# trimmed text begins with one of the expected phrases.
ldf = pd.DataFrame({'statement_clean': df['Statement'].apply(strip_intro)})
has_intro = ldf['statement_clean'].str.startswith(('By unanimous','At the start'))
# .copy() makes the filtered frame independent, so adding a column below is
# not an assignment on a slice of the unfiltered frame.
ldf = ldf[has_intro].copy()
ldf['text_clean'] = ldf['statement_clean'].apply(clean)
ldf

Unnamed: 0,statement_clean,text_clean
0,"By unanimous vote, the Committee ratified the ...","[unanimous, vote, committee, ratified, desk, d..."
1,"By unanimous vote, the minutes for the meeting...","[unanimous, vote, minute, meeting, federal, op..."
2,"By unanimous vote, the minutes for the meeting...","[unanimous, vote, minute, meeting, federal, op..."
3,"By unanimous vote, the Committee elected the f...","[unanimous, vote, committee, elected, followin..."
4,"By unanimous vote, the minutes for the meeting...","[unanimous, vote, minute, meeting, federal, op..."
...,...,...
220,"By unanimous vote, the following officers of t...","[unanimous, vote, following, officer, committe..."
221,"By unanimous vote, the Committee ratified the ...","[unanimous, vote, committee, ratified, desk, d..."
222,"By unanimous vote, the Committee voted to appr...","[unanimous, vote, committee, voted, approve, r..."
223,"By unanimous vote, the following officers of t...","[unanimous, vote, following, officer, committe..."


In [3]:
# Dictionary: vocabulary built from the cleaned token lists.
dictionary = corpora.Dictionary(ldf['text_clean'])
# Corpus: one bag-of-words vector per document, in the same order as ldf.
doc_term_matrix = list(map(dictionary.doc2bow, ldf['text_clean']))
# Short alias for the LDA model class used by the cells below.
lda = gensim.models.ldamodel.LdaModel

In [4]:
num_topics=7
%time ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.save_html(lda_display, 'lda.html')
pyLDAvis.display(lda_display)

CPU times: user 54.7 s, sys: 0 ns, total: 54.7 s
Wall time: 54.7 s


In [None]:
# THIS TAKES FOREVER SO ONLY RUN IF YOU HAVE TO (RECOMMENDED NUM OF TOPICS IS 7)
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, random_state=None):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every topic count in
    ``range(start, limit, step)`` using the module-level ``lda`` class.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound on the number of topics (exclusive, as in ``range``)
    start : Smallest number of topics to try
    step : Increment between successive topic counts
    random_state : Seed forwarded to every LDA model; the default ``None``
        leaves the models unseeded

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics

    Both lists are empty when the range contains no topic counts.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = lda(corpus,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0,random_state=random_state)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values
# Define the sweep once so the training call and the x-axis cannot drift apart.
limit=10; start=2; step=1
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=doc_term_matrix, texts=list(ldf['text_clean']), start=start, limit=limit, step=step)
x = range(start, limit, step)
# Label the curve directly: passing the bare string ("coherence_values") to
# legend() is iterated character by character and labels the line "c".
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [5]:
# Alias the model trained earlier in the notebook; the call below passes it on as `optimal_model`.
optimal_model = ldamodel
def format_topics_sentences(ldamodel, corpus, texts):
    """Build a per-document table of the dominant topic.

    Parameters
    ----------
    ldamodel : topic model; ``ldamodel[corpus]`` must yield, per document, an
        iterable of ``(topic_num, proportion)`` pairs and
        ``ldamodel.show_topic(topic_num)`` must return ``(word, weight)`` pairs
    corpus : corpus passed to ``ldamodel[...]``
    texts : original texts, one per document; aligned with the topic rows by
        index when concatenated

    Returns
    -------
    DataFrame with columns 'Dominant_Topic', 'Perc_Contribution' (rounded to
    4 decimals) and 'Topic_Keywords', followed by one column holding `texts`.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic_num -> keyword string, computed once per topic
    records = []

    for row in ldamodel[corpus]:
        row = list(row)
        if not row:
            # Keep a placeholder so later rows stay aligned with `texts`.
            records.append((float('nan'), float('nan'), None))
            continue
        # max() returns the first pair with the highest proportion, the same
        # pair a stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once: DataFrame.append no longer exists in pandas 2.x
    # and growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# Dominant topic for every statement; the texts are re-indexed from 0 so they
# line up with the rows produced for the corpus.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=optimal_model,
    corpus=doc_term_matrix,
    texts=ldf['statement_clean'].reset_index(drop=True),
)

# Expose the row number as a document id and give the columns display names.
df_dominant_topic = df_topic_sents_keywords.reset_index().set_axis(
    ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text'],
    axis=1,
)

# Dominant topic per document
df_dominant_topic

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,1.0,0.9811,"inflation, rate, market, committee, participan...","By unanimous vote, the Committee ratified the ..."
1,1,5.0,0.6556,"growth, committee, price, rate, range, market,...","By unanimous vote, the minutes for the meeting..."
2,2,5.0,0.6974,"growth, committee, price, rate, range, market,...","By unanimous vote, the minutes for the meeting..."
3,3,6.0,0.5710,"committee, foreign, market, currency, federal,...","By unanimous vote, the Committee elected the f..."
4,4,5.0,0.6150,"growth, committee, price, rate, range, market,...","By unanimous vote, the minutes for the meeting..."
...,...,...,...,...,...
219,219,1.0,0.6418,"inflation, rate, market, committee, participan...","By unanimous vote, the following officers of t..."
220,220,1.0,0.7964,"inflation, rate, market, committee, participan...","By unanimous vote, the Committee ratified the ..."
221,221,1.0,0.8098,"inflation, rate, market, committee, participan...","By unanimous vote, the Committee voted to appr..."
222,222,1.0,0.5998,"inflation, rate, market, committee, participan...","By unanimous vote, the following officers of t..."


In [6]:
# For each dominant topic, pick the single document with the highest contribution.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

top_row_per_topic = [
    grp.sort_values('Perc_Contribution', ascending=False).head(1)
    for _, grp in sent_topics_outdf_grpd
]

# Stack the per-topic rows (already in topic order) and renumber them from 0.
sent_topics_sorteddf = pd.concat(top_row_per_topic, axis=0).reset_index(drop=True)

# Display names for the four columns.
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Top text per topic
sent_topics_sorteddf

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.9974,"member, economic, would, committee, market, gr...","By unanimous vote, the minutes of the meeting ..."
1,1.0,0.9997,"inflation, rate, market, committee, participan...","By unanimous vote, the Committee ratified the ..."
2,2.0,0.9994,"price, growth, member, market, committee, infl...","By unanimous vote, the minutes of the meeting ..."
3,3.0,0.9988,"member, economic, would, market, committee, ec...","By unanimous vote, the minutes of the meeting ..."
4,4.0,0.9997,"market, financial, committee, price, federal, ...","By unanimous vote, the minutes of the meeting ..."
5,5.0,0.9989,"growth, committee, price, rate, range, market,...","By unanimous vote, the Committee ratified the..."
6,6.0,0.8171,"committee, foreign, market, currency, federal,...","By unanimous vote, the following officers of t..."
