In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

In [2]:
# Relative path of the CSV file that the next cell loads with pd.read_csv.
DATASET_PATH = "parliament_13.csv"

In [3]:
# Load the dataset, using the first CSV column as the row index.
df = pd.read_csv(DATASET_PATH, index_col=0)
# Preview the first rows as a quick sanity check.
df.head()

Unnamed: 0,section,date,topic,name,speech,length
0,answers to questions,2020-06-05,consequences for and enforcement against emplo...,josephine teo,the vast majority of employers are now paying ...,142
1,budget,2020-06-05,second supplementary estimates of expenditure ...,anthea ong,"thank you, mr chairman. i thank the senior min...",79
2,budget,2020-06-05,second supplementary estimates of expenditure ...,indranee rajah,"mr chairman, if i may now address ms ong's que...",261
3,budget,2020-06-05,second supplementary estimates of expenditure ...,anthea ong,"mr chairman, i beg leave to withdraw the amend...",36
4,budget,2020-06-05,second supplementary estimates of expenditure ...,anthea ong,"chairman, i beg to move, ""that the total sum t...",440


Remove punctuation (commas, periods, exclamation marks, question marks and quotes) from each speech

In [4]:
# Strip commas, periods, exclamation marks, question marks and both quote
# characters from every speech. The vectorised `.str.replace` applies the same
# regular expression as before but leaves missing values (NaN) untouched,
# whereas `re.sub` called through `.map` raised TypeError on the first
# non-string entry.
df['speech'] = df['speech'].str.replace(r'[,\.!?"\']', '', regex=True)
df.head()

Unnamed: 0,section,date,topic,name,speech,length
0,answers to questions,2020-06-05,consequences for and enforcement against emplo...,josephine teo,the vast majority of employers are now paying ...,142
1,budget,2020-06-05,second supplementary estimates of expenditure ...,anthea ong,thank you mr chairman i thank the senior minis...,79
2,budget,2020-06-05,second supplementary estimates of expenditure ...,indranee rajah,mr chairman if i may now address ms ongs quest...,261
3,budget,2020-06-05,second supplementary estimates of expenditure ...,anthea ong,mr chairman i beg leave to withdraw the amendm...,36
4,budget,2020-06-05,second supplementary estimates of expenditure ...,anthea ong,chairman i beg to move that the total sum to b...,440


# LDA Analysis

In [9]:
from pprint import pprint

# Number of debate topics to keep, ranked by summed `length`.
# The previous `.loc[:2]` slice was label-inclusive (rows 0, 1 and 2), so three
# topics were kept even though the old comment said ten; the cut-off is now
# named so that comment and code cannot drift apart again.
N_LONGEST_TOPICS = 3

# Summed `length` per topic, largest first, with a fresh 0..n-1 index.
topics = (
    df.groupby(["topic"])["length"]
    .agg(["sum"])
    .sort_values(by="sum", ascending=False)
    .reset_index()
)

# Keep only the rows belonging to the top-ranked topics.
# `df` is rebound on purpose: the later cells read the filtered frame by this name.
df = df[df["topic"].isin(topics["topic"].head(N_LONGEST_TOPICS))]

In [10]:
import gensim
from gensim.utils import simple_preprocess
from pprint import pprint
import nltk
# Make sure the NLTK stopword corpus is available locally before it is imported below
# (the recorded output shows the package reported as already up to date).
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ktkhu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Tokenize the text and remove stopwords

In [11]:
def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences`` with gensim's ``simple_preprocess``.

    Each item is converted with ``str()`` first. ``deacc=True`` asks gensim to
    strip accent marks from the tokens; punctuation is discarded by the
    tokenizer itself.

    Yields:
        One list of tokens per input item, in input order.
    """
    yield from (simple_preprocess(str(text), deacc=True) for text in sentences)

# Pull the speeches out of the frame as a plain Python list ...
data = df['speech'].values.tolist()
# ... and tokenize all of them eagerly (sent_to_words itself is a generator).
data_words = list(sent_to_words(data))

In [12]:
import spacy
# NLTK's English stopwords; read as a module-level global by remove_stopwords().
stop_words = stopwords.words('english')

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: Iterable of documents. Each document is passed through ``str()``
            and tokenized with ``simple_preprocess``, so raw strings and
            already-tokenized lists are both accepted.

    Returns:
        A list with one list of remaining tokens per document, in input order.
    """
    # Snapshot the module-level stopword collection as a set once per call, so
    # each membership test is a hash lookup rather than a scan of the whole list.
    stopword_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stopword_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Index the global ``bigram_mod`` with each document and collect the results.

    NOTE(review): ``bigram_mod`` is not defined in the notebook cells shown here
    (presumably a trained gensim phrase model - confirm); it must exist in the
    kernel before this is called with a non-empty ``texts``.

    Returns:
        A list with one transformed document per input document, in order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Run each document through ``bigram_mod`` and then ``trigram_mod``.

    NOTE(review): neither ``bigram_mod`` nor ``trigram_mod`` is defined in the
    notebook cells shown here (presumably trained gensim phrase models -
    confirm); both must exist in the kernel before this is called with a
    non-empty ``texts``.

    Returns:
        A list with one transformed document per input document, in order.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Args:
        texts: Iterable of token lists. Each list is joined with single spaces
            and analysed by spaCy as one document.
        allowed_postags: Values of ``token.pos_`` to keep. The default list is
            only read, never mutated.

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    # The model must be installed first: python -m spacy download en_core_web_md
    # Parser and NER are disabled because only POS tags and lemmas are read below.
    nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

    joined_docs = (" ".join(sent) for sent in texts)

    # nlp.pipe streams the documents through the pipeline in batches, which
    # avoids the per-call overhead of invoking nlp() once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [13]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Bigram detection is currently switched off: the stopword-free tokens are passed
# through unchanged. Re-enabling make_bigrams needs a `bigram_mod` model, which is
# not built in the cells shown here.
data_words_bigrams = data_words_nostops#make_bigrams(data_words_nostops)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [14]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaMulticore

# Create Dictionary: gensim mapping between each distinct lemma and an integer id
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus (this is an alias of data_lemmatized, not a copy)
texts = data_lemmatized

# Term Document Frequency: one bag-of-words list of (token id, count) pairs per document
corpus = [id2word.doc2bow(text) for text in texts]

In [15]:
#[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

In [16]:
# Fit the LDA model. The number of latent topics is tied to the number of distinct
# debate topics left in `df` after the earlier filtering step.
# NOTE(review): random_state is pinned, presumably so the fitted topics are
# reproducible between runs - confirm.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=df["topic"].unique().shape[0], 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [17]:
# Show each fitted topic as a weighted list of its top words.
pprint(lda_model.print_topics())

[(0,
  '0.013*"worker" + 0.011*"job" + 0.010*"work" + 0.009*"business" + '
  '0.009*"also" + 0.009*"help" + 0.008*"company" + 0.008*"support" + '
  '0.008*"need" + 0.008*"industry"'),
 (1,
  '0.010*"government" + 0.008*"people" + 0.007*"year" + 0.007*"also" + '
  '0.006*"need" + 0.006*"make" + 0.005*"many" + 0.005*"time" + 0.005*"well" + '
  '0.005*"society"'),
 (2,
  '0.028*"school" + 0.023*"student" + 0.014*"education" + 0.010*"child" + '
  '0.008*"learn" + 0.008*"need" + 0.007*"also" + 0.006*"well" + '
  '0.006*"community" + 0.006*"year"')]


In [18]:
# Compute Perplexity
# NOTE: log_perplexity() returns the per-word log-likelihood bound, not the
# perplexity itself. Values closer to zero (i.e. higher) indicate a better fit;
# gensim reports the actual perplexity as 2 ** (-bound), where lower is better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # prints the log-likelihood bound (negative; higher is better)

# Compute Coherence Score ('c_v' measure; higher is generally better)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.5163555882664825

Coherence Score:  0.3683003523363962


In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Switch pyLDAvis to notebook display mode.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the fitted model, the bag-of-words corpus and the dictionary.
vis = gensimvis.prepare(lda_model, corpus, id2word)
# Last expression of the cell, so the visualisation is rendered as the cell output.
vis