In [6]:
# ---------------------------------- TITLE -------------------------------------
# 09_LDAproj_LDaimplementation.py
# AUTHOR: HARUKA TAKAGI
# DATE: JULY 14, 2020
# ENCODING: utf-8
# PYTHON VERSION: 3.7

# ---------------------------------- NOTES -------------------------------------
# The purpose of this script is to apply latent Dirichlet allocation (LDA) to the
# FOMC Historical Materials documents.

# ---------------------------------- SETUP -------------------------------------

import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/harukatakagi/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# ---------------------------------- CODE --------------------------------------

# Read in data from csv file
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of Greenbook_mini.csv before running on another machine.
GREENBOOK_CSV = '/Users/harukatakagi/Dropbox/FOMC_Board/FOMC_Historical_Materials_dataframe/version2/Greenbook_mini.csv'
dataset = pd.read_csv(GREENBOOK_CSV)

# Check for NULL values
# Compute the boolean null mask once and reuse it for all three summaries.
null_mask = dataset.isnull()
# Per-column count of null cells
print(null_mask.sum())
# Total count of null cells across the whole frame
print("total null values : ", int(null_mask.values.sum()))
# Rows with at least one null cell (vectorised; no Python-level row loop)
print("total number of rows containing null values : ", int(null_mask.any(axis=1).sum()))

# If the number of NULL values is greater than 0, drop the affected rows:
# dataset = dataset.dropna()


doc_id    0
text      0
dtype: int64
total null values :  0
total number of rows containing null values :  0


In [8]:
# Show the loaded DataFrame (columns: doc_id, text) as the cell output
dataset

Unnamed: 0,doc_id,text
0,fomc20000202gbpt120000127.txt,class fomc financi condit summari outlook prep...
1,fomc20000321gbpt120000315.txt,class fomc financi condit summari outlook prep...
2,fomc20000516gbpt120000511.txt,class fomc financi condit summari outlook prep...
3,fomc20000628gbpt120000621.txt,class fomc financi condit summari outlook prep...
4,fomc20000822gbpt120000816.txt,class fomc financi condit summari outlook prep...
5,fomc20001003gbpt120000927.txt,class fomc financi condit summari outlook prep...
6,fomc20001115gbpt120001108.txt,class fomc financi condit summari outlook prep...
7,fomc20001219gbpt120001213.txt,class fomc financi condit summari outlook prep...
8,FOMC20090128gbpt120090122.txt,class fomc restrict financi condit summari out...
9,FOMC20090318gbpt120090312.txt,class fomc restrict financi condit summari out...


In [13]:
# Prepare text for LDA analysis

def sent_to_words(sentences):
    """Lazily tokenize each entry of ``sentences`` into a list of words.

    Every entry is cast to ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One token list is
    yielded per entry, in input order.
    """
    # deacc=True strips accent marks from the tokens; punctuation is already
    # discarded by simple_preprocess's tokenization.
    tokenized = (
        gensim.utils.simple_preprocess(str(entry), deacc=True)
        for entry in sentences
    )
    yield from tokenized

# Pull the document strings out of the `text` column and tokenize each one
data = dataset["text"].tolist()
data_words = [tokens for tokens in sent_to_words(data)]

# Build the token -> integer id dictionary from the tokenized documents, then
# encode every document as a bag-of-words via id2word.doc2bow
texts = data_words
id2word = corpora.Dictionary(texts)
corpus = list(map(id2word.doc2bow, texts))

In [30]:
from pprint import pprint

# number of topics
num_topics = 4

# Fixed seed so repeated runs start from the same random initialisation.
# NOTE: with LdaMulticore the results may still differ slightly between runs,
# because the scheduling of worker processes is not deterministic.
random_seed = 42

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=random_seed)

# Print the top keywords for each of the `num_topics` topics
pprint(lda_model.print_topics())
# Apply the trained model to the corpus (per-document topic distribution)
doc_lda = lda_model[corpus]

[(0,
  '0.018*"percent" + 0.014*"year" + 0.014*"rate" + 0.013*"quarter" + '
  '0.011*"price" + 0.008*"month" + 0.007*"chang" + 0.006*"increas" + '
  '0.006*"market" + 0.006*"real"'),
 (1,
  '0.019*"percent" + 0.017*"rate" + 0.016*"year" + 0.014*"price" + '
  '0.014*"quarter" + 0.010*"increas" + 0.009*"month" + 0.007*"expect" + '
  '0.006*"chang" + 0.006*"product"'),
 (2,
  '0.022*"percent" + 0.013*"year" + 0.012*"price" + 0.012*"rate" + '
  '0.010*"quarter" + 0.009*"increas" + 0.007*"month" + 0.007*"chang" + '
  '0.006*"note" + 0.006*"expect"'),
 (3,
  '0.025*"percent" + 0.016*"rate" + 0.016*"quarter" + 0.013*"year" + '
  '0.011*"price" + 0.008*"increas" + 0.007*"expect" + 0.006*"month" + '
  '0.006*"chang" + 0.006*"declin"')]


In [31]:
import pyLDAvis.gensim_models
import pickle
import pyLDAvis
import os

# Visualize the topics
pyLDAvis.enable_notebook()

# Make sure the output directory exists: open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError otherwise.
results_dir = './results'
os.makedirs(results_dir, exist_ok=True)

LDAvis_data_filepath = os.path.join(results_dir, 'ldavis_prepared_' + str(num_topics))

# Preparing the visualization is a bit time consuming. Set this flag to False
# to reuse a previously pickled result; the preparation still runs if no
# pickled file exists yet, so the load below cannot fail on a missing file.
recompute_ldavis = True
if recompute_ldavis or not os.path.exists(LDAvis_data_filepath):
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were written by this notebook.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

# Export a standalone HTML copy next to the pickled data
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')

LDAvis_prepared

  default_term_info = default_term_info.sort_values(
  from imp import reload
