In [1]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline



In [3]:
df = pd.read_csv('./data/beige_book_national.csv')
df.drop(columns='Unnamed: 0', inplace=True)
df.head()

Unnamed: 0,national_report,date
0,\n\r\n var ref = document.referrer;\r\n ...,2019-04
1,\n\r\n var ref = document.referrer;\r\n ...,2019-03
2,\n\r\n var ref = document.referrer;\r\n ...,2019-01
3,\n\r\n var ref = document.referrer;\r\n ...,2018-12
4,\n\r\n var ref = document.referrer;\r\n ...,2018-10


In [5]:
# Convert to list
data = df['national_report'].values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

  data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
  data = [re.sub('\s+', ' ', sent) for sent in data]


[' var ref = document.referrer; var bBPath = beige-book-archive; function '
 'backToResults() { if (ref.indexOf(bBPath) > -1) { history.go(-1); } else { '
 'window.location.href = "./"; } } ‹ Back to Archive Search Beige Book: '
 'National Summary April 17, 2019 This report was prepared at the Federal '
 'Reserve Bank of St. Louis based on information collected on or before April '
 '8, 2019. This document summarizes comments received from contacts outside '
 'the Federal Reserve System and is not a commentary on the views of Federal '
 'Reserve officials. Overall Economic Activity Economic activity expanded at a '
 'slight-to-moderate pace in March and early April. While most Districts '
 'reported that growth continued at a similar pace as the previous report, a '
 'few Districts reported some strengthening. There was little change in the '
 'outlook among contacts in reporting Districts, with those expecting '
 'slight-to-modest growth in the months ahead. Reports on consumer spendi

## Tokenize and Clean-up using gensim’s simple_preprocess()

In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['var', 'ref', 'document', 'referrer', 'var', 'bbpath', 'beige', 'book', 'archive', 'function', 'backtoresults', 'if', 'ref', 'indexof', 'bbpath', 'history', 'go', 'else', 'window', 'location', 'href', 'back', 'to', 'archive', 'search', 'beige', 'book', 'national', 'summary', 'april', 'this', 'report', 'was', 'prepared', 'at', 'the', 'federal', 'reserve', 'bank', 'of', 'st', 'louis', 'based', 'on', 'information', 'collected', 'on', 'or', 'before', 'april', 'this', 'document', 'summarizes', 'comments', 'received', 'from', 'contacts', 'outside', 'the', 'federal', 'reserve', 'system', 'and', 'is', 'not', 'commentary', 'on', 'the', 'views', 'of', 'federal', 'reserve', 'officials', 'overall', 'economic', 'activity', 'economic', 'activity', 'expanded', 'at', 'slight', 'to', 'moderate', 'pace', 'in', 'march', 'and', 'early', 'april', 'while', 'most', 'districts', 'reported', 'that', 'growth', 'continued', 'at', 'similar', 'pace', 'as', 'the', 'previous', 'report', 'few', 'districts', 'report

## Remove Stopwords

In [76]:
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
stop_words = stopwords.words('english')
stop_words.extend(['http','https','www','com','@','...','…', 'var', 'ref', 'document', 
             'referrer', 'var', 'bbpath', 'href', 'archive', 'function', 'beige',
            'book', 'federal', 'reserve', 'summary', 'indexof', 'backtoresults',
            'history', 'go', 'else', 'window', 'location', 'back', 'search',
            'this', 'report', 'prepared', 'federal', 'reserve', 'bank', 'summarizes',
            'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
            'september', 'october', 'november', 'december', 'atlanta', 'boston', 'chicago',
            'cleveland', 'dallas', 'kansas', 'minneapolis', 'york', 'philadelphia', 
             'richmond', 'san', 'francisco', 'st', 'louis', 'officials', 'comments', 
             'views', 'commentary', 'national', 'based', 'information', 'collected',
             'received', 'contacts', 'outside', 'system', 'page', 'url', 'link', 
                   'district', 'districts', 'activity'])

In [77]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [78]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)


## Lemmatization

In [79]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:2])

['overall economic economic expand slight moderate pace early report growth continue similar pace previous report strengthen little change outlook report expect slight modest growth month ahead report consumer spending mixed suggest sluggish sale general retailer auto dealer report tourism generally upbeat report loan demand mix indicate steady growth report manufacturing favorable many note trade relate uncertainty report strong home sale note low demand high price home report agricultural condition remain weak express concern impact current future rainfall flooding employment wage employment continue increase nationwide report modest moderate growth report slight growth report gain variety industry employment increase highly concentrate high skilled job however labor market remain tight restrain rate growth majority cite shortage skilled laborer commonly manufacture construction also report difficulty find qualified worker technical professional position many report firm offer perk b

## Create the Document-Word matrix

In [80]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,
                             stop_words='english',
                             ngram_range= (2, 3)
                            )

data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [81]:
len(vectorizer.get_feature_names())

4411

## Check the Sparsicity

In [82]:
# Materialize the sparse data
data_dense = data_vectorized.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

Sparsicity:  5.760075336135186 %


In [83]:
data_dense

matrix([[0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Build LDA model with sklearn

In [84]:
lda_model = LatentDirichletAllocation(n_components=5,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=20190512,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=5, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=20190512, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)


## Diagnose model performance with perplexity and log-likelihood

In [85]:
# Log Likelihood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -1073207.5285747484
Perplexity:  2761.7656241463264
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 5,
 'n_jobs': -1,
 'n_topics': None,
 'perp_tol': 0.1,
 'random_state': 20190512,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [86]:
# Define Search Param
params = {
    'n_components': [5, 10, 15, 20, 25, 30], 
    'learning_decay': [.5, .7, .9]
}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=params, cv=5)

# Do the Grid Search
model.fit(data_vectorized)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_components': [5, 10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

## Best topic model and its parameters

In [87]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 5}
Best Log Likelihood Score:  -242327.3795687639
Model Perplexity:  2710.9294834743605


In [88]:
model.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_components': [5, 10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)>

## Dominant topic in each document

In [89]:
data_vectorized

<429x4411 sparse matrix of type '<class 'numpy.int64'>'
	with 108999 stored elements in Compressed Sparse Row format>

In [91]:
# Create Document - Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)

# column names
#topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_topics)]

# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), #columns=topicnames, 
                                 index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,0,1,2,3,4,dominant_topic
Doc0,0,0,0,0,1,4
Doc1,0,0,0,0,1,4
Doc2,0,0,0,0,1,4
Doc3,0,0,0,0,1,4
Doc4,0,0,0,0,1,4
Doc5,0,0,0,0,1,4
Doc6,0,0,0,0,1,4
Doc7,0,0,0,0,1,4
Doc8,0,0,0,0,1,4
Doc9,0,0,0,0,1,4


## Review topics distribution across documents

In [92]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution


Unnamed: 0,Topic Num,Num Documents
0,1,126
1,0,108
2,3,99
3,2,77
4,4,19


## Visualize the LDA model

In [93]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Topic keywords

In [94]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)

# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names()
topicnames = [0, 1, 2, 3, 4]
df_topic_keywords.index = topicnames

# View
df_topic_keywords.head()

Unnamed: 0,ability pass,able pass,acceptable level,accord city,accord month,accord new,accord report,accord report city,accounting service,active oil,...,year strong,year year,year year basis,year year gain,year year increase,year year sale,yield corn,yield corn soybean,yield crop,yield expect
0,0.201386,0.202404,3.937099,12.364132,1.59552,8.606831,35.869426,3.331143,0.2008,0.204144,...,7.528881,25.654985,0.256199,6.57535,4.418261,5.973552,8.870364,6.334,10.181353,8.36449
1,23.446278,21.084554,0.200194,31.710789,0.200289,15.237478,60.218594,10.506168,17.198114,13.194936,...,0.212245,77.21218,18.702484,7.719647,5.428726,4.145104,5.522301,5.059241,0.203324,5.554671
2,0.200114,0.200548,1.065343,4.214355,0.220814,1.346934,24.004832,1.598808,0.200244,0.200378,...,0.202222,8.603234,0.221573,3.055402,0.385997,0.532332,0.20582,0.204982,0.203707,2.625877
3,0.200041,0.200232,6.597363,4.510053,12.783375,11.606639,15.706377,0.36342,0.200284,0.200048,...,2.856375,10.295546,3.614133,2.165587,2.723825,3.131723,0.200001,0.200001,0.210595,2.202213
4,4.952181,1.312262,0.200001,0.200671,0.200001,0.202118,0.20077,0.200462,0.200558,0.200495,...,0.200276,12.234054,0.205612,1.484015,5.04319,1.21729,0.201515,0.201776,0.201021,0.25275


## Top 10 keywords each topic

In [96]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=10)        

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,real estate,loan demand,labor market,retail sale,high level,city report,price increase,estate market,real estate market,report strong
Topic 1,real estate,city report,new city,new report,report increase,commercial real estate,commercial real,wage pressure,city note,consumer spending
Topic 2,retail sale,loan demand,consumer spending,residential construction,real estate,mortgage rate,business loan,auto sale,nonresidential construction,capital good
Topic 3,loan demand,retail sale,residential construction,capital spending,capital good,business loan,consumer spending,high level,new order,remain strong
Topic 4,labor market,moderate pace,real estate,modest pace,consumer spending,continue expand,modest moderate,remain tight,input cost,retail sale


## Predict the topics for a new piece of text

In [99]:
# Define function to predict topic for a given text document.
nlp = spacy.load('en', disable=['parser', 'ner'])

def predict_topic(text, nlp=nlp):
    global sent_to_words
    global lemmatization

    # Step 1: Clean with simple_preprocess
    mytext_2 = list(sent_to_words(text))

    # Step 2: Lemmatize
    mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Step 3: Vectorize transform
    mytext_4 = vectorizer.transform(mytext_3)

    # Step 4: LDA Transform
    topic_probability_scores = best_lda_model.transform(mytext_4)
    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), :].values.tolist()
    return topic, topic_probability_scores


['real estate', 'loan demand', 'labor market', 'retail sale', 'high level', 'city report', 'price increase', 'estate market', 'real estate market', 'report strong']
[[0.2 0.2 0.2 0.2 0.2]]


In [100]:
# Predict the topic
#mytext = [""]
topic, prob_scores = predict_topic(text = mytext)
print(topic)
print(prob_scores)

['real estate', 'loan demand', 'labor market', 'retail sale', 'high level', 'city report', 'price increase', 'estate market', 'real estate market', 'report strong']
[[0.2 0.2 0.2 0.2 0.2]]
