# Earnings Calls - Topic Modeling

### Text Mining on Earnings Calls during a Pandemic as a Means to Predict End-Of-The-Month Stock Performances
####  Olin School of Business <br> Jose Luis Rodriguez  <br> jlr@wustl.edu <br> Fall 2021

In [1]:
import sys
import nltk
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import  CountVectorizer #bag-of-words vectorizer 
from sklearn.decomposition import LatentDirichletAllocation #package for LDA

In [2]:
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn

import matplotlib.pyplot as plt

In [3]:
%%capture
#define text normalization function
%run ./'02-Normalization.ipynb' #defining text normalization function

[nltk_data] Downloading package stopwords to /Users/jlroo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jlroo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jlroo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/jlroo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
def get_topic_words(vectorizer, lda_model, n_words):
    keywords = np.array(vectorizer.get_feature_names())
    topic_words = []
    for topic_weights in lda_model.components_:
        top_word_locs = (-topic_weights).argsort()[:n_words]
        topic_words.append(keywords.take(top_word_locs).tolist())
    return topic_words

### Hotels, Restaurants and Leisure

In [6]:
def earnings_lda(segment, max_features = 1000, components = 4, topic_prior = 0.1, word_prior = 0.1):
    bow_vectorizer = CountVectorizer(max_features = max_features)
    for secid in segment['related'].unique():
        data = segment[segment['related'] == secid]
        corpus = []
        for n in range(data.shape[0]):
            cdata = data.iloc[n].to_dict()
            corpus.extend([i.strip() for i in cdata.split('\n') if i.strip() != ''])
        normalized_corpus = normalize_corpus(corpus)
        bow_corpus = bow_vectorizer.fit_transform(normalized_corpus)
        lda_corpus = LatentDirichletAllocation(n_components = components,
                                               max_iter = 100,
                                               doc_topic_prior = topic_prior,
                                               topic_word_prior = word_prior).fit(bow_corpus)
        lda_topic_weights = lda_corpus.transform(bow_corpus)
        doc_names = ["Doc_" + str(i) for i in range(len(normalized_corpus))]
        topic_names = ["Topic_" + str(i) for i in range(components)]
        topic_df = pd.DataFrame(np.round(lda_topic_weights, components),
                                 columns = topic_names,
                                 index = doc_names)
        topic_df['secid'] = secid
        topic_df['direction'] = cdata['direction']
        topic_df['date_market'] = cdata['date_market']
        topic_df['speakers_number'] = cdata['speakers_number']
        topic_df['percent_change'] = cdata['percent_change']
        return topic_df

In [7]:
hrl_df = pd.read_csv('data/hrl_mrk21.csv')
hrl_df['date_market'] = pd.to_datetime(hrl_df['date_market'])
hrl_df['market_month'] = hrl_df['date_market'].apply(lambda i:str(i.month).zfill(2) + '-2021')

In [None]:
hrl_company_topics = earnings_lda(hrl_df)

## Monthly Earnings Calls Topics

In [18]:
netd = pd.DataFrame({'count':hrl_df.groupby(['market_month','direction'])['direction'].count()}).reset_index()
neg = netd[netd['direction'] == 'negative'].reset_index(drop=True)
direction = netd[netd['direction'] == 'positive'].reset_index(drop=True)
direction['net_change'] = (direction['count'] - neg['count']).reset_index(drop=True)
direction['market'] = direction['net_change'].apply(lambda i:'negative' if  i< 0 else 'positive')   
direction = direction[['market_month','net_change','market']]

Unnamed: 0,market_month,net_change,market
0,01-2021,-2.0,negative
1,02-2021,31.0,positive
2,03-2021,6.0,positive
3,04-2021,20.0,positive
4,05-2021,-9.0,negative
5,07-2021,6.0,positive
6,08-2021,17.0,positive
7,09-2021,-11.0,negative
8,10-2021,7.0,positive
9,11-2021,2.0,positive


In [20]:
month_lda = []
components = 4
for month in hrl_df['market_month'].unique():
    mdata = hrl_df[hrl_df['market_month'] == month].copy(deep=True)
    market = direction[direction['market_month'] == month]['market']
    net_change = direction[direction['market_month'] == month]['net_change']
    cps = []
    for n in range(mdata.shape[0]):
        cdata = mdata.iloc[n].to_dict()
        corpus = cdata['corpus']
        cps.extend([i.strip() for i in corpus.split('\n') if i.strip() != ''])
    normalized_corpus = normalize_corpus(cps)
    bow_vectorizer = CountVectorizer(max_features=1000)
    bow_corpus = bow_vectorizer.fit_transform(normalized_corpus)
    lda_corpus = LatentDirichletAllocation(n_components = components,
                                           max_iter = 100,
                                           doc_topic_prior = 0.2,
                                           topic_word_prior = 0.2).fit(bow_corpus)
    word_weights = lda_corpus.components_ / lda_corpus.components_.sum(axis=1)[:, np.newaxis]
    word_weights_df = pd.DataFrame(word_weights.T, 
                                   index = bow_vectorizer.get_feature_names(), 
                                   columns = ["Topic_" + str(i) for i in range(components)])
    word_weights_df = word_weights_df.sort_values(by = 'Topic_0', ascending = False)
    word_weights_df.loc[:,'market'] = market.to_string(header=False, index=False)
    word_weights_df.loc[:,'month'] = month
    word_weights_df.loc[:,'net_change'] = net_change.to_string(header=False, index=False)    
    month_lda.append(word_weights_df)

In [55]:
month_words = []
for i,mnt in enumerate(hrl_df['market_month'].unique()):
    weights_df = month_lda[i][['Topic_0','Topic_1','Topic_2','Topic_3']].copy(deep=True)
    market = direction[direction['market_month'] == mnt]['market']
    net_change = direction[direction['market_month'] == mnt]['net_change']
    weights_df.loc[:,'market'] = market.to_string(header=False, index=False)
    weights_df.loc[:,'month'] = mnt
    weights_df.loc[:,'net_change'] = net_change.to_string(header=False, index=False)    
    month_words.append(weights_df)

In [51]:
words_df = pd.concat(month_words)

In [52]:
words_df.head()

Unnamed: 0,Topic_0,Topic_1,Topic_2,Topic_3
sale,0.031617,0.00278,1.1e-05,0.001198
increase,0.025149,0.002475,1.1e-05,0.001386
revenue,0.023006,5e-06,1.1e-05,0.003449
cost,0.014367,0.005828,1.1e-05,6e-06
restaurant,0.012977,0.00338,0.018429,6e-06


In [28]:
corpus = []
for n in range(hrl_df.shape[0]):
    cps = hrl_df.iloc[n]['corpus']
    cps = [i.strip() for i in cps.split('\n') if i.strip() != '']
    corpus.extend(cps)

In [29]:
normalized_corpus = normalize_corpus(corpus)

In [30]:
#define the bag-of-words vectorizer:
bow_vectorizer = CountVectorizer(max_features=1000)
#vectorize the normalized data:
bow_corpus = bow_vectorizer.fit_transform(normalized_corpus)

In [31]:
components = 4
lda_corpus = LatentDirichletAllocation(n_components = components,
                                       max_iter = 100,
                                       doc_topic_prior = 0.2,
                                       topic_word_prior = 0.2).fit(bow_corpus)

In [32]:
no_top_words = 15
display_topics(lda_corpus, bow_vectorizer.get_feature_names(), no_top_words)

Topic 0:
cash capital balance share turn end debt measure guy return free ship future second sheet
Topic 1:
sale increase revenue margin restaurant cost high continue expect growth impact second drive fourth covid
Topic 2:
look kind yes lot little people maybe time obviously great could start still way open
Topic 3:
new continue brand growth customer team guest drive digital experience opportunity focus strong product restaurant


In [34]:
word_weights = lda_corpus.components_ / lda_corpus.components_.sum(axis=1)[:, np.newaxis]
word_weights_df = pd.DataFrame(word_weights.T, 
                               index = bow_vectorizer.get_feature_names(), 
                               columns = ["Topic_" + str(i) for i in range(components)])
word_weights_df = word_weights_df.sort_values(by = 'Topic_0', ascending = False)

### Topic Model Visualization


In [36]:
#prepare to display result in the Jupyter notebook
pyLDAvis.enable_notebook()

# mds is a function to use for visualizing the "distance" between topics]
vis_data = pyLDAvis.sklearn.prepare(lda_corpus,
                                     bow_corpus,
                                     bow_vectorizer,
                                     mds = 'tsne')

In [37]:
pyLDAvis.save_html(vis_data, 'lda-2021.html')

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
