# Topic Modeling: Financial News

### Loading Libraries

In [2]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Warnings
import warnings

# Spacy
import spacy

# StatsModel
import statsmodels.api as sm

# JSON, Logging, Path & Collection
import json
import logging
from pathlib import Path
from collections import OrderedDict, Counter

# IPywidget
from ipywidgets import interact, FloatRangeSlider

# Scikit-Learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.matutils import Sparse2Corpus

# PyLDAvis
import pyLDAvis
from pyLDAvis.gensim_models import prepare

# Wordcloud
from termcolor import colored
from wordcloud import WordCloud

In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Seed NumPy's global random number generator.
np.random.seed(42)

# Use seaborn's 'white' plot style.
sns.set_style('white')

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

In [7]:
# Enable pyLDAvis' notebook display mode.
pyLDAvis.enable_notebook()

In [8]:
# Download the stop-word list (no header row; presumably one word per line)
# and keep it as a set for fast membership tests.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the single-column frame afterwards works on old and new versions.
stop_words = set(pd.read_csv('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words',
                             header=None)
                 .squeeze('columns')
                 .tolist())

#### Helper Viz Functions

In [9]:
def show_word_list(model, corpus, top=10, save=False):
    """Draw a heatmap of the leading words of every topic.

    Parameters
    ----------
    model : topic model exposing ``top_topics`` and ``num_topics``.
    corpus : corpus handed to ``model.top_topics`` (u_mass coherence).
    top : number of words shown per topic; at most 20 are available because
        ``top_topics`` is queried with ``topn=20``.
    save : when True, write the figure to ``fin_news_wordlist_{top}`` (300 dpi).
    """
    ranked_topics = model.top_topics(corpus=corpus, coherence='u_mass', topn=20)

    # Each topic is a sequence of (probability, word) pairs; keep the first `top`.
    leading_pairs = [pairs[:top] for pairs, _ in ranked_topics]
    word_grid = pd.DataFrame([[word for _, word in pairs] for pairs in leading_pairs]).T
    prob_grid = pd.DataFrame([[prob for prob, _ in pairs] for pairs in leading_pairs]).T

    # One column per topic, cell colour = word probability, cell label = word.
    fig, ax = plt.subplots(figsize=(model.num_topics*1.2, 5))
    sns.heatmap(prob_grid, annot=word_grid, fmt='', ax=ax, cmap='Blues', cbar=False)
    fig.tight_layout()
    if save:
        fig.savefig(f'fin_news_wordlist_{top}', dpi=300)

In [10]:
def show_coherence(model, corpus, tokens, top=10, cutoff=0.01):
    """Plot topic coherence next to the number of 'significant' words per topic.

    The left panel shows the u_mass coherence of each topic in the order
    returned by ``model.top_topics``. The right panel shows, for the same
    topics in the same order, how many vocabulary words have a probability
    above ``cutoff``. The figure is saved to ``fin_news_coherence_{top}``.

    Parameters
    ----------
    model : topic model exposing ``top_topics`` and ``get_topics``.
    corpus : corpus handed to ``model.top_topics``.
    tokens : vocabulary, aligned with the columns of ``model.get_topics()``.
    top : number of leading words used to match ranked topics to model topics.
    cutoff : probability threshold for counting a word as significant.
    """
    # Computed once and reused for both panels.
    top_topics = model.top_topics(corpus=corpus, coherence='u_mass', topn=20)

    # Rows = words, columns = topic ids as numbered by the model.
    word_lists = pd.DataFrame(model.get_topics().T, index=tokens)

    # `top_topics` does not report topic ids, so recover them by matching the
    # set of each topic's `top` leading words.
    leading_words = {topic_id: set(word_probs.nlargest(top).index)
                     for topic_id, word_probs in word_lists.items()}

    # `order[i]` must be the model topic id of the i-th ranked topic so that
    # bar i of the word-count panel lines up with bar i of the coherence panel.
    # (Collecting ranks in topic-id order instead yields the inverse permutation.)
    order = []
    for ranked_topic, _ in top_topics:
        target = {word for _, word in ranked_topic[:top]}
        order.extend(topic_id for topic_id, words in leading_words.items()
                     if words == target)

    fig, axes = plt.subplots(ncols=2, figsize=(15,5))
    title = f'# Words with Probability > {cutoff:.2%}'
    (word_lists.loc[:, order] > cutoff).sum().reset_index(drop=True).plot.bar(title=title, ax=axes[1])

    pd.Series([coherence for _, coherence in top_topics]).plot.bar(title='Topic Coherence', ax=axes[0])
    fig.tight_layout()
    fig.savefig(f'fin_news_coherence_{top}', dpi=300)

In [11]:
def show_top_docs(model, corpus, docs):
    """Print, for every topic, its highest-weighted document and its top words.

    Parameters
    ----------
    model : topic model exposing ``get_document_topics`` and ``show_topic``.
    corpus : corpus handed to ``model.get_document_topics``.
    docs : sequence of document texts, indexed in the same order as ``corpus``.
    """
    doc_topics = model.get_document_topics(corpus)

    # One small frame per document: (topicid, weight) rows tagged with the doc index.
    frames = [pd.DataFrame(doc_topic, columns=['topicid', 'weight']).assign(doc=i)
              for i, doc_topic in enumerate(doc_topics)]
    if not frames:
        # pd.concat raises ValueError on an empty list; there is nothing to show.
        return
    df = pd.concat(frames)

    for topicid, data in df.groupby('topicid'):
        best = data.sort_values('weight', ascending=False).iloc[0]
        print(topicid, docs[int(best.doc)])
        # Query the model that was passed in; the previous code referenced a
        # global `lda` that is not defined by this function.
        print(pd.DataFrame(model.show_topic(topicid=topicid)))

### Loading Financial News

In [12]:
# Root directory that read_articles() searches for article JSON files
# (relative to the notebook's working directory).
data_path = Path('..', 'data', 'us-financial-news')

In [13]:
# Section titles of the articles to keep: read_articles() skips every article
# whose thread.section_title is not in this list.
# NOTE(review): the '&amp;' entities presumably match how the titles are stored
# in the raw JSON -- confirm against the data before "fixing" them.
section_titles = ['Press Releases - CNBC',
                  'Reuters: Company News',
                  'Reuters: World News',
                  'Reuters: Business News',
                  'Reuters: Financial Services and Real Estate',
                  'Top News and Analysis (pro)',
                  'Reuters: Top News',
                  'The Wall Street Journal &amp; Breaking News, Business, Financial and Economic News, World News and Video',
                  'Business &amp; Financial News, U.S &amp; International Breaking News | Reuters',
                  'Reuters: Money News',
                  'Reuters: Technology News']

In [14]:
def read_articles():
    """Load the selected news articles and count their raw tokens.

    Searches ``data_path`` for JSON files, keeps the articles whose
    ``thread.section_title`` is listed in ``section_titles``, lower-cases
    their ``text`` and splits it on whitespace.

    Returns
    -------
    articles : list of str
        One string per kept article, with tokens found in ``stop_words`` removed.
    counter : collections.Counter
        Token frequencies over the kept articles, counted before stop-word removal.
    """
    wanted_sections = set(section_titles)  # built once instead of once per file
    articles = []
    counter = Counter()
    # Sorted so the article order does not depend on filesystem traversal order.
    for f in sorted(data_path.glob('*/**/*.json')):
        # The context manager closes each handle promptly; JSON is read as UTF-8
        # rather than the platform default encoding.
        with f.open(encoding='utf-8') as fh:
            article = json.load(fh)
        if article['thread']['section_title'] in wanted_sections:
            text = article['text'].lower().split()
            counter.update(text)
            articles.append(' '.join(t for t in text if t not in stop_words))
    return articles, counter

In [16]:
# Load the filtered articles and the raw token counts.
articles, counter = read_articles()

# Report how many articles passed the section filter.
print(f'Done loading {len(articles):,.0f} articles')

In [19]:
# Tokens ranked by frequency (most frequent first), with stop words filtered out.
# `counter` was built before stop-word removal, hence the filter here.
most_common = (pd.DataFrame(counter.most_common(), columns=['token', 'count'])
               .pipe(lambda x: x[~x.token.str.lower().isin(stop_words)]))

In [21]:
# Show the ten most frequent non-stop-word tokens.
most_common.head(10)

### Preprocessing with SpaCy

In [22]:
# Directory for the artifacts written by this notebook (cleaned text, figures, models).
results_path = Path('results', 'financial_news')

# exist_ok=True makes the cell idempotent without a separate exists() check,
# which also removes the check-then-create race.
results_path.mkdir(parents=True, exist_ok=True)

In [23]:
def clean_doc(d):
    """Return the space-joined lemmas of the tokens in ``d`` worth keeping.

    A token is dropped when it is a stop word, a digit, non-alphabetic,
    punctuation, whitespace, or when its lemma is the '-PRON-' placeholder.
    """
    def is_noise(token):
        return (token.is_stop
                or token.is_digit
                or not token.is_alpha
                or token.is_punct
                or token.is_space
                or token.lemma_ == '-PRON-')

    return ' '.join(token.lemma_ for token in d if not is_noise(token))

In [25]:
# Load spaCy's small English pipeline.
nlp = spacy.load('en_core_web_sm')

# Raise the pipeline's max_length limit to 6,000,000.
nlp.max_length = 6000000

# Turn off the 'ner' component; clean_doc() only reads token flags and lemmas.
nlp.disable_pipes('ner')

In [26]:
# Show the pipeline's component names (after 'ner' was disabled above).
nlp.pipe_names

In [27]:
def preprocess(articles):
    """Run every article through the spaCy pipeline and clean the result.

    Parameters
    ----------
    articles : sized sequence of str
        Raw article texts; ``len(articles)`` is used for progress reporting.

    Returns
    -------
    list of str
        ``clean_doc`` output for each article, in input order.
    """
    total = len(articles)
    clean_articles = []
    # `n_threads` is intentionally not passed: it has been a deprecated no-op
    # since spaCy 2.1 and is no longer accepted by spaCy 3's `Language.pipe`.
    for i, doc in enumerate(nlp.pipe(articles, batch_size=100), 1):
        # Print the share of processed articles every 1,000 documents.
        if i % 1000 == 0:
            print(f'{i / total:.2%}', end=' ', flush=True)
        clean_articles.append(clean_doc(doc))
    return clean_articles

In [28]:
# Lemmatize and filter every loaded article (progress is printed every 1,000 docs).
clean_articles = preprocess(articles)

In [29]:
# File that stores the cleaned corpus inside the results directory.
clean_path = results_path / 'clean_text'

# Persist one cleaned article per line (clean_doc joins lemmas with single spaces).
clean_path.write_text('\n'.join(clean_articles))

### Vectorizing data

In [31]:
# Read the cleaned articles back from disk, one document per line.
docs = clean_path.read_text().split('\n')

# Number of documents read.
len(docs)

#### Exploring Cleaned Data

In [32]:
# Per-document token counts and corpus-wide token frequencies of the cleaned text.
article_length, token_count = [], Counter()

for i, doc in enumerate(docs, 1):
    # Progress marker every one million documents.
    if i % 1e6 == 0:
        print(i, end=' ', flush=True)
    d = doc.lower().split()  # lower-cased, whitespace-delimited tokens
    article_length.append(len(d))
    token_count.update(d)

In [33]:
# Two-panel overview of the cleaned corpus: token frequencies and article lengths.
fig, axes = plt.subplots(ncols=2, figsize=(15, 5))

# Left panel: the 25 most frequent non-stop-word tokens as horizontal bars,
# sorted ascending by count before plotting.
(pd.DataFrame(token_count.most_common(), columns=['token', 'count'])
 .pipe(lambda x: x[~x.token.str.lower().isin(stop_words)])
 .set_index('token')
 .squeeze()
 .iloc[:25]
 .sort_values()
 .plot
 .barh(ax=axes[0], title='Most frequent tokens'))
# Right panel: distribution of article lengths (token counts) on a log-scaled axis.
sns.boxenplot(x=pd.Series(article_length), ax=axes[1])
axes[1].set_xscale('log')
axes[1].set_xlabel('Word Count (log scale)')
axes[1].set_title('Article Length Distribution')
sns.despine()
fig.tight_layout()
# Save the figure into the results directory before showing it.
fig.savefig(results_path / 'fn_explore', dpi=300);
plt.show()

In [34]:
# Summary statistics of article length, with percentiles from 10% to 90% in 10% steps.
pd.Series(article_length).describe(percentiles=np.arange(.1, 1.0, .1))

In [35]:
# Lower-case every document before vectorizing (re-running this cell is harmless).
docs = [x.lower() for x in docs]

In [36]:
# Inspect one cleaned document as a sanity check.
docs[3]

### Setting Vocab Parameters

In [37]:
# Vocabulary settings passed to the TfidfVectorizer below.

# Lower document-frequency bound; as a float, presumably a share of documents.
min_df = .005

# Upper document-frequency bound; as a float, presumably a share of documents.
max_df = .1

# (1, 1) restricts the vocabulary to unigrams.
ngram_range = (1, 1)

# Passed as the vectorizer's `binary` flag.
binary = False

In [38]:
# Build the document-term matrix from the cleaned documents.
# NOTE(review): LDA is usually fit on raw counts; CountVectorizer is imported
# but unused -- confirm that TF-IDF weights are intended here.
vectorizer = TfidfVectorizer(stop_words='english',
                             min_df=min_df,
                             max_df=max_df,
                             ngram_range=ngram_range,
                             binary=binary)

dtm = vectorizer.fit_transform(docs)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available and keep `tokens` a plain list either way.
if hasattr(vectorizer, 'get_feature_names_out'):
    tokens = vectorizer.get_feature_names_out().tolist()
else:
    tokens = vectorizer.get_feature_names()
dtm.shape

In [39]:
# Wrap the sparse document-term matrix as a gensim corpus.
# documents_columns=False: dtm presumably holds one row per document -- TODO confirm.
corpus = Sparse2Corpus(dtm, documents_columns=False)

# Map integer column ids to their tokens, then build a gensim Dictionary from
# the corpus and that mapping (used by pyLDAvis' prepare() later on).
id2word = pd.Series(tokens).to_dict()
dictionary = Dictionary.from_corpus(corpus, id2word)

### Training & Evaluating LDA Model

In [40]:
# Send log records at DEBUG level and above to 'gensim.log',
# formatted as "<timestamp>:<level>:<message>".
logging.basicConfig(filename='gensim.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.DEBUG)

# Also set the root logger's level to DEBUG explicitly.
logging.root.level = logging.DEBUG

#### Training Models with 5-20 Topics

In [41]:
# Topic counts to train and compare; one LDA model is fit per entry.
num_topics = [5, 10, 15, 20]

In [42]:
# Fit one LDA model per topic count and save each under results_path as
# 'model_<topics>' so the evaluation cells can load them from disk.
for topics in num_topics:
    print(topics)
    lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=topics,
                     chunksize=len(docs),              # a single chunk spans all documents
                     update_every=1,
                     alpha='auto',                     # a-priori belief on topic probabilities
                     eta='auto',                       # a-priori belief on word probabilities
                     decay=0.5,                        # share of the previous lambda value forgotten
                     offset=1.0,
                     eval_every=1,
                     passes=10,
                     iterations=50,
                     gamma_threshold=0.001,
                     minimum_probability=0.01,         # filter topics with lower probability
                     minimum_phi_value=0.01,           # lower bound on term probabilities
                     random_state=42)
    lda_model.save((results_path / f'model_{topics}').as_posix())

#### Evaluating Results

In [44]:
def eval_lda_model(ntopics, model, corpus=corpus, tokens=tokens):
    """Produce the diagnostic outputs for one model and return its perplexity.

    Draws and saves the word-list heatmap and the coherence figure (both
    called with ``top=ntopics``), writes the pyLDAvis visualization to
    ``lda_{ntopics}.html``, and returns ``2 ** -log_perplexity(corpus)``.

    Parameters
    ----------
    ntopics : topic count of ``model``; also used in the output file names.
    model : fitted model to evaluate.
    corpus, tokens : default to the notebook-level ``corpus`` and ``tokens``
        as they exist when this function is defined.
    """
    show_word_list(model=model, corpus=corpus, top=ntopics, save=True)
    show_coherence(model=model, corpus=corpus, tokens=tokens, top=ntopics)

    # Interactive topic map, written next to the notebook.
    html_name = f'lda_{ntopics}.html'
    pyLDAvis.save_html(prepare(model, corpus, dictionary, mds='tsne'), html_name)

    # Convert the log-perplexity bound into a perplexity value.
    bound = model.log_perplexity(corpus)
    return 2 ** (-bound)

In [45]:
# Loaded models, keyed by number of topics.
lda_models = {}

# Perplexity returned by eval_lda_model, keyed by number of topics.
perplexity ={}

# Reload every saved model from results_path and evaluate it; eval_lda_model
# also writes the figures and the pyLDAvis HTML file for each model.
for ntopics in num_topics:
    print(ntopics)
    lda_models[ntopics] = LdaModel.load((results_path / f'model_{ntopics}').as_posix())
    perplexity[ntopics] = eval_lda_model(ntopics=ntopics, model=lda_models[ntopics])

#### Perplexity

In [46]:
# Bar chart of perplexity by number of topics.
pd.Series(perplexity).plot.bar()
sns.despine();
plt.show()

### PyLDAVis for 15 Topics

In [47]:
# Prepare the pyLDAvis data for the 15-topic model, using t-SNE for the topic layout.
vis = prepare(lda_models[15], corpus, dictionary, mds='tsne')

# Render the interactive visualization in the notebook.
pyLDAvis.display(vis)