In [36]:
import nltk; nltk.download('stopwords')

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# NLTK Stop words
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kathmbell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Stopwords

In [37]:
newStopWords = ['stockholder','company','sale','share'
                ,'stock','business','offering','security'
                ,'public','prospectus','purchase', 'commission'
                ,'underwriting','discount','risk-factor'
                ,'trading','exchange']

stop_words = stopwords.words('english')
stop_words.extend(newStopWords)

# Add stopwords from https://github.com/LexPredict/lexpredict-legal-dictionary/tree/master/en


### Import Data

In [52]:
df = pd.read_pickle("./data_test_0817.pkl")
df.columns
data = df['text'].values.tolist()

df.head()

Unnamed: 0,company,filing_date,form_type,format,no.,size,url,text
0,10SION Holdings Inc.,2018-10-29,S-1,[text] [html],320,979199,https://www.sec.gov/Archives/edgar/data/175668...,PECTUS SUBJECT TO COMPLETION - DATED OCTOBER _...
2,12 Retech Corp,2018-07-02,S-1,[text] [html],320,6397781,https://www.sec.gov/Archives/edgar/data/162761...,PECTUS SUMMARY 1
4,"1347 Property Insurance Holdings, Inc.",2018-01-08,S-1,[text] [html],320,2469600,https://www.sec.gov/Archives/edgar/data/159189...,PECTUS SUBJECT TO COMPLETION ...
6,"1895 Bancorp of Wisconsin, Inc.",2018-09-07,S-1,[text] [html],320,7184379,https://www.sec.gov/Archives/edgar/data/175169...,
8,8i Enterprises Acquisition Corp.,2019-02-22,S-1,[text] [html],320,1360947,https://www.sec.gov/Archives/edgar/data/175364...,"PECTUS SUBJECT TO COMPLETION, DATED ..."


In [39]:
# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

### Tokenize and Clean Text

In [40]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [41]:
print(data_words[:1])

[['pectus', 'subject', 'to', 'completion', 'dated', 'october', 'sion', 'holdings', 'inc', 'prospectus', 'shares', 'for', 'sale', 'by', 'sion', 'holdings', 'inc', 'at', 'price', 'of', 'per', 'share', 'and', 'shares', 'of', 'common', 'stock', 'as', 'dividend', 'to', 'stockholders', 'of', 'sixty', 'six', 'oilfield', 'services', 'inc', 'this', 'prospectus', 'covers', 'two', 'distributions', 'the', 'sale', 'of', 'shares', 'by', 'sion', 'holdings', 'at', 'price', 'of', 'per', 'share', 'on', 'self', 'underwritten', 'best', 'efforts', 'all', 'or', 'none', 'basis', 'the', 'net', 'proceeds', 'from', 'the', 'sale', 'of', 'such', 'shares', 'being', 'used', 'to', 'and', 'in', 'connection', 'with', 'purchase', 'two', 'companies', 'see', 'how', 'we', 'intend', 'to', 'use', 'the', 'net', 'proceeds', 'from', 'the', 'sale', 'of', 'our', 'shares', 'and', 'distribution', 'of', 'shares', 'of', 'sion', 'holdings', 'common', 'stock', 'to', 'the', 'stockholders', 'of', 'sixty', 'six', 'oilfield', 'services', 

### Bigram and Trigram Models

In [42]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# # See trigram example
# print(trigram_mod[bigram_mod[data_words[0]]])



### Remove Stopwords, Make Bigrams and Lemmatize

In [43]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


In [44]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['pectus', 'subject', 'completion_dated', 'october', 'sion_holding', 'inc', 'share', 'sion_holding', 'inc', 'price', 'share', 'common', 'dividend', 'stockholder', 'oilfield_service', 'inc', 'cover', 'distribution', 'share', 'sion_holdings', 'price', 'self', 'underwritten', 'none', 'basis', 'net', 'proceed', 'share', 'use', 'connection', 'company', 'see', 'intend', 'use', 'net', 'proceed', 'share', 'distribution', 'share', 'sion_holdings', 'common', 'stockholder', 'oilfield_service', 'inc', 'subject', 'sion_holdings', 'acquisition', 'operate', 'subsidiary', 'oilfield_service', 'see', 'share', 'distribute', 'oilfield_service', 'sion_holding', 'determine', 'initial', 'price', 'share', 'offer', 'cash', 'pursuant', 'value', 'share', 'issue', 'oilfield_service', 'arbitrarily', 'price', 'value', 'bear', 'relationship', 'asset', 'earning', 'criterion', 'value', 'see', 'set', 'initial', 'price', 'value', 'share', 'sion_holding', 'receive', 'net', 'proceed', 'share', 'cash', 'pende', 'sion_hold

### Create dictionary and Corpus

Gensim creates a unique id for each word in the document. The produced corpus shown above is a mapping of (word_id, word_frequency).

For example, (0, 1) above implies, word id 0 occurs once in the first document. Likewise, word id 1 occurs twice and so on.

This is used as the input by the LDA model.

If you want to see what word a given id corresponds to, pass the id as a key to the dictionary.

In [51]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
# print(corpus[:1])
print(len(id2word))

3815


In [46]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]


[[('abandon', 1),
  ('ability', 16),
  ('able', 20),
  ('absorb', 1),
  ('accelerated', 1),
  ('accept', 1),
  ('acceptable', 2),
  ('acceptance', 1),
  ('access', 4),
  ('accident', 1),
  ('accommodate', 1),
  ('accordingly', 2),
  ('account', 6),
  ('accounting', 5),
  ('accounting_standards', 2),
  ('accounts_payable', 1),
  ('accumulation', 1),
  ('accurate', 2),
  ('accurately', 4),
  ('accustom', 1),
  ('achieve', 3),
  ('acquire', 13),
  ('acquisition', 25),
  ('act', 21),
  ('action', 9),
  ('actively', 1),
  ('activity', 16),
  ('actual', 3),
  ('actual_result', 1),
  ('actually', 1),
  ('addition', 16),
  ('additional', 12),
  ('additionally', 2),
  ('address', 4),
  ('adequacy', 1),
  ('adequate', 6),
  ('administer', 2),
  ('administrative', 3),
  ('adopt', 1),
  ('adoption', 2),
  ('advance', 1),
  ('advantage', 1),
  ('adverse', 8),
  ('adverse_effect', 7),
  ('adversely', 3),
  ('adversely_affect', 6),
  ('adversely_affected', 2),
  ('affect', 23),
  ('affected', 3),
  (

### Building Topic Model

In [47]:
# Build LDA model
# Hyperparameters: chunksize is the number of documents to be used in each training chunk. 
# update_every determines how often the model parameters should be updated and 
# passes is the total number of training passes.

# alpha and eta are hyperparameters that affect sparsity of the topics.
# According to the Gensim docs, both defaults to 1.0/num_topics prior.



lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

### View Topic Model

In [48]:
# Print the Keyword in the 10 topics
# The weights reflect how important a keyword is to that topic.

pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.018*"may" + 0.016*"common" + 0.015*"company" + 0.013*"target" + '
  '0.013*"share" + 0.011*"could" + 0.010*"product" + 0.009*"price" + '
  '0.008*"director" + 0.007*"financial"'),
 (1,
  '0.002*"share" + 0.001*"may" + 0.001*"common" + 0.001*"right" + '
  '0.001*"price" + 0.001*"company" + 0.001*"product" + 0.001*"financial" + '
  '0.001*"target" + 0.001*"could"'),
 (2,
  '0.035*"right" + 0.023*"share" + 0.016*"combination" + 0.015*"exercise" + '
  '0.013*"may" + 0.012*"common" + 0.010*"initial" + 0.010*"shareholder" + '
  '0.009*"warrant" + 0.008*"price"'),
 (3,
  '0.023*"sale" + 0.018*"fluid_end" + 0.017*"may" + 0.015*"share" + '
  '0.011*"product" + 0.010*"price" + 0.009*"common" + 0.008*"market" + '
  '0.007*"customer" + 0.007*"could"'),
 (4,
  '0.013*"prefer" + 0.013*"insurance" + 0.012*"share" + 0.009*"cytisinicline" '
  '+ 0.008*"policy" + 0.007*"dividend" + 0.007*"common" + 0.006*"maison" + '
  '0.006*"state" + 0.006*"provide"')]


### Model Perplexity and Coherence Score

Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is. 

Topic Coherence -- use to find how many is the optimal number of topics to model. A higher coherence score the better.

https://rare-technologies.com/what-is-topic-coherence/

In [49]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.964612661486225

Coherence Score:  0.30378106625433227


### Finding Optimum Coherence Score

In [27]:
for value in range(2,10,2):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=value, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print("Topics: ", value, "Coherence Score: ", coherence_lda)

Topics:  2 Coherence Score:  0.28660694254903085
Topics:  4 Coherence Score:  0.3091722008638235
Topics:  6 Coherence Score:  0.32997396699645976
Topics:  8 Coherence Score:  0.3429149095722927


### Visulalize Topic Keywords

In [53]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### LDA Mallet Model

Mallet's version often gives better quality topics