# Topic Modeling Attempt #1

Author: Heidi Smith

This is an initial attempt at topic modeling, using an LDA model on the small-scale collection of monopoly/EIC texts.

Code adapted from [here](https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0)

In [None]:
# Download needed packages: install gensim, which the cells below use for
# tokenization (simple_preprocess), the dictionary/corpus and the LDA model.
!pip install gensim

In [21]:
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
import pandas as pd
import gensim.corpora as corpora
from pprint import pprint

In [11]:
# Load the source texts from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='monopoly_all.csv')

In [29]:
# Remove stopwords/preprocess
# Build the stop-word list: NLTK's English list, extended below with extra
# words that should also be ignored in this corpus.
try:
    stop_words = stopwords.words('english')
except LookupError:
    # The NLTK stop-word corpus is not installed yet; fetch it once and
    # retry instead of failing the whole cell.
    nltk.download('stopwords')
    stop_words = stopwords.words('english')
stop_words.extend(['thus', 'thereof', 'thence', 'thee', 'therein', 'wherein', 'whereby', 'whereas', 
                   'also', 'us', 'upon', 'would', 'within', 'indeed', 'become'])

def preprocess(data):
    """Lazily tokenize every document in ``data``.

    Each item is converted with ``str()`` and handed to gensim's
    ``simple_preprocess``; the resulting token list is yielded per
    document, in input order.
    """
    for raw_document in data:
        document_text = str(raw_document)
        tokens = gensim.utils.simple_preprocess(document_text)
        yield tokens
        
def remove_stopwords(data):
    """Return the documents in ``data`` with stop words filtered out.

    Each document may be a list/tuple of tokens or a single value (for
    example a raw string). Token sequences are joined with spaces and
    anything else is converted with ``str()``; the text is then normalised
    by ``simple_preprocess`` and every word found in the module-level
    ``stop_words`` is dropped.

    Returns a list holding one list of kept tokens per input document.
    """
    # Set membership is O(1) per token; built per call so later edits to
    # stop_words are still honoured.
    blocked = set(stop_words)

    filtered = []
    for doc in data:
        if isinstance(doc, (list, tuple)):
            # Join the tokens instead of using str(doc): the repr of a list
            # adds quotes and escape sequences that can leak into tokens.
            text = " ".join(map(str, doc))
        else:
            text = str(doc)
        filtered.append([word for word in simple_preprocess(text)
                         if word not in blocked])
    return filtered

# Pull the raw texts out of the DataFrame. Missing entries (NaN) would
# otherwise be stringified into the bogus token "nan"; replacing them with
# empty strings keeps one document per row, in the original row order.
data = df['text'].fillna('').tolist()
# Tokenize every document, then drop the stop words.
data = list(preprocess(data))
data = remove_stopwords(data)

In [30]:
# Creation of corpus
# Dictionary built from the tokenized documents
id2word = corpora.Dictionary(documents=data)

# Term Document Frequency: one bag-of-words entry per document
corpus = list(map(id2word.doc2bow, data))

In [32]:
# Train LDA model
num_topics = 5

# Fixed seed for the model's random state: without it every run produces
# different topics, which makes results impossible to compare across runs.
# NOTE(review): multi-worker training may still show small run-to-run
# variation -- confirm against the gensim documentation.
random_seed = 42

# Build model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=random_seed)

# Print keyword in each topic
pprint(lda_model.print_topics())
# Apply the trained model to the whole corpus
doc_lda = lda_model[corpus]

[(0,
  '0.008*"may" + 0.007*"great" + 0.006*"king" + 0.006*"trade" + 0.005*"made" + '
  '0.004*"one" + 0.004*"shall" + 0.004*"price" + 0.003*"much" + 0.003*"many"'),
 (1,
  '0.010*"price" + 0.008*"bound" + 0.007*"printed" + 0.007*"may" + '
  '0.006*"trade" + 0.005*"great" + 0.004*"king" + 0.004*"one" + 0.003*"time" + '
  '0.003*"made"'),
 (2,
  '0.008*"trade" + 0.007*"may" + 0.006*"great" + 0.005*"king" + 0.004*"price" '
  '+ 0.004*"made" + 0.004*"time" + 0.004*"bound" + 0.004*"company" + '
  '0.004*"yet"'),
 (3,
  '0.008*"trade" + 0.008*"may" + 0.005*"king" + 0.005*"company" + '
  '0.004*"price" + 0.004*"made" + 0.004*"time" + 0.004*"much" + 0.004*"great" '
  '+ 0.004*"printed"'),
 (4,
  '0.009*"may" + 0.008*"trade" + 0.006*"king" + 0.006*"great" + '
  '0.006*"company" + 0.004*"made" + 0.004*"one" + 0.004*"many" + 0.004*"much" '
  '+ 0.004*"good"')]
