# Topic Modeling

In [None]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import _stop_words as stop_words

In [None]:
# Read the whole source text into memory. The context manager closes the
# file handle as soon as the read completes instead of leaving it open.
with open('kafka_metamorphosis.txt', encoding="utf-8") as source_file:
    text = source_file.read()

def tokenize(text):
    """Lowercase *text* and split it into a list of word tokens.

    The text is split on runs of non-word characters, so punctuation and
    whitespace never appear in the result. Empty strings are discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase, non-empty tokens in their original order.
    """
    lowercase_text = text.lower()
    # Raw string: '\W' in a plain literal is an invalid escape sequence that
    # newer Python versions warn about.
    split_words = re.split(r'\W+', lowercase_text)
    # re.split yields '' when the text starts or ends with a delimiter (or is
    # empty); those are not words and would otherwise count toward segment size.
    return [word for word in split_words if word]

tokens = tokenize(text)

# Split the token stream into a collection of documents of up to 1,000 words.
segment_length = 1000

# Walk the token list in strides of segment_length so that every token lands
# in exactly one document. The final document may be shorter than the rest;
# rounding the segment count instead would silently drop the tail of the text
# (or produce no documents at all for a short text).
data_to_vectorize = [
    ' '.join(tokens[start:start + segment_length])
    for start in range(0, len(tokens), segment_length)
]

# Number of documents produced (kept for any code that reads it).
nseg = len(data_to_vectorize)

In [None]:
# Set up the vectorizer: raw strings in, English stop words removed,
# everything lowercased before counting.
vec = CountVectorizer(input='content',
                      stop_words='english',
                      lowercase=True)

# Build the document-term matrix from the 1,000-word segments. Passing the
# flat token list here would treat every single word as its own document.
dtm = vec.fit_transform(data_to_vectorize)

# Rows are documents, columns are vocabulary terms.
dc, vc = dtm.shape
print("read {0} documents with {1} vocabulary".format(dc, vc))

In [None]:
# Build the LDA model

# Parameter n_components = number of topics to extract (if topics are too
# similar, extract more). random_state is fixed so reruns give the same topics.
simple_lda_model = LatentDirichletAllocation(n_components=10,
                                             max_iter=5,
                                             learning_method='batch',
                                             random_state=1)

# Fit the model and keep the transformed document-topic matrix.
# fit_transform() returns that matrix, whereas plain fit() only returns the
# estimator itself; simple_lda_model is fitted in place either way.
simple_lda_data = simple_lda_model.fit_transform(dtm)

# Extract the vocabulary terms; position i matches column i of dtm.
feature_names = vec.get_feature_names_out()

# Variable n_words: how many words do we want for each topic?
n_words = 50

In [None]:
# Print the n_words highest-weighted vocabulary terms for every topic
for topic_number, term_weights in enumerate(simple_lda_model.components_):
    print("Topic #%d:" % topic_number)
    # argsort is ascending, so the reversed slice takes the last n_words
    # indices starting from the largest weight.
    top_term_ids = term_weights.argsort()[:-n_words - 1:-1]
    topic_line = "".join(
        "{0} ".format(feature_names[term_id]) for term_id in top_term_ids
    )
    print(topic_line, end="")
    print("\n")

_Acknowledgements_: This notebook is heavily inspired by Jed Dobson's ["Cultural Analytics" course notebook](https://github.com/jeddobson/ENGL64.05-21F/blob/main/homework/Homework-03.ipynb).