## Set up

In [3]:
! pip install funcy



In [4]:
! pip install tzdata



In [5]:
! pip install --no-dependencies pyLDAvis



In [6]:
! pip install wget
! pip install gensim




In [7]:
from collections import defaultdict
# NOTE(review): wget is not used by any cell visible in this section — confirm it is needed later.
import wget
from gensim import corpora, models
import pandas as pd
# NOTE(review): recent pyLDAvis releases appear to expose this adapter as
# `pyLDAvis.gensim_models`; confirm this import works with the unpinned version installed above.
import pyLDAvis.gensim
import warnings
# Hide DeprecationWarning messages so they do not clutter cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Load data

In [8]:
# `path` is a placeholder and is not used when reading: the CSV is resolved
# relative to the notebook's working directory.
path = ".../"
file_name = 'browne_travels.csv'
# Read through `file_name` so the file name is defined in exactly one place.
df = pd.read_csv(file_name)
# Preview the first rows (author, title, text, lemmas).
df.head()

Unnamed: 0,author,title,text,lemmas
0,Browns,Travels,IF the desire of literary fame were the chief ...,desire literary fame chief motive submit publi...
1,Browns,Travels,The retrospect on the events of his life which...,retrospect event life briefly mention ensue pa...
2,Browns,Travels,"But their descriptions, when given without the...",description give small appearance interested v...
3,Browns,Travels,"The writer is aware, that when the length of t...",writer aware length time pass dar fûr consider...
4,Browns,Travels,A more creative imagination would have drawn m...,creative imagination draw animated picture min...


### Prepare data for the topic model

In [9]:
# Extract the lemmatized text of every row as a plain Python list of strings.
# Missing values are replaced with an empty string (instead of being dropped) so
# that the tokenizer's `.lower()` call cannot fail on NaN and list positions
# still line up with the DataFrame rows.
documents = df['lemmas'].fillna('').to_list()

In [10]:
# Sanity check: length in characters of the first document's lemma string
# (this is not the number of documents).
len(documents[0])

330

In [11]:
# Tokenize: lowercase each document and split it on whitespace, producing one
# list of tokens per document (i.e. a list of lists).
texts = [doc.lower().split() for doc in documents]

In [12]:
# Tally how often each token occurs across the whole corpus.
frequency = defaultdict(int)
for token in (tok for tokens in texts for tok in tokens):
    frequency[token] += 1

In [13]:
# Keep only tokens whose corpus-wide count is greater than one.
# This rebinds `texts`, so re-running the counting cell afterwards would tally
# the already-filtered tokens.
texts = [
    list(filter(lambda tok: frequency[tok] > 1, tokens))
    for tokens in texts
]

### Build the topic model

In [14]:
# Build the vocabulary from the tokenized texts: the dictionary assigns every
# distinct token its own integer id.
dictionary = corpora.Dictionary(documents=texts)

In [15]:
# Convert every tokenized text to its bag-of-words form using the dictionary;
# the resulting list has one entry per document, in the same order as `texts`.
corpus = list(map(dictionary.doc2bow, texts))

In [16]:
# Build the LDA model.
# Hyperparameters are named so they are easy to find and tune.
NUM_TOPICS = 20     # number of topics to fit
PASSES = 50         # number of training passes over the corpus
RANDOM_STATE = 42   # fixed seed so the fitted topics are reproducible on re-run
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    passes=PASSES,
    random_state=RANDOM_STATE,
)

In [17]:
# explore topics: displays a list of (topic id, '<weight>*"<word>" + ...') pairs
lda_model.print_topics()

[(0,
  '0.009*"slave" + 0.008*"place" + 0.008*"egypt" + 0.006*"sea" + 0.006*"near" + 0.006*"country" + 0.005*"river" + 0.005*"far" + 0.005*"bring" + 0.005*"sell"'),
 (1,
  '0.007*"man" + 0.007*"great" + 0.006*"use" + 0.006*"place" + 0.006*"woman" + 0.005*"disease" + 0.005*"different" + 0.005*"see" + 0.005*"egypt" + 0.005*"people"'),
 (2,
  '0.013*"egypt" + 0.011*"africa" + 0.007*"little" + 0.007*"great" + 0.006*"abu" + 0.005*"place" + 0.005*"water" + 0.005*"east" + 0.005*"south" + 0.004*"man"'),
 (3,
  '0.010*"slave" + 0.009*"effect" + 0.009*"large" + 0.008*"eye" + 0.007*"day" + 0.006*"observe" + 0.006*"egypt" + 0.006*"country" + 0.005*"kahira" + 0.005*"small"'),
 (4,
  '0.015*"city" + 0.011*"wall" + 0.011*"small" + 0.010*"water" + 0.010*"antient" + 0.010*"remain" + 0.010*"river" + 0.009*"large" + 0.009*"town" + 0.008*"stone"'),
 (5,
  '0.020*"egypt" + 0.012*"day" + 0.009*"course" + 0.008*"red" + 0.008*"bring" + 0.008*"carry" + 0.006*"place" + 0.006*"suez" + 0.006*"generally" + 0.006*"

In [18]:
# Topic mixture of one document, shown as (topic id, weight) pairs.
doc_index = 67  # position of the document to inspect within `corpus`
lda_model.get_document_topics(corpus[doc_index])

[(5, 0.35222086), (8, 0.49777046)]

In [19]:
# Render the pyLDAvis view of the fitted model inline in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
vis