In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
#nltk.download('wordnet')

In [21]:
def lemmatize_stemming(text):
    """Normalize one token: lemmatize it as a verb, then Porter-stem it.

    Args:
        text: The token to normalize.

    Returns:
        The result of ``PorterStemmer().stem`` applied to the verb lemma
        produced by ``WordNetLemmatizer().lemmatize(text, pos='v')``.
    """
    # NOTE(review): the lemmatizer presumably needs the NLTK 'wordnet' data
    # to be installed (see the commented-out nltk.download call) - confirm.
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return PorterStemmer().stem(verb_lemma)
def preprocess(text):
    """Turn one raw document into a list of normalized tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``; tokens that
    are gensim stop words or have 3 or fewer characters are dropped, and each
    remaining token is passed through ``lemmatize_stemming``.

    Args:
        text: The document text (``str`` or ``bytes``). Any other value, such
            as ``None`` or the float NaN pandas uses for an empty CSV cell,
            is treated as an empty document.

    Returns:
        A list of normalized tokens, in document order. Empty when ``text``
        is not textual or no token survives the filters.
    """
    # Guard against missing values: handing a float NaN / None to the
    # tokenizer fails while decoding and would abort a whole Series.map().
    if not isinstance(text, (str, bytes)):
        return []

    # Look the stop-word set up once rather than on every token.
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopwords and len(token) > 3
    ]

In [22]:
import pandas as pd
# Load the corpus sample into a DataFrame. The path is relative, so the CSV
# file should be in your working directory - the same folder where this
# Jupyter Notebook is saved.
documents = pd.read_csv('NativeVoicesCorpus-Sample.csv')
# Bare last expression: the notebook renders the frame as the cell output.
documents

Unnamed: 0,Document,Date,Text
0,ASP-v1-003,1789-08-22,"No. 3. [1st Session. THE SIX NATIONS, THE WYAN..."
1,ASP-v1-004,1789-08-22,No. 4. [1st Session. THE SOUTHERN TRIBES. COMM...
2,ASP-v1-005,1789-09-16,No. 5. [1st Session. WABASH. COMMUNICATED to T...
3,ASP-v1-006,1789-09-17,"No. 6. [1st Session. THE SIX NATIONS, THE WYAN..."
4,ASP-v1-007,1789-09-18,No. 7. 1st Session. INDIAN TREATIES. COMMUNICA...
5,ASP-v1-008,1790-01-11,No. 8. [2d Session. THE CREEKS. COMMUNICATED T...
6,ASP-v1-010,1790-08-04,No. 10. [2d Session. r THE CREEKS. COMMUNICATE...
7,ASP-v1-011,1790-08-06,No. 11. [2d Session. THE CREEKS. COMMUNICATED ...
8,ASP-v1-012,1790-08-07,No. 12. [2d Session. THE CREEKS. COMMUNICATED ...
9,ASP-v1-013,1790-08-11,No. 13. [2d Session. THE CHEROKEES. COMMUNICAT...


In [23]:
# Run preprocess() on every entry of the 'Text' column; the result holds one
# token list per document.
processed_docs = documents['Text'].map(preprocess)

In [25]:
# Display the per-document token lists to sanity-check the preprocessing.
processed_docs

0     [session, nation, wyandot, commun, senat, augu...
1     [session, southern, tribe, commun, senat, augu...
2     [session, wabash, commun, senat, septemb, gent...
3     [session, nation, wyandot, commun, senat, sept...
4     [session, indian, treati, commun, senat, septe...
5     [session, creek, commun, senat, januari, gentl...
6     [session, creek, commun, senat, august, gentle...
7     [session, creek, commun, senat, august, gentle...
8     [session, creek, commun, senat, august, gentle...
9     [session, cheroke, commun, senat, august, gent...
10    [session, northwestern, indian, commun, congre...
11    [session, indian, depred, commun, congress, ja...
12    [session, tuscarora, commun, hous, repres, mar...
13    [session, cheroke, nation, creek, commun, sena...
14    [session, wabash, indian, commun, congress, oc...
15    [session, cheroke, commun, senat, novemb, hawk...
16    [session, clair, defeat, indian, commun, congr...
17    [session, cheroke, commun, senat, januari,

In [26]:
# Build the gensim Dictionary from the tokenized documents.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Preview the first 11 entries (the loop stops once `count` exceeds 10).
count = 0
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

0 accept
1 accompani
2 advis
3 arthur
4 august
5 chippewa
6 claim
7 clair
8 committe
9 commun
10 conclud


In [27]:
# Prune the vocabulary. The call is made for its effect on `dictionary`
# itself (the return value is not used), so later cells see the filtered
# dictionary. Per gensim's documented parameter semantics: no_below is a
# minimum document count, no_above a maximum fraction of documents, and
# keep_n a cap on how many tokens are retained.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [29]:
# Convert every tokenized document to its bag-of-words form using the
# dictionary: one doc2bow() result per document, in corpus order.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
bow_corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 4),
  (7, 1),
  (8, 3),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 2),
  (17, 2)],
 [(0, 3),
  (2, 3),
  (3, 2),
  (4, 1),
  (5, 1),
  (7, 11),
  (8, 1),
  (9, 3),
  (10, 2),
  (11, 1),
  (13, 3),
  (15, 3),
  (16, 2),
  (18, 1),
  (19, 1),
  (20, 2),
  (21, 3),
  (22, 2),
  (23, 1),
  (24, 2),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 2),
  (29, 1),
  (30, 1),
  (31, 2),
  (32, 1),
  (33, 1),
  (34, 2),
  (35, 7),
  (36, 5),
  (37, 18),
  (38, 4),
  (39, 3),
  (40, 4),
  (41, 1),
  (42, 3),
  (43, 1),
  (44, 3),
  (45, 4),
  (46, 2),
  (47, 1),
  (48, 2),
  (49, 2),
  (50, 2),
  (51, 2),
  (52, 3),
  (53, 1),
  (54, 1),
  (55, 3),
  (56, 9),
  (57, 6),
  (58, 4),
  (59, 1),
  (60, 1),
  (61, 13),
  (62, 8),
  (63, 1),
  (64, 2),
  (65, 4),
  (66, 2),
  (67, 5),
  (68, 7),
  (69, 8),
  (70, 3),
  (71, 2),
  (72, 3),
  (73, 2),
  (74, 2),
  (75, 9),
  (76, 1),
  (77, 4),
  (78, 3),
  

In [30]:
from pprint import pprint
from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus, then wrap the corpus so
# each document is yielded with TF-IDF weights.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Pretty-print only the first transformed document, if the corpus has one.
first_doc = next(iter(corpus_tfidf), None)
if first_doc is not None:
    pprint(first_doc)

[(0, 0.1430591618758445),
 (1, 0.16843300997245966),
 (2, 0.16843300997245966),
 (3, 0.15950894854342976),
 (4, 0.18797455252645096),
 (5, 0.13544162330313914),
 (6, 0.5417664932125565),
 (7, 0.16843300997245966),
 (8, 0.40632486990941735),
 (9, 0.12123832964126031),
 (10, 0.151067510590586),
 (11, 0.13544162330313914),
 (12, 0.16843300997245966),
 (13, 0.18797455252645096),
 (14, 0.15950894854342976),
 (15, 0.10822007825473605),
 (16, 0.27088324660627827),
 (17, 0.3557964909939059)]


In [31]:
# Train a 10-topic LDA model on the bag-of-words corpus, using 2 passes over
# the corpus and 2 worker processes.
# random_state is pinned explicitly (same value as the notebook's NumPy seed)
# so that re-running this cell does not depend on how much of the global
# NumPy random stream earlier cells have already consumed.
# NOTE(review): with several workers, bit-for-bit identical results across
# runs may still not be guaranteed - confirm against the gensim docs.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=2018,
)

In [32]:
# Print every topic returned by the model: its index, then its weighted
# top-word string.
for topic_entry in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(*topic_entry))

Topic: 0 
Words: 0.037*"cheroke" + 0.025*"thousand" + 0.022*"white" + 0.017*"line" + 0.016*"articl" + 0.016*"town" + 0.015*"busi" + 0.015*"dollar" + 0.015*"stipul" + 0.013*"resid"
Topic: 1 
Words: 0.022*"offic" + 0.021*"john" + 0.015*"town" + 0.014*"frontier" + 0.014*"aforesaid" + 0.013*"captain" + 0.013*"letter" + 0.013*"command" + 0.013*"copi" + 0.013*"late"
Topic: 2 
Words: 0.026*"letter" + 0.025*"send" + 0.017*"fort" + 0.016*"return" + 0.016*"come" + 0.015*"copi" + 0.015*"town" + 0.014*"major" + 0.013*"affair" + 0.012*"georgia"
Topic: 3 
Words: 0.022*"arm" + 0.021*"manner" + 0.017*"month" + 0.017*"author" + 0.016*"town" + 0.016*"aforesaid" + 0.015*"forc" + 0.012*"ohio" + 0.012*"subject" + 0.012*"line"
Topic: 4 
Words: 0.048*"cheroke" + 0.025*"white" + 0.020*"georgia" + 0.019*"citizen" + 0.019*"letter" + 0.015*"line" + 0.015*"kill" + 0.014*"town" + 0.014*"frontier" + 0.012*"offic"
Topic: 5 
Words: 0.019*"march" + 0.017*"send" + 0.017*"offic" + 0.015*"letter" + 0.013*"cheroke" + 0.01