In [1]:
! pip install funcy



In [2]:
! pip install gensim
! pip install tzdata
! pip install --no-dependencies pyLDAvis
! pip install wget



In [3]:
# imports
import requests
import spacy
from collections import defaultdict
import wget
from gensim import corpora, models
import pandas as pd
import pyLDAvis.gensim
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [4]:
# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")
# Turn off the 'ner' and 'parser' components. The call is the cell's last
# expression, so the list of disabled pipe names is echoed as the output.
nlp.disable_pipes('ner', 'parser')

['ner', 'parser']

### Topic Modeling HW

In [5]:
# Download the Project Gutenberg plain-text file (ebook 514 in the URL).
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops the notebook on an HTTP error instead of silently parsing an error page.
response = requests.get('https://www.gutenberg.org/cache/epub/514/pg514.txt', timeout=30)
response.raise_for_status()
text = response.text

In [6]:
# Locate the novel's opening sentence; its offset marks where the story starts.
text.find('“Christmas won’t be Christmas without any presents,”')

2554

In [7]:
# Locate the '*** END ' marker; its offset marks where the story is cut off.
text.find('*** END ')

1028735

In [8]:
# Derive the slice bounds from the text itself instead of pasting in the
# offsets printed above, so the cell keeps working if the download changes.
start = text.find('“Christmas won’t be Christmas without any presents,”')
end = text.find('*** END ', start)
# str.find returns -1 when a marker is absent; fail loudly rather than
# slicing from the wrong place.
if start == -1 or end == -1:
    raise ValueError('Could not locate the start/end markers in the downloaded text')

In [9]:
# Keep only the part of the download between the two offsets.
tale = text[start:end]

In [10]:
# Split wherever two CRLF line endings occur in a row (i.e. at a blank line);
# each resulting chunk is handled as one paragraph/document below.
tale_paras = tale.split('\r\n\r\n')

In [11]:
# Two separate, initially empty lists for the per-paragraph metadata columns.
author, title = [], []

In [12]:
# Add one constant author/title entry for every paragraph.
for para in tale_paras:
    author += ['Alcott']
    title += ['Little']

In [13]:
# Build the frame straight from the paragraphs. pandas broadcasts the constant
# author/title values to every row, so the result no longer depends on how
# many times the list-filling cell above has been run.
tale_df = pd.DataFrame({'author': 'Alcott', 'title': 'Little', 'text': tale_paras})

In [14]:
# Preview the first rows of the freshly built frame.
tale_df.head()

Unnamed: 0,author,title,text
0,Alcott,Little,“Christmas won’t be Christmas without any pres...
1,Alcott,Little,"“It’s so dreadful to be poor!” sighed Meg, loo..."
2,Alcott,Little,“I don’t think it’s fair for some girls to hav...
3,Alcott,Little,"“We’ve got Father and Mother, and each other,”..."
4,Alcott,Little,The four young faces on which the firelight sh...


In [15]:
def process_text(text):
    """Replace line breaks with spaces and lemmatize the text.

    The text is run through the notebook-level ``nlp`` pipeline. Stop words
    and tokens that are not purely alphabetic are dropped; the lemmas of the
    remaining tokens are lower-cased and joined with single spaces.

    Args:
        text: Raw text, possibly containing CRLF, LF or bare CR line breaks.

    Returns:
        A string of lower-cased lemmas separated by single spaces.
    """
    # Handle CRLF first so a Windows line ending becomes one space, then mop
    # up any remaining lone LF / CR. Previously only LF was replaced, which
    # left stray carriage returns in the text handed to the pipeline.
    for line_break in ('\r\n', '\n', '\r'):
        text = text.replace(line_break, ' ')
    doc = nlp(text)
    lemmas = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and token.is_alpha
    ]
    return ' '.join(lemmas)

In [16]:
# Lemmatize every paragraph and store the result in a new 'lemmas' column.
tale_df['lemmas'] = tale_df['text'].apply(process_text)

In [17]:
# Preview the frame again to check the new 'lemmas' column.
tale_df.head()

Unnamed: 0,author,title,text,lemmas
0,Alcott,Little,“Christmas won’t be Christmas without any pres...,christmas wo christmas present grumble jo lie rug
1,Alcott,Little,"“It’s so dreadful to be poor!” sighed Meg, loo...",dreadful poor sigh meg look old dress
2,Alcott,Little,“I don’t think it’s fair for some girls to hav...,think fair girl plenty pretty thing girl add l...
3,Alcott,Little,"“We’ve got Father and Mother, and each other,”...",get father mother say beth contentedly corner
4,Alcott,Little,The four young faces on which the firelight sh...,young face firelight shine brighten cheerful w...


In [18]:
# Boolean mask: True where the lemma string is longer than 25 characters
# (str.len() counts characters, not words).
length_filter = tale_df['lemmas'].str.len() > 25

In [19]:
# Keep only the rows that pass the length filter. The explicit .copy() makes
# the result an independent frame, so later column assignments on tale_df do
# not trigger pandas' SettingWithCopyWarning.
tale_df = tale_df[length_filter].copy()

In [20]:
# Preview the frame after the length filter has been applied.
tale_df.head()

Unnamed: 0,author,title,text,lemmas
0,Alcott,Little,“Christmas won’t be Christmas without any pres...,christmas wo christmas present grumble jo lie rug
1,Alcott,Little,"“It’s so dreadful to be poor!” sighed Meg, loo...",dreadful poor sigh meg look old dress
2,Alcott,Little,“I don’t think it’s fair for some girls to hav...,think fair girl plenty pretty thing girl add l...
3,Alcott,Little,"“We’ve got Father and Mother, and each other,”...",get father mother say beth contentedly corner
4,Alcott,Little,The four young faces on which the firelight sh...,young face firelight shine brighten cheerful w...


In [21]:
def remove_new_lines(text):
    """Return ``text`` with each line feed and carriage return turned into a space."""
    return text.replace('\n', ' ').replace('\r', ' ')

In [22]:
# Flatten line breaks in the raw paragraphs so each one is a single line.
tale_df['text'] = tale_df['text'].apply(remove_new_lines)

In [23]:
# Save our work: write the filtered frame to CSV, leaving out the index column.
tale_df.to_csv('littwomen_novel.csv', index=False)

In [25]:
# Reload the saved CSV into a fresh frame for the modelling steps.
file_name = 'littwomen_novel.csv'
df = pd.read_csv(file_name)

In [26]:
# One lemma string per row, as a plain Python list.
docs = df['lemmas'].to_list()

In [28]:
# Length of the first entry of docs (each entry comes from the 'lemmas' column).
len(docs[0])

49

In [29]:
# Lower-case each document and split it on whitespace into a list of tokens.
texts = [doc.lower().split() for doc in docs]

In [30]:
# Count how often each token occurs across the whole corpus.
frequency = defaultdict(int)
# The loop variable is deliberately not called `text`: at notebook level that
# name holds the downloaded novel, and a for-loop variable would overwrite it.
for doc_tokens in texts:
    for token in doc_tokens:
        frequency[token] += 1

In [31]:
# Drop every token that appears only once in the whole corpus.
texts = [
    [tok for tok in doc_tokens if frequency[tok] >= 2]
    for doc_tokens in texts
]

In [32]:
# Map each token to an integer id, then turn every document into a bag of words.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# LDA training is stochastic; fixing random_state makes the topics repeatable
# when the notebook is re-run from a fresh kernel.
lda_model = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, passes=50, random_state=42)
lda_model.print_topics()

[(0,
  '0.035*"laurie" + 0.031*"say" + 0.026*"jo" + 0.023*"look" + 0.020*"hand" + 0.019*"shake" + 0.018*"away" + 0.013*"eye" + 0.012*"bow" + 0.011*"begin"'),
 (1,
  '0.050*"say" + 0.031*"jo" + 0.028*"yes" + 0.020*"add" + 0.019*"oh" + 0.018*"laurie" + 0.015*"shall" + 0.013*"look" + 0.013*"amy" + 0.012*"sort"'),
 (2,
  '0.018*"old" + 0.017*"jo" + 0.014*"hand" + 0.012*"forget" + 0.012*"look" + 0.010*"gentleman" + 0.010*"take" + 0.008*"laugh" + 0.008*"man" + 0.008*"like"'),
 (3,
  '0.040*"jo" + 0.023*"beth" + 0.023*"come" + 0.015*"look" + 0.014*"face" + 0.014*"amy" + 0.013*"say" + 0.013*"laurie" + 0.011*"go" + 0.011*"away"'),
 (4,
  '0.018*"meg" + 0.014*"little" + 0.013*"go" + 0.012*"dress" + 0.011*"time" + 0.010*"amy" + 0.008*"get" + 0.008*"look" + 0.007*"laugh" + 0.007*"white"'),
 (5,
  '0.024*"day" + 0.017*"sit" + 0.013*"work" + 0.012*"long" + 0.011*"little" + 0.008*"year" + 0.008*"sweet" + 0.008*"play" + 0.008*"amy" + 0.008*"give"'),
 (6,
  '0.015*"good" + 0.014*"man" + 0.013*"old" + 0

In [35]:
# Topic mixture of the first document, shown as (topic_id, probability) pairs.
lda_model.get_document_topics(corpus[0])

[(3, 0.5118977), (13, 0.3880128)]

In [36]:
# Prepare the pyLDAvis view of the bag-of-words model and display it as the
# cell's output.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [37]:
## TF-IDF topic model
# Re-weight the bag-of-words corpus with TF-IDF, then fit a second LDA model.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
# random_state pins the otherwise stochastic training so re-runs give the same topics.
lda_model_tfidf = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=20, passes=50, random_state=42)
corpus_lda = lda_model_tfidf[corpus_tfidf]
lda_model_tfidf.print_topics()

[(0,
  '0.008*"silk" + 0.007*"somewhat" + 0.007*"lace" + 0.007*"tall" + 0.007*"improve" + 0.007*"likewise" + 0.006*"tuck" + 0.006*"lend" + 0.006*"tree" + 0.006*"flatter"'),
 (1,
  '0.008*"embrace" + 0.007*"implore" + 0.007*"mixture" + 0.007*"allude" + 0.007*"correct" + 0.007*"merriment" + 0.007*"vent" + 0.006*"manuscript" + 0.006*"forlorn" + 0.006*"impertinent"'),
 (2,
  '0.007*"fail" + 0.006*"lovely" + 0.006*"pleasant" + 0.006*"usual" + 0.006*"place" + 0.005*"drive" + 0.005*"month" + 0.005*"letter" + 0.005*"hang" + 0.005*"tongue"'),
 (3,
  '0.008*"laurie" + 0.008*"ask" + 0.008*"jo" + 0.007*"say" + 0.006*"look" + 0.006*"stop" + 0.006*"laugh" + 0.006*"amy" + 0.006*"think" + 0.006*"smile"'),
 (4,
  '0.008*"number" + 0.007*"arrangement" + 0.007*"particle" + 0.007*"rash" + 0.007*"fist" + 0.006*"defend" + 0.006*"blind" + 0.006*"worn" + 0.006*"ghost" + 0.006*"amusement"'),
 (5,
  '0.013*"lamb" + 0.010*"suggest" + 0.010*"attempt" + 0.008*"immensely" + 0.007*"blunt" + 0.007*"meaning" + 0.007*"

In [38]:
# Topic mixture of the first TF-IDF-weighted document, as (topic_id, probability) pairs.
lda_model_tfidf.get_document_topics(corpus_tfidf[0])

[(0, 0.014893395),
 (1, 0.014893395),
 (2, 0.014893395),
 (3, 0.014893395),
 (4, 0.014893395),
 (5, 0.014893395),
 (6, 0.014893395),
 (7, 0.014893395),
 (8, 0.014893395),
 (9, 0.14615186),
 (10, 0.014893395),
 (11, 0.45020485),
 (12, 0.014893395),
 (13, 0.014893395),
 (14, 0.15045558),
 (15, 0.014893395),
 (16, 0.014893395),
 (17, 0.014893395),
 (18, 0.014893395),
 (19, 0.014893395)]

In [39]:
# Prepare the pyLDAvis view of the TF-IDF model and display it as the cell's
# output. This rebinds `vis`, replacing the earlier visualisation object.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, dictionary)
vis