In [1]:
# imports
import requests
import pandas as pd
import spacy

In [2]:
# Load the `en_core_web_sm` spaCy pipeline; process_text() below runs every
# paragraph through this object.
nlp = spacy.load("en_core_web_sm")
# Switch off the 'ner' and 'parser' components. This call is the cell's last
# expression, so its return value is what the notebook echoes underneath.
nlp.disable_pipes('ner', 'parser')

['ner', 'parser']

### Topic Modeling HW

In [4]:
# Download the plain-text file from Project Gutenberg.
# - timeout: without one, requests waits indefinitely on a stalled connection
#   and the cell never finishes.
# - raise_for_status(): stop here on an HTTP error (4xx/5xx) instead of
#   silently treating an error page as the text of the book.
response = requests.get('https://www.gutenberg.org/cache/epub/514/pg514.txt', timeout=30)
response.raise_for_status()
text = response.text

In [5]:
# Character offset of the novel's opening line within the download (str.find
# returns -1 when the substring is absent). The value shown is used as `start`.
text.find('“Christmas won’t be Christmas without any presents,”')

2554

In [6]:
# Character offset of the first '*** END ' marker (presumably the Project
# Gutenberg end-of-book banner); used to choose where the slice stops.
text.find('*** END ')

1028735

In [7]:
# Locate the body of the novel by searching for its boundary markers instead
# of pasting in offsets copied from an earlier run: hard-coded numbers go
# stale without any error as soon as the hosted file is revised.
start = text.find('“Christmas won’t be Christmas without any presents,”')
if start == -1:
    raise ValueError('Opening line of the novel was not found in the downloaded text')

# Search for the end marker only after the opening line. Slice ends are
# exclusive, so `end` can point straight at the marker without including it.
end = text.find('*** END ', start)
if end == -1:
    raise ValueError("End marker '*** END ' was not found in the downloaded text")

In [8]:
# Keep only the characters in the half-open range [start, end), dropping
# whatever precedes the opening line and whatever follows the end position.
tale = text[start:end]

In [9]:
# Break the text into chunks wherever two CRLF line endings occur back to
# back (i.e. at blank lines). Single line breaks inside a chunk are kept.
tale_paras = tale.split('\r\n\r\n')

In [10]:
# Two separate, empty lists that will hold per-paragraph metadata.
author, title = [], []

In [11]:
# One metadata entry per paragraph, so both lists line up row-for-row with
# tale_paras. Building the lists outright (rather than appending in a loop)
# keeps the cell idempotent: running it again no longer grows the lists.
author = ['Alcott'] * len(tale_paras)
title = ['Little'] * len(tale_paras)

In [12]:
# Assemble the frame: one (author, title, paragraph) row per paragraph.
tale_df = pd.DataFrame(
    data=[row for row in zip(author, title, tale_paras)],
    columns=['author', 'title', 'text'],
)

In [13]:
# Display the first rows of the freshly built frame as a quick sanity check.
tale_df.head()

Unnamed: 0,author,title,text
0,Alcott,Little,“Christmas won’t be Christmas without any pres...
1,Alcott,Little,"“It’s so dreadful to be poor!” sighed Meg, loo..."
2,Alcott,Little,“I don’t think it’s fair for some girls to hav...
3,Alcott,Little,"“We’ve got Father and Mother, and each other,”..."
4,Alcott,Little,The four young faces on which the firelight sh...


In [15]:
def process_text(text):
    """Lemmatize a passage and return its content words as one string.

    Line breaks are replaced with spaces and the result is run through the
    module-level spaCy pipeline ``nlp``. Tokens that are stop words, or that
    are not purely alphabetic (``token.is_alpha`` is false), are discarded.
    The remaining tokens are reduced to their lemmas and lower-cased.

    Args:
        text: Raw passage as a string; may contain LF, CR or CRLF line breaks.

    Returns:
        The lower-cased lemmas joined by single spaces, or an empty string
        when no token survives the filters.
    """
    # The notebook splits paragraphs on CRLF pairs, so the passages carry
    # '\r\n' line endings. Replacing only '\n' would leave stray '\r'
    # characters in the input; handle the CRLF pair first so it becomes one
    # space rather than two.
    text = text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
    doc = nlp(text)
    # Filter and normalize in a single pass instead of building a new list
    # for every step.
    lemmas_lower = [
        token.lemma_.lower()
        for token in doc
        if token.is_alpha and not token.is_stop
    ]
    return ' '.join(lemmas_lower)

In [16]:
# Add a 'lemmas' column: process_text applied to each paragraph in 'text'.
tale_df['lemmas'] = tale_df['text'].map(process_text)

In [17]:
# Display the first rows again to check the new 'lemmas' column.
tale_df.head()

Unnamed: 0,author,title,text,lemmas
0,Alcott,Little,“Christmas won’t be Christmas without any pres...,christmas wo christmas present grumble jo lie rug
1,Alcott,Little,"“It’s so dreadful to be poor!” sighed Meg, loo...",dreadful poor sigh meg look old dress
2,Alcott,Little,“I don’t think it’s fair for some girls to hav...,think fair girl plenty pretty thing girl add l...
3,Alcott,Little,"“We’ve got Father and Mother, and each other,”...",get father mother say beth contentedly corner
4,Alcott,Little,The four young faces on which the firelight sh...,young face firelight shine brighten cheerful w...


In [22]:
# Boolean mask: True where the lemma string is longer than 25 characters
# (the length is counted in characters, not words).
length_filter = tale_df['lemmas'].str.len().gt(25)

In [23]:
# Keep only the rows that pass the length filter. The explicit .copy() makes
# tale_df an independent frame, so assigning to one of its columns later on
# does not trigger pandas' SettingWithCopyWarning.
tale_df = tale_df[length_filter].copy()

In [24]:
# Display the first rows of the filtered frame.
tale_df.head()

Unnamed: 0,author,title,text,lemmas
0,Alcott,Little,“Christmas won’t be Christmas without any pres...,christmas wo christmas present grumble jo lie rug
1,Alcott,Little,"“It’s so dreadful to be poor!” sighed Meg, loo...",dreadful poor sigh meg look old dress
2,Alcott,Little,“I don’t think it’s fair for some girls to hav...,think fair girl plenty pretty thing girl add l...
3,Alcott,Little,"“We’ve got Father and Mother, and each other,”...",get father mother say beth contentedly corner
4,Alcott,Little,The four young faces on which the firelight sh...,young face firelight shine brighten cheerful w...


In [25]:
def remove_new_lines(text):
    """Return ``text`` with each line-feed and carriage-return replaced by a space.

    Characters are swapped one-for-one, so a CRLF pair turns into two spaces
    and the length of the string is unchanged.
    """
    line_break_to_space = {ord('\n'): ' ', ord('\r'): ' '}
    return text.translate(line_break_to_space)

In [26]:
# Overwrite 'text' with a version whose line breaks are replaced by spaces.
tale_df['text'] = tale_df['text'].map(remove_new_lines)

In [27]:
# Save our work: write the frame to 'littwomen_novel.csv' (a relative path,
# so it resolves against the current working directory). index=False leaves
# the row index out of the file.
tale_df.to_csv('littwomen_novel.csv', index=False)