### set up

In [1]:
# imports
import requests
import pandas as pd
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")
nlp.disable_pipes('ner', 'parser')

['ner', 'parser']

### functions to get text, split text into paragraphs, organize paragraphs, and lemmatize text

In [24]:
# make a function for all the above
def get_text(url):
    response = requests.get(url)
    text = response.text
    return text

In [25]:
def divide_paras(text, start, end, para_break):
    text = text[start:end]
    paras = text.split(para_break)
    return paras

In [26]:
def make_df(author, title, paras):
    df = pd.DataFrame(paras, columns=['text'])
    df.insert(0, "author", author)
    df.insert(1, "title", title)
    return df

In [27]:
# get text
text = get_text('https://www.gutenberg.org/cache/epub/72452/pg72452.txt')

In [28]:
# set start index
start = text.find('IF the desire of literary fame were')

In [29]:
# sanity check
start

1941

In [31]:
# set end index
end = text.find('*** END OF THE PROJECT GUTENBERG EBOOK TRAVELS IN AFRICA, EGYPT, AND SYRIA FROM THE YEAR 1792 TO 1798 ***') - 1

In [33]:
#sanity check
end

817453

In [34]:
# set paragraph break
para_break = '\r\n\r\n'

In [41]:
# divide text into paragraphs
travels_paras = divide_paras(text=text, start=start, end=end, para_break='\r\n\r\n')

In [42]:
# make DataFrame
travels_df = make_df(author="Browns", title="Travels", paras=expectations_paras)

In [43]:
# sanity check
travels_df.head()

Unnamed: 0,author,title,text
0,Browns,Travels,IF the desire of literary fame were the chief ...
1,Browns,Travels,The retrospect on the events of his life which...
2,Browns,Travels,"But their descriptions, when given without the..."
3,Browns,Travels,"The writer is aware, that when the length of t..."
4,Browns,Travels,A more creative imagination would have drawn m...


In [44]:
# extract lemmas
travels_df['lemmas'] = travels_df['text'].apply(process_text)

### filter dataframes

In [45]:
# filter out strings shorter than 25 characters
length_filter = travels_df['lemmas'].str.len() > 25

In [46]:
filter_df = travels_df[length_filter]

In [47]:
filter_df.head()

Unnamed: 0,author,title,text,lemmas
0,Browns,Travels,IF the desire of literary fame were the chief ...,desire literary fame chief motive submit publi...
1,Browns,Travels,The retrospect on the events of his life which...,retrospect event life briefly mention ensue pa...
2,Browns,Travels,"But their descriptions, when given without the...",description give small appearance interested v...
3,Browns,Travels,"The writer is aware, that when the length of t...",writer aware length time pass dar fûr consider...
4,Browns,Travels,A more creative imagination would have drawn m...,creative imagination draw animated picture min...


In [48]:
def remove_new_lines(text):
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    return text

In [49]:
# apply above function, you can ignore the warning.
filter_df['text'] = filter_df['text'].apply(remove_new_lines)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_df['text'] = filter_df['text'].apply(remove_new_lines)


In [50]:
# save our work
filter_df.to_csv('browne_travels.csv', index=False)