In [17]:
import re
import spacy
import requests
from bs4 import BeautifulSoup
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [18]:
url = "https://www.gutenberg.org/cache/epub/345/pg345.txt"

In [19]:
raw_text = requests.get(url).text

In [20]:
import string

def remove_punctuation(input_string):
    # Make a translation table that maps all punctuation characters to None
    translator = str.maketrans("", "", string.punctuation)

    # Apply the translation table to the input string
    result = input_string.translate(translator)

    return result

In [21]:
def preprocess_text(raw_text):
    response_pretty = BeautifulSoup(raw_text, 'html.parser')
    page_text = response_pretty.get_text()
    page_text = re.sub(r'[^\x00-\x7F]+', '', page_text)
    page_text = page_text.replace("\r\n", " ").replace("'s", " ")
    # page_text = page_text.replace("'s", " ")
    # " ".join(page_text.split())
    page_text = re.sub(r'\s+', ' ', page_text)

    return remove_punctuation(page_text) 

In [22]:
words = word_tokenize(raw_text)

# Get the list of English stop words
stop_words = set(stopwords.words('english'))

# Remove stop words from the text
filtered_text = [word for word in words if word.lower() not in stop_words]

# Reconstruct the text without stop words
filtered_text = ' '.join(filtered_text)


In [6]:
preprocessed_text = preprocess_text(raw_text)

In [7]:
preprocessed_text



In [23]:
filtered_text



In [8]:
# !python -m spacy download en_core_web_md

In [24]:
nlp = spacy.load("en_core_web_md")

In [10]:
# lemmatizer = nlp.get_pipe("lemmatizer")

In [11]:
book = nlp(preprocessed_text)

In [25]:
f_book = nlp(filtered_text)

In [12]:
labels = [x.label_ for x in book.ents]
Counter(labels) # need to focus on PERSON and GPE labels

Counter({'PERSON': 2133,
         'CARDINAL': 575,
         'TIME': 496,
         'ORG': 478,
         'DATE': 445,
         'GPE': 352,
         'ORDINAL': 164,
         'NORP': 148,
         'LOC': 112,
         'FAC': 73,
         'PRODUCT': 36,
         'WORK_OF_ART': 24,
         'QUANTITY': 16,
         'LAW': 13,
         'LANGUAGE': 12,
         'EVENT': 5,
         'MONEY': 1})

In [26]:
f_labels = [x.label_ for x in f_book.ents]
Counter(f_labels) 

Counter({'PERSON': 2195,
         'CARDINAL': 630,
         'DATE': 488,
         'ORG': 396,
         'TIME': 372,
         'GPE': 337,
         'ORDINAL': 162,
         'NORP': 117,
         'LOC': 76,
         'WORK_OF_ART': 67,
         'FAC': 42,
         'QUANTITY': 16,
         'LANGUAGE': 15,
         'PRODUCT': 12,
         'LAW': 7,
         'MONEY': 6,
         'EVENT': 2,
         'PERCENT': 1})

In [13]:
book.ents[1].text, book.ents[1].label_, book.ents[1].start, book.ents[1].end

('Dracula', 'PERSON', 5, 6)

In [14]:
for token in book[:10]:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)


The the DET DT det Xxx True True
Project Project PROPN NNP compound Xxxxx True False
Gutenberg Gutenberg PROPN NNP compound Xxxxx True False
eBook eBook PROPN NNP ROOT xXxxx True False
of of ADP IN prep xx True True
Dracula Dracula PROPN NNP pobj Xxxxx True False
This this DET DT det Xxxx True True
ebook ebook NOUN NN nsubj xxxx True False
is be AUX VBZ ROOT xx True True
for for ADP IN prep xxx True True


In [15]:
items = [x.text for x in book.ents]
Counter(items).most_common(30)

[('Van Helsing', 257),
 ('Lucy', 195),
 ('one', 151),
 ('Jonathan', 137),
 ('Mina', 136),
 ('first', 117),
 ('Arthur', 112),
 ('two', 104),
 ('tonight', 72),
 ('Lucys', 72),
 ('Godalming', 72),
 ('Seward', 64),
 ('Harker', 63),
 ('London', 57),
 ('Project Gutenberg', 52),
 ('John', 51),
 ('Quincey', 46),
 ('today', 44),
 ('Morris', 42),
 ('Renfield', 38),
 ('Sewards Diary', 35),
 ('three', 32),
 ('Whitby', 30),
 ('One', 29),
 ('tomorrow', 27),
 ('last night', 26),
 ('yesterday', 23),
 ('Quincey Morris', 23),
 ('UnDead', 23),
 ('Mrs Harker', 23)]

In [27]:
f_items = [x.text for x in f_book.ents]
Counter(f_items).most_common(30)

[('Van Helsing', 297),
 ('Lucy', 276),
 ('one', 244),
 ('Mina', 168),
 ('Jonathan', 158),
 ('Arthur', 127),
 ('first', 117),
 ('Harker', 106),
 ('two', 104),
 ('Seward', 72),
 ('Godalming', 64),
 ('London', 61),
 ('night', 61),
 ('John', 54),
 ('Renfield', 46),
 ('Morris', 44),
 ('morning', 43),
 ('half', 42),
 ('morrow', 41),
 ('Quincey', 40),
 ('Project Gutenberg™', 40),
 ('three', 35),
 ('One', 33),
 ('Un-Dead', 33),
 ('last night', 31),
 ('Whitby', 29),
 ('Seward ’ Diary', 27),
 ('yesterday', 25),
 ('Dracula', 24),
 ('Carfax', 23)]

In [16]:
book.ents[21]

Sewards Diary

In [136]:
def get_person_counts(doc):
    counts = {}

    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            if ent.text not in counts.keys():
                counts[ent.text] = {}
                counts[ent.text]['count'] = 1
                counts[ent.text]['position'] = [(ent.start, ent.end)]
            else:
                counts[ent.text]['count'] += 1
                counts[ent.text]['position'].append((ent.start, ent.end))    

    return sorted(counts.items(), key=lambda x: x[1]['count'], reverse=True)


In [137]:
person_counts = get_person_counts(book)

In [16]:
preprocessed_text.split()[:10]

['The',
 'Project',
 'Gutenberg',
 'eBook',
 'of',
 'Dracula',
 'This',
 'ebook',
 'is',
 'for']

In [142]:
person_counts[8]

('Seward',
 {'count': 65,
  'position': [(29168, 29169),
   (54746, 54747),
   (55048, 55049),
   (55090, 55091),
   (56121, 56122),
   (56364, 56365),
   (58594, 58595),
   (62997, 62998),
   (64194, 64195),
   (70402, 70403),
   (73513, 73514),
   (102558, 102559),
   (107848, 107849),
   (108689, 108690),
   (109813, 109814),
   (109947, 109948),
   (110252, 110253),
   (110350, 110351),
   (112462, 112463),
   (114005, 114006),
   (114357, 114358),
   (114462, 114463),
   (114491, 114492),
   (114646, 114647),
   (116112, 116113),
   (116154, 116155),
   (116169, 116170),
   (117064, 117065),
   (120372, 120373),
   (120587, 120588),
   (120656, 120657),
   (121046, 121047),
   (121577, 121578),
   (121954, 121955),
   (122295, 122296),
   (122950, 122951),
   (124201, 124202),
   (127979, 127980),
   (128342, 128343),
   (129556, 129557),
   (132944, 132945),
   (134209, 134210),
   (139713, 139714),
   (140310, 140311),
   (142505, 142506),
   (142673, 142674),
   (142695, 142696

In [143]:
def get_location_counts(person_counts, preprocessed_text, nlp):

    for entry in person_counts:
        entry[1]['associated_places'] = {}
        for pos in entry[1]['position']:
            start, end = pos
            section_of_text = preprocessed_text[start-100:end+100]
            doc = nlp(section_of_text)

            for ent in doc.ents:
                if ent.label_ == 'GPE':
                    if ent.text not in entry[1]['associated_places'].keys():
                        entry[1]['associated_places'][ent.text] = 1
                    else:
                        entry[1]['associated_places'][ent.text] += 1
    
    return person_counts

In [144]:
full_counts = get_location_counts(person_counts, preprocessed_text, nlp)

In [148]:
full_counts[21]

('Harkers',
 {'count': 14,
  'position': [(107567, 107568),
   (109182, 109183),
   (114773, 114774),
   (115376, 115377),
   (138936, 138937),
   (140978, 140979),
   (150163, 150164),
   (159105, 159106),
   (159235, 159236),
   (159391, 159392),
   (164539, 164540),
   (166169, 166170),
   (166531, 166532),
   (171281, 171282)],
  'associated_places': {'Korea': 1}})

In [149]:
full_counts[21]

('Harkers',
 {'count': 14,
  'position': [(107567, 107568),
   (109182, 109183),
   (114773, 114774),
   (115376, 115377),
   (138936, 138937),
   (140978, 140979),
   (150163, 150164),
   (159105, 159106),
   (159235, 159236),
   (159391, 159392),
   (164539, 164540),
   (166169, 166170),
   (166531, 166532),
   (171281, 171282)],
  'associated_places': {'Korea': 1}})

In [166]:
def order_associated_places(full_counts):
    for person in full_counts:
        person["associated_places"] = sorted(person["associated_places"], key=lambda x: x["count"], reverse=True)

    return full_counts

def format_list(full_counts):
    people = []

    for i, entry in enumerate(full_counts):
        people.append({})
        people[i]["name"] = entry[0]
        people[i]["count"] = entry[1]['count']
        people[i]["associated_places"] = []

        for j, entry_ in enumerate(entry[1]['associated_places'].items()):
            place = {}
            k, v = entry_
            place['name'] = k
            place['count'] = v
            people[i]["associated_places"].append(place)
    people = order_associated_places(people)
    return people
        

In [164]:
for i, entry in enumerate(full_counts[21][1]['associated_places'].items()):
    print(entry)

('Korea', 1)


In [167]:
format_list(full_counts)

[{'name': 'Van Helsing',
  'count': 279,
  'associated_places': [{'name': 'London', 'count': 4},
   {'name': 'Whitby', 'count': 3},
   {'name': 'ay', 'count': 2},
   {'name': 'Yorkshire', 'count': 1},
   {'name': 'Transylvania', 'count': 1},
   {'name': 'Iceland', 'count': 1},
   {'name': 'Turkey', 'count': 1},
   {'name': 'Newcastle', 'count': 1},
   {'name': 'Durham', 'count': 1},
   {'name': 'Harwich', 'count': 1},
   {'name': 'Dover', 'count': 1},
   {'name': 'Titicaca', 'count': 1},
   {'name': 'York', 'count': 1},
   {'name': 'Leeds', 'count': 1},
   {'name': 'Duns', 'count': 1},
   {'name': 'rrives', 'count': 1},
   {'name': 'Scarborough', 'count': 1},
   {'name': 'ger', 'count': 1}]},
 {'name': 'Lucy',
  'count': 198,
  'associated_places': [{'name': 'London', 'count': 8},
   {'name': 'England', 'count': 3},
   {'name': 'Transylvania', 'count': 2},
   {'name': 'Carfax', 'count': 1},
   {'name': 'Turkey', 'count': 1},
   {'name': 'tunn', 'count': 1},
   {'name': 'st', 'count': 1