In [1]:
import os
import spacy
import transformers
import requests
from bs4 import BeautifulSoup
from collections import Counter, OrderedDict

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import StanfordNERTagger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def remove_stopwords(raw_text:str):
    words = word_tokenize(raw_text)
    stop_words = set(stopwords.words('english'))
    filtered_text = [word for word in words if word.lower() not in stop_words]
    filtered_text = ' '.join(filtered_text)

    return filtered_text

def get_persons_and_positions(results, text):
    person_and_positions = []

    start_position = 0
    end_position = 0

    for word, tag in results:
        if tag == 'PERSON':
            start_position = text.find(word, end_position)
            end_position = start_position + len(word)
            
            person_and_positions.append({'name': word, 'position':(start_position,end_position)})


    return person_and_positions

def get_person_counts(fullnames_and_positions):
    person_counts = {}
    for person in fullnames_and_positions:
        if person['name'] not in person_counts.keys():
            person_counts[person['name']] = {}
            person_counts[person['name']]['count'] = 1
            person_counts[person['name']]['position'] = [person['position']]
        else:
            person_counts[person['name']]['count'] += 1
            person_counts[person['name']]['position'].append(person['position'])

    return sorted(person_counts.items(), key=lambda x: x[1]['count'], reverse=True)

def get_fullnames(persons_and_positions):
    fullnames_and_positions = []
    i = 0
    while i < len(persons_and_positions):
        current = persons_and_positions[i]
        name = current['name']
        start, end = current['position']
        
        while i < len(persons_and_positions) - 1 and persons_and_positions[i + 1]['position'][0] - end == 1:
            i += 1
            next_item = persons_and_positions[i]
            name += " " + next_item['name']
            end = next_item['position'][1]
        
        fullnames_and_positions.append({'name': name, 'position': (start, end)})
    
        i += 1

    return fullnames_and_positions
    


def get_section_of_text(preprocessed_text,start, end):
    before_name = preprocessed_text[:start].split()[-100:]
    after_name = preprocessed_text[end:].split()[:101]

    return " ".join(before_name) + " ".join(after_name)

def get_associated_places_counts(person_counts:list, preprocessed_text:str, nlp):
    """
    Getting associated palaces for each instance of person and the counts
    """
    for entry in person_counts:
        entry[1]['associated_places'] = {}
        for pos in entry[1]['position']:
            start, end = pos
            section_of_text = get_section_of_text(preprocessed_text, start, end)
            doc = nlp(section_of_text)

            for ent in doc.ents:
                if ent.label_ == 'GPE':
                    if ent.text not in entry[1]['associated_places'].keys():
                        entry[1]['associated_places'][ent.text] = 1
                    else:
                        entry[1]['associated_places'][ent.text] += 1
    
    return person_counts

def order_associated_places(full_counts:list):
    """
    Sorting list of places by the number of times it appears with each person
    """
    for person in full_counts:
        person["associated_places"] = sorted(person["associated_places"], key=lambda x: x["count"], reverse=True)

    return full_counts

def format_list(full_counts:list):
    """
    Formatting list into the required form as stated in test instructions
    """
    people = []

    for i, entry in enumerate(full_counts):
        people.append({})
        people[i]["name"] = entry[0]
        people[i]["count"] = entry[1]['count']
        people[i]["associated_places"] = []

        for entry_ in entry[1]['associated_places'].items():
            place = {}
            k, v = entry_
            place['name'] = k
            place['count'] = v
            people[i]["associated_places"].append(place)
    people = order_associated_places(people)
    return people

In [3]:
url = "https://www.gutenberg.org/cache/epub/345/pg345.txt"
response = requests.get(url)
raw_text = response.text

In [4]:
filtered_text = remove_stopwords(raw_text)


In [5]:
java_path = r"C:/Program Files/Java/jdk-21/bin/java.exe"
os.environ['JAVAHOME'] = java_path

stanford_dir = os.path.abspath(os.path.join(os.getcwd(), 'stanford-ner-2020-11-17'))
jarfile = os.path.join(stanford_dir, 'stanford-ner.jar')
modelfile = os.path.join(stanford_dir, 'classifiers', 'english.all.3class.distsim.crf.ser.gz')
st = StanfordNERTagger(modelfile, jarfile)


In [6]:
tokenized_text_ = nltk.word_tokenize(filtered_text)
results = st.tag(tokenized_text_)

In [7]:
person_and_positions = get_persons_and_positions(results, filtered_text)

In [8]:
person_and_positions

[{'name': 'I.', 'position': (782, 784)},
 {'name': 'Jonathan', 'position': (785, 793)},
 {'name': 'Harker', 'position': (794, 800)},
 {'name': 'Jonathan', 'position': (824, 832)},
 {'name': 'Harker', 'position': (833, 839)},
 {'name': 'Jonathan', 'position': (864, 872)},
 {'name': 'Harker', 'position': (873, 879)},
 {'name': 'Jonathan', 'position': (903, 911)},
 {'name': 'Harker', 'position': (912, 918)},
 {'name': 'Mina', 'position': (953, 957)},
 {'name': 'Murray', 'position': (976, 982)},
 {'name': 'Mina', 'position': (1056, 1060)},
 {'name': 'Murray', 'position': (1061, 1067)},
 {'name': 'Mina', 'position': (1091, 1095)},
 {'name': 'Murray', 'position': (1096, 1102)},
 {'name': 'X.', 'position': (1121, 1123)},
 {'name': 'Mina', 'position': (1124, 1128)},
 {'name': 'Murray', 'position': (1129, 1135)},
 {'name': 'Lucy', 'position': (1159, 1163)},
 {'name': 'Westenra', 'position': (1164, 1172)},
 {'name': 'Seward', 'position': (1199, 1205)},
 {'name': 'Seward', 'position': (1233, 1239

In [9]:
fullnames_and_positions = get_fullnames(person_and_positions)

In [10]:
person_counts = get_person_counts(fullnames_and_positions)

In [11]:
person_counts

[('Van Helsing',
  {'count': 306,
   'position': [(1660, 1671),
    (158659, 158670),
    (158853, 158864),
    (160222, 160233),
    (160318, 160329),
    (161225, 161236),
    (165664, 165675),
    (165771, 165782),
    (165946, 165957),
    (166373, 166384),
    (166788, 166799),
    (169476, 169487),
    (169674, 169685),
    (169810, 169821),
    (170365, 170376),
    (171033, 171044),
    (171638, 171649),
    (171968, 171979),
    (172536, 172547),
    (172644, 172655),
    (173206, 173217),
    (173596, 173607),
    (174123, 174134),
    (174627, 174638),
    (178744, 178755),
    (179270, 179281),
    (179928, 179939),
    (181541, 181552),
    (182194, 182205),
    (182530, 182541),
    (182902, 182913),
    (183127, 183138),
    (183803, 183814),
    (185268, 185279),
    (185770, 185781),
    (186452, 186463),
    (186569, 186580),
    (187876, 187887),
    (188579, 188590),
    (189158, 189169),
    (189988, 189999),
    (190287, 190298),
    (190465, 190476),
    (198792,

In [12]:
nlp = spacy.load("en_core_web_md")

In [13]:
persons_and_associated_locations = get_associated_places_counts(person_counts, filtered_text, nlp)

In [14]:
final_list = format_list(persons_and_associated_locations)

In [15]:
final_list

[{'name': 'Van Helsing',
  'count': 306,
  'associated_places': [{'name': 'London', 'count': 37},
   {'name': 'Amsterdam', 'count': 22},
   {'name': 'Varna', 'count': 20},
   {'name': 'Carfax', 'count': 17},
   {'name': 'Seward', 'count': 13},
   {'name': 'Piccadilly', 'count': 8},
   {'name': 'Van Helsing', 'count': 7},
   {'name': 'Dardanelles', 'count': 6},
   {'name': 'Helsing', 'count': 5},
   {'name': 'America', 'count': 4},
   {'name': 'Haarlem', 'count': 3},
   {'name': 'Antwerp', 'count': 3},
   {'name': 'Sussex', 'count': 3},
   {'name': 'cabmen', 'count': 3},
   {'name': 'Scotland', 'count': 3},
   {'name': 'Bucharest', 'count': 3},
   {'name': 'Floridas', 'count': 2},
   {'name': 'ESCAPED', 'count': 2},
   {'name': 'Holland', 'count': 2},
   {'name': 'Texas', 'count': 2},
   {'name': 'Derby', 'count': 2},
   {'name': 'Transylvania', 'count': 2},
   {'name': 'Omne', 'count': 2},
   {'name': 'Kukri', 'count': 2},
   {'name': 'Whitby', 'count': 2},
   {'name': 'Vampire', 'coun

In [16]:
final_list[1]

{'name': 'Lucy',
 'count': 261,
 'associated_places': [{'name': 'London', 'count': 26},
  {'name': 'Amsterdam', 'count': 13},
  {'name': 'Helsing', 'count': 7},
  {'name': 'Miss.', 'count': 6},
  {'name': 'Van Helsing', 'count': 5},
  {'name': 'Seward', 'count': 5},
  {'name': 'Carfax', 'count': 4},
  {'name': 'America', 'count': 4},
  {'name': 'Scotland', 'count': 3},
  {'name': 'Floridas', 'count': 3},
  {'name': 'Sussex', 'count': 3},
  {'name': 'cabmen', 'count': 3},
  {'name': 'Hampstead', 'count': 3},
  {'name': 'Piccadilly', 'count': 2},
  {'name': 'Korea', 'count': 1},
  {'name': 'Esk', 'count': 1},
  {'name': 'Nuremberg', 'count': 1},
  {'name': 'next abbey', 'count': 1},
  {'name': 'West Lighthouse', 'count': 1},
  {'name': 'Hamburg', 'count': 1},
  {'name': 'Haarlem', 'count': 1},
  {'name': 'Antwerp', 'count': 1},
  {'name': 'Holland', 'count': 1},
  {'name': 'poorleft', 'count': 1},
  {'name': 'London Whitby', 'count': 1},
  {'name': 'sincefirst', 'count': 1},
  {'name': '