In [18]:
import spacy
import re
from collections import Counter

In [19]:
FNAME = '../data/a_portrait_new.txt'

# Read the raw text line by line; the context manager closes the file handle
# as soon as the lines are in memory.
with open(FNAME, encoding='utf8') as source_file:
    input_file = source_file.readlines()

# Strip every character that is not a letter, digit, whitespace or one of the
# punctuation marks . , ! ? / : ; " ' (this removes symbols such as @, #, $).
# The upper-case range has to be A-Z: the range A-z also spans [ \ ] ^ _ ` in
# ASCII, so those symbols would be kept instead of removed.
pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
modded = [re.sub(pat, '', x.strip()) for x in input_file]
# drop the lines that are empty once they have been cleaned
modded = [x for x in modded if x != '']
joined_sentences = ' '.join(modded)

# load the cleaned sentences in spacy
nlp = spacy.load("en_core_web_sm")
start_data = nlp(joined_sentences)

In [20]:
# Create the initial data schema for the entities of type PERSON that we will
# at some point persist in a database:
#
#   mentions_sents[name] = {
#       'count': number of PERSON entities with exactly this text,
#       'sent':  {sentence text: {'additional': [], 'times': n}},
#   }
#
# 'times' is sent_string.count(name), i.e. a plain substring count of the
# name inside the sentence. 'additional' starts out empty.
mentions_sents = dict()

# entity names in first-seen order (same content as the keys of mentions_sents)
visited = []

for ent in start_data.ents:
    if ent.label_ != 'PERSON':
        continue

    name = ent.text
    sent_string = str(ent.sent)

    # Look the name up in the dict itself rather than scanning the `visited`
    # list, so the check stays constant-time however many names there are.
    record = mentions_sents.get(name)
    if record is None:
        visited.append(name)
        record = mentions_sents[name] = {'count': 0, 'sent': {}}

    record['count'] += 1
    # A second mention inside the same sentence overwrites the entry with an
    # equal value, so each sentence is stored once per entity.
    record['sent'][sent_string] = {'additional': [], 'times': sent_string.count(name)}

In [21]:
# Unique PERSON entity names, collected into a set for the lookup below
all_ner = {span.text for span in start_data.ents if span.label_ == 'PERSON'}

In [22]:
# For every sentence recorded for an entity, list the other PERSON names that
# also occur in that sentence.
#
# The list is assigned instead of appended to, so running this cell again does
# not pile up duplicate names. The names are visited in sorted order because
# the iteration order of a set of strings can change between kernel restarts.
#
# NOTE: `name in sentence` is a plain substring test.
other_names = sorted(all_ner)
for ner, elements in mentions_sents.items():
    for sentence, details in elements['sent'].items():
        details['additional'] = [
            name for name in other_names
            if name != ner and name in sentence
        ]

In [23]:
# Tally every PERSON mention; Counter.most_common gives us the entity with the
# highest count later on
items = [span.text for span in start_data.ents if span.label_ == 'PERSON']
cter = Counter()
cter.update(items)

In [24]:
# Bounds of the ranking scale: the lower bound is fixed at one mention, the
# upper bound is the highest mention count in the counter.
min_val = 1
# max() with a default keeps this cell from raising IndexError when the counter
# is empty (most_common(1) would then return an empty list).
max_val = max(cter.values(), default=min_val)

# Rank an entity by where its mention count falls between the lower and the
# upper bound of the scale, based on a simple 33/33/33 split.
def calc_rank(count, lo=None, hi=None):
    """Return the rank of an entity for the given mention count.

    The count is converted to a percentage of the distance between the two
    bounds and mapped onto three tiers:

    * percentage <= 33        -> 3
    * 33 < percentage <= 66   -> 2
    * percentage > 66         -> 1

    Args:
        count: number of mentions of the entity.
        lo: lower bound of the scale; defaults to the module-level ``min_val``.
        hi: upper bound of the scale; defaults to the module-level ``max_val``.

    Returns:
        int: 1, 2 or 3.
    """
    lo = min_val if lo is None else lo
    hi = max_val if hi is None else hi

    spread = hi - lo
    if spread == 0:
        # Both bounds coincide, so there is no scale to divide by. Treat the
        # count as sitting at the bottom (0 %) instead of dividing by zero.
        percentage = 0
    else:
        percentage = ((count - lo) * 100) / spread

    if percentage <= 33:
        return 3
    if percentage <= 66:
        return 2
    return 1
    

In [25]:
# walk over the entity records and attach a rank to each one
def set_rank(mentions_sents):
    """Store calc_rank(record['count']) under 'rank' in every record.

    The mapping is modified in place; nothing is returned.
    """
    for record in mentions_sents.values():
        record['rank'] = calc_rank(record['count'])

In [26]:
# Apply the ranking in place: every entity record gains a 'rank' key
set_rank(mentions_sents)

In [27]:
# print one summary line (name, count, rank) per entity
for entity, record in mentions_sents.items():
    count, rank = record['count'], record['rank']
    print(f"Name: {entity} --> Count: {count} --> Rank: {rank}")

Name: James Joyce Chapter --> Count: 1 --> Rank: 3
Name: Betty Byrne --> Count: 1 --> Rank: 3
Name: Tralala --> Count: 3 --> Rank: 3
Name: Uncle Charles --> Count: 10 --> Rank: 3
Name: Dante --> Count: 47 --> Rank: 1
Name: Charles --> Count: 20 --> Rank: 2
Name: Michael Davitt --> Count: 2 --> Rank: 3
Name: Eileen --> Count: 6 --> Rank: 3
Name: Apologize --> Count: 3 --> Rank: 3
Name: Rody Kickham --> Count: 4 --> Rank: 3
Name: Nasty Roche --> Count: 9 --> Rank: 3
Name: Stephen Dedalus --> Count: 5 --> Rank: 3
Name: Jack Lawtons --> Count: 1 --> Rank: 3
Name: Hamilton Rowan --> Count: 2 --> Rank: 3
Name: Wolsey --> Count: 2 --> Rank: 3
Name: Arnall --> Count: 22 --> Rank: 2
Name: Simon Moonan --> Count: 9 --> Rank: 3
Name: Arnalls --> Count: 3 --> Rank: 3
Name: Jack Lawton --> Count: 6 --> Rank: 3
Name: Jimmy Magee --> Count: 2 --> Rank: 3
Name: Tullabeg --> Count: 1 --> Rank: 3
Name: Wells --> Count: 8 --> Rank: 3
Name: Casey --> Count: 35 --> Rank: 1
Name: Ill --> Count: 17 --> Rank: