In [1]:
# Install the packages listed in requirements.txt; %pip (rather than !pip) targets the running kernel's environment.
%pip install -r requirements.txt

Collecting en-core-web-sm@ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889 (from -r requirements.txt (line 10))
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
Note: you may need to restart the kernel to use updated packages.


### Import necessary libraries

In [2]:
import spacy 
import seaborn 
import pandas as pd

### Loading English language model

In [3]:
# Module-level spaCy pipeline; characters_in_text and character_tracker_NER call `nlp` when no pre-parsed doc is passed in.
nlp = spacy.load("en_core_web_sm")

## Text Analysis

# Character Identification Solutions

### Part of Speech (POS) Tagging

Using POS tagging, I can identify the proper nouns in the text (tokens tagged `NNP`). This method may not be as accurate as NER, but it gives me a list of candidate character names.

In [4]:
def proper_nouns(tagged_text):
    """
    Extract lowercase proper-noun names from a POS-tagged token sequence.

    tagged_text: indexable sequence of (word, tag) items, where item[0] is the
                 word string and item[1] is its tag.
    returns a list of lowercase strings, in order of appearance.

    Only the exact tag 'NNP' is treated as a proper noun. Two adjacent 'NNP'
    tokens are joined with a single space into one entry (e.g. "white rabbit").
    Pairing is done left to right, so three 'NNP' tokens in a row yield one
    pair followed by one single-word entry.
    """
    names = []
    last_index = len(tagged_text) - 1
    skip_next = False

    for position, item in enumerate(tagged_text):
        # The previous token already consumed this one as the second half of a pair.
        if skip_next:
            skip_next = False
            continue

        if item[1] != 'NNP':
            continue

        word = item[0].lower()
        if position < last_index and tagged_text[position + 1][1] == 'NNP':
            names.append(word + ' ' + tagged_text[position + 1][0].lower())
            skip_next = True
        else:
            names.append(word)

    return names

def characters_in_text(file, doc=None):
    """
    Return candidate character names found via POS tags.

    file: path of the text file to read; only opened when `doc` is None.
    doc: optional spaCy doc object; when given, the file is not read.
    returns the list built by `proper_nouns` from every token's
    (`.text`, `.tag_`) pair in the doc.
    """
    if doc is None:
        with open(file, 'r') as handle:
            raw_text = handle.read()
        # Parse with the module-level spaCy pipeline.
        doc = nlp(raw_text)

    tagged_tokens = [(token.text, token.tag_) for token in doc]
    return proper_nouns(tagged_tokens)

In [5]:
# Run the POS-based extraction on the Alice text; as the cell's last expression, the returned list is displayed in full.
characters_in_text('texts/alice.txt')

['alice',
 'wonderland',
 'alice',
 'wonderland',
 'lewis carroll',
 'millennium fulcrum',
 'edition',
 'rabbit',
 'hole',
 'alice',
 'alice',
 'white',
 'rabbit',
 'alice',
 'rabbit',
 'rabbit',
 'watch',
 'alice',
 'alice',
 'alice',
 'orange marmalade',
 'alice',
 'think--',
 'alice',
 '--yes',
 'longitude',
 'alice',
 'latitude',
 'longitude',
 "ma'am",
 'new zealand',
 'australia',
 'alice',
 "dinah'll",
 'dinah',
 'dinah',
 'alice',
 'dinah',
 'dinah',
 'alice',
 'white rabbit',
 'alice',
 'rabbit',
 'alice',
 'alice',
 'alice',
 'dark',
 'alice',
 'alice',
 'alice',
 'alice',
 'alice',
 'alice',
 'alice',
 'alice',
 'alice',
 'alice',
 'alice',
 'alice',
 'chapter ii',
 'pool',
 'curiouser',
 'alice',
 'english',
 'good',
 'alice',
 'christmas',
 'alice',
 'foot',
 'esq',
 'hearthrug',
 'near the',
 'fender',
 "alice 's",
 'love',
 'poor alice',
 'alice',
 'white rabbit',
 'duchess',
 'duchess',
 '!',
 'alice',
 'rabbit',
 'rabbit',
 'alice',
 'dear',
 'that',
 'ada',
 'mabel',


### Named Entity Recognition (NER)

Using NER, I can identify the entities in the text. Identifying the entity as a person could help me to identify the characters in the text. But in some instances where the character is not a person, this method may not work. 

For example, in *Alice in Wonderland*, the Lobster Quadrille is a dance that Alice attends, not a person.

In [6]:
def character_tracker_NER(file: str, doc=None) -> dict:
    """
    Collect frequently mentioned PERSON entities from a text.

    file: str
        Path of the text file to read. Only opened when `doc` is None.
    doc: spacy doc object
        Optional pre-parsed document. When given, the file is not read and
        the raw text used for counting is taken from `doc.text`.
    returns a dictionary keyed by entity text. Each value is a dictionary with
        "count":   how many times the entity text occurs in the raw text
                   (a plain substring count via `str.count`), and
        "context": a list holding the sentence text of each PERSON mention
                   of that entity, in document order.

    Only entities labelled "PERSON" whose text occurs more than 5 times in
    the raw text are kept.
    """
    if doc is None:
        with open(file, "r") as f:
            text = f.read()
        # Parse after the file is closed; the handle is not needed while spaCy runs.
        doc = nlp(text)
    else:
        # Without this branch a caller-supplied doc left `text` unbound
        # (NameError, or a stale notebook global being counted instead).
        text = doc.text

    # Substring count per distinct entity text, computed once instead of
    # rescanning the whole text for every mention.
    occurrences = {}
    characters = {}

    for entity in doc.ents:
        if entity.label_ != "PERSON":
            continue

        name = entity.text
        if name not in occurrences:
            occurrences[name] = text.count(name)

        # entity need to appear more than 5 times to be classed a character
        # source: Esner et al. (2017) "Character Identification in Novels Using Named Entity Recognition and Similarity Measures"
        if occurrences[name] <= 5:
            continue

        if name not in characters:
            characters[name] = {
                "count": occurrences[name],
                "context": [entity.sent.text]
            }
        else:
            characters[name]["context"].append(entity.sent.text)
    return characters

In [7]:
# Run the NER-based tracker on the Alice text and report the distinct names it found.
characters = character_tracker_NER("texts/alice.txt")
# Dict keys are already unique, so len(characters) equals the number of distinct names.
# NOTE(review): 21 is presumably the expected character count for this text; confirm and give it a named constant.
if len(characters) == 21:
    print("Success!", len(characters), "unique characters found.")
print("Characters found:", set(characters), len(characters))

Characters found: set() 0
