In [1]:
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import spacy
from spacy import displacy

In [2]:
# Load the spaCy pipeline named "en_core_web_sm"; `nlp` is what later parses
# the book text into a document.
nlp = spacy.load("en_core_web_sm")

## Load the books and parse one with the spaCy pipeline

In [3]:
# Scan directory and load books
# The context manager closes the scandir iterator as soon as the listing is
# built, and the suffix check avoids picking up names that merely contain
# ".txt" somewhere in the middle (e.g. "notes.txt.bak").
# NOTE: os.scandir yields entries in arbitrary order, so the positions inside
# books_load are not guaranteed to be the same on every machine.
with os.scandir("data") as data_entries:
    books_load = [book for book in data_entries if book.name.endswith(".txt")]

In [4]:
# Confirm that the books are loaded correctly: show the directory entries
# collected in books_load (only the entries are listed; no file is read here)
books_load

[<DirEntry 'II - Times of Contempt.txt'>,
 <DirEntry 'III - Baptism of Fire.txt'>,
 <DirEntry 'V - The Lady of the Lake.txt'>,
 <DirEntry 'C - The Last Wish.txt'>,
 <DirEntry 'IV - The Tower of the Swallow.txt'>,
 <DirEntry 'I - Blood of Elves.txt'>,
 <DirEntry 'E - something ends something begins.txt'>,
 <DirEntry 'B - The Sword of Destiny.txt'>]

In [8]:
# Read one book and run it through the spaCy pipeline
# NOTE: books_load keeps the order produced by the directory scan, so which
# title index 1 selects depends on that order.
book = books_load[1]
# Use a context manager so the file handle is closed right after reading, and
# name the encoding explicitly instead of relying on the platform default
# (assumes the book files are UTF-8 encoded — adjust if they are not).
with open(book, encoding="utf-8") as book_file:
    text = book_file.read()
doc = nlp(text)

In [None]:
# Visualize the named entities
# Only the slice doc[0:2000] (the first 2000 tokens) is rendered, presumably to
# keep the output manageable; style="ent" selects the entity highlighter and
# jupyter=True asks displacy to render inside the notebook.
displacy.render(doc[0:2000], style="ent", jupyter=True)

## Load character names

In [None]:
# Character lookup table, read from a path relative to the working directory.
# Its `character` and `first_name` columns are the ones filter_entity checks
# entity strings against.
characters_df = pd.read_csv("characters.csv")

## Create lists of named entities for each sentence

In [None]:
# Build one record per sentence: the sentence span itself plus the text of
# every named entity found in it. The records live under their own name so
# that sentence_entity_df is only ever bound to a dataframe.
sentence_entity_records = [
    {'sentence': sentence, 'entities': [ent.text for ent in sentence.ents]}
    for sentence in doc.sents
]

# Convert the records to a dataframe. Naming the columns explicitly keeps the
# expected 'sentence' / 'entities' schema even when the document yields no
# sentences, so later column lookups do not fail on an empty frame.
sentence_entity_df = pd.DataFrame(sentence_entity_records, columns=['sentence', 'entities'])

In [None]:
# Function to filter out the non-character entities from a list of entities
def filter_entity(ent_list, characters_df):
    """Return the entries of ``ent_list`` that are known character names.

    An entity is kept only when it is exactly equal to a value in
    ``characters_df.character`` or ``characters_df.first_name``.

    Args:
        ent_list: Iterable of entity strings (for example the ``entities``
            value of one sentence).
        characters_df: Object exposing ``character`` and ``first_name``
            columns holding the accepted names.

    Returns:
        list: The matching entities, in their original order and with
        duplicates preserved.
    """
    # Build the lookup once per call rather than rebuilding two lists for
    # every single entity; set membership is also constant-time.
    known_names = set(characters_df.character) | set(characters_df.first_name)
    return [ent for ent in ent_list if ent in known_names]

In [None]:
# Run filter_entity over every sentence's entity list so that only entities
# matching a known character are kept
character_entities = sentence_entity_df['entities'].apply(filter_entity, args=(characters_df,))
sentence_entity_df['character_entities'] = character_entities

# Keep only the sentences that mention at least one character
has_characters = sentence_entity_df['character_entities'].map(len).gt(0)
sentence_entity_df = sentence_entity_df.loc[has_characters]