## Preparation

In [1]:
#!pip install spacy
#!pip install networkx
#!python -m spacy download en_core_web_sm # see https://spacy.io/usage/models
#!python -m spacy download en_core_web_trf
#!pip install pyvis
#!pip install python-louvain # provides the `community` package (community.community_louvain) imported below

In [2]:
import spacy
import pandas as pd
import os
import codecs
import re 
import numpy as np
#from spacy import displacy
import networkx as nx
from pyvis.network import Network
import community.community_louvain as cl
import math

#import matplotlib.pyplot as plt

In [3]:
# Load the small English spaCy pipeline; the entities it tags per sentence
# are what the notebook later matches against the character names.
NER = spacy.load("en_core_web_sm")
#NER2 = spacy.load("en_core_web_trf")

#pd.set_option('display.max_rows', None)

In [6]:
# flatten a list of lists
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed and the order of the items is kept.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

# Function to filter out non-character entities
def filter_entity(ent_list, character_df):
    """Map recognised entities to the full names of the characters they refer to.

    Parameters
    ----------
    ent_list : iterable of str
        Entity texts found in one sentence.
    character_df : pandas.DataFrame
        Needs the columns ``character``, ``character_firstname`` and
        ``character_secondname``.

    Returns
    -------
    list
        For every entity, in order, the ``character`` value of each row whose
        full, first or second name equals the entity exactly. Entities that
        match no row contribute nothing; an entity matching several rows
        contributes all of them.
    """
    full_names = character_df.character

    def matching_names(ent):
        # a row matches if any of its three name columns equals the entity
        is_match = (
            (full_names == ent)
            | (character_df.character_firstname == ent)
            | (character_df.character_secondname == ent)
        )
        return full_names[is_match]

    return flatten([matching_names(ent) for ent in ent_list])

## Load character names

In [7]:
# Read the character table and keep only the rows of the book analysed here
all_characters = pd.read_csv("malazan_characters.csv")
in_current_book = all_characters["book"] == "Gardens of the Moon"
character_df = all_characters.loc[in_current_book]

In [8]:
# remove titles only: these entries describe a role, not a named character
title_list = [
    'Agent in Genabaris',
    'Assassin High House Shadow',
    'Black Moranth patrol leader',
    'Captain High House Light',
    'Captain (Itko Kan)',
    "Dassem Ultor's daughter",
    'Herald of High House Death',
    'Knight High House Dark',
    'Mason High House Death',
    'Soldier High House Death',
    'Throne (Unaligned)',
    'Virgin High House Death'
]
# Keep every row whose character is not one of the titles. A single boolean
# mask replaces one full-frame drop() pass per title and leaves no loop
# variable behind in the notebook namespace.
character_df = character_df[~character_df.character.isin(title_list)]

In [9]:
# Clean the raw character names so they can be compared with spaCy entity texts.
# The .str accessor is used instead of apply(re.sub) so that missing names stay
# missing instead of raising a TypeError.

# Slashes and brackets become spaces. Raw string: "\/", "\(" and "\)" are
# invalid escape sequences in a normal string literal.
cleaned_names = character_df['character'].str.replace(r"[/()]", " ", regex=True) #[\(].*?[\)]|
# Strip ranks, titles and epithets wherever they occur in the name.
cleaned_names = cleaned_names.str.replace(
    "The |Corporal |Sergeant |Sergeant| Campaign|Empress |Emperor |Captain|Captain |Prince |Hound | Light Touch",
    "",
    regex=True,
)
# The two replacements above leave doubled, leading or trailing spaces behind
# (e.g. "Name (alias)" -> "Name  alias "). Collapse and trim them; otherwise the
# split below yields empty first names or second names with a trailing space,
# and those can never equal an entity text.
cleaned_names = cleaned_names.str.replace(r"\s+", " ", regex=True).str.strip()

# split the names into firstname and lastname so we can find every mention of them
# (at most 3 splits, as before: for names of five or more words the last piece
# holds the remainder, not a single word)
name_parts = cleaned_names.str.split(' ', n=3)

# assign() builds a new frame rather than writing into the filtered slice,
# which avoids pandas' SettingWithCopyWarning.
character_df = character_df.assign(
    character=cleaned_names,
    character_firstname=name_parts.str[0],
    character_secondname=name_parts.str[-1],
)
#character_df['character_thirdname'] = character_df['character'].apply(lambda x: x.split(' ', 3)[2])

In [10]:
# Inspect the cleaned character table, including the derived first/second name columns
character_df

Unnamed: 0,book,character,character_firstname,character_secondname
0,Gardens of the Moon,A'Karonys,A'Karonys,A'Karonys
2,Gardens of the Moon,Anomander Rake,Anomander,Rake
3,Gardens of the Moon,Antsy,Antsy,Antsy
4,Gardens of the Moon,Apsalar,Apsalar,Apsalar
5,Gardens of the Moon,Aragan,Aragan,Aragan
...,...,...,...,...
153,Gardens of the Moon,Trotts,Trotts,Trotts
154,Gardens of the Moon,Turban Orr,Turban,Orr
155,Gardens of the Moon,Vildron,Vildron,Vildron
157,Gardens of the Moon,Vorcan,Vorcan,Vorcan


## Load books (first one here)

In [None]:
# Get all book files in the data directory.
# os.scandir() yields entries in arbitrary order, so they are sorted by file
# name to make the choice of book below reproducible. The context manager
# releases the directory handle.
with os.scandir('data') as entries:
    all_books = sorted((b for b in entries if b.name.endswith(".txt")), key=lambda b: b.name)
print(all_books)

# fail with a clear message instead of an IndexError on all_books[0]
if not all_books:
    raise FileNotFoundError("no .txt book files found in the 'data' directory")

# choose one for the analysis and get the text
# NOTE(review): this takes the first file in name order; confirm it is the book
# selected in the character table above.
book = all_books[0]
# the with-block closes the file once the text has been read
with codecs.open(book, "r", "utf-8") as f:
    book_text = f.read()

## Run spaCy model (in a loop)

In [22]:
# variables for applying the model
# number of characters of book_text passed to NER() per run
epoch_size = 250000
# Number of chunks needed to cover the whole text.
# math.ceil() replaces round(x + 0.5): Python rounds halves to the nearest even
# integer, so a text length that is an odd exact multiple of epoch_size
# (e.g. 3.0 + 0.5 -> 4) scheduled one extra, empty chunk.
epochs = math.ceil(len(book_text) / epoch_size)

In [23]:
# Placeholder first row. The following cell removes it again by position, so it
# has to stay the first row of filtered_df.
seed_df = pd.DataFrame(data={'sentence': ('test'), 'entities': ['Kruppe'], 'character_entities': ['Kruppe']})
# One frame per epoch is collected here and concatenated once after the loop,
# instead of re-copying the growing result on every iteration.
epoch_frames = [seed_df]

# split the book into epochs so the NER() function can handle the size
# NOTE(review): chunks are cut at fixed character offsets, so a sentence or a
# name that straddles a boundary is split in two. Cutting at paragraph breaks
# would avoid that.
for e in range(epochs):
    # keep us updated on the current epoch
    print("epoch " + str(e+1) + " of " + str(epochs) + " running")

    # run the language model on the subpart
    book_doc = NER(book_text[e*epoch_size:(e+1)*epoch_size])

    # get all the entities per sentence
    sentence_records = [
        {"sentence": sent, "entities": [ent.text for ent in sent.ents]}
        for sent in book_doc.sents
    ]

    # make it a dataframe; the explicit columns keep 'entities' available even
    # when a chunk produced no sentences (otherwise a KeyError below)
    sent_entity_df = pd.DataFrame(sentence_records, columns=["sentence", "entities"])
    # remove all rows/sentences without any entities; .copy() makes the result
    # an independent frame so the column assignment below does not write into a
    # slice (SettingWithCopyWarning)
    sent_entity_df = sent_entity_df[sent_entity_df['entities'].map(len) > 0].copy()
    # filter out those entities that are characters
    sent_entity_df['character_entities'] = sent_entity_df['entities'].apply(lambda x: filter_entity(x, character_df))
    # now remove all rows/sentences without any characters
    sent_entity_df = sent_entity_df[sent_entity_df['character_entities'].map(len) > 0]

    # remember this epoch's rows; empty frames add nothing to the result
    if not sent_entity_df.empty:
        epoch_frames.append(sent_entity_df)

# save those entities into a global dataframe; each epoch keeps its own index,
# i.e. the sentence position within that epoch
filtered_df = pd.concat(epoch_frames)

epoch 1 of 5 running
epoch 2 of 5 running
epoch 3 of 5 running
epoch 4 of 5 running
epoch 5 of 5 running


In [24]:
# Drop the placeholder row that seeded filtered_df (it is the first row).
# The previous slice [1:-1] additionally removed the last row, which is a real
# sentence containing a character.
df = filtered_df[1:]
# this might destroy the windowing function?
# Move the old index into a column. It is the sentence position within its
# epoch, so the numbers restart with every epoch.
df = df.reset_index()
# rename exactly the 'index' column (the former substring replace would have
# touched any column name containing "index")
df = df.rename(columns={'index': 'sentence_number'})
df.head(100)

Unnamed: 0,sentence_number,sentence,entities,character_entities
0,46,"(Dassem, Ultor, .)",[Dassem],[Dassem Ultor]
1,50,"(Is, Dassem, dead, ?, ', \n\n)",[Dassem],[Dassem Ultor]
2,103,"(Laseen, ., ', \n\n, ', Laseen, ?, ', \n\n, ',...","[Laseen, Laseen, Napan]","[Laseen, Laseen, Napan tutor]"
3,109,"(', My, tutor, 's, Napan, ,, ', Ganoes, explai...",[Napan],[Napan tutor]
4,118,"(Her, dusky, blue, skin, marked, her, as, Napa...","[Napan, grey]",[Napan tutor]
...,...,...,...,...
95,898,"(Paran, reached, out, with, his, sword, and, s...",[Paran],[Ganoes Stabro Paran]
96,900,"(Movement, rippled, in, the, gloom, within, ,,...",[Paran],[Ganoes Stabro Paran]
97,910,"(Paran, looked, down, at, one, of, the, soldie...","[Paran, one]",[Ganoes Stabro Paran]
98,919,"(Sheathing, his, sword, ,, Paran, entered, the...",[Paran],[Ganoes Stabro Paran]


In [25]:
# Spot-check the sentence stored in one row (position 86) of df
df.iloc[86].sentence

Onearm's Host.'

Aragan blinked.

In [26]:
# Preview the first rows of df. Unlike the .loc variant that is commented out,
# this plain slice excludes its end row (the output shows rows 0-3).
#df.loc[df.index[0]:df.index[5]]
df[df.index[0]:df.index[4]]

Unnamed: 0,sentence_number,sentence,entities,character_entities
0,46,"(Dassem, Ultor, .)",[Dassem],[Dassem Ultor]
1,50,"(Is, Dassem, dead, ?, ', \n\n)",[Dassem],[Dassem Ultor]
2,103,"(Laseen, ., ', \n\n, ', Laseen, ?, ', \n\n, ',...","[Laseen, Laseen, Napan]","[Laseen, Laseen, Napan tutor]"
3,109,"(', My, tutor, 's, Napan, ,, ', Ganoes, explai...",[Napan],[Napan tutor]


In [27]:
# save the single book_docs as well
#book_doc

In [28]:
# Visualize identified entities
#displacy.render(book_doc[0:10], style="ent", jupyter=True)

## Building relationships

In [29]:
# variables for building relationships
# TODO: apply the window size not to the reindexed rows but to the actual sentence numbers
# we want characters in a relationship that occur in the same paragraph
# -> somehow make the window aware of paragraphs, not only of sentences?!
# this needs to be done in the text splitting (epochs...)
# TODO: so have a look at the book text file and make a list of paragraphs(!!!) instead of sentences
# TODO: remove the poems and small intros before a chapter/book
# number of following rows of df that are combined with a row into one window
window_size = 5
# collects one {"source": ..., "target": ...} dict per character pair found
relationships = []

In [30]:
# get relationships from the entities with a fixed window size
# Start from an empty list so that re-running this cell does not append every
# pair a second time.
relationships = []

# df.index[-1] raises IndexError on an empty frame; 0 makes the loop a no-op
last_row = df.index[-1] if len(df) else 0

for i in range(last_row):
    # build the character list that appear in close proximity
    # (.loc slicing includes both ends, so a window spans up to window_size + 1 rows)
    end_i = min(i + window_size, last_row)
    window_chars = [
        char
        for chars in df.loc[i:end_i].character_entities
        for char in chars
    ]

    # distinct characters of the window, in alphabetical order
    char_unique = sorted(set(window_chars))

    # list source-target relationships between neighbours in that sorted list
    # NOTE(review): only alphabetically adjacent characters are linked (A-B and
    # B-C, but not A-C). If every pair of a window should count as a
    # co-occurrence, all combinations are needed here - confirm the intent.
    for a, b in zip(char_unique, char_unique[1:]):
        relationships.append({"source": a, "target": b})

In [31]:
# Turn the collected pairs into a weighted edge list
raw_pairs = pd.DataFrame(relationships)
# order the two names of every pair alphabetically so that A-B and B-A count as the same edge
ordered_pairs = pd.DataFrame(np.sort(raw_pairs.values, axis = 1), columns = raw_pairs.columns)
# every occurrence weighs 1; summing per (source, target) gives the edge weight
relationship_df = (
    ordered_pairs
    .assign(value=1)
    .groupby(["source","target"], sort=False, as_index=False)
    .sum()
)

In [32]:
# DataFrame.size counts cells (rows x columns), so with the three columns
# source/target/value the printed figure is three times the number of edges.
print(relationship_df.size)
# first ten edges with their weights
relationship_df.head(10)

2493


Unnamed: 0,source,target,value
0,Dassem Ultor,Laseen,2
1,Laseen,Napan tutor,5
2,Napan tutor,Surly,4
3,Caladan Brood,K'azz D'Avore,6
4,K'azz D'Avore,Laseen,6
5,Napan tutor,Rigga,1
6,Rigga,Surly,3
7,Laseen,Rigga,15
8,Caladan Brood,Ilgrand Lender,2
9,Ilgrand Lender,K'azz D'Avore,2


In [33]:
# Write the edge list to a CSV file named after the book file.
# NOTE(review): book.name still carries the ".txt" extension, so after .title()
# the file name ends in ".Txt.csv" - confirm that this is intended.
output_name = f"malazan_relationships_{book.name.title()}.csv"
relationship_df.to_csv(output_name, index=False)

### The most important characters in The Malazan Book of the Fallen

### Evolution of characters' importance