# Extraction and Preprocessing

In [None]:
!pip install beautifulsoup4
!pip install EbookLib
!pip install -U spacy
!python -m spacy download en_core_web_md

In [61]:
import pandas as pd
import networkx as nx
import spacy
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
from itertools import combinations

In [None]:
# Read the epub file
book = epub.read_epub('assets/GOT.epub')

# Get the document items of the epub
docs = book.get_items_of_type(ebooklib.ITEM_DOCUMENT)

# Positions (within the document items) of the first and last item to keep.
# These bounds restrict the content to only the first book.
FIRST_CHAPTER_INDEX = 7
LAST_CHAPTER_INDEX = 78

# Keep the chapters separate: one list entry per kept document item
chapters = []
for i, chapter in enumerate(docs):
    if i >= FIRST_CHAPTER_INDEX:
        chapters.append(chapter.get_body_content())
    if i == LAST_CHAPTER_INDEX:
        # Nothing after this item is kept, so stop iterating
        break


In [5]:
# Parse each chapter once and reuse its plain text for both the title and the content
chapter_texts = [BeautifulSoup(markup, 'html.parser').get_text() for markup in chapters]

# Get the chapter names and content and store them in separate lists
# The title is taken from the second line of the extracted text
chapter_titles = [text.splitlines()[1] for text in chapter_texts]
# The content is everything after the second '\n', with the remaining newlines replaced by spaces
chapter_contents = [text.split('\n', 2)[2].replace('\n', ' ') for text in chapter_texts]

# spaCy PoS-Tagging

In [8]:
# Load spaCy's medium sized English pipeline ("en_core_web_md", trained on written web text,
# including vocabulary, syntax and entities). The resulting `nlp` object is the shared pipeline
# that the entity-extraction functions below call on each chapter's text.
nlp = spacy.load("en_core_web_md")

In [22]:
def extract_entities_pos(text):
    """
    Collects, sentence by sentence, the tokens tagged as proper singular nouns.
    The text is run through the module-level `nlp` pipeline and split into sentences.
    For every sentence, the text of each token whose fine-grained tag (`tag_`) is 'NNP'
    is gathered, in token order. Sentences that yield no such token are left out.
    Args:
        text (str): The input text to be processed.
    Returns:
        List[List[str]]: One inner list per sentence that contains at least one 'NNP'
        token, holding the text of those tokens.
    """

    parsed = nlp(text)

    # One candidate list per sentence, keeping only the 'NNP' tokens
    names_per_sentence = [
        [word.text for word in sent if word.tag_ == 'NNP']
        for sent in parsed.sents
    ]

    # Drop the sentences in which nothing was found
    return [names for names in names_per_sentence if names]

In [23]:
# Run the PoS-based extraction on every chapter, keeping one result list per chapter
book_entities = [extract_entities_pos(chapter_text) for chapter_text in chapter_contents]

In [None]:
book_entities[0]  # display the PoS-based entities of the first entry in chapter_contents, for inspection only

# spaCy Named Entity Recognition

In [6]:
def extract_entities_ner(text):
    """
    Extracts the named entities that co-occur within each sentence of the given text.
    The text is processed with the module-level `nlp` pipeline to split it into sentences.
    Each sentence is then processed again on its own, and its entities labelled PERSON,
    ORG or GPE are kept. A trailing possessive ("'s" or "’s") and a leading opening
    quote (“) are removed from the entity text, and empty results are discarded.
    Only sentences with at least two distinct entities are returned.
    Args:
        text (str): The input text to be processed.
    Returns:
        List[List[str]]: One inner list per qualifying sentence, containing its unique
        entity strings in sorted order.
    """
    kept_labels = ('PERSON', 'ORG', 'GPE')

    doc = nlp(text)
    entities = []

    for sentence in doc.sents:
        # The sentence text is run through the pipeline separately from the full document
        sent_doc = nlp(sentence.text)
        sentence_entities = set()

        for ent in sent_doc.ents:
            if ent.label_ not in kept_labels:
                continue

            entity = ent.text.strip()

            # Drop a trailing possessive marker (straight or curly apostrophe)
            if entity.endswith(("'s", "’s")):
                entity = entity[:-2]

            # Drop a leading opening quotation mark
            if entity.startswith('“'):
                entity = entity[1:]

            if entity:
                sentence_entities.add(entity)

        # A single entity cannot form a pair, so such sentences are skipped
        if len(sentence_entities) > 1:
            # Sort instead of relying on set iteration order: the order is otherwise
            # arbitrary, so the same two entities could be listed as (A, B) in one
            # sentence and (B, A) in another, and pairs built from these lists would
            # be treated as different edges.
            entities.append(sorted(sentence_entities))

    return entities

In [9]:
# Run the NER-based extraction on every chapter, keeping one result list per chapter
entities = [extract_entities_ner(chapter_text) for chapter_text in chapter_contents]

In [None]:
entities[0]  # display the NER-based entity groups of the first entry in chapter_contents, for inspection only

# Converting entities to network data

In [62]:
def get_network_data(entities, chapter_names):
    """
    Builds an edge list from entities that share a sentence.
    Every 2-element combination of each sentence's entity list becomes one row,
    labelled with the name of the chapter the sentence belongs to.
    Args:
        entities (List[List[List[str]]]): For each chapter, a list of sentences, each
        being a list of entity strings.
        chapter_names (Sequence[str]): Chapter names, looked up by the position of the
        chapter in `entities`.
    Returns:
        pandas.DataFrame: A frame with the columns 'source', 'target' and 'chapter',
        one row per generated pair.
    """

    def iter_edges():
        # Walk chapters -> sentences -> entity pairs, yielding one edge at a time
        for position, sentences in enumerate(entities):
            for sentence_entities in sentences:
                for first, second in combinations(sentence_entities, 2):
                    yield first, second, chapter_names[position]

    edges = list(iter_edges())

    return pd.DataFrame({
        'source': [edge[0] for edge in edges],
        'target': [edge[1] for edge in edges],
        'chapter': [edge[2] for edge in edges],
    })

In [63]:
# Build the raw edge list: one row per entity pair, per sentence
got_network_df = get_network_data(entities, chapter_titles)

# Collapse identical (source, target, chapter) rows into a single row and store
# how many times each one occurred in a 'weight' column
edge_columns = list(got_network_df.columns)
got_network_df = (
    got_network_df
    .groupby(edge_columns, observed=True)
    .size()
    .reset_index(name='weight')
)

In [64]:
# Distinct values of the 'chapter' column of the edge list; iterated later to compute per-chapter statistics
titles = got_network_df['chapter'].unique()

# Converting network data into networks

In [69]:
# Build one graph per chapter title and export each of them to its own GEXF file
graphs = {}
for title in got_network_df['chapter'].unique():
    # Only the edge rows recorded for this chapter
    chapter_edges = got_network_df[got_network_df['chapter'] == title]
    chapter_graph = nx.from_pandas_edgelist(chapter_edges, edge_attr='weight')
    graphs[title] = chapter_graph
    nx.write_gexf(chapter_graph, f'assets/got-{title}.gexf')

In [65]:
# Build a single multigraph from every row of the edge list; each edge carries the
# remaining columns of its row as attributes
multigraph = nx.from_pandas_edgelist(got_network_df, edge_attr=True, create_using=nx.MultiGraph())

# Record on every node the chapters of the edges it takes part in
for source, target, values in multigraph.edges(data=True):
    chapter = values['chapter']

    for endpoint in (source, target):
        node_attrs = multigraph.nodes[endpoint]
        seen_chapters = node_attrs.get('chapters', '')

        # Substring test against the space-joined string built so far.
        # The first chapter is joined onto an empty string, so the stored value starts with a space.
        if chapter not in seen_chapters:
            node_attrs['chapters'] = " ".join([seen_chapters, chapter])


In [67]:
def _graph_summary(label, graph):
    """
    Computes one row of summary statistics for a graph.
    Args:
        label (str): Value stored in the 'Character' column ('Global' or a chapter title).
        graph: A networkx graph whose edges carry a 'weight' attribute.
    Returns:
        dict: The row, keyed by the column names of the summary table.
    """
    node_count = nx.number_of_nodes(graph)
    return {
        'Character': label,
        'Entities': node_count,
        'Interactions': nx.number_of_edges(graph),
        'Graph Density': nx.density(graph),
        'Connected Components': nx.number_connected_components(graph),
        'Average Degree': sum(degree for _, degree in graph.degree) / node_count,
        # The two endpoints of the heaviest edge; on ties the first one encountered wins
        'Best Friends': " and ".join(max(graph.edges(data=True), key=lambda edge: edge[2]['weight'])[:2]),
    }


# Statistics for the whole multigraph
global_data = _graph_summary('Global', multigraph)
summary_rows = [global_data]

# Get info about each pov chapter
for title in titles:
    # Nodes whose 'chapters' string contains this title (substring match)
    nodes = [node for node, attrs in multigraph.nodes(data=True) if title in attrs['chapters']]
    # NOTE(review): the subgraph is induced on the nodes, so edges between these nodes that were
    # recorded for other chapters are presumably included as well - confirm this is intended
    G = nx.subgraph(multigraph, nodes)
    summary_rows.append(_graph_summary(title, G))

# Build the table once from all rows instead of concatenating inside the loop
character_worlds = pd.DataFrame(summary_rows)


In [None]:
character_worlds  # display the summary table: the 'Global' row followed by one row per chapter title

In [70]:
# Export the combined multigraph (including the 'chapters' node attribute set earlier) to a GEXF file
nx.write_gexf(multigraph, 'assets/got-multigraph.gexf')