# Extraction and Preprocessing

In [None]:
!pip install beautifulsoup4
!pip install EbookLib
!pip install -U spacy
!python -m spacy download en_core_web_md

In [3]:
import pandas as pd
import networkx as nx
import spacy
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup

In [42]:
# Read the epub file
book = epub.read_epub('assets/GOT.epub')

# Extract the text from the epub file
docs = book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
chapters = []

# Restrict the content to only the first book and keep the chapters separate
for i, chapter in enumerate(docs):
    chapters.append(chapter.get_body_content()) if i >= 7 else None
    if i == 78:
      break


In [43]:
# Get the chapter names and content and store them in separate lists
chapter_titles = list(map(lambda x: BeautifulSoup(x, 'html.parser').get_text().splitlines()[1], chapters))
chapter_contents = list(map(lambda x: BeautifulSoup(x, 'html.parser').get_text().split('\n', 2)[2].replace('\n', ' '), chapters))

# spaCy Pos-Tagging

In [21]:
# Load the spaCy medium sized English model pipeline trained on written web text that includes vocabulary, syntax and entities.
nlp = spacy.load("en_core_web_md")

In [22]:
def extract_entities_pos(text):
    """
    Extracts proper singular noun entities from the given text.
    This function processes the input text using a natural language processing (NLP) pipeline,
    separates the text into sentences, and then extracts tokens that are tagged as proper
    singular nouns (NNP) from each sentence. The extracted entities are grouped by sentence
    and returned as a list of lists.
    Args:
        text (str): The input text to be processed.
    Returns:
        List[List[str]]: A list of lists, where each inner list contains the proper singular
        noun entities extracted from a sentence in the input text.
    """

    # Process the text creating a Doc object that is a sequence of Token objects
    doc = nlp(text)

    # Separate the text in sentences
    sentences = list(doc.sents)

    entities = []

    for sentence in sentences:

        sentence_entities = []

        for token in sentence:
            
            # Check if the token is a proper singular noun and add it to the list of entities
            if token.tag_ == 'NNP':

                sentence_entities.append(token.text)

        # Only add the list of entities if there is at least one entity in the sentence
        if len(sentence_entities) > 0:

            entities.append(sentence_entities)

    return entities

In [23]:
book_entities = []
for chapter in chapter_contents:
  book_entities.append(extract_entities_pos(chapter))

In [45]:
book_entities[0]

[['Bran'],
 ['lord'],
 ['Bran'],
 ['Mance', 'Rayder', 'King', 'Wall'],
 ['Bran'],
 ['Old', 'Nan'],
 ['Long', 'Night'],
 ['Robb'],
 ['Night', 'Watch'],
 ['lord', 'father'],
 ['Robb', 'Jon', 'Bran'],
 ['Winterfell', 'grey'],
 ['Bran'],
 ['Father', 'Bran', 'Lord', 'Stark', 'Winterfell'],
 ['Bran'],
 ['lord', 'father'],
 ['Lord', 'Eddard', 'Stark', 'Theon', 'Greyjoy'],
 ['Ice'],
 ['Robb'],
 ['Jory', 'Cassel'],
 ['Ice',
  'Robert',
  'House',
  'Baratheon',
  'First',
  'King',
  'Rhoynar',
  'First',
  'Lord',
  'Seven',
  'Protector',
  'Realm',
  'House',
  'Stark',
  'Lord',
  'Winterfell',
  'Warden',
  'North'],
 ['Jon', 'Snow'],
 ['Father'],
 ['Bran'],
 ['Greyjoy'],
 ['Theon'],
 ['Jon', 'Greyjoy'],
 ['Bran', 'Bran'],
 ['Jon'],
 ['Jon'],
 ['Winterfell'],
 ['Robb'],
 ['Tullys', 'Riverrun'],
 ['Jon', 'Snow'],
 ['Stark'],
 ['Jon'],
 ['Robb'],
 ['Jon', 'Robb', 'Robb'],
 ['Robb'],
 ['Jon'],
 ['Robb'],
 ['Robb', 'Jon'],
 ['Bran'],
 ['Robb'],
 ['Bran'],
 ['Father', 'Bran'],
 ['lord'],
 ['Jon

# spaCy NER

In [82]:
def extract_entities_ner(text):
    doc = nlp(text)
    sentences = list(doc.sents)
    entities = []

    for sentence in sentences:
        sentence_entities = []
        sent_doc = nlp(sentence.text)

        for ent in sent_doc.ents:
            if ent.label_ in ['PERSON', 'ORG', 'GPE']:
                entity = ent.text.strip()

                if entity.endswith(("'s","’s")):
                    entity = entity[:-2]
                
                if entity.startswith('“'):
                    entity = entity[1:]

                if entity != '':
                    sentence_entities.append(entity)

        sentence_entities = list(set(sentence_entities))

        if len(sentence_entities) > 1:
            entities.append(sentence_entities)

    return entities

In [83]:
entities = []
for chapter in chapter_contents:
  entities.append(extract_entities_ner(chapter))

# Converting entities to network data

In [84]:
def get_network_data(entities, chapter_names):

    final_sources = []
    final_targets = []
    final_chapters_names = []
    for i, chapter in enumerate(entities):

      for row in chapter:
        source = row[0]
        targets = row[1:]

        for target in targets:

          final_sources.append(source)
          final_targets.append(target)
          final_chapters_names.append(chapter_names[i])

    df = pd.DataFrame({'source':final_sources, 'target':final_targets, 'chapter': final_chapters_names})

    return df

In [85]:
got_network_df = get_network_data(entities, chapter_titles)

got_network_df = got_network_df.groupby(got_network_df.columns.tolist(), observed=True).size().reset_index().rename(columns={0:'weight'})

In [86]:
print(got_network_df['chapter'].unique())

['EDDARD' 'TYRION' 'DAENERYS' 'JON' 'CATELYN' 'ARYA' 'SANSA' 'BRAN']


# Converting network data into networks

In [87]:
graphs = {}
for title in got_network_df['chapter'].unique():
    graphs[title] = nx.from_pandas_edgelist(got_network_df[got_network_df['chapter'] == title], edge_attr='weight')
    nx.write_gexf(graphs[title], f'assets/got-{title}.gexf')

In [88]:
# def create_multigraph(df):
#   multigraph = nx.MultiGraph()

#   for _, row in df.iterrows():
#     source = row['source']
#     target = row['target']
#     chapter = row['chapter']
#     weight = row['weight']

#     if not multigraph.has_node(source):
#       multigraph.add_node(source, chapters=chapter)
#     else:
#       if chapter not in multigraph.nodes[source]['chapters']:
#         text = multigraph.nodes[source]['chapters'] + " " + chapter
#         multigraph.nodes[source]['chapters'] = text

#     if not multigraph.has_node(target):
#       multigraph.add_node(target, chapters=chapter)
#     else:
#       if chapter not in multigraph.nodes[target]['chapters']:
#         text = multigraph.nodes[target]['chapters'] + " " + chapter
#         multigraph.nodes[target]['chapters'] = text

#     if not multigraph.has_edge(source, target):
#     multigraph.add_edge(source, target, chapters=[chapter], weight=weight)
#     else:
#       for key in multigraph[source][target]:
#           multigraph[source][target][key]['chapters'].append(chapter)
#           multigraph[source][target][key]['weight'] += weight
#   return multigraph

# MG = create_multigraph(got_network_df)

In [89]:
multigraph = nx.from_pandas_edgelist(got_network_df, edge_attr=True, create_using=nx.MultiGraph())

for source, target, values in multigraph.edges(data=True):
    chapter = values['chapter']

    if chapter not in multigraph.nodes[source].get('chapters',''):
      text = " ".join([multigraph.nodes[source].get('chapters',''), chapter])
      multigraph.nodes[source]['chapters'] = text
    if chapter not in multigraph.nodes[target].get('chapters',''):
      text = " ".join([multigraph.nodes[target].get('chapters',''), chapter])
      multigraph.nodes[target]['chapters'] = text


In [90]:
for node in multigraph.nodes(data=True):
  print(node)

('Aegon', {'chapters': ' EDDARD JON BRAN CATELYN DAENERYS ARYA'})
('Aerys Targaryen', {'chapters': ' EDDARD SANSA'})
('Jaehaerys', {'chapters': ' EDDARD SANSA'})
('King Robert', {'chapters': ' EDDARD BRAN TYRION CATELYN SANSA ARYA JON'})
('Aegon Targaryen', {'chapters': ' TYRION EDDARD'})
('the Seven Kingdoms of old', {'chapters': ' TYRION'})
('Aemon', {'chapters': ' TYRION'})
('Citadel', {'chapters': ' TYRION JON BRAN DAENERYS'})
('Lannisters', {'chapters': ' EDDARD ARYA TYRION CATELYN BRAN'})
('Aggo', {'chapters': ' DAENERYS'})
('Rakharo', {'chapters': ' DAENERYS'})
('khal', {'chapters': ' DAENERYS'})
('Alliser Thorne', {'chapters': ' JON'})
('malignance', {'chapters': ' JON'})
('Alyssa', {'chapters': ' CATELYN'})
('Ser Vardis', {'chapters': ' CATELYN TYRION'})
('Andal', {'chapters': ' DAENERYS'})
('Khaleesi', {'chapters': ' DAENERYS'})
('Andals', {'chapters': ' DAENERYS BRAN EDDARD SANSA'})
('Rhaesh Andahli', {'chapters': ' DAENERYS'})
('Anguy', {'chapters': ' EDDARD'})
('Dornish Ma

In [91]:
nx.write_gexf(multigraph, 'assets/got-multigraph.gexf')