In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx

import matplotlib.pyplot as plt

In [1]:
# English language model download.
# Uses spaCy's Python API so the model is installed into the environment of the
# running kernel (a shell `python3` may resolve to a different interpreter), and
# skips the download when the package is already installed so that
# "Restart Kernel -> Run All" stays cheap.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Load the spaCy English language model.
# Prefer the installed package name (the download step reports that the model
# can be loaded via spacy.load('en_core_web_sm')); the interpreter-specific
# site-packages path is kept only as a fallback.
model_path = './lib/python3.10/site-packages/en_core_web_sm/en_core_web_sm-3.6.0/'
try:
    NER = spacy.load('en_core_web_sm')
except OSError:
    # spaCy could not resolve the package name from this kernel; fall back to
    # the on-disk model directory.
    NER = spacy.load(model_path)

## Load book data

In [3]:
import os

# Collect every .txt book in the data folder.
# The context manager releases the directory handle once the listing is done,
# and the suffix test avoids matching files whose name merely contains the
# letters "txt" somewhere (e.g. "txt_notes.csv").
with os.scandir('Data') as data_dir:
    books = [b for b in data_dir if b.name.endswith('.txt')]

In [4]:
# Display the book files that were found in the data folder
books

[<DirEntry '03. Frank Herbert - Children of Dune.txt'>,
 <DirEntry '01. Frank Herbert - Dune.txt'>,
 <DirEntry '02. Frank Herbert - Dune Messiah.txt'>]

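The `Data` folder contains three books from the Dune series. Note that `os.scandir` returns them in arbitrary order, not in series order.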

In [5]:
# Check the first book.
# os.scandir yields entries in arbitrary order, so select the first volume by
# file name (the names start with "01.", "02.", ...) instead of relying on a
# fixed list position.
first_book = min(books, key=lambda entry: entry.name)
# Read inside a context manager so the file handle is closed right away
with open(first_book, encoding='ISO-8859-1') as book_file:
    book_text_1 = book_file.read()
# Raise spaCy's maximum text length so the whole book can be processed in one call
NER.max_length = 2000000
# Run the pipeline to extract the entities
book_doc_1 = NER(book_text_1)

In [6]:
# Visualize the entities identified in the first 1000 tokens of the book
displacy.render(book_doc_1[0:1000], style="ent", jupyter=True)

In [7]:
# Build one record per sentence: the sentence itself plus the text of every
# entity that was recognised inside it
sentence_df = [
    {"sentence": sentence, "entity": [entity.text for entity in sentence.ents]}
    for sentence in book_doc_1.sents
]

In [9]:
# Convert the list of per-sentence records into a DataFrame
# (one row per sentence, with the columns "sentence" and "entity")
sentence_df = pd.DataFrame(sentence_df)
# Display the first rows of the DataFrame
sentence_df.head()

Unnamed: 0,sentence,entity
0,"(Dune, \n, Frank, Herbert, \n\n, Copyright, 19...","[Dune\nFrank Herbert\n\nCopyright, 1965, 1]"
1,"(A, beginning, is, the, time, for, taking, the...",[]
2,"(This, every, sister, of, the, Bene, Gesserit,...",[Bene Gesserit]
3,"(To, begin, your, study, of, the, life, of, Mu...","[Muad'Dib, first, the 57th year, the Padishah ..."
4,"(And, take, the, most, special, care, that, yo...","[Muad'Dib, Arrakis]"


## Load character names

In [10]:
# Read the character list from the Excel workbook in the data folder
character_df = pd.read_excel("Data/Character.xlsx")

In [11]:
# Derive each character's first name: the text before the first space.
# The vectorized .str accessor leaves a missing name as NaN instead of raising.
# NOTE(review): a leading title is treated as the first name (e.g. "Lady Jessica"
# becomes "Lady"), so such characters are matched by their title - confirm this
# is intended.
character_df["first_name"] = character_df["Character"].str.split(" ", n=1).str[0]

In [12]:
# Display the characters together with their derived first names
character_df

Unnamed: 0,Character,first_name
0,Lady Jessica,Lady
1,Paul Atreides,Paul
2,Leto I Atreides,Leto
3,Vladimir Harkonnen,Vladimir
4,Chani,Chani
5,Stilgar,Stilgar
6,Gaius Helen Mohiam,Gaius
7,Duncan Idaho,Duncan
8,Wellington Yueh,Wellington
9,Gurney Halleck,Gurney


## Data Processing

In [13]:
# Keep only the entities that are known character first names
def filter_out(entity_col, character_df):
    """Return the entities that match a character's first name.

    Parameters
    ----------
    entity_col : iterable of str
        Entity texts found in one sentence.
    character_df : pandas.DataFrame
        Character table; its 'first_name' column holds the names to keep.

    Returns
    -------
    list of str
        The matching entities, in their original order and with duplicates
        preserved.
    """
    # Build the lookup once per call instead of once per entity
    first_names = set(character_df['first_name'])
    return [x for x in entity_col if x in first_names]

# Apply the filter to every sentence's entity list and store the matches
# in a new 'character_entity' column
sentence_df['character_entity'] = sentence_df['entity'].apply(lambda x: filter_out(x, character_df))

In [14]:
# Preview the sentences together with the character names matched in each one
sentence_df.head()

Unnamed: 0,sentence,entity,character_entity
0,"(Dune, \n, Frank, Herbert, \n\n, Copyright, 19...","[Dune\nFrank Herbert\n\nCopyright, 1965, 1]",[]
1,"(A, beginning, is, the, time, for, taking, the...",[],[]
2,"(This, every, sister, of, the, Bene, Gesserit,...",[Bene Gesserit],[]
3,"(To, begin, your, study, of, the, life, of, Mu...","[Muad'Dib, first, the 57th year, the Padishah ...",[]
4,"(And, take, the, most, special, care, that, yo...","[Muad'Dib, Arrakis]",[]


In [15]:
# Keep only the sentences in which at least one character was found; the old
# row labels are kept in an 'index' column by reset_index()
has_character = sentence_df['character_entity'].apply(lambda names: len(names) > 0)
df_filtered = sentence_df[has_character].reset_index()

In [16]:
# Preview the sentences that mention at least one character
df_filtered.head()

Unnamed: 0,index,sentence,entity,character_entity
0,8,"(In, the, week, before, their, departure, to, ...","[the week, Arrakis, Paul]",[Paul]
1,10,"(The, old, woman, was, let, in, by, the, side,...",[Paul],[Paul]
2,16,"(Paul, 's, mother, answered, in, her, soft, co...","[Paul, Atreides]",[Paul]
3,26,"(Within, the, shadows, of, his, bed, ,, Paul, ...",[Paul],[Paul]
4,31,"(Paul, lay, awake, wondering, :, What, 's, a, ...","[Paul, gom]",[Paul]


## Create Relationship
### 1. Slide a fixed-size window over the sentences to collect co-occurring characters

In [17]:
# Window size: the number of consecutive rows of df_filtered (sentences that
# mention at least one character) that are grouped together
window_size = 5

# Flattened character mentions of every full-size window, in window order
char_list = []

# Slide the window with plain list slicing. This yields the same full-size
# windows as iterating a pandas rolling object, without building a Series per
# window and without the quadratic `sum(lists, [])` concatenation.
# Every entry is a non-empty list because df_filtered only keeps such rows.
sentence_characters = df_filtered['character_entity'].tolist()
for start in range(len(sentence_characters) - window_size + 1):
    for characters in sentence_characters[start:start + window_size]:
        char_list.extend(characters)

In [18]:
# Peek at the first ten collected character mentions
char_list[:10]

['Paul',
 'Paul',
 'Paul',
 'Paul',
 'Paul',
 'Paul',
 'Paul',
 'Paul',
 'Paul',
 'Paul']

### 2. Collapse consecutive repeats of the same character

In [19]:
# Collapse every run of the same character into a single entry.
# Each element is compared with the one right before it; the first element is
# compared with None, so it is kept unless it is None itself.
previous_elements = [None] + char_list[:-1]
unique_list = [
    element
    for prev_element, element in zip(previous_elements, char_list)
    if element != prev_element
]

In [20]:
# Peek at the first ten entries after collapsing consecutive repeats
unique_list[:10]

['Paul',
 'Piter',
 'Paul',
 'Piter',
 'Feyd-Rautha',
 'Paul',
 'Piter',
 'Feyd-Rautha',
 'Piter',
 'Paul']

### 3. Build relationship direction

In [21]:
# Pair every entry with its successor: entry i is the source and entry i+1 is
# the target, so the two lists are the sequence without its last / first item
source = unique_list[:-1]
target = unique_list[1:]

# Create a dataframe of this relationship direction
relationship_df = pd.DataFrame({"Source": source, "Target": target})

In [22]:
# Display the first source/target pairs
relationship_df.head()

Unnamed: 0,Source,Target
0,Paul,Piter
1,Piter,Paul
2,Paul,Piter
3,Piter,Feyd-Rautha
4,Feyd-Rautha,Paul


### 4. Create relationship weight

In [23]:
# Order the two names of every pair alphabetically so that a->b and b->a end
# up as the same (Source, Target) combination
sorted_pairs = np.sort(relationship_df.to_numpy(), axis=1)
relationship_df = pd.DataFrame(sorted_pairs, columns=relationship_df.columns)
relationship_df

Unnamed: 0,Source,Target
0,Paul,Piter
1,Paul,Piter
2,Paul,Piter
3,Feyd-Rautha,Piter
4,Feyd-Rautha,Paul
...,...,...
7598,Liet-Kynes,Vladimir
7599,Paul,Vladimir
7600,Liet-Kynes,Paul
7601,Liet-Kynes,Vladimir


In [24]:
# Give every co-occurrence a weight of 1, unless the frame already carries
# weights (i.e. this cell is being re-run on its own aggregated output);
# unconditionally resetting the column would wipe the counts on a second run.
if "Weight" not in relationship_df.columns:
    relationship_df = relationship_df.assign(Weight=1)
# Sum the weights per (Source, Target) pair, keeping first-appearance order
relationship_df = relationship_df.groupby(["Source","Target"], sort=False, as_index=False).sum()

In [25]:
# Show the strongest relationships first (largest weight on top)
relationship_df.sort_values("Weight", ascending=False)

Unnamed: 0,Source,Target,Weight
22,Paul,Stilgar,2060
5,Gurney,Paul,1250
30,Jamis,Paul,506
31,Chani,Paul,472
11,Leto,Paul,385
...,...,...,...
66,Leto,Otheym,1
41,Count,Paul,1
68,Gurney,Otheym,1
26,Lady,Thufir,1


In [26]:
# Save the weighted relationship table to a CSV file in the working directory,
# without writing the row index
relationship_df.to_csv("relationship_df.csv", index=False) 