In [None]:
import json
import pandas as pd
import numpy as np
import netwulf as nw
import matplotlib.pyplot as plt
import networkx as nx
import random as random 
import re
from collections import Counter

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
nltk.download('stopwords')

pd.set_option('display.max_colwidth', None)

In [None]:
# Define some helper functions

def it_is_a_character(short_name):
    if sum([short_name in long_name.split(' ')[0] for long_name in characters.tolist()]) >= 1:
        # TODO: returns Old Woman Prisoner, when Woman is passed?
        return True
    else:
        return False
    # NOTE: only return true if the short_name is the first name in the long_name? short_name in long_name.split(' ')[0]

def return_long_name(short_name, include_idx=False):
    for idx, long_name in enumerate(characters.tolist()):
        if short_name in long_name.split(' ')[0]:
            # print(short_name, long_name.split(' ')[0])
            if include_idx:
                return long_name, idx
            else:
                return long_name
                

In [244]:
# load tokenized script. Less data but contains all names still
script_tokenized = pd.read_csv('./data/Game_of_Thrones_Script_tokenized.csv')

script = pd.read_csv('./data/Game_of_Thrones_Script.csv')
# check how many nan values there are
print(script.isna().sum()) # 3 names, 1 sentence, remove the nan valued rows
script = script.dropna()

# load dataset with all characters and extract json charactername to a dataframe
with open('data/characters.json') as f:
    data = json.load(f)

Release Date     0
Season           0
Episode          0
Episode Title    0
Name             3
Sentence         1
dtype: int64


In [242]:
"""
!Pipeline:

1. Load full script and character names
2. Find shortest name of a character to use as stopword filter
3. Filter the whole script so it only contains 3+ letter words
4. Filter script on the 1000 most used 3+ words in the script
4a. remove names from the 1000list
5. Now we "tokenize" the script on only Capital words (hopefully containing names)
6. We now find names in each sentence using it_is_a_character and long_name



"""

In [344]:
""" 
!1. Character names
"""
df = pd.DataFrame(data['characters'])
print("Characters", df.shape[0])

df = df[df['characterLink'].notnull()]
print("Characters with link", df.shape[0])
characters = df.characterName

l = []
for name in np.unique(characters):
    for n in name.split(' '):
        l.append(n)
unique_charnames = np.unique(l)


Characters 389
Characters with link 368


In [369]:
"The" in unique_charnames

True

In [373]:
"""
!2. Find shortest name of a character to use as stopword filter
"""

# find the shortest name
min_len = 10000

# reverse list


# for c in unique_charnames[::-1]:
#     if len(c) < min_len:
#         min_len = len(c)
#         print(c, min_len)

# for c in characters:
#     c_min = len(min(c.split(' ')))
#     if c_min < min_len:
#         min_len = c_min
#         print(c, c_min)

# We conclude that the shortest real name is 4 characters long.
min_len = 4

# unique_charnames

In [None]:
for sentence in script.

# Soooo problems
How do we match both "The Red Woman" without getting "Red stableboy" from "Red" and not getting "The Bear" from "The"???

Multiple first names... e.g. Jon Aron and Jon Snow, where if Jon is mentioned (probably referring to Jon Snow), then the edge will go to Jon Aron 
because of alphabetical reasons

Remove "The" and the like from both character namelist and script, then "The Old Woman" and "The Bear" are atleast not conflicted

In [374]:
"""
!3 Filter whole script to only include 2+ words, but make sure to keep contractions
"""
regex = r'\b[\w\']{3,}\b'
script['filtered'] = script['Sentence'].str.findall(regex).str.join(' ')
script['filtered']

0        What you expect They're savages One lot steals goat from another lot and before you know they're ripping each other pieces
1                                           I've never seen wildlings thing like this I've never seen thing like this not ever life
2                                                                                                             How close did you get
3                                                                                                               Close any man would
4                                                                                                         should head back the wall
                                                                    ...                                                            
23906                                                                  think can all agree that ships take precedence over brothels
23907                                                                       

In [375]:
# Find 1000 most common words in the filtered script.

# We want to keep contractions,so we remove all punctuation except apostrophes
puncs = '!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~' # ' is removed
# combine all sentences into one document
sent_filtered = script.filtered.str.cat(sep=' ')

sent_filtered = re.sub(f"[{re.escape(puncs)}]+", '', sent_filtered)
sent_filtered = sent_filtered.lower()
sent_filtered = sent_filtered.replace('\d+', '') # remove numbers
sent_filtered = sent_filtered.replace('\n', '') # remove newlines
sent_filtered = sent_filtered.replace(' +', ' ') # remove double spaces
sent_filtered = sent_filtered.strip() # remove trailing and leading spaces

sent_filtered = sent_filtered.split(' ')
print(sent_filtered[:10])

Common1000Words = Counter(sent_filtered).most_common(1000)
Common1000Words = [word for word, count in Common1000Words]
Common1000Words = [word.capitalize() for word in Common1000Words]

print(f"20 most common words {Common1000Words[:20]}")
"They're" in Common1000Words # shows it was smart to keep contractions
Common1000Words # Now King is also in the list

# can even check some words we fear might be in the list
"King" in Common1000Words, "Queen" in Common1000Words, "Lord" in Common1000Words, "Lady" in Common1000Words, "Bran" in Common1000Words

['what', 'you', 'expect', "they're", 'savages', 'one', 'lot', 'steals', 'goat', 'from']
20 most common words ['The', 'You', 'And', 'Your', 'For', 'That', 'Have', 'Not', 'What', 'Are', 'Was', 'With', 'Will', 'Him', 'This', 'They', "Don't", 'Her', 'But', 'All']


(True, True, True, True, True)

In [376]:
"""
!4a. remove names from the 1000list
"""
commonWords = [word for word in Common1000Words if word not in unique_charnames]
len(commonWords) # 892

892

In [377]:
"""
!4. Filter script on the 1000 most used 3+ words in the script
"""
# remove the words that are in the 1000 most common words

script['filtered'] = script['filtered'].apply(lambda x: ' '.join([word for word in x.split() if word not in (commonWords)]))


In [378]:
"""
!5. Now we "tokenize" the script on only Capital words (hopefully containing names)
"""
# Regex for returning words that start with a capital letter, followed by a lowercase letter
regex = r'\b[A-Z][a-z]+\b'

CapitalWords = [np.unique(re.findall(regex, s)).tolist() for s in script.filtered]

In [379]:
"""
!6. 
"""

i = 0
print('idx\tShort Name\tLong Name')
for idx, words in enumerate(CapitalWords):
    l = 0
    for word in words:
        if it_is_a_character(word):
            l += 1
            print(f'{idx}\t{word}    \t{return_long_name(word)}')
            if l > i:
                i = l
                idxx = idx
                print(f'{idxx} # of chars: {l}')
print(f'At {idxx}, there was {i} characters')

idx	Short Name	Long Name
20	Bran    	Bran Stark
20 # of chars: 1
21	Bran    	Bran Stark
23	Bran    	Bran Stark
24	Lord    	Lord Galbart Glover
24	Night    	Night's Watch Deserter
24 # of chars: 2
24	Stark    	Stark Bannerman
24 # of chars: 3
27	Cat    	Catelyn Stark
28	The    	The Bear
29	Bran    	Bran Stark
30	Ned    	Ned Umber
33	The    	The Bear
33	White    	White Walker
34	White    	White Walker
37	Baratheon    	Baratheon Guard
37	Robert    	Robert Baratheon
39	King    	King Joffrey Baratheon Dwarf
41	Eddard    	Eddard Stark
41	Lord    	Lord Galbart Glover
41	Stark    	Stark Bannerman
41	Winterfell    	Winterfell Shepherd
41 # of chars: 4
44	Jon    	Jon Arryn
47	The    	The Bear
48	White    	White Walker
49	The    	The Bear
49	White    	White Walker
69	Bran    	Bran Stark
70	Lord    	Lord Galbart Glover
70	Stark    	Stark Bannerman
70	The    	The Bear
73	Stark    	Stark Bannerman
75	The    	The Bear
79	Jon    	Jon Arryn
82	Jon    	Jon Arryn
82	Robert    	Robert Baratheon
91	Jon    

In [380]:
[print(word, '   \t', return_long_name(word)) for word in CapitalWords[12376] if it_is_a_character(word)]

Beric    	 Beric Dondarrion
Cersei    	 Cersei Lannister
Ilyn    	 Ilyn Payne
Joffrey    	 Joffrey Baratheon
Lannister    	 Lannister Captain
Meryn    	 Meryn Trant
Myr    	 Myranda
Red    	 Red Keep Stableboy
The    	 The Bear
Thoros    	 Thoros of Myr
Tywin    	 Tywin Lannister
Walder    	 Walder Frey


[None, None, None, None, None, None, None, None, None, None, None, None]

In [267]:
# Observation, all characternanmes start with a capital letter in the
# original script, so we can filter our dataset down tremendously

# Regex for returning words that start with a capital letter, followed by a lowercase letter, and is more than 3 characters long
regex = r'\b[A-Z][a-z]{3,}\b'

# regex for returning words that start with a capital letter, followed by a lowercase letter,
# makes two consequent words into one word if they are both capitalized
# regex = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\b' # courtesy of GPT4

# Captures pairs of capitalized start words, removes 3 letter words [courtesy of GPT4]
# regex = r'\b[A-Z][a-z]{3,}(?:\s+[A-Z][a-z]{3,})?\b'

[np.unique(re.findall(regex, sentence)).tolist() for sentence in script.Sentence.iloc[20:30]]
# script.Sentence.iloc[2]

# NOTE: maybe names called multiple times in a sentence are actually multiple times important, 
# NOTE: So no stemming with np.unique

# NOTE: Link the short name "Bran" with the long name "Brandon Stark" and then see 
# how many unique names there are. Then we dont get doubles. (If we decide to use unique mentions) Not very likely tbh

[['Bran', 'Keep'],
 ['Bran'],
 ['Relax'],
 ['Bran', 'Quick'],
 ['Lord', 'Night', 'Stark', 'They', 'Watch'],
 [],
 [],
 [],
 [],
 ['Bran', 'Tell']]

In [59]:
[s for s in script.Sentence.tolist()]

# for idx, s in enumerate(script.Sentence.tolist()):
#     # if type(s) != str:
#         # print(idx, s)

# script.Sentence.iloc[18092] This was the NaN value for a sentence, removed up earlier now.

["What do you expect? They're savages. One lot steals a goat from another lot and before you know it, they're ripping each other to pieces.",
 "I've never seen wildlings do a thing like this. I've never seen a thing like this, not ever in my life.",
 'How close did you get?',
 'Close as any man would.',
 'We should head back to the wall.',
 'Do the dead frighten you?',
 "Our orders were to track the wildlings. We tracked them. They won't trouble us no more.",
 "You don't think he'll ask us how they died? Get back on your horse.",
 'Whatever did it to them could do it to us. They even killed the children.',
 "It's a good thing we're not children. You want to run away south, run away. Of course, they will behead you as a deserter … If I don't catch you first. Get back on your horse. I won't say it again.",
 'Your dead men seem to have moved camp.',
 'They were here.',
 'See where they went.',
 'What is it?',
 "It's …",
 "Go on. Father's watching.",
 'And your mother.',
 'Fine work, as al

In [183]:
# Filter so each list becomes the resultant long name of the short name present in the list.
# The list will contain capitalized words that arent names so we first filter those out

# Filter out all words that are not names
CapitalWords = [np.unique(re.findall(regex, sentence)).tolist() for sentence in script.Sentence]

i = 0
print('idx\tShort Name\tLong Name')
for idx, words in enumerate(CapitalWords):
    l = 0
    for word in words:
        if it_is_a_character(word):
            l += 1
            print(f'{idx}\t{word}    \t{return_long_name(word)}')
            if l > i:
                i = l
                idxx = idx
                print(f'{idxx} # of chars: {l}')
print(f'At {idxx}, there was {i} characters')


idx	Short Name	Long Name
20	Bran    	Bran Stark
20 # of chars: 1
21	Bran    	Bran Stark
23	Bran    	Bran Stark
24	Lord    	Lord Galbart Glover
24	Night    	Night's Watch Deserter
24 # of chars: 2
24	Stark    	Stark Bannerman
24 # of chars: 3
29	Bran    	Bran Stark
33	White    	White Walker
34	White    	White Walker
37	Baratheon    	Baratheon Guard
37	Robert    	Robert Baratheon
39	King    	King Joffrey Baratheon Dwarf
41	Eddard    	Eddard Stark
41	Lord    	Lord Galbart Glover
41	North    	Northman Rioter
41	Stark    	Stark Bannerman
41 # of chars: 4
41	Winterfell    	Winterfell Shepherd
41 # of chars: 5
48	White    	White Walker
49	White    	White Walker
69	Bran    	Bran Stark
70	Lord    	Lord Galbart Glover
70	Stark    	Stark Bannerman
73	Stark    	Stark Bannerman
82	Hand    	Handmaid
82	Robert    	Robert Baratheon
83	Hand    	Handmaid
93	Winterfell    	Winterfell Shepherd
94	North    	Northman Rioter
96	Lord    	Lord Galbart Glover
96	Tyrion    	Tyrion Lannister
115	Arya    	Arya Sta

In [188]:
# We observe the Sentence with most names in it for testing.

[print(word, '   \t', return_long_name(word)) for word in CapitalWords[12376] if it_is_a_character(word)]

"""
Alton Lannister
38           Cersei Lannister
113           Jaime Lannister
132           Kevan Lannister
152          Lancel Lannister
153         Lannister Captain
154         Lannister Captain

we see even though lannister is the last name, we filter only on first names, our func still 
flunks because there exists lannister guard (first name lannister)

# Cahnged the "return_long_name" func to include the split of the long name
# if short_name in long_name.split(' ')[0]:
# so now it returns what made the func it_is_a_character return true, So Alton doesnt get more edges than deserved
"""

Beric    	 Beric Dondarrion
Cersei    	 Cersei Lannister
Ilyn    	 Ilyn Payne
Joffrey    	 Joffrey Baratheon
Lannister    	 Lannister Captain
Meryn    	 Meryn Trant
Thoros    	 Thoros of Myr
Tywin    	 Tywin Lannister
Walder    	 Walder Frey


'\nAlton Lannister\n38           Cersei Lannister\n113           Jaime Lannister\n132           Kevan Lannister\n152          Lancel Lannister\n153         Lannister Captain\n154         Lannister Captain\n\nwe see even though lannister is the last name, we filter only on first names, our func still \nflunks because there exists lannister guard (first name lannister)\n\n# Cahnged the "return_long_name" func to include the split of the long name\n# if short_name in long_name.split(\' \')[0]:\n# so now it returns what made the func it_is_a_character return true, So Alton doesnt get more edges than deserved\n'

In [265]:
[print(word, '   \t', return_long_name(word)) for word in script.filtered[idxx] if it_is_a_character(word)]
"""
Aerys    	 Aerys II Targaryen
Baratheon    Baratheon Guard
Grand    	 Grand Maester Pycelle
King    	 King Joffrey Baratheon Dwarf

here we see another problem, the word King will be used alot, but it doesnt refer to a random dward each time,
so there should be some words we need to filter out from the text, like King and Queen and such - or filter the 
characterlist down to only some of the more popular ones, like the main characters.
"""
idxx

o    	 Aeron Greyjoy
b    	 Baby Sam
s    	 Aerys II Targaryen
e    	 Aeron Greyjoy
s    	 Aerys II Targaryen
s    	 Aerys II Targaryen
e    	 Aeron Greyjoy
d    	 Addam Marbrand
w    	 Bowen Marsh
i    	 Alliser Thorne
t    	 Alton Lannister
h    	 Akho
l    	 Alliser Thorne
o    	 Aeron Greyjoy
v    	 Davos Seaworth
e    	 Aeron Greyjoy
d    	 Addam Marbrand
w    	 Bowen Marsh
a    	 Addam Marbrand
t    	 Alton Lannister
c    	 Archmaester Marwyn
h    	 Akho
p    	 Captain of the Archers
e    	 Aeron Greyjoy
o    	 Aeron Greyjoy
p    	 Captain of the Archers
l    	 Alliser Thorne
e    	 Aeron Greyjoy
b    	 Baby Sam
u    	 Anguy
r    	 Aeron Greyjoy
n    	 Aeron Greyjoy
t    	 Alton Lannister
h    	 Akho
e    	 Aeron Greyjoy
i    	 Alliser Thorne
r    	 Aeron Greyjoy
s    	 Aerys II Targaryen
k    	 Akho
i    	 Alliser Thorne
n    	 Aeron Greyjoy
b    	 Baby Sam
l    	 Alliser Thorne
a    	 Addam Marbrand
c    	 Archmaester Marwyn
k    	 Akho
e    	 Aeron Greyjoy
n    	 Aeron Greyjoy

8736

In [240]:
"""
Can pull a list of 1000 common words, or create our own GoT 1000 most common words, and filter out those words
As we will filter out words that are less than 4 characters long, we can do the same for the common words, so we get even more words filtered out.
"""

# We want to keep contractions,so we remove all punctuation except apostrophes
puncs = '!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~' # ' is removed
# combine all sentences into one document
sent_filtered = script.Sentence.str.cat(sep=' ')

sent_filtered = re.sub(f"[{re.escape(puncs)}]+", '', sent_filtered)
sent_filtered = sent_filtered.lower()
sent_filtered = sent_filtered.replace('\d+', '') # remove numbers
sent_filtered = sent_filtered.replace('\n', '') # remove newlines
sent_filtered = sent_filtered.replace(' +', ' ') # remove double spaces
sent_filtered = sent_filtered.strip() # remove trailing and leading spaces

sent_filtered = sent_filtered.split(' ')
print(sent_filtered[:10])

Common1000Words = Counter(sent_filtered).most_common(1000)
Common1000Words = [word for word, count in Common1000Words]
Common1000Words = [word.capitalize() for word in Common1000Words]

print(f"10 most common words {Common1000Words[:10]}")
"They're" in Common1000Words # shows it was smart to keep contractions

['what', 'do', 'you', 'expect', "they're", 'savages', 'one', 'lot', 'steals', 'a']
10 most common words ['The', 'You', 'To', 'I', 'A', 'And', 'Of', 'Your', 'My', 'It']


True

In [148]:
# script.Sentence.iloc[idxx]
characters[characters.str.contains('Lannister')]

6             Alton Lannister
38           Cersei Lannister
113           Jaime Lannister
132           Kevan Lannister
152          Lancel Lannister
153         Lannister Captain
154         Lannister Captain
155       Lannister Guardsman
156       Lannister Messenger
157           Lannister Scout
202          Martyn Lannister
349          Tyrion Lannister
350           Tywin Lannister
369          Willem Lannister
382    Young Cersei Lannister
Name: characterName, dtype: object

In [95]:

# np.argmax(['Keep' in long_name for long_name in characters.tolist()])
# characters.iloc[263]

return_long_name('Stark', include_idx=False)

# NOTE: How do we know which Stark is 

'Arya Stark'