In [1]:
import pandas as pd
import json
import numpy as np
import re
import pickle

In [50]:
# Load the three raw inputs used throughout the analysis:
#   - the script lines (CSV)
#   - the character metadata (JSON, records under the 'characters' key)
#   - the episode data (JSON, records under the 'episodes' key)
df_raw_script = pd.read_csv('data/Game_of_Thrones_Script.csv')

with open('data/characters.json') as characters_file:
    json_characters = json.load(characters_file)
with open('data/episodes.json') as episodes_file:
    json_episodes = json.load(episodes_file)

# Build the raw frames once both JSON documents are parsed.
df_raw_characters = pd.DataFrame(json_characters['characters'])
df_raw_episodes = pd.DataFrame(json_episodes['episodes'])

#### Preprocess/clean the raw datasets

Goal of synchronizing the character names across the datasets:  
We want the datasets to be linked together. For the sentiment analysis of the script, we want Samwell Tarly in the script
to be matched with Samwell Tarly in the episode scenes dataset, so that his sentiment can be included in the network.
The network itself can use all characters from the scenes dataset, which makes it a bigger network, but only the characters that also appear in the script
can have a sentiment score included in the network. This is acceptable, as we expect the biggest characters to be in both datasets, while smaller, insignificant
characters aren't in both.

There is only one step in this process:
1. The script dataset uses many inconsistent names for the same character, so they are replaced according to `changed_names.csv`.  

No further steps are needed. When adding network attributes, the sentiment score is only included for characters that appear in both `df_characters` and `df_script.Name`.  


In [3]:
### script
# Work on an independent (deep) copy so the raw script frame stays untouched;
# no cleaning is applied to the script at this point.
df_script = df_raw_script.copy(deep=True)

In [38]:
### characters
# Keep only the characters that have a characterLink, and reduce them to their names.
has_link = df_raw_characters['characterLink'].notna()
linked_names = df_raw_characters.loc[has_link, 'characterName'] # ! Capitalized

# Show the first few names that occur more than once (every occurrence is listed).
repeated_names = linked_names[linked_names.duplicated(keep=False)]
print(repeated_names.sort_values().head(6))

# Final character list: the first occurrence of each name.
df_characters = linked_names.drop_duplicates(keep='first')

74      Goldcloak
75      Goldcloak
93       Handmaid
94       Handmaid
98    High Septon
99    High Septon
Name: characterName, dtype: object


In [5]:
### episodes
# One row per scene: explode the per-episode scene lists and expand each scene
# record into its own columns.
scene_records = df_raw_episodes['scenes'].explode().tolist()
df_scenes = pd.DataFrame(scene_records)

# For every scene, reduce the list of character records to a plain list of names.
characters = [
    [member['name'] for member in scene_cast]
    for scene_cast in df_scenes['characters']
]

df_scenes['characters'] = characters #! Capitalized
# df_scenes = df_scenes[['sceneStart','sceneEnd','location','subLocation','characters']]


### **Look at problematic characters that have multiple names in the script**

In [6]:
# Group the distinct speaker names of the raw script by how many space-separated
# words they contain (one, two or three). Comparing these groups shows which
# characters appear under several names, and which of them are the biggest offenders.
script_names = df_raw_script['Name'].dropna().unique()
words_per_name = {script_name: len(script_name.split(' ')) for script_name in script_names}

single_names = [script_name for script_name, n_words in words_per_name.items() if n_words == 1]
double_names = [script_name for script_name, n_words in words_per_name.items() if n_words == 2]
triple_names = [script_name for script_name, n_words in words_per_name.items() if n_words == 3]

In [7]:
# For every one-word name, list each two-word name whose first word contains it,
# together with how many script lines are attributed to the one-word name.
# NOTE: `in` is a substring test on the first word, so a short name also matches
# inside a longer first word (e.g. 'sam' matches 'samwell ...').

# Count the lines per name once instead of rescanning the script for every pair.
line_counts = df_raw_script['Name'].value_counts()
first_words = [(d_name, d_name.split(' ')[0]) for d_name in double_names]

# Collect all matches first and build the frame in one go; growing a DataFrame
# row by row with .loc copies data on every insert.
match_records = [
    (name, int(line_counts.get(name, 0)), d_name)
    for name in single_names
    for d_name, first_word in first_words
    if name in first_word
]
duplicate_names = pd.DataFrame(match_records, columns=['single_name', 'count', 'in_double_name'])

duplicate_names.sort_values(by='count', ascending=False).iloc[0:25]

Unnamed: 0,single_name,count,in_double_name
18,sam,399,sam tarly
47,daario,166,daario naharis
87,sandor,129,sandor clegane
46,beric,92,beric dondarrion
3,guard,77,guard captain
37,roose,77,roose bolton
25,loras,75,loras tyrell
44,barristan,67,barristan selmy
29,lancel,67,lancel lannister
48,walder,55,walder frey


In [51]:
# Mapping from the names used in the script to the names we want them changed to.
# The mapping file was put together manually by our GoT expert Nicolaj.
### Changed names
changed_names = pd.read_csv('data/changed_names.csv') #! lowercase

# old_name -> new_name lookup (each old_name is a key, its new_name the value)
dict_changed_names = dict(zip(changed_names['old_name'], changed_names['new_name']))

changed_names.head()

Unnamed: 0,old_name,new_name
0,sam,samwell tarly
1,sam tarly,samwell tarly
2,daario,daario naharis
3,sandor,sandor clegane
4,beric,beric dondarrion


In [52]:
# Rename the speakers in the script so they correspond to the "characters" dataset
# (using the changed_names.csv mapping). df_script is the copy made further up.
# The replacements are applied one after another, in mapping order.
renamed_speakers = df_script['Name']
for old_name, new_name in dict_changed_names.items():
    renamed_speakers = renamed_speakers.replace(old_name, new_name)
df_script['Name'] = renamed_speakers

df_script.iloc[3310:3315] # we see sam has been changed to samwell tarly and edd to eddison tollett, correct.

Unnamed: 0,Release Date,Season,Episode,Episode Title,Name,Sentence
3310,2012-04-01,Season 2,Episode 1,The North Remembers,eddison tollett,"I was born in a place like this. Later, I fell..."
3311,2012-04-01,Season 2,Episode 1,The North Remembers,samwell tarly,Are those girls?
3312,2012-04-01,Season 2,Episode 1,The North Remembers,eddison tollett,Craster's daughters.
3313,2012-04-01,Season 2,Episode 1,The North Remembers,samwell tarly,I haven't seen a girl in six months.
3314,2012-04-01,Season 2,Episode 1,The North Remembers,eddison tollett,I'd keep on not seeing them if I were you.


In [53]:
# Re-run the single-name / double-name comparison against the renamed script:
# for every one-word name, list each two-word name whose first word contains it,
# together with how many lines in df_script are still attributed to the one-word name.
# NOTE: `in` is a substring test on the first word, so short names also match
# inside longer first words (e.g. 'men' matches 'tommen lannister').

# Count the lines per name once instead of rescanning the script for every pair;
# names that no longer occur after the renaming get a count of 0.
remaining_counts = df_script['Name'].value_counts()
double_first_words = [(d_name, d_name.split(' ')[0]) for d_name in double_names]

# Collect all matches first and build the frame in one go; growing a DataFrame
# row by row with .loc copies data on every insert.
remaining_records = [
    (name, int(remaining_counts.get(name, 0)), d_name)
    for name in single_names
    for d_name, first_word in double_first_words
    if name in first_word
]
duplicate_names = pd.DataFrame(remaining_records, columns=['single_name', 'count', 'in_double_name'])

duplicate_names.sort_values(by='count', ascending=False).iloc[0:25]

Unnamed: 0,single_name,count,in_double_name
3,guard,78,guard captain
28,men,32,tommen lannister
53,olly,25,lollys stokeworth
52,olly,25,ollys mother
30,meryn,19,meryn trant
7,all,17,alliser thorn
8,all,17,all together
9,all,17,alliser throne
6,all,17,all three
5,all,17,alliser thorne


In [54]:
# Persist the script with the renamed speakers (the row index is not written).
updated_script_path = 'data/Game_of_Thrones_Script_updated.csv'
df_script.to_csv(updated_script_path, index=False)

In [55]:
# Rename targets from changed_names.csv that do NOT occur (lower-cased) in df_characters.
print(set(changed_names.new_name) - set(df_characters.str.lower()))
# An empty set would mean every new name is included in the list of characters.
# The recorded run printed six leftovers ('guard' and five '... dwarf' names),
# so those rename targets have no counterpart in df_characters.

{'guard', 'iron dwarf', 'fire dwarf', 'wolf dwarf', 'gay dwarf', 'incest dwarf'}


In [None]:
# stripped version?

### **Look at problematic character names in the episodes dataset**

In [18]:
# Number of distinct character names referenced across all scenes.
distinct_scene_names = df_scenes['characters'].explode().unique()
len(distinct_scene_names)
# There are clearly too many character names being referenced in the scenes dataset,
# which makes it tough to link up with the characters dataset.

578

In [27]:
# Which of the "old names" from our changed_names.csv also exist in the scenes dataset?
# The distinct scene names are computed once and reused for every lookup.
names_in_scenes = df_scenes.characters.explode().unique()
old_names_found = [candidate for candidate in dict_changed_names if candidate in names_in_scenes]
for found_name in old_names_found:
    print(found_name)
# None


### **Filter out the names in the character dataset that don't exist in the others**

In [57]:
# List every character from df_characters whose lower-cased name never appears
# as a speaker in df_script.Name, together with a running count.

# Sanity check: the character list must not contain duplicate names.
# (Previously a bare comparison whose result was discarded, so it checked nothing.)
assert df_characters.is_unique, "df_characters contains duplicate names"

# Build the speaker lookup once instead of recomputing unique() for every character.
script_speakers = set(df_script['Name'].unique())

missing_from_script = [
    character for character in df_characters
    if character.lower() not in script_speakers
]
for i, character in enumerate(missing_from_script, start=1):
    print(character, i)

Aerys II Targaryen 1
Akho 2
Alton Lannister 3
Alys Karstark 4
Amory Lorch 5
Archmaester Marwyn 6
Areo Hotah 7
Armeca 8
Arthur Dayne 9
Baby Sam 10
Baratheon Guard 11
Biter 12
Black Walder Rivers 13
Brea 14
Brienne of Tarth 15
Brother Ray 16
Brynden Tully 17
Captain of the Archers 18
Catspaw Assassin 19
Child of the Forest 20
Citadel Maester 21
Colen of Greenpools 22
Dagmer Cleftjaw 23
Dickon Tarly 24
Dongo the Giant 25
Donnel Waynwood 26
Dontos Hollard 27
Doran Martell 28
Dothraki Bloodrider #1 29
Dothraki Bloodrider #2 30
Drennan 31
Ellaria Sand 32
Eon Hunter 33
Faceless Man 34
Gendry 35
Gerold Hightower 36
Goldcloak 37
Goldcloak #1 38
Goldcloak #2 39
Great Master #1 40
Great Master #2 41
Great Master #3 42
Great Master #4 43
Great Master #5 44
Great Master #6 45
Great Master #7 46
Greizhen mo Ullhor 47
Hallyne 48
Handmaid 49
Harald Karstark 50
High Priestess 51
High Septon 52
High Sparrow 53
Hoster Tully 54
Howland Reed 55
Hugh of the Vale 56
Ilyn Payne 57
Jaqen H'ghar 58
Jaqen's Disg