In [1]:
import pandas as pd
import json
import numpy as np
import re
import pickle

In [2]:
# Load the three raw datasets used in the analysis: the script (CSV) plus the
# character and episode metadata (JSON).
df_raw_script = pd.read_csv('data/Game_of_Thrones_Script.csv')

# The JSON files are opened explicitly as UTF-8 so that any non-ASCII text decodes
# the same way on every platform instead of depending on the locale's default.
with open('data/characters.json', encoding='utf-8') as f:
    json_characters = json.load(f)
df_raw_characters = pd.DataFrame(json_characters['characters'])

with open('data/episodes.json', encoding='utf-8') as f:
    json_episodes = json.load(f)
df_raw_episodes = pd.DataFrame(json_episodes['episodes'])

#### Preprocess/clean the raw datasets

The idea is to include only names that occur in all datasets, and to correct them so that each character has the same name everywhere.  
1. First, replace the names in each dataset with the names we have outlined in the file "changed_names.csv".
2. Secondly, collect all names from each dataset into a "set".
3. Thirdly, find the intersection between all sets. This is now the list of names we can use in each dataset.
4. Fourthly, filter all datasets to only include dialogue, characters and scenes in which these characters appear.

In [3]:
### script
# The script needs no cleaning at this stage. Work on a separate (deep) copy so
# that later edits to df_script do not touch the raw frame.
df_script = df_raw_script.copy(deep=True)

In [38]:
### characters
# Build the list of character names: keep only rows that have a characterLink
# and reduce the frame to the characterName column.
has_link = df_raw_characters['characterLink'].notna()
df_characters = df_raw_characters.loc[has_link, 'characterName'] # ! Capitalized

# Show the first six entries (sorted by name) whose name occurs more than once.
is_repeated = df_characters.duplicated(keep=False)
print(df_characters[is_repeated].sort_values().iloc[:6])

# Keep only the first occurrence of every repeated name.
df_characters = df_characters.drop_duplicates(keep='first')

74      Goldcloak
75      Goldcloak
93       Handmaid
94       Handmaid
98    High Septon
99    High Septon
Name: characterName, dtype: object


In [5]:
### episodes
# Explode the per-episode 'scenes' lists into one entry per scene, then expand the
# scene records into a frame with one column per scene feature.
scene_records = df_raw_episodes.explode('scenes')['scenes'].tolist()
df_scenes = pd.DataFrame(scene_records)

# For every scene, reduce its list of character records to a plain list of names.
characters = [
    [character['name'] for character in scene_characters]
    for scene_characters in df_scenes['characters']
]

df_scenes['characters'] = characters #! Capitalized
# Optional column subset (currently disabled):
# df_scenes = df_scenes[['sceneStart','sceneEnd','location','subLocation','characters']]


### **First look at the character dataset and filter out names in there that don't exist elsewhere**

In [42]:
# The three sources of character names compared in this notebook:
#   df_characters, df_scenes.characters, df_script.Name

# Sanity check: every name in df_characters occurs exactly once.
df_characters.is_unique # True



True

### **Look at problematic characters that have multiple names in the script**

In [6]:
# Bucket the speaker names of the raw script by the number of space-separated
# words they contain (one, two or three). Comparing the buckets shows which
# characters appear under several names and, most importantly, which ones are the
# biggest offenders.
script_names = df_raw_script.Name.dropna().unique()
single_names = [name for name in script_names if name.count(' ') == 0]
double_names = [name for name in script_names if name.count(' ') == 1]
triple_names = [name for name in script_names if name.count(' ') == 2]

In [7]:
# For every one-word name, list the two-word names whose first word contains it,
# together with the number of lines in the raw script attributed to the one-word name.
# NOTE(review): this is a substring test, so it also pairs e.g. 'men' with
# 'tommen lannister'; confirm whether a prefix or an exact match is intended.

# Lines per speaker, counted once up front instead of once per matching pair.
line_counts = df_raw_script.Name.value_counts()

# Collect all matches first and build the frame in one go; growing a DataFrame
# row by row with .loc re-allocates on every insert.
duplicate_names = pd.DataFrame(
    [
        (name, int(line_counts.get(name, 0)), d_name)
        for name in single_names
        for d_name in double_names
        if name in d_name.split(' ')[0]
    ],
    columns=['single_name', 'count', 'in_double_name'],
)

duplicate_names.sort_values(by='count', ascending=False).iloc[0:25]

Unnamed: 0,single_name,count,in_double_name
18,sam,399,sam tarly
47,daario,166,daario naharis
87,sandor,129,sandor clegane
46,beric,92,beric dondarrion
3,guard,77,guard captain
37,roose,77,roose bolton
25,loras,75,loras tyrell
44,barristan,67,barristan selmy
29,lancel,67,lancel lannister
48,walder,55,walder frey


In [8]:
# Mapping from the names used in the script to the names we need them changed to.
# The mapping was compiled manually by our GoT expert Nicolaj.
### Changed names
changed_names = pd.read_csv('data/changed_names.csv') #! lowercase
# old_name -> new_name lookup: each row's old_name is the key, its new_name the value
dict_changed_names = changed_names.set_index('old_name')['new_name'].to_dict()

changed_names.head(5)

Unnamed: 0,old_name,new_name
0,sam,samwell tarly
1,sam tarly,samwell tarly
2,daario,daario naharis
3,sandor,sandor clegane
4,beric,beric dondarrion


In [9]:
# Bring the speaker names in the script in line with the "characters" dataset by
# applying every old -> new pair from changed_names.csv, one pair at a time.
# df_script is the working copy of the raw script made further up.
for old_name, new_name in dict_changed_names.items():
    df_script['Name'] = df_script['Name'].replace(old_name, new_name)

df_script.iloc[3310:3315] # we see sam has been changed to samwell tarly and edd to eddison tollett, correct.

Unnamed: 0,Release Date,Season,Episode,Episode Title,Name,Sentence
3310,2012-04-01,Season 2,Episode 1,The North Remembers,eddison tollett,"I was born in a place like this. Later, I fell..."
3311,2012-04-01,Season 2,Episode 1,The North Remembers,samwell tarly,Are those girls?
3312,2012-04-01,Season 2,Episode 1,The North Remembers,eddison tollett,Craster's daughters.
3313,2012-04-01,Season 2,Episode 1,The North Remembers,samwell tarly,I haven't seen a girl in six months.
3314,2012-04-01,Season 2,Episode 1,The North Remembers,eddison tollett,I'd keep on not seeing them if I were you.


In [10]:
# Repeat the overlap check on the cleaned script: for every one-word name, list the
# two-word names whose first word contains it, together with the number of lines
# in df_script still attributed to the one-word name.
# NOTE(review): this is a substring test, so it also pairs e.g. 'men' with
# 'tommen lannister'; confirm whether a prefix or an exact match is intended.

# Lines per speaker, counted once up front; names that were renamed away are
# absent from the counts and therefore reported as 0.
line_counts = df_script.Name.value_counts()

# Collect all matches first and build the frame in one go; growing a DataFrame
# row by row with .loc re-allocates on every insert.
duplicate_names = pd.DataFrame(
    [
        (name, int(line_counts.get(name, 0)), d_name)
        for name in single_names
        for d_name in double_names
        if name in d_name.split(' ')[0]
    ],
    columns=['single_name', 'count', 'in_double_name'],
)

duplicate_names.sort_values(by='count', ascending=False).iloc[0:25]

Unnamed: 0,single_name,count,in_double_name
3,guard,78,guard captain
28,men,32,tommen lannister
53,olly,25,lollys stokeworth
52,olly,25,ollys mother
30,meryn,19,meryn trant
7,all,17,alliser thorn
8,all,17,all together
9,all,17,alliser throne
6,all,17,all three
5,all,17,alliser thorne


In [11]:
# Persist the script with the corrected names (the row index is not written).
updated_script_path = 'data/Game_of_Thrones_Script_updated.csv'
df_script.to_csv(updated_script_path, index=False)

In [12]:
# Replacement targets from changed_names.csv that have no match among the
# lower-cased character names. An empty set would mean that every new name is a
# known character; anything printed here is a target without a match.
new_names = set(changed_names.new_name)
known_characters = set(df_characters.str.lower())
print(new_names - known_characters)

{'guard', 'iron dwarf', 'fire dwarf', 'wolf dwarf', 'gay dwarf', 'incest dwarf'}


In [13]:
# Speaker names in the script (lower-cased) that have no match among the
# lower-cased character names.
script_speakers = set(df_script.Name.str.lower())
known_characters = set(df_characters.str.lower())
script_speakers - known_characters

{'aemon',
 'aeron',
 'aerson',
 'ahsa',
 'all',
 'all three',
 'all together',
 'allister',
 'alton',
 'amory',
 'announcer',
 'archers',
 'archmaester',
 'areo',
 'armory',
 'assassin',
 'attendant',
 'axell florent',
 'banker',
 'barriston',
 'belicho',
 'black haired prostitute',
 'black walder',
 'blackfish',
 'blacksmith',
 'blonde prostitute',
 'bloodrider',
 'bolton bannerman',
 'bolton guard',
 'bolton officer',
 'both',
 'boy',
 'braavosi man',
 'brans voice',
 'brienne',
 'brothel keeper',
 'brother',
 'brothers',
 'bryndel',
 'brynden',
 'buer',
 'buyer',
 'bystanders',
 'camello',
 'captain',
 'cassel',
 'child',
 'child of forest',
 'clarenzo',
 'client',
 'cohollo',
 'cold',
 'colen',
 'cooper',
 'crayah',
 'cressen',
 'crowd',
 'cuard',
 'dagmer',
 'daughter',
 'davos',
 'deanerys targarian',
 'derryk',
 'dickon',
 'dim dalba',
 'dirah',
 'doloroud edd',
 'dolorous',
 'dolrous edd',
 'donnel',
 'doran',
 'dornish lord',
 'dornish prince',
 'dothraki',
 'dothraki man',
 '

### **Look at problematic character names in the episodes dataset**

In [18]:
# Number of distinct character names referenced across all scenes.
# There are clearly too many names in the scenes dataset, which makes it tough to
# link up with the characters dataset.
len(df_scenes.characters.explode().unique())

578

In [27]:
# Which of the "old names" from changed_names.csv also occur in the scenes dataset?
# The scene names are marked as capitalized and changed_names.csv as lowercase
# earlier in the notebook, so a case-sensitive test cannot match them; both sides
# are lower-cased before comparing.
# The distinct scene names are collected once instead of on every loop iteration;
# dropna() skips scenes that list no characters.
scene_names_lower = set(
    df_scenes.characters.explode().dropna().str.lower()
)
for old_name in dict_changed_names:
    if str(old_name).lower() in scene_names_lower:
        print(old_name)
# NOTE(review): the earlier conclusion ("None, so we just remove the names that are
# not in the characters dataset") came from a case-sensitive comparison; re-confirm
# it against the output of this cell.
