We use this notebook to reshape the original data into an output JSON file (got.json) that is consumed by our D3 visualization.
The input data comes from the awesome datasets created by Jeffrey Lancaster (github.com/jeffreylancaster).

In [None]:
#Importing libraries
import pandas as pd
import psycopg2 as pg
import pandas.io.sql as psql
import numpy as np;
import json
from pandas.io.json import json_normalize  

In [None]:
# Import GOT Houses families
# Can be found in github.com/jeffreylancaster/game-of-thrones/blob/master/data/characters-groups.json
# The encoding is stated explicitly: JSON is UTF-8 by specification, whereas open()
# would otherwise fall back to the platform's default encoding.
with open("./characters-groups.json", encoding="utf-8") as f:
    d = json.load(f)
# One row per entry of the top-level "groups" array.
characters_gr = json_normalize(d['groups'])
print(characters_gr.shape)
# NOTE: a notebook only renders the last expression of a cell, so this preview is not displayed.
characters_gr.head(3)

# Import colors associated to characters
# Can be found in github.com/jeffreylancaster/game-of-thrones/blob/master/data/colors.json
with open("./colors.json", encoding="utf-8") as f:
    d = json.load(f)
# One row per entry of the top-level "colors" array.
colors_or = json_normalize(d['colors'])
print(colors_or.shape)
colors_or.head(3)


In [None]:
# Import episodes.json. Main json file.
# Can be found in github.com/jeffreylancaster/game-of-thrones/blob/master/data/episodes.json
# The encoding is stated explicitly: JSON is UTF-8 by specification, whereas open()
# would otherwise fall back to the platform's default encoding.
with open("./episodes.json", encoding="utf-8") as f:
    d = json.load(f)
# One row per entry of the top-level "episodes" array.
episodes = json_normalize(d['episodes'])
print(episodes.shape)
episodes.head(3)

In [None]:
#Most of the interesting information of each episode is within the scenes array.
#This cell flattens it into new_df: one row per (episode, character) holding the
#character's total screen time, its order of first appearance and a death flag.

#Positions are rescaled so that every episode spans the same range
#(65 is taken as the maximum number of characters in an episode).
POSITION_RANGE = 65


def _to_timedelta(timestamp):
    """Convert a scene timestamp to a pandas Timedelta."""
    #`unit` only applies to numeric input; pandas rejects it when the input is a string.
    #NOTE(review): assumes string timestamps are clock-style ("H:MM:SS") - confirm against episodes.json.
    if isinstance(timestamp, str):
        return pd.to_timedelta(timestamp)
    return pd.to_timedelta(timestamp, unit='s')


def _character_appearances(scenes):
    """Return one record per character appearance across `scenes`, in order.

    Each record holds the character name, the duration (in seconds) of the
    scene it appears in, a running appearance counter ("position") and a
    death flag. The computed duration is also written back onto every scene
    and character dictionary.
    """
    appearances = []
    order = 0
    for scene in scenes:
        #Here we calculate how much time each character appears in each scene.
        #`.seconds` is the seconds component of the difference (days are not included).
        elapsed = _to_timedelta(scene["sceneEnd"]) - _to_timedelta(scene["sceneStart"])
        scene["duration"] = elapsed.seconds
        for character in scene["characters"]:
            character["duration"] = scene["duration"]
            #A character record carrying an "alive" key is flagged as dying in this episode.
            #NOTE(review): the value of the key is not inspected - confirm it is only set on death.
            appearances.append({
                "name": character["name"],
                "duration": character["duration"],
                "position": order,
                "death": 1 if "alive" in character else 0,
            })
            order += 1
    return appearances


episodes_mod = episodes.copy()
#Not used by the cells of this notebook; kept so that code relying on these names keeps working.
char_fin = []
char_size = []
episode_frames = []
for ix in episodes_mod.index:
    appearances = _character_appearances(episodes_mod.at[ix, "scenes"])
    if not appearances:
        #An episode without any character has nothing to aggregate.
        continue

    #We group by name of character to get how much time in each episode a character appears
    #and the moment in which it first appears.
    temp_dat = pd.DataFrame(appearances)
    temp_dat = temp_dat.groupby(["name"], as_index=False).agg(
        {"duration": "sum", "position": "min", "death": "max"}
    )

    #Replace the raw appearance counter by the rank of first appearance,
    #rescaled to the common POSITION_RANGE.
    temp_dat = (
        temp_dat.sort_values(["position"], ascending=True)
        .drop(["position"], axis=1)
        .reset_index(drop=True)
    )
    nb_characters = len(temp_dat)
    temp_dat.insert(0, "position", np.arange(nb_characters) * (POSITION_RANGE / nb_characters))

    #add other relevant columns
    temp_dat["episode"] = ix
    temp_dat["episodeTitle"] = episodes_mod.at[ix, "episodeTitle"]
    temp_dat["seasonNum"] = episodes_mod.at[ix, "seasonNum"]
    temp_dat["episodeDescription"] = episodes_mod.at[ix, "episodeDescription"]
    temp_dat["nbcharacters"] = nb_characters

    episode_frames.append(temp_dat)

#Concatenating once is linear in the number of episodes and does not rely on
#DataFrame.append, which was removed in pandas 2.0.
new_df = pd.concat(episode_frames, ignore_index=True) if episode_frames else pd.DataFrame()
#Format to minutes.
new_df["time_in_episode"] = np.round(new_df["duration"] / 60, 1)


In [None]:
#Use the colors of colors.json for each character.
#Only colors that have a "class" are kept; the first element of each "class" value
#becomes the key used to match a color with a family name.
colors = colors_or.dropna(subset=["class"]).copy()
colors["class"] = colors["class"].map(lambda classes: classes[0])

characters_gr["name"] = characters_gr.name.str.lower()
#Merge colors and character families in characters_gr
#This is an inner join (the pandas default): groups without a matching color class are dropped.
#NOTE(review): the families hard-coded further down only receive their colour if they
#survive this merge - confirm they all have an entry in colors.json.
characters_gr = characters_gr.merge(colors[["class", "webSafe"]], right_on="class", left_on="name")

#we add to new_df the family and colour of the character.
#Groups are applied in order, so a character listed in several groups keeps the last one.
for ix, group in characters_gr.iterrows():
    print(group)
    is_member = new_df["name"].isin(group["characters"])
    new_df.loc[is_member, "family"] = group["name"]
    new_df.loc[is_member, "colour"] = group["webSafe"]

#Characters without a family fall into the "include" bucket.
#The column is reassigned rather than filled in place through attribute access:
#chained in-place updates do not reach new_df when pandas Copy-on-Write is enabled.
new_df["family"] = new_df["family"].fillna("include")
#Hard-coded colours override whatever colors.json provided for these families.
new_df.loc[new_df["family"] == "include", "colour"] = "#CC9999"
new_df.loc[new_df["family"] == "white walkers", "colour"] = "#87CEFA"
new_df.loc[new_df["family"] == "night's watch", "colour"] = "#000000"
new_df.loc[new_df["family"] == "wildlings", "colour"] = "#66CCCC"
#check
#new_df.loc[new_df["colour"].isna()]

In [None]:
#Result dataframe that we will export as a JSON file.
#Preview of its first 10 rows (shown because it is the last expression of the cell).
new_df.head(10)

In [None]:
#Export the result as a JSON array with one object per row (orient='records').
import os

#to_json does not create missing directories, so make sure the target one exists.
os.makedirs("./data_out", exist_ok=True)
new_df.to_json("./data_out/got.json",orient='records')