# Data cleaning

In [1]:
import json
import numpy as np
import pandas as pd
from progressbar import ProgressBar

This notebook reformats the data that we created in `data-creation.ipynb` so that it can be read easily from JavaScript and is more intuitive to work with when creating visualizations.

---
## Network data
Here, we create a file `network_data.json` that follows the structure of the network data seen <a href="https://raw.githubusercontent.com/holtzy/D3-graph-gallery/master/DATA/data_network.json">here</a>. This makes the network data easy to parse when it is read in, which simplifies building our network visualization. The only difference is that we separate the connections by decade: for each decade, we only keep connections between Oscar nominees and winners who were nominated for their Academy Awards in that decade.

In [71]:
# load the CSV inputs; the first column of every file becomes the index
actors_to_id_df = pd.read_csv(
    '../webpage-dev/data/actor_id.csv',
    index_col=0,
)
id_to_actors_df = pd.read_csv(
    '../webpage-dev/data/id_actor.csv',
    index_col=0,
)
connections_df = pd.read_csv(
    '../webpage-dev/data/connections_backup.csv',
    index_col=0,
)
awards_df = pd.read_csv(
    '../webpage-dev/data/awards.csv',
    index_col=0,
)

# plain-dict lookups: frame index -> value of the 'id' / 'actor' column
actor_to_id = actors_to_id_df['id'].to_dict()
id_to_actor = id_to_actors_df['actor'].to_dict()

In [72]:
# first year of each bucket: the opening bucket starts in 1928,
# every later bucket starts on a decade boundary (1940 ... 2010)
years = [1928] + list(range(1940, 2011, 10))

In [99]:
# Build, for every bucket in `years`, the node list and link list of the
# network restricted to actors nominated in that bucket.
#
# Result: connections_by_decade[start_year] = {'nodes': [...], 'links': [...]}
#   nodes: {'id': <actor id>, 'name': <actor name>}
#   links: {'source': <row id>, 'target': <column id>, 'weight': <matrix entry>}
connections_by_decade = {}

# The matrix is only read below, so convert it once instead of copying it
# (and zeroing parts of it with a nested Python loop) on every iteration.
connections_mtx = np.asarray(connections_df.values)
n_actors = connections_mtx.shape[0]

# Positive, off-diagonal entries are the candidate links (no self-links).
candidate_links = ~np.eye(n_actors, dtype=bool) & (connections_mtx > 0)

# Exclusive upper bound of the final bucket (the one starting at years[-1]).
last_year_exclusive = 2020
bucket_ends = list(years[1:]) + [last_year_exclusive]

# generate nodes and links for each decade
for start_year, end_year in zip(years, bucket_ends):

    # Actors with a nomination in [start_year, end_year). `unique()` keeps
    # first-appearance order, so the written JSON is identical from run to
    # run (set iteration order over strings changes between kernels).
    in_range = (awards_df.year >= start_year) & (awards_df.year < end_year)
    decade_actors = list(awards_df.loc[in_range].index.unique())

    # define nodes
    decade_nodes = [{'id': actor_to_id[actor], 'name': actor}
                    for actor in decade_actors]

    # Boolean membership vector over matrix positions; ids outside the
    # matrix range simply never match.
    decade_ids = [node['id'] for node in decade_nodes]
    in_decade = np.isin(np.arange(n_actors), decade_ids)

    # Keep a link only when BOTH endpoints were nominated in this bucket.
    link_mask = candidate_links & in_decade[:, None] & in_decade[None, :]
    sources, targets = np.nonzero(link_mask)  # row-major (source, target) order

    decade_links = [{'source': src,
                     'target': tgt,
                     'weight': connections_mtx[src, tgt]}
                    for src, tgt in zip(sources, targets)]

    # append
    connections_by_decade[start_year] = {'nodes': decade_nodes,
                                         'links': decade_links}

In [100]:
# define np encoder
class NpEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy scalars and arrays.

    Pass it as ``cls=NpEncoder`` to ``json.dump`` / ``json.dumps`` so that
    NumPy values are converted to their built-in Python equivalents instead
    of raising ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Parameters
        ----------
        obj : object
            A value the standard encoder could not serialise.

        Returns
        -------
        bool, int, float or list
            ``bool`` for ``np.bool_``, ``int`` for NumPy integers, ``float``
            for NumPy floats, nested lists for ``np.ndarray``.

        Raises
        ------
        TypeError
            Raised by the base class for any other unsupported type.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [101]:
# serialise the per-decade network to JSON; NpEncoder converts NumPy values
network_json_path = '../webpage-dev/data/network_data.json'
with open(network_json_path, mode='w') as out_file:
    json.dump(connections_by_decade, out_file, cls=NpEncoder)

In [102]:
# spot-check: node list of the last bucket (key 2010, the final entry of `years`)
connections_by_decade[2010]['nodes']

[{'id': 10, 'name': 'Alan Arkin'},
 {'id': 156, 'name': 'Christoph Waltz'},
 {'id': 270, 'name': 'Frances McDormand'},
 {'id': 107, 'name': 'Bryan Cranston'},
 {'id': 79, 'name': 'Benedict Cumberbatch'},
 {'id': 750, 'name': 'Richard E. Grant'},
 {'id': 103, 'name': 'Brie Larson'},
 {'id': 268, 'name': 'Florence Pugh'},
 {'id': 653, 'name': 'Michelle Williams'},
 {'id': 879, 'name': 'Timothée Chalamet'},
 {'id': 807, 'name': 'Sam Elliott'},
 {'id': 820, 'name': 'Scarlett Johansson'},
 {'id': 24, 'name': 'Allison Janney'},
 {'id': 484, 'name': 'Julianne Moore'},
 {'id': 497, 'name': 'Kate Winslet'},
 {'id': 196, 'name': 'Denzel Washington'},
 {'id': 933, 'name': 'Yalitza Aparicio'},
 {'id': 50, 'name': 'Annette Bening'},
 {'id': 529, 'name': 'Laurie Metcalf'},
 {'id': 736, 'name': 'Rami Malek'},
 {'id': 125, 'name': 'Casey Affleck'},
 {'id': 622, 'name': 'Matthew McConaughey'},
 {'id': 429, 'name': 'Jessica Chastain'},
 {'id': 127, 'name': 'Cate Blanchett'},
 {'id': 761, 'name': 'Robert