# Preparing data for the website

In [1]:
import pandas as pd

In [2]:
# Load the data
# DATA_PATH is where the input CSV tables are read from; WEBSITE_DATA_PATH is
# the folder that receives the files generated for the website.
DATA_PATH = '../../data'
WEBSITE_DATA_PATH = '../../website/app/data'
# One DataFrame per CSV table. No date parsing is requested here, so any date
# column is loaded as plain strings.
catalogue = pd.read_csv(f'{DATA_PATH}/titles.csv')
persons = pd.read_csv(f'{DATA_PATH}/persons.csv')
genres = pd.read_csv(f'{DATA_PATH}/genres.csv')
countries = pd.read_csv(f'{DATA_PATH}/countries.csv')

In [3]:
# Turn the comma-separated actors string into a list of entries per title
# (titles without actors stay NaN).
catalogue['actors'] = catalogue.actors.str.split(',')

# The CSV is loaded without date parsing, so the release date is a string
# column; the `.dt` accessor only works on datetime-like values. Parse it
# first, then derive the month and year columns from the parsed dates.
release_date = pd.to_datetime(catalogue['release date on Netflix'])
catalogue['month'] = release_date.dt.month
catalogue['year'] = release_date.dt.year

In [4]:
# Dictionaries for faster country, genre and person lookups:
# each maps a row id to a {column name: value} dict for that row.
genres_to_name = genres.set_index('id').to_dict(orient='index')
countries_to_name = countries.set_index('id').to_dict(orient='index')
persons_to_name = persons.set_index('id').to_dict(orient='index')

def get_genre_name(id_):
    """Return the name of the genre with the given id.

    ``id_`` is converted with ``int()`` before the lookup; a ``KeyError`` is
    raised when no genre has that id.
    """
    genre_record = genres_to_name[int(id_)]
    return genre_record['name']

def get_country_name(id_):
    """Return the name of the country with the given id.

    ``id_`` is converted with ``int()`` before the lookup; a ``KeyError`` is
    raised when no country has that id.
    """
    country_record = countries_to_name[int(id_)]
    return country_record['name']

def get_person_name(id_):
    """Return the name of the person with the given id.

    ``id_`` is converted with ``int()`` before the lookup; a ``KeyError`` is
    raised when no person has that id.
    """
    person_record = persons_to_name[int(id_)]
    return person_record['name']

In [5]:
# A title with a season count goes to `series`; every other title goes to
# `movies`. Both are independent copies of the catalogue rows.
has_seasons = catalogue['number of seasons'].notna()
series = catalogue[has_seasons].copy()
movies = catalogue[~has_seasons].copy()

## Graph information

In [6]:
import networkx as nx

In [7]:
# Empty undirected graph; nodes and edges are added by the cells below.
G = nx.Graph()

In [8]:
# Take the first 2000 movies when sorted by decreasing 'imdb' value, drop the
# ones without an actors list, and turn each remaining list into a set.
best_rated_movies = movies.sort_values('imdb', ascending=False)[:2000]
actors_per_item = best_rated_movies['actors'].dropna().apply(set)

In [10]:
from itertools import combinations

# Connect every pair of movies that share at least one actor. The edge weight
# is the number of shared actors and `id` is a running edge counter.
#
# combinations() visits each unordered pair exactly once. Looping over the
# full (i, j) grid visited every pair twice, and the second add_edge call
# overwrote the edge's id, so ids skipped values. Here ids are consecutive,
# starting at 0.
# Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
h = 0
for (i, a1), (j, a2) in combinations(actors_per_item.items(), 2):
    # The values of actors_per_item are already sets, so intersect directly.
    shared_actors = a1 & a2
    if shared_actors:
        G.add_edge(i, j, weight=len(shared_actors), id=h)
        h += 1

In [11]:
# Attach each movie's title and audio value to its node, under the node
# attributes "name" and "audio" (graph nodes are keyed by the movies index).
node_attribute_columns = {"name": movies['title'], "audio": movies['audio']}
for attr_name, column in node_attribute_columns.items():
    nx.set_node_attributes(G, column.to_dict(), name=attr_name)

In [12]:
# Node set of the largest connected component. max() finds it in a single
# pass, so there is no need to sort every component by size first.
giant_cc = max(nx.connected_components(G), key=len)

In [13]:
# Keep only the nodes of the giant component (and the edges between them).
G = G.subgraph(giant_cc)

In [15]:
from itertools import count
# Give every distinct audio value an integer index (in sorted order) and look
# that index up for each node of the graph.
audio_by_node = nx.get_node_attributes(G, 'audio')
groups = set(audio_by_node.values())
mapping = {group: index for index, group in enumerate(sorted(groups))}
nodes = G.nodes()
colors = {node: mapping[G.nodes[node]['audio']] for node in nodes}

In [19]:
# Compute a spring layout; `pos` maps each node to its position.
pos = nx.spring_layout(G, iterations=100)

# Store the first and second position components as the "x" and "y" node
# attributes.
for axis, attr_name in enumerate(("x", "y")):
    coordinates = {node: point[axis] for node, point in pos.items()}
    nx.set_node_attributes(G, coordinates, name=attr_name)

In [20]:
from networkx.readwrite import json_graph
# Convert the graph to node-link form, asking for the edge list to be stored
# under the "edges" key.
# NOTE(review): the `attrs` argument appears to be deprecated/removed in
# recent networkx releases in favour of keyword arguments - confirm against
# the installed version before upgrading.
link_key_overrides = {"link": "edges"}
data = json_graph.node_link_data(G, attrs=link_key_overrides)

In [21]:
import json
# Write the node-link data to the website's data folder as JSON.
network_json_path = f'{WEBSITE_DATA_PATH}/movies_network.json'
with open(network_json_path, 'w') as network_file:
    json.dump(data, network_file)