# Retrieving information per country

In [1]:
import pandas as pd

In [2]:
# Locations of the raw CSV inputs and of the website's data folder
DATA_PATH = '../../data'
WEBSITE_DATA_PATH = '../../website/app/data'

# Read the four source tables; the targets on the left are unpacked in the
# same order as the paths listed here.
_csv_paths = (
    f'{DATA_PATH}/titles.csv',
    f'{DATA_PATH}/persons.csv',
    f'{DATA_PATH}/genres.csv',
    f'{DATA_PATH}/countries.csv',
)
catalogue, persons, genres, countries = map(pd.read_csv, _csv_paths)

In [3]:
# Columns that store several comma-separated values in a single cell;
# each one is turned into a list of strings.
for column in ('genres', 'countries', 'actors', 'directors', 'alternate genres'):
    catalogue[column] = catalogue[column].str.split(',')

# Parse the Netflix release date once and reuse it for the derived columns.
netflix_release = pd.to_datetime(catalogue['release date on Netflix'])
catalogue['release date on Netflix'] = netflix_release

# Keep the original 'year' column under a new name so that 'year' can hold
# the year of the Netflix release instead.
catalogue = catalogue.rename(columns={'year': 'actual_release_year'})

catalogue['month'] = netflix_release.dt.month
catalogue['year'] = netflix_release.dt.year

In [4]:
# Dictionaries keyed by id for faster country, genre and person access
def _records_by_id(table):
    """Index `table` by its 'id' column and return it as {id: {column: value}}."""
    return table.set_index('id').to_dict(orient='index')


countries_to_name = _records_by_id(countries)
genres_to_name = _records_by_id(genres)
persons_to_name = _records_by_id(persons)

def get_genre_name(id_):
    """Return the 'name' field of the genre whose id is `id_`.

    `id_` is converted with `int()` before the lookup, so both integers and
    numeric strings are accepted. Raises `KeyError` when the id is not present
    in `genres_to_name`.
    """
    genre_record = genres_to_name[int(id_)]
    return genre_record['name']

def get_country_name(id_):
    """Return the 'name' field of the country whose id is `id_`.

    `id_` is converted with `int()` before the lookup, so both integers and
    numeric strings are accepted. Raises `KeyError` when the id is not present
    in `countries_to_name`.
    """
    country_record = countries_to_name[int(id_)]
    return country_record['name']

def get_person_name(id_):
    """Return the 'name' field of the person whose id is `id_`.

    `id_` is converted with `int()` before the lookup, so both integers and
    numeric strings are accepted. Raises `KeyError` when the id is not present
    in `persons_to_name`.
    """
    person_record = persons_to_name[int(id_)]
    return person_record['name']

In [5]:
# Rows with a season count go to `series`; rows without one go to `movies`.
has_season_count = catalogue['number of seasons'].notna()
series = catalogue[has_season_count].copy()
movies = catalogue[~has_season_count].copy()

In [6]:
def _explode_to_country_names(titles):
    """Return one row per (title, country) pair, with country ids replaced by names.

    Rows whose 'countries' cell is missing are dropped before the lookup.
    """
    per_country = titles.explode('countries').dropna(subset=['countries'])
    per_country['countries'] = per_country['countries'].apply(get_country_name)
    return per_country


movies_exploded_countries = _explode_to_country_names(movies)
series_exploded_countries = _explode_to_country_names(series)

In [7]:
# Tag every (title, country) row with a unit counter column, then count the
# rows of each country to get the number of movies / series it took part in.
movies_exploded_countries = movies_exploded_countries.assign(count_movies=1)
series_exploded_countries = series_exploded_countries.assign(count_series=1)

movies_by_country = movies_exploded_countries.groupby('countries')
series_by_country = series_exploded_countries.groupby('countries')

n_movies_per_country = movies_by_country['count_movies'].count()
n_series_per_country = series_by_country['count_series'].count()

In [8]:
from collections import Counter

# Number of most frequent genres kept for each country
TOP_GENRES_PER_COUNTRY = 10


def _top_genre_names(genre_lists, limit=TOP_GENRES_PER_COUNTRY):
    """Return the names of the `limit` most frequent genres in `genre_lists`.

    `genre_lists` is an iterable whose items are lists of genre ids (one list
    per title). Items that are not lists (e.g. NaN for a title without genre
    information) are skipped. Genres are tallied in the order they are
    encountered, so ties are broken by first occurrence.
    """
    tally = Counter()
    for genre_ids in genre_lists:
        if isinstance(genre_ids, list):
            tally.update(get_genre_name(genre_id) for genre_id in genre_ids)
    return [name for name, _ in tally.most_common(limit)]


# One row per (title, country) pair, with the country id replaced by its name
catalogue_countries_exploded = catalogue.explode('countries').dropna(subset=['countries'])
catalogue_countries_exploded['countries'] = catalogue_countries_exploded.countries.apply(get_country_name)

# Count genres per country directly instead of first concatenating every
# title's genre list with a groupby 'sum', which re-copies the growing list
# at each step and depends on how missing values are summed.
genres_per_country = (
    catalogue_countries_exploded
    .groupby('countries')['genres']
    .apply(_top_genre_names)
    .to_frame()
)

In [9]:
# Combine the per-country counts with an OUTER join: a country that appears
# only in movies (or only in series) must still be exported, with the missing
# count set to 0 rather than the whole country being dropped.
country_counts = pd.DataFrame(n_movies_per_country).merge(
    pd.DataFrame(n_series_per_country),
    how='outer',
    left_index=True,
    right_index=True,
)
# The outer join introduces NaN (and a float dtype) where one side is absent;
# restore integer counts.
country_counts = country_counts.fillna(0).astype(int)

# Attach the list of top genres of each country.
countries_info = country_counts.merge(genres_per_country, left_index=True, right_index=True)

# Export as {country name: {count_movies, count_series, genres}} for the website.
countries_info.to_json(f'{WEBSITE_DATA_PATH}/countries_info.json', orient='index')