In [None]:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# import datasets
# All three CSVs are read from the same folder; edit DATA_DIR to point the
# notebook at a different copy of the data.
DATA_DIR = "/Users/stlp/Desktop/zuckflix_meta/Data"
netflix_shows = pd.read_csv(DATA_DIR + "/netflix_shows.csv")
netflix_films = pd.read_csv(DATA_DIR + "/netflix_films.csv")
netflix_titles = pd.read_csv(DATA_DIR + "/netflix_titles_2.csv")

In [None]:
# Preview the first five rows of the TV-show table
netflix_shows.head(5)

In [None]:
# Preview the first five rows of the film table
netflix_films.head(5)

In [None]:
# function to get unique values of a column
def getUnique(data):
    """Collect the distinct tokens found in a column of ", "-separated strings.

    Parameters
    ----------
    data : iterable
        Cell values. String values are split on ", "; every other value
        (NaN, None, ...) is treated as a missing entry.

    Returns
    -------
    list
        The distinct tokens, in no particular order (the list is built from a
        set). ``None`` appears once if at least one entry was missing.
    """
    unique_values = set()
    for value in data:
        if isinstance(value, str):
            unique_values.update(value.split(", "))
        else:
            # Any non-string entry stands for "missing"; checking for str
            # (rather than float) avoids calling .split on None and friends.
            unique_values.add(None)
    return list(unique_values)

In [None]:
# function to count the entries of a parsed (", "-separated) column that contain a given unique value
def getCount(data, token):
    """Count the entries whose ", "-separated tokens include ``token``.

    Parameters
    ----------
    data : iterable
        Cell values. Only string values are inspected; anything else
        (NaN, None, ...) is skipped.
    token : object
        The value to look for among each entry's tokens (exact match, not a
        substring match).

    Returns
    -------
    int
        Number of entries containing ``token``; an entry counts once even if
        the token is repeated inside it.
    """
    return sum(
        1
        for value in data
        # isinstance(str) instead of "not float" so None does not reach .split
        if isinstance(value, str) and token in value.split(", ")
    )

In [None]:
# Distinct genre ('listed_in') and country tokens, computed separately for
# the TV-show table and the film table
unique_genres_shows, unique_countries_shows = (
    getUnique(netflix_shows[column]) for column in ('listed_in', 'country')
)
unique_genres_films, unique_countries_films = (
    getUnique(netflix_films[column]) for column in ('listed_in', 'country')
)

In [None]:
# Leave these labels out of the genre lists.
# The lists are rebuilt by filtering instead of calling list.remove(), so the
# cell can be re-run (list.remove raises ValueError once the label is gone).
_EXCLUDED_GENRES_FILMS = {'Movies', 'International Movies', 'Independent Movies'}
_EXCLUDED_GENRES_SHOWS = {'International TV Shows'}

unique_genres_films = [genre for genre in unique_genres_films if genre not in _EXCLUDED_GENRES_FILMS]
unique_genres_shows = [genre for genre in unique_genres_shows if genre not in _EXCLUDED_GENRES_SHOWS]

In [None]:
# Wrap each list of unique values in a one-column frame ...
# NOTE(review): the two country frames also use the column name
# 'unique_genres'; kept unchanged here -- confirm whether readers of these
# CSVs rely on that header before renaming it.
unique_genres_shows_df = pd.DataFrame({'unique_genres': unique_genres_shows})
unique_genres_films_df = pd.DataFrame({'unique_genres': unique_genres_films})
unique_countries_shows_df = pd.DataFrame({'unique_genres': unique_countries_shows})
unique_countries_films_df = pd.DataFrame({'unique_genres': unique_countries_films})

# ... and write each one out without the row index.
unique_genres_shows_df.to_csv("/Users/stlp/Desktop/zuckflix_meta/Data/unique_genres_shows.csv", index = False)
unique_genres_films_df.to_csv("/Users/stlp/Desktop/zuckflix_meta/Data/unique_genres_films.csv", index = False)
unique_countries_shows_df.to_csv("/Users/stlp/Desktop/zuckflix_meta/Data/unique_countries_shows.csv", index = False)
unique_countries_films_df.to_csv("/Users/stlp/Desktop/zuckflix_meta/Data/unique_countries_films.csv", index = False)


## Baseline Stats and Preliminary Visualizations

### Duration

In [None]:
# Summary statistics for the 'seasons' column of the TV-show table
netflix_shows.loc[:, ['seasons']].describe()

In [None]:
# Summary statistics for the 'length' column of the film table
netflix_films.loc[:, ['length']].describe()

In [None]:
# Two panels side by side: seasons per TV show (left), film length (right)
figs, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 5))

season_dist = sns.histplot(x = 'seasons', data = netflix_shows, ax = axes[0], discrete = True)
season_dist.set_title("Distribution of TV show seasons")
season_dist.set_xlim(0, 18)

length_dist = sns.histplot(x = 'length', data = netflix_films, ax = axes[1], bins = 20)
length_dist.set_title("Distribution of film lengths")

### Release Year

In [None]:
# Summary statistics for the release year of TV shows
netflix_shows.loc[:, ['release_year']].describe()

In [None]:
# Summary statistics for the release year of films
netflix_films.loc[:, ['release_year']].describe()

In [None]:
# Two stacked panels: release years of TV shows (top) and of films (bottom)
figs, axes = plt.subplots(nrows = 2, ncols = 1, figsize = (10, 10))

release_year_s = sns.histplot(x = 'release_year', data = netflix_shows, ax = axes[0], discrete = True)
release_year_f = sns.histplot(x = 'release_year', data = netflix_films, ax = axes[1], discrete = True)

release_year_s.set_title("Distribution of TV Show release years")
release_year_f.set_title("Distribution of film release years")

### Month Added

In [None]:
# Summary statistics for the month in which TV shows were added
netflix_shows.loc[:, ['month_added']].describe()

In [None]:
# Summary statistics for the month in which films were added
netflix_films.loc[:, ['month_added']].describe()

In [None]:
# Two panels side by side: month added for TV shows (left) and films (right)
figs, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (10, 5))

month_added_s = sns.histplot(x = 'month_added', data = netflix_shows, ax = axes[0], discrete = True)
month_added_f = sns.histplot(x = 'month_added', data = netflix_films, ax = axes[1], discrete = True)

month_added_s.set_title("Distribution of TV shows (month added)")
month_added_f.set_title("Distribution of films (month added)")

### Rating

In [None]:
# Number of films per rating, with missing ratings counted as their own row
netflix_films.loc[:, 'rating'].value_counts(dropna = False)

In [None]:
# Count each rating once per table (missing ratings included), then build a
# frame that keeps the ratings both as the index and as a 'rating' column.
_show_rating_counts = netflix_shows['rating'].value_counts(dropna = False)
_film_rating_counts = netflix_films['rating'].value_counts(dropna = False)

rating_show_dist = pd.DataFrame({
    "rating": _show_rating_counts.index.tolist(),
    "count": _show_rating_counts,
})
rating_film_dist = pd.DataFrame({
    "rating": _film_rating_counts.index.tolist(),
    "count": _film_rating_counts,
})

In [None]:
# Show the TV-show rating counts without the missing-rating row.
# dropna() is used instead of drop([nan]) so the cell does not raise KeyError
# when no rating is missing. The result is only displayed: rating_show_dist
# itself is left unchanged.
rating_show_dist.dropna(subset = ['rating'])

In [None]:
# Show the film rating counts without the missing-rating row.
# dropna() is used instead of drop([nan]) so the cell does not raise KeyError
# when no rating is missing. The result is only displayed: rating_film_dist
# itself is left unchanged.
rating_film_dist.dropna(subset = ['rating'])

In [None]:
# Two panels side by side: rating counts for TV shows (left) and films (right)
figs, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (25, 10))

rating_dist_s = sns.barplot(data = rating_show_dist, x = 'rating', y = 'count', ax = axes[0])
rating_dist_f = sns.barplot(data = rating_film_dist, x = 'rating', y = 'count', ax = axes[1])

rating_dist_s.set_title("Distribution of TV show ratings")
rating_dist_f.set_title("Distribution of film ratings")


### Countries

In [None]:
# Map every unique country to the number of titles that list it,
# separately for TV shows and films
country_dict_s = {
    country_name: getCount(netflix_shows['country'], country_name)
    for country_name in unique_countries_shows
}

country_dict_f = {
    country_name: getCount(netflix_films['country'], country_name)
    for country_name in unique_countries_films
}

In [None]:
# Turn the country -> count mappings into two-column frames
country_df_s = pd.DataFrame(list(country_dict_s.items()), columns = ["country", "count"])
country_df_f = pd.DataFrame(list(country_dict_f.items()), columns = ["country", "count"])

In [None]:
# Ten countries with the most TV shows
country_count_s = country_df_s.sort_values(by = 'count', ascending = False).iloc[:10]
country_count_s

In [None]:
# Ten countries with the most films
country_count_f = country_df_f.sort_values(by = 'count', ascending = False).iloc[:10]
country_count_f

In [None]:
# Two stacked panels: top-10 countries for TV shows (top) and films (bottom)
figs, axes = plt.subplots(nrows = 2, ncols = 1, figsize = (15, 10))

country_dist_s = sns.barplot(data = country_count_s, x = 'country', y = 'count', ax = axes[0])
country_dist_f = sns.barplot(data = country_count_f, x = 'country', y = 'count', ax = axes[1])

country_dist_s.set_title("Distribution of top 10 countries (TV Shows)")
country_dist_f.set_title("Distribution of top 10 countries (films)")

### Genres

In [None]:
# Map every unique genre to the number of titles that list it,
# separately for TV shows and films
genre_dict_s = {
    genre_name: getCount(netflix_shows['listed_in'], genre_name)
    for genre_name in unique_genres_shows
}

genre_dict_f = {
    genre_name: getCount(netflix_films['listed_in'], genre_name)
    for genre_name in unique_genres_films
}

In [None]:
# Turn the genre -> count mappings into two-column frames
genre_df_s = pd.DataFrame(list(genre_dict_s.items()), columns = ["genre", "count"])
genre_df_f = pd.DataFrame(list(genre_dict_f.items()), columns = ["genre", "count"])

In [None]:
# Ten most frequent TV-show genres
genre_count_s = genre_df_s.sort_values(by = 'count', ascending = False).iloc[:10]
genre_count_s

In [None]:
# Ten most frequent film genres
genre_count_f = genre_df_f.sort_values(by = 'count', ascending = False).iloc[:10]
genre_count_f

In [None]:
# Two stacked panels: top-10 genres for TV shows (top) and films (bottom)
figs, axes = plt.subplots(nrows = 2, ncols = 1, figsize = (23, 10))

genre_dist_s = sns.barplot(data = genre_count_s, x = 'genre', y = 'count', ax = axes[0])
genre_dist_f = sns.barplot(data = genre_count_f, x = 'genre', y = 'count', ax = axes[1])

genre_dist_s.set_title("Distribution of top 10 genres (TV Shows)")
genre_dist_f.set_title("Distribution of top 10 genres (films)")

## Exploring Relationships

### What variables can I work with?
    Country (Categorical)
    Year Added (Numerical)
    Release Year (Numerical)
    Month Added (Numerical)
    Rating (Categorical)
    Length (Numerical)
    Seasons (Numerical)
    Genre (listed_in) (Categorical)
    Type (Categorical)


### Special Functions

In [None]:
# combination of groupby and getCount function
def getCountGB(data, unique_data, group_by, count_by):
    """Count token occurrences per group of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to split into groups.
    unique_data : iterable
        Tokens to count inside each group.
    group_by : label
        Column whose values define the groups.
    count_by : label
        Column of ", "-separated strings that is searched with ``getCount``.

    Returns
    -------
    dict
        ``{group value: {token: count}}`` with one inner dict per group.
    """
    gb_count_dict = {}
    # Group on the bare column label, not a one-element list: with a list,
    # pandas >= 2.0 yields 1-tuples such as (2016,) as group keys while
    # iterating, which would no longer match the scalar group values.
    for state, frame in data.groupby(group_by):
        gb_count_dict[state] = {
            unique_value: getCount(frame[count_by], unique_value)
            for unique_value in unique_data
        }
    return gb_count_dict

In [None]:
# custom group by function to handle entries that need to be parsed first
def group_by(data, unique_data):
    """Find, for each requested token, the positions of the entries containing it.

    Parameters
    ----------
    data : iterable
        Cell values. String values are split on ", "; other values
        (NaN, None, ...) contain no tokens.
    unique_data : iterable or str
        Tokens to look up. A single string is treated as one token rather
        than being iterated character by character.

    Returns
    -------
    dict
        ``{token: [positions]}`` where positions are 0-based offsets into
        ``data`` (suitable for ``.iloc``), in ascending order.
    """
    if isinstance(unique_data, str):
        unique_data = [unique_data]

    # Split each entry once. Membership is then tested against whole tokens,
    # so "Niger" no longer matches an entry that only lists "Nigeria".
    token_lists = [
        value.split(", ") if isinstance(value, str) else ()
        for value in data
    ]

    return {
        unique_value: [
            position
            for position, tokens in enumerate(token_lists)
            if unique_value in tokens
        ]
        for unique_value in unique_data
    }

## What relationships can I explore?
### Year Added (Trends)
    - Is there a trend in content of a specific rating over time? (Rating) [DO FOR FILMS] 
    - Is there a trend in genre of content over time? (Genre) [DO FOR FILMS]
    - Is Netflix favoring shorter or longer films/shows over time? (Length)[DONE]
    - Is there a particular actor/actress that Netflix really likes recently? [NEED TO FIX]
    - What is the trend of growth in volume for different countries’ content? (Country) [DONE]

### Is there a trend in genre of content over time? (Genre)

In [None]:
# TV shows added from 2016 up to (not including) 2021
_year_added_s = netflix_shows['year_added']
shows_2016_2020 = netflix_shows[_year_added_s.ge(2016) & _year_added_s.lt(2021)]

# Per-year genre counts -> frame with one row per year, restricted to the
# columns named in genre_count_s['genre']
genre_dict_s_year = getCountGB(shows_2016_2020, unique_genres_shows, 'year_added', 'listed_in')
genre_by_year_s = pd.DataFrame.from_dict(genre_dict_s_year, orient = 'index')[genre_count_s['genre']]

sns.set(rc = {'figure.figsize': (10, 10)})
sns.set_style("whitegrid")

# Plot every selected genre except "Kids' TV"
_not_kids_tv = genre_by_year_s.columns != "Kids' TV"
sns.lineplot(data = genre_by_year_s.loc[:, _not_kids_tv], lw = 3)

### Is there a trend in content of a specific rating over time? (Rating)

In [None]:
# TV-MA shows added from 2016 up to (not including) 2021
shows_ma = netflix_shows[(netflix_shows['rating'] == "TV-MA") & (netflix_shows['year_added'] >= 2016) & (netflix_shows['year_added'] < 2021)]
genre_year_ma = getCountGB(shows_ma, unique_genres_shows, 'year_added', 'listed_in')

# NOTE: this rebinds genre_by_year_s to the TV-MA-only table, restricted to
# the columns named in genre_count_s['genre'].
# (Three expression statements that computed and discarded intermediate
# results were removed here; they had no effect.)
genre_by_year_s = pd.DataFrame.from_dict(genre_year_ma, orient = 'index')
genre_by_year_s = genre_by_year_s[genre_count_s['genre']]

sns.set(rc = {'figure.figsize':(10, 10)})
sns.set_style("whitegrid")
# Plot every selected genre except "Kids' TV"
sns.lineplot(data = genre_by_year_s.loc[:, genre_by_year_s.columns!="Kids' TV"], lw = 3)


### Is Netflix favoring shorter or longer films/shows over time? (Length)

In [None]:
# Films added in 2016 or later (the filter has no upper bound)
films_2016_2021 = netflix_films[netflix_films['year_added'].ge(2016)]

# Mean film length per year added, as a two-column frame
films_avg_length_by_year = films_2016_2021.groupby('year_added', as_index = False)['length'].mean()

sns.lineplot(x = "year_added", y = "length", data = films_avg_length_by_year, lw = 3)

In [None]:
# Row positions of TV shows whose country string mentions "South Korea"
# (float entries, i.e. missing countries, are skipped)
index = [
    position
    for position, countries in enumerate(netflix_shows['country'])
    if type(countries) is not float and "South Korea" in countries
]
korean_shows = netflix_shows.iloc[index]

In [None]:
# Every cast member of the Korean shows, mapped to the number of those shows
# they appear in
shows_actors = getUnique(korean_shows['cast'])
actor_dict_s = {
    actor_name: getCount(korean_shows['cast'], actor_name)
    for actor_name in shows_actors
}

In [None]:
# Two-column frame (actor, appearance); display the 15 most frequent actors
shows_actors_df = pd.DataFrame(list(actor_dict_s.items()), columns = ["actor", "appearance"])
shows_actors_df.sort_values(by = "appearance", ascending = False).head(15)

In [None]:
# Per-year country counts for the 2016-2020 TV shows, restricted to the
# columns named in country_count_s['country']
country_dict_s_year = getCountGB(shows_2016_2020, unique_countries_shows, 'year_added', 'country')
country_by_year_s = pd.DataFrame.from_dict(country_dict_s_year, orient = 'index')[country_count_s['country']]

sns.set(rc = {'figure.figsize': (10, 10)})
sns.set_style("whitegrid")
sns.lineplot(data = country_by_year_s, lw = 3)

In [None]:
# Same plot with the "United States" column left out
_not_united_states = country_by_year_s.columns != "United States"
sns.lineplot(data = country_by_year_s.loc[:, _not_united_states], lw = 3)

In [None]:
# Films added in 2016 or later, counted per country and year
films_2016_2021 = netflix_films[netflix_films['year_added'].ge(2016)]
country_dict_f_year = getCountGB(films_2016_2021, unique_countries_films, 'year_added', 'country')

# Keep only the columns named in country_count_f['country']
country_list = country_count_f['country'].tolist()
country_by_year_f = pd.DataFrame.from_dict(country_dict_f_year, orient = 'index')[country_list]

sns.set(rc = {'figure.figsize': (10, 10)})
sns.set_style("whitegrid")
sns.lineplot(data = country_by_year_f, lw = 3)

### Genre
    - What are the dominating genres for popular ratings of films/shows? (Rating) [MIGHT NOT DO THIS BECAUSE ALREADY DID IT BUT OVER A TIME PERIOD]
    - What is the average length of shows or movies in each popular genre? (Length/Seasons) [MIGHT NOT DO THIS FOR TV SHOWS]

### What is the average length of shows or movies in each popular genre? (Length/Seasons)

Group by genre, for the most popular genres (Dramas, Comedies, Documentaries | TV Dramas, TV Comedies, Crime TV Shows) and then get the average length in each genre. 

In [None]:
# Row positions of the films in each of the three selected genres
unique_genres = ['Dramas', 'Comedies', 'Documentaries']
unique_dict = group_by(netflix_films['listed_in'], unique_genres)

# One [genre, mean length] row per genre
_film_lengths = netflix_films['length']
avg_length_by_genre = pd.DataFrame(
    [[genre_name, _film_lengths.iloc[positions].mean()] for genre_name, positions in unique_dict.items()],
    columns = ["genre", "length"],
)

sns.barplot(data = avg_length_by_genre, x = "genre", y = "length")
avg_length_by_genre

### Is there a genre that Netflix prioritizes each month? (Month Added)

### What is the most common release year in each of the popular genres? (Release Year)

### Recent year movies and TV shows make up the most of Netflix's catalog [OBVIOUS!]

### Country
    - Is there more of a bias towards a type of entertainment for different countries? (Type)
    - What is the average length of a show or movie for different countries? (Length)
    - Is there a specific genre from that country that is dominating Netflix’s catalog overall and over time? (Genre)
    - What is the distribution of ratings for each specific country?

### Is there more of a bias towards a type of entertainment for different countries? (Type)

In [None]:
# A country needs at least this many titles to be included in the comparison.
MIN_TITLES_PER_COUNTRY = 170

# Unique countries across all titles, without the missing-value marker
unique_countries_titles = [
    country_name
    for country_name in getUnique(netflix_titles['country'])
    if country_name is not None
]

# country -> row positions of its titles, keeping only well-represented countries
titles_country = {
    country_name: positions
    for country_name, positions in group_by(netflix_titles['country'], unique_countries_titles).items()
    if len(positions) >= MIN_TITLES_PER_COUNTRY
}

# One row per country, one column per title type.
# Building the frame from a dict of per-country Series aligns on the union of
# the types, instead of on whichever types the first country happens to have
# (the earlier column-by-column assignment did that, and its rename() call
# discarded its result).
titles_by_type = pd.DataFrame({
    country_name: netflix_titles.iloc[positions].groupby('type').size()
    for country_name, positions in titles_country.items()
}).T

titles_by_type.plot(kind = 'bar', stacked = True)

### What is the average length of a show or movie for different countries? (Length)

In [None]:
# Drop the missing-value marker by rebuilding the list. list.remove(None)
# would raise ValueError on a second run of this cell, once None is gone.
unique_countries_films = [
    country_name
    for country_name in unique_countries_films
    if country_name is not None
]

# country -> row positions of its films
films_country = group_by(netflix_films['country'], unique_countries_films)

# country -> mean 'length' of its films
avg_length_by_country = {
    country_name: netflix_films.iloc[positions]['length'].mean()
    for country_name, positions in films_country.items()
}

In [None]:
# Frame indexed by country, limited to the rows named in
# country_count_f['country'], with the value column called 'average_length'
avg_length_by_country_df = (
    pd.DataFrame.from_dict(avg_length_by_country, orient = 'index')
    .loc[country_count_f['country']]
    .rename(columns = {0: "average_length"})
)
# Expose the index as a regular column so seaborn can use it as the y axis
avg_length_by_country_df['country'] = avg_length_by_country_df.index

sns.barplot(x = 'average_length', y = 'country', data = avg_length_by_country_df)

### Is there a specific genre from that country that is dominating Netflix’s catalog overall? (Genre)

In [None]:
# (country, genre) -> number of that country's films in that genre,
# keeping only pairs with at least 50 films
genres_country_dict = {
    (country_key, genre_key): len(genre_value)
    for country_key, country_value in films_country.items()
    for genre_key, genre_value in group_by(
        netflix_films.iloc[country_value]['listed_in'], unique_genres_films
    ).items()
    if len(genre_value) >= 50
}

In [None]:
# One row per (country, genre) key of genres_country_dict.
# The frame is built from records, so an empty genres_country_dict gives an
# empty frame instead of failing on tuple unpacking, and the name
# genres_country_dict_df is only ever bound to a DataFrame.
genres_country_dict_df = pd.DataFrame(
    [
        (country_key, genre_key, title_count)
        for (country_key, genre_key), title_count in genres_country_dict.items()
    ],
    columns = ['country', 'genre', 'count'],
)
genres_country_dict_df.head(5)

### What is the distribution of ratings for each country? 

### Others
    - What is the dominating rating of movies for different months added? Is there a consistent trend? (Rating + Month Added)
    - What is the average length of movies and shows for different ratings? (Rating + Length)
    - Is there a genre that Netflix prioritizes each month depending on the country? (Country and Month Added and Genre)
    - Most credited director on Netflix

In [None]:
# Pass the country as a one-element list: group_by iterates over its second
# argument, so a bare 'India' string is looked up one character at a time.
films_india_index = group_by(netflix_films['country'], ['India'])['India']
films_india = netflix_films.iloc[films_india_index]

# Group on the bare column label so each group key is a scalar month rather
# than a 1-tuple (pandas >= 2.0 yields tuples for one-element list keys).
grouped_by = films_india.groupby('month_added')
# TODO: the per-month analysis is not written yet; the loop only unpacks
# each (month, frame) pair.
for state, frame in grouped_by:
    pass
    

# Pitch Idea: The Perfect Movie and Show Formula

1. Why pitch both films and shows? -> show growth in volume of both shows and films on Netflix
2. Films
    - Overall
    - Country-specific
3. Shows
    - Overall
    - Country specific