In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import re
from visualization_functions import *
from text_analysis import *
from statistical_analysis import *
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.auto import tqdm
nltk.download('vader_lexicon')
import plotly.express as px
import importlib

ModuleNotFoundError: No module named 'text_analysis'

In [None]:
# Root folder that holds the hackathon CSVs. Set the NETFLIX_DATA_DIR
# environment variable to point the notebook at another machine's copy;
# the fallback is the original author's local download folder.
import os

DATA_DIR = os.environ.get('NETFLIX_DATA_DIR', r'C:\Users\adity\Downloads\OPEN IIT NETFLIX')

# os.path.join uses the platform separator, so on Windows the default values
# are the same strings as the previously hard-coded paths.
path_main = os.path.join(DATA_DIR, 'Cleaned_CSVS', 'final_cleaned_main.csv')
path_movies_2025 = os.path.join(DATA_DIR, 'Cleaned_CSVS', 'cleaned_2025_Movies.csv')
path_movies_2025_raw = os.path.join(DATA_DIR, 'TV_Movies_Show', 'netflix_movies_detailed_up_to_2025.csv')
path_tv_shows = os.path.join(DATA_DIR, 'TV_Movies_Show', 'netflix_tv_shows_detailed_up_to_2025.csv')

### Taniya's Analysis

In [None]:
# Load the main cleaned catalog CSV into `df`.
df = pd.read_csv(path_main)

In [None]:
# Convert categorical Netflix ratings to numeric scores
import os

# Ordinal score per maturity rating: 1 for G / TV-Y up to 5 for NC-17 / TV-MA.
rating_map = {
    'G': 1, 'TV-Y': 1, 'TV-Y7': 2,
    'PG': 2, 'TV-G': 2, 'TV-PG': 3,
    'PG-13': 3, 'TV-14': 4,
    'R': 4, 'NC-17': 5, 'TV-MA': 5
}
# Ratings that are missing or not listed in rating_map fall back to 3.
# NOTE(review): the column is named 'vote_average' although it holds a
# maturity score derived from 'rating' - confirm the name is intended.
df['vote_average'] = df['rating'].map(rating_map).fillna(3)

# Create output directory if missing
os.makedirs("outputs", exist_ok=True)

# The status marker was mis-encoded (UTF-8 bytes decoded as Mac Roman).
print(f"✅ Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"Columns: {list(df.columns)}")

In [None]:
# ============================================================
#  CONTENT CREATOR ANALYSIS
# ============================================================
# Ranks the most frequent directors and cast members, saves both rankings
# under outputs/, then builds and plots the collaboration network.
import os

# Make the cell self-sufficient: to_csv below fails if outputs/ is missing.
os.makedirs("outputs", exist_ok=True)

print("\n===== STEP 3: CONTENT CREATOR ANALYSIS =====")

# --- 3.1 Most Prolific Directors ---
top_directors = get_top_creators(df, "director", n=20)
print("\n🎬 Top 10 Directors:\n", top_directors.head(10))
plot_top_creators(top_directors, "Top 20 Most Prolific Directors on Netflix")
top_directors.to_csv("outputs/top_directors.csv")

# --- 3.2 Most Frequent Actors ---
top_actors = get_top_creators(df, "cast", n=20)
print("\n⭐ Top 10 Actors:\n", top_actors.head(10))
plot_top_creators(top_actors, "Top 20 Most Frequent Actors on Netflix")
top_actors.to_csv("outputs/top_actors.csv")

# --- 3.3 Director–Actor Collaboration Network ---
print("\n🕸️ Building collaboration network...")
G = build_collaboration_network(df)
print(f"Network contains {len(G.nodes())} nodes and {len(G.edges())} edges.")
plot_network(G)
print("Network plotted successfully!")

In [None]:
# ============================================================
# VISUALIZATIONS
# ============================================================
import os

from helper_functions import *

# --- 4.1 Director–Genre Heatmap ---
print("\n🎭 Creating Director–Genre Specialization Heatmap...")

# Only directors with at least `min_titles` titles are passed to the matrix helper.
matrix = director_genre_matrix(df, min_titles=3)

plot_heatmap(
    matrix,
    row_label="Director",
    col_label="Genre",
    top_rows=15,
    title="Director–Genre Specialization Map"
)

# Persist the matrix next to the other exported tables.
os.makedirs("outputs", exist_ok=True)
matrix.to_csv("outputs/director_genre_matrix.csv", index=True)
print("✅ Director–Genre matrix successfully saved to 'outputs/director_genre_matrix.csv'.")

# --- 4.2 Creator Timelines (Yearly Trends) ---
print("\n📅 Plotting director timeline trends...")
plot_creator_timeline(df, creator_col="director", top_n=5)

print("\n📅 Plotting actor timeline trends...")
plot_creator_timeline(df, creator_col="cast", top_n=5)

# --- 4.3 Country Distribution ---
print("\n🌍 Analyzing country distribution of creators...")
plot_creator_country_distribution(df, creator_col="director")
print("\n🌍 Comparing international vs domestic creators...")
intl_summary = plot_international_vs_domestic(df, creator_col="director", home_country="India")
print(intl_summary)

print("\n🌎 For actors as well...")
actor_intl_summary = plot_international_vs_domestic(df, creator_col="cast", home_country="India")
print(actor_intl_summary)

# --- 4.4 Cast Frequency Distribution ---
print("\n🎭 Plotting actor appearance frequency distribution...")
plot_cast_frequency_distribution(df)


In [None]:
# ============================================================
# STATISTICAL TESTS
# ============================================================
# Runs a chi-square test (director vs rating), an entropy-based
# specialization measure, and an ANOVA on duration, printing a verdict at
# the 0.05 significance level for each test.

print("\n===== CHI-SQUARE TEST =====")

# Director vs Rating
chi2, p, dof, contingency = chi_square_test(df, 'director', 'rating')
print(f"Chi-square statistic = {chi2:.2f}, dof = {dof}, p-value = {p:.4f}")

if p < 0.05:
    print("✅ Reject H₀ → Significant relationship detected.")
    print("Certain directors tend to target specific rating categories.")
else:
    print("❌ Fail to reject H₀ → No strong relationship found.")

# Optional visualization
plot_chi_square_heatmap(contingency, var1_name="Director", var2_name="Rating", top_n=10)
print("\n===== STEP 5.2: ENTROPY (CREATOR SPECIALIZATION) =====")

# Compute specialization for directors across genres
entropy_df = compute_entropy(df, entity_col='director', category_col='listed_in')

# NOTE(review): the label assumes compute_entropy returns rows sorted from
# most to least diverse - confirm against the helper.
print("\n📊 Sample (Top 5 Most Diverse Directors):")
print(entropy_df.head(5))

# Plot both views
plot_entropy(entropy_df, entity_col='director', top_n=10)
print("\n===== STEP 5.3: ANOVA & LSD TESTS =====")
# NOTE(review): the ANOVA receives the raw 'duration' column, while the
# numeric 'duration_num' is only derived further down - confirm that
# anova_test parses the duration text itself.
F_stat, p_val = anova_test(df, group_col='director', value_col='duration')

# anova_test may hand back None for the statistic; only report when it did not.
if F_stat is not None:
    print(f"ANOVA F-statistic = {F_stat:.3f}, p-value = {p_val:.4f}")
    if p_val < 0.05:
        print("✅ Reject H₀ → Significant difference in average duration among directors.")
    else:
        print("❌ Fail to reject H₀ → No significant difference in duration across directors.")

# First run of digits in the duration text, as a float (NaN when none).
df['duration_num'] = (
    df['duration']
    .astype(str)
    .str.extract(r'(\d+)')
    .astype(float)
)

print("\n📦 Visualizing duration distribution by director...")
generate_styled_boxplot(df, cat='director', val='duration_num', outlier=False)


### NOTE: run this one at the end
# print("\n📊 Running F-test and LSD post-hoc comparison for directors (vote_average)...")
# result = director_rating_significance(df, val_col="vote_average")

# if result:
#     print("\nSignificant differences found. Sample output (first 5 entries):")
#     for key in list(result.keys())[:5]:
#         print(f"{key} > {result[key]}")
# else:
#     print("\nNo significant rating difference detected across directors.")

# ============================================================
# END OF ANALYSIS
# ============================================================
print("\n✅ All analyses completed successfully! Check the 'outputs/' folder for saved results.")


### Daksh's Analysis

In [None]:
# Distinct values of the `type` column, ordered by how often they occur.
df['type'].value_counts().index

In [None]:
# One row per title: keep the first occurrence of each show_id.
df_catalog = df[['type', 'show_id']].drop_duplicates(subset='show_id')

In [None]:
# Number of unique titles that have a non-null `type`.
df_catalog['type'].value_counts().sum()

In [None]:
# Pie chart of the Movie / TV Show split, using the de-duplicated catalog.
pie_by_count(
    df_catalog,
    title="Catalog Composition:Movies vs TV Shows",
    column="type",
)


In [None]:
# Unique (genre label, title) pairs so each title counts once per genre.
df_listed_in = df[['listed_in', 'show_id']].drop_duplicates()

In [None]:
# 15 most frequent `listed_in` values, passed to the project treemap helper.
s = df_listed_in["listed_in"].value_counts().head(15)
plot_treemap_from_series(
    s,
    caption="Genre-wise share (Top 15)",
)

In [None]:
# One rating per title: keep the first row seen for each show_id.
df_ratings = df[["show_id", "rating"]].drop_duplicates(subset="show_id")

In [None]:
# Rating frequencies (descending) passed to the project bar-chart helper.
barh_top_counts_series_black_background(
    s=df_ratings['rating'].value_counts(ascending=False),
    # figtitle="Fig-4:Ratings Distribution in Netflix Catalog",
    title="Content Ratings Distribution",
    xlabel="Ratings",
)

In [None]:
# Share of rows per category, as a percentage of all rows in df
# (rows with a missing category still count in the denominator).
df['category'].value_counts().div(len(df)).mul(100)

In [None]:
# Pie chart of the `category` column, computed over all rows of df.
pie_by_count(
    df,
    column="category",
    title="Category Distribution",
    startangle = 0
)

In [None]:
# Title-level table (first row per show_id) used by the time, country and
# language views below.
df_titles = df[["show_id", "type", "date_added", "category", "country"]].drop_duplicates(subset="show_id")

In [None]:
# Parse the dates from df_titles' own column, so the result no longer relies
# on index alignment with the larger, un-deduplicated `df`.
df_titles["date_added"] = pd.to_datetime(df_titles["date_added"])

In [None]:
# Truncate each date to its month (period "M") and convert back to a timestamp.
df_titles["year_month"] = df_titles["date_added"].dt.to_period("M").dt.to_timestamp()

In [None]:
# Number of distinct titles added in each calendar month, oldest month first.
content_per_month = df_titles.groupby("year_month")["show_id"].nunique().sort_index()

In [None]:
# Line chart of the monthly title counts via the project helper.
generate_line_chart(
    s=content_per_month,
    title="Content Added Over Time (by date_added)",
    figtitle="Content Addition over time",
    xlabel="Year",
    ylabel="Number of Titles Added"
    )

In [None]:
# Unique (title, country) pairs so a title counts once per country value.
df_country = df[["show_id", "country"]].drop_duplicates()

In [None]:
# 15 most frequent `country` values, passed to the project bar-chart helper.
s = df_country['country'].value_counts().head(15)
barh_top_counts_series_black_background(
    s,
    title="Top 15 Countries of Production (Count & Share)",
    xlabel="Country",
    color="#E50914",
)

### Geographic Intelligence: A Dual-Prong Strategy
This visualization is a cornerstone for the "Geographic Content Analysis" and directly addresses key strategic questions about Netflix's "global expansion strategy". The data reveals a clear dual-prong approach.

1. The US-Centric Core: The United States remains the undisputed primary "production hub", accounting for 41.3% (3,690 titles) of the content. This massive domestic library serves as the historical foundation of the catalog.

2. Aggressive Global Diversification: This chart provides a clear answer to the strategic question, "Is Netflix diversifying away from US content?". The answer is a definitive yes.


International content now forms the majority (58.7%) of the catalog, proving that the "International expansion strategy" has successfully shifted the "International vs. domestic content ratio" to be globally focused.


5.1 Identification of Strategic Regional Hubs
Beyond the US, the analysis identifies the "Top content-producing countries" that serve as key regional hubs.

Primary Hubs (India & UK): India (11.7%) and the United Kingdom (9.0%) are not just minor players; they are major, strategic production centers. India's position as the #2 producer is a critical insight, highlighting a deep investment in that specific market, far outpacing all other countries except the US.


Emerging Hubs (South Korea & Japan): This data validates the "Emerging Hubs" strategy. The presence of South Korea (2.6%) and Japan (3.6%) in the top 10 confirms the strategic investment in high-growth, high-interest markets, as noted in the hackathon's own case study example.


The "Long-Tail" Strategy: The presence of the "Others" (9.3%) category as the 3rd largest bar is a significant finding. It implies that beyond investing in specific hubs, Netflix is also pursuing a "long-tail" strategy, sourcing content from a wide variety of smaller countries. This diversity is key to appealing to a global subscriber base.

In [None]:
# First country listed for each title: missing values become "", the text is
# split on commas, the first piece is kept and trimmed.
df_titles["main_country"] = df_titles["country"].fillna("").str.split(",").str.get(0).str.strip()

In [None]:
# Heuristic lookup from a production country to ONE assumed language.
# Each country maps to a single language, so multilingual countries are
# reduced to one choice (e.g. India -> Hindi, Belgium -> French); the result
# is an approximation, not the title's actual audio language.
# The literal key "Others" maps to "Other".
country_to_lang = {
    "United States": "English",
    "South Africa": "English",
    "Others": "Other",
    "India": "Hindi",
    "United Kingdom": "English",
    "Germany": "German",
    "Mexico": "Spanish",
    "Turkey": "Turkish",
    "Australia": "English",
    "Finland": "Finnish",
    "China": "Mandarin Chinese",
    "Nigeria": "English",
    "Japan": "Japanese",
    "Spain": "Spanish",
    "France": "French",
    "Belgium": "French",
    "South Korea": "Korean",
    "Argentina": "Spanish",
    "Russia": "Russian",
    "Canada": "English",
    "Hong Kong": "Cantonese",
    "Italy": "Italian",
    "Ireland": "English",
    "New Zealand": "English",
    "Jordan": "Arabic",
    "Colombia": "Spanish",
    "Switzerland": "German",
    "Israel": "Hebrew",
    "Brazil": "Portuguese",
    "Taiwan": "Mandarin Chinese",
    "Bulgaria": "Bulgarian",
    "Poland": "Polish",
    "Saudi Arabia": "Arabic",
    "Thailand": "Thai",
    "Indonesia": "Indonesian",
    "Egypt": "Arabic",
    "Kuwait": "Arabic",
    "Malaysia": "Malay",
    "Vietnam": "Vietnamese",
    "Sweden": "Swedish",
    "Lebanon": "Arabic",
    "Romania": "Romanian",
    "Philippines": "Filipino",
    "Iceland": "Icelandic",
    "Denmark": "Danish",
    "United Arab Emirates": "Arabic",
    "Netherlands": "Dutch",
    "Norway": "Norwegian",
    "Syria": "Arabic",
    "Mauritius": "French",
    "Austria": "German",
    "Czech Republic": "Czech",
    "Cameroon": "French",
    "Uruguay": "Spanish",
    "Kenya": "English",
    "Chile": "Spanish",
    "Luxembourg": "French",
    "Bangladesh": "Bengali",
    "Portugal": "Portuguese",
    "Hungary": "Hungarian",
    "Senegal": "French",
    "Singapore": "English",
    "Serbia": "Serbian",
    "Namibia": "English",
    "Peru": "Spanish",
    "Mozambique": "Portuguese",
    "Belarus": "Russian",
    "Ghana": "English",
    "Zimbabwe": "English",
    "Puerto Rico": "Spanish",
    "Pakistan": "Urdu",
    "Cyprus": "Greek",
    "Paraguay": "Spanish",
    "Croatia": "Croatian",
    "Cambodia": "Khmer",
    "Georgia": "Georgian",
    "Soviet Union": "Russian",
    "Greece": "Greek",
    "West Germany": "German",
    "Iran": "Persian (Farsi)",
    "Venezuela": "Spanish",
    "Slovenia": "Slovene",
    "Guatemala": "Spanish",
    "Ukraine": "Ukrainian",
    "Jamaica": "English",
    "Somalia": "Somali",
}


In [None]:
# Look up the assumed language of each title's first-listed country;
# countries absent from country_to_lang (including "") become "Other".
df_titles["language_guess"] = df_titles["main_country"].map(country_to_lang).fillna("Other")

In [None]:
# 15 most frequent guessed languages, passed to the project bar-chart helper.
s=df_titles['language_guess'].value_counts().head(15)
barh_top_counts_series_black_background(
    s,
    title="Top 15 Languages of Production (Count & Share)",
    xlabel="Language",
    color="#E50914",
)

In [None]:
# Month x language table of distinct titles added; month/language
# combinations with no titles are filled with 0.
language_per_month = (
    df_titles.groupby(["year_month", "language_guess"])["show_id"].nunique()
    .unstack("language_guess")
    .fillna(0)
    .sort_index()
)

In [None]:
# Monthly count of titles whose guessed language is English.
# The y-axis counts titles (distinct show_id), so it is labelled as such.
generate_line_chart(
    s=language_per_month["English"],
    title="English Language Content Added Over Time",
    figtitle="",
    xlabel="Year",
    ylabel="Number of Titles Added"
    )

In [None]:
# Monthly count of titles whose guessed language is Hindi.
# The y-axis counts titles (distinct show_id), so it is labelled as such.
generate_line_chart(
    s=language_per_month["Hindi"],
    title="Hindi Language Added Over Time (by date_added)",
    xlabel="Year",
    figtitle="",
    ylabel="Number of Titles Added"
    )

In [None]:
# Monthly count of titles whose guessed language fell into "Other".
# The y-axis counts titles (distinct show_id), so it is labelled as such.
generate_line_chart(
    s=language_per_month["Other"],
    title="Other Language Added Over Time (by date_added)",
    xlabel="Year",
    figtitle="",
    ylabel="Number of Titles Added"
    )

Temporal Analysis of Linguistic Strategy: A Simultaneous Global Expansion
A combined analysis of the language distribution and its growth over time reveals a critical insight: Netflix's global expansion was not a sequential strategy (US-first, then international), but a simultaneous, parallel investment.

1. A Unified "Firehose" Strategy: The most significant finding is that the temporal charts for English, Hindi, and Other Languages all follow the exact same pattern. They all demonstrate:

A slow "trickle" era before 2015.

A dramatic, volatile "firehose" of content additions beginning around 2015-2017. This proves that when Netflix made its "strategic pivot" to rapidly scale its catalog, it scaled its English, Indian, and "long-tail" global content in lockstep. The global expansion was part of the core strategy from the first day of the "Streaming Wars" pivot, not an afterthought.

2. Validating the "Strategic Hubs" Timeline: The "Top 15 Languages" chart identifies English (53.0%) and Hindi (12.0%) as the top two specific linguistic pillars. The line charts confirm when this investment happened. The aggressive, spiky growth in the "Hindi Language Added Over Time" chart confirms that the investment in the Indian market was a core, high-volume component of the post-2015 content acquisition drive.

3. The "Long-Tail" is an Active, High-Volume Strategy: The "Other Language" chart provides a crucial, non-obvious insight. This category, which represents the 9.9% "Other" block, is not a passive collection.

Its peak addition events (reaching over 90 titles) are higher than the peaks for Hindi (around 75).

This proves the "long-tail" global strategy is an active, high-volume acquisition process, just as aggressive as the focus on specific linguistic hubs.

In summary, this combined analysis proves Netflix's content strategy evolved by scaling its domestic (English) and international (Hindi, Other) catalogs simultaneously, executing a single, unified global "firehose" strategy.

### Sourendra's Analysis

In [None]:
# Set a style for all plots
sns.set_style("whitegrid")
# Work on a copy so the temporal feature columns are not added to `df`.
new_df = df.copy()

## Convert `date_added` to datetime format

In [None]:
# Trim surrounding whitespace from 'date_added', then parse it to datetime
date_added_text = new_df['date_added'].str.strip()
new_df['date_added'] = pd.to_datetime(date_added_text)

# Show the dtypes to confirm the conversion took effect
print("\nData types of new_df after conversion:", new_df.dtypes, sep="\n")

## New Year, Month, Day columns

In [None]:
# Derive year, month and weekday (Monday=0) columns from 'date_added'
for feature_name, dt_field in (
    ('year_added', 'year'),
    ('month_added', 'month'),
    ('day_of_week_added', 'dayofweek'),
):
    new_df[feature_name] = getattr(new_df['date_added'].dt, dt_field)

# Preview the date column alongside the derived features
temporal_columns = ['date_added', 'year_added', 'month_added', 'day_of_week_added']
print("\nDataFrame with new temporal features:", new_df[temporal_columns].head(), sep="\n")

## Content lag column addition

In [None]:
# Years between a title's release and its arrival on Netflix
new_df['content_lag'] = new_df['year_added'].sub(new_df['release_year'])

# Preview the inputs next to the computed lag
lag_preview = new_df[['title', 'release_year', 'year_added', 'content_lag']].head(3)
print("\nDataFrame with 'content_lag' feature:", lag_preview, sep="\n")

## Grouping together by year added

In [None]:
# Year x type table of title counts; year/type pairs with no titles become 0
yearly_counts = (
    new_df.groupby('year_added')['type']
    .value_counts()
    .unstack()
    .fillna(0)
)

print("\nContent counts per year:", yearly_counts, sep="\n")

## Bar chart of Content added per year

In [None]:
# Year x type table of title counts (recomputed here so the cell runs on its own)
yearly_counts = (
    new_df.groupby('year_added')['type']
    .value_counts()
    .unstack()
    .fillna(0)
)

# Stacked bar chart: one bar per year, split by title type
bar_stacked(
    yearly_counts,
    title='Content Added to Netflix Each Year (Stacked by Type)',
    xlabel='Year Added',
    ylabel='Number of Titles Added',
)

## Plotting of Cumulative Sum of Titles per year

In [None]:
# Running total of titles per type across years (cumulative sum down the rows).
cumulative_counts = yearly_counts.cumsum()

generate_line_chart(
    cumulative_counts,
    title="Cumulative Growth of Netflix Catalog",
    xlabel="Year",
    ylabel="Total Number of Titles",
    figtitle="",
    color="red",
    marker="o"
)


## Analyze Content Additions by Month

In [None]:
# Count titles per month
monthly_additions = new_df['month_added'].value_counts().sort_index()

# Convert month numbers to abbreviations
import calendar
# month_added can be float (1.0, 2.0, ...) when some date_added values are
# missing, and calendar.month_abbr only accepts integer indices - hence int().
monthly_additions.index = [calendar.month_abbr[int(i)] for i in monthly_additions.index]

# Plot using the helper function
bar_chart_vertical(
    monthly_additions,
    title="Total Content Added by Month (All Years)",
    xlabel="Month",
    ylabel="Number of Titles Added",
    color="#E50914"
)

## Content Additions by Day of the Week

In [None]:
import pandas as pd
import calendar

# Count titles added per weekday (Monday=0, Sunday=6)
day_of_week_additions = new_df['day_of_week_added'].value_counts().sort_index()

# Map numeric day indices (0–6) to weekday names. The index can be float
# when some dates are missing, and calendar.day_name needs an int - hence int().
day_names = [calendar.day_name[int(i)] for i in day_of_week_additions.index]

# Make a Series with weekday names as index
day_counts = pd.Series(day_of_week_additions.values, index=day_names)

# Convert the index to an ordered categorical type (Mon→Sun)
ordered_days = list(calendar.day_name)
day_counts.index = pd.CategoricalIndex(day_counts.index, categories=ordered_days, ordered=True)

# Plot
bar_chart_vertical(
    day_counts,
    title="Number of Titles Added by Day of the Week",
    xlabel="Day of the Week",
    ylabel="Number of Titles Added",
    rotation=45,
)


## Heatmap for a Year-Month View

In [None]:
# --- Temporal Analysis: Year–Month Heatmap ---

# Generate the heatmap showing how many titles were added per year and month
# (rows: year_added, columns: month_added, values taken from show_id).
heatmap_by_category(
    new_df,
    row_col="year_added",
    col_col="month_added",
    value_col="show_id",
    #cmap="viridis",
    figsize=(20, 10),
    title="Heatmap of Content Added by Year and Month"
)


## Shift in Content Strategy (Older vs. Newer Content)

In [None]:
# Calculate the average content lag for each year
# (mean of content_lag over all titles sharing the same year_added).
avg_lag_by_year = new_df.groupby('year_added')['content_lag'].mean()

# Use the helper function
generate_line_chart(
    avg_lag_by_year,
    title="Average Lag Between Release Year and Addition Year on Netflix",
    xlabel="Year Added to Netflix",
    ylabel="Average Lag (in Years)",
    figtitle="",
    color="red",
    marker="o"
)


## Visualize the Distribution of Content Lag with a Box Plot (Plotly)

In [None]:
# --- Temporal Analysis: Content Lag Distribution by Year ---
# Plot interactive boxplot for content lag by year added
# (one box per year_added; outliers are requested via outlier=True).
generate_boxplot_interactive(
    df=new_df,
    cat="year_added",
    val="content_lag",
    outlier=True
)


## Visualize the Distribution of Content Lag with a Box Plot (matplotlib)

In [None]:
# --- Temporal Analysis: Distribution of Content Lag by Year ---

# Plot the distribution of content lag by year added
# (same inputs as the interactive boxplot, using the non-interactive helper).
generate_boxplot(
    df=new_df,
    cat="year_added",
    val="content_lag",
    outlier=True
)


## Average Lag for Movies vs. TV Shows

In [None]:
# Mean content lag per year added, computed separately for each title type
lag_by_type = {
    kind: new_df.loc[new_df['type'] == kind].groupby('year_added')['content_lag'].mean()
    for kind in ('Movie', 'TV Show')
}
avg_lag_movies = lag_by_type['Movie']
avg_lag_tv_shows = lag_by_type['TV Show']

# Legend label -> series, in the shape the multi-line helper takes
data = {"Movies": avg_lag_movies, "TV Shows": avg_lag_tv_shows}

# Draw both series on one chart with the project helper
from helper_functions import generate_multi_line_chart

generate_multi_line_chart(
    data_dict=data,
    title="Average Content Lag: Movies vs. TV Shows",
    xlabel="Year Added to Netflix",
    ylabel="Average Lag (in Years)",
)


## Monthly Heatmap of Movies

In [None]:
import calendar
from matplotlib.colors import LinearSegmentedColormap

# Single-row heatmap of movie additions per calendar month.
# The helper does the aggregation itself; it is given the Movie rows only,
# with 'month_added' as the column axis and a count of 'show_id' as the value.

# Netflix red gradient (for visual consistency)
light_red = "#FFB3B3"
mid_red = "#E50914"
deep_red = "#B81D24"
movie_cmap = LinearSegmentedColormap.from_list(
    "netflix_movie_white",
    [light_red, mid_red, deep_red],
    N=256
)

# --- Call the helper ---
generate_heatmap_flexible(
    df=new_df[new_df["type"] == "Movie"],
    index_col="type",               # Single row, since the frame holds Movie rows only
    column_col="month_added",       # X-axis = Month
    value_col="show_id",            # Count number of movies
    aggfunc="count",
    cmap=movie_cmap,
    figsize=(14, 3),
    orientation="horizontal",
    title="Monthly Additions — Movies",   # em dash was mis-encoded in the title
    xlabel="Month",
    ylabel="",                      # No y-axis label for this layout
    cbar_label="Number of Movie Titles Added",
)


## Monthly heatmap of TV shows



In [None]:
import calendar
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap

# --- New Netflix Themed Gradient (Charcoal to Red) ---
# This theme uses the dark background color often seen in the Netflix UI,
# transitioning to the iconic red for emphasis.
charcoal = "#221F1F"  # A deep, dark gray, almost black
netflix_red = "#E50914" # The classic, vibrant Netflix red
off_white = "#F5F5F1"  # A bright, clean off-white for the highest values

# Create the new colormap
show_cmap_dark = LinearSegmentedColormap.from_list(
    "netflix_show_dark", [charcoal, netflix_red, off_white], N=256
)

# --- Heatmap ---
# Same helper call as the movie heatmap, on the TV Show rows and with the
# dark colormap.
generate_heatmap_flexible(
    df=new_df[new_df["type"] == "TV Show"],
    index_col="type",               # The row will be 'TV Show'
    column_col="month_added",       # X-axis will be the months
    value_col="show_id",            # Value to count for the heatmap intensity
    aggfunc="count",                # Aggregation function
    cmap=show_cmap_dark,            # Apply our new dark theme colormap
    figsize=(14, 3),
    orientation="horizontal",
    title="Monthly Additions — TV Shows",   # em dash was mis-encoded in the title
    xlabel="Month",
    ylabel="",
    cbar_label="Number of TV Shows Added"
)

### Aditya's Analysis

In [None]:
# (rows, columns) of the main frame at this point in the notebook.
df.shape

In [None]:
import matplotlib.pyplot as plt

# Dark theme for every matplotlib figure drawn from here on:
# near-black backgrounds with white text, labels and ticks.
plt.rcParams.update({
    'axes.facecolor': '#141414',
    'figure.facecolor': '#141414',
    'text.color': '#FFFFFF',
    'axes.labelcolor': '#FFFFFF',
    'xtick.color': '#FFFFFF',
    'ytick.color': '#FFFFFF',
})

In [None]:
from importlib import reload
import helper_functions
reload(helper_functions)
# reload() only refreshes the module object; names pulled in earlier with
# `from helper_functions import *` still point at the old definitions, so the
# star import is repeated to rebind them to the reloaded code.
from helper_functions import *

In [None]:
# Project helper; the second argument is presumably the number of countries
# to show - confirm against helper_functions.
plot_top_countries_by_shows(df, 50)

In [None]:
# Project helper comparing titles for `country_name` against the rest.
plot_india_vs_outside_movies(df, country_name="India")

In [None]:
# Re-import the helper names (needed after the module reload above) and plot
# category frequency per country, limited via top_n.
from helper_functions import *
plot_category_frequency_per_country(df, top_n = 40)

In [None]:
# Project helper; note this is the "_by_country" variant, distinct from the
# "_per_country" helper called in the previous cell.
plot_category_frequency_by_country(df, top_n=20)

In [None]:
# Project helper: average movie duration by country, limited via top_n.
plot_avg_movie_duration_by_country(df, top_n=50)

In [None]:
# Project helper: average number of seasons by country, limited via top_n.
plot_avg_seasons_by_country(df, top_n=50)

In [None]:
# Project helper: movie co-production heatmap, limited via top_n.
plot_movie_coproduction_heatmap(df, top_n=30)

In [None]:
# `path_movies` is never defined in this notebook (NameError on a fresh
# kernel). The cleaned 2025 movies file is the defined path that is otherwise
# unused, so it is loaded here.
# NOTE(review): confirm this is the file that carries the genres / budget /
# revenue / popularity columns used in the cells below.
df_movies = pd.read_csv(path_movies_2025)

In [None]:

# Country x genre heatmap from the movies frame; the helper's return value is
# kept in `pivot` (presumably the plotted table - confirm against the helper).
pivot = plot_genre_heatmap_by_country(
    df_movies, 
    top_n=30, 
    country_col='country', 
    genre_col='genres', 
    id_col='show_id', 
    annotate=True
)

In [None]:
! pip install pandas geopandas plotly pycountry

In [None]:
# Budget / revenue view per country from the movies frame, limited via top_n.
from helper_functions import *
plot_country_budget_profit(df_movies, country_col='country', budget_col='budget', revenue_col='revenue', id_col='show_id', top_n=50)

In [None]:
# Alphabetical list of the distinct non-null `country` values, passed to the
# project map helper.
countries = sorted(df_movies['country'].dropna().unique())
plot_country_map(countries)

In [None]:
# Replace df_movies with the helper's output for the 'cast' column.
# NOTE(review): this overwrites df_movies in place of the loaded frame, so
# re-running the cell feeds the already-processed frame back into the helper
# - confirm the helper tolerates that.
df_movies = safe_split_and_explode_cast(df_movies, 'cast') 

### Gap Analysis (Advanced Analysis)

In [None]:
# Project helper on the cast-processed movies frame; 40 is presumably the
# number of cast members to show - confirm against helper_functions.
from helper_functions import *
plot_top_cast_avg_popularity(df_movies, 40)

### Text Analysis

In [None]:
# Raw detailed movies file; its descriptions are scored with VADER below.
df_vader=pd.read_csv(path_movies_2025_raw)

In [None]:
# Keep only the id and the text to be scored.
df_desc=df_vader[['show_id','description']]

In [None]:
# Replace missing values with a placeholder sentence. fillna runs on the whole
# frame, so a missing show_id would receive the placeholder as well.
df_desc=df_desc.fillna('Unknown description.')

In [None]:
# Write the filled descriptions back onto df_vader (aligned on the row index).
df_vader['description']=df_desc['description']

In [None]:
# Score every description with the project helper. The later cells expect the
# result to carry 'show_id' and a 'compound' column.
vaders = score_descriptions(df_desc, id_col="show_id", text_col="description")

In [None]:
# Preview the first scored rows.
vaders.head()

In [None]:
# Attach the scores to the movie rows; the left join keeps every movie.
# NOTE(review): re-running this cell merges the scores a second time and
# produces suffixed duplicate columns - run once per kernel session.
df_vader = df_vader.merge(vaders, on="show_id", how="left")

In [None]:
def vader_label(comp):
    """Turn a VADER compound score into a sentiment label.

    Returns "positive" for scores of 0.05 and above, "negative" for scores of
    -0.05 and below, and "neutral" for everything else (NaN included, since
    both comparisons are False for NaN).
    """
    if comp >= 0.05:
        return "positive"
    return "negative" if comp <= -0.05 else "neutral"

# Label every movie from its compound score.
df_vader["sentiment"] = df_vader["compound"].apply(vader_label)

In [None]:
# Pie chart of the positive / neutral / negative labels for movies.
pie_by_count(
    df_vader,
    column='sentiment',
    title='Sentiment Distribution of Netflix Movie Descriptions',
    startangle=0
)

In [None]:
# Columns needed for the per-genre sentiment breakdown.
df_genre=df_vader[['show_id','genres','sentiment']]

In [None]:
# Split the comma-separated 'genres' column into one row per genre, trim the
# whitespace around each genre name, and renumber the rows.
df_genre = (
    df_genre
    .assign(genres=df_genre['genres'].str.split(','))
    .explode('genres')
    .assign(genres=lambda frame: frame['genres'].str.strip())
    .reset_index(drop=True)
)

In [None]:
# Row count for every (genre, sentiment) combination, as a flat table with a
# "count" column.
sent_dist = df_genre.groupby(["genres", "sentiment"]).size().rename("count").reset_index()


In [None]:
# Preview the first (genre, sentiment, count) rows.
sent_dist.head()

In [None]:
# Genre total repeated on every row of that genre, then each sentiment's
# share of its genre as a percentage.
total_per_genre = sent_dist.groupby("genres")["count"].transform("sum")
sent_dist["perc"] = sent_dist["count"].div(total_per_genre).mul(100)


In [None]:
# Genres as rows, sentiment labels as columns, percentage as the cell value;
# missing combinations count as 0 and genres are ordered by neutral share.
sent_pivot = sent_dist.pivot(index="genres", columns="sentiment", values="perc")
sent_pivot = sent_pivot.fillna(0).sort_values(by="neutral", ascending=False)


In [None]:
# Plot the genre x sentiment percentage table with the project helper.
plot_sentiment_by_genre(sent_pivot, title="Sentiment Distribution by Genre for Movies")


# Insights:-
Sentiment Analysis Across Genres (VADER):- <br>
Applying VADER to title descriptions shows a clear genre-driven pattern. “War”, “Western”, “Action”, and “Thriller” titles have a higher share of negative-leaning descriptions, mainly because their synopses contain conflict words such as fight, war, killer, mission, revenge, crime. In contrast, “Romance”, “Music”, and to some extent “Documentary” show a larger positive segment, as their blurbs tend to use supportive/emotional language like love, journey, celebrates, follows, family. Neutral descriptions are relatively few across all genres, indicating that even short Netflix-style synopses usually contain at least one sentiment-bearing word that pushes the score up or down. Overall, the sentiment we observe is a property of how the genre is written about, not how the genre is received by viewers. Hence, negative sentiment here should be read as “conflict-/threat-heavy description”, not “users dislike this genre.”

In [None]:
# Full movie frame with one row per (title, genre): split the comma-separated
# 'genres' column, expand it, trim each genre name, and renumber the rows.
df_genre_exploded = (
    df_vader.copy()
    .assign(genres=lambda frame: frame['genres'].str.split(','))
    .explode('genres')
    .assign(genres=lambda frame: frame['genres'].str.strip())
    .reset_index(drop=True)
)

In [None]:
# Per-genre summary: mean popularity, mean compound sentiment, and the number
# of distinct titles.
per_genre = df_genre_exploded.groupby('genres')
result = per_genre.agg(
    avg_popularity=('popularity', 'mean'),
    avg_sent=("compound", "mean"),
    movie_count=('title', 'nunique'),
).reset_index()

In [None]:
# Keep genres backed by at least 5 distinct titles, then preview them.
temp = result.loc[result['movie_count'].ge(5)]
temp.head()

In [None]:
# Build the popularity-vs-sentiment figure with the project helper and show it.
fig = genre_popularity_sentiment_scatter(temp)
fig.show()

• Popularity ≠ positivity. Some of the most popular genres sit around 25–35 avg popularity but have neutral to clearly negative average sentiment (bottom-right area). That means darker/conflict-heavy genres still perform well → sentiment is not the primary driver of demand.

• Positive but mid-popular genres exist. A few bubbles in the upper-middle (higher sentiment, mid popularity) suggest there are genres whose descriptions sound “nice” but don’t get as much traction — these are good candidates for better surfacing/promotions.

• High-sentiment + small bubble = under-supplied. Small bubbles high on the y-axis (high avg_sent, low title count) are “pleasantly written but small” genres → catalogue expansion or regional carousels could lift them.

• Overall: genre/content type explains popularity better than description tone; use sentiment mainly for mood tagging / experience, not ranking.

In [None]:
# Per-genre summary: mean rating, mean compound sentiment, and the number of
# distinct titles. This replaces the popularity-based `result` built earlier.
per_genre = df_genre_exploded.groupby('genres')
result = per_genre.agg(
    avg_rating=('rating', 'mean'),
    avg_sent=("compound", "mean"),
    movie_count=('title', 'nunique'),
).reset_index()

In [None]:
# Keep genres backed by at least 5 distinct titles, then preview them.
temp = result.loc[result['movie_count'].ge(5)]
temp.head()

In [None]:
# Build the rating-vs-sentiment figure with the project helper and show it.
fig = genre_rating_sentiment_scatter(temp)
fig.show()

• No strong linear link. Genres with higher user ratings (right side, ~6.4–6.8) are not consistently the ones with more positive descriptions — some of them still sit around 0 or even slightly negative sentiment. So viewers don’t reward “happy-sounding” descriptions directly.

• Well-rated dark genres exist. There are bubbles in the 6.0–6.3 rating band with clearly negative sentiment (below 0). That means genres with conflict/crime/serious themes can still get good audience scores — tone of description ≠ quality.

• Best quadrant to grow: top-right (higher rating, positive sentiment) has only a few bubbles and many of them are small → not many titles live there. That’s a “make more of this” space: genres that people rate well and that are described in an appealing / positive way.

• Conclusion: IMDb/user rating seems to be driven more by content quality / genre than by the wording/sentiment of the synopsis. Sentiment is better used as a mood tag than a predictor of rating.

# TV Shows:-

In [None]:
# Raw detailed TV-shows file; its descriptions are scored with VADER below.
df_vader2=pd.read_csv(path_tv_shows)

In [None]:
# Take the id and description from the TV-show frame itself. Selecting from
# the movie frame (df_vader) here would later overwrite the TV descriptions
# with movie text, matched only by row position.
df_desc2=df_vader2[['show_id','description']]

In [None]:
# Replace missing values with a placeholder sentence. fillna runs on the whole
# frame, so a missing show_id would receive the placeholder as well.
df_desc2=df_desc2.fillna('Unknown description.')

In [None]:
# Copy the 'description' column of df_desc2 onto df_vader2 (aligned on the
# row index).
df_vader2['description']=df_desc2['description']

In [None]:
# Score the TV descriptions with the project helper. The next cell expects the
# returned frame to carry a 'compound' column.
# NOTE(review): the movie section uses score_descriptions + merge instead of
# add_vader_scores - confirm both helpers score text the same way.
df_vader2=add_vader_scores(df_vader2, text_col="description", unknown_text="Unknown description.")

In [None]:
def vader_label(comp):
    """Turn a VADER compound score into a sentiment label.

    Scores of 0.05 or more are "positive", scores of -0.05 or less are
    "negative", and everything else (including NaN) is "neutral".
    """
    if comp >= 0.05:
        return "positive"
    if comp <= -0.05:
        return "negative"
    return "neutral"

# Label every TV-show row from its compound score.
df_vader2["sentiment"] = df_vader2["compound"].map(vader_label)

In [None]:
# Pie chart of how many TV-show descriptions fall into each sentiment label.
# NOTE(review): pie_by_count is a project helper defined outside this notebook — confirm its signature there.
pie_by_count(
    df_vader2,
    column='sentiment',
    title='Sentiment Distribution of Netflix TV Show Descriptions',
    startangle=0
)

In [None]:
# Narrow the TV frame to the columns needed for the per-genre sentiment breakdown.
df_genre2=df_vader2[['show_id','genres','sentiment']]

In [None]:
# Split the comma-separated 'genres' column and expand it into one row per (show, genre)
df_genre2= df_genre2.assign(genres=df_genre2['genres'].str.split(',')).explode('genres')

# Clean up any whitespace around genre names
df_genre2['genres'] = df_genre2['genres'].str.strip()

# Reset index so the exploded rows get a fresh 0..n-1 index
df_genre2 = df_genre2.reset_index(drop=True)

In [None]:
# Count the exploded rows for every (genre, sentiment) combination.
pair_sizes = df_genre2.groupby(["genres", "sentiment"]).size()
sent_dist2 = pair_sizes.rename("count").reset_index()


In [None]:
# Share of each sentiment label inside its genre, in percent.
total_per_genre = sent_dist2.groupby("genres")["count"].transform("sum")
sent_dist2["perc"] = sent_dist2["count"].div(total_per_genre).mul(100)

In [None]:
# Genres as rows, sentiment labels as columns, percentages as cell values.
sent_pivot2 = sent_dist2.pivot(index="genres", columns="sentiment", values="perc")
# A missing (genre, label) pair means 0 %.
sent_pivot2 = sent_pivot2.fillna(0)
# Most neutral genres first.
sent_pivot2 = sent_pivot2.sort_values("neutral", ascending=False)

In [None]:
# Plot the per-genre sentiment percentages computed above.
# NOTE(review): plot_sentiment_by_genre is a project helper defined outside this notebook — chart type not visible here.
plot_sentiment_by_genre(sent_pivot2,title="Sentiment Distribution by Genre for TV Shows")

Sentiment Analysis Across TV Show Genres (VADER):<br>
Applying VADER to TV show descriptions also reveals a strong genre-driven tone. Informational genres like “News”, “Talk”, and “Documentary” are dominated by neutral descriptions, since their blurbs are factual (“covers…”, “explores…”, “discusses…”) rather than emotional. In contrast, family-oriented and entertainment genres — “Kids”, “Comedy”, “Animation”, and “Family” — show a much larger positive segment, reflecting warm/supportive language such as “family”, “friends”, “adventure”, “fun”, “follow the story of…”. Conflict-heavy or plot-tension genres like “Crime”, “Mystery”, “Action & Adventure”, and “Sci-Fi & Fantasy” exhibit higher negative-leaning descriptions because their synopses contain words related to danger, pursuit, threats, or battles. As with movies, neutral text is relatively limited overall, meaning even short TV blurbs usually contain at least one word that pushes sentiment up or down. Therefore, the sentiment here should be interpreted as “tone of description by TV sub-genre” rather than viewer liking — negative ≈ “high-stakes / conflict-driven storyline,” not “audience dislikes this show.”

In [None]:
# Work on a copy so df_vader2 keeps one row per show.
df_genre_exploded = df_vader2.copy()

# Split the comma-separated 'genres' column of the TV frame itself and expand
# it into one row per (show, genre). The split previously read from `df_vader`
# (a different frame), so index alignment attached that frame's genres to the
# TV-show rows.
df_genre_exploded = (
    df_genre_exploded
    .assign(genres=df_genre_exploded['genres'].str.split(','))
    .explode('genres')
)

# Clean up any whitespace around genre names
df_genre_exploded['genres'] = df_genre_exploded['genres'].str.strip()

# Fresh 0..n-1 index for the exploded rows
df_genre_exploded = df_genre_exploded.reset_index(drop=True)

In [None]:
# One row per genre: mean popularity, mean VADER compound score, and the
# number of distinct titles (tv_show_count) that carry the genre.
result = df_genre_exploded.groupby('genres', as_index=False).agg(
    avg_popularity=('popularity', 'mean'),
    avg_sent=('compound', 'mean'),
    tv_show_count=('title', 'nunique'),
)

In [None]:
# Keep genres backed by at least 5 distinct titles, then preview the result.
temp = result.loc[result['tv_show_count'] >= 5]
temp.head()

In [None]:
# Build the popularity-vs-sentiment scatter from the filtered TV genre summary and render it.
# NOTE(review): tv_genre_popularity_sentiment_scatter is a project helper defined outside this notebook — confirm.
fig = tv_genre_popularity_sentiment_scatter(temp)
fig.show()

Popularity vs Sentiment by Genre (VADER):<br>
The scatter shows that average sentiment across TV genres is tight and mildly positive (≈0.13–0.18), but popularity varies a lot more. This means viewer interest is moving more with the genre itself than with how positive the description sounds. A few genres in the top-right (high popularity, high sentiment) are clear “keep promoting” buckets, while some mid-sentiment, mid-popularity genres with large bubbles indicate a broad catalog but only average success, so they are better targets for discoverability improvements than for rewriting descriptions. Overall, sentiment is not a strong discriminator of TV genre performance here — use it mainly for mood tagging, not ranking.

In [None]:
# One row per genre: mean rating, mean VADER compound score, and the
# number of distinct titles (tv_show_count) that carry the genre.
result = df_genre_exploded.groupby('genres', as_index=False).agg(
    avg_rating=('rating', 'mean'),
    avg_sent=('compound', 'mean'),
    tv_show_count=('title', 'nunique'),
)

In [None]:
# Keep genres backed by at least 5 distinct titles, then preview the result.
temp = result.loc[result['tv_show_count'] >= 5]
temp.head()

In [None]:
# Build the rating-vs-sentiment scatter from the filtered TV genre summary and render it.
# NOTE(review): tv_genre_rating_sentiment_scatter is a project helper defined outside this notebook — confirm.
fig = tv_genre_rating_sentiment_scatter(temp)
fig.show()

TV Genres ‚Äì Rating vs Sentiment (VADER):<br>

The whole cloud is very tight: most TV genres sit between 5.3–5.8 average rating and 0.13–0.18 sentiment, so user ratings don’t change much across genres.

A couple of genres in the top-right (higher rating and higher sentiment) look like the “cleanest” TV buckets — they are written in a more positive tone and are also rated slightly better → good candidates to surface.

There are also genres with average ratings but slightly lower sentiment — this shows viewers can still rate a show decently even when the description sounds more serious/conflict-driven.

Overall: for TV, description tone is not a strong driver of user rating; rating differences are small and likely explained by the genre/content type itself, not by how positive the synopsis is.

In [None]:
# (rows, columns) of the main frame `df` used by the remaining cells.
df.shape

In [None]:
# Re-import the local helper module so edits made to it on disk are picked up
# without restarting the kernel.
# NOTE(review): nothing visible in this section reads from `helper_functions`
# directly — confirm this cell is still needed.
from importlib import reload
import helper_functions
reload(helper_functions)

In [None]:
# Rating labels that are left out of both bar charts.
exclude = ['no_data', 'UR', 'NC-17']
netflix_red = "#E50914"

# One frame per content type.
df_movies = df.loc[df['type'] == 'Movie'].copy()
df_shows = df.loc[df['type'] == 'TV Show'].copy()

# Titles per rating, ordered by rating label.
movie_rating_counts = df_movies['rating'].value_counts().sort_index()
show_rating_counts = df_shows['rating'].value_counts().sort_index()

# Remove the excluded labels; labels that are absent are simply not matched.
movie_plot = movie_rating_counts[~movie_rating_counts.index.isin(exclude)]
show_plot = show_rating_counts[~show_rating_counts.index.isin(exclude)]

# Same bar chart twice: movies first, then TV shows.
for rating_counts, chart_title, y_text in (
    (movie_plot, "Rating Distribution for Movies", "Number of Movies"),
    (show_plot, "Rating Distribution for TV Shows", "Number of TV Shows"),
):
    plt.figure(figsize=(8,5))
    plt.bar(rating_counts.index, rating_counts.values, color=netflix_red)
    plt.title(chart_title, fontsize=14, weight='bold')
    plt.xlabel("Rating", fontsize=12)
    plt.ylabel(y_text, fontsize=12)
    plt.gca().set_facecolor("black")
    plt.show()

# ANALYSIS OF COUNTRIES PRODUCING MOVIES AND TV SHOWS OF EACH CATEGORY

In [None]:
import matplotlib.pyplot as plt

# Turn the ", "-separated country and genre strings into lists so each entry
# can be exploded into its own row.
df['country_list'] = df['country'].str.split(", ")
df['genre_list'] = df['listed_in'].str.split(", ")

df_country = df.explode('country_list')
df_genre = df.explode('genre_list')

# R-rated rows of each exploded frame, separated by content type.
r_movies_country = df_country.loc[(df_country['type'] == 'Movie') & (df_country['rating'] == 'R')]
r_shows_country = df_country.loc[(df_country['type'] == 'TV Show') & (df_country['rating'] == 'R')]
r_movies_genre = df_genre.loc[(df_genre['type'] == 'Movie') & (df_genre['rating'] == 'R')]
r_shows_genre = df_genre.loc[(df_genre['type'] == 'TV Show') & (df_genre['rating'] == 'R')]

# Ten most frequent countries per content type.
top_r_movie_countries = r_movies_country['country_list'].value_counts().head(10)
top_r_show_countries = r_shows_country['country_list'].value_counts().head(10)

# Same bar chart twice: movies first, then TV shows.
for country_counts, chart_title, y_text in (
    (top_r_movie_countries, "Top Countries Producing R-Rated Movies", "Number of R-Rated Movies"),
    (top_r_show_countries, "Top Countries Producing R-Rated TV Shows", "Number of R-Rated TV Shows"),
):
    plt.figure(figsize=(10,5))
    plt.bar(country_counts.index, country_counts.values, color='red')
    plt.title(chart_title, fontsize=14, weight='bold')
    plt.xlabel("Country")
    plt.ylabel(y_text)
    plt.xticks(rotation=45)
    plt.gca().set_facecolor("black")
    plt.show()

# TOP GENRES PRODUCING R-RATED CONTENT

In [None]:
# Ten most frequent genres among the R-rated rows, per content type.
top_r_movie_genres = r_movies_genre['genre_list'].value_counts().head(10)
top_r_show_genres = r_shows_genre['genre_list'].value_counts().head(10)

# Same bar chart twice: movies first, then TV shows.
for genre_counts, chart_title, y_text in (
    (top_r_movie_genres, "Top Genres in R-Rated Movies", "Number of R-Rated Movies"),
    (top_r_show_genres, "Top Genres in R-Rated TV Shows", "Number of R-Rated TV Shows"),
):
    plt.figure(figsize=(10,5))
    plt.bar(genre_counts.index, genre_counts.values, color='red')
    plt.title(chart_title, fontsize=14, weight='bold')
    plt.xlabel("Genre")
    plt.ylabel(y_text)
    plt.xticks(rotation=45)
    plt.gca().set_facecolor("black")
    plt.show()

# PG-13

In [None]:
# PG-13 rows of each exploded frame, separated by content type.
# The r_* names are reused from the R-rated section because the next cell reads them.
r_movies_country = df_country.loc[(df_country['type'] == 'Movie') & (df_country['rating'] == 'PG-13')]
r_shows_country = df_country.loc[(df_country['type'] == 'TV Show') & (df_country['rating'] == 'PG-13')]
r_movies_genre = df_genre.loc[(df_genre['type'] == 'Movie') & (df_genre['rating'] == 'PG-13')]
r_shows_genre = df_genre.loc[(df_genre['type'] == 'TV Show') & (df_genre['rating'] == 'PG-13')]

# Ten most frequent countries per content type.
top_r_movie_countries = r_movies_country['country_list'].value_counts().head(10)
top_r_show_countries = r_shows_country['country_list'].value_counts().head(10)

# Same bar chart twice: movies first, then TV shows.
for country_counts, chart_title, y_text in (
    (top_r_movie_countries, "Top Countries Producing PG13-Rated Movies", "Number of PG13-Rated Movies"),
    (top_r_show_countries, "Top Countries Producing PG13-Rated TV Shows", "Number of PG13-Rated TV Shows"),
):
    plt.figure(figsize=(10,5))
    plt.bar(country_counts.index, country_counts.values, color='red')
    plt.title(chart_title, fontsize=14, weight='bold')
    plt.xlabel("Country")
    plt.ylabel(y_text)
    plt.xticks(rotation=45)
    plt.gca().set_facecolor("black")
    plt.show()

In [None]:
# Ten most frequent genres among the PG-13 rows, per content type.
top_r_movie_genres = r_movies_genre['genre_list'].value_counts().head(10)
top_r_show_genres = r_shows_genre['genre_list'].value_counts().head(10)

# Same bar chart twice: movies first, then TV shows.
for genre_counts, chart_title, y_text in (
    (top_r_movie_genres, "Top Genres in PG13-Rated Movies", "Number of PG13-Rated Movies"),
    (top_r_show_genres, "Top Genres in PG13-Rated TV Shows", "Number of PG13-Rated TV Shows"),
):
    plt.figure(figsize=(10,5))
    plt.bar(genre_counts.index, genre_counts.values, color='red')
    plt.title(chart_title, fontsize=14, weight='bold')
    plt.xlabel("Genre")
    plt.ylabel(y_text)
    plt.xticks(rotation=45)
    plt.gca().set_facecolor("black")
    plt.show()

# G-RATED CONTENT

In [None]:
import matplotlib.pyplot as plt

# G-rated rows of each exploded frame, separated by content type.
# The r_* names are reused from the R-rated section because the next cell reads them.
r_movies_country = df_country.loc[(df_country['type'] == 'Movie') & (df_country['rating'] == 'G')]
r_shows_country = df_country.loc[(df_country['type'] == 'TV Show') & (df_country['rating'] == 'G')]
r_movies_genre = df_genre.loc[(df_genre['type'] == 'Movie') & (df_genre['rating'] == 'G')]
r_shows_genre = df_genre.loc[(df_genre['type'] == 'TV Show') & (df_genre['rating'] == 'G')]

# Ten most frequent countries per content type.
top_r_movie_countries = r_movies_country['country_list'].value_counts().head(10)
top_r_show_countries = r_shows_country['country_list'].value_counts().head(10)

# Same bar chart twice: movies first, then TV shows.
for country_counts, chart_title, y_text in (
    (top_r_movie_countries, "Top Countries Producing G-Rated Movies", "Number of G-Rated Movies"),
    (top_r_show_countries, "Top Countries Producing G-Rated TV Shows", "Number of G-Rated TV Shows"),
):
    plt.figure(figsize=(10,5))
    plt.bar(country_counts.index, country_counts.values, color='red')
    plt.title(chart_title, fontsize=14, weight='bold')
    plt.xlabel("Country")
    plt.ylabel(y_text)
    plt.xticks(rotation=45)
    plt.gca().set_facecolor("black")
    plt.show()

In [None]:
# Ten most frequent genres among the G-rated rows, per content type.
top_r_movie_genres = r_movies_genre['genre_list'].value_counts().head(10)
top_r_show_genres = r_shows_genre['genre_list'].value_counts().head(10)

# Same bar chart twice: movies first, then TV shows.
for genre_counts, chart_title, y_text in (
    (top_r_movie_genres, "Top Genres in G-Rated Movies", "Number of G-Rated Movies"),
    (top_r_show_genres, "Top Genres in G-Rated TV Shows", "Number of G-Rated TV Shows"),
):
    plt.figure(figsize=(10,5))
    plt.bar(genre_counts.index, genre_counts.values, color='red')
    plt.title(chart_title, fontsize=14, weight='bold')
    plt.xlabel("Genre")
    plt.ylabel(y_text)
    plt.xticks(rotation=45)
    plt.gca().set_facecolor("black")
    plt.show()

# RATING EVOLUTION OVER TIME

In [None]:
# Integer years are required by the range() used for the x ticks below.
df['release_year'] = df['release_year'].astype(int)

# Filter out unwanted ratings
df_filtered = df[~df['rating'].isin(['UR', 'no_data'])]

# Titles per (release year, rating)
year_rating = df_filtered.groupby(['release_year', 'rating']).size().reset_index(name='count')

# Years as rows, ratings as columns; a missing pair means 0 titles. Sorted by year.
pivot_year_rating = (
    year_rating.pivot(index='release_year', columns='rating', values='count')
    .fillna(0)
    .sort_index()
)

# Netflix color theme (more distinguishable shades), trimmed to the number of
# rating columns actually present.
netflix_colors = [
    "#E50914",  # Netflix Red
    "#B81D24",  # Dark Netflix Red
    "#F5A3A3",  # Soft red-pink
    "#8B0000",  # Deep dark red
    "#C72C41",  # Crimson
    "#FF6B6B",  # Light Coral Red
    "#9A031E"   # Wine Red
][:len(pivot_year_rating.columns)]

# Create the figure once and draw into its axes. Calling plt.figure() and then
# DataFrame.plot.area(figsize=...) opened a second figure and left the first
# one behind as an empty output.
fig, ax = plt.subplots(figsize=(14,7))
pivot_year_rating.plot.area(
    ax=ax,
    linewidth=1.5,
    alpha=0.9,
    color=netflix_colors
)

ax.set_title("Rating Evolution Over Time on Netflix", fontsize=18, weight='bold', color="#E50914")
ax.set_xlabel("Release Year", fontsize=14, color="black")
ax.set_ylabel("Number of Titles", fontsize=14, color="black")

# Add readable year labels (every third year)
plt.xticks(ticks=range(pivot_year_rating.index.min(), pivot_year_rating.index.max()+1, 3),
           rotation=45, color="black")
plt.yticks(color="black")

legend = ax.legend(title="Rating", facecolor="black", edgecolor="#E50914", labelcolor="white", title_fontsize=12)
# The legend title defaults to black, which is unreadable on the black legend box.
legend.get_title().set_color("white")

ax.grid(alpha=0.25)
ax.set_facecolor("black")
plt.show()

In [None]:
def categorize_rating(r):
    """Map a rating label to a coarse audience group.

    'G' and 'PG' -> 'Family-Friendly', 'PG-13' -> 'Teen / General Audience',
    'R' -> 'Mature'. Any other value (e.g. no_data or small categories)
    yields None so it can be ignored downstream.
    """
    audience_by_rating = {
        'G': 'Family-Friendly',
        'PG': 'Family-Friendly',
        'PG-13': 'Teen / General Audience',
        'R': 'Mature',
    }
    return audience_by_rating.get(r)

import seaborn as sns
import matplotlib.pyplot as plt

# Bucket every title by audience group; ratings categorize_rating does not
# recognise become None.
df['content_group'] = df['rating'].apply(categorize_rating)

# groupby drops the None groups, so the percentages below are relative to the
# categorised titles of each content type only.
content_dist = df.groupby(['type', 'content_group']).size().reset_index(name='count')
content_dist['percent'] = (
    content_dist['count'] / content_dist.groupby('type')['count'].transform('sum') * 100
).round(2)

content_percent = content_dist  # alias kept in case another cell refers to it

# Netflix theme colors
netflix_red_dark = "#E50914"   # Movies (darker red)
netflix_red_light = "#E87C7C"  # TV Shows (lighter red)
netflix_black = "#141414"
# Was "#141414" (same as black), which made the grid and legend border
# invisible against the black plot area.
netflix_white = "#FFFFFF"

plt.figure(figsize=(10,6))
sns.barplot(
    data=content_dist,
    x='content_group',
    y='percent',
    hue='type',
    palette=[netflix_red_dark, netflix_red_light]  # Dark for movies, light for TV
)

# Text outside the plot area sits on the default light figure background, so it stays dark.
plt.title("Family-Friendly vs Mature Content Balance on Netflix", fontsize=18, weight='bold', color=netflix_black)
plt.xlabel("Content Category", fontsize=14, color=netflix_black)
# The bars show the 'percent' column, not raw counts.
plt.ylabel("Share of Titles (%)", fontsize=14, color=netflix_black)

plt.xticks(color=netflix_black)
plt.yticks(color=netflix_black)
ax = plt.gca()
ax.set_facecolor(netflix_black)
for spine in ax.spines.values():
    spine.set_color(netflix_black)

legend = plt.legend(
    title="Content Type",
    facecolor=netflix_black,
    edgecolor=netflix_white,
    labelcolor="#FFFFFF",
    title_fontsize=12,
    fontsize=11
)
# The legend title defaults to black, which is unreadable on the dark legend box.
legend.get_title().set_color(netflix_white)

plt.grid(alpha=0.15, color=netflix_white)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

movies = df[df['type'] == 'Movie'].copy()
shows = df[df['type'] == 'TV Show'].copy()

# First run of digits in the duration text -> runtime in minutes (NaN when there is none).
movies['minutes'] = pd.to_numeric(
    movies['duration'].str.extract(r'(\d+)', expand=False), errors='coerce'
)

# Keep strictly positive, finite runtimes. NaN fails the comparison, so rows
# without a number are removed here as well. The earlier
# `replace(...).dropna()` assigned back to the column was a no-op (index
# alignment restored the NaNs); .copy() avoids SettingWithCopyWarning later on.
movies = movies[np.isfinite(movies['minutes']) & (movies['minutes'] > 0)].copy()

# Same extraction for TV shows: the number is the season count.
shows['seasons'] = pd.to_numeric(
    shows['duration'].str.extract(r'(\d+)', expand=False), errors='coerce'
)
shows = shows[shows['seasons'] > 0].copy()

netflix_red = "#E50914"
netflix_black = "#141414"
netflix_white = "#FFFFFF"

plt.figure(figsize=(12,6))
sns.histplot(movies['minutes'], kde=True, color=netflix_red)

plt.title("Distribution of Movie Runtimes on Netflix", fontsize=18, weight='bold', color=netflix_red)
plt.xlabel("Runtime (minutes)", fontsize=14, color=netflix_black)
plt.ylabel("Number of Movies", fontsize=14, color=netflix_black)
plt.xticks(color=netflix_black)
plt.yticks(color=netflix_black)

ax = plt.gca()
ax.set_facecolor(netflix_black)
for spine in ax.spines.values():
    spine.set_color(netflix_black)

plt.grid(alpha=0.2)
plt.show()

In [None]:
plt.figure(figsize=(12,6))
# Single-colour bars: use `color=`. Passing `palette` without `hue` is
# deprecated in recent seaborn releases. Season counts are cast to int so the
# tick labels read "1, 2, 3" instead of "1.0, 2.0, 3.0" (rows were already
# filtered to seasons > 0, so there are no NaNs to cast).
sns.countplot(
    x='seasons',
    data=shows.assign(seasons=shows['seasons'].astype(int)),
    color=netflix_red
)

plt.title("Distribution of Number of Seasons in Netflix TV Shows", fontsize=18, weight='bold', color=netflix_red)
plt.xlabel("Number of Seasons", fontsize=14, color=netflix_black)
plt.ylabel("Number of Shows", fontsize=14, color=netflix_black)
plt.xticks(color=netflix_black)
plt.yticks(color=netflix_black)

ax = plt.gca()
ax.set_facecolor(netflix_black)
for spine in ax.spines.values():
    spine.set_color(netflix_black)

plt.grid(alpha=0.2)
plt.show()

# AVERAGE DURATION

In [None]:
movies = df[df['type'] == 'Movie'].copy()
tv = df[df['type'] == 'TV Show'].copy()

# Leading number of the duration text: minutes for movies, seasons for TV shows.
# to_numeric leaves rows without a number as NaN (ignored by mean() below);
# the previous .astype(int) raised on any such row.
movies['duration_clean'] = pd.to_numeric(
    movies['duration'].str.extract(r'(\d+)', expand=False), errors='coerce'
)
tv['duration_clean'] = pd.to_numeric(
    tv['duration'].str.extract(r'(\d+)', expand=False), errors='coerce'
)

# Split genres into separate rows
movies = movies.assign(genre = movies['listed_in'].str.split(', ')).explode('genre')
tv = tv.assign(genre = tv['listed_in'].str.split(', ')).explode('genre')

netflix_red = "#E50914"
netflix_black = "#141414"
netflix_white = "#FFFFFF"

# Movies: mean runtime per genre, shortest first
movie_genre_avg = movies.groupby('genre')['duration_clean'].mean().sort_values()

plt.figure(figsize=(11,5))
plt.bar(movie_genre_avg.index, movie_genre_avg.values, color=netflix_red)

plt.title("Average Movie Runtime by Genre (Netflix)", fontsize=16, weight='bold', color=netflix_black)
plt.ylabel("Minutes", fontsize=13, color=netflix_black)
plt.xticks(rotation=65, color=netflix_black)
plt.yticks(color=netflix_black)
ax = plt.gca()
ax.set_facecolor(netflix_black)

# Outline spines
for spine in ax.spines.values():
    spine.set_color(netflix_red)

# Light grid in white
plt.grid(alpha=0.15, color=netflix_white)

plt.show()

In [None]:
netflix_red = "#E50914"
netflix_black = "#141414"
netflix_white = "#FFFFFF"

# Mean of duration_clean per TV genre, smallest first.
tv_genre_avg = tv.groupby('genre')['duration_clean'].mean().sort_values()

plt.figure(figsize=(16,5))
ax = plt.gca()

# Bars on a black plot area with red outline spines.
ax.bar(tv_genre_avg.index, tv_genre_avg.values, color=netflix_red)
ax.set_facecolor(netflix_black)
for spine in ax.spines.values():
    spine.set_color(netflix_red)

ax.set_title("Average Number of Seasons by Genre (Netflix TV Shows)", fontsize=16, weight='bold', color=netflix_black)
ax.set_ylabel("Seasons (Average)", fontsize=13, color=netflix_black)
plt.xticks(rotation=65, color=netflix_black)
plt.yticks(color=netflix_black)

# Faint white grid.
plt.grid(alpha=0.15, color=netflix_white)

plt.show()


# Strategic analysis

In [None]:
#distribution of movies
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def get_top4_donut_data(df, content_type):
    """Return the rating shares for one content type as top 4 + "Others".

    Parameters
    ----------
    df : DataFrame with at least the columns 'type' and 'rating'.
    content_type : value of 'type' to keep (e.g. "Movie" or "TV Show").

    Returns
    -------
    DataFrame with columns ['rating', 'percent'] and a clean 0..n-1 index:
    up to four rows for the most frequent ratings (percent of titles,
    rounded to 2 decimals, largest first) followed by one 'Others' row
    holding the sum of the remaining shares (0 when there are none).
    """
    ratings = df.loc[df['type'] == content_type, 'rating']

    shares = (
        (ratings.value_counts(normalize=True) * 100)
        .round(2)
        .rename_axis('rating')
        .reset_index(name='percent')
        # Stable sort + fresh index: row labels are guaranteed to be 0..n-1 in
        # display order even when several ratings tie.
        .sort_values(by='percent', ascending=False, kind='stable')
        .reset_index(drop=True)
    )

    top4 = shares.head(4)
    others_value = round(float(shares['percent'].iloc[4:].sum()), 2)
    others = pd.DataFrame({'rating': ['Others'], 'percent': [others_value]})

    # Append by concatenation rather than by writing to label len(top4), which
    # could overwrite an existing row if the labels were not 0..3.
    return pd.concat([top4, others], ignore_index=True)

# Top-4 + Others rating shares for each content type.
movie_data = get_top4_donut_data(df, "Movie")
tv_data = get_top4_donut_data(df, "TV Show")

# Red shades shared by both donuts.
colors = ["#ff0000", "#cc0000", "#990000", "#660000", "#330000"]

# Two pie-type ("domain") subplots side by side.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "domain"}, {"type": "domain"}]],
    subplot_titles=["Movies", "TV Shows"]
)

# Movies go into column 1, TV shows into column 2.
for col_no, donut in enumerate((movie_data, tv_data), start=1):
    fig.add_trace(
        go.Pie(
            labels=donut['rating'],
            values=donut['percent'],
            hole=0.55,
            marker=dict(colors=colors),
            textinfo='percent+label'
        ),
        row=1, col=col_no
    )

# Dark theme with a horizontal legend centred below the charts.
fig.update_layout(
    title_text="Rating Distribution: Movies vs TV Shows (Top 4 + Others)",
    template="plotly_dark",
    paper_bgcolor="black",
    plot_bgcolor="black",
    font=dict(color="white", size=14),
    legend=dict(orientation="h", x=0.5, xanchor="center", y=-0.1)
)

fig.show()

In [None]:

#helper functions
def compute_genre_rating_pivot(df, content_type, drop_no_data=True, drop_aggregate_category=None):
    """Cross-tabulate genres against ratings for one content type.

    Parameters
    ----------
    df : DataFrame with the columns 'type', 'rating' and 'listed_in'
        ('listed_in' holds comma-separated genre names).
    content_type : value of 'type' to keep (e.g. "Movie").
    drop_no_data : when True and content_type is "Movie", the rating columns
        'no_data', 'NC-17' and 'UR' are removed if they are present.
    drop_aggregate_category : optional genre label to remove from the
        percentage table only (e.g. the catch-all "Movies" genre).

    Returns
    -------
    (pivot, pivot_percent) : title counts per (genre, rating), and the same
    table as row-wise percentages rounded to one decimal.
    """
    # Filter for the selected content type
    sub = df.loc[df['type'] == content_type, ['rating', 'listed_in']]
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    sub = sub.assign(listed_in=sub['listed_in'].str.split(r',\s*')).explode('listed_in')

    counts = sub.groupby(['rating', 'listed_in']).size().reset_index(name='count')
    pivot = counts.pivot(index='listed_in', columns='rating', values='count').fillna(0)

    if drop_no_data and content_type == 'Movie':
        # errors='ignore': not every dataset contains all three labels, and a
        # plain drop raised KeyError as soon as one of them was missing.
        pivot = pivot.drop(columns=['no_data', 'NC-17', 'UR'], errors='ignore')

    # Share of each rating within a genre (row-wise)
    pivot_percent = pivot.div(pivot.sum(axis=1), axis=0) * 100

    # Optionally remove the combined genre row from the percentage table
    if drop_aggregate_category is not None and drop_aggregate_category in pivot_percent.index:
        pivot_percent = pivot_percent.drop(index=drop_aggregate_category)

    return pivot, pivot_percent.round(1)
import plotly.express as px

def plot_netflix_heatmap(data, title="Heatmap",
                         x_label="Rating", y_label="Genre",
                         color_scale=["#1a0000", "#8b0000", "#ff0000"],
                         width=900, height=900):
    """Show a dark-themed plotly heatmap of `data`.

    This single definition replaces two consecutive definitions of the same
    name (the first was dead code, shadowed by the second).

    Parameters
    ----------
    data : 2-D table passed to px.imshow (e.g. a genre x rating percentage frame).
    title : figure title.
    x_label, y_label : axis titles; the colour bar is labelled "Percent (%)".
    color_scale : colours of the continuous scale, low to high (not modified).
    width, height : figure size in pixels.

    Returns
    -------
    None; the figure is displayed with fig.show().
    """
    fig = px.imshow(
        data,
        color_continuous_scale=color_scale,
        aspect="auto",   # cells stretch to fill the figure
        labels=dict(x=x_label, y=y_label, color="Percent (%)"),
    )

    # Larger tick/axis fonts; rating labels go on top. The former
    # `scaleratio=1` was dropped: without `scaleanchor` it has no effect.
    fig.update_xaxes(
        side="top",
        tickfont=dict(size=16),
        title_font=dict(size=18),
    )
    fig.update_yaxes(
        tickfont=dict(size=16),
        title_font=dict(size=18),
    )

    fig.update_layout(
        title=dict(text=title, font=dict(size=22)),
        template="plotly_dark",
        paper_bgcolor="black",
        plot_bgcolor="black",
        font=dict(color="white", size=16),
        width=width,
        height=height,
        margin=dict(l=80, r=80, t=120, b=80)
    )

    fig.show()


In [None]:
# Genre x rating tables for movies; the catch-all "Movies" genre is removed from the percentage table.
movie_pivot, movie_percent = compute_genre_rating_pivot(df, content_type="Movie",drop_aggregate_category='Movies')
# Not the last expression of the cell, so this preview is evaluated but not displayed.
movie_percent.head(5)
# Heatmap of the per-genre rating percentages (default title and axis labels).
plot_netflix_heatmap(movie_percent)

# demographic distribution

In [None]:
import plotly.express as px

# One row per (movie, genre, country). Raw strings are used for the regex:
# '\s' in a normal string literal is an invalid escape sequence.
country = df.loc[df['type'] == 'Movie', ['type', 'country', 'listed_in']]
country = country.assign(listed_in=country['listed_in'].str.split(r',\s*')).explode('listed_in')
country = country.assign(country=country['country'].str.split(r',\s*')).explode('country')

# "International Movies" is excluded from the ranking.
country = country[country['listed_in'] != 'International Movies']

# Count # of movies in each (country, genre) pair
country_genre_counts = (
    country.groupby(['country', 'listed_in'])
           .size()
           .reset_index(name='movie_count')
)

# Get the top genre for each country: sort so the largest count comes first
# within each country, then keep the first row per country.
top_genre_by_country = (
    country_genre_counts.sort_values(['country', 'movie_count'], ascending=[True, False])
                        .groupby('country')
                        .head(1)
                        .reset_index(drop=True)
                        .rename(columns={'listed_in': 'top_genre'})
)

fig = px.choropleth(
    top_genre_by_country,
    locations="country",
    locationmode="country names",
    color="top_genre",
    title="Most Popular Movie Genre by Country",
    color_discrete_sequence=px.colors.sequential.Reds
)

fig.update_layout(
    template="plotly_dark",
    paper_bgcolor="black",
    plot_bgcolor="black",
    font=dict(color="white", size=12),
)

fig.show()


# common genre

In [None]:
import pandas as pd
import plotly.express as px

# Split and explode genres (raw string: '\s' is an invalid escape in a normal literal)
genre_df = df.loc[df['type'] == 'Movie', ['type', 'listed_in']]
genre_df = genre_df.assign(listed_in=genre_df['listed_in'].str.split(r',\s*')).explode('listed_in')

# Count number of titles per genre (value_counts is already sorted, largest first)
genre_count = genre_df['listed_in'].value_counts().reset_index()
genre_count.columns = ['genre', 'count']

# Plot Bar Chart
fig = px.bar(
    genre_count.head(15),  # Top 15 genres for clean display
    x='genre',
    y='count',
    text='count',
    title='Most Common Movie Genres on Netflix',  # typo "Moive" fixed
    color='count',
    template='plotly_dark',
    color_continuous_scale='reds'
)

fig.update_layout(
    xaxis_title="Genre",
    yaxis_title="Number of Titles",
    paper_bgcolor="black",
    plot_bgcolor="black",
    font=dict(color="white")
)

fig.update_traces(textposition="outside")

fig.show()


# family vs mature

In [None]:
import pandas as pd
import plotly.express as px

# Step 1: Simplify ratings into Family vs Mature (other labels are left unchanged)
df['rating_simple'] = df['rating'].replace({
    'G':'Family', 'PG':'Family', 'PG-13':'Mature',
    'TV-Y':'Family', 'TV-Y7':'Family', 'TV-G':'Family',
    'TV-PG':'Family',
    'R':'Mature', 'NC-17':'Mature', 'TV-14':'Mature', 'TV-MA':'Mature'
})

# Step 2: Keep movies with a Family/Mature label, and only the columns needed
# (the content-type filter was previously applied twice).
is_labelled_movie = df['rating_simple'].isin(['Family','Mature']) & (df['type'] == 'Movie')
df_clean = df.loc[is_labelled_movie, ['type', 'listed_in', 'rating_simple']].copy()

# Step 3: Expand genres (raw string: '\s' is an invalid escape in a normal literal)
df_clean['listed_in'] = df_clean['listed_in'].str.split(r',\s*')
df_clean = df_clean.explode('listed_in')

# Count Family vs Mature occurrences per genre
genre_counts = df_clean.groupby(['listed_in', 'rating_simple']).size().reset_index(name='count')

# Convert to percent within each genre. transform('sum') replaces
# groupby.apply(lambda x: x.assign(...)), which relies on the grouping column
# being passed to the applied function (deprecated in pandas 2.2); without
# that, 'listed_in' would be missing from the result.
genre_percent = genre_counts.assign(
    percent=genre_counts['count'] / genre_counts.groupby('listed_in')['count'].transform('sum') * 100
)

fig = px.bar(
    genre_percent,
    x="listed_in",
    y="percent",
    color="rating_simple",
    barmode="group",
    labels={'listed_in':'Genre', 'percent':'Percentage (%)', 'rating_simple':'Audience Type'},
    title="Family vs Mature Content Distribution by Genre (Movies)",
    color_discrete_map={'Family':'#E50914', 'Mature':'white'}
)

fig.update_layout(
    template="plotly_dark",
    plot_bgcolor="black",
    paper_bgcolor="black",
    font=dict(color="white", size=14),
    xaxis_tickangle=45
)

fig.show()


In [None]:
import pandas as pd
import plotly.express as px

# Step 1: Simplify ratings into Family vs Mature (other labels are left unchanged)
df['rating_simple'] = df['rating'].replace({
    'G':'Family', 'PG':'Family', 'PG-13':'Mature',
    'TV-Y':'Family', 'TV-Y7':'Family', 'TV-G':'Family',
    'TV-PG':'Family',
    'R':'Mature', 'NC-17':'Mature', 'TV-14':'Mature', 'TV-MA':'Mature'
})

# Step 2: Keep TV shows with a Family/Mature label, and only the columns needed
# (the content-type filter was previously applied twice and its comment said "Movies").
is_labelled_show = df['rating_simple'].isin(['Family','Mature']) & (df['type'] == 'TV Show')
df_clean = df.loc[is_labelled_show, ['type', 'listed_in', 'rating_simple']].copy()

# Step 3: Expand genres (raw string: '\s' is an invalid escape in a normal literal)
df_clean['listed_in'] = df_clean['listed_in'].str.split(r',\s*')
df_clean = df_clean.explode('listed_in')

# Count Family vs Mature occurrences per genre
genre_counts = df_clean.groupby(['listed_in', 'rating_simple']).size().reset_index(name='count')

# Convert to percent within each genre. transform('sum') replaces
# groupby.apply(lambda x: x.assign(...)), which relies on the grouping column
# being passed to the applied function (deprecated in pandas 2.2); without
# that, 'listed_in' would be missing from the result.
genre_percent = genre_counts.assign(
    percent=genre_counts['count'] / genre_counts.groupby('listed_in')['count'].transform('sum') * 100
)

fig = px.bar(
    genre_percent,
    x="listed_in",
    y="percent",
    color="rating_simple",
    barmode="group",
    labels={'listed_in':'Genre', 'percent':'Percentage (%)', 'rating_simple':'Audience Type'},
    title="Family vs Mature Content Distribution by Genre (TV Shows)",
    color_discrete_map={'Family':'#E50914', 'Mature':'white'}
)

fig.update_layout(
    template="plotly_dark",
    plot_bgcolor="black",
    paper_bgcolor="black",
    font=dict(color="white", size=14),
    xaxis_tickangle=45
)

fig.show()


# emerging content

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

# Filter Movies only
df_em = df[df['type'] == 'Movie'][['date_added', 'listed_in']].copy()

# Convert date; unparseable values become NaT and drop out of the groupby below
df_em['date_added'] = pd.to_datetime(df_em['date_added'], errors='coerce')
df_em['year_added'] = df_em['date_added'].dt.year

# Expand genres (raw string: '\s' is an invalid escape in a normal literal)
df_em['listed_in'] = df_em['listed_in'].str.split(r',\s*')
df_em = df_em.explode('listed_in')

# Count titles by genre per year
genre_trend = df_em.groupby(['listed_in', 'year_added']).size().reset_index(name='count')

# Growth rate = slope of a straight-line fit of count against year
# (0 for genres with fewer than 2 data points). Selecting the two value
# columns keeps the grouping column out of the applied frame, which the
# plain groupby.apply form is deprecated for in pandas 2.2.
trend = (
    genre_trend.groupby('listed_in')[['year_added', 'count']]
    .apply(lambda g: np.polyfit(g['year_added'], g['count'], 1)[0] if len(g) > 1 else 0)
    .reset_index(name='growth_rate')
)

# Top Emerging Genres: the 8 steepest positive slopes
emerging_genres = trend.sort_values(by='growth_rate', ascending=False).head(8)

top_trending = genre_trend[genre_trend['listed_in'].isin(emerging_genres['listed_in'])]

fig = px.line(
    top_trending,
    x='year_added',
    y='count',
    color='listed_in',
    title='Emerging Movie Genres on Netflix Over Time',
    markers=True
)

fig.update_layout(
    template="plotly_dark",
    font=dict(size=22),              # <-- Increase all text size
    title_font=dict(size=32),        # <-- Title size
    legend=dict(font=dict(size=26)), # <-- Legend font size
)

fig.update_xaxes(title_font=dict(size=25), tickfont=dict(size=21))
fig.update_yaxes(title_font=dict(size=25), tickfont=dict(size=21))

fig.show()


In [None]:
import plotly.express as px

# Filter TV Shows only
df_em = df[df['type'] == 'TV Show'][['date_added', 'listed_in']].copy()

# Convert date; unparseable values become NaT and drop out of the groupby below
df_em['date_added'] = pd.to_datetime(df_em['date_added'], errors='coerce')
df_em['year_added'] = df_em['date_added'].dt.year

# Expand genres (raw string: '\s' is an invalid escape in a normal literal)
df_em['listed_in'] = df_em['listed_in'].str.split(r',\s*')
df_em = df_em.explode('listed_in')

# Count titles by genre per year
genre_trend = df_em.groupby(['listed_in', 'year_added']).size().reset_index(name='count')

# Growth rate = slope of a straight-line fit of count against year
# (0 for genres with fewer than 2 data points). Selecting the two value
# columns keeps the grouping column out of the applied frame, which the
# plain groupby.apply form is deprecated for in pandas 2.2.
trend = (
    genre_trend.groupby('listed_in')[['year_added', 'count']]
    .apply(lambda g: np.polyfit(g['year_added'], g['count'], 1)[0] if len(g) > 1 else 0)
    .reset_index(name='growth_rate')
)

# Top Emerging Genres: the 8 steepest positive slopes
emerging_genres = trend.sort_values(by='growth_rate', ascending=False).head(8)

top_trending = genre_trend[genre_trend['listed_in'].isin(emerging_genres['listed_in'])]

fig = px.line(
    top_trending,
    x='year_added',
    y='count',
    color='listed_in',
    title='Emerging TV Shows Genres on Netflix Over Time',
    markers=True
)

fig.update_layout(
    template="plotly_dark",
    font=dict(size=22),              # <-- Increase all text size
    title_font=dict(size=32),        # <-- Title size
    legend=dict(font=dict(size=26)), # <-- Legend font size
)

fig.update_xaxes(title_font=dict(size=25), tickfont=dict(size=21))
fig.update_yaxes(title_font=dict(size=25), tickfont=dict(size=21))

fig.show()


# Genre co-occurrence

In [None]:
# Genre co-occurrence heatmap for movies: count how often two genres are
# listed on the same title, then normalise each count by the smaller of the
# two genres' individual counts.
import itertools

import numpy as np
import pandas as pd
import plotly.express as px

# --- Step 1: Split Genres ---
df_mix = df.copy()
# Raw string: '\s' in a plain string literal is an invalid escape sequence.
df_mix['listed_in'] = df_mix['listed_in'].str.split(r',\s*')

# --- Step 2: Choose Movies (Change to TV Shows if needed) ---
# reset_index gives every movie its own row id; explode() repeats that id for
# each of the movie's genres, so the id can be used to regroup per movie.
df_expanded = (
    df_mix.loc[df_mix['type'] == 'Movie', ['title', 'listed_in']]
    .reset_index(drop=True)
    .explode('listed_in')
)

# --- Step 3: Generate Genre Pairs For Each Title ---
# Group by row id rather than by 'title': two different movies that share a
# title would otherwise have their genre lists merged, producing pairs that
# never appeared together. dropna() keeps sorted() from comparing NaN to str.
pairs = (
    df_expanded.groupby(level=0)['listed_in']
    .apply(lambda genres: list(itertools.combinations(sorted(set(genres.dropna())), 2)))
    .explode()
    .dropna()
)

# --- Step 4: Count Pair Frequency ---
pair_counts = pairs.value_counts().rename_axis('pair').reset_index(name='count')

# Split pair tuple -> two separate columns
pair_counts[['genre1', 'genre2']] = pd.DataFrame(
    pair_counts['pair'].tolist(),
    index=pair_counts.index,
    columns=['genre1', 'genre2'],
)
pair_counts = pair_counts.drop(columns=['pair'])

# --- Step 5: Build Symmetric Matrix and Add Diagonal ---
# Duplicate reversed pairs (B,A) for symmetry
pair_sym = pd.concat([
    pair_counts[['genre1', 'genre2', 'count']],
    pair_counts.rename(columns={'genre1': 'genre2', 'genre2': 'genre1'})[['genre1', 'genre2', 'count']]
])

# Add diagonal values = genre individual counts
diag = df_expanded['listed_in'].value_counts().reset_index()
diag.columns = ['genre1', 'count']
diag['genre2'] = diag['genre1']

pair_full = pd.concat([pair_sym, diag], ignore_index=True)

# Final full matrix (pairs that never occur are filled with 0)
matrix = pair_full.pivot_table(index='genre1', columns='genre2', values='count', fill_value=0)

# --- Step 6: Limit to Top 15 Most Common Genres ---
top15_genres = diag.sort_values('count', ascending=False).head(15)['genre1']
matrix = matrix.loc[top15_genres, top15_genres]

# --- Step 7: Normalize Co-occurrence (Percentage Similarity) ---
# Each cell is divided by min(count(g1), count(g2)); the diagonal becomes 1.
genre_counts = diag.set_index('genre1')['count']
top_counts = genre_counts.loc[matrix.index].to_numpy()
norm_matrix = matrix / np.minimum.outer(top_counts, top_counts)

fig = px.imshow(
    norm_matrix,
    text_auto=".2f",
    color_continuous_scale="Reds",
    aspect="auto"
)

fig.update_layout(
    title={
        'text': "Normalized Genre Co-Occurrence (Top 15 Movie Genres)",
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=22, color="white", family="Arial Black")
    },
    template="plotly_dark",
    width=800,
    height=800,
    coloraxis_colorbar=dict(
        title=dict(
            text="Similarity",
            font=dict(color="white")
        ),
        tickfont=dict(color="white")
    )
)

fig.update_xaxes(tickangle=45, tickfont=dict(size=16, color="white"))
fig.update_yaxes(tickfont=dict(size=16, color="white"))

fig.show()



### John's Analysis

In [None]:
# Load the raw movie and TV-show tables from the paths configured at the top
# of the notebook.
movies = pd.read_csv(path_movies_2025_raw)
shows = pd.read_csv(path_tv_shows)

In [None]:
# Share of missing values per column of `movies`, in percent.
missing_percentage = 100 * movies.isnull().mean()
print(missing_percentage)

In [None]:
# Drop the 'duration' column and replace every remaining NaN with the
# sentinel string 'missing'.
# errors='ignore' keeps the cell re-runnable: the previous in-place drop
# raised KeyError the second time because 'duration' was already gone.
# NOTE(review): fillna('missing') is applied to every column, so a numeric
# column that still contains NaN ends up holding strings — confirm the
# downstream helpers expect that.
movies = movies.drop(columns=['duration'], errors='ignore').fillna('missing')

In [None]:
# Copy of `movies` whose 'genres' cell is a list of genre names, obtained by
# splitting the comma-separated string and trimming each piece.
movies_exp = movies.copy()
movies_exp['genres'] = movies_exp['genres'].apply(
    lambda raw: list(map(str.strip, raw.split(',')))
)

In [None]:
# Build `movies_exploded`: one row per (movie, genre).
# NOTE(review): this rebinds the notebook-wide name `df` (the main dataset
# loaded near the top), so earlier cells that read `df` will not work if they
# are re-run after this one.
df = movies.copy()
# Split on commas with any surrounding whitespace and trim the ends, so a
# value such as 'Action,Drama' or 'Action , Drama' is split the same way as
# 'Action, Drama' (a literal ', ' split would leave those unsplit/untrimmed).
df['genres'] = df['genres'].str.strip().str.split(r'\s*,\s*')
movies_exploded = df.explode('genres', ignore_index=True)
# Rows whose genre was NaN were filled with the 'missing' sentinel earlier.
movies_exploded = movies_exploded[movies_exploded['genres'] != 'missing']

## Rating Analysis for each genre

In [None]:
# Project helper (star-imported; definition not visible here), called with
# the exploded movie table, the 'genres' and 'rating' columns and a boolean
# flag — TODO confirm what the flag controls.
generate_boxplot_interactive(
    movies_exploded,
    'genres',
    'rating',
    True,
)

### F-test to check the statistical significance of differences between mean values of ratings of genres

In [None]:
# Ftest is a project helper (definition not visible here). Its three return
# values are unpacked below; the third is passed on to LSD_test in the next
# cell. 0.05 is presumably the significance level — TODO confirm.
f_stat, f_crit, sse_rating = Ftest(movies_exploded, 'genres', 'rating', 0.05)
print(f_stat, f_crit, sep="   ")

### LSD test to find the significant differences

In [None]:
# LSD_test is a project helper (definition not visible here). The result is
# iterated with .items() in the next cell, so it is expected to be a mapping
# from a genre to a sized, iterable collection of genres.
dic = LSD_test(
    movies_exploded,
    'genres',
    'rating',
    sse_rating,
    0.05,
)

### Results from F-test and LSD tests

In [None]:
# Print, for every key of `dic`, the genres stored under it, followed by a
# blank line; keys with an empty collection get a one-line message instead.
for genre, lower_rated in dic.items():
    if len(lower_rated) > 0:
        print(genre, "is significantly higher rated than the following genres:")
        for other in lower_rated:
            print(other)
    else:
        print(genre, "is not significantly higher rated than any genre")
    print()

## Popularity Analysis for each Genre

In [None]:
# Project helper (definition not visible here), called with the exploded
# movie table, the 'genres' and 'popularity' columns and a boolean flag —
# TODO confirm what the flag controls.
generate_boxplot(
    movies_exploded,
    'genres',
    'popularity',
    False,
)

### F-test to check the statistical significance of differences between mean values of popularity scores of genres

In [None]:
# Same project helper as for ratings, now on the 'popularity' column; the
# third return value is passed on to LSD_test in the next cell.
f_stat, f_crit, sse_pop = Ftest(movies_exploded, 'genres', 'popularity', 0.05)
print(f_stat, f_crit, sep="   ")

### LSD test to find the significant differences

In [None]:
# Project helper (definition not visible here); overwrites `dic` with the
# result for the 'popularity' column, which the next cell prints.
dic = LSD_test(
    movies_exploded,
    'genres',
    'popularity',
    sse_pop,
    0.05,
)

### Results from F-test and LSD tests

In [None]:
# Print, for every key of `dic`, the genres stored under it, followed by a
# blank line; keys with an empty collection get a one-line message instead.
for genre, less_popular in dic.items():
    if len(less_popular) > 0:
        print(genre, "is significantly higher popularity rated than the following genres:")
        for other in less_popular:
            print(other)
    else:
        print(genre, "is not significantly higher popularity rated than any genre")
    print()

## Revenue vs Genre Analysis

In [None]:
# Project helper (definition not visible here), called with the exploded
# movie table, the 'genres' and 'revenue' columns and a boolean flag —
# TODO confirm what the flag controls.
generate_boxplot(
    movies_exploded,
    'genres',
    'revenue',
    False,
)

### F-test to check the statistical significance of differences between mean values of revenues of genres

In [None]:
# Same project helper as above, now on the 'revenue' column; the third
# return value is passed on to LSD_test in the next cell.
f_stat, f_crit, sse_rev = Ftest(movies_exploded, 'genres', 'revenue', 0.05)
print(f_stat, f_crit, sep="   ")

### LSD test to find the significant differences

In [None]:
# Project helper (definition not visible here); overwrites `dic` with the
# result for the 'revenue' column, which the next cell prints.
dic = LSD_test(
    movies_exploded,
    'genres',
    'revenue',
    sse_rev,
    0.05,
)

### Results from F-test and LSD tests

In [None]:
# Print, for every key of `dic`, the genres stored under it, followed by a
# blank line; keys with an empty collection get a one-line message instead.
for genre, lower_revenue in dic.items():
    if len(lower_revenue) > 0:
        print(genre, "generates more revenue than the following genres:")
        for other in lower_revenue:
            print(other)
    else:
        print(genre, "does not generate higher revenue than any genre")
    print()

## Budget vs Revenue Analysis

In [None]:
# Project helper (definition not visible here), called with the 'budget' and
# 'revenue' columns and 'genres' as the category column.
plot_categorywise_corr(
    movies_exploded,
    'budget',
    'revenue',
    'genres',
)

## Budget vs Popularity Analysis

In [None]:
# Project helper (definition not visible here), called with the 'budget' and
# 'popularity' columns and 'genres' as the category column.
plot_categorywise_corr(
    movies_exploded,
    'budget',
    'popularity',
    'genres',
)

## Popularity of every genre with time

In [None]:
# Project helper (definition not visible here), called with the
# 'popularity', 'release_year' and 'genres' columns of the movie table.
plot_change_over_time(
    movies_exploded,
    'popularity',
    'release_year',
    'genres',
)

## Change in number of movies of a particular genre produced per year across time

In [None]:
# Two project helpers (definitions not visible here) run back to back on the
# movie table. Note the column arguments are given in opposite order.
plot_number_across_time(movies_exploded, 'genres', 'release_year')

cumulative_number_plot(movies_exploded, 'release_year', 'genres')

In [None]:
# Project helper (definition not visible here), called with the 'genres' and
# 'language' columns of the movie table.
sunburst_plot(
    movies_exploded,
    'genres',
    'language',
)

In [None]:
# Hard-coded mapping from a genre-pair label ('GenreA ,GenreB') to a float,
# passed to plot_top20(dic, 'genres', 'rating') in the next cell.
# NOTE(review): the values look like precomputed per-pair mean ratings for
# movies, but the code that produced them is not in this notebook — record the
# provenance or recompute from movies_exp. This also overwrites the `dic`
# returned by the last LSD_test call.
dic={'Animation ,Family': 6.703772167487692,
 'Animation ,Romance': 7.010959183673473,
 'Animation ,Thriller': 6.537142857142857,
 'Animation ,Science Fiction': 6.796076666666667,
 'Animation ,Western': 7.013400000000002,
 'Animation ,Horror': 6.51493220338983,
 'Animation ,History': 6.69362962962963,
 'Animation ,Comedy': 6.728316568047331,
 'Animation ,Fantasy': 6.944435272045022,
 'Animation ,Documentary': 7.3961250000000005,
 'Animation ,Action': 6.802515418502201,
 'Animation ,Drama': 7.033176923076923,
 'Animation ,TV Movie': 7.055278688524591,
 'Animation ,War': 6.774380952380951,
 'Animation ,Music': 6.862806451612902,
 'Animation ,Adventure': 6.779389999999994,
 'Animation ,Mystery': 7.067451923076923,
 'Animation ,Crime': 6.874444444444445,
 'Family ,Romance': 6.423662921348316,
 'Family ,Thriller': 6.3635,
 'Family ,Science Fiction': 6.551341269841269,
 'Family ,Western': 7.222333333333334,
 'Family ,Horror': 6.787285714285714,
 'Family ,History': 6.9904375000000005,
 'Family ,Comedy': 6.472672172808132,
 'Family ,Fantasy': 6.564265350877192,
 'Family ,Documentary': 7.054866666666668,
 'Family ,Action': 6.547907407407409,
 'Family ,Drama': 6.6194028776978415,
 'Family ,TV Movie': 6.4601567164179095,
 'Family ,War': 6.3342857142857145,
 'Family ,Music': 6.652397058823531,
 'Family ,Adventure': 6.575573268921088,
 'Family ,Mystery': 7.033571428571434,
 'Family ,Crime': 6.281391304347826,
 'Romance ,Thriller': 5.95856,
 'Romance ,Science Fiction': 6.099848101265822,
 'Romance ,Western': 6.585714285714286,
 'Romance ,Horror': 5.279918032786884,
 'Romance ,History': 6.623303571428572,
 'Romance ,Comedy': 6.128391588785054,
 'Romance ,Fantasy': 6.580858108108108,
 'Romance ,Documentary': 5.012499999999999,
 'Romance ,Action': 5.927465517241378,
 'Romance ,Drama': 6.150581213307238,
 'Romance ,TV Movie': 6.174547872340423,
 'Romance ,War': 6.751310344827586,
 'Romance ,Music': 6.585409638554216,
 'Romance ,Adventure': 6.432603174603173,
 'Romance ,Mystery': 5.868558139534883,
 'Romance ,Crime': 6.127797101449276,
 'Thriller ,Science Fiction': 5.672908212560393,
 'Thriller ,Western': 6.17771875,
 'Thriller ,Horror': 5.578617058311571,
 'Thriller ,History': 6.590990099009901,
 'Thriller ,Comedy': 5.906476190476189,
 'Thriller ,Fantasy': 5.849629921259841,
 'Thriller ,Documentary': 6.626124999999999,
 'Thriller ,Action': 5.93834372367312,
 'Thriller ,Drama': 6.05841648898366,
 'Thriller ,TV Movie': 5.959585365853656,
 'Thriller ,War': 6.505560000000001,
 'Thriller ,Music': 5.9985333333333335,
 'Thriller ,Adventure': 5.8284851063829795,
 'Thriller ,Mystery': 5.883181451612907,
 'Thriller ,Crime': 6.1233979166666614,
 'Science Fiction ,Western': 5.8069999999999995,
 'Science Fiction ,Horror': 5.397810198300279,
 'Science Fiction ,History': 6.333333333333333,
 'Science Fiction ,Comedy': 6.042070312499996,
 'Science Fiction ,Fantasy': 6.142125714285715,
 'Science Fiction ,Documentary': 6.608,
 'Science Fiction ,Action': 6.0924635193132985,
 'Science Fiction ,Drama': 6.1495272727272745,
 'Science Fiction ,TV Movie': 5.535052631578949,
 'Science Fiction ,War': 6.09044,
 'Science Fiction ,Music': 5.185307692307693,
 'Science Fiction ,Adventure': 6.188800518134717,
 'Science Fiction ,Mystery': 5.9826402877697875,
 'Science Fiction ,Crime': 6.200464285714287,
 'Western ,Horror': 5.2896,
 'Western ,History': 6.900000000000001,
 'Western ,Comedy': 6.244733333333333,
 'Western ,Fantasy': 5.448142857142857,
 'Western ,Action': 5.87448888888889,
 'Western ,Drama': 6.3417246376811605,
 'Western ,Adventure': 6.194533333333333,
 'Western ,Mystery': 6.609624999999999,
 'Western ,Crime': 6.479499999999999,
 'Horror ,History': 6.016388888888888,
 'Horror ,Comedy': 5.472899713467047,
 'Horror ,Fantasy': 5.621473451327433,
 'Horror ,Documentary': 5.7299,
 'Horror ,Action': 5.471048192771086,
 'Horror ,Drama': 5.755779097387168,
 'Horror ,TV Movie': 5.123553571428571,
 'Horror ,War': 5.841105263157894,
 'Horror ,Music': 5.820230769230769,
 'Horror ,Adventure': 5.355139784946236,
 'Horror ,Mystery': 5.67314902807776,
 'Horror ,Crime': 5.835108108108109,
 'History ,Comedy': 6.802937499999999,
 'History ,Fantasy': 5.580157894736842,
 'History ,Documentary': 7.077058823529412,
 'History ,Action': 6.512993243243243,
 'History ,Drama': 6.6900779220779265,
 'History ,TV Movie': 6.512466666666667,
 'History ,War': 6.769941558441556,
 'History ,Music': 6.843285714285716,
 'History ,Adventure': 6.230181818181821,
 'History ,Mystery': 6.404937499999998,
 'History ,Crime': 6.8552173913043495,
 'Comedy ,Fantasy': 6.474484913793107,
 'Comedy ,Documentary': 6.24227027027027,
 'Comedy ,Action': 6.083804999999999,
 'Comedy ,Drama': 6.255237693389604,
 'Comedy ,TV Movie': 6.324078947368418,
 'Comedy ,War': 6.335099999999999,
 'Comedy ,Music': 6.717794520547943,
 'Comedy ,Adventure': 6.426360507246375,
 'Comedy ,Mystery': 6.45362658227848,
 'Comedy ,Crime': 6.1060393442622924,
 'Fantasy ,Documentary': 6.2,
 'Fantasy ,Action': 6.305766375545847,
 'Fantasy ,Drama': 6.334671270718234,
 'Fantasy ,TV Movie': 6.3172435897435895,
 'Fantasy ,War': 6.397692307692308,
 'Fantasy ,Music': 6.522274999999999,
 'Fantasy ,Adventure': 6.410884745762711,
 'Fantasy ,Mystery': 6.32623711340206,
 'Fantasy ,Crime': 6.042708333333334,
 'Documentary ,Action': 5.941416666666668,
 'Documentary ,Drama': 6.488163934426229,
 'Documentary ,TV Movie': 7.162520000000001,
 'Documentary ,War': 6.773999999999999,
 'Documentary ,Music': 7.054520325203252,
 'Documentary ,Adventure': 7.3554,
 'Documentary ,Mystery': 6.725,
 'Documentary ,Crime': 6.724918367346939,
 'Action ,Drama': 6.245582758620694,
 'Action ,TV Movie': 5.876624999999999,
 'Action ,War': 6.466456647398843,
 'Action ,Music': 5.23942857142857,
 'Action ,Adventure': 6.224772941176472,
 'Action ,Mystery': 6.085233830845772,
 'Action ,Crime': 6.135787318361951,
 'Drama ,TV Movie': 6.355903669724774,
 'Drama ,War': 6.683484745762705,
 'Drama ,Music': 6.495752380952378,
 'Drama ,Adventure': 6.485839650145774,
 'Drama ,Mystery': 6.044304545454548,
 'Drama ,Crime': 6.276533898305085,
 'TV Movie ,War': 6.16575,
 'TV Movie ,Music': 6.9369375,
 'TV Movie ,Adventure': 6.261852459016395,
 'TV Movie ,Mystery': 6.486189655172413,
 'TV Movie ,Crime': 6.5253265306122445,
 'War ,Music': 6.517333333333333,
 'War ,Adventure': 6.486,
 'War ,Mystery': 7.26,
 'War ,Crime': 6.930999999999999,
 'Music ,Adventure': 6.355846153846152,
 'Music ,Mystery': 6.7,
 'Music ,Crime': 6.6754999999999995,
 'Adventure ,Mystery': 6.3748068181818205,
 'Adventure ,Crime': 6.317634408602151,
 'Mystery ,Crime': 6.267904436860068}

In [None]:
# Project helper (definition not visible here), fed the hard-coded pair
# dictionary from the previous cell together with the labels 'genres' and
# 'rating'.
plot_top20(
    dic,
    'genres',
    'rating',
)

In [None]:
# Hard-coded mapping from a genre-pair label ('GenreA ,GenreB') to a float,
# passed to plot_top20(dic, 'genres', 'popularity') in the next cell.
# NOTE(review): this cell sits in the movies part of the notebook, but its
# keys are the TV-show genre names (News, Kids, Talk, Soap, War & Politics,
# ...) and its values match the TV-show popularity dictionary defined further
# down (e.g. the Kids/News and Crime/Soap pairs). It looks like the TV-show
# data was pasted here instead of the movie data — confirm and replace.
dic={'News ,Kids': 255.00650000000002,
 'News ,Family': 60.726,
 'News ,Crime': 82.81925,
 'News ,Reality': 51.18425,
 'News ,Documentary': 70.12715384615385,
 'News ,Talk': 139.2387659574468,
 'News ,Drama': 131.427,
 'News ,Comedy': 160.4151111111111,
 'News ,War & Politics': 43.9018,
 'Kids ,Action & Adventure': 69.10409859154929,
 'Kids ,Family': 77.06653924914673,
 'Kids ,Crime': 54.2415,
 'Kids ,Animation': 68.61630477759475,
 'Kids ,Reality': 121.2045882352941,
 'Kids ,Mystery': 79.03999999999999,
 'Kids ,Documentary': 44.0856875,
 'Kids ,Talk': 50.943,
 'Kids ,Drama': 60.28620512820517,
 'Kids ,Comedy': 78.40964984227128,
 'Kids ,Soap': 141.90928571428572,
 'Kids ,Sci-Fi & Fantasy': 69.05667222222226,
 'Kids ,War & Politics': 13.99,
 'Action & Adventure ,Family': 90.08746710526317,
 'Action & Adventure ,Crime': 76.32718534482756,
 'Action & Adventure ,Animation': 63.167946280991735,
 'Action & Adventure ,Reality': 138.097375,
 'Action & Adventure ,Mystery': 61.925904040403985,
 'Action & Adventure ,Documentary': 62.9835,
 'Action & Adventure ,Western': 76.20757142857143,
 'Action & Adventure ,Talk': 65.13799999999999,
 'Action & Adventure ,Drama': 75.36334132420096,
 'Action & Adventure ,Comedy': 68.96865485074628,
 'Action & Adventure ,Soap': 102.32745454545456,
 'Action & Adventure ,Sci-Fi & Fantasy': 67.55941985522236,
 'Action & Adventure ,War & Politics': 69.16906666666667,
 'Family ,Crime': 73.09110416666665,
 'Family ,Animation': 81.1914224924012,
 'Family ,Reality': 89.36863333333334,
 'Family ,Mystery': 66.5634406779661,
 'Family ,Documentary': 39.04836111111112,
 'Family ,Western': 31.264,
 'Family ,Talk': 107.65431578947367,
 'Family ,Drama': 89.01904968152873,
 'Family ,Comedy': 87.33403776435046,
 'Family ,Soap': 152.98734285714283,
 'Family ,Unknown': 78.63992,
 'Family ,Sci-Fi & Fantasy': 76.11289932885909,
 'Family ,War & Politics': 53.763333333333335,
 'Crime ,Animation': 49.64264444444445,
 'Crime ,Reality': 71.17202702702703,
 'Crime ,Mystery': 55.69120091324194,
 'Crime ,Documentary': 46.81597297297296,
 'Crime ,Western': 54.3452,
 'Crime ,Talk': 86.180625,
 'Crime ,Drama': 70.99882043935052,
 'Crime ,Comedy': 66.76142613636362,
 'Crime ,Soap': 175.18486206896554,
 'Crime ,Unknown': 19.65,
 'Crime ,Sci-Fi & Fantasy': 60.83068518518518,
 'Crime ,War & Politics': 41.23266666666666,
 'Animation ,Mystery': 49.22004666666664,
 'Animation ,Documentary': 36.943875000000006,
 'Animation ,Western': 52.125,
 'Animation ,Talk': 146.09825,
 'Animation ,Drama': 49.20669078947365,
 'Animation ,Comedy': 60.039748333333286,
 'Animation ,Soap': 63.83075,
 'Animation ,Unknown': 31.3784,
 'Animation ,Sci-Fi & Fantasy': 61.44317040816322,
 'Animation ,War & Politics': 37.096199999999996,
 'Reality ,Mystery': 82.12795238095237,
 'Reality ,Documentary': 53.17171910112361,
 'Reality ,Talk': 103.99740566037742,
 'Reality ,Drama': 74.99035064935062,
 'Reality ,Comedy': 106.73122580645162,
 'Reality ,Soap': 84.83200000000001,
 'Reality ,Sci-Fi & Fantasy': 120.3536666666667,
 'Reality ,War & Politics': 38.474,
 'Mystery ,Documentary': 65.8735172413793,
 'Mystery ,Western': 51.177,
 'Mystery ,Talk': 117.35419999999999,
 'Mystery ,Drama': 57.00943277310927,
 'Mystery ,Comedy': 56.59489189189191,
 'Mystery ,Soap': 204.67560975609757,
 'Mystery ,Unknown': 27.910000000000004,
 'Mystery ,Sci-Fi & Fantasy': 55.556555555555526,
 'Mystery ,War & Politics': 43.4020588235294,
 'Documentary ,Talk': 84.16424999999998,
 'Documentary ,Drama': 58.577218750000014,
 'Documentary ,Comedy': 39.60485416666667,
 'Documentary ,Soap': 90.63966666666666,
 'Documentary ,Unknown': 34.27533333333333,
 'Documentary ,Sci-Fi & Fantasy': 55.508428571428574,
 'Documentary ,War & Politics': 53.83479166666666,
 'Western ,Drama': 111.2166,
 'Western ,Comedy': 46.903499999999994,
 'Western ,Sci-Fi & Fantasy': 115.809,
 'Talk ,Drama': 78.89722222222223,
 'Talk ,Comedy': 214.17582716049375,
 'Talk ,Soap': 101.745,
 'Talk ,Sci-Fi & Fantasy': 42.4425,
 'Talk ,War & Politics': 113.37133333333333,
 'Drama ,Comedy': 62.84385267588694,
 'Drama ,Soap': 148.26034541577818,
 'Drama ,Unknown': 53.67816438356165,
 'Drama ,Sci-Fi & Fantasy': 61.8958846584546,
 'Drama ,War & Politics': 52.87439112903227,
 'Comedy ,Soap': 140.85902400000003,
 'Comedy ,Unknown': 36.967404761904774,
 'Comedy ,Sci-Fi & Fantasy': 62.52213867488432,
 'Comedy ,War & Politics': 32.98109090909091,
 'Soap ,Unknown': 168.46800000000002,
 'Soap ,Sci-Fi & Fantasy': 77.37752380952381,
 'Soap ,War & Politics': 45.051,
 'Unknown ,Sci-Fi & Fantasy': 51.455,
 'Sci-Fi & Fantasy ,War & Politics': 48.8685}

In [None]:
# Project helper (definition not visible here), fed the hard-coded pair
# dictionary from the previous cell together with the labels 'genres' and
# 'popularity'.
plot_top20(
    dic,
    'genres',
    'popularity',
)

In [None]:
# Project helper (definition not visible here), called with the 'revenue',
# 'budget' and 'genres' columns of the movie table.
plot_lossmakers(
    movies_exploded,
    'revenue',
    'budget',
    'genres',
)

In [None]:
# Project helper (definition not visible here), called with the 'genres',
# 'revenue' and 'budget' columns of the movie table.
fraction_lossmaking(
    movies_exploded,
    'genres',
    'revenue',
    'budget',
)

In [None]:
# Project helper (definition not visible here), called on the 'genres'
# column of the movie table; presumably a chi-square contingency test —
# TODO confirm what it is tested against.
chi2_cont_test(
    movies_exploded,
    'genres',
)

In [None]:
# Project helper (definition not visible here), called with the 'director'
# and 'popularity' columns of the movie table.
gap_analysis(
    movies_exploded,
    'director',
    'popularity',
)

In [None]:
# Project helper (definition not visible here), called with the 'director'
# and 'rating' columns of the movie table.
gap_analysis(
    movies_exploded,
    'director',
    'rating',
)

# TV shows only

In [None]:
# Pairwise correlation of the numeric TV-show columns, excluding 'show_id'
# and 'vote_average', drawn as an annotated heatmap.
# Note: this rebinds the notebook-wide name `df`.
df = (
    shows
    .select_dtypes(include=['number'])
    .drop(columns=['show_id', 'vote_average'])
)
corr = df.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(
    corr,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Share of missing values per column of `shows`, in percent.
missing_percentage = 100 * shows.isnull().mean()
print(missing_percentage)

In [None]:
# Inspect the distinct values stored in the 'duration' column of `shows`.
shows['duration'].unique()

In [None]:
# Replace every NaN in `shows` with the sentinel string 'missing'; the
# sentinel is filtered out of the genre column two cells further down.
shows = shows.fillna('missing')

In [None]:
# Copy of `shows` whose 'genres' cell is a list of genre names, obtained by
# splitting the comma-separated string and trimming each piece.
shows_exp = shows.copy()
shows_exp['genres'] = shows_exp['genres'].apply(
    lambda raw: list(map(str.strip, raw.split(',')))
)

In [None]:
# Build `shows_exploded`: one row per (show, genre).
# NOTE(review): this rebinds the notebook-wide name `df` (the main dataset
# loaded near the top); the binding is kept because later cells may rely on it.
df = shows.copy()
# Split on commas with any surrounding whitespace and trim the ends, so a
# value such as 'Drama,Crime' or 'Drama , Crime' is split the same way as
# 'Drama, Crime' (a literal ', ' split would leave those unsplit/untrimmed).
df['genres'] = df['genres'].str.strip().str.split(r'\s*,\s*')
shows_exploded = df.explode('genres', ignore_index=True)
# Drop the fillna sentinel and the dataset's own 'Unknown' genre label.
shows_exploded = shows_exploded[~shows_exploded['genres'].isin(['missing', 'Unknown'])]

## Rating vs Genre Analysis

In [None]:
# Project helper (definition not visible here), called with the exploded
# TV-show table, the 'genres' and 'rating' columns and a boolean flag —
# TODO confirm what the flag controls.
generate_boxplot_interactive(
    shows_exploded,
    'genres',
    'rating',
    True,
)

In [None]:
# Project helper (definition not visible here) on the TV-show 'rating'
# column; the third return value is passed on to LSD_test in the next cell.
f_stat, f_crit, sse_rating = Ftest(shows_exploded, 'genres', 'rating', 0.05)
print(f_stat, f_crit, sep="   ")

In [None]:
# Project helper (definition not visible here); overwrites `dic` with the
# result for the TV-show 'rating' column, which the next cell prints.
dic = LSD_test(
    shows_exploded,
    'genres',
    'rating',
    sse_rating,
    0.05,
)

In [None]:
# Print, for every key of `dic`, the genres stored under it, followed by a
# blank line; keys with an empty collection get a one-line message instead.
for genre, lower_rated in dic.items():
    if len(lower_rated) > 0:
        print(genre, "is significantly higher rated than the following genres:")
        for other in lower_rated:
            print(other)
    else:
        print(genre, "is not significantly higher rated than any genre")
    print()

## Popularity vs genre sorted by median popularity

In [None]:
# Project helper (definition not visible here), called with the exploded
# TV-show table, the 'genres' and 'popularity' columns and a boolean flag —
# TODO confirm what the flag controls.
generate_boxplot(
    shows_exploded,
    'genres',
    'popularity',
    False,
)

In [None]:
# Project helper (definition not visible here) on the TV-show 'popularity'
# column; the third return value is passed on to LSD_test in the next cell.
f_stat, f_crit, sse_pop = Ftest(shows_exploded, 'genres', 'popularity', 0.05)
print(f_stat, f_crit, sep="   ")

In [None]:
# Project helper (definition not visible here); overwrites `dic` with the
# result for the TV-show 'popularity' column, which the next cell prints.
dic = LSD_test(
    shows_exploded,
    'genres',
    'popularity',
    sse_pop,
    0.05,
)

In [None]:
# Print, for every key of `dic`, the genres stored under it, followed by a
# blank line; keys with an empty collection get a one-line message instead.
for genre, less_popular in dic.items():
    if len(less_popular) > 0:
        print(genre, "is significantly higher popularity rated than the following genres:")
        for other in less_popular:
            print(other)
    else:
        print(genre, "is not significantly higher popularity rated than any genre")
    print()

In [None]:
# Project helper (definition not visible here), called with the
# 'popularity', 'release_year' and 'genres' columns of the TV-show table.
plot_change_over_time(
    shows_exploded,
    'popularity',
    'release_year',
    'genres',
)

In [None]:
# Project helper (definition not visible here), called with the 'genres' and
# 'release_year' columns of the TV-show table.
plot_number_across_time(
    shows_exploded,
    'genres',
    'release_year',
)

In [None]:
# Project helper (definition not visible here), called with the
# 'release_year' and 'genres' columns of the TV-show table.
cumulative_number_plot(
    shows_exploded,
    'release_year',
    'genres',
)

In [None]:
# Project helper (definition not visible here), called with the 'genres' and
# 'language' columns of the TV-show table.
sunburst_plot(
    shows_exploded,
    'genres',
    'language',
)

In [None]:
# Hard-coded mapping from a genre-pair label ('GenreA ,GenreB') to a float,
# passed to plot_top20(dic, 'genres', 'rating') in the next cell.
# NOTE(review): the values look like precomputed per-pair mean ratings for TV
# shows, but the code that produced them is not in this notebook — record the
# provenance or recompute from shows_exp. Pairs involving 'Unknown' are
# present here although 'Unknown' is filtered out of shows_exploded.
dic={'Drama ,Action & Adventure': 6.662406392694064,
 'Drama ,Mystery': 6.521516806722694,
 'Drama ,Kids': 5.647910256410256,
 'Drama ,Comedy': 6.176691521346972,
 'Drama ,Western': 7.098649999999999,
 'Drama ,Documentary': 6.813968749999999,
 'Drama ,Unknown': 5.619287671232877,
 'Drama ,Talk': 5.111111111111111,
 'Drama ,Animation': 7.24181359649123,
 'Drama ,Reality': 6.125584415584414,
 'Drama ,War & Politics': 5.521370967741933,
 'Drama ,Sci-Fi & Fantasy': 6.571832026875698,
 'Drama ,Crime': 6.6063973256924555,
 'Drama ,Soap': 5.355972281449893,
 'Drama ,Family': 5.325178343949043,
 'Drama ,News': 7.3,
 'Action & Adventure ,Mystery': 6.884989898989897,
 'Action & Adventure ,Kids': 6.709835680751175,
 'Action & Adventure ,Comedy': 6.96639365671641,
 'Action & Adventure ,Western': 6.296,
 'Action & Adventure ,Documentary': 6.488187500000001,
 'Action & Adventure ,Talk': 5.133333333333334,
 'Action & Adventure ,Animation': 7.153025826446273,
 'Action & Adventure ,Reality': 6.609541666666668,
 'Action & Adventure ,War & Politics': 6.704106666666668,
 'Action & Adventure ,Sci-Fi & Fantasy': 7.056924508790062,
 'Action & Adventure ,Crime': 6.70922844827586,
 'Action & Adventure ,Soap': 7.0955,
 'Action & Adventure ,Family': 6.936427631578952,
 'Mystery ,Kids': 5.748696969696969,
 'Mystery ,Comedy': 6.978216216216218,
 'Mystery ,Western': 6.9,
 'Mystery ,Documentary': 7.243448275862069,
 'Mystery ,Unknown': 6.340000000000001,
 'Mystery ,Talk': 4.68,
 'Mystery ,Animation': 7.584060000000003,
 'Mystery ,Reality': 6.2714285714285705,
 'Mystery ,War & Politics': 6.779529411764706,
 'Mystery ,Sci-Fi & Fantasy': 6.961425925925927,
 'Mystery ,Crime': 6.619093607305938,
 'Mystery ,Soap': 5.393780487804878,
 'Mystery ,Family': 6.201932203389831,
 'Kids ,Comedy': 6.493463722397476,
 'Kids ,Documentary': 3.83125,
 'Kids ,Talk': 5.225,
 'Kids ,Animation': 6.084003294892912,
 'Kids ,Reality': 5.347058823529412,
 'Kids ,War & Politics': 7.0,
 'Kids ,Sci-Fi & Fantasy': 6.662466666666671,
 'Kids ,Crime': 6.969,
 'Kids ,Soap': 6.471428571428571,
 'Kids ,Family': 6.44149146757679,
 'Kids ,News': 5.0,
 'Comedy ,Western': 3.7035,
 'Comedy ,Documentary': 6.070270833333333,
 'Comedy ,Unknown': 6.047619047619048,
 'Comedy ,Talk': 5.082993827160493,
 'Comedy ,Animation': 6.9019775,
 'Comedy ,Reality': 5.789664516129032,
 'Comedy ,War & Politics': 6.510363636363636,
 'Comedy ,Sci-Fi & Fantasy': 6.942268104776573,
 'Comedy ,Crime': 6.868409090909091,
 'Comedy ,Soap': 5.357032000000001,
 'Comedy ,Family': 6.09250151057402,
 'Comedy ,News': 5.222481481481482,
 'Western ,Animation': 4.533333333333334,
 'Western ,Sci-Fi & Fantasy': 7.9705,
 'Western ,Crime': 7.161199999999999,
 'Western ,Family': 3.95,
 'Documentary ,Unknown': 7.666666666666667,
 'Documentary ,Talk': 5.022916666666666,
 'Documentary ,Animation': 4.875,
 'Documentary ,Reality': 5.740117977528091,
 'Documentary ,War & Politics': 6.741041666666667,
 'Documentary ,Sci-Fi & Fantasy': 7.528571428571428,
 'Documentary ,Crime': 6.602801801801804,
 'Documentary ,Soap': 6.433333333333333,
 'Documentary ,Family': 4.8805555555555555,
 'Documentary ,News': 4.2153846153846155,
 'Unknown ,Animation': 6.5,
 'Unknown ,Sci-Fi & Fantasy': 6.05,
 'Unknown ,Crime': 6.1,
 'Unknown ,Soap': 6.275,
 'Unknown ,Family': 5.048,
 'Talk ,Animation': 5.441750000000001,
 'Talk ,Reality': 4.625141509433962,
 'Talk ,War & Politics': 5.666666666666667,
 'Talk ,Sci-Fi & Fantasy': 5.5,
 'Talk ,Crime': 5.4875,
 'Talk ,Soap': 2.5,
 'Talk ,Family': 3.7719473684210527,
 'Talk ,News': 4.205872340425532,
 'Animation ,War & Politics': 7.342,
 'Animation ,Sci-Fi & Fantasy': 7.151852040816318,
 'Animation ,Crime': 7.594133333333332,
 'Animation ,Soap': 1.625,
 'Animation ,Family': 6.587009118541033,
 'Reality ,War & Politics': 9.2,
 'Reality ,Sci-Fi & Fantasy': 7.82,
 'Reality ,Crime': 5.705405405405406,
 'Reality ,Soap': 2.675,
 'Reality ,Family': 5.668799999999999,
 'Reality ,News': 2.0,
 'War & Politics ,Sci-Fi & Fantasy': 7.486807692307693,
 'War & Politics ,Crime': 6.980666666666666,
 'War & Politics ,Soap': 4.075,
 'War & Politics ,Family': 5.840111111111112,
 'War & Politics ,News': 6.8,
 'Sci-Fi & Fantasy ,Crime': 7.15325925925926,
 'Sci-Fi & Fantasy ,Soap': 6.201333333333333,
 'Sci-Fi & Fantasy ,Family': 6.913335570469802,
 'Crime ,Soap': 5.454413793103448,
 'Crime ,Family': 4.854291666666666,
 'Crime ,News': 3.95,
 'Soap ,Family': 5.467476190476191,
 'Family ,News': 5.5}

In [None]:
# Project helper (definition not visible here), fed the hard-coded pair
# dictionary from the previous cell together with the labels 'genres' and
# 'rating'.
plot_top20(
    dic,
    'genres',
    'rating',
)

In [None]:
# Hard-coded mapping from a genre-pair label ('GenreA ,GenreB') to a float,
# passed to plot_top20(dic, 'genres', 'popularity') in the next cell.
# NOTE(review): the values look like precomputed per-pair mean popularity for
# TV shows, but the code that produced them is not in this notebook — record
# the provenance or recompute from shows_exp. The same pairs and values also
# appear (in a different key order) in the dictionary used for the movie
# popularity plot earlier in the notebook.
dic={'Crime ,Drama': 70.99882043935055,
 'Crime ,Action & Adventure': 76.32718534482757,
 'Crime ,Reality': 71.17202702702703,
 'Crime ,Documentary': 46.815972972972986,
 'Crime ,Western': 54.3452,
 'Crime ,Family': 73.09110416666668,
 'Crime ,Kids': 54.2415,
 'Crime ,Sci-Fi & Fantasy': 60.83068518518517,
 'Crime ,War & Politics': 41.23266666666667,
 'Crime ,News': 82.81924999999998,
 'Crime ,Comedy': 66.76142613636362,
 'Crime ,Talk': 86.18062499999999,
 'Crime ,Soap': 175.18486206896551,
 'Crime ,Animation': 49.64264444444444,
 'Crime ,Unknown': 19.65,
 'Crime ,Mystery': 55.691200913242064,
 'Drama ,Action & Adventure': 75.36334132420089,
 'Drama ,Reality': 74.99035064935066,
 'Drama ,Documentary': 58.577218750000014,
 'Drama ,Western': 111.21660000000004,
 'Drama ,Family': 89.01904968152864,
 'Drama ,Kids': 60.28620512820515,
 'Drama ,Sci-Fi & Fantasy': 61.89588465845468,
 'Drama ,War & Politics': 52.874391129032254,
 'Drama ,News': 131.427,
 'Drama ,Comedy': 62.84385267588699,
 'Drama ,Talk': 78.89722222222223,
 'Drama ,Soap': 148.26034541577832,
 'Drama ,Animation': 49.206690789473676,
 'Drama ,Unknown': 53.67816438356166,
 'Drama ,Mystery': 57.009432773109175,
 'Action & Adventure ,Reality': 138.097375,
 'Action & Adventure ,Documentary': 62.98349999999999,
 'Action & Adventure ,Western': 76.20757142857143,
 'Action & Adventure ,Family': 90.08746710526314,
 'Action & Adventure ,Kids': 69.10409859154929,
 'Action & Adventure ,Sci-Fi & Fantasy': 67.55941985522239,
 'Action & Adventure ,War & Politics': 69.16906666666667,
 'Action & Adventure ,Comedy': 68.96865485074628,
 'Action & Adventure ,Talk': 65.13799999999999,
 'Action & Adventure ,Soap': 102.32745454545456,
 'Action & Adventure ,Animation': 63.16794628099174,
 'Action & Adventure ,Mystery': 61.925904040404056,
 'Reality ,Documentary': 53.17171910112363,
 'Reality ,Family': 89.36863333333332,
 'Reality ,Kids': 121.2045882352941,
 'Reality ,Sci-Fi & Fantasy': 120.35366666666668,
 'Reality ,War & Politics': 38.474,
 'Reality ,News': 51.184250000000006,
 'Reality ,Comedy': 106.73122580645159,
 'Reality ,Talk': 103.99740566037737,
 'Reality ,Soap': 84.83200000000001,
 'Reality ,Mystery': 82.12795238095235,
 'Documentary ,Family': 39.04836111111111,
 'Documentary ,Kids': 44.085687500000006,
 'Documentary ,Sci-Fi & Fantasy': 55.50842857142858,
 'Documentary ,War & Politics': 53.83479166666667,
 'Documentary ,News': 70.12715384615385,
 'Documentary ,Comedy': 39.604854166666655,
 'Documentary ,Talk': 84.16425000000001,
 'Documentary ,Soap': 90.63966666666666,
 'Documentary ,Animation': 36.943875,
 'Documentary ,Unknown': 34.27533333333333,
 'Documentary ,Mystery': 65.8735172413793,
 'Western ,Family': 31.264,
 'Western ,Sci-Fi & Fantasy': 115.809,
 'Western ,Comedy': 46.903499999999994,
 'Western ,Animation': 52.125,
 'Western ,Mystery': 51.177,
 'Family ,Kids': 77.06653924914673,
 'Family ,Sci-Fi & Fantasy': 76.1128993288591,
 'Family ,War & Politics': 53.763333333333335,
 'Family ,News': 60.726,
 'Family ,Comedy': 87.33403776435043,
 'Family ,Talk': 107.65431578947369,
 'Family ,Soap': 152.98734285714286,
 'Family ,Animation': 81.1914224924012,
 'Family ,Unknown': 78.63992000000002,
 'Family ,Mystery': 66.56344067796611,
 'Kids ,Sci-Fi & Fantasy': 69.05667222222223,
 'Kids ,War & Politics': 13.99,
 'Kids ,News': 255.00650000000002,
 'Kids ,Comedy': 78.40964984227134,
 'Kids ,Talk': 50.943,
 'Kids ,Soap': 141.90928571428574,
 'Kids ,Animation': 68.61630477759476,
 'Kids ,Mystery': 79.04,
 'Sci-Fi & Fantasy ,War & Politics': 48.86849999999999,
 'Sci-Fi & Fantasy ,Comedy': 62.522138674884395,
 'Sci-Fi & Fantasy ,Talk': 42.4425,
 'Sci-Fi & Fantasy ,Soap': 77.37752380952381,
 'Sci-Fi & Fantasy ,Animation': 61.4431704081633,
 'Sci-Fi & Fantasy ,Unknown': 51.455,
 'Sci-Fi & Fantasy ,Mystery': 55.55655555555553,
 'War & Politics ,News': 43.901799999999994,
 'War & Politics ,Comedy': 32.98109090909091,
 'War & Politics ,Talk': 113.37133333333334,
 'War & Politics ,Soap': 45.051,
 'War & Politics ,Animation': 37.096199999999996,
 'War & Politics ,Mystery': 43.40205882352941,
 'News ,Comedy': 160.41511111111114,
 'News ,Talk': 139.23876595744682,
 'Comedy ,Talk': 214.17582716049395,
 'Comedy ,Soap': 140.85902399999998,
 'Comedy ,Animation': 60.0397483333333,
 'Comedy ,Unknown': 36.96740476190477,
 'Comedy ,Mystery': 56.594891891891905,
 'Talk ,Soap': 101.745,
 'Talk ,Animation': 146.09825,
 'Talk ,Mystery': 117.35419999999999,
 'Soap ,Animation': 63.83075,
 'Soap ,Unknown': 168.468,
 'Soap ,Mystery': 204.67560975609757,
 'Animation ,Unknown': 31.3784,
 'Animation ,Mystery': 49.22004666666666,
 'Unknown ,Mystery': 27.910000000000004}

In [None]:
# Project helper (definition not visible here), fed the hard-coded pair
# dictionary from the previous cell together with the labels 'genres' and
# 'popularity'.
plot_top20(
    dic,
    'genres',
    'popularity',
)

In [None]:
# Project helper (definition not visible here), called with the 'director'
# and 'popularity' columns of the TV-show table. Unlike the movie calls, a
# fourth argument (False) is passed — TODO confirm its meaning.
gap_analysis(
    shows_exploded,
    'director',
    'popularity',
    False,
)

In [None]:
# Project helper (definition not visible here), called with the 'director'
# and 'rating' columns of the TV-show table. Unlike the movie calls, a
# fourth argument (False) is passed — TODO confirm its meaning.
gap_analysis(
    shows_exploded,
    'director',
    'rating',
    False,
)