# Streaming Platforms EDA

Analyzing content libraries for Netflix, Hulu, Prime Video, and Disney+.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import warnings
warnings.filterwarnings('ignore')

# NumPy compatibility shim: make sure the plain-type aliases (numpy.bool,
# numpy.int, numpy.float, numpy.complex) exist, pointing them at the Python
# builtins whenever the installed NumPy does not already provide them.
import numpy
for _alias, _builtin in (('bool', bool), ('int', int),
                         ('float', float), ('complex', complex)):
    if not hasattr(numpy, _alias):
        setattr(numpy, _alias, _builtin)

# Plot styling shared by the rest of the notebook.
plt.style.use('default')
sns.set_palette("husl")

# Load the cleaned movie catalogue (path is relative to the notebook).
df = pd.read_csv('../datasets/cleaned/movies_cleaned.csv')

# Quick sanity check of what was loaded.
print("Dataset shape:", df.shape)
print("\nFirst few rows of production_countries:")
print(df['production_countries'].head())

Dataset shape: (985, 22)

Columns: ['Title', 'Year', 'Age', 'Rotten Tomatoes', 'Netflix', 'Hulu', 'Prime Video', 'Disney+', 'budget', 'genres', 'id', 'overview', 'popularity', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'title', 'vote_average', 'vote_count', 'budget_filled']

First few rows of production_countries:
0    [{"iso_3166_1": "HK", "name": "Hong Kong"}, {"...
1    [{"iso_3166_1": "FR", "name": "France"}, {"iso...
2    [{"iso_3166_1": "US", "name": "United States o...
3    [{"iso_3166_1": "US", "name": "United States o...
4    [{"iso_3166_1": "GB", "name": "United Kingdom"...
Name: production_countries, dtype: object


## Netflix Growth Over Time

In [None]:
# Number of Netflix titles per value of the `Year` column.
on_netflix = df['Netflix'].eq(1)
titles_per_year = df.loc[on_netflix].groupby('Year').size()
netflix_growth = titles_per_year.reset_index(name='count')

axis_labels = {'count': 'Number of Titles', 'Year': 'Release Year'}
fig = px.line(
    netflix_growth,
    x='Year',
    y='count',
    title='Netflix Content Library Growth Over Years',
    labels=axis_labels,
)

# Single series, so a legend would add nothing.
fig.update_layout(showlegend=False)
fig.show()

## Content by Country

In [None]:
# parse JSON strings to extract country names
def _extract_names(raw_json):
    """Return the 'name' field of every object in a JSON-encoded list.

    Anything that is not a string (NaN, None, already-parsed values) and any
    string that is not a JSON list of objects each carrying a 'name' key
    yields an empty list, so one malformed cell never aborts a column-wide
    ``apply``.
    """
    if not isinstance(raw_json, str):
        # Missing values arrive as float NaN / None rather than text.
        return []
    try:
        return [entry['name'] for entry in json.loads(raw_json)]
    except (ValueError, TypeError, KeyError):
        # ValueError covers json.JSONDecodeError; TypeError/KeyError cover
        # payloads that parse but are not a list of {'name': ...} objects.
        # Deliberately NOT a bare except: KeyboardInterrupt and friends
        # must still propagate.
        return []


def extract_country_names(countries_json):
    """Parse a `production_countries` cell into a list of country names.

    Parameters
    ----------
    countries_json : str or missing value
        JSON text such as '[{"iso_3166_1": "US", "name": "United States of America"}]'.

    Returns
    -------
    list
        The country names in their original order; ``[]`` for missing,
        empty ('[]') or malformed input.
    """
    return _extract_names(countries_json)


def extract_language_names(languages_json):
    """Parse a `spoken_languages` cell into a list of language names.

    Parameters
    ----------
    languages_json : str or missing value
        JSON text holding a list of objects that each have a 'name' key.

    Returns
    -------
    list
        The language names in their original order; ``[]`` for missing,
        empty ('[]') or malformed input.
    """
    return _extract_names(languages_json)

# Parse the JSON-encoded columns into plain Python lists of names.
df['country_names'] = df['production_countries'].apply(extract_country_names)
df['language_names'] = df['spoken_languages'].apply(extract_language_names)

# expand data so each country gets its own row
# Titles with no parsed country are left out. Each remaining title is repeated
# once per country, keeping its original index label and all original columns,
# with the individual country in a new trailing `country` column.
# `explode` replaces a Python-level iterrows()/row.copy() loop: it keeps the
# column dtypes intact and still yields a frame with the full set of columns
# when no title has a country.
has_country = df['country_names'].map(bool)
countries_df = (
    df.loc[has_country]
    .assign(country=lambda frame: frame['country_names'])
    .explode('country')
)

def get_top_countries_by_platform(platform_column, top_n=10, data=None):
    """Count the most frequent production countries for one platform.

    Parameters
    ----------
    platform_column : str
        Name of a platform flag column (rows with value 1 are on the platform).
    top_n : int, default 10
        Maximum number of countries to return.
    data : pandas.DataFrame, optional
        Frame with one row per (title, country) pair, holding the platform
        flag column and a `country` column. Defaults to the notebook-level
        `countries_df`, which keeps existing calls working unchanged.

    Returns
    -------
    pandas.Series
        Row counts indexed by country, largest first, at most `top_n` long.
        An empty int64 Series when the platform has no rows.
    """
    source = countries_df if data is None else data
    # Filter once and reuse the result for both the emptiness check and the count.
    platform_countries = source.loc[source[platform_column] == 1, 'country']
    if platform_countries.empty:
        return pd.Series(dtype='int64')
    return platform_countries.value_counts().head(top_n)

# One horizontal bar chart of top countries per platform, laid out 2 x 2.
platforms = ['Netflix', 'Hulu', 'Prime Video', 'Disney+']
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=platforms,
    vertical_spacing=0.15,
)

for position, platform in enumerate(platforms):
    # Fill the grid row by row; plotly subplot coordinates are 1-based.
    grid_row, grid_col = divmod(position, 2)
    top_countries = get_top_countries_by_platform(platform)

    # Leave the panel empty when the platform has no rows.
    if len(top_countries) == 0:
        continue

    bars = go.Bar(
        x=top_countries.values,
        y=top_countries.index,
        orientation='h',
        showlegend=False,
    )
    fig.add_trace(bars, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(height=800, title_text="Top Countries by Platform")
fig.update_xaxes(title_text="Number of Titles")
fig.show()

# quick stats
print("\nCountry stats:")
for platform in platforms:
    platform_rows = countries_df[countries_df[platform] == 1]
    platform_countries = platform_rows['country'].nunique()
    # countries_df holds one row per (title, country) pair and every pair
    # keeps the index label of the title it came from, so a co-production
    # appears several times. Count distinct index labels to report titles
    # rather than pairs (len() here would overstate the library size).
    total_titles = platform_rows.index.nunique()
    print(f"{platform}: {platform_countries} countries, {total_titles} titles")


Country Distribution Summary:
Netflix: 29 unique countries, 431 total titles
Hulu: 28 unique countries, 324 total titles
Prime Video: 35 unique countries, 508 total titles
Disney+: 21 unique countries, 228 total titles


## Genre Analysis

In [None]:
# Mean rating and total votes for each distinct value of the `genres` column.
# NOTE(review): the grouping key is the raw column value. If `genres` stores a
# JSON list like `production_countries` does, every genre *combination* forms
# its own group rather than every single genre -- confirm against the data.
genre_stats = df.groupby('genres', as_index=False).agg(
    {'vote_average': 'mean', 'vote_count': 'sum'}
)

fig = make_subplots(rows=1, cols=2, subplot_titles=('Average Rating', 'Total Votes'))

# Left panel: ten best-rated groups; right panel: ten most-voted groups.
for panel_col, metric in enumerate(('vote_average', 'vote_count'), start=1):
    leaders = genre_stats.nlargest(10, metric)
    fig.add_trace(
        go.Bar(x=leaders['genres'], y=leaders[metric]),
        row=1, col=panel_col,
    )

fig.update_layout(height=500, showlegend=False)
fig.show()

# Title-level relationship between a film's rating and how many votes it got.
correlation = df['vote_average'].corr(df['vote_count'])
print(f"Rating vs votes correlation: {correlation:.2f}")


Correlation between ratings and vote count: 0.38


## Platform Comparison

In [None]:
platforms = ['Netflix', 'Hulu', 'Prime Video', 'Disney+']

# One summary record per platform, computed over the titles flagged with 1.
platform_stats = []
for platform in platforms:
    catalogue = df.loc[df[platform] == 1]
    ratings = catalogue['vote_average']
    platform_stats.append({
        'Platform': platform,
        'Total_Titles': catalogue.shape[0],
        'Avg_Rating': ratings.mean(),
        'Avg_Votes': catalogue['vote_count'].mean(),
        'Max_Rating': ratings.max(),
        'Min_Rating': ratings.min(),
    })

platform_comparison = pd.DataFrame(platform_stats)
print("Platform stats:")
print(platform_comparison.round(2))

# rating distribution by platform
# One box per platform, built from its non-missing ratings; platforms without
# any rating are skipped.
fig = go.Figure()
for platform in platforms:
    ratings = df.loc[df[platform] == 1, 'vote_average'].dropna()
    if ratings.empty:
        continue
    fig.add_trace(go.Box(y=ratings, name=platform, boxpoints='outliers'))

fig.update_layout(
    title="Rating Distribution by Platform",
    yaxis_title="Rating",
    height=500,
)
fig.show()

# genre distribution matrix
# Rows are `genres` values, columns are platforms, cells are title counts.
# Building the frame from a dict of Series takes the UNION of all genre
# labels. Assigning the Series one column at a time into an empty DataFrame
# would instead fix the index to the first platform's genres and silently
# drop every genre that platform does not carry.
platform_genre_dist = pd.DataFrame({
    platform: df.loc[df[platform] == 1, 'genres'].value_counts()
    for platform in platforms
})

# A genre missing from a platform means zero titles there.
platform_genre_dist = platform_genre_dist.fillna(0)
top_genres = platform_genre_dist.sum(axis=1).nlargest(10).index
platform_genre_dist_top = platform_genre_dist.loc[top_genres]

fig = px.imshow(platform_genre_dist_top,
                title="Genre Distribution by Platform",
                labels=dict(x="Platform", y="Genre", color="Titles"),
                aspect="auto")
fig.update_layout(height=600)
fig.show()

# Row of the comparison table with the highest mean rating.
top_rated_row = platform_comparison.loc[platform_comparison['Avg_Rating'].idxmax()]
best_platform = top_rated_row['Platform']
best_rating = top_rated_row['Avg_Rating']
print(f"\nHighest rated platform: {best_platform} ({best_rating:.2f})")

# genre bias (what % of each platform's content is each genre)
# Collect one percentage Series per platform, then build the frame in a single
# step so the row index is the UNION of all genres. Column-by-column
# assignment into an empty DataFrame would keep only the first platform's
# genres.
genre_shares = {}
for platform in platforms:
    platform_content = df[df[platform] == 1]
    genre_counts = platform_content['genres'].value_counts()
    # Denominator is every title on the platform (including titles without a
    # genre). max(..., 1) only matters for an empty platform, which then gets
    # an all-zero column instead of being left out and breaking the plot loop.
    genre_shares[platform] = genre_counts / max(len(platform_content), 1) * 100

genre_bias = pd.DataFrame(genre_shares).fillna(0)
top_bias_genres = genre_bias.sum(axis=1).nlargest(8).index
genre_bias_top = genre_bias.loc[top_bias_genres]

# Grouped bars: one bar per platform within each of the top genres.
fig = go.Figure()
for platform in platforms:
    fig.add_trace(go.Bar(
        name=platform,
        x=genre_bias_top.index,
        y=genre_bias_top[platform],
        text=[f'{val:.1f}%' for val in genre_bias_top[platform]],
        textposition='auto'
    ))

fig.update_layout(
    title="Genre Focus by Platform (%)",
    xaxis_title="Genre",
    yaxis_title="% of Platform Content",
    barmode='group',
    height=600,
    xaxis_tickangle=-45
)
fig.show()

Platform Comparison Summary:
      Platform  Total_Titles  Avg_Rating  Avg_Votes  Max_Rating  Min_Rating
0      Netflix           299        6.29     977.84         8.2         0.0
1         Hulu           225        6.31    1029.02         8.2         3.5
2  Prime Video           371        6.10     484.41        10.0         0.0
3      Disney+           181        6.46    1969.35         8.0         4.1



Platform with highest average rating: Disney+
Average rating: 6.46


## Seasonal and Regional Trends

Let's check whether content releases follow temporal patterns — by decade, by year, and by calendar month — and how average ratings vary across the year, alongside a regional breakdown by producing country.

In [None]:
# The loaded CSV has a `release_date` column but no `release_year`, so derive
# the year here when it is not already present. Unparseable or missing dates
# become NaN and are ignored by the groupbys below.
if 'release_year' not in df.columns:
    df['release_year'] = pd.to_datetime(df['release_date'], errors='coerce').dt.year

# Floor each year to the start of its decade (e.g. 1997 -> 1990).
df['release_decade'] = (df['release_year'] // 10) * 10
decade_counts = df.groupby('release_decade').size()

fig = px.bar(x=decade_counts.index, y=decade_counts.values,
             title="Content by Release Decade",
             labels={'x': 'Decade', 'y': 'Number of Titles'})
fig.show()

# Titles per release year, restricted to 2010 onwards.
yearly_trends = df.groupby('release_year').size()
recent_years = yearly_trends[yearly_trends.index >= 2010]

fig = px.line(x=recent_years.index, y=recent_years.values,
              title="Content Released per Year (2010-present)",
              labels={'x': 'Year', 'y': 'Titles Released'})
fig.show()

# seasonal patterns
# The loaded CSV has no `release_month` column, so take the month from
# `release_date` unless an earlier step already added one.
if 'release_month' in df.columns:
    release_month = pd.to_numeric(df['release_month'], errors='coerce')
else:
    release_month = pd.to_datetime(df['release_date'], errors='coerce').dt.month

# Keep only rows with a usable month and use integer month numbers (1-12) as
# the group key, so the key can index into `months` below.
has_month = release_month.notna()
month_key = release_month[has_month].astype(int).rename('release_month')

seasonal_data = (
    df.loc[has_month]
    .groupby(month_key)
    .agg({'title': 'count', 'vote_average': 'mean'})
    # Force exactly one row per calendar month, in order, so the 12 labels in
    # `months` always line up with the plotted values even if a month has no
    # releases.
    .reindex(range(1, 13))
)
seasonal_data.columns = ['releases', 'avg_rating']
seasonal_data['releases'] = seasonal_data['releases'].fillna(0).astype(int)
seasonal_data['avg_rating'] = seasonal_data['avg_rating'].round(2)

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Two series on one figure: release counts on the left axis, mean rating on
# the right axis.
fig = go.Figure()
fig.add_trace(go.Scatter(x=months, y=seasonal_data['releases'],
                        mode='lines+markers', name='Releases',
                        yaxis='y1', line=dict(color='blue')))

fig.add_trace(go.Scatter(x=months, y=seasonal_data['avg_rating'],
                        mode='lines+markers', name='Avg Rating',
                        yaxis='y2', line=dict(color='red')))

fig.update_layout(
    title="Seasonal Release Patterns",
    xaxis_title="Month",
    yaxis=dict(title="Number of Releases", side="left"),
    yaxis2=dict(title="Average Rating", side="right", overlaying="y"),
    legend=dict(x=0.02, y=0.98)
)
fig.show()

# idxmax returns the month number (1-12); subtract 1 to index the label list.
peak_month = seasonal_data['releases'].idxmax()
best_month = seasonal_data['avg_rating'].idxmax()
print(f"Most releases: {months[peak_month-1]} ({seasonal_data.loc[peak_month, 'releases']} titles)")
print(f"Best ratings: {months[best_month-1]} ({seasonal_data.loc[best_month, 'avg_rating']:.2f} avg)")


Regional Content Analysis:

Top 10 Content-Producing Countries (All Platforms):
United States of America: 860 titles
United Kingdom: 139 titles
France: 60 titles
Germany: 58 titles
Canada: 52 titles
Australia: 26 titles
Spain: 17 titles
India: 16 titles
Italy: 12 titles
China: 10 titles

Spain: 17 titles
India: 16 titles
Italy: 12 titles
China: 10 titles



Key Findings:
- Correlation between ratings and viewership: 0.38
- Median rating: 6.30
- Median vote count: 439

SUMMARY OF FINDINGS

1. Netflix Content Growth:
   - Peak year: 2013 with 23 titles

2. Platform Ratings:
   - Netflix: 6.29 avg rating, 299 titles
   - Hulu: 6.31 avg rating, 225 titles
   - Prime Video: 6.10 avg rating, 371 titles
   - Disney+: 6.46 avg rating, 181 titles

3. Content Distribution:
   - Top content producer: United States of America (860 titles)
   - Most common language: English

4. Rating vs Viewership:
   - Moderate positive correlation (0.38) - higher rated content tends to receive more votes (vote count is used here as a proxy for viewership)
