In [None]:
%pip install kagglehub pandas numpy matplotlib seaborn plotly networkx

In [None]:
import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
import plotly.figure_factory as ff
from datetime import datetime, timedelta
import warnings
import networkx as nx
from collections import Counter, defaultdict
import re

# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Plot defaults shared by the matplotlib/seaborn figures below.
plt.style.use('default')
sns.set_palette("husl")
plt.rc('figure', figsize=(12, 8))

In [None]:
# Fetch the dataset through kagglehub and read the titles CSV from the returned directory.
path = kagglehub.dataset_download("shivamb/amazon-prime-movies-and-tv-shows")
print("Path to dataset files:", path)

df = pd.read_csv(f"{path}/amazon_prime_titles.csv")

# Per-column count of missing values; also printed in the overview below.
missing_data = df.isnull().sum()

print("\n=== DATASET OVERVIEW ===")
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Each remaining section is a banner followed by one printed object.
for section_header, section_body in (
    ("\n=== FIRST FEW ROWS ===", df.head()),
    ("\n=== DATA TYPES ===", df.dtypes),
    ("\n=== MISSING VALUES ===", missing_data),
    ("\n=== BASIC STATISTICS ===", df.describe(include='all')),
):
    print(section_header)
    print(section_body)


In [None]:
# --- Derived calendar, duration and age columns ---

# Unparseable dates become NaT, so the calendar columns derived from them are NaN instead of raising.
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month
df['quarter_added'] = df['date_added'].dt.quarter

# Month number -> season label (Dec-Feb is Winter, and so on).
df['season_added'] = df['month_added'].map({12: 'Winter', 1: 'Winter', 2: 'Winter',
                                           3: 'Spring', 4: 'Spring', 5: 'Spring',
                                           6: 'Summer', 7: 'Summer', 8: 'Summer',
                                           9: 'Fall', 10: 'Fall', 11: 'Fall'})
df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')

# The pattern has two groups; column 1 of the extracted frame is the unit ("min" or "Season").
df['duration_type'] = df['duration'].str.extract(r'(\d+)\s*(min|Season)', expand=False)[1]
# First run of digits in the duration string, as a float so missing values stay NaN.
df['duration_value'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(float)

# Age is measured against the year in which the notebook is run.
current_year = datetime.now().year
df['content_age'] = current_year - df['release_year']

# include_lowest keeps titles released this year (age 0) in the first bucket, and the
# open-ended upper edge keeps titles older than 100 years in "Vintage (50y+)"; with the
# previous closed [0, 100] edges both groups fell outside every bin and became NaN.
df['age_category'] = pd.cut(df['content_age'],
                           bins=[0, 5, 10, 20, 50, np.inf],
                           labels=['Very Recent (0-5y)', 'Recent (5-10y)',
                                  'Older (10-20y)', 'Classic (20-50y)', 'Vintage (50y+)'],
                           include_lowest=True)
def clean_text(text):
    """Return `text` as a string with surrounding whitespace removed.

    Missing values (anything `pd.isna` reports as missing) become the empty string.
    """
    return '' if pd.isna(text) else str(text).strip()
# Normalise the free-text columns: NaN -> '' and strip surrounding whitespace.
for text_column in ('description', 'title'):
    df[text_column] = df[text_column].apply(clean_text)

# Quick sanity report on the derived columns.
added_years = df['year_added']
release_years = df['release_year']
age_bucket_counts = df['age_category'].value_counts()

print("\n=== ENHANCED PARSING RESULTS ===")
print(f"Date range added: {added_years.min()} - {added_years.max()}")
print(f"Release year range: {release_years.min()} - {release_years.max()}")
print(f"Content age categories:\n{age_bucket_counts}")


In [None]:
def parse_multi_label(text_string, delimiter=','):
    """Split a delimited string into a list of trimmed, non-empty labels.

    Args:
        text_string: Value to split; it is converted with `str()` first.
        delimiter: Separator between labels (default: comma).

    Returns:
        List of labels in their original order, or an empty list when
        `text_string` is missing according to `pd.isna`.
    """
    if pd.isna(text_string):
        return []
    trimmed = (piece.strip() for piece in str(text_string).split(delimiter))
    return [piece for piece in trimmed if piece]

# Turn the comma-separated text columns into list columns.
df['genre_list'] = df['listed_in'].apply(parse_multi_label)
df['country_list'] = df['country'].apply(parse_multi_label)
# cast/director are only expanded when the dataset actually provides them.
for source_column, list_column in (('cast', 'cast_list'), ('director', 'director_list')):
    if source_column in df.columns:
        df[list_column] = df[source_column].apply(parse_multi_label)

# Every distinct genre label seen across all titles.
all_genres = {label for labels in df['genre_list'] for label in labels}

print(f"\n=== MULTI-LABEL CLASSIFICATION RESULTS ===")
print(f"Total unique genres: {len(all_genres)}")

# Count how often each unordered pair of genres appears on the same title.
genre_combinations = defaultdict(int)
for labels in df['genre_list']:
    for position, first_label in enumerate(labels):
        # Pair each label only with the ones after it so a pair is counted once per title.
        for second_label in labels[position + 1:]:
            pair_key = tuple(sorted((first_label, second_label)))
            genre_combinations[pair_key] += 1

print("Top 10 genre combinations:")
# Stable sort on the negated count: highest first, ties stay in first-seen order.
top_combos = sorted(genre_combinations.items(), key=lambda entry: -entry[1])[:10]
for (first_label, second_label), pair_total in top_combos:
    print(f"  {first_label} + {second_label}: {pair_total}")


In [None]:
print("\n=== SEASONAL PATTERN ANALYSIS ===")


def _type_breakdown(time_column):
    """Count titles per (time_column value, content type), one column per type."""
    return df.groupby([time_column, 'type']).size().unstack(fill_value=0)


monthly_patterns = _type_breakdown('month_added')
print("Content additions by month:")
print(monthly_patterns)

seasonal_patterns = _type_breakdown('season_added')
print("\nSeasonal content addition patterns:")
print(seasonal_patterns)

quarterly_patterns = _type_breakdown('quarter_added')
print("\nQuarterly content addition patterns:")
print(quarterly_patterns)

# season -> {genre: number of titles added in that season}; genres keep first-seen order.
genre_seasonality = {
    season: dict(Counter(
        label
        for labels in df.loc[df['season_added'] == season, 'genre_list']
        for label in labels
    ))
    for season in df['season_added'].dropna().unique()
}

print("\nTop 3 genres by season:")
for season, genres in genre_seasonality.items():
    if not genres:
        continue
    top_3 = sorted(genres.items(), key=lambda entry: -entry[1])[:3]
    print(f"  {season}: {[f'{g}({c})' for g, c in top_3]}")


In [None]:
# 4x2 grid of static overview charts; `axes` is flattened so panels are addressed 0..7.
fig, axes = plt.subplots(4, 2, figsize=(18, 20))
axes = axes.flatten()

# Panel 0: share of each content type.
colors = ['#FF6B6B', '#4ECDC4']
type_counts = df['type'].value_counts()
axes[0].pie(type_counts.values,
            labels=type_counts.index,
            autopct='%1.1f%%', colors=colors,
            # One offset per wedge: a fixed 2-tuple raises ValueError whenever the
            # number of content types is not exactly two.
            explode=[0.05] * len(type_counts))
axes[0].set_title('Content Type Distribution', fontsize=14, fontweight='bold')

# Panel 1: the 15 most frequent genres (a title counts once for each genre it lists).
genre_counts = Counter([genre for genres in df['genre_list'] for genre in genres])
top_15_genres = dict(genre_counts.most_common(15))
y_pos = np.arange(len(top_15_genres))
axes[1].barh(y_pos, list(top_15_genres.values()), color='skyblue')
axes[1].set_yticks(y_pos)
axes[1].set_yticklabels(list(top_15_genres.keys()), fontsize=10)
axes[1].set_title('Top 15 Genres', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Number of Titles')

# Panel 2: the 12 most frequent countries (a title counts once for each listed country).
country_counts = Counter([country for countries in df['country_list'] for country in countries])
top_12_countries = dict(country_counts.most_common(12))
axes[2].bar(range(len(top_12_countries)), list(top_12_countries.values()),
            color='lightcoral', alpha=0.8)
axes[2].set_xticks(range(len(top_12_countries)))
axes[2].set_xticklabels(list(top_12_countries.keys()), rotation=45, ha='right')
axes[2].set_title('Top 12 Countries by Content Volume', fontsize=14, fontweight='bold')
axes[2].set_ylabel('Number of Titles')

# Panel 3: additions per season. value_counts() orders by frequency, so the bars are
# reindexed to a fixed season order; otherwise the colour a season gets depends on its rank.
season_order = ['Winter', 'Spring', 'Summer', 'Fall']
seasonal_data = df['season_added'].value_counts().reindex(season_order, fill_value=0)
axes[3].bar(seasonal_data.index, seasonal_data.values,
            color=['#87CEEB', '#98FB98', '#F0E68C', '#DDA0DD'])
axes[3].set_title('Content Additions by Season', fontsize=14, fontweight='bold')
axes[3].set_ylabel('Number of Titles')

# Panel 4: distribution of years since release.
df['content_age'].hist(bins=30, ax=axes[4], color='lightgreen', alpha=0.7, edgecolor='black')
axes[4].set_title('Distribution of Content Age', fontsize=14, fontweight='bold')
axes[4].set_xlabel('Years Since Release')
axes[4].set_ylabel('Frequency')

# Panel 5: the ten most common ratings.
rating_counts = df['rating'].value_counts().head(10)
axes[5].pie(rating_counts.values, labels=rating_counts.index, autopct='%1.1f%%')
axes[5].set_title('Content Rating Distribution', fontsize=14, fontweight='bold')

# Panel 6: durations for both content types.
# NOTE(review): movie values are minutes and TV values are season counts, yet they share
# one x-axis here; consider separate panels if the two need to be compared properly.
movie_durations = df[df['type'] == 'Movie']['duration_value'].dropna()
tv_seasons = df[df['type'] == 'TV Show']['duration_value'].dropna()

axes[6].hist([movie_durations, tv_seasons], bins=20, alpha=0.7,
             label=['Movies (minutes)', 'TV Shows (seasons)'], color=['red', 'blue'])
axes[6].set_title('Duration Distribution', fontsize=14, fontweight='bold')
axes[6].set_xlabel('Duration')
axes[6].set_ylabel('Frequency')
axes[6].legend()

# Panel 7: additions per calendar month; months with no additions are plotted as 0.
monthly_counts = df['month_added'].value_counts().sort_index()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
axes[7].plot(range(1, 13), [monthly_counts.get(i, 0) for i in range(1, 13)],
             marker='o', linewidth=2, markersize=8, color='purple')
axes[7].set_xticks(range(1, 13))
axes[7].set_xticklabels(month_names)
axes[7].set_title('Content Additions by Month', fontsize=14, fontweight='bold')
axes[7].set_ylabel('Number of Titles')
axes[7].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
print("\n=== CREATING INTERACTIVE VISUALIZATIONS ===")

# One record per (title, country) for titles that have both an added-year and a country.
country_year_data = [
    {"country": country, "year": int(year_added), "type": content_type}
    for year_added, countries, content_type in zip(df['year_added'], df['country_list'], df['type'])
    if pd.notna(year_added) and countries
    for country in countries
]

country_df = pd.DataFrame(country_year_data)
if not country_df.empty:
    # Titles per (country, year), restricted to the 15 countries with the most records.
    country_dot_data = (
        country_df
        .groupby(['country', 'year'])
        .size()
        .reset_index(name='count')
    )
    top_countries = country_df['country'].value_counts().head(15).index
    in_top_countries = country_dot_data['country'].isin(top_countries)
    dot_filtered = country_dot_data[in_top_countries]

    dot_axis_labels = {"year": "Year", "country": "Country", "count": "Number of Titles"}
    fig_dot = px.scatter(
        dot_filtered,
        x="year",
        y="country",
        size="count",
        color="country",
        title="Content Additions by Country and Year (Dot Chart)",
        labels=dot_axis_labels,
    )
    # One tick per year on the x-axis.
    fig_dot.update_xaxes(tickmode="linear", dtick=1)
    fig_dot.show()

# One record per title that has at least one country and one genre; only the first
# listed country and genre of each title are used.
sunburst_data = [
    {
        "country": countries[0],
        "genre": genres[0],
        "type": content_type,
        "title": title,
    }
    for countries, genres, content_type, title in zip(
        df["country_list"], df["genre_list"], df["type"], df["title"]
    )
    if countries and genres
]

sunburst_df = pd.DataFrame(sunburst_data)
if not sunburst_df.empty:
    # Keep the chart readable: 8 most common countries and 12 most common genres.
    top_countries_sun = sunburst_df["country"].value_counts().head(8).index
    top_genres_sun = sunburst_df["genre"].value_counts().head(12).index
    keep_country = sunburst_df["country"].isin(top_countries_sun)
    keep_genre = sunburst_df["genre"].isin(top_genres_sun)
    filtered_sunburst = sunburst_df[keep_country & keep_genre]

    hierarchy_levels = ["country", "genre", "type"]
    hierarchy_data = (
        filtered_sunburst
        .groupby(hierarchy_levels)
        .size()
        .reset_index(name="count")
    )
    fig_sunburst = px.sunburst(
        hierarchy_data,
        path=["country", "genre", "type"],
        values="count",
        title="Content Hierarchy: Country → Genre → Type"
    )
    fig_sunburst.show()


In [None]:
# Top 10 genres for every year in which titles were added, one animation frame per year.
animation_data = []
for year in sorted(df['year_added'].dropna().unique()):
    year_data = df[df['year_added'] == year]
    genre_counts_year = Counter(genre for genres in year_data['genre_list'] for genre in genres)
    for genre, count in genre_counts_year.most_common(10):
        animation_data.append({
            # year_added is a float column (it holds NaN); cast so frames read "2021", not "2021.0".
            'year': int(year),
            'genre': genre,
            'count': count
        })

if animation_data:
    anim_df = pd.DataFrame(animation_data)
    fig_animated = px.bar(anim_df, x='genre', y='count', color='genre',
                         animation_frame='year',
                         # Fix the y-axis across frames; otherwise the range is taken from the
                         # first frame and taller bars in later years are clipped.
                         range_y=[0, anim_df['count'].max() * 1.1],
                         title='Genre Popularity Evolution Over Time',
                         labels={'count': 'Number of Titles'})
    fig_animated.update_layout(xaxis_tickangle=-45)
    fig_animated.show()

# [first country, first genre, type] for every title that has both a country and a genre.
sankey_data = [
    [countries[0], genres[0], content_type]
    for countries, genres, content_type in zip(df['country_list'], df['genre_list'], df['type'])
    if countries and genres
]

sankey_df = pd.DataFrame(sankey_data, columns=['Country', 'Genre', 'Type'])
top_countries_sankey = sankey_df['Country'].value_counts().head(6).index
top_genres_sankey = sankey_df['Genre'].value_counts().head(8).index

country_is_top = sankey_df['Country'].isin(top_countries_sankey)
genre_is_top = sankey_df['Genre'].isin(top_genres_sankey)
filtered_sankey = sankey_df[country_is_top & genre_is_top]

# Node order: countries, then genres, then the two content types; position = node id.
all_nodes = [*top_countries_sankey, *top_genres_sankey, 'Movie', 'TV Show']
node_dict = dict(zip(all_nodes, range(len(all_nodes))))

flow_counts = filtered_sankey.groupby(['Country', 'Genre']).size().reset_index(name='count1')
flow_counts2 = filtered_sankey.groupby(['Genre', 'Type']).size().reset_index(name='count2')

# First layer of links: country -> genre.
links = [
    {'source': node_dict[country], 'target': node_dict[genre], 'value': total}
    for country, genre, total in zip(
        flow_counts['Country'], flow_counts['Genre'], flow_counts['count1']
    )
]
# Second layer of links: genre -> content type.
links.extend(
    {'source': node_dict[genre], 'target': node_dict[content_type], 'value': total}
    for genre, content_type, total in zip(
        flow_counts2['Genre'], flow_counts2['Type'], flow_counts2['count2']
    )
)

sankey_nodes = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=all_nodes,
    color="blue"
)
sankey_links = dict(
    source=[link['source'] for link in links],
    target=[link['target'] for link in links],
    value=[link['value'] for link in links]
)
fig_sankey = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])

fig_sankey.update_layout(title_text="Content Flow: Country → Genre → Type", font_size=10)
fig_sankey.show()


In [None]:
# The ten directors with the most titles; empty when the dataset has no director column.
prolific_directors = set()
if 'director_list' in df.columns:
    director_counts = Counter(
        director
        for directors in df['director_list'] if directors
        for director in directors
    )
    prolific_directors = {name for name, _ in director_counts.most_common(10)}

# (director, genre) pair for every genre of every title by one of those directors.
director_genre_connections = []
if 'director_list' in df.columns:
    for directors, genres in zip(df['director_list'], df['genre_list']):
        if not (directors and genres):
            continue
        director_genre_connections.extend(
            (director, genre)
            for director in directors
            if director in prolific_directors
            for genre in genres
        )

director_genre_df = pd.DataFrame(director_genre_connections, columns=["Director", "Genre"])
director_genre_df = director_genre_df[director_genre_df["Director"].isin(prolific_directors)]

top_genres = director_genre_df["Genre"].value_counts().head(5).index

# One panel for the overall ranking plus one per top genre, laid out two per row.
n_plots = len(top_genres) + 1
cols = 2
rows = (n_plots + cols - 1) // cols

panel_titles = ["Overall"] + [f"{g} Genre" for g in top_genres]
fig = make_subplots(rows=rows, cols=cols, subplot_titles=panel_titles)

# Overall panel: number of (director, genre) pairs per director.
overall_counts = director_genre_df["Director"].value_counts().head(10)
overall_bar = go.Bar(
    x=overall_counts.index,
    y=overall_counts.values,
    marker_color="skyblue",
    name="Overall"
)
fig.add_trace(overall_bar, row=1, col=1)

# Genre panels fill the grid left-to-right, top-to-bottom, starting at slot 2.
for plot_idx, genre in enumerate(top_genres, start=2):
    in_genre = director_genre_df["Genre"] == genre
    genre_counts = director_genre_df[in_genre]["Director"].value_counts().head(10)
    row, col = divmod(plot_idx - 1, cols)
    row += 1
    col += 1
    genre_bar = go.Bar(
        x=genre_counts.index,
        y=genre_counts.values,
        marker_color="coral",
        name=genre
    )
    fig.add_trace(genre_bar, row=row, col=col)

fig.update_layout(
    height=400 * rows, width=1200,
    title_text="Top 10 Directors by Genre (Interactive)",
    showlegend=False
)
fig.update_xaxes(tickangle=45)
fig.show()


In [None]:
# Section banner for the content-strategy output that follows.
print("\n=== CONTENT STRATEGY INSIGHTS ===")

def analyze_platform_evolution(data=None):
    """Print and return year-over-year catalogue trends.

    Reports three things: the number of distinct genres added per year (last five
    years printed), the mean content age per year added, and how many titles list
    'United States' among their countries versus only other countries.

    Args:
        data: DataFrame with `year_added`, `genre_list`, `content_age` and
            `country_list` columns. Defaults to the notebook-level `df`.

    Returns:
        Tuple `(genre_diversity_by_year, freshness_by_year)`: a dict mapping each
        year to its number of distinct genres, and a Series of mean `content_age`
        indexed by `year_added`.
    """
    if data is None:
        data = df  # fall back to the notebook's catalogue frame

    genre_diversity_by_year = {}
    for year in data['year_added'].dropna().unique():
        year_data = data[data['year_added'] == year]
        genre_diversity_by_year[year] = len(
            {genre for genres in year_data['genre_list'] for genre in genres}
        )

    print("Genre diversity evolution:")
    for year in sorted(genre_diversity_by_year)[-5:]:
        # year_added is stored as float; print it as a plain year.
        print(f"  {int(year)}: {genre_diversity_by_year[year]} unique genres")

    freshness_by_year = data.groupby('year_added')['content_age'].mean()
    print("\nContent freshness trends (average age in years):")
    print(freshness_by_year.tail())

    # Explicit boolean masks: titles with no country belong to neither group.
    lists_us = pd.Series(
        [bool(countries) and 'United States' in countries for countries in data['country_list']],
        index=data.index, dtype=bool,
    )
    lists_only_others = pd.Series(
        [bool(countries) and 'United States' not in countries for countries in data['country_list']],
        index=data.index, dtype=bool,
    )
    us_content = data[lists_us]
    international_content = data[lists_only_others]

    total_titles = len(data)

    def _share(count):
        # Percentage of all titles; 0.0 for an empty frame instead of dividing by zero.
        return count / total_titles * 100 if total_titles else 0.0

    print("\nContent origin analysis:")
    print(f"  US Content: {len(us_content)} ({_share(len(us_content)):.1f}%)")
    print(f"  International Content: {len(international_content)} ({_share(len(international_content)):.1f}%)")

    return genre_diversity_by_year, freshness_by_year

# Run the analysis on the notebook's `df` and keep the returned
# (genre diversity dict, freshness Series) tuple.
evolution_results = analyze_platform_evolution()


In [None]:
print("\n=== ADVANCED STATISTICAL ANALYSIS ===")

# Pairwise correlations between the numeric columns.
numeric_cols = ['release_year', 'year_added', 'content_age', 'duration_value']
correlation_data = df[numeric_cols].corr()

fig_corr, ax_corr = plt.subplots(figsize=(10, 8))
# Hide the diagonal and the upper triangle, which only mirror the lower triangle.
mask = np.triu(np.ones(correlation_data.shape, dtype=bool))
sns.heatmap(
    correlation_data,
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .5},
    ax=ax_corr,
)
ax_corr.set_title('Correlation Matrix of Numeric Variables', fontsize=16, fontweight='bold')
plt.show()

from scipy import stats

# Compare the age of movies and TV shows (years since release, missing values dropped).
movies_age = df[df['type'] == 'Movie']['content_age'].dropna()
tv_age = df[df['type'] == 'TV Show']['content_age'].dropna()

if len(movies_age) > 0 and len(tv_age) > 0:
    # Welch's t-test (equal_var=False): the two groups are not guaranteed to share a
    # variance or a sample size, which the default pooled-variance test assumes.
    t_stat, p_value = stats.ttest_ind(movies_age, tv_age, equal_var=False)
    print(f"\nContent Age Comparison (Movies vs TV Shows):")
    print(f"  Movies average age: {movies_age.mean():.1f} years")
    print(f"  TV Shows average age: {tv_age.mean():.1f} years")
    print(f"  T-test p-value: {p_value:.6f}")
    print(f"  Significant difference: {'Yes' if p_value < 0.05 else 'No'}")

def analyze_genre_trends(data=None, top_n=10, window=3):
    """Print the growth of the most common genres between two consecutive year windows.

    The years present in `year_added` are sorted; the last `window` of them form the
    "recent" period and the `window` before those the "earlier" period. For each of the
    `top_n` most frequent genres overall, the percentage change in title count from the
    earlier to the recent period is printed. Genres with no titles in the earlier
    period are skipped because a growth rate is undefined for them.

    Args:
        data: DataFrame with `year_added` and `genre_list` columns. Defaults to the
            notebook-level `df`.
        top_n: Number of genres to report (default 10).
        window: Number of years in each comparison period (default 3).

    Raises:
        ValueError: If `window` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")
    if data is None:
        data = df  # fall back to the notebook's catalogue frame

    # One genre Counter per year, in ascending year order.
    years = sorted(data['year_added'].dropna().unique())
    genre_year_matrix = [
        Counter(
            genre
            for genres in data.loc[data['year_added'] == year, 'genre_list']
            for genre in genres
        )
        for year in years
    ]

    overall_counts = Counter(genre for genres in data['genre_list'] for genre in genres)
    top_genres_list = [genre for genre, _ in overall_counts.most_common(top_n)]

    # The two periods are the same for every genre, so slice them once.
    recent_period = genre_year_matrix[-window:]
    earlier_period = genre_year_matrix[-2 * window:-window]

    print(f"\nGenre Momentum Analysis (Top {top_n} Genres):")
    for genre in top_genres_list:
        recent_count = sum(year_counts.get(genre, 0) for year_counts in recent_period)
        earlier_count = sum(year_counts.get(genre, 0) for year_counts in earlier_period)
        if earlier_count > 0:
            growth_rate = ((recent_count - earlier_count) / earlier_count) * 100
            print(f"  {genre}: {growth_rate:+.1f}% growth")

# Print the genre momentum report for the notebook's `df`; the function returns nothing.
analyze_genre_trends()


In [None]:
# 2x2 dashboard: genre bars, world map, seasonal radar; the fourth cell gets no trace.
panel_specs = [
    [{"type": "xy"}, {"type": "geo"}],
    [{"type": "polar"}, {"type": "xy"}],
]
fig_dashboard = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Genre Popularity', 'Geographic Distribution', 'Seasonal Patterns', ''),
    specs=panel_specs
)

# Top-left: the eight most frequent genres as horizontal bars.
top_genres_counts = Counter(
    genre for genres in df['genre_list'] for genre in genres
).most_common(8)
genre_bar = go.Bar(
    x=[total for _, total in top_genres_counts],
    y=[label for label, _ in top_genres_counts],
    orientation='h',
    name='Genres'
)
fig_dashboard.add_trace(genre_bar, row=1, col=1)

# Top-right: choropleth of the ten countries with the most titles.
country_counts_geo = Counter(
    country for countries in df['country_list'] for country in countries
)
top_countries_geo = country_counts_geo.most_common(10)

# Horizontal colour bar centred below the figure.
map_colorbar = dict(
    title="Titles",
    orientation="h",
    x=0.5,
    xanchor="center",
    y=-0.15
)
country_map = go.Choropleth(
    locations=[name for name, _ in top_countries_geo],
    locationmode='country names',
    z=[total for _, total in top_countries_geo],
    colorscale='Blues',
    marker_line_color='black',
    colorbar=map_colorbar
)
fig_dashboard.add_trace(country_map, row=1, col=2)

fig_dashboard.update_geos(
    projection_type="natural earth",
    showcountries=True, countrycolor="lightgray",
    showcoastlines=True, coastlinecolor="gray",
    row=1, col=2
)

# Bottom-left: radar of additions per season in a fixed order; missing seasons count as 0.
seasonal_counts = df['season_added'].value_counts()
seasons = ['Winter', 'Spring', 'Summer', 'Fall']
seasonal_values = [seasonal_counts.get(season, 0) for season in seasons]

season_radar = go.Scatterpolar(
    r=seasonal_values,
    theta=seasons,
    fill='toself',
    name='Seasonal Pattern'
)
fig_dashboard.add_trace(season_radar, row=2, col=1)

fig_dashboard.update_layout(
    height=800,
    title_text="Amazon Prime Content Analysis Dashboard"
)

fig_dashboard.show()
