# In [None]:
import pandas as pd

# Load the dataset
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
netflix_data = pd.read_csv('/Users/harshapatel/downloads/netflix_titles.csv')

# Display the first few rows to understand its structure and identify initial
# cleaning tasks. A bare `netflix_data.head()` expression shows nothing unless
# it is the last statement of a notebook cell, so print it explicitly.
print(netflix_data.head())

# Data Cleaning
# Handling missing values: replace missing director/cast/country with a
# placeholder. The fill is applied to the frame itself and reassigned, because
# `netflix_data[col].fillna(..., inplace=True)` operates on a temporary column
# object (deprecated chained assignment; a no-op under pandas Copy-on-Write).
netflix_data = netflix_data.fillna(
    {'director': 'Unknown', 'cast': 'Unknown', 'country': 'Unknown'}
)

# Count missing 'date_added' values before any conversion, as a baseline for
# judging how many rows the datetime parsing below fails on.
missing_date_added = netflix_data['date_added'].isnull().sum()

# Remove duplicates based on 'show_id' as it's a unique identifier
netflix_data = netflix_data.drop_duplicates(subset='show_id')

# Convert 'date_added' to datetime format. Surrounding whitespace is stripped
# first so that padded date strings are parsed instead of being coerced to NaT
# (presumably some rows carry a leading space -- verify against the CSV).
netflix_data['date_added'] = pd.to_datetime(
    netflix_data['date_added'].str.strip(), errors='coerce'
)

# Check for any remaining missing values in 'date_added' after conversion;
# a rise over the baseline means some non-empty strings failed to parse.
missing_date_added_after_conversion = netflix_data['date_added'].isnull().sum()

print(missing_date_added, missing_date_added_after_conversion)

#Exploratory Data Analysis

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme to the figures drawn from here on.
sns.set_style("whitegrid")

# Number of titles per value of the 'type' column.
content_type_counts = netflix_data['type'].value_counts()

# 'listed_in' holds a ', '-separated list of genres per title. Splitting into
# columns and stacking yields one entry per (title, genre) pair, so every
# genre of a title is counted once.
genres = netflix_data['listed_in'].str.split(', ', expand=True).stack()
genre_counts = genres.value_counts()

# Side-by-side overview: content types on the left, top genres on the right.
overview_fig, (type_ax, genre_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: content type distribution.
content_type_counts.plot.bar(ax=type_ax, color=['#FF9999', '#66B3FF'])
type_ax.set_title('Content Type Distribution')
type_ax.set_xlabel('Content Type')
type_ax.set_ylabel('Count')
type_ax.tick_params(axis='x', labelrotation=45)

# Right panel: the ten most frequent genres (value_counts sorts descending).
genre_counts.head(10).plot.bar(ax=genre_ax, color='#99FF99')
genre_ax.set_title('Top 10 Popular Genres')
genre_ax.set_xlabel('Genre')
genre_ax.set_ylabel('Count')
genre_ax.tick_params(axis='x', labelrotation=45)

overview_fig.tight_layout()
plt.show()

# Task 1: Yearly Content Addition and Genre Trends Over Time

# Extracting year from 'date_added' (stays missing where the date is missing)
netflix_data['year_added'] = netflix_data['date_added'].dt.year

# Yearly Content Addition: titles per (year, type), one column per type.
yearly_content_addition = netflix_data.groupby(['year_added', 'type']).size().unstack().fillna(0)
# groupby drops rows whose year is missing, so the index holds no NaN and can
# be cast to int; otherwise the bar chart labels read "2019.0" instead of "2019".
yearly_content_addition.index = yearly_content_addition.index.astype(int)

# Genre Trends Over Time: Counting genres per year
# For simplicity, we'll consider the first genre listed as the primary genre for each title.
# The vectorised .str accessor passes missing 'listed_in' values through as
# NaN, whereas calling x.split on them would raise AttributeError.
netflix_data['primary_genre'] = netflix_data['listed_in'].str.split(', ').str[0]
genre_trends = netflix_data.groupby(['year_added', 'primary_genre']).size().unstack().fillna(0)

# Plotting Yearly Content Addition
plt.figure(figsize=(14, 6))
yearly_content_addition.plot(kind='bar', stacked=True, color=['#FF9999', '#66B3FF'], ax=plt.subplot(1, 2, 1))
plt.title('Yearly Content Addition')
plt.xlabel('Year Added')
plt.ylabel('Number of Titles')
plt.xticks(rotation=45)

# Because of the large number of genres, plotting all might be cluttered. We'll focus on the top genres overall for clarity.
top_genres = genre_counts[:5].index.tolist()  # Selecting top 5 genres for clarity in visualization
# `top_genres` is ranked over every listed genre, while `genre_trends` only
# has columns for genres that appear first in a title's list. reindex keeps
# the selection from raising KeyError when a top genre is never a primary
# genre; such a genre is shown as a flat zero line.
genre_trends_filtered = genre_trends.reindex(columns=top_genres, fill_value=0.0)

# Plotting Genre Trends Over Time for Top Genres
genre_trends_filtered.plot(kind='line', marker='o', ax=plt.subplot(1, 2, 2))
plt.title('Genre Trends Over Time (Top Genres)')
plt.xlabel('Year Added')
plt.ylabel('Number of Titles')
plt.xticks(rotation=45)
plt.legend(title='Genres')

plt.tight_layout()
plt.show()

# Titles per (year added, rating), pivoted so each rating is a column and
# year/rating combinations with no titles are filled with 0.
rating_trends = netflix_data.groupby(['year_added', 'rating']).size().unstack().fillna(0)

# Stacked area chart: the height of each band is that rating's title count
# for the year, and the bands sum to the yearly total.
rating_fig, rating_ax = plt.subplots(figsize=(14, 6))
rating_trends.plot.area(stacked=True, ax=rating_ax)
rating_ax.set_title('Rating Distribution Over Time')
rating_ax.set_xlabel('Year Added')
rating_ax.set_ylabel('Number of Titles')
# Anchor the legend outside the axes so it does not cover the bands.
rating_ax.legend(title='Ratings', bbox_to_anchor=(1.05, 1), loc='upper left')
rating_ax.tick_params(axis='x', labelrotation=45)

rating_fig.tight_layout()
plt.show()


# Genre and Rating Correlation
# Cross-tabulation of title counts: one row per primary genre, one column per
# rating, 0 where a combination does not occur.
genre_rating_distribution = netflix_data.groupby(['primary_genre', 'rating']).size().unstack().fillna(0)

# Because the table might be large, we'll visualize the correlation for the top genres and ratings for clarity
top_genres_for_correlation = genre_counts[:10].index  # Top 10 genres
top_ratings = rating_trends.columns.tolist()  # Every rating column of rating_trends

# Filtering the distribution for top genres.
# The top-10 list is ranked over every listed genre, but the table rows are
# primary (first-listed) genres only, so a label may be absent. `.loc` raises
# KeyError on any missing label; reindex instead fills an absent genre or
# rating with zeros. Plain lists are passed so the existing axis names of the
# table are kept.
genre_rating_filtered = genre_rating_distribution.reindex(
    index=list(top_genres_for_correlation),
    columns=list(top_ratings),
    fill_value=0.0,
)

# Plotting Genre and Rating Correlation (annotated with the raw counts)
plt.figure(figsize=(12, 8))
sns.heatmap(genre_rating_filtered, annot=True, fmt=".0f", cmap="YlGnBu", linewidths=.5)
plt.title('Genre and Rating Correlation')
plt.xlabel('Ratings')
plt.ylabel('Genres')
plt.xticks(rotation=45)

plt.show()


# Years between a title's release and its addition to the catalogue.
# The value is negative when 'release_year' is later than 'year_added'.
netflix_data['acquisition_gap'] = netflix_data['year_added'].sub(netflix_data['release_year'])

# Per year added: mean, smallest and largest gap, flattened back into
# ordinary columns next to 'year_added'.
gap_by_year = netflix_data.groupby('year_added')['acquisition_gap']
acquisition_trends = (
    gap_by_year.agg(['mean', 'min', 'max'])
    .rename(columns={'mean': 'average_gap', 'min': 'min_gap', 'max': 'max_gap'})
    .reset_index()
)

# Line for the average gap, with a shaded band spanning min to max.
years_axis = acquisition_trends['year_added']
gap_fig, gap_ax = plt.subplots(figsize=(12, 6))
gap_ax.plot(years_axis, acquisition_trends['average_gap'], label='Average Gap', marker='o')
gap_ax.fill_between(
    years_axis,
    acquisition_trends['min_gap'],
    acquisition_trends['max_gap'],
    color='grey',
    alpha=0.2,
    label='Gap Range',
)

gap_ax.set_title('Content Acquisition Trends Over Time')
gap_ax.set_xlabel('Year Added')
gap_ax.set_ylabel('Gap Between Release Year and Year Added')
gap_ax.legend()
gap_ax.grid(True)
gap_ax.tick_params(axis='x', labelrotation=45)

plt.show()