In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [None]:
# Clone the dataset repository into the current working directory.
# NOTE(review): the CSV path used when loading the data assumes this runs
# from /content (the Colab default) - confirm before running elsewhere.
!git clone "https://github.com/GeeksforgeeksDS/21-Days-21-Projects-Dataset"

In [None]:
# Load the Netflix titles CSV from the cloned dataset repository.
data = pd.read_csv(
    filepath_or_buffer="/content/21-Days-21-Projects-Dataset/Datasets/netflix_titles.csv"
)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# Column dtypes and non-null counts - a first look at where values are missing.
data.info()

In [None]:
# Summary statistics for the frame's numeric columns.
data.describe()

In [None]:
# Number of missing values in each column before any cleaning.
data.isnull().sum()

In [None]:
# (rows, columns) of the raw frame, for comparison after rows are dropped.
data.shape

HANDLING MISSING DATA: STRING / OBJECT COLUMNS

In [None]:
# List the column names available for cleaning.
data.columns

In [None]:
# Director and cast are free-text fields: label missing entries with a
# placeholder instead of discarding the rows.
data["director"] = data["director"].fillna("unknown")
data["cast"] = data["cast"].fillna("unknown")
# No blanket dropna() here: DataFrame.dropna() returns a new frame, so calling
# it without assigning the result removes nothing from `data` and only dumps a
# whole DataFrame as cell output. The remaining nulls are handled column by
# column in the following cells.

In [None]:
# Preview the frame after filling director / cast.
data.head()

In [None]:
# Re-check missing counts: director and cast are now filled, so any nulls
# still reported belong to other columns.
data.isnull().sum()

In [None]:
# Impute missing countries with the most frequent country value, printing that
# value first so the reader can see what is being filled in.
# NOTE(review): this adds every previously-missing row to that value's count
# in any later per-country tally - confirm that is intended.
most_common_country = data["country"].mode()[0]
print(most_common_country)
data["country"] = data["country"].fillna(most_common_country)

In [None]:
# Preview the frame after filling country.
data.head()

In [None]:
# Remove, in place, every row that lacks an added-date or a rating; the
# trailing expression shows the (rows, columns) that remain.
data.dropna(axis=0, how="any", subset=["date_added", "rating"], inplace=True)
data.shape

In [None]:
# Convert date_added from text to datetime64. format="mixed" tells pandas to
# infer the format of each value individually rather than assume one layout.
data["date_added"]=pd.to_datetime(data["date_added"],format="mixed")

In [None]:

# Derive the calendar year and month in which each title was added.
added_dt = data['date_added'].dt
data['year_added'] = added_dt.year
data['month_added'] = added_dt.month

In [None]:
# Share of each content type, drawn as a pie chart with percentage labels.
type_counts = data["type"].value_counts()
pie_fig, pie_ax = plt.subplots(figsize=(8, 8))
pie_ax.pie(type_counts, labels=type_counts.index, autopct='%1.1f%%')
pie_ax.set_title('Distribution of Movies and TV Shows on Netflix')
plt.show()

In [None]:
# Count titles per (year added, type), pivot the types into columns, and use 0
# for year/type combinations that have no titles.
titles_per_year_type = data.groupby(["year_added", "type"]).size()
contentadded = titles_per_year_type.unstack().fillna(0)
print(contentadded)

In [None]:
# Line chart of titles added per year, one line per content type. Title and
# axis labels are set so the figure stands alone like the other charts, and
# plt.show() keeps the Axes repr out of the cell output.
trend_ax = contentadded.plot(kind="line")
trend_ax.set_title('Content Added to Netflix per Year by Type')
trend_ax.set_xlabel('Year Added')
trend_ax.set_ylabel('Number of Titles')
plt.show()

In [None]:
# Turn the comma-separated 'listed_in' text into one row per (title, genre):
# split each string into a list, then explode the list into separate rows.
genre_lists = data['listed_in'].str.split(', ')
genres = data.assign(genre=genre_lists).explode('genre')

In [None]:
# Rank every genre by the number of titles tagged with it.
top_genres_counts = genres['genre'].value_counts().reset_index()
top_genres_counts.columns = ['genre', 'count']  # explicit, version-independent names

# Only the 15 most frequent genres are charted.
top_genres_counts_plot = top_genres_counts.iloc[:15]

genre_fig, genre_ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=top_genres_counts_plot,
    x='count',
    y='genre',
    hue='genre',
    palette='mako',
    legend=False,
    ax=genre_ax,
)
genre_ax.set(title='Top 15 Genres on Netflix', xlabel='Count', ylabel='Genre')
plt.show()

In [None]:
# Split the catalogue by type; each copy gets its own numeric duration column.
movies_df = data[data['type'] == 'Movie'].copy()
tv_shows_df = data[data['type'] == 'TV Show'].copy()

# 'duration' is text such as "90 min" or "2 Seasons": pull out the leading
# number. Rows with a missing duration are not removed by the earlier cleaning
# (only date_added / rating are dropped), and a direct .astype(int) raises on
# NaN. Coerce unparseable values to NaN, drop those rows, then cast to int so
# the plotted columns stay integer-typed.
movies_df['duration_min'] = pd.to_numeric(
    movies_df['duration'].str.extract(r'(\d+)', expand=False), errors='coerce'
)
movies_df = movies_df.dropna(subset=['duration_min']).astype({'duration_min': int})

tv_shows_df['seasons'] = pd.to_numeric(
    tv_shows_df['duration'].str.extract(r'(\d+)', expand=False), errors='coerce'
)
tv_shows_df = tv_shows_df.dropna(subset=['seasons']).astype({'seasons': int})

# Left: histogram of movie runtimes. Right: number of shows per season count,
# bars ordered from most to least common.
fig, axes = plt.subplots(1, 2, figsize=(18, 7))
sns.histplot(ax=axes[0], data=movies_df, x='duration_min', bins=50, kde=True, color='skyblue')
axes[0].set_title('Movie Duration Distribution (minutes)')
sns.countplot(
    ax=axes[1],
    x='seasons',
    data=tv_shows_df,
    palette='rocket',
    order=tv_shows_df['seasons'].value_counts().index,
    hue='seasons',
    legend=False,
)
axes[1].set_title('TV Show Season Distribution')
plt.show()

In [None]:

# One row per (title, country). Split on the bare comma and strip whitespace
# afterwards: splitting on the literal ', ' leaves empty or comma-suffixed
# tokens when the raw string has a leading/trailing comma or irregular
# spacing, and those would show up as bogus "countries" in the counts.
countries = (
    data.assign(country=data['country'].str.split(','))
    .explode('country')
    .assign(country=lambda df: df['country'].str.strip())
    .loc[lambda df: df['country'] != '']  # drop empty tokens left by stray commas
)

# Rank countries by number of titles and chart the 15 largest.
top_countries_counts = countries['country'].value_counts().reset_index()
top_countries_counts.columns = ['country', 'count']
top_countries_counts_plot = top_countries_counts.head(15)

plt.figure(figsize=(12, 10))
sns.barplot(y='country', x='count', data=top_countries_counts_plot, palette='viridis', hue='country', legend=False)
plt.title('Top 15 Content Producing Countries on Netflix')
plt.xlabel('Number of Titles')
plt.ylabel('Country')
plt.show()

In [None]:
# Bar chart of how many titles carry each rating, most common rating first.
rating_order = data['rating'].value_counts().index
rating_fig, rating_ax = plt.subplots(figsize=(12, 8))
sns.countplot(
    data=data,
    x='rating',
    hue='rating',
    order=rating_order,
    palette='crest',
    legend=False,
    ax=rating_ax,
)
rating_ax.set(
    title='Distribution of Content Ratings on Netflix',
    xlabel='Rating',
    ylabel='Count',
)
plt.xticks(rotation=45)  # tilt the tick labels so they do not overlap
plt.show()

In [None]:
# Years between a title's release and its addition to the catalogue.
data['age_on_netflix'] = data['year_added'] - data['release_year']
# Keep only non-negative ages (rows where the added year precedes the release
# year are excluded from the chart).
content_age = data.loc[data['age_on_netflix'].ge(0)]

age_fig, age_ax = plt.subplots(figsize=(14, 7))
sns.histplot(data=content_age, x='age_on_netflix', bins=50, kde=True, ax=age_ax)
age_ax.set(
    title='Distribution of Content Age When Added to Netflix',
    xlabel='Content Age (Years)',
    ylabel='Number of Titles',
)
plt.show()

In [None]:
# Compare movie runtimes across the five most frequent genres.
top_genres = genres['genre'].value_counts().index[:5]
genres_movies = genres[(genres['type'] == 'Movie') & (genres['genre'].isin(top_genres))].copy()

# 'duration' is text such as "90 min": pull out the leading number. A direct
# .astype(int) raises if any duration is missing, so coerce unparseable values
# to NaN, drop those rows, then cast to int.
genres_movies['duration_min'] = pd.to_numeric(
    genres_movies['duration'].str.extract(r'(\d+)', expand=False), errors='coerce'
)
genres_movies = genres_movies.dropna(subset=['duration_min']).astype({'duration_min': int})

plt.figure(figsize=(15, 8))
sns.boxplot(data=genres_movies, x='genre', y='duration_min', palette='pastel', hue='genre', legend=False)
plt.title('Movie Duration by Top Genres')
plt.xlabel('Genre')
plt.ylabel('Duration (minutes)')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Concatenate every description into one space-separated string and render
# its most frequent words as a word cloud.
text = ' '.join(data['description'])
cloud_builder = WordCloud(width=800, height=400, background_color='black')
wordcloud = cloud_builder.generate(text)

cloud_fig, cloud_ax = plt.subplots(figsize=(15, 10))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')  # axes carry no meaning for an image
cloud_ax.set_title('Most Common Words in Netflix Content Descriptions', fontsize=20)
plt.show()