<a href="https://colab.research.google.com/github/khushigoswami087/Data-Analysis-On-Netflix/blob/main/Netflix_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Importing all the necessary libraries**

In [None]:
from IPython.display import display, HTML, Image
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

**2. Design The Image**

In [None]:
# Title banner: three centred lines of text on a black canvas with the axes hidden.
banner_rows = [
    # (vertical position, text, per-line text properties)
    (0.75, 'NETFLIX', {'fontsize': 120, 'fontweight': 'bold', 'color': '#E50914'}),
    (0.45, 'DATA ANALYSIS', {'fontsize': 45, 'color': 'white'}),
    (0.25, 'By Khushi Goswami', {'fontsize': 28, 'color': '#F5F5F1'}),
]
# Properties shared by every line: centred anchor and the same typeface.
banner_common = {'ha': 'center', 'va': 'center', 'fontfamily': 'Impact'}

plt.figure(facecolor="black", figsize=(18, 10))
for banner_y, banner_text, banner_props in banner_rows:
    plt.text(0.5, banner_y, banner_text, **banner_props, **banner_common)
plt.axis('off')
plt.tight_layout()
plt.show()

In [None]:
# Load the Netflix titles CSV; the path points at the Colab session's /content directory.
netflix_csv_path = "/content/netflix_titles.csv"
data = pd.read_csv(netflix_csv_path)

In [None]:
# Preview the first five rows to check that the file loaded as expected.
data.head(n=5)

**3. Check for Duplicate records**

In [None]:
# Check for duplicate records.
# `duplicated()` flags every row that exactly repeats an earlier row, so the
# sum is the number of rows that will be removed below.
duplicates = data.duplicated().sum()
print(f"Number of Duplicate records found: {duplicates}")
# Rebind instead of using `inplace=True`: the result is the same frame minus
# the repeated rows, without mutating the object an earlier cell displayed.
data = data.drop_duplicates()
print(f"Shape after removing duplicates: {data.shape}")

**4. Data Cleaning**

In [None]:
# Data cleaning: report missing values, fill or drop them, then derive the
# `Date_N`, `Year` and `Month` columns that the later cells read.
missing_values = data.isnull().sum()
print(missing_values)

# Text columns: replace missing entries with explicit placeholder labels.
data['director'] = data['director'].fillna("Unknown")
data['cast'] = data['cast'].fillna("Not Available")

# Missing countries fall back to the most frequent value in the column.
country_mode = data['country'].mode()[0]
data['country'] = data['country'].fillna(country_mode)

# `.str` only works on string data, so strip whitespace only while the column
# is still raw text. Without this guard, re-running the cell fails because the
# column has already been converted to datetime.
if not pd.api.types.is_datetime64_any_dtype(data['date_added']):
    data['date_added'] = data['date_added'].str.strip()
data['date_added'] = pd.to_datetime(data['date_added'], errors='coerce')
# Missing or unparseable dates are NaT at this point; drop those rows.
# The explicit copy makes `data` an independent frame, so the column
# assignments below cannot raise SettingWithCopyWarning on pandas versions
# without copy-on-write.
data = data.dropna(subset=['date_added']).copy()

# Missing ratings fall back to the most frequent rating.
rating_mode = data['rating'].mode()[0]
data['rating'] = data['rating'].fillna(rating_mode)

# The newline keeps the first row of the Series off the label line.
print(f"Missing values after treatment:\n{data.isnull().sum()}")

print(data.dtypes)

# `date_added` is already datetime64 here, so it is reused directly instead of
# being parsed a second time.
data['Date_N'] = data['date_added']
data['Year'] = data['Date_N'].dt.year
data['Month'] = data['Date_N'].dt.month_name()

**5. Content type distribution**

In [None]:
# Content type distribution: how many titles fall under each `type` value.
print("\nContent Type Distribution:")
content_counts = data['type'].value_counts()
print(content_counts)

# pandas draws on the current axes, i.e. on the figure created here.
plt.figure(figsize=(8, 5))
type_axes = content_counts.plot.bar(color=['skyblue', 'salmon'], rot=0)
type_axes.set_title('Movies vs TV Shows on Netflix')
type_axes.set_xlabel('Category')
type_axes.set_ylabel('Count')
plt.show()

**6. Year with the Highest Number of Releases**

In [None]:
# Year with the highest number of releases.
# NOTE(review): `Year` is derived from `date_added` in the cleaning cell, so
# these are counts per year of `date_added` - confirm that this is the
# intended meaning of "release".
print("\nYear with highest number of releases:")
# value_counts() sorts by count (largest first); keep the ten busiest years.
yearly_counts = data['Year'].value_counts().head(10)
# A bare expression is rendered only when it is the last line of a cell, so
# the table has to be displayed explicitly.
display(yearly_counts)
plt.figure(figsize=(12, 6))
# Re-sort chronologically so the x axis reads left to right in time.
yearly_counts.sort_index().plot(kind='bar', color='teal')
plt.title('Content Releases by Year (Top 10)')
plt.xlabel('Year')
plt.ylabel('Number of Releases')
plt.show()

**7. TV shows released in India**

In [None]:
# TV shows released in India.
# NOTE(review): the equality test matches only rows whose `country` is exactly
# 'India'. If a cell can list several countries, those rows are excluded -
# confirm against the data.
print("\nTV Shows released in India:")
indian_tv = data[(data['country'] == 'India') & (data['type'] == 'TV Show')]
# A bare expression is rendered only when it is the last line of a cell, so
# the table has to be displayed explicitly.
display(indian_tv[['title', 'date_added', 'type']])
plt.figure(figsize=(10, 6))
# `Year` comes from `date_added`; sort_index() puts the bars in chronological order.
indian_tv['Year'].value_counts().sort_index().plot(kind='bar', color='green')
plt.title('Indian TV Shows by Release Year')
plt.xlabel('Year')
plt.ylabel('Number of Shows')
plt.show()

**8. Top 10 Directors**

In [None]:
# Top 10 directors by number of titles.
# Rows carrying the "Unknown" placeholder (filled in during cleaning) are
# excluded so the placeholder cannot rank as a director.
director_data = data[data['director'] != "Unknown"]
top_directors = director_data['director'].value_counts().head(10)
# A bare expression is rendered only when it is the last line of a cell, so
# the table has to be displayed explicitly.
display(top_directors)
plt.figure(figsize=(12, 6))
top_directors.plot(kind='barh', color='purple')
plt.title('Top 10 Directors on Netflix')
plt.xlabel('Number of Titles')
plt.ylabel('Director')
# barh draws the first row at the bottom; flip so the largest count is on top.
plt.gca().invert_yaxis()
plt.show()

**9. Content Ratings**

In [None]:
# Content ratings: number of titles per `rating` value, most common first.
print("\nContent Ratings:")
ratings = data['rating'].value_counts()
# A bare expression is rendered only when it is the last line of a cell, so
# the table has to be displayed explicitly.
display(ratings)
plt.figure(figsize=(12, 6))
ratings.plot(kind='bar', color='orange')
plt.title('Content Distribution by Rating')
plt.xlabel('Rating')
plt.ylabel('Count')
# Tilt the labels so neighbouring rating codes do not overlap.
plt.xticks(rotation=45)
plt.show()

In [None]:
print("\nDuration analysis:")

# Independent copies, so the numeric columns added below never touch `data`.
is_movie = data['type'] == 'Movie'
is_tv_show = data['type'] == 'TV Show'
movies = data[is_movie].copy()
tv_shows = data[is_tv_show].copy()

# `duration` is text; the first run of digits is taken as the number, and the
# float dtype lets rows without any digits stay NaN.
movie_minutes = movies['duration'].str.extract(r'(\d+)')
movies['duration_min'] = movie_minutes.astype(float)
season_counts = tv_shows['duration'].str.extract(r'(\d+)')
tv_shows['Duration_seasons'] = season_counts.astype(float)

longest_movie = movies['duration_min'].max()
most_seasons = tv_shows['Duration_seasons'].max()
print(f"Longest Movie: {longest_movie} minutes")
print(f"TV Show with most seasons: {most_seasons} seasons")

# Histogram of movie running times.
plt.figure(figsize=(12, 6))
movies['duration_min'].plot.hist(bins=30, color='salmon')
plt.title('Distribution of Movie Durations')
plt.xlabel('Duration (minutes)')
plt.ylabel('Count')
plt.show()

In [None]:
# Top 10 `country` values among TV shows, by number of titles.
country_tv = data[data['type'] == 'TV Show']['country'].value_counts().head(10)
# A bare expression is rendered only when it is the last line of a cell, so
# the table has to be displayed explicitly.
display(country_tv)
plt.figure(figsize=(12, 6))
country_tv.plot(kind='barh', color='darkblue')
plt.title('Top 10 Countries by TV Show Production')
plt.xlabel('Number of TV Shows')
plt.ylabel('Country')
# barh draws the first row at the bottom; flip so the largest count is on top,
# matching the orientation of the top-directors chart.
plt.gca().invert_yaxis()
plt.show()

**10. Monthly Release Trends**

In [None]:
# Monthly release trends: number of titles per calendar month of `date_added`
# (`Month` is derived from it in the cleaning cell).
monthly_releases = data['Month'].value_counts()
months_order = ['January', 'February', 'March', 'April', 'May', 'June',
                'July', 'August', 'September', 'October', 'November', 'December']
# value_counts() orders by frequency, so reindex into calendar order.
# fill_value=0 makes a month with no titles plot as zero instead of leaving a
# NaN gap in the line.
monthly_releases = monthly_releases.reindex(months_order, fill_value=0)
plt.figure(figsize=(12, 6))
monthly_releases.plot(kind='line', marker='o', color='darkgreen')
plt.title('Content Releases by Month')
plt.xlabel('Month')
plt.ylabel('Number of Releases')
plt.grid(True)
plt.show()

**11. Content By Country (Top 10)**

In [None]:
# Content by country (top 10): the most frequent `country` values across all titles.
top_countries = data['country'].value_counts().head(10)
# A bare expression is rendered only when it is the last line of a cell, so
# the table has to be displayed explicitly.
display(top_countries)
plt.figure(figsize=(12, 6))
top_countries.plot(kind='bar', color='darkred')
plt.title('Top 10 Countries by Content Production')
plt.xlabel('Country')
plt.ylabel('Number of Titles')
# Tilt the labels so long country names do not overlap.
plt.xticks(rotation=45)
plt.show()

**12. Content Type Distribution By Rating**

In [None]:
# Content type distribution by rating: rows are ratings, columns are content
# types, and each cell is the number of matching titles.
rating_by_type = pd.crosstab(data['rating'], data['type'])
# A bare expression is rendered only when it is the last line of a cell, so
# the table has to be displayed explicitly.
display(rating_by_type)
plt.figure(figsize=(12, 6))
# fmt='d' prints the annotations as integers, which suits crosstab counts.
sns.heatmap(rating_by_type, annot=True, fmt='d', cmap='YlOrRd')
plt.title('Content Type Distribution by Rating')
plt.xlabel('Category')
plt.ylabel('Rating')
plt.show()

**13. Duration Vs Rating Analysis**

In [None]:
# Duration vs rating analysis: mean movie running time for each rating.
# Relies on `movies` and its numeric `duration_min` column, both built in the
# duration-analysis cell, so that cell must have been run first.
print("\nAverage Duration by Rating:")
if not movies.empty:
    duration_by_rating = (
        movies.groupby('rating')['duration_min']
        .mean()
        .sort_values(ascending=False)
    )
    # A bare expression is rendered only when it is the last line of a cell,
    # so the table has to be displayed explicitly.
    display(duration_by_rating)
    plt.figure(figsize=(12, 6))
    duration_by_rating.plot(kind='bar', color='darkviolet')
    plt.title('Average Movie Duration by Rating')
    plt.xlabel('Rating')
    plt.ylabel('Average Duration (minutes)')
    plt.show()
else:
    print("No movie data available for duration analysis")