# Netflix Data Analysis
This project analyzes a dataset of Netflix titles to uncover trends in content production, country contributions, content duration, and more.

**Goals:**
- Analyze when Netflix adds new content
- Discover top content-producing countries
- Explore movie duration trends
- Visualize popular genres and words in descriptions

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

%matplotlib inline

In [None]:
# Read the Netflix titles dataset from the working directory into a DataFrame,
# then preview the first five rows to sanity-check the columns.
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
df.head(n=5)

In [None]:
# Convert 'date_added' to datetime and extract the year each title was added.
#
# Raw text values may carry stray leading/trailing whitespace, which can make
# date parsing fail, so they are stripped first. The dtype guard keeps this
# cell safe to re-run after the column has already been converted (the .str
# accessor is only valid on text data).
added = df['date_added']
if not pd.api.types.is_datetime64_any_dtype(added):
    added = added.str.strip()

# errors='coerce' turns unparseable values into NaT instead of aborting the
# whole cell; those rows are removed together with missing dates just below.
df['date_added'] = pd.to_datetime(added, errors='coerce')

# .copy() yields an independent frame, so adding 'year_added' cannot trigger
# pandas' SettingWithCopyWarning.
df = df.dropna(subset=['date_added']).copy()
df['year_added'] = df['date_added'].dt.year

# Keep only titles added between 2008 and 2025 (both bounds inclusive).
df = df[df['year_added'].between(2008, 2025)].copy()

In [None]:
# Grouped bar chart: number of titles added per year, split by content type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='year_added', hue='type', palette='Set2', ax=ax)

ax.set_title('Content Added to Netflix Each Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Titles')

# Tilt the year labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Clean and analyze country data.
#
# A title may list several countries in one comma-separated string. Each
# country gets its own row so that it is counted individually.
df_country = df.dropna(subset=['country'])

# Split on the bare comma and strip every token afterwards. Splitting on the
# literal ', ' would leave stray commas/whitespace in the tokens whenever the
# separator is irregular or a value has a leading/trailing comma, and the same
# country would then be counted under several spellings.
df_country = df_country.assign(country=df_country['country'].str.split(','))
df_country = df_country.explode('country')
df_country = df_country.assign(country=df_country['country'].str.strip())

# Drop empty tokens and 'unknown' placeholders (case-insensitive).
is_blank = df_country['country'] == ''
is_unknown = df_country['country'].str.lower() == 'unknown'
df_country = df_country[~(is_blank | is_unknown)]

# Ten most frequent countries, in descending order of title count.
top_countries = df_country['country'].value_counts().head(10)

plt.figure(figsize=(10,6))
sns.barplot(x=top_countries.values, y=top_countries.index, palette='Set3')
plt.title('Top 10 Countries Producing Netflix Content')
plt.xlabel('Number of Titles')
plt.ylabel('Country')
plt.tight_layout()
plt.show()

In [None]:
# Filter only movies and clean duration.
#
# .copy() yields an independent frame, so adding a column below cannot trigger
# pandas' SettingWithCopyWarning.
movies = df[df['type'] == 'Movie'].dropna(subset=['duration']).copy()

# Pull the leading run of digits out of the duration text.
# - The raw string keeps '\d' as a regex digit class instead of an invalid
#   Python string escape (which warns on recent Python versions).
# - expand=False returns a Series rather than a one-column DataFrame.
minutes = movies['duration'].str.extract(r'(\d+)', expand=False)

# Rows whose duration contains no digits cannot be converted to int; drop them
# rather than letting astype(int) fail on NaN.
has_minutes = minutes.notna()
movies = movies[has_minutes].copy()
movies['duration_minutes'] = minutes[has_minutes].astype(int)

plt.figure(figsize=(10,6))
sns.histplot(movies['duration_minutes'], bins=30, kde=True, color='skyblue')
plt.title('Distribution of Movie Durations on Netflix')
plt.xlabel('Duration (minutes)')
plt.ylabel('Number of Movies')
plt.tight_layout()
plt.show()

In [None]:
# Top genres.
# 'listed_in' holds a ', '-separated list of genres per title; explode it so
# that each (title, genre) pair becomes its own row before counting.
df_genres = (
    df.dropna(subset=['listed_in'])
      .assign(listed_in=lambda frame: frame['listed_in'].str.split(', '))
      .explode('listed_in')
)
genre_counts = df_genres['listed_in'].value_counts()
top_genres = genre_counts.iloc[:10]

# Horizontal bar chart of the ten most frequent genres.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_genres.values, y=top_genres.index, palette='coolwarm', ax=ax)
ax.set_title('Top 10 Genres on Netflix')
ax.set_xlabel('Number of Titles')
ax.set_ylabel('Genre')
fig.tight_layout()
plt.show()

In [None]:
# Word cloud from descriptions.
# Concatenate every non-missing description into one space-separated string.
descriptions = df['description'].dropna()
text = ' '.join(descriptions)

# Start from the stop-word list shipped with the wordcloud package.
stopwords = set(STOPWORDS)

cloud_options = dict(
    width=800,
    height=400,
    background_color='white',
    stopwords=stopwords,
    colormap='inferno',
)
wordcloud = WordCloud(**cloud_options).generate(text)

# Render the cloud as an image; the axes are hidden because they carry no meaning here.
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Most Common Words in Netflix Descriptions')
fig.tight_layout()
plt.show()