In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
# Apply seaborn's 'darkgrid' style to the figures drawn below.
sns.set_style('darkgrid')

In [None]:
# Clone the dataset repository; the load cell below reads netflix_titles.csv from it.
# NOTE(review): the load path is absolute under /content, so this presumably runs
# with /content as the working directory (Colab default) — confirm.
!git clone "https://github.com/GeeksforgeeksDS/21-Days-21-Projects-Dataset"

In [None]:
# Read the Netflix titles CSV from the cloned dataset repository.
netflix_csv_path = '/content/21-Days-21-Projects-Dataset/Datasets/netflix_titles.csv'
df = pd.read_csv(netflix_csv_path)

In [None]:
# Summarise the freshly loaded frame: column dtypes, non-null counts, memory use.
df.info()

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Number of missing entries in each column, before any cleaning.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Text columns with gaps get the placeholder 'Unknown' instead of NaN.
for column in ('country', 'director', 'cast'):
    df[column] = df[column].fillna('Unknown')

# Rows without an added-date or a rating are removed outright.
df = df.dropna(subset=['date_added', 'rating'])

# Re-check the per-column missing counts after cleaning.
print(df.isna().sum())

In [None]:
# Parse `date_added` (format inferred per value) and derive the year and
# month in which each title was added.
added_on = pd.to_datetime(df['date_added'], format='mixed')
df = df.assign(
    date_added=added_on,
    year_added=added_on.dt.year,
    month_added=added_on.dt.month,
)
display(df.head())

In [None]:
# After cleaning: remaining missing values per column, then the column dtypes.
for summary in (df.isna().sum(), df.dtypes):
    print(summary)

In [None]:
# Share of each title type, drawn as a pie chart.
type_counts = df['type'].value_counts()

fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(
    type_counts,
    labels=type_counts.index,
    autopct='%1.f%%',
    startangle=140,
    colors=['#e60023', '#221f1f'],
)
ax.set_title('proportion of movies vs TV shows')
ax.set_ylabel('')
plt.show()

In [None]:
# `listed_in` holds several comma-separated genres per title: split it and
# explode so that every (title, genre) pair gets its own row.
genres = df.assign(genres=df['listed_in'].str.split(',')).explode('genres')

# Splitting on ',' keeps any whitespace that follows the comma
# ('A, B' -> ['A', ' B']), which would make ' B' and 'B' count as two
# different genres. Strip it so each genre is counted once.
genres['genres'] = genres['genres'].str.strip()

# Frequency of each genre, most common first.
top_genres = genres['genres'].value_counts().reset_index()
top_genres.columns = ['Genre', 'Count']
top_genres.head(15)

In [None]:
# Bar chart of the 15 most frequent genres.
plt.figure(figsize=(10, 6))
sns.barplot(x='Genre', y='Count', data=top_genres.head(15))
plt.title('Top 15 genres')
plt.xticks(rotation=90)
# Render explicitly, like the other plotting cells, so the return value of
# plt.xticks() is not echoed as the cell output.
plt.show()

In [None]:
# Split the catalogue by title type. Each subset is an independent copy, so
# columns can be overwritten on it later without touching `df`.
is_tv_show = df['type'] == 'TV Show'
is_movie = df['type'] == 'Movie'

tv_shows = df.loc[is_tv_show].copy()
movies = df.loc[is_movie].copy()

In [None]:
# Replace the textual `duration` with the first run of digits it contains
# (presumably minutes for movies and seasons for TV shows — confirm against the data).
#  * r'(\d+)' is a raw string: '\d' in a plain string is an invalid escape
#    sequence and triggers a warning on recent Python versions.
#  * expand=False makes str.extract return a Series rather than a one-column frame.
#  * pd.to_numeric yields integers when every row has a number and leaves NaN
#    for rows without one, where astype(int) would raise.
movies['duration'] = pd.to_numeric(
    movies['duration'].str.extract(r'(\d+)', expand=False)
)
tv_shows['duration'] = pd.to_numeric(
    tv_shows['duration'].str.extract(r'(\d+)', expand=False)
)

In [None]:
# Side-by-side histograms of the numeric duration: movies on the left (with a
# KDE overlay), TV shows on the right.
fig, (movie_ax, tv_ax) = plt.subplots(1, 2, figsize=(15, 6))

sns.histplot(movies['duration'], bins=20, kde=True, color='skyblue', ax=movie_ax)
movie_ax.set_title('Movies')

sns.histplot(tv_shows['duration'], bins=20, ax=tv_ax)
tv_ax.set_title('TV shows')

plt.tight_layout()
plt.show()

In [None]:
# Combine all descriptions into a single string. Missing descriptions are
# dropped first: str.join raises TypeError on a NaN entry, and the n-gram
# analysis further down applies the same guard.
descrption_text = ' '.join(df['description'].dropna())

# Build the word-cloud image from the combined text.
wordcloud = WordCloud(width=800, height=400, background_color='black').generate(descrption_text)

# Show the image without axes.
plt.figure(figsize=(15, 15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Motivating question: how has the distribution of content ratings changed over time?
# This cell only tallies titles per rating across the whole catalogue; it does
# not break the counts down by year.
rating_counts = (
    df['rating']
    .value_counts()
    .rename_axis('Rating')
    .reset_index(name='Count')
)
rating_counts.head()

In [None]:
# Relationship between content rating and title type (Movie vs TV Show):
# number of titles for every (rating, type) combination present in the data.
rating_type_sizes = df.groupby(['rating', 'type']).size()
age_rating_counts = rating_type_sizes.reset_index(name='Count')
age_rating_counts.head()



In [None]:
# Top directors on Netflix: number of titles per `director` value, most
# frequent first.
# Missing directors were replaced with the placeholder 'Unknown' during
# cleaning; exclude it so the placeholder is not ranked as if it were a director.
known_directors = df.loc[df['director'] != 'Unknown', 'director']
director_counts = known_directors.value_counts().reset_index()
director_counts.columns = ['Director', 'Count']
director_counts.head()

In [None]:

# Bar chart of the ten most frequent entries in `director_counts`.
top_directors = director_counts.head(10)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Director', y='Count', data=top_directors, ax=ax)
plt.xticks(rotation=90)
plt.show()


In [None]:
# Most common word pairs and phrases in the content descriptions
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

def count_ngrams(texts, n):
    """Count every n-word phrase across a collection of texts.

    English stop words are removed before phrases are formed. Each text is
    vectorized as its own document, so a phrase can never span the end of one
    text and the beginning of the next (which happens when all texts are
    first joined into one long string).

    Parameters
    ----------
    texts : iterable of str
        The documents to analyse; must not contain missing values.
    n : int
        Number of words per phrase (2 for bigrams, 3 for trigrams).

    Returns
    -------
    collections.Counter
        Maps each phrase to its total number of occurrences over all texts.
    """
    vectorizer = CountVectorizer(ngram_range=(n, n), stop_words='english')
    doc_term = vectorizer.fit_transform(texts)
    # Column sums of the document-term matrix give the corpus-wide totals.
    totals = np.asarray(doc_term.sum(axis=0)).ravel().tolist()
    return Counter(dict(zip(vectorizer.get_feature_names_out(), totals)))


# One document per title; missing descriptions are skipped.
descriptions = df['description'].dropna()

bigram_counts = count_ngrams(descriptions, 2)

print("Most common word pairs (bigrams):")
for bigram, count in bigram_counts.most_common(10):
    print(f"{bigram}: {count}")

trigram_counts = count_ngrams(descriptions, 3)

print("\nMost common three-word phrases (trigrams):")
for trigram, count in trigram_counts.most_common(10):
    print(f"{trigram}: {count}")

In [None]:
# Question: are there trends in content production when comparing the release
# year with the year a title was added to Netflix?
# First half of the answer: number of titles per release year.
release_year_counts = (
    df['release_year']
    .value_counts()
    .rename_axis('Release Year')
    .reset_index(name='Count')
)
release_year_counts.head()


In [None]:

# Titles per year added to Netflix, ordered chronologically.
year_added_counts = (
    df['year_added']
    .value_counts()
    .rename_axis('Year Added')
    .reset_index(name='Count')
    .sort_values(by='Year Added')
)

# Same chronological ordering for the per-release-year counts.
release_year_counts_sorted = release_year_counts.sort_values(by='Release Year')

# Overlay both series on one set of axes to compare them.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x='Year Added', y='Count', data=year_added_counts,
             marker='o', label='Year Added', ax=ax)
sns.lineplot(x='Release Year', y='Count', data=release_year_counts_sorted,
             marker='o', label='Release Year', ax=ax)
ax.set_title('Content Count by Release Year vs. Year Added to Netflix')
ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
pip install ydata-profiling

In [None]:
from ydata_profiling import ProfileReport

# Build a profiling report for the cleaned frame and write it out as HTML.
report_path = 'netflix.html'
rep = ProfileReport(df, title="Netflix Report")
rep.to_file(output_file=report_path)

In [None]:
# Hand the generated netflix.html report to the user via Colab's file helper.
# NOTE(review): google.colab is only importable inside Google Colab — this cell
# will fail in any other Jupyter environment.
from google.colab import files
files.download('netflix.html')