In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the Netflix titles catalogue from the Kaggle input directory.
netflix_csv = '/kaggle/input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(netflix_csv)


In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [4]:
# Number of missing values in each column of the raw data.
df.isnull().sum()

In [5]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [6]:
import seaborn as sns #importing our visualization library
import matplotlib.pyplot as plt

In [7]:
# Heatmap of the boolean null mask: shows which rows/columns hold missing values.
sns.heatmap(df.isnull(), cmap = 'viridis')

In [8]:
# Frequency of each distinct value in the duration column.
df['duration'].value_counts()

In [9]:
# Distinct values present in the rating column.
df['rating'].unique()

In [10]:
# Discard the director and cast columns; they are not used in the rest of the analysis.
df = df.drop(columns=['director', 'cast'])

In [11]:
# Confirm the director and cast columns are gone.
df.head()

In [12]:
# Treat a missing country as 'United States'.
df['country'] = df['country'].fillna('United States')

In [13]:
# Drop date_added, since release_year is already available.
df = df.drop(columns='date_added')

In [14]:
# Display the frame after the column drops and the country fill.
df

In [15]:
# Frequency of each rating; used below to pick the fill value for missing ratings.
df['rating'].value_counts()

In [16]:
# TV-MA is the most common rating, so use it to fill the missing ratings.
df['rating'] = df['rating'].fillna('TV-MA')

In [17]:
# Preview the frame after filling country and rating.
df.head()

In [18]:
# Re-check the remaining missing values per column after the fills.
df.isnull().sum()

In [19]:
# Keep only rows that have a duration. Take an explicit copy: later cells add
# and overwrite columns on this frame ("stand-up", "year", "month",
# "description"), and assigning into a filtered slice triggers pandas'
# SettingWithCopyWarning.
# The original row labels are kept (no reset_index), so after this point the
# index labels are not guaranteed to equal row positions.
df = df[df['duration'].notna()].copy()

In [20]:
# Missing values per column after dropping rows without a duration.
df.isnull().sum()

In [23]:
import plotly.express as px

# Distribution of titles by type (Movie vs. TV Show): a seaborn bar chart
# followed by a plotly pie chart.
sns.countplot(x = 'type', data = df)

# Name both columns explicitly. value_counts().reset_index() produces the
# columns ("index", "type") before pandas 2.0 but ("type", "count") from 2.0
# on, so relying on the implicit names breaks across versions.
x = df["type"].value_counts().rename_axis("type").reset_index(name="count")
px.pie(x, values="count", names="type")

In [24]:
# Number of titles per rating.
sns.countplot(x = 'rating', data = df)

In [25]:
# Titles per release year; the figure is made very wide so every year gets a bar.
fig, ax = plt.subplots(figsize=(35, 6))
sns.countplot(x='release_year', data=df, ax=ax)

In [26]:
# Scatter of rating against type: marks which rating/type combinations occur.
sns.scatterplot(x = 'rating', y = 'type' , data = df)

In [27]:
# Titles per rating, split into one bar per type.
fig, ax = plt.subplots(figsize=(35, 10))
sns.countplot(x='rating', hue='type', data=df, ax=ax)

In [28]:
# Look at the columns again before deriving the genre flag from listed_in.
df.head()

In [29]:
# Flag (1/0) titles whose listed_in text contains the tag, ignoring case.
tag = "Stand-Up Comedy"
has_tag = df['listed_in'].str.contains(tag, case=False, regex=False, na=False)
df["stand-up"] = has_tag.astype(int)


In [30]:
# Subset of titles flagged as stand-up comedy.
is_stand_up = df["stand-up"].eq(1)
com = df[is_stand_up]

In [31]:
# First ten stand-up titles whose country is exactly "United States".
com.loc[com["country"] == "United States", ["title", "country", "release_year"]].head(10)

In [32]:
# Titles per country as a two-column frame ("country", "count").
# Built directly from the Series rather than via `.values`, which would
# upcast the integer counts to object dtype and make them unusable as a
# numeric plotting channel.
df_countries = (
    df["country"].value_counts().rename_axis("country").reset_index(name="count")
)
df_countries.head()

In [33]:
# World map of the number of titles per country.
# The counts must go through `color`. `labels` only renames legend/axis text,
# and `df_countries.count` resolves to the DataFrame.count method rather than
# the "count" column, so the original call drew no counts at all.
# pd.to_numeric guards against the counts having been stored as objects.
# NOTE(review): rows whose country value is not a single recognised country
# name are presumably not drawn by locationmode='country names' - verify.
country_counts = df_countries.assign(count=pd.to_numeric(df_countries["count"]))
fig = px.choropleth(
    country_counts,
    locations="country",
    locationmode="country names",
    color="count",
    labels={"count": "Number of titles"},
)
fig.show()

In [34]:
# Bar chart of the 20 release years with the most movies.
movie_years = df.loc[df['type'] == 'Movie', 'release_year']
movie_years.value_counts().head(20).plot.bar()

In [35]:
from wordcloud import WordCloud

In [36]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Concatenate every title into one text blob and split it into word tokens.
example_sent = " ".join(df.title)
word_tokens = word_tokenize(example_sent)

# The stop-word list is compared against lower-cased tokens.
stop_words = set(stopwords.words('english'))

# Drop stop words case-insensitively. The original built this list and then
# immediately rebuilt it with a case-sensitive loop, which let capitalised
# stop words such as "The" through and did the work twice.
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]
 



In [37]:
# Re-join the remaining tokens into one string, the input for the word cloud.
final = " ".join(filtered_sentence)


In [38]:
# Render a word cloud of the filtered title words and save it to disk.
fig, ax = plt.subplots(figsize=(25, 15))
wordcloud = WordCloud(background_color='Black', width=1920, height=1080).generate(final)
ax.imshow(wordcloud)
ax.axis('off')
fig.savefig('cast.png')
plt.show()

In [39]:
# Ten country values with the most titles (movies and shows together), in
# ascending order so the largest bar is on the right.
# The original used pd.crosstab(df["country"], ["type"]): the list literal
# ["type"] is not the type column, so it only worked by accident as a plain
# per-country count with a misleading column called "type". value_counts
# states the intent directly.
countries = (
    df["country"]
    .value_counts()
    .head(10)
    .sort_values(ascending=True)
    .rename_axis("country")
    .reset_index(name="count")
)

plt.figure(figsize=(12, 6))
ax = sns.barplot(x="country", y="count", data=countries)
ax.set(xlabel="Country", ylabel="Number of titles")
plt.title("TOP COUNTRIES WITH HIGHEST NUMBER OF MOVIES AND SHOWS", fontsize=20)

# Write each bar's count just above it, centred on the bar.
for bar in ax.patches:
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_height() + 2.3,
        str(int(bar.get_height())),
        ha="center", rotation=0, fontsize=15, color='black',
    )




In [40]:
# Ten country values with the most movies, ascending.
# value_counts replaces pd.crosstab(m['country'], ['type']), where the list
# literal ['type'] only produced a per-country count by accident and left a
# misleading "type" column on the y axis.
m = df[df["type"] == "Movie"]
movie_con = (
    m["country"]
    .value_counts()
    .head(10)
    .sort_values(ascending=True)
    .rename_axis("country")
    .reset_index(name="count")
)

plt.figure(figsize=(12, 6))
ax = sns.barplot(x="country", y="count", data=movie_con)
ax.set(xlabel="Country", ylabel="Number of movies", title="Top countries by number of movies")

In [41]:
# Ten country values with the most TV shows, ascending.
# value_counts replaces pd.crosstab(t['country'], ['type']), where the list
# literal ['type'] only produced a per-country count by accident and left a
# misleading "type" column on the y axis.
t = df[df["type"] == "TV Show"]
tv_con = (
    t["country"]
    .value_counts()
    .head(10)
    .sort_values(ascending=True)
    .rename_axis("country")
    .reset_index(name="count")
)

plt.figure(figsize=(12, 6))
ax = sns.barplot(x="country", y="count", data=tv_con)
ax.set(xlabel="Country", ylabel="Number of TV shows", title="Top countries by number of TV shows")

In [42]:
# Reload the raw CSV: date_added was dropped from df earlier, and it is needed
# again below to derive the year and month each title was added.
train = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

In [43]:
# Apply the same duration filter that was applied to df so both frames hold
# the same rows. Copy explicitly, because the next cell assigns a column on
# this frame and writing into a filtered slice raises SettingWithCopyWarning.
train = train[train['duration'].notna()].copy()

In [44]:
# Fill missing date_added values with the most frequent date.
most_common_date = train['date_added'].mode()[0]
train['date_added'] = train['date_added'].fillna(most_common_date)

In [45]:
# Derive the year and month each title was added to the catalogue.
# Parse the dates once instead of once per column. Surrounding whitespace is
# stripped first so every value has the same textual format (harmless if
# there is none - NOTE(review): confirm against the dataset version in use).
added_on = pd.to_datetime(train["date_added"].str.strip())

# df and train come from the same CSV with the same row filter; fail loudly
# if that ever stops being true, instead of attaching dates to the wrong rows.
assert df.index.equals(train.index), "df and train rows are out of sync"

# Assigning Series (not a bare DatetimeIndex) aligns on the row labels.
df["year"] = added_on.dt.year
df["month"] = added_on.dt.month

In [46]:
# Number of titles for every (type, year added) pair, as a three-column frame.
pair_counts = df[['type', 'year']].value_counts()
new = pair_counts.reset_index(name='count')

In [47]:
# Titles added per year, one line per type.
sns.lineplot(data = new, x = 'year', y = 'count', hue = 'type')

In [48]:
# Check for missing values again now that year and month have been added.
df.isnull().sum()

# **RECOMMENDER SYSTEM**

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [50]:
# Vectoriser that ignores English stop words when building the vocabulary.
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string so every description is a string.
df['description'] = df['description'].fillna('')

# Fit the vocabulary on the descriptions and transform them into the TF-IDF
# matrix: one row per title, in the same row order as df.
tfidf_matrix = tfidf.fit_transform(df['description'])

# Output the shape of tfidf_matrix
tfidf_matrix.shape

In [51]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all description vectors. TfidfVectorizer
# L2-normalises its rows by default, so this dot product is the cosine
# similarity. Entry [i, j] compares the i-th and j-th rows of tfidf_matrix.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [52]:
# Map each title to its row *position* in df, which is also its row in
# cosine_sim. get_recommendations indexes cosine_sim and uses .iloc with
# these values, so they must be positions rather than index labels: rows were
# filtered out of df earlier without resetting the index, so labels and
# positions no longer agree.
title_positions = pd.Series(np.arange(len(df)), index=df['title'])

# Keep the first row for a repeated title. Series.drop_duplicates() would
# de-duplicate the position values (always unique), not the titles.
indices = title_positions[~title_positions.index.duplicated(keep='first')]

In [53]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 entries most similar to ``title``.

    Args:
        title: Title to look up in the module-level ``indices`` Series.
        cosine_sim: Square pairwise similarity matrix whose rows follow the
            row order of ``df``. Defaults to the matrix computed above.

    Returns:
        A Series with up to 10 titles from ``df``, most similar first.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # `indices` is expected to hold the row position of each title.
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A repeated title yields several matches; use the first one.
        idx = idx.iloc[0]
    idx = int(idx)

    # Similarity of every row to the requested one.
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Rank from most to least similar; the stable sort keeps ties in row order.
    ranked = np.argsort(-scores, kind="stable")

    # Drop the queried row explicitly instead of assuming it sorts first: a
    # title with an empty description scores 0 against itself, and an
    # identical description elsewhere can tie with it.
    movie_indices = ranked[ranked != idx][:10]

    # Return the top 10 most similar titles
    return df['title'].iloc[movie_indices]

In [54]:
# Example query: the ten titles whose descriptions are most similar to this one.
get_recommendations('Peaky Blinders')