In [6]:
import numpy as np
import pandas as pd
# Load the dataset from the Kaggle input directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/kaggle/input/netflix/netfix_data.csv')

In [7]:
# Peek at one randomly chosen row to see what a record looks like.
data.sample(n=1)

Unnamed: 0,Show Id,Title,Description,Director,Genres,Cast,Production Country,Release Date,Rating,Duration,Imdb Score,Content Type,Date Added
1012,7b7ea6fb-9ea9-4b76-b373-f98dd0061a52,CIA: Comrade in America,A young man in Kerala has two weeks to stop th...,Amal Neerad,"Action & Adventure, Dramas, Independent Movies","Dulquer Salmaan, Karthika Muraleedharan, Siddi...",India,2017.0,TV-14,130 min,6.4/10,Movie,1-May-18


In [8]:
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Count the missing values in every column.
data.isna().sum()

Show Id                  0
Title                    0
Description              0
Director              2063
Genres                   0
Cast                   529
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1334
dtype: int64

The dataset contains null values, but before removing the null values, let’s select the columns that we can use to build a Netflix recommendation system:

In [10]:
# Keep only the columns used in the rest of the notebook, then preview them.
selected_columns = ["Title", "Description", "Content Type", "Genres"]
data = data.loc[:, selected_columns]
data.head(5)

Unnamed: 0,Title,Description,Content Type,Genres
0,#Alive,"As a grisly virus rampages a city, a lone man ...",Movie,"Horror Movies, International Movies, Thrillers"
1,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...",Movie,"Documentaries, International Movies"
2,#blackAF,Kenya Barris and his family navigate relations...,TV Show,TV Comedies
3,#cats_the_mewvie,This pawesome documentary explores how our fel...,Movie,"Documentaries, International Movies"
4,#FriendButMarried,"Pining for his high school crush for years, a ...",Movie,"Dramas, International Movies, Romantic Movies"


Now let’s drop the rows containing null values and move further:

In [11]:
# Drop incomplete rows, then renumber the index 0..n-1.
# Later cells use the index values as row *positions* (into the similarity
# matrix and via `.iloc`), so labels and positions must stay in step even
# when rows are removed here.
data = data.dropna().reset_index(drop=True)

Now let's clean the Title column, as it needs some data preparation:

In [12]:

# Dependencies and shared resources for the title-cleaning step below.
import re
import string

import nltk
from nltk.corpus import stopwords

# Fetch the English stop-word corpus used by `stopwords.words` below.
nltk.download('stopwords')

# Stemmer and stop-word set are built once and reused for every title.
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))
def clean(text):
    """Normalise a piece of text into lower-case, stemmed keywords.

    Steps, in order: lower-case the input; strip ``[...]`` spans, URLs,
    ``<...>`` tags, punctuation, newlines and any token containing a digit;
    drop English stop words; stem each remaining token.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces. Tokens are
        obtained with ``split(' ')``, so runs of spaces left behind by the
        removals are preserved as empty tokens.

    Notes
    -----
    Relies on the module-level ``stopword`` set and ``stemmer`` object.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed
    # as (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Filter stop words, then stem what is left, in a single pass.
    kept_words = [word for word in text.split(' ') if word not in stopword]
    return " ".join(stemmer.stem(word) for word in kept_words)
# Replace every title with its cleaned form (element-wise call to `clean`).
data["Title"] = data["Title"].map(clean)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Now let’s have a look at some samples of the Titles before moving forward:

In [13]:
# Show ten randomly chosen titles.
print(data["Title"].sample(n=10))

4021                    rugrat pari movi
2558                          kiss first
1372                      dr seuss lorax
664              bikram yogi guru predat
4265                        sinatra noth
1468                    el potro unstopp
5248                    trader sovdagari
450          aunti donna big ol hous fun
1485    elit short stori guzmán cay rebe
2579                             krutant
Name: Title, dtype: object


Now I will use the Genres column as the feature to recommend similar content to the user. I will use the concept of cosine similarity here (used to measure the similarity between two documents):

In [15]:
# Turn each row's genre string into a TF-IDF vector, then compare every row
# with every other row. Passing the matrix twice is the explicit form of the
# one-argument call (rows are compared against themselves).
feature = list(data["Genres"])
tfidf = text.TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(feature)
similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

Now I will set the Title column as an index so that we can find similar content by giving the title of the movie or TV show as an input:

In [17]:
# Map each title to its row *position* in `data`. Positions (not index
# labels) are what the similarity matrix and `.iloc` lookups expect.
indices = pd.Series(range(len(data)), index=data['Title'])
# `Series.drop_duplicates()` compares the values (the row numbers, which are
# always unique), so it never removed repeated titles. Filter on the title
# index instead, keeping the first occurrence, so a lookup by title always
# yields a single position.
indices = indices[~indices.index.duplicated(keep='first')]


Now here’s how to write a function to recommend Movies and TV shows on Netflix:

In [18]:
def netFlix_recommendation(title, similarity = similarity, top_n = 10):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in the module-level ``indices`` Series. If it is
        not found as given, the lookup is retried with ``clean(title)``.
    similarity : 2-D array-like, optional
        Pairwise similarity scores indexed by row position. The default is
        the module-level ``similarity`` object as it existed when this
        function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``data['Title']``, ordered from most to
        least similar. The queried row itself is never included.

    Raises
    ------
    KeyError
        If neither ``title`` nor its cleaned form is present in ``indices``.
    """
    if title not in indices.index:
        # Stored titles went through `clean`; give a raw title a second chance.
        normalised = clean(title)
        if normalised not in indices.index:
            raise KeyError(
                "Title %r was not found (also tried %r)" % (title, normalised)
            )
        title = normalised

    index = indices[title]
    # A title that occurs more than once yields a Series; use the first match.
    if isinstance(index, pd.Series):
        index = index.iloc[0]
    index = int(index)

    # Score every other row against the queried one; skip the query itself so
    # it cannot be recommended back to the user.
    similarity_scores = [
        (position, score)
        for position, score in enumerate(similarity[index])
        if position != index
    ]
    # `sorted` is stable, so rows with equal scores keep their original order.
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    movieindices = [position for position, _ in similarity_scores[:top_n]]
    return data['Title'].iloc[movieindices]

# Example query and its printed recommendations.
recommended_titles = netFlix_recommendation("girlfriend")
print(recommended_titles)

2                          blackaf
284                     washington
416                 arrest develop
433     astronomi club sketch show
450    aunti donna big ol hous fun
655                      big mouth
751                bojack horseman
804                   brew brother
934                       champion
936                  chappell show
Name: Title, dtype: object
