let’s import the necessary Python libraries and the dataset we need for this task

In [15]:
import numpy as np
import pandas as pd 
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity 

import nltk  # Natural Language Toolkit 
             # It is one of the most powerful NLP libraries,
             # which contains packages to make machines understand human language
             # and reply to it with an appropriate response.
            
import re    # Re library in python holds the key to deal with all the problems relating to textual data analysis. 
             # This library provides a range of methods that can help you build patterns 
             # and extract or substitute the desired string.


let's import the dataset from kaggle of netflix 

In [16]:
data = pd.read_csv('netflixData.csv')

In [25]:
data[1:200]

Unnamed: 0,Title,Description,Genres,Content Type
1,#Alive,"As a grisly virus rampages a city, a lone man ...","Horror Movies, International Movies, Thrillers",Movie
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Documentaries, International Movies",Movie
3,#blackAF,Kenya Barris and his family navigate relations...,TV Comedies,TV Show
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,"Documentaries, International Movies",Movie
5,#FriendButMarried,"Pining for his high school crush for years, a ...","Dramas, International Movies, Romantic Movies",Movie
...,...,...,...,...
195,Across Grace Alley,"A young boy, upset by his parents' divorce, be...",Dramas,Movie
196,Action Replayy,"Sick of his parents’ constant squabbling, a yo...","Children & Family Movies, Comedies, Internatio...",Movie
197,Acts of Vengeance,Devastated by the murder of his wife and child...,"Action & Adventure, International Movies",Movie
198,Ad Vitam,In a future where regeneration technology lets...,"Crime TV Shows, International TV Shows, TV Dramas",TV Show


let’s have a look at whether the data contains null values or not?

In [18]:
data.isnull().sum()

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64

The dataset contains null values, but before removing the null values, let’s select the columns that we can use to build a Netflix recommendation system

In [19]:
data = data[['Title','Description','Genres','Content Type']]
data

Unnamed: 0,Title,Description,Genres,Content Type
0,(Un)Well,This docuseries takes a deep dive into the luc...,Reality TV,TV Show
1,#Alive,"As a grisly virus rampages a city, a lone man ...","Horror Movies, International Movies, Thrillers",Movie
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Documentaries, International Movies",Movie
3,#blackAF,Kenya Barris and his family navigate relations...,TV Comedies,TV Show
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,"Documentaries, International Movies",Movie
...,...,...,...,...
5962,الف مبروك,"On his wedding day, an arrogant, greedy accoun...","Comedies, Dramas, International Movies",Movie
5963,دفعة القاهرة,A group of women leaves Kuwait to attend unive...,"International TV Shows, TV Dramas",TV Show
5964,海的儿子,"Two brothers start a new life in Singapore, wh...","International TV Shows, TV Dramas",TV Show
5965,반드시 잡는다,After people in his town start turning up dead...,"Dramas, International Movies, Thrillers",Movie


-  Title        : The title column contains the titles of movies and TV shows on Netflix 

-  Description  : Description column describes the plot of the TV shows and movies

-  Content Type : The Content Type column tells us if it’s a movie or a TV show

-  Genre        : The Genre column contains all the genres of the TV show or the movie

let’s drop the rows containing null values

In [20]:
data = data.dropna()

Now I will clean the Title column as it contains some data preparation

In [22]:
import nltk 
import re 

stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
stopword=set(stopwords.words('english'))





In [26]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

In [27]:
data["Title"] = data["Title"].apply(clean)

In [31]:
data.Title.sample(10)

546       barbi life dreamhous
1817           ginni wed sunni
5872        world famous lover
731                 blown away
356       tomorrow entir world
2155          hunter x hunter 
4915                innoc file
1642               first match
22                            
4056    saint seiya lost canva
Name: Title, dtype: object

Now I will use the Genres column as the feature to recommend similar content to the user. I will use the concept of cosine similarity

In [39]:
feature = data["Genres"].tolist()

v

In [40]:
tfidf = text.TfidfVectorizer(input=feature, stop_words="english")

v

In [41]:
tfidf_matrix = tfidf.fit_transform(feature)

v

In [42]:
similarity = cosine_similarity(tfidf_matrix)

Now I will set the Title column as an index 
so that we can find similar content 
by giving the title of the movie or TV show as an input

In [43]:
indices = pd.Series(data.index,index=data['Title']).drop_duplicates()

Now here’s how to write a function to recommend Movies and TV shows on Netflix

In [44]:
def netFlix_recommendation(title, similarity = similarity):
    index = indices[title]
    similarity_scores = list(enumerate(similarity[index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[0:10]
    movieindices = [i[0] for i in similarity_scores]
    return data['Title'].iloc[movieindices]

In [45]:
print(netFlix_recommendation("girlfriend"))

3                          blackaf
285                     washington
417                 arrest develop
434     astronomi club sketch show
451    aunti donna big ol hous fun
656                      big mouth
752                bojack horseman
805                   brew brother
935                       champion
937                  chappell show
Name: Title, dtype: object


In [46]:
print(netFlix_recommendation("washington"))

3                          blackaf
285                     washington
417                 arrest develop
434     astronomi club sketch show
451    aunti donna big ol hous fun
656                      big mouth
752                bojack horseman
805                   brew brother
935                       champion
937                  chappell show
Name: Title, dtype: object


In [51]:
indices

Title
unwel                           0
aliv                            1
annefrank  parallel stori       2
blackaf                         3
catsthemewvi                    4
                             ... 
الف مبروك                    5962
دفعة القاهرة                 5963
海的儿子                         5964
반드시 잡는다                      5965
최강전사 미니특공대  영웅의 탄생           5966
Length: 5967, dtype: int64

In [52]:
print(netFlix_recommendation("unwel"))

0                         unwel
68                          day
305                        alon
322      america next top model
406                         one
468    awak million dollar game
615            best leftov ever
694     black ink crew new york
720                 bling empir
843                buri bernard
Name: Title, dtype: object


In [53]:
print(netFlix_recommendation("aliv"))

1                      aliv
178                  aaviri
360            andhaghaaram
361             andhakaaram
398                  apostl
1759     game hindi version
1760     game tamil version
1761    game telugu version
1801              ghost lab
1804            ghost stori
Name: Title, dtype: object


So this is how i built a Netflix Recommendation System using the Python programming language