
# 6. Build a content based movie recommender system with natural language processing.The function should take movie name as input and should return top 3 recommended movies.(Use Netflix dataset)

In [1]:
## importing important packages 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv('NETFLIX TITLES.csv')   ## importing the data

In [3]:
df.head()   ## checking the data in tab form

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81088285,Movie,The Mayo Clinic,"Ken Burns, Christopher Loren Ewers, Erik Ewers",Peter Coyote,United States,"April 19, 2019",2018,TV-14,116 min,Documentaries,A look at how a world-renowned medical institu...
1,81077597,Movie,I Am,Onir,"Juhi Chawla, Rahul Bose, Nandita Das, Sanjay S...","India, Japan","March 4, 2019",2010,TV-MA,106 min,"Dramas, Independent Movies, International Movies",Four individuals in modern India grapple with ...
2,1150871,Movie,Love Jones,Theodore Witcher,"Larenz Tate, Nia Long, Isaiah Washington, Lisa...",United States,"November 1, 2019",1997,R,109 min,"Comedies, Dramas, Independent Movies","In this urban romantic comedy set in Chicago, ..."
3,20077944,Movie,Ghayal,Rajkumar Santoshi,"Sunny Deol, Meenakshi Sheshadri, Amrish Puri, ...",India,"December 31, 2019",1990,TV-14,163 min,"Action & Adventure, Dramas, International Movies","Framed for his older brother's murder, a boxer..."
4,80223779,Movie,Marriage Story,Noah Baumbach,"Scarlett Johansson, Adam Driver, Laura Dern, A...","United States, United Kingdom","December 6, 2019",2019,R,137 min,Dramas,Academy Award-nominated filmmaker Noah Baumbac...


In [4]:
df.columns  ## names of the columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [5]:
## selecting only particular columns for bag of words

In [6]:
df1=df[['type','director','title','director','cast','country','rating','listed_in','description']]

In [7]:
df1.set_index('title',inplace=True)  ## setting the index to the name of title

In [8]:
## converting the data in small letter
df1 = df1.applymap(lambda s:s.lower() if type(s) == str else s)

In [9]:
col_names=list(df1.columns)

In [10]:
## create a single data corpus
data = []
for row in range(0,len(df1.index)):
    data.append(' '.join(str(x) for x in df1.iloc[row,0:len(col_names)]))

In [11]:
data[1]  ## checking first row paragraph 

'movie onir onir juhi chawla, rahul bose, nandita das, sanjay suri, manisha koirala, purab kohli, shernaz patel, abhimanyu singh, arjun mathur, radhika apte india, japan tv-ma dramas, independent movies, international movies four individuals in modern india grapple with their identities amid social taboos, trauma and brutal sexual discrimination in this quartet of stories.'

In [12]:
## we are converting it into paragraph because it easy to convert into bag of words

In [13]:
## more data cleaning like removing numbers and stop words
ps = PorterStemmer()
corpus = []
for i in range(0, len(data)):
    review = re.sub('[^a-zA-Z]', ' ', data[i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [14]:
## implementing BAG OF WORDS
CV = CountVectorizer()
BOG = CV.fit_transform(corpus)

In [15]:
## buliding the cosine similarity matrix
CSM=cosine_similarity(BOG,BOG)

In [16]:
## checking cosine similarity matrix
CSM

array([[1.        , 0.07142857, 0.07559289, ..., 0.06338657, 0.0985138 ,
        0.04393748],
       [0.07142857, 1.        , 0.15118579, ..., 0.10564428, 0.15762208,
        0.08787496],
       [0.07559289, 0.15118579, 1.        , ..., 0.        , 0.18766297,
        0.09299811],
       ...,
       [0.06338657, 0.10564428, 0.        , ..., 1.        , 0.06993786,
        0.15596257],
       [0.0985138 , 0.15762208, 0.18766297, ..., 0.06993786, 1.        ,
        0.07271792],
       [0.04393748, 0.08787496, 0.09299811, ..., 0.15596257, 0.07271792,
        1.        ]])

In [17]:
indexs = pd.Series(df1.index)

In [18]:
## defining a function to return the top 3 recommended movies 
def rec(name,CSM=CSM):
    recommended=[]
    INDEX=indexs[indexs==name].index[0]
    SC=pd.Series(CSM[INDEX]).sort_values(ascending = False)
    top_3=list(SC.iloc[1:4].index)
    for i in top_3:
        recommended.append(list(df1.index)[i])
    return recommended

In [27]:
## first way we have used BAG OF WORDS 

In [28]:
## now using TFIDF method to build the martix

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(corpus)
CSM1=cosine_similarity(X,X)


In [30]:
def rec2(name,CSM1=CSM1):
    recommended=[]
    INDEX=indexs[indexs==name].index[0]
    SC=pd.Series(CSM[INDEX]).sort_values(ascending = False)
    top_3=list(SC.iloc[1:4].index)
    for i in top_3:
        recommended.append(list(df1.index)[i])
    return recommended

In [31]:
rec('Ghayal')

['Pukar', 'Barsaat', 'Mandi']

In [32]:
rec2('The Incredibles 2')

['Incredibles 2 (Spanish Version)', "Pee-wee's Big Holiday", 'Stuart Little']