## **We used to observe recomendations in every application. For example, if we take netflix we can find movie recommendations. This project is about to achieve such recommendation system by considering description as out key information.**

### **Import required libraries**

In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.cluster import MiniBatchKMeans

### **load the dataset**

In [16]:
df=pd.read_csv("netflix_titles.csv")

### **Analyse the data and check for null values/data types**

In [17]:
df.isna().sum()

show_id            0
type               0
title              0
director        1969
cast             570
country          476
date_added        11
release_year       0
rating            10
duration           0
listed_in          0
description        0
dtype: int64

In [18]:
df.shape

(6234, 12)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       6234 non-null   int64 
 1   type          6234 non-null   object
 2   title         6234 non-null   object
 3   director      4265 non-null   object
 4   cast          5664 non-null   object
 5   country       5758 non-null   object
 6   date_added    6223 non-null   object
 7   release_year  6234 non-null   int64 
 8   rating        6224 non-null   object
 9   duration      6234 non-null   object
 10  listed_in     6234 non-null   object
 11  description   6234 non-null   object
dtypes: int64(2), object(10)
memory usage: 584.6+ KB


In [20]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

### Initialize TF-IDF **object**

In [21]:
vector=TfidfVectorizer()

### **Approach : vectorize the contents of every film in the dataset and check for the similirity with other films, so that we can recommend those films which are having high similarity score**

In [22]:
description=df['description']
tf_vector=TfidfVectorizer(max_df=0.4,min_df=1,lowercase=True,stop_words='english',use_idf=True,norm=u'l2',smooth_idf=True)

### **fit and transform the description of every film and store it into a variable. After fitting and transforming, the result will be a matrix.**

In [23]:
tfidf=tf_vector.fit_transform(description)
tfidf.shape

(6234, 16151)

### **Define a function to search top records after enter a film name**

In [30]:
def search(tfidf_matrix,model,request, top_n = 5):
    request_transform = model.transform([request])
    similarity = np.dot(request_transform,np.transpose(tfidf_matrix))
    x = np.array(similarity.toarray()[0])
    indices=np.argsort(x)[-5:][::-1]
    return indices

### **Define a function which results top n films having high similarity with the input film name**

In [31]:
def find_similar(tfidf_matrix, index, top_n = 5):
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [index for index in related_docs_indices][0:top_n] 

### **Define a function to print the obtained results in a good format**

In [32]:
def print_result(request_content,indices,X):
    print('\nsearch : ' + request_content)
    print('\nBest Results :')
    for i in indices:
        print('id = {0:5d} - title = {1}'.format(i,X['title'].loc[i]))

### **call search function and get top n records matching with the film name**

In [33]:
request="Norm of the North: King Sized Adventure"
result=search(tfidf,tf_vector,request,top_n=5)
#print(result)
print_result(request,result,df)


search : Norm of the North: King Sized Adventure

Best Results :
id =  1780 - title = Norm of the North: Keys to the Kingdom
id =  4454 - title = Some Freaks
id =   260 - title = Norm Macdonald Has a Show
id =   499 - title = Equals
id =  4942 - title = Save Our Shelter


### **call find_similar function to get the results of recommendations.**

In [34]:
index=1
result_2 = find_similar(tfidf, index, top_n = 5)
print_result('Norm of the North: King Sized Adventure',result_2,df)


search : Norm of the North: King Sized Adventure

Best Results :
id =  3373 - title = Katt Williams: Great America
id =  4375 - title = Garfunkel and Oates: Trying to be Special
id =  1682 - title = Jen Kirkman: I'm Gonna Die Alone (And I Feel Fine)
id =  4424 - title = George of the Jungle 2
id =   582 - title = Joe Rogan: Triggered


### **Clustering the corpus of texts with kmeans**

In [44]:
k = 20
kmeans = MiniBatchKMeans(n_clusters = k)
kmeans.fit(tfidf)
centers = kmeans.cluster_centers_.argsort()[:,::-1]
terms = tf_vector.get_feature_names()

In [43]:
for i in range(0,k):
    word_list=[]
    print("cluster%d:"% i)
    for j in centers[i,:10]:
        word_list.append(terms[j])
    print(word_list) 

cluster0:
['discover', 'earth', 'planet', 'left', 'gonsha', 'students', 'mrs', 'son', 'end', 'sets']
cluster1:
['gets', 'shot', 'years', 'chance', 'life', '50', 'man', 'estranged', 'studio', 'elsa']
cluster2:
['dark', 'secrets', 'secret', 'group', 'past', 'mysterious', 'killer', 'power', 'car', 'woman']
cluster3:
['detective', 'case', 'police', 'killer', 'uncovers', 'investigating', 'missing', 'serial', 'man', 'solve']
cluster4:
['woman', 'husband', 'young', 'man', 'family', 'love', 'life', 'home', 'father', 'wife']
cluster5:
['mother', 'man', 'love', 'young', 'daughter', 'son', 'falls', 'soon', 'old', 'single']
cluster6:
['new', 'friends', 'series', 'family', 'documentary', 'young', 'love', 'man', 'lives', 'group']
cluster7:
['returns', 'hometown', 'family', 'past', 'face', 'brother', 'ex', 'years', 'home', 'daughter']
cluster8:
['murder', 'case', 'series', 'convicted', 'accused', 'wife', 'cop', 'framed', 'documentary', 'detective']
cluster9:
['time', 'tour', 'value', 'comic', 'career