In [81]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


ds = pd.read_excel(r"C:\Users\jaya\Desktop\mrs_dataset.xlsx")
ds.head()

Unnamed: 0,MOVIE_ID,LANGUAGE,GENRE,MOVIE_NAME,ACTOR,ACTRESS,DIRECTOR,PRODUCER,RELEASE_YEAR
0,1001,Hindi,Drama,Mughal-E-Azam,Dilip Kumar,Madhubala,K. Asif,Shapoorji,1960.0
1,1002,Hindi,Action,Kohinoor,Dilip Kumar,Meena Kumari,S.U. Sunny,Dr. V. N. Sinha,1960.0
2,1003,Hindi,Romance,Barsaat ki Raat,Bharat Bhushan,Madhubala,P.L. Santoshi,R. Chandra,1960.0
3,1004,Hindi,Drama,Mera Naam Joker,Raj Kapoor,Padmini,Raj Kapoor,Raj Kapoor,1970.0
4,1005,Hindi,Action,Sholay,Amitabh Bachchan,Jaya Bachchan,Ramesh Sippy,G.P. Sippy,1975.0


In [82]:
#IMPORTANT FEATURE SELECTION
ds = ds[['LANGUAGE','GENRE','MOVIE_NAME','ACTOR' ]]
ds.head()

Unnamed: 0,LANGUAGE,GENRE,MOVIE_NAME,ACTOR
0,Hindi,Drama,Mughal-E-Azam,Dilip Kumar
1,Hindi,Action,Kohinoor,Dilip Kumar
2,Hindi,Romance,Barsaat ki Raat,Bharat Bhushan
3,Hindi,Drama,Mera Naam Joker,Raj Kapoor
4,Hindi,Action,Sholay,Amitabh Bachchan


In [83]:
#bag of words means words situated in Movie name upon which we will calculate the similarity and would recommend. 
ds['bag_of_words'] = ''
columns = ds.columns
for index, row in ds.iterrows():
    words = ''
    for col in columns:
        if col != 'ACTOR':
            words = words + ' '.join(row[col])+ ' '
        else:
            words = words + row[col]+ ' '
    row['bag_of_words'] = words
    
ds.drop(columns = [col for col in ds.columns if col !='bag_of_words'], inplace = True)


TypeError: can only join an iterable

In [84]:
#bag_of_words COMBINES  ALL THE WORDS PRESENT IN THE ABOVE 4 COLUMNS.
ds.head()

Unnamed: 0,LANGUAGE,GENRE,MOVIE_NAME,ACTOR,bag_of_words
0,Hindi,Drama,Mughal-E-Azam,Dilip Kumar,H i n d i D r a m a M u g h a l - E - A z a ...
1,Hindi,Action,Kohinoor,Dilip Kumar,H i n d i A c t i o n K o h i n o o r Dilip ...
2,Hindi,Romance,Barsaat ki Raat,Bharat Bhushan,H i n d i R o m a n c e B a r s a a t k i ...
3,Hindi,Drama,Mera Naam Joker,Raj Kapoor,H i n d i D r a m a M e r a N a a m J o ...
4,Hindi,Action,Sholay,Amitabh Bachchan,H i n d i A c t i o n S h o l a y Amitabh Ba...


In [85]:
#A MATRIX REPRESENTING ALL BAG OF WORDS IN VECTOR FORM
#FROM SKLEARN FEATURE EXTRACTION USED ABOVE WE EXTRACT THE COUNT VECTORIZER.
#INITIATES AND GENERATES THE COUNT MATRIX

count = CountVectorizer()
count_matrix = count.fit_transform(ds['bag_of_words'])


#CREATES A SERIES FOR THE MOVIES_NAME SO THEY ASSOCIATES TO AN NUMERICAL ORDER

indices = pd.Series(ds.MOVIE_NAME)
indices[:5]
#count_matrix



0      Mughal-E-Azam
1           Kohinoor
2    Barsaat ki Raat
3    Mera Naam Joker
4             Sholay
Name: MOVIE_NAME, dtype: object

In [86]:
#MEASURES THE COSINE SIMILARITY BETWEEN COUNT MATRIX .

cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1., 1., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [87]:
cosine_sim.shape

(180, 180)

In [88]:
#FUNCTION THAT TAKES MOVIE NAME AS INPUT AND RECOMMENDS TOP 10 MOVIES

def recommendations(MOVIE_NAME, cosine_sim = cosine_sim):
    
    recommended_movies = []
    
    #GETTING INDEX OF THE MOVIE THAT MATCHES THE MOVIE NAME.
    idx = indices[indices == MOVIE_NAME].index[0]
    
    #CREATES A SERIES WITH SIMILAR SCORES IN DESCENDING ORDER
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    
    #GETS INDEX OF 10 MOST SIMILAR MOVIES
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    #CREATING A LIST WITH THE MOVIE NAME OF BEST 10 MATCHING MOVIES
    for i in top_10_indexes:
        recommended_movies.append(list(ds.MOVIE_NAME)[i])
        
    return recommended_movies    

In [89]:
#WATCHED A ROMANTIC MOVIE KUCH KUCH HOTA HAI OF SHARUKH KHAN , NEXT 10 MOVIES ARE RECOMMENDED OF SHAHRUKH KHAN AND ROMANCE GENRE.

recommendations('Kuch Kuch Hota Hai')

['Dilwale Dulhaniya Le Jayenge',
 'Chak De India',
 'Kabhi Alvida Na Kehna',
 'Main Hoon Na',
 'Devdas',
 'Kabhi Khushi Kabhi Gham',
 'Kuch Kuch Hota Hai',
 'Om Shanti Om',
 'Fanaa',
 'Rang De Basanti']