In [1]:
# lets import the basic Libraries
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [2]:
#reading the cleaned data into a database
movies_df = pd.read_csv('cleanedmovies.csv')

In [3]:
#printing the first 5 rows of the database
movies_df.head()

Unnamed: 0,Title,Rating,Description,Country,Episodes,Year,Genre1,Genre2,Genre3,Genre4,Genre5,Genre6,Cast1,Cast2,Cast3,Cast4,Cast5,Cast6
0,Amazing Saturday,3.5,the poster shows shin dong yup girls days hyer...,korean,100.0,2018,drama,comedy,drama,food,music,korean,shin dong yup,key,park na rae,hyeri,shin dong yup,shin dong yup
1,I Can See Your Voice Season 9,3.0,i can see your voice is a mystery music game s...,korean,10.0,2022,drama,kshow,music,mystery,korean,drama,ha sung woon,,lee teuk,yoo se yoon,lee hyun yi,kim seung hyun
2,Omniscient Interfering View,3.0,the managers who know more about the stars the...,korean,200.0,2017,drama,game-show,kshow,korean,drama,drama,,,,,,
3,Mr. House Husband,3.0,dear valued customer 1 dramacool regularly upd...,korean,200.0,2016,drama,drama,family,food,kshow,korean,kim eli,lee chul min,son tae young,kim jung tae,kim seung woo,bong tae gyu
4,The Game Caterers Season 2,4.0,a business trip program with pd na young seok ...,korean,12.0,2022,drama,business,kshow,korean,drama,drama,im ji yeon,lee seo jin,jung hyung suk,lee jung jae,jung woo sung,oh se hun


In [4]:
#checking the types of the columns
movies_df.dtypes

Title           object
Rating         float64
Description     object
Country         object
Episodes       float64
Year             int64
Genre1          object
Genre2          object
Genre3          object
Genre4          object
Genre5          object
Genre6          object
Cast1           object
Cast2           object
Cast3           object
Cast4           object
Cast5           object
Cast6           object
dtype: object

In [5]:
#checking for null values
movies_df.isna().sum()

Title             0
Rating            0
Description       3
Country           0
Episodes          7
Year              0
Genre1            0
Genre2            0
Genre3            0
Genre4            0
Genre5            0
Genre6            0
Cast1          1256
Cast2          1256
Cast3          1256
Cast4          1256
Cast5          1256
Cast6          1256
dtype: int64

In [6]:
#filling the null values with an empty space
movies_df.fillna("", inplace =True)

In [7]:
#checking out the columns of a dataframe
movies_df.columns

Index(['Title', 'Rating', 'Description', 'Country', 'Episodes', 'Year',
       'Genre1', 'Genre2', 'Genre3', 'Genre4', 'Genre5', 'Genre6', 'Cast1',
       'Cast2', 'Cast3', 'Cast4', 'Cast5', 'Cast6'],
      dtype='object')

In [8]:
#selecting the features to be used for building our recommendation system
features = ['Title', 'Description', 'Country','Genre1', 'Genre2', 'Genre3', 'Genre4', 'Genre5', 'Genre6', 'Cast1', 'Cast2', 'Cast3', 'Cast4', 'Cast5', 'Cast6']

In [9]:
#concatenating the columns together
selected_features= movies_df['Title']+' '+ movies_df['Description']+' '+movies_df['Country']+' '+movies_df['Genre1']+' '+movies_df['Genre2']+' '+movies_df['Genre3']+' '+movies_df['Genre4']+' '+movies_df['Genre5']+' '+['Genre6']+' '+['Cast1']+' '+['Cast2']+' '+['Cast3']+' '+['Cast4']+' '+['Cast5']+' '+['Cast6']

In [10]:
#checking out the concatenated columns
selected_features

0       Amazing Saturday  the poster shows shin dong y...
1       I Can See Your Voice Season 9  i can see your ...
2       Omniscient Interfering View  the managers who ...
3       Mr. House Husband  dear valued customer 1 dram...
4       The Game Caterers Season 2  a business trip pr...
                              ...                        
5623    Provocateur  after the deaths of his first and...
5624    Married But Available  after experiencing a ne...
5625    My Dearly Sinful Mind  after his girlfriend di...
5626    The No No Girl  law lai jings love life has be...
5627    My Unfair Lady  every one out of three hong ko...
Length: 5628, dtype: object

In [11]:
#initializing the tfidf vectorizer
vectorizer = TfidfVectorizer()

In [12]:
#using the tfidf vectorizer on the selected features.
feature_vectors = vectorizer.fit_transform(selected_features)

In [13]:
print(feature_vectors)

  (0, 2925)	0.01993429931092891
  (0, 2924)	0.01993429931092891
  (0, 2923)	0.01993429931092891
  (0, 2922)	0.01993429931092891
  (0, 2921)	0.01993429931092891
  (0, 2920)	0.01993429931092891
  (0, 7470)	0.01993429931092891
  (0, 12526)	0.0733552256579647
  (0, 6998)	0.0853951285257509
  (0, 3732)	0.042507849113625824
  (0, 5535)	0.04037466968819127
  (0, 10565)	0.035516821441397464
  (0, 9977)	0.11854544326784888
  (0, 975)	0.06436400713658638
  (0, 8177)	0.05102937866662311
  (0, 8229)	0.04636330496224309
  (0, 1250)	0.10048017596591843
  (0, 9628)	0.05790254094580213
  (0, 2390)	0.12565550835703235
  (0, 1525)	0.05342580548743562
  (0, 9136)	0.15636320087530897
  (0, 12657)	0.08780218553173347
  (0, 8497)	0.045022678939715816
  (0, 21328)	0.038427858175357384
  (0, 21121)	0.08696711896128638
  :	:
  (5627, 10880)	0.06452019992851102
  (5627, 13270)	0.0593142012515144
  (5627, 9115)	0.08822157412930072
  (5627, 19368)	0.06348610611589839
  (5627, 10513)	0.06006641731993202
  (5627, 2

In [14]:
#checking the similarity between the feature vectors
similarity = cosine_similarity(feature_vectors)

In [15]:
print(similarity)

[[1.         0.03987866 0.08019611 ... 0.04382662 0.0610041  0.02602478]
 [0.03987866 1.         0.10490879 ... 0.02352589 0.04628462 0.03657855]
 [0.08019611 0.10490879 1.         ... 0.03940912 0.06214665 0.05127781]
 ...
 [0.04382662 0.02352589 0.03940912 ... 1.         0.09364563 0.05889837]
 [0.0610041  0.04628462 0.06214665 ... 0.09364563 1.         0.09178401]
 [0.02602478 0.03657855 0.05127781 ... 0.05889837 0.09178401 1.        ]]


In [16]:
print(similarity.shape)

(5628, 5628)


In [None]:

movie_name = input('Enter your favorte movie:')

In [None]:
movies_df['Title'] = movies_df['Title'].str.rstrip()

In [None]:
list_of_all_title = movies_df['Title'].tolist()
print(list_of_all_title)

In [None]:
find_close_match = difflib.get_close_matches(movie_name, list_of_all_title)
print(find_close_match)

In [None]:
close_match = find_close_match[2]
print(close_match)

In [None]:
#find the index of the movie title
index_of_the_movie = movies_df[movies_df.Title == close_match].index.values[0]
print(index_of_the_movie)

In [None]:
sim_score = list(enumerate(similarity[index_of_the_movie]))

In [None]:
len(sim_score)

In [None]:
#sorting the movies based on their similarity score
sorted_similar_movies = sorted(sim_score, key = lambda x:x[1], reverse=True)
print(sorted_similar_movies[:10])

In [None]:
#print the name of similar movies based on the index

print("Movies suggested for you : \n")

i = 1

for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = movies_df[movies_df.index == index]['Title'].values[0]
    if i<30:
        print(i, '-', title_from_index)
        i += 1
    

# Putting everything together in a function.

In [None]:
def movie_rec_system(movie_name):
    if movie_name in movies_df['Title']:
        list_of_all_title = movies_df['Title'].tolist()
        find_close_match = difflib.get_close_matches(movie_name, list_of_all_title)
        close_match = find_close_match[2]
        index_of_the_movie = movies_df[movies_df.Title == close_match].index.values[0]
        sim_score = list(enumerate(similarity[index_of_the_movie]))
        sorted_similar_movies = sorted(sim_score, key = lambda x:x[1], reverse=True)

        print("Movies suggested for you : \n")

        i = 1

        for movie in sorted_similar_movies:
            index = movie[0]
            title_from_index = movies_df[movies_df.index == index]['Title'].values[0]
            if i<15:
                print(i, '-', title_from_index)
                i += 1
    else:
         print("This movie is not found in our database. Please check out the following movies \n", movies_df[['Title', 'Country', 'Genre2']].sample(10).reset_index(drop=True))

In [None]:
movie_name = input('Enter your favorte movie:')
movie_rec_system(movie_name)