# Content Based Movie Recommendation System

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#dataset loading...
movie_df = pd.read_csv('moviedataset.csv')

In [3]:
movie_df.head()

Unnamed: 0,movie_id,Name of movie,Year of release,Rating,Votes,Genre
0,0,Sherlock Jr.,1924,8.2,43871,"Action,Comedy,Romance"
1,1,Hotarubi no mori e,2011,7.9,14884,"Animation,Drama,Fantasy"
2,2,Koto no ha no niwa,2013,7.5,37219,"Animation,Drama,Romance"
3,3,Blood: The Last Vampire,2000,6.7,12102,"Animation,Action,Horror"
4,4,Host,2020,6.5,20885,"Horror,Mystery"


In [4]:
# set name of movie column as the index of the dataframe 
movie_df.set_index('Name of movie')

Unnamed: 0_level_0,movie_id,Year of release,Rating,Votes,Genre
Name of movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Sherlock Jr.,0,1924,8.2,43871,"Action,Comedy,Romance"
Hotarubi no mori e,1,2011,7.9,14884,"Animation,Drama,Fantasy"
Koto no ha no niwa,2,2013,7.5,37219,"Animation,Drama,Romance"
Blood: The Last Vampire,3,2000,6.7,12102,"Animation,Action,Horror"
Host,4,2020,6.5,20885,"Horror,Mystery"
L'âge d'or,5,1930,7.3,13080,"Comedy,Drama"
Karakomik Filmler: 2 Arada,6,2019,6.4,10906,"Comedy,Drama"
Fear and Desire,7,1953,5.5,10609,"Drama,Thriller,War"
It's Such a Beautiful Day,8,2012,8.3,11800,"Animation,Comedy,Drama"
The Most Dangerous Game,9,1932,7.1,11043,"Action,Adventure,Horror"


In [5]:
movie_df.shape

(8957, 6)

In [6]:
# dropping the rows which having null values
movie_df.dropna(inplace=True)

In [7]:
movie_df.isnull().sum()

movie_id           0
Name of movie      0
Year of release    0
Rating             0
Votes              0
Genre              0
dtype: int64

In [8]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8957 entries, 0 to 8956
Data columns (total 6 columns):
movie_id           8957 non-null int64
Name of movie      8957 non-null object
Year of release    8957 non-null int64
Rating             8957 non-null float64
Votes              8957 non-null int64
Genre              8957 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 489.8+ KB


In [9]:
# creating genre dataframe along with movies name as a index
genre = pd.DataFrame(data=movie_df[['Name of movie','Genre']])
genre.set_index('Name of movie',inplace=True)

In [10]:
genre.head(5)

Unnamed: 0_level_0,Genre
Name of movie,Unnamed: 1_level_1
Sherlock Jr.,"Action,Comedy,Romance"
Hotarubi no mori e,"Animation,Drama,Fantasy"
Koto no ha no niwa,"Animation,Drama,Romance"
Blood: The Last Vampire,"Animation,Action,Horror"
Host,"Horror,Mystery"


In [11]:
# Assigning the countvectorizer 
count = CountVectorizer()
count_matrix = count.fit_transform(genre['Genre'])

# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [12]:
# Training the recommendation system
movie_name = list(movie_df['Name of movie'])
movie_name

['Sherlock Jr.',
 'Hotarubi no mori e',
 'Koto no ha no niwa',
 'Blood: The Last Vampire',
 'Host',
 "L'âge d'or",
 'Karakomik Filmler: 2 Arada',
 'Fear and Desire',
 "It's Such a Beautiful Day",
 'The Most Dangerous Game',
 'Byôsoku 5 senchimêtoru',
 'Winnie the Pooh',
 'Freaks',
 'Dumbo',
 'Our Hospitality',
 'Ma vie de Courgette',
 'The General',
 "Killer's Kiss",
 'Tetsuo',
 'The Kid',
 'Horse Feathers',
 'Detour',
 'The Adventures of Ichabod and Mr. Toad',
 'Astérix le Gaulois',
 'Interstella 5555: The 5tory of the 5ecret 5tar 5ystem',
 'Duck Soup',
 'I Walked with a Zombie',
 'The Land Before Time',
 'Following',
 'Steamboat Bill, Jr.',
 'Frankenstein',
 'The Wolf Man',
 'Bambi',
 'Manos: The Hands of Fate',
 'Une vie de chat',
 'The Invisible Man',
 'The Three Caballeros',
 '9 Songs',
 'The Secret of Kells',
 'The Party',
 'The Circus',
 'Animal Farm',
 'The Little Shop of Horrors',
 'Astérix et Cléopâtre',
 'La planète sauvage',
 'Begotten',
 "We're Back! A Dinosaur's Story",
 

In [13]:
def fullname(partial):
    return_list=[]
    for name in movie_name:
        a = partial.lower().split()
        count = 0
        for i in a:
            if name.lower().find(i) > -1:
                count = count+1
        if count==len(a):
            return_list.append(name)
    if partial in return_list:
        return partial
    else:
        return return_list[0]

In [14]:
# recommendation function to give the recommendation on the basis of genre and the highest votes.
def get_recommendation(title, cosine_sim = cosine_sim):
    name = fullname(title)
    
    recommended_movies = []

    idx = movie_df[movie_df['Name of movie']==name].index[0]

    # creating a Series with the similarity scores in descending order
    score_df = pd.DataFrame(cosine_sim[idx],columns=['score'])
    
    score_df=score_df.join(movie_df['Votes'])
    score_df=score_df.join(movie_df['Name of movie'])
    
    score_df = score_df[(score_df['Votes']>100000)].sort_values('score',ascending=False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_df.iloc[1:11].index)
    
    # getting the top 10 movies names and appending the given movie name at the starting
    top_10_movies=[name]+list(movie_df['Name of movie'].iloc[top_10_indexes])

    
        
    return top_10_movies

In [15]:
a=get_recommendation('Dracula')
a

['Dracula',
 'The Wolfman',
 'Crimson Peak',
 'It Chapter Two',
 'The Woman in Black',
 'The Lighthouse',
 'Doctor Sleep',
 'The Tree of Life',
 'The Shining',
 "Rosemary's Baby",
 'Carrie']

In [16]:
import dill

In [17]:
dill.settings['recurse'] = True

In [18]:
dill_out = open("recomm.pkl","wb")
dill.dump(get_recommendation,dill_out)

In [19]:
dill_out.close()