In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the movie table and the credits table from CSV.
# NOTE(review): both paths are empty placeholders -- fill in the dataset
# locations before running; the merge further down needs a 'title' column in both.
moviedf=pd.read_csv('')
creditsdf=pd.read_csv('')

In [1]:
moviedf.head(1)
# Preview the first row of the movie table to inspect its columns.

In [None]:
creditsdf['crew'][0]
# Show the raw 'crew' value of the row labelled 0 (the first row of the freshly read credits table).

In [6]:
# Join the credits columns onto the movie table, matching rows by 'title'.
# NOTE(review): titles may not be unique; if two films share a title this join
# produces extra cross-matched rows -- confirm, or join on an id key instead.
moviedf=moviedf.merge(creditsdf,on='title')

In [None]:
moviedf.head(1)
# Preview the first row after the merge.
# Next step: keep only the columns used later (genres, id, overview, keywords,
# title, vote_average, vote_count, cast, crew) and drop everything else.

In [None]:
# Count how many rows there are per original language.
moviedf['original_language'].value_counts()

In [None]:
# Summarise the columns: dtypes and non-null counts.
moviedf.info()

In [None]:
# Keep only the columns needed to build the tags and to display results.
moviedf=moviedf[['genres','id','overview','keywords','title','vote_average','vote_count','cast','crew']]
moviedf

Data Preprocessing

In [None]:
moviedf.isnull().sum()
# Number of missing values in each column.

In [12]:
# Drop every row that has a missing value, then renumber the index 0..n-1.
# Later cells mix label lookups (`series[0]`, `.index[0]`) with positional
# arrays (the similarity matrix), so index labels must equal row positions.
moviedf=moviedf.dropna().reset_index(drop=True)

In [None]:
moviedf.duplicated().sum()
# Number of rows that are exact duplicates of an earlier row.

In [None]:
# Raw 'genres' value of the first row (presumably still a string here, since the next cell parses it).
moviedf.iloc[0].genres

In [None]:
import ast
ast.literal_eval(moviedf.iloc[0].genres)
# Parses the string into the Python object it spells out
# (expected: a list of dicts with a 'name' key, which is how `convert` reads it).

In [None]:
def convert(obj):
    """Return the 'name' of every entry in a stringified list of dicts.

    obj is a string containing a Python literal such as
    "[{'id': 28, 'name': 'Action'}]"; it is parsed with ast.literal_eval
    and the 'name' values are returned in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]
# Replace each stringified genres value with its list of genre names.
moviedf['genres']=moviedf['genres'].apply(convert)


In [None]:
# Check the first row after the genres conversion.
moviedf.head(1)

In [None]:
# Parse the stringified keywords and cast columns into lists of names.
moviedf['keywords']=moviedf['keywords'].apply(convert)
moviedf['cast']=moviedf['cast'].apply(convert)
# Peek at the first row's raw crew value. Positional access (.iloc) is used
# so this works even when the row labelled 0 is no longer present.
moviedf['crew'].iloc[0]

In [None]:
def director(obj):
    """Return a one-element list with the name of the first "Director" in the crew.

    obj is a stringified list of crew dicts, each read through its 'job' and
    'name' keys. An empty list is returned when no entry has job "Director".
    """
    for member in ast.literal_eval(obj):
        if member['job']=="Director":
            # Only the first director is kept.
            return [member['name']]
    return []

In [None]:
# Replace the raw crew value with a list holding the first director's name (empty if none).
moviedf['crew']=moviedf['crew'].apply(director)

In [None]:
# Preview the table after the crew conversion.
moviedf.head()

In [None]:
# Split each overview into a list of words so it can be concatenated with the other list columns.
moviedf['overview']=moviedf['overview'].apply(lambda x:x.split())



In [None]:

# Collapse the spaces inside every multi-word entry so each one becomes a
# single token. Otherwise two different people such as "Sam Worthington" and
# "Sam Mendes" would share the token "Sam" and look partly alike to the model.
for column in ['cast','crew','keywords','genres']:
    moviedf[column]=moviedf[column].apply(lambda names:[name.replace(" ","") for name in names])
moviedf.head()

In [None]:
# Concatenate the five token lists (overview, keywords, cast, crew, genres)
# into a single 'tags' list per movie.
moviedf['tags']=moviedf['overview']+moviedf['keywords']+moviedf['cast']+moviedf['crew']+moviedf['genres']
moviedf.head()

In [None]:
# Keep only the columns the recommender needs. `.copy()` makes this an
# independent frame, so assigning to its 'tags' column in the following
# cells does not raise SettingWithCopyWarning.
movies_new_df=moviedf[['id','vote_average','title','vote_count','tags']].copy()
movies_new_df.head()

In [None]:
# Turn each list of tag tokens into one space-separated string.
movies_new_df['tags']=movies_new_df['tags'].apply(" ".join)
movies_new_df.head()

In [None]:
# Lower-case the tags so the same word in different capitalisation
# (e.g. "Action" / "action") is treated as one token.
movies_new_df['tags']=movies_new_df['tags'].str.lower()
# Show the first row's tags; positional access, since labels need not start at 0.
movies_new_df['tags'].iloc[0]

Vectorization of text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')
# max_features=5000 caps the vocabulary at 5000 terms.
# stop_words='english' removes common English stop words before counting.
# TfidfVectorizer could be used instead of CountVectorizer; it weights terms
# by how distinctive they are rather than by raw counts.

vectormovie=cv.fit_transform(movies_new_df['tags']).toarray()
# Dense count matrix: one row per movie, one column per vocabulary term.


In [None]:
# Count vector of the first movie.
vectormovie[0]

In [None]:

cv.get_feature_names_out()
# Shows the vocabulary terms the vectorizer kept.

In [None]:
import re
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
# The stemmer reduces inflected words to a common stem (e.g. "loved" -> "love").
# Stemming is optional, but it merges word variants and so shrinks the vocabulary.
def stemming(content):
    """Stem every whitespace-separated word of content and re-join them with single spaces."""
    return " ".join(ps.stem(word) for word in content.split())
# Replace each tag string with its stemmed version.
movies_new_df['tags']=movies_new_df['tags'].apply(stemming)
# Show the first row's stemmed tags; positional access, since labels need not start at 0.
movies_new_df['tags'].iloc[0]

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
# Re-fit the vectorizer on the stemmed tags; this replaces the earlier cv / vectormovie.
cv=CountVectorizer(max_features=5000,stop_words='english')
vectormovie=cv.fit_transform(movies_new_df['tags']).toarray()

In [None]:

cv.get_feature_names_out()
# Shows the vocabulary terms kept after stemming.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
similar=cosine_similarity(vectormovie)
# Pairwise cosine similarity between all movie vectors: similar[i][j] compares
# movie i with movie j, and a HIGHER value means the two movies are more alike,
# so the highest-scoring movies are the ones to recommend.
similar


In [None]:
#We now create a recommender function that suggests movies similar to one the user has already watched.
def recommend_movie(movie, top_n=5, movies=None, similarity=None):
    """Print the titles of the movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in the 'title' column.
    top_n : int, default 5
        Number of recommendations to print.
    movies : DataFrame, optional
        Frame with a 'title' column whose row order matches `similarity`.
        Defaults to the notebook-level `movies_new_df`.
    similarity : 2-D array, optional
        Similarity matrix where row i belongs to row i of `movies`.
        Defaults to the notebook-level `similar`.

    Raises
    ------
    IndexError
        If no row has the given title.
    """
    if movies is None:
        movies=movies_new_df
    if similarity is None:
        similarity=similar
    titles=movies['title'].tolist()
    if movie not in titles:
        raise IndexError(f"movie title not found: {movie!r}")
    # Use the row POSITION, not the index label: the similarity matrix is
    # positional, while index labels can have gaps after rows were dropped.
    position=titles.index(movie)
    scores=similarity[position]
    # Rank every other movie by similarity, highest first. The query movie is
    # excluded explicitly instead of assuming it always sorts into first place.
    candidates=[idx for idx in range(len(scores)) if idx!=position]
    ranked=sorted(candidates,key=lambda idx:scores[idx],reverse=True)
    for idx in ranked[:top_n]:
        print(titles[idx])

# Example: print the recommendations for 'The Avengers'.
recommend_movie('The Avengers')

In [38]:
import pickle
# Save the movie table as a plain dict. The with-block guarantees the file
# is flushed and closed even if pickling fails.
with open('movies.pkl','wb') as movies_file:
    pickle.dump(movies_new_df.to_dict(),movies_file)

In [39]:
# Save the similarity matrix. The with-block guarantees the file is
# flushed and closed even if pickling fails.
with open('similarity.pkl','wb') as similarity_file:
    pickle.dump(similar,similarity_file)