In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = movies.merge(credits,on = "title")

Columns Selection


In [None]:
''' genre, id(movie poster will be fetched this way), keywords, (since english movies are very much so language is not a contributing factor), title, overview, 
cast, crew [director], '''

Selecting useful data only

In [None]:
movies = movies[['movie_id','title','cast','genres','overview','crew','keywords']]

Data Pre-Processing (Duplicate, missing data)

In [None]:
movies.isnull().sum()
movies.dropna(inplace=True) # removing blank data
movies.isnull().sum()

In [None]:
#Duplicate Data
movies.duplicated().sum() # no duplicate data found


In [None]:
# Processing genre column of movies
import ast
def convert(obj):
    l = []
    for i in ast.literal_eval(obj):
        l.append(i['name'])
    return l

In [None]:
#run only once 
movies['genres'] = movies['genres'].apply(convert)

In [None]:
movies['keywords'] = movies['keywords'].apply(convert)

In [None]:
movies.head(1)

In [None]:
import ast
def convert_cast(obj):
    l1 = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            l1.append(i['name'])
            counter+=1
        else:
            break
    return l1


In [None]:
# refining cast names
movies['cast'] = movies['cast'].apply(convert_cast)

In [None]:
# refining director name
import ast
def convert_director(obj):
    l2 = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            l2.append(i['name'])
            break
    return l2

In [None]:
movies['crew'] = movies['crew'].apply(convert_director)

In [None]:
movies.head(1)

In [None]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [None]:
movies.head(1)

In [None]:
# replacing " "
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])

In [None]:
# creating a common tag
movies['tags'] = movies['overview']+movies['cast']+movies['crew']+movies['genres']+movies['keywords']

In [None]:
movies.head(1)

In [None]:
newmovie = movies[['movie_id','title','tags']]

In [None]:
newmovie.head(1)

In [None]:
#convert list to string
newmovie['tags'] = newmovie['tags'].apply(lambda x:" ".join(x))

In [None]:
newmovie.head(1)

In [None]:
newmovie['tags'][0]

In [None]:
newmovie['tags'] = newmovie['tags'].apply(lambda x:x.lower())

In [None]:
newmovie['tags'][0]


In [None]:
# order of tag - 'cast'+'crew'+'genres'+'keywords'+'overview'
# To find the similarity between movies using only tag we need to use vector as finding similarity between just texts is not possible with good results
# we will convert vector to text which will plot al the movies into vectors and then the closest vector to the liked movie will be taken as the possible 
# movie the user will like
# there are many methods to convert text to vector the simples one is bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000,stop_words = 'english')

In [None]:
vectors = cv.fit_transform(newmovie['tags']).toarray()

In [None]:
import sys
np.set_printoptions(threshold=sys.maxsize)
cv.get_feature_names_out()
# Here we have many words which mean the same but have been used to give a dimension to the vector such as 'action' and 'actions', so we need to equalize
# them and make vector formation more effective, we will apply stemming using nltk library

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [None]:
newmovie['tags'] = newmovie['tags'].apply(stem)
newmovie['tags'][0]
# order of tag - 'cast'+'crew'+'genres'+'keywords'+'overview'

In [None]:
#finding the cosine distance(angular distance) to find the similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)


In [None]:
similarity[0]

In [None]:
newmovie.head(1)
newmovie = newmovie


In [142]:
newmovie = newmovie.drop([4806,4807,4808])


In [150]:
#Finding the Movie
def recommend(movie):
    try:
        index1 = newmovie[newmovie['title'] == movie].index[0]
    except:
        index1 = -1
    if index1 == -1:
        print("Movie not in Database")
    elif index1 <=4806:
        distances = similarity[index1]
        movies_list = sorted(list(enumerate(distances)),reverse = True,key = lambda x:x[1])[1:10]
        for i in movies_list:
            print(newmovie['title'][i[0]])


In [156]:
#Enter the movie name:
recommend("Iron Man")

Iron Man 2
Iron Man 3
Avengers: Age of Ultron
Captain America: Civil War
The Avengers
Ant-Man
X-Men
X-Men: The Last Stand
Thor: The Dark World
