In [15]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

movies = pd.read_csv("/content/tmdb_5000_movies[1].csv")
credits = pd.read_csv("/content/tmdb_5000_credits[1].csv")

movies = movies.merge(credits, left_on="title", right_on="title")
movies = movies[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]

In [25]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4809 non-null   object
 3   genres    4809 non-null   object
 4   keywords  4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
 7   tags      4809 non-null   object
dtypes: int64(1), object(7)
memory usage: 300.7+ KB


In [26]:
movies.describe()

Unnamed: 0,movie_id
count,4809.0
mean,57120.571429
std,88653.369849
min,5.0
25%,9012.0
50%,14624.0
75%,58595.0
max,459488.0


In [27]:
movies.isnull().sum()

Unnamed: 0,0
movie_id,0
title,0
overview,0
genres,0
keywords,0
cast,0
crew,0
tags,0


In [16]:
def convert(text):
    L = []
    if isinstance(text, str) and text.strip():
        if text == "[]":
            return L
        try:
            for i in literal_eval(text):
                L.append(i['name'])
        except (ValueError, SyntaxError):
            pass

    return L

movies["genres"] = movies["genres"].apply(convert)
movies["keywords"] = movies["keywords"].apply(convert)

In [19]:
movies["overview"] = movies["overview"].apply(lambda x: x.split() if isinstance(x, str) else [])

movies["genres"] = movies["genres"].apply(lambda x: [i.replace(" ", "") for i in x])
movies["keywords"] = movies["keywords"].apply(lambda x: [i.replace(" ", "") for i in x])

movies["cast"] = movies["cast"].apply(convert)
movies["crew"] = movies["crew"].apply(convert)

movies["tags"] = movies["overview"] + movies["genres"] + movies["keywords"] + movies["cast"] + movies["crew"]

new_df = movies[["movie_id", "title", "tags"]]

In [None]:
new_df["tags"] = new_df["tags"].apply(lambda x: " ".join(x))
new_df["tags"] = new_df["tags"].str.lower()

In [21]:
cv = CountVectorizer(max_features=5000, stop_words="english")
vectors = cv.fit_transform(new_df["tags"]).toarray()

In [22]:
similarity = cosine_similarity(vectors)

In [23]:
def recommend(movie):
    movie_index = new_df[new_df["title"] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    for i in movies_list:
        print(new_df.iloc[i[0]].title)


In [24]:
recommend("Avatar")

Star Trek Into Darkness
The Lovers
Jupiter Ascending
The Time Machine
The Mummy: Tomb of the Dragon Emperor
