In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv("dataset/tmdb_5000_movies.csv")
credits = pd.read_csv("dataset/tmdb_5000_credits.csv")

In [3]:
movies = movies.merge(credits, on="title")

In [4]:
movies.shape

(4809, 23)

In [5]:
movies = movies[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   genres    4809 non-null   object
 4   keywords  4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 263.1+ KB


In [6]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [7]:
movies.dropna(inplace=True)

In [8]:
movies.duplicated().sum()

np.int64(0)

In [9]:
import ast

In [10]:
def preprocess_genres_and_keywords(obj):
    list = []
    for i in ast.literal_eval(obj):
        list.append(i["name"])
    return list

In [11]:
movies["genres"] = movies["genres"].apply(preprocess_genres_and_keywords)

In [12]:
movies["keywords"] = movies["keywords"].apply(preprocess_genres_and_keywords)

In [13]:
def preprocess_cast(obj):
    list = []
    counter = 0
    for i in ast.literal_eval(obj):
        if(counter != 3):
            list.append(i["name"])
        else:
            break
    return list

In [14]:
movies["cast"] = movies["cast"].apply(preprocess_cast)

In [15]:
def preprocess_crew(obj):
    list = []
    for i in ast.literal_eval(obj):
        if i["job"] == "Director":
            list.append(i["name"])
            break
    return list

In [16]:
movies["crew"] = movies["crew"].apply(preprocess_crew)

In [17]:
movies["overview"] = movies["overview"].apply(lambda x:x.split())

In [18]:
remove_spaces = lambda x:[i.replace(" ", "") for i in x]

movies["genres"].apply(remove_spaces)
movies["keywords"].apply(remove_spaces)
movies["cast"].apply(remove_spaces)
movies["crew"].apply(remove_spaces)

0           [JamesCameron]
1          [GoreVerbinski]
2              [SamMendes]
3       [ChristopherNolan]
4          [AndrewStanton]
               ...        
4804     [RobertRodriguez]
4805         [EdwardBurns]
4806          [ScottSmith]
4807          [DanielHsia]
4808     [BrianHerzlinger]
Name: crew, Length: 4806, dtype: object

In [19]:
movies["tags"] = movies["overview"] + movies["genres"] + movies["keywords"] + movies["cast"] + movies["crew"]

In [20]:
final_df = movies[["movie_id", "title", "tags"]]

In [21]:
final_df["tags"] = final_df["tags"].apply(lambda x:" ".join(x))
final_df["tags"] = final_df["tags"].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["tags"] = final_df["tags"].apply(lambda x:" ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["tags"] = final_df["tags"].apply(lambda x:x.lower())


In [22]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [23]:
def stem(text):
    y= []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [24]:
final_df['tags'] = final_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags'] = final_df['tags'].apply(stem)


In [25]:
final_df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."
...,...,...,...
4804,9367,El Mariachi,el mariachi just want to play hi guitar and ca...
4805,72766,Newlyweds,a newlyw couple' honeymoon is upend by the arr...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduc a dedic q..."
4807,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


In [26]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000, stop_words='english')

In [27]:
vectors = cv.fit_transform(final_df['tags']).toarray()
cv.get_feature_names_out()

array(['000', '10', '11', ..., 'zooey', 'zoë', 'zucker'],
      shape=(5000,), dtype=object)

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
similarity = cosine_similarity(vectors)

In [30]:
def recommend_movie(movie):
    movie_index = final_df[final_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(final_df.iloc[i[0]].title)

In [31]:
import pickle

In [32]:
pickle.dump(final_df.to_dict(),open('movie_dict.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))