In [31]:
import numpy as np
import pandas as pd
import ast
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [32]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

movies = movies.merge(credits, on="title")


In [33]:
movies['poster_url'] = "https://image.tmdb.org/t/p/w500" + movies['poster_path']


KeyError: 'poster_path'

In [18]:
movies = movies[['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]


In [19]:
movies.dropna(inplace=True)
movies.reset_index(drop=True, inplace=True)


In [20]:
def convert(text):
    return [i['name'] for i in ast.literal_eval(text)]

def fetch_director(text):
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            return [i['name']]
    return []

def collapse(L):
    return [i.replace(" ", "") for i in L]


In [21]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert).apply(lambda x: x[:3])
movies['crew'] = movies['crew'].apply(fetch_director)

movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)

movies['overview'] = movies['overview'].apply(lambda x: x.split())


In [22]:
movies['tags'] = (
    movies['overview'] +
    movies['genres'] +
    movies['keywords'] +
    movies['cast'] +
    movies['crew']
)


In [25]:
final_df = movies[['id', 'title', 'tags']].copy()
final_df['tags'] = final_df['tags'].apply(lambda x: " ".join(x))


In [26]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(final_df['tags']).toarray()


In [27]:
similarity = cosine_similarity(vectors)


In [28]:
print(len(final_df))
print(similarity.shape)


4806
(4806, 4806)


In [29]:
pickle.dump(final_df, open("movie_list.pkl", "wb"))
pickle.dump(similarity, open("similarity.pkl", "wb"))
