In [128]:
import pandas as pd
import numpy as np

In [129]:
# Load the two CSV files this notebook works from. Paths are relative, so the
# files must sit in the kernel's current working directory.
credits_df = pd.read_csv("tmdb_5000_credits.csv")
movies_df = pd.read_csv("tmdb_5000_movies.csv")

In [130]:
# Relabel the credits columns positionally (this requires exactly four columns)
# so the first one is called "id" and can serve as the join key.
credits_df.columns = ['id', 'title', 'cast', 'crew']

# Join the credits onto the movies by "id" (pandas' default join type).
# NOTE(review): any column name present in both frames besides "id" gets
# pandas' _x/_y suffixes, and re-running this cell merges a second time --
# confirm against the actual CSV schemas.
movies_df = pd.merge(movies_df, credits_df, on="id")

In [131]:
from ast import literal_eval

# Columns whose cells are parsed from their string form into Python objects.
features = ["cast", "crew", "keywords", "genres"]

# literal_eval only evaluates Python literals, so it is safe to run on the
# raw cell text (unlike eval).
for feature in features:
    movies_df[feature] = movies_df[feature].map(literal_eval)

In [132]:
def get_director(x):
    """Return the name of the first crew entry whose job is "Director".

    Parameters
    ----------
    x : list (or tuple) of dict
        Crew entries; an entry counts as a director when its "job" value
        equals "Director". Any other input (for example NaN from a missing
        cell) is treated as "no crew", mirroring how ``get_list`` handles
        non-list input.

    Returns
    -------
    str or float
        The "name" of the first matching entry, or ``np.nan`` when the input
        is not a list/tuple, no entry matches, or the match has no "name".
    """
    if not isinstance(x, (list, tuple)):
        return np.nan

    for member in x:
        # Skip malformed entries instead of raising KeyError/TypeError.
        if isinstance(member, dict) and member.get("job") == "Director":
            return member.get("name", np.nan)

    return np.nan

In [133]:
def get_list(x, limit=3):
    """Return the "name" values of the first ``limit`` entries of ``x``.

    Parameters
    ----------
    x : list of dict
        Entries that each carry a "name" key. Anything that is not a list
        yields an empty list.
    limit : int or None, optional
        Maximum number of names to return (default 3, the previous
        hard-coded cap). ``None`` returns every name.

    Returns
    -------
    list
        Up to ``limit`` names, in their original order.
    """
    if not isinstance(x, list):
        return []

    # Slice before extracting so only the entries actually kept are touched.
    return [entry["name"] for entry in x[:limit]]

In [134]:
# Derive the director column from the parsed crew entries.
movies_df["director"] = movies_df["crew"].map(get_director)

# Reduce each of these list-of-dict columns to a short list of names.
features = ["cast", "keywords", "genres"]
for feature in features:
    movies_df[feature] = movies_df[feature].map(get_list)

In [135]:
def clean_data(row):
    """Strip spaces from, and lowercase, a string or every string in a list.

    A list is returned as a new list of normalised items, a string as a
    normalised string, and any other value (e.g. NaN or None) as "".
    """
    def _normalise(text):
        # Remove spaces first, then lowercase.
        return str.lower(text.replace(" ", ""))

    if isinstance(row, list):
        return list(map(_normalise, row))
    if isinstance(row, str):
        return _normalise(row)
    return ""

# Normalise every text feature that later goes into the soups.
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    movies_df[feature] = movies_df[feature].map(clean_data)

In [136]:
def create_cast_soup(features):
    """Build one space-separated string from a row's keywords, cast, director and genres.

    ``features`` is indexed by the keys 'keywords', 'cast', 'director' and
    'genres'; 'director' must be a string, the others iterables of strings.
    """
    sections = (
        ' '.join(features['keywords']),
        ' '.join(features['cast']),
        features['director'],
        ' '.join(features['genres']),
    )
    return ' '.join(sections)

def create_genre_soup(features):
    """Build one space-separated string from a row's genres followed by its keywords.

    ``features`` is indexed by the keys 'genres' and 'keywords', each an
    iterable of strings.
    """
    genre_text = ' '.join(features['genres'])
    keyword_text = ' '.join(features['keywords'])
    return f"{genre_text} {keyword_text}"

# Build both text "soups" one row at a time: axis=1 passes each row to the
# builder, which reads the cleaned feature columns by name.
movies_df["cast_soup"] = movies_df.apply(create_cast_soup, axis=1)
movies_df["genre_soup"] = movies_df.apply(create_genre_soup, axis=1)

In [137]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [138]:
# Token-count vectorizer configured with scikit-learn's built-in English
# stop-word list. The same instance is fitted on both soup columns below.
count_vectorizer = CountVectorizer(stop_words="english")

In [139]:
# Fit the vectorizer on the cast soup, then compare the resulting matrix with
# itself so cosine_sim_cast[i][j] is the similarity between rows i and j.
count_matrix = count_vectorizer.fit_transform(movies_df["cast_soup"])
cosine_sim_cast = cosine_similarity(count_matrix, count_matrix)

In [140]:
# Same procedure for the genre soup.
# NOTE(review): this refits the shared count_vectorizer and rebinds
# count_matrix, so after this cell both reflect genre_soup only; the cast
# matrix and vocabulary are no longer reachable through these names.
count_matrix = count_vectorizer.fit_transform(movies_df["genre_soup"])
cosine_sim_genre = cosine_similarity(count_matrix, count_matrix)

In [141]:
# Renumber rows 0..n-1 so index labels line up with positions in the
# similarity matrices. drop=True discards the old index instead of inserting
# it as a new column, which keeps the frame free of a junk "index" column and
# makes this cell safe to re-run.
movies_df = movies_df.reset_index(drop=True)

# Lookup from original_title to row number.
indices = pd.Series(movies_df.index, index=movies_df['original_title'])

In [142]:
# Lookup from original_title to row number.
indices = pd.Series(movies_df.index, index=movies_df["original_title"])

# Keep only the first row for each title. Series.drop_duplicates() compares
# the *values* (row numbers, which are already unique), so it never removed
# repeated titles; de-duplicate on the index labels so indices[title] is
# always a single row number.
indices = indices[~indices.index.duplicated(keep="first")]

In [143]:
def get_recommendations(title, cosine_sim, top_n=10):
    """Return the titles most similar to ``title`` according to ``cosine_sim``.

    Parameters
    ----------
    title : str
        A value of ``original_title`` present in the module-level ``indices``
        lookup.
    cosine_sim : 2-D indexable
        Pairwise similarity scores; ``cosine_sim[i]`` holds the scores of row
        ``i`` against every row of ``movies_df``.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        ``original_title`` of the ``top_n`` best-scoring rows, best first,
        never including the queried row itself.

    Raises
    ------
    KeyError
        If ``title`` is not in ``indices``.
    """
    idx = indices[title]
    # A title shared by several rows yields a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Drop the queried row explicitly: it is not guaranteed to sort first
    # (another row can tie with it, and an empty soup scores 0 with itself).
    scored = [
        (row, score)
        for row, score in enumerate(cosine_sim[idx])
        if row != idx
    ]
    # The sort is stable, so tied rows keep their original order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    top_rows = [row for row, _ in scored[:top_n]]
    return movies_df["original_title"].iloc[top_rows]

In [144]:
# Recommendations for "Twilight" using the cast-soup similarity matrix.
print(get_recommendations("Twilight", cosine_sim_cast))

172     The Twilight Saga: Breaking Dawn - Part 2
612                    The Twilight Saga: Eclipse
898                   The Twilight Saga: New Moon
80                    Snow White and the Huntsman
1132                              Red Riding Hood
1958                                  On the Road
3043                             End of the Spear
410                                 Mirror Mirror
583                                      Big Fish
777         The Mortal Instruments: City of Bones
Name: original_title, dtype: object


In [152]:
# Recommendations for "Twilight" using the genre-soup similarity matrix.
# NOTE(review): this cell's execution count (152) does not follow the previous
# cell's (144) -- confirm the notebook reproduces under Restart & Run All.
print(get_recommendations("Twilight", cosine_sim_genre))

172     The Twilight Saga: Breaking Dawn - Part 2
612                    The Twilight Saga: Eclipse
898                   The Twilight Saga: New Moon
967                                     Hereafter
3043                             End of the Spear
80                    Snow White and the Huntsman
410                                 Mirror Mirror
583                                      Big Fish
777         The Mortal Instruments: City of Bones
812                                    Pocahontas
Name: original_title, dtype: object
