In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import ast
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [3]:
movies = pd.read_csv("data/tmdb_5000_movies.csv")
credits = pd.read_csv("data/tmdb_5000_credits.csv")

In [4]:
movies = movies.merge(credits, on = "title")

In [5]:
movies = movies[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]

In [6]:
movies.isnull().sum

<bound method DataFrame.sum of       movie_id  title  overview  genres  keywords   cast   crew
0        False  False     False   False     False  False  False
1        False  False     False   False     False  False  False
2        False  False     False   False     False  False  False
3        False  False     False   False     False  False  False
4        False  False     False   False     False  False  False
...        ...    ...       ...     ...       ...    ...    ...
4804     False  False     False   False     False  False  False
4805     False  False     False   False     False  False  False
4806     False  False     False   False     False  False  False
4807     False  False     False   False     False  False  False
4808     False  False     False   False     False  False  False

[4809 rows x 7 columns]>

In [7]:
movies.dropna(inplace=True)

In [8]:
def convert(text):
    result = []
    for i in ast.literal_eval(text):
        result.append(i["name"])
    return result

In [9]:
movies["genres"] = movies["genres"].apply(convert)

In [10]:
movies["keywords"] = movies["keywords"].apply(convert)

In [11]:
def convert_cast(text):
    result = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter<5:
            result.append(i["name"])
        counter+= 1
    return result

In [12]:
movies["cast"] = movies["cast"].apply(convert_cast)

In [13]:
def convert_crew(text):
    result = []
    for i in ast.literal_eval(text):
        if i["job"] == "Director":
            result.append(i["name"])
            break
    return result

In [14]:
movies["crew"] = movies["crew"].apply(convert_crew)

In [15]:
movies["overview"] = movies["overview"].apply(lambda x:x.split())

In [16]:
def remove_space(word):
    result = []
    for i in word:
        result.append(i.replace(" ", ""))
    return result

In [None]:
movies["crew"] = movies["crew"].apply(remove_space)
movies["cast"] = movies["cast"].apply(remove_space)
movies["genres"] = movies["genres"].apply(remove_space)
movies["keywords"] = movies["keywords"].apply(remove_space)

In [18]:
movies["tags"] = movies["overview"] + movies["genres"] + movies["keywords"] + movies["cast"] + movies["crew"]

In [19]:
new_dataframe = movies[["movie_id", "title", "tags"]]

In [59]:
new_dataframe.loc[:, "tags"] = new_dataframe["tags"].apply(lambda x: " ".join(x))

In [61]:
new_dataframe.loc[:, "tags"] = new_dataframe["tags"].apply(lambda x: x.lower())

In [22]:
ps = PorterStemmer()

In [23]:
def stems(text):
    result = []
    for i in text.split():
        result.append(ps.stem(i))
    return " ".join(result)

In [63]:
new_dataframe.loc[:, "tags"] = new_dataframe["tags"].apply(stems)

In [25]:
cv = CountVectorizer(max_features = 5000, stop_words = "english")

In [26]:
vector = cv.fit_transform(new_dataframe["tags"]).toarray()

In [27]:
similarity = cosine_similarity(vector)

In [38]:
def recommend_movie(movie):
    index = new_dataframe[new_dataframe["title"] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse = True, key= lambda x: x[1])
    for i in distances[1:6]:
        print(new_dataframe.iloc[i[0]].title)

In [65]:
pickle.dump(new_dataframe, open("pickle/movie_list.pkl", "wb"))
pickle.dump(similarity, open("pickle/similarity.pkl", "wb"))