In [1]:
import numpy as np
import pandas as pd
import ast
import pickle

In [2]:
# Load the movies and credits CSV files; both paths are relative to the
# notebook's working directory.
listOfMovies = pd.read_csv('Datasets/tmdb_5000_movies.csv')
listOfCredits = pd.read_csv('Datasets/tmdb_5000_credits.csv') 

In [3]:
# Join movies with credits on the shared 'title' column (merge defaults to an
# inner join).
# NOTE(review): if two different movies share a title, a title-keyed join pairs
# every same-titled row across both files and duplicates rows. Consider joining
# on the id columns instead -- confirm the column names in both CSVs first.
moviesList = listOfMovies.merge(listOfCredits,on='title')

In [4]:
# Keep only the identifier, the title, and the five columns that are later
# combined into the 'tags' text.
moviesList = moviesList[['movie_id','title','overview','genres','keywords','cast','crew']]

In [5]:
# Demonstration: ast.literal_eval turns a string holding a Python literal into
# the corresponding object (here a list of dicts with 'id' and 'name' keys).
# Presumably this sample mirrors a raw 'genres' cell -- verify against the CSV.
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [6]:
def convert(text):
    """Parse a stringified list of dicts and return each entry's 'name' value.

    Args:
        text: String containing a Python literal that evaluates to an iterable
            of mappings, each with a 'name' key.

    Returns:
        List of the 'name' values, in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

In [7]:
# Drop every row that has a missing value, then renumber the index 0..n-1.
# Resetting matters: recommend_similar_movies looks a movie up by its index
# label and uses that number as a row position in the similarity matrix, so
# labels must equal positions. Without the reset, every row after a dropped
# one is off by the number of rows dropped before it.
# Rebinding (instead of inplace=True) also keeps this cell safe to re-run.
moviesList = moviesList.dropna().reset_index(drop=True)

In [8]:
# Replace each raw genres string with the list of genre names it encodes.
moviesList['genres'] = moviesList['genres'].map(convert)

In [9]:
# Replace each raw keywords string with the list of keyword names it encodes.
moviesList['keywords'] = moviesList['keywords'].map(convert)

In [10]:
def convert3(text):
    """Parse a stringified list of dicts and return the first three 'name' values.

    Args:
        text: String containing a Python literal that evaluates to an iterable
            of mappings; only the first three need a 'name' key.

    Returns:
        List with at most three names, in their original order.
    """
    parsed = ast.literal_eval(text)
    # Entries past the third are skipped without being inspected.
    return [entry['name'] for position, entry in enumerate(parsed) if position < 3]

In [11]:
# Replace each raw cast string with the full list of names it encodes.
moviesList['cast'] = moviesList['cast'].map(convert)

In [12]:
# Keep only the first three cast names of every movie.
moviesList['cast'] = moviesList['cast'].map(lambda names: names[:3])

In [13]:
def fetch_director(text):
    """Return the names of all crew entries whose job is 'Director'.

    Args:
        text: String containing a Python literal that evaluates to an iterable
            of mappings, each with 'job' and 'name' keys.

    Returns:
        List of matching names in their original order (empty if none match).
    """
    return [
        member['name']
        for member in ast.literal_eval(text)
        if member['job'] == 'Director'
    ]

In [14]:
# Reduce each raw crew string to the list of its directors' names.
moviesList['crew'] = moviesList['crew'].map(fetch_director)

In [15]:
def collapse(L):
    """Return a new list with every space character removed from each string.

    Only the literal space ' ' is stripped, so "Sam Worthington" becomes
    "SamWorthington" and each multi-word entry ends up as a single token.

    Args:
        L: Iterable of strings.

    Returns:
        New list of the space-free strings, in the same order.
    """
    return [item.replace(" ", "") for item in L]

In [16]:
# Strip spaces inside every entry of the four list-valued columns so that a
# multi-word name is treated as one token.
for column in ('cast', 'crew', 'genres', 'keywords'):
    moviesList[column] = moviesList[column].apply(collapse)

In [17]:
# Turn each overview string into a list of whitespace-separated words so it
# can be concatenated with the other list-valued columns.
moviesList['overview'] = moviesList['overview'].map(str.split)

In [18]:
# Concatenate the five per-movie token lists into a single 'tags' list.
moviesList['tags'] = (
    moviesList['overview']
    + moviesList['genres']
    + moviesList['keywords']
    + moviesList['cast']
    + moviesList['crew']
)

In [19]:
# The individual feature columns are now folded into 'tags'; build a new frame
# without them.
new = moviesList.drop(
    ['overview', 'genres', 'keywords', 'cast', 'crew'],
    axis='columns',
)

In [20]:
# Flatten each tag list into one space-separated string for the vectorizer.
new['tags'] = new['tags'].map(" ".join)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer capped at 5000 features, using the built-in English
# stop-word list.
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)

In [22]:
# Fit the vectorizer on the tag strings and convert the resulting count matrix
# into a dense array with one row per movie.
vector = cv.fit_transform(new['tags']).toarray()

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# Pairwise cosine similarity between all movie vectors; row i holds movie i's
# similarity to every movie, in the same row order as `new`.
similarity = cosine_similarity(vector)

In [25]:
def recommend_similar_movies(movie_name, top_n=15, movies=None, scores=None):
    """Print the titles of the movies most similar to ``movie_name``.

    Args:
        movie_name: Exact title to look up in the ``title`` column. If several
            rows share the title, the first one is used.
        top_n: Number of recommendations to print (default 15).
        movies: Frame with a ``title`` column whose row order matches
            ``scores``. Defaults to the notebook-level ``new`` frame.
        scores: Square similarity matrix where ``scores[i][j]`` is the
            similarity of row ``i`` to row ``j``. Defaults to the
            notebook-level ``similarity`` array.

    Raises:
        IndexError: If no row has the requested title.
    """
    # Fall back to the objects built by the earlier notebook cells.
    movies = new if movies is None else movies
    scores = similarity if scores is None else scores

    # Look up the row *position*, not the index label. The similarity matrix is
    # addressed by position, and labels differ from positions whenever the
    # frame's index has gaps (e.g. after rows were dropped).
    matches = np.flatnonzero((movies['title'] == movie_name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie_name!r} not found in the movie list")
    position = int(matches[0])

    ranked = sorted(enumerate(scores[position]), reverse=True, key=lambda pair: pair[1])
    # Exclude the query row explicitly instead of assuming it sorts first;
    # a tie or duplicated entry could otherwise push it down the ranking.
    neighbours = [row for row, _ in ranked if row != position]
    for row in neighbours[:top_n]:
        print(movies['title'].iloc[row])

In [26]:
# Smoke test: print the recommendations for one title.
recommend_similar_movies("Spider-Man")

Spider-Man 3
Spider-Man 2
The Amazing Spider-Man 2
The Amazing Spider-Man
Arachnophobia
Kick-Ass
21 Jump Street
X-Men
Small Soldiers
Superman III
Iron Man
The New Guy
Radio
Cry_Wolf
Hellboy II: The Golden Army


In [27]:
# Persist the movie table and the similarity matrix. The `with` blocks close
# (and therefore flush) each file deterministically; the previous inline
# open(...) calls left closing the handles to the garbage collector.
with open('movie_list.pkl', 'wb') as movie_file:
    pickle.dump(new, movie_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)