In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os

In [None]:
# Load the two raw CSV tables (paths are relative to the notebook's directory).
movies = pd.read_csv('../Data/tmdb_5000_movies.csv')
credits = pd.read_csv('../Data/tmdb_5000_credits.csv')

In [None]:
# Preview the first two rows of the movies table.
movies.head(2)

In [None]:
# Preview the first two rows of the credits table.
credits.head(2)

In [None]:
# (rows, columns) of the movies table before merging.
movies.shape

In [None]:
# (rows, columns) of the credits table before merging.
credits.shape

In [None]:
# Join on the movie identifier instead of 'title'. Titles are not guaranteed to
# be unique, and a title-keyed merge pairs every same-titled movie with every
# same-titled credits row, producing extra rows that carry the wrong cast/crew.
# NOTE(review): assumes the movies table exposes its identifier as 'id' (the
# standard TMDB 5000 layout); 'movie_id' on the credits side is the column the
# later column selection relies on. Confirm against movies.columns.
# credits' own 'title' is dropped so the result keeps a single 'title' column;
# validate= makes pandas raise if either side ever contains duplicate ids.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
    validate='one_to_one',
)

In [None]:
# Shape after the merge; compare with the pre-merge shapes to spot unexpected row growth.
movies.shape

In [None]:
# First rows of the merged table.
movies.head()

In [None]:
# Column names available after the merge.
movies.columns

In [None]:
# Keeping important columns for recommendation.
# .copy() makes the result an independent frame, so the in-place edits and
# column assignments performed on it later do not raise SettingWithCopyWarning.
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew', 'release_date', 'vote_average']].copy()

In [None]:
# Shape after narrowing to the selected columns.
movies.shape

In [None]:
# Number of missing values in each column.
movies.isnull().sum()

In [None]:
# Drop every row that has a missing value in any column, then renumber the rows
# 0..n-1. The index must stay contiguous because index labels are later used as
# row positions into the similarity matrix (see `recommend`).
# Reassigning instead of inplace=True keeps the cell safe to re-run.
movies = movies.dropna().reset_index(drop=True)

In [None]:
# Derive the release year; errors='coerce' turns unparseable dates into NaT
# (and therefore a missing year) instead of raising.
movies['year'] = pd.to_datetime(movies['release_date'], errors='coerce').dt.year

In [None]:
# Count rows that are exact duplicates of an earlier row.
movies.duplicated().sum()

In [None]:
# Look at the raw 'genres' value of the first row before it is parsed.
movies.iloc[0]['genres']

In [None]:
import ast #for converting str to list

def convert(text):
    """Parse a stringified list of dicts and return the 'name' of each entry.

    ``text`` is evaluated with ``ast.literal_eval``; every resulting element
    must be a mapping that has a 'name' key.
    """
    records = ast.literal_eval(text)
    return [record['name'] for record in records]

In [None]:
# Replace each stringified genre list with a plain list of genre names.
# NOTE: not idempotent. Re-running this cell feeds lists (not strings) to
# ast.literal_eval inside `convert`, which raises.
movies['genres'] = movies['genres'].apply(convert)

In [None]:
# Check the converted 'genres' column in context.
movies.head()

In [None]:
# Display the parsed 'genres' column.
movies['genres']

In [None]:
# Look at one raw 'keywords' value before it is parsed.
movies.iloc[2]['keywords']

In [None]:
# Same conversion for keywords (same re-run caveat as for genres).
movies['keywords'] = movies['keywords'].apply(convert)
movies['keywords'].head()

In [None]:
# Keep only the leading cast members of each movie.

def convert_cast(text, limit=3):
    """Return the names of the first ``limit`` entries of a stringified cast list.

    Parameters
    ----------
    text : str
        String form of a list of dicts, each with a 'name' key.
    limit : int, default 3
        Maximum number of names to keep, taken in the order they appear.

    Returns
    -------
    list
        At most ``limit`` names; fewer if the list is shorter.
    """
    # Slice first so only the entries that are kept are touched.
    return [member['name'] for member in ast.literal_eval(text)[:limit]]

In [None]:
# Replace each stringified cast list with a short list of leading cast names.
# NOTE: not idempotent. `convert_cast` expects the original string values.
movies['cast'] = movies['cast'].apply(convert_cast)
movies.head()

In [None]:
# handling the crew
# Look at one raw 'crew' value before extracting the director from it.

movies.iloc[0]['crew']

In [None]:
def fetch_director(text):
    """Return a one-element list with the first crew member whose job is 'Director'.

    ``text`` is a stringified list of dicts carrying 'job' and 'name' keys.
    An empty list is returned when no entry has the job 'Director'.
    """
    for member in ast.literal_eval(text):
        if member['job'] != 'Director':
            continue
        # Only the first director is kept.
        return [member['name']]
    return []

In [None]:
# Reduce each stringified crew list to a list holding at most the director's name.
# NOTE: not idempotent. `fetch_director` expects the original string values.
movies['crew'] = movies['crew'].apply(fetch_director)

In [None]:
# Check the parsed columns.
movies.head(3)

In [None]:
# handle overview (converting to list)
# Look at the overview text of the first row before splitting it.

movies.iloc[0]['overview']

In [None]:
# Split each overview on whitespace so it becomes a list of words, the same
# container type as the other feature columns that are concatenated later.
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [None]:
# Same row again, now as a list of words.
movies.iloc[0]['overview']

In [None]:
# Removing Spaces

def remove_space(L):
    """Return a new list with every space removed from each string in ``L``.

    Example: ``['Anna Kendrick']`` becomes ``['AnnaKendrick']``, so a
    multi-word name stays a single token when the tags are later split on
    whitespace and vectorised.

    Parameters
    ----------
    L : list of str
        Strings to compact; the input list is not modified.
    """
    return [item.replace(" ", "") for item in L]

In [None]:
# Columns currently present on the working frame.
movies.columns

In [None]:
# Collapse multi-word entries (e.g. names) into single tokens in every
# list-valued feature column.
movies['cast'] = movies['cast'].apply(remove_space)
movies['crew'] = movies['crew'].apply(remove_space)
movies['genres'] = movies['genres'].apply(remove_space)
movies['keywords'] = movies['keywords'].apply(remove_space)

In [None]:
# Check the compacted columns.
movies.head()

In [None]:
# concatenate all tags
# Each operand holds Python lists, so `+` concatenates the lists row by row.

movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [None]:
# Check the new 'tags' column.
movies.head(3)

In [None]:
# Columns after adding 'tags'.
movies.columns

In [None]:
# Keep only the columns needed downstream. .copy() makes `new_data` independent
# of `movies`, so the later assignments to new_data['tags'] do not raise
# SettingWithCopyWarning.
new_data = movies[['movie_id',  'title', 'tags', 'year', 'vote_average']].copy()

new_data.head()

In [None]:
# converting list to str
# Join each row's tag list into one space-separated string.
new_data['tags'] = new_data['tags'].apply(lambda x:" ".join(x))
new_data.head()

In [None]:
# converting to lowercase

new_data['tags'] = new_data['tags'].apply(lambda x:x.lower())

In [None]:
# Check the normalised tags.
new_data.head()

In [None]:
import nltk
from nltk.stem import PorterStemmer

In [None]:
# Shared stemmer instance used by `stems`.
ps = PorterStemmer()

In [None]:
def stems(text):
    """Stem every whitespace-separated word of ``text`` with the module-level
    stemmer ``ps`` and return the stems joined by single spaces."""
    return " ".join(ps.stem(word) for word in text.split())

In [None]:
# Stem every word of every tag string.
new_data['tags'] = new_data['tags'].apply(stems)

In [None]:
# Inspect the stemmed tags of the first row.
new_data.iloc[0]['tags']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectoriser: vocabulary capped at 5000 terms, English stop words removed.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [None]:
# Fit the vocabulary and build the term-count matrix; .toarray() converts the
# sparse result into a dense NumPy array.
vector = cv.fit_transform(new_data['tags']).toarray()

In [None]:
# Term counts for the first row.
vector[0]

In [None]:
# (number of rows, vocabulary size)
vector.shape

In [None]:
# Size of the fitted vocabulary.
len(cv.get_feature_names_out())

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between all rows of `vector`; row/column i of the
# result corresponds to row *position* i of `new_data`.
similarity = cosine_similarity(vector)
similarity.shape

In [None]:
# Look up a title's index *label*. It equals the row position only while the
# frame's index is a contiguous 0..n-1 range.
new_data[new_data['title'] == 'The Lego Movie'].index[0]

In [None]:
def recommend(movie, top_n=5):
    """Print the titles of the ``top_n`` movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``new_data['title']``. If several rows share
        the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Prints a notice and returns without recommendations when the title is not
    present in ``new_data``.
    """
    # `similarity` is addressed by row *position*, so resolve the title to a
    # position rather than an index label; labels stop matching positions as
    # soon as rows have been dropped without resetting the index.
    positions = np.flatnonzero((new_data['title'] == movie).to_numpy())
    if positions.size == 0:
        print(f"Movie not found: {movie!r}")
        return
    position = positions[0]

    # Stable descending order by similarity score (ties keep row order).
    ranked = np.argsort(-similarity[position], kind='stable')
    # Exclude the query movie explicitly instead of assuming it sorts first;
    # another row with an identical tag vector can tie with it.
    picks = [idx for idx in ranked if idx != position][:top_n]
    for idx in picks:
        print(new_data['title'].iloc[idx])

In [None]:
# Smoke-test the recommender with a known title.
recommend('Spider-Man 2')

In [None]:
import pickle

In [None]:
# Persist the movie table and the similarity matrix for later use.
# Create the target directory first so a fresh checkout does not fail, and use
# context managers so both files are flushed and closed deterministically.
os.makedirs('artifacts', exist_ok=True)
with open('artifacts/movie_list.pkl', 'wb') as fh:
    pickle.dump(new_data, fh)
with open('artifacts/similarity.pkl', 'wb') as fh:
    pickle.dump(similarity, fh)