## importing libraries

In [None]:
import pandas as pd
import numpy as np

## loading datasets

In [None]:
# Read the movie metadata and the credits table from the local data folder.
movies, credits = pd.read_csv('./data/movies.csv'), pd.read_csv('./data/credits.csv')
# Report the (rows, columns) of both tables, then preview the first movie.
print(movies.shape, credits.shape)
movies.head(n=1)

In [None]:
# Preview the first two rows of the credits table.
credits.head(n=2)

In [None]:
# Join the credits onto the movie metadata using the shared "title" column.
# NOTE(review): titles are not guaranteed to be unique, so this join may emit
# extra rows for movies that share a title — confirm whether an id-based key
# is available in both files.
movies_df = movies.merge(credits, on="title")
# Inspect the merged frame (not the pre-merge `movies` frame) to verify the join.
print(movies_df.shape)
movies_df.head(1)

## Data preprocessing

In [None]:
# Keep only the identifier, the title and the descriptive fields used later on.
required_columns = ['movie_id', 'title', 'genres', 'overview', 'keywords', 'cast', 'crew']
movies_df = movies_df.loc[:, required_columns]
movies_df.head(n=1)

In [None]:
# Count the missing values in every column.
movies_df.isna().sum()

In [None]:
# Discard every row that contains at least one missing value.
movies_df = movies_df.dropna()

In [None]:
# Count the rows that are exact repeats of an earlier row.
movies_df.duplicated(keep='first').sum()

In [None]:
# Display the overview column.
movies_df.loc[:, 'overview']

In [None]:
# Break each overview string into a list of whitespace-separated words.
movies_df['overview'] = movies_df['overview'].map(str.split)

In [None]:
# Look at the raw genres values before they are parsed.
movies_df['genres'].to_numpy()

In [None]:
import ast

def convert(obj):
    '''
    Parse a string holding a literal list of dicts and collect the value
    stored under the 'name' key of every dict, in order.
    obj - string representation of the list of dicts
    '''
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
# Replace each raw genres string with the list of genre names it contains.
movies_df['genres'] = movies_df['genres'].map(convert)
movies_df.head(n=1)

In [None]:
# Replace each raw keywords string with the list of keyword names it contains.
movies_df['keywords'] = movies_df['keywords'].map(convert)

In [None]:
def convert2(obj, limit=3):
    '''
    Parse a string holding a literal list of dicts and return the 'name'
    value of at most the first `limit` dicts, in order.
    obj - string representation of the list of dicts
    limit - maximum number of names to return (defaults to 3)
    '''
    names = []
    for position, entry in enumerate(ast.literal_eval(obj)):
        # Stop before touching any entry beyond the requested amount.
        if position >= limit:
            break
        names.append(entry['name'])
    return names

In [None]:
# Keep only the first three cast names of every movie.
movies_df['cast'] = movies_df['cast'].map(convert2)

In [None]:
# Preview the first row after the cast conversion.
movies_df.head(n=1)

In [None]:
def convert_crew(obj):
    '''
    Parse a string holding a literal list of dicts and return a one-element
    list with the 'name' of the first dict whose 'job' is 'Director'.
    An empty list is returned when no such dict exists.
    obj - string representation of the list of dicts
    '''
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            # Only the first director is of interest.
            return [member['name']]
    return []

In [None]:
# Reduce every crew entry to a list holding the first director's name.
movies_df['crew'] = movies_df['crew'].map(convert_crew)

In [None]:
# Preview the first three rows after the crew conversion.
movies_df.head(n=3)

In [None]:
# Remove the spaces inside every name so that multi-word names become a
# single token (e.g. "Science Fiction" -> "ScienceFiction").
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies_df[column] = movies_df[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

In [None]:
# Preview the first two rows after the spaces were removed.
movies_df.head(n=2)

In [None]:
# Concatenate the per-movie token lists into one combined "tags" list.
movies_df['tags'] = (
    movies_df['genres']
    + movies_df['overview']
    + movies_df['keywords']
    + movies_df['cast']
    + movies_df['crew']
)
movies_df.head(n=3)

In [None]:
# Narrow the frame down to the identifier, the title and the combined tags.
final_columns = ['movie_id', 'title', 'tags']
new_movies_df = movies_df.loc[:, final_columns]
new_movies_df.head(n=5)

In [None]:
# Flatten each list of tags into a single space-separated string.
new_movies_df['tags'] = new_movies_df['tags'].map(" ".join)
# Show the tags of the row labelled 0.
new_movies_df.loc[0, 'tags']

In [None]:
# Lowercase every tags string.
new_movies_df['tags'] = new_movies_df['tags'].str.lower()
new_movies_df.head(n=5)

In [None]:
# stemming the words
import nltk

from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance; the stem() helper in this file calls ps.stem().
ps = PorterStemmer()

In [None]:
# Quick sanity check of the stemmer on a single word.
ps.stem("loving")

In [None]:
def stem(text):
    """
    Stem every word of a text with the shared stemmer `ps`.
    text - string of whitespace-separated words which needs to be stemmed
    Returns the stemmed words joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [None]:
# Stem every word of every tags string.
new_movies_df['tags'] = new_movies_df['tags'].map(stem)
new_movies_df.head(n=5)

### text vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Count-based vectorizer capped at 3000 features, ignoring English stop words.
cv = CountVectorizer(stop_words="english", max_features=3000)

In [None]:
# Fit the vectorizer on the tags, then convert the result to a dense array.
tag_counts = cv.fit_transform(new_movies_df['tags'])
vectors = tag_counts.toarray()
vectors

In [None]:
# Calculate the pairwise cosine similarity between the movies' tag vectors.
# Row i of the result is read by recommend() as the scores of the movie at
# position i against every movie.
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)

## Recommendation

In [None]:
# function to recommend movie
def recommend(movie, top_n=5):
    """
    Print the titles of the movies most similar to the given movie.
    movie - exact title to look up in new_movies_df['title']
    top_n - number of recommendations to print (defaults to 5)
    Raises IndexError when no row carries the given title.
    """
    # Rows of `similarity` are addressed by position, while the frame keeps the
    # index labels left over after rows were dropped. The title is therefore
    # resolved to a positional row number instead of an index label.
    positions = np.flatnonzero((new_movies_df['title'] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f'movie {movie!r} was not found in the dataset')
    movie_index = positions[0]

    # similarity scores of the chosen movie against every movie
    distances = similarity[movie_index]

    # rank all movies from most to least similar and leave out the movie
    # itself explicitly instead of assuming it is always ranked first
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
    movies_list = [pair for pair in ranked if pair[0] != movie_index][:top_n]

    print(f'Top {top_n} recommended movies for {movie} are: \n')
    for position, _score in movies_list:
        print(new_movies_df.iloc[position].title)

In [None]:
# Print the recommendations for one example title.
recommend(movie='Batman Begins')