In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
moviee = pd.read_csv('tmdb_5000_movies.csv') # importing datasets
credits = pd.read_csv('tmdb_5000_credits.csv')

In [None]:
# Join on the TMDB movie id rather than the title: titles are not unique, so a
# title join can pair a movie with another film's credits and emit extra rows.
# Assumes the movies file exposes the id as `id` and the credits file as
# `movie_id` - TODO confirm against the CSV headers.
# The credits' own `title` column is dropped so the merged frame keeps a single,
# unsuffixed `title` column for the cells below.
movie = moviee.merge(credits.drop(columns='title'), left_on='id', right_on='movie_id')

In [None]:
movie.head()

In [None]:
# Columns kept for the recommender:
#   movie_id, title            - identifiers
#   overview, genres, keywords - content description
#   cast, crew                 - people involved
#
# .copy() makes `movies` an independent frame, so the column assignments and
# row drops in later cells do not operate on a slice of `movie` (which raises
# pandas' SettingWithCopyWarning).
movies = movie[['movie_id','title','overview','genres','keywords','cast','crew']].copy()

In [None]:
movies.info()

In [None]:
movies.head()

In [None]:
movies.isnull().sum()

In [None]:
# Drop every row that has a missing value in any of the selected columns.
# Rebinding the name (instead of inplace=True) avoids mutating a frame that was
# sliced out of `movie`, which is what triggers SettingWithCopyWarning.
movies = movies.dropna()

In [None]:
movies.duplicated().sum()

In [None]:
movies.iloc[0].genres

In [None]:
#'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
#['Action','Adventure','Fantasy','SciFi'] 

In [None]:
import ast
def convert(obj):
    """Return the 'name' of every entry in a stringified list of dicts.

    `obj` is parsed with ast.literal_eval, so it must be the text form of a
    Python/JSON-style list whose items are dicts containing a 'name' key.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
movies['genres'] = movies['genres'].apply(convert)

In [None]:
movies.head()

In [None]:
movies['keywords'] = movies['keywords'].apply(convert)

In [None]:
movies.head()

In [None]:
# selecting 3 main actors
def convert3(text, limit=3):
    """Return the 'name' of the first `limit` entries of a stringified list.

    Parameters
    ----------
    text : str
        Text form of a list of dicts (parsed with ast.literal_eval); each of
        the entries that is kept must contain a 'name' key.
    limit : int, optional
        Maximum number of names to return. Defaults to 3, the number of lead
        actors this notebook keeps per movie.

    Returns
    -------
    list
        Up to `limit` names, in their original order.
    """
    names = []
    for position, member in enumerate(ast.literal_eval(text)):
        if position >= limit:
            # Stop as soon as enough names were collected instead of walking
            # the remainder of the list.
            break
        names.append(member['name'])
    return names

In [None]:
movies['cast'] = movies['cast'].apply(convert3)

In [None]:
movies.head()

In [None]:
# selecting director name
import ast
def fetch_director(obj):
    """Return a one-element list holding the first director's name.

    `obj` is the text form of a list of crew dicts (parsed with
    ast.literal_eval). The first entry whose 'job' equals 'Director' wins;
    an empty list is returned when there is no such entry.
    """
    for member in ast.literal_eval(obj):
        if member['job'] != 'Director':
            continue
        return [member['name']]
    return []

In [None]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [None]:
movies.head()

In [None]:
# Split each overview string into a list of whitespace-separated words so it
# can be concatenated with the other list-valued columns when building `tags`.
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [None]:
movies.head()

In [None]:
# Collapse multi-word names into single tokens (e.g. "Science Fiction" ->
# "ScienceFiction") in every list-valued name column.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

In [None]:
movies.head()

In [None]:
# Simplifying the data
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [None]:
movies.head()

In [None]:
# Keep only the columns the recommender needs.
# reset_index(drop=True) returns a fresh frame (so later column assignments do
# not act on a slice of `movies`) and renumbers the rows 0..n-1. Rows were
# dropped earlier, so without this the index labels would not line up with the
# row positions of the vector / similarity matrices built from this frame.
new_df = movies[['movie_id','title','tags']].reset_index(drop=True)

In [None]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

In [None]:
 new_df.head()

In [None]:
# Lower-case every tag string so that matching is case-insensitive.
new_df['tags'] = new_df['tags'].str.lower()

In [None]:
new_df.head()

In [None]:
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
# Stemming
def stem(text):
    """Stem every whitespace-separated word of `text` with the module-level
    stemmer `ps` and return the stems joined by single spaces."""
    return " ".join(ps.stem(token) for token in text.split())

In [None]:
new_df['tags'] = new_df['tags'].apply(stem)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer # Vectorization through bag of words method
# Bag-of-words vectorizer: the vocabulary is capped at 5000 terms and
# scikit-learn's built-in English stop-word list is applied during
# vectorization (not during stemming).
# NOTE(review): the tags are stemmed before they reach this vectorizer, so a
# stop word whose stem differs from its listed form would presumably no longer
# be filtered - confirm whether that matters here.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [None]:
vectors = cv.fit_transform(new_df['tags']).toarray() # vectorizing tags

In [None]:
vectors

In [None]:
vectors.shape 

In [None]:
ps.stem('loved') # stemming example

In [None]:
new_df['tags'][0]

In [None]:
stem('in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron')

In [None]:
from sklearn.metrics.pairwise import cosine_similarity # To Calculate cosine distance

In [None]:
# Pairwise cosine similarity between every pair of rows in `vectors`.
# These are similarities, not distances: a larger value means the two tag
# vectors are more alike.
similarity = cosine_similarity(vectors)

In [None]:
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:6] # sorting the similarity based on selected movie and fetching first 5 movies similar to it.

In [None]:
similarity[0]

In [None]:
def recommend(movie):
    """Print the titles of the five movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in `new_df['title']`. When several rows share
        the title, the first one is used.

    Raises
    ------
    IndexError
        If no row of `new_df` has the given title.
    """
    # `similarity` is addressed by row position, so look up the position of
    # the title rather than its index label: the two differ whenever rows
    # were dropped from the frame without resetting the index.
    title_matches = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if title_matches.size == 0:
        raise IndexError(f"movie title not found in new_df: {movie!r}")
    movie_pos = int(title_matches[0])

    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), reverse=True, key=lambda pair: pair[1])

    # Exclude the query movie explicitly instead of assuming it sorts first;
    # another row with an equal score could otherwise push it into the results.
    top_matches = [pos for pos, _ in ranked if pos != movie_pos][:5]

    for pos in top_matches:
        print(new_df['title'].iloc[pos])

In [None]:
recommend('The Avengers') # testing

In [None]:
import pickle # serializing and deserializing a Python object structure

In [None]:
# Persist the movie table as a plain dict. The context manager guarantees the
# file is flushed and closed even if pickling fails part-way.
with open('movie_dict.pkl', 'wb') as movie_dict_file:
    pickle.dump(new_df.to_dict(), movie_dict_file)

In [None]:
# Persist the similarity matrix. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)