In [272]:
import numpy as np
import pandas as pd

In [273]:
#Selecting those columns only which we need while making the model
movies=pd.read_csv('tmdb_movies_data_final.csv')

In [274]:
movies.head(2)

Unnamed: 0,id,popularity,original_title,cast,director,tagline,keywords,overview,genres,release_year
0,135397,32.985763,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,The park is open.,monster|dna|tyrannosaurus rex|velociraptor|island,Twenty-two years after the events of Jurassic ...,Action|Adventure|Science Fiction|Thriller,2015
1,76341,28.419936,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,What a Lovely Day.,future|chase|post-apocalyptic|dystopia|australia,An apocalyptic story set in the furthest reach...,Action|Adventure|Science Fiction|Thriller,2015


In [275]:
#Looking if any data is missing in movies
movies.isnull().sum()

id                   0
popularity           0
original_title       0
cast                50
director            32
tagline           1484
keywords           903
overview             4
genres              13
release_year         0
dtype: int64

In [276]:
movies.shape

(5038, 10)

In [277]:
#removing the missing content in this dataset
movies.dropna(inplace=True)

In [278]:
movies.shape

(3092, 10)

In [279]:
#Looking if there is any duplicate row
movies.duplicated().sum()

1

In [280]:
# Remove all duplicate rows 
movies = movies.drop_duplicates()

In [281]:
movies.shape

(3091, 10)

In [282]:
# Reindexing the movies in dataframe
movies.reset_index(drop=True, inplace=True)

In [283]:
#Converting string in the form of list
movies['overview']=movies['overview'].apply(lambda x:x.split())
movies['genres']=movies['genres'].apply(lambda x:x.split())
movies['keywords']=movies['keywords'].apply(lambda x:x.split())
movies['cast']=movies['cast'].apply(lambda x:x.split())

In [284]:
movies.head(3)

Unnamed: 0,id,popularity,original_title,cast,director,tagline,keywords,overview,genres,release_year
0,135397,32.985763,Jurassic World,"[Chris, Pratt|Bryce, Dallas, Howard|Irrfan, Kh...",Colin Trevorrow,The park is open.,"[monster|dna|tyrannosaurus, rex|velociraptor|i...","[Twenty-two, years, after, the, events, of, Ju...","[Action|Adventure|Science, Fiction|Thriller]",2015
1,76341,28.419936,Mad Max: Fury Road,"[Tom, Hardy|Charlize, Theron|Hugh, Keays-Byrne...",George Miller,What a Lovely Day.,[future|chase|post-apocalyptic|dystopia|austra...,"[An, apocalyptic, story, set, in, the, furthes...","[Action|Adventure|Science, Fiction|Thriller]",2015
2,262500,13.112507,Insurgent,"[Shailene, Woodley|Theo, James|Kate, Winslet|A...",Robert Schwentke,One Choice Can Destroy You,"[based, on, novel|revolution|dystopia|sequel|d...","[Beatrice, Prior, must, confront, her, inner, ...","[Adventure|Science, Fiction|Thriller]",2015


In [285]:
#concatination of overbiew,genres,keywords,cast
movies['tags']=movies['overview']+movies['genres']+movies['keywords']+movies['cast']

In [286]:
#converting all lists in strings
movies['tags']=movies['tags'].apply(lambda x:" ".join(x))

In [287]:
#Taking just id,original_title,tags,release_year,director in new dataframe
movies = movies[['id','original_title','tags','director']]

In [288]:
movies.head(3)

Unnamed: 0,id,original_title,tags,director
0,135397,Jurassic World,Twenty-two years after the events of Jurassic ...,Colin Trevorrow
1,76341,Mad Max: Fury Road,An apocalyptic story set in the furthest reach...,George Miller
2,262500,Insurgent,Beatrice Prior must confront her inner demons ...,Robert Schwentke


In [289]:
#textvectorisation of tag column using bag of words technique using sklearn.feature_extraction.text function
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')

In [290]:
vectors=cv.fit_transform(movies['tags']).toarray()

In [291]:
#checking which 5000 most frequent words are there in corpus
cv.get_feature_names()

['000',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '18th',
 '19',
 '1930s',
 '1950s',
 '1960',
 '1960s',
 '1970s',
 '1980s',
 '1987',
 '1994',
 '1999',
 '19th',
 '20',
 '200',
 '2001',
 '2003',
 '2008',
 '2009',
 '2010',
 '2011',
 '20th',
 '24',
 '25',
 '30',
 '35',
 '3d',
 '40',
 '50',
 '500',
 '60',
 'aaron',
 'abandoned',
 'abbie',
 'abby',
 'abducted',
 'abigail',
 'abilities',
 'ability',
 'able',
 'aboard',
 'abroad',
 'absence',
 'abuse',
 'abusive',
 'academy',
 'accepts',
 'access',
 'accident',
 'accidental',
 'accidentally',
 'acclaimed',
 'accompanied',
 'accompany',
 'accomplished',
 'according',
 'account',
 'accused',
 'act',
 'action',
 'actions',
 'activist',
 'activities',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'ad',
 'adam',
 'adams',
 'adaptation',
 'addict',
 'addicted',
 'addiction',
 'adele',
 'adkins',
 'adolescence',
 'adolf',
 'adopt',
 'adopted',
 'adoption',
 'adrian',
 'adrien',
 'adrienne',
 'adrift',
 'adul

In [292]:
#Applying stemming to avoid treatment of different words to the words with same meaning
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [293]:
def stem(text):
  y=[]
  for i in text.split():
    y.append(ps.stem(i))

  return " ".join(y)

In [294]:
movies['tags']=movies['tags'].apply(stem)

In [295]:
#calculating cosine distance(angle between 2 vectors) of 2 vectors and storing in variable similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity(vectors)

In [296]:
#checking similarity of 1st movie with all other movies in dataframe
similarity[0]

array([1.        , 0.15534713, 0.13453456, ..., 0.03300492, 0.03363364,
       0.0489996 ])

In [297]:
def recommend(movie):

#1.fetching the index of movie in dataframe
  movie_index=movies[movies['original_title']==movie].index[0]

#2.Assigning distances variable to the similarity obtained between 2 movies
  distances=similarity[movie_index]

#3.sorting movies (without losing there index) to get most similar movies in the beginning
  movies_list=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]

#4.fetching names of recommended movies from the index
  for i in movies_list:
    print('Movie name:')
    print(movies.iloc[i[0]].original_title)
    print('Director:')
    print(movies.iloc[i[0]].director)
    
  return

In [298]:
recommend('Mad Max: Fury Road')

Movie name:
The Book of Eli
Director:
Albert Hughes|Allen Hughes
Movie name:
9
Director:
Shane Acker
Movie name:
Turbo Kid
Director:
FranÃ§ois Simard|Anouk Whissell|Yoann-Karl Whissell
Movie name:
The Maze Runner
Director:
Wes Ball
Movie name:
The Day
Director:
Douglas Aarniokoski


In [299]:
import pickle

In [300]:
# Creating .pkl file in which all names of all movies are available
pickle.dump(movies.to_dict(),open('movies_dict.pkl','wb'))

In [301]:
# Creating .pkl file in which all similarity vectors are available
pickle.dump(similarity,open('similarity.pkl','wb'))