## Importing necessary libraries

In [26]:
import ast
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

## Importing the datafiles

In [27]:
movies=pd.read_csv("tmdb_5000_movies.csv")
credits=pd.read_csv("tmdb_5000_credits.csv")

## Merging two dataframes into one on the basis of a common column

In [28]:
movies=movies.merge(credits,on='title')

## Data preparation and exploratory analysis

In [29]:
# movies.head()

In [30]:
# movies['genres'][1]

In [31]:
# movies['keywords'][1]

In [32]:
# movies['production_companies'][11]

In [33]:
# movies['cast'][33]

In [34]:
movies.drop(['budget','homepage','original_language','original_title','production_countries','release_date',
             'revenue','spoken_languages','tagline','vote_count','movie_id'],1,inplace=True)

  movies.drop(['budget','homepage','original_language','original_title','production_countries','release_date',


In [35]:
movies.isna().sum()

genres                  0
id                      0
keywords                0
overview                3
popularity              0
production_companies    0
runtime                 2
status                  0
title                   0
vote_average            0
cast                    0
crew                    0
dtype: int64

In [36]:
movies.dropna(inplace=True)

In [37]:
movies.duplicated().sum()

0

## Scraping only the important keywords

In [38]:
def return_list(obj):
    lst=[]
    for i in ast.literal_eval(obj):
        lst.append(i['name'])
    return lst
movies['genres']= movies['genres'].apply(return_list)
movies['keywords']=movies['keywords'].apply(return_list)
movies['production_companies']=movies['production_companies'].apply(return_list)
movies['cast']=movies['cast'].apply(return_list)



In [41]:
# "'saving only the first three cast members to avoid showing unmatching results when cast matches with characters other
# "'the main ones'"
def save_three(obj):
    return obj[:3]
movies['cast']=movies['cast'].apply(save_three)
# movies['crew'][1]

In [42]:
##extracting the name of director
def director(obj):
    lst=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            lst.append(i['name'])
            break
    return lst
movies['crew']=movies['crew'].apply(director)

In [43]:
##Converting into more useful format
movies['overview']=movies['overview'].apply(lambda x: x.split())

In [44]:
## concatenating words to avoid matching with cast members with the same first or last names
movies['genres']=movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])


In [45]:
##similarly, applying the same to other series in the dataframe
movies['overview']=movies['overview'].apply(lambda x: [i.replace(" ","") for i in x])
movies['production_companies']=movies['production_companies'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast']=movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew']=movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords']=movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])

## Creating a simpler dataframe after data cleaning

In [46]:

# movies.head()
movies['text']=movies['genres']+movies['keywords']+movies['overview']+movies['production_companies']+movies['cast']+movies['crew']

In [47]:
df=movies[['id','title','text']]
df

Unnamed: 0,id,title,text
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action, ocean, drugabuse,..."
2,206647,Spectre,"[Action, Adventure, Crime, spy, basedonnovel, ..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller, dccomics, cri..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction, basedonnov..."
...,...,...,...
4804,9367,El Mariachi,"[Action, Crime, Thriller, unitedstates–mexicob..."
4805,72766,Newlyweds,"[Comedy, Romance, A, newlywed, couple's, honey..."
4806,231617,"Signed, Sealed, Delivered","[Comedy, Drama, Romance, TVMovie, date, loveat..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


In [48]:
df['text']=df['text'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text']=df['text'].apply(lambda x:" ".join(x))


In [49]:
df['text']=df['text'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text']=df['text'].apply(lambda x:x.lower())


In [50]:
##stemming words to reduce unnecessary data and make the model robus
stemmer=PorterStemmer()
def stem(str):
    lst=[]
    for i in str.split():
        lst.append(stemmer.stem(i))
    return lst
df['text']=df['text'].apply(stem)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text']=df['text'].apply(stem)


Unnamed: 0,id,title,text
0,19995,Avatar,"[action, adventur, fantasi, sciencefict, cultu..."
1,285,Pirates of the Caribbean: At World's End,"[adventur, fantasi, action, ocean, drugabus, e..."
2,206647,Spectre,"[action, adventur, crime, spi, basedonnovel, s..."
3,49026,The Dark Knight Rises,"[action, crime, drama, thriller, dccomic, crim..."
4,49529,John Carter,"[action, adventur, sciencefict, basedonnovel, ..."


In [51]:
df['text']=df['text'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text']=df['text'].apply(lambda x: " ".join(x))


## Vectorizing the textual data using cosine similarity to give matching results

In [52]:
vectorizer = TfidfVectorizer()
vectors=vectorizer.fit_transform(np.array(df['text'])).toarray()


In [53]:
len(vectors[0])

38322

In [54]:
df

Unnamed: 0,id,title,text
0,19995,Avatar,action adventur fantasi sciencefict culturecla...
1,285,Pirates of the Caribbean: At World's End,adventur fantasi action ocean drugabus exotici...
2,206647,Spectre,action adventur crime spi basedonnovel secreta...
3,49026,The Dark Knight Rises,action crime drama thriller dccomic crimefight...
4,49529,John Carter,action adventur sciencefict basedonnovel mar m...
...,...,...,...
4804,9367,El Mariachi,action crime thriller unitedstates–mexicobarri...
4805,72766,Newlyweds,comedi romanc a newlyw couple' honeymoon is up...
4806,231617,"Signed, Sealed, Delivered",comedi drama romanc tvmovi date loveatfirstsig...
4807,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


In [55]:
similarity_matrix=cosine_similarity(vectors)
similarity_matrix[0]

array([1.        , 0.02701461, 0.0265421 , ..., 0.03020421, 0.01000448,
       0.00738306])

## Returning a particular number of matching results

In [58]:
def return_index(movie):
    index= df[df['title']==movie].index[0]
    distance=similarity_matrix[index]
    movie_list=sorted(enumerate(distance),reverse=True,key=lambda x:x[1])[0:10]
    print(movie_list)
    for movie in movie_list:
        print(df.iloc[movie[0]].title)



return_index('Cars 2')



[(40, 1.0000000000000004), (566, 0.31128247206740667), (225, 0.13201782115097246), (935, 0.13028344389943902), (405, 0.13006642765717502), (1972, 0.1262598719125652), (3104, 0.12357729766633571), (560, 0.11482287778542803), (1155, 0.10635458671065633), (339, 0.10446153884266446)]
Cars 2
Cars
Speed Racer
Herbie Fully Loaded
The Fast and the Furious: Tokyo Drift
Old School
The Brown Bunny
Driven
Back to the Future Part II
The Incredibles


In [56]:
# df.head(55)

In [57]:
# model=df['title'].values

## Saving the model so far created to deploy it in a website

In [59]:

# pickle.dump(df.to_dict(),open('dict','wb'))
with open('./16032022.pkl','wb') as f:
    pickle.dump(df.to_dict(),f)

In [60]:
with open('./similarity_matrix.pkl','wb') as f:
    pickle.dump(similarity_matrix,f)