In [3]:
import pandas as pd
import ast
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import zipfile
import pickle

In [6]:
!kaggle datasets download tmdb/tmdb-movie-metadata -p dataset

Dataset URL: https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata
License(s): other
Downloading tmdb-movie-metadata.zip to dataset
100%|██████████████████████████████████████| 8.89M/8.89M [00:11<00:00, 1.05MB/s]
100%|███████████████████████████████████████| 8.89M/8.89M [00:11<00:00, 834kB/s]


In [7]:
with zipfile.ZipFile ("./dataset/tmdb-movie-metadata.zip",'r') as f:
    f.extractall("dataset")

In [8]:
movies=pd.read_csv('./dataset/tmdb_5000_movies.csv')
credit=pd.read_csv('./dataset/tmdb_5000_credits.csv')

In [9]:
dataset=movies.merge(right=credit,on='title')

In [10]:
dataset=dataset[['movie_id','title','genres','keywords','spoken_languages','overview','cast','crew']]

In [11]:
display(dataset.isna().sum())
dataset.dropna(inplace=True)
dataset.drop_duplicates(inplace=True)

movie_id            0
title               0
genres              0
keywords            0
spoken_languages    0
overview            3
cast                0
crew                0
dtype: int64

In [12]:
ast.literal_eval(dataset['genres'][0])

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [13]:
def getgen(genre):
    res=[]
    for i in ast.literal_eval(genre):
        res.append(i['name'])
    return res
def getdir(crew):
    res=[]
    for i in ast.literal_eval(crew):
        if i['job']=='Director':
            res.append(i['name'])
    return res

In [14]:
dataset['genres']=dataset['genres'].apply(getgen)
dataset['keywords']=dataset['keywords'].apply(getgen)
dataset['spoken_languages']=dataset['spoken_languages'].apply(getgen)
dataset['overview']=dataset['overview'].apply(lambda x:x.split())
dataset['cast']=dataset['cast'].apply(getgen)
dataset['crew']=dataset['crew'].apply(getdir)

In [15]:
dataset['tags']=dataset['genres']+dataset['keywords']+dataset['spoken_languages']+dataset['overview']+dataset['cast']+dataset['crew']
new_data = pd.DataFrame({
    'movie_id': dataset['movie_id'],
    'title': dataset['title'],
    'tags': dataset['tags']
})

In [16]:
stemmer=PorterStemmer()
new_data['tags']=new_data['tags'].apply(lambda x: [stemmer.stem(word.lower().replace(' ','')) for word in list(set(x)) if word not in stopwords.words('english')])

In [17]:
new_data['tags']=new_data['tags'].apply(lambda x: " ".join(x))

In [18]:
new_data

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,fantasi melvinlenoclarkiii lisaroumain michael...
1,285,Pirates of the Caribbean: At World's End,fantasi jonathanpryc tomholland drugabus loveo...
2,206647,Spectre,alessandrobressanello battl adammcgradi deutsc...
3,49026,The Dark Knight Rises,follow davidgyasi dent' kyle ronniegeneblevin ...
4,49529,John Carter,edgarriceburrough mar arkiereec andrewstanton ...
...,...,...,...
4804,9367,El Mariachi,carri mistak reinolmartinez ramirogomez henchm...
4805,72766,Newlyweds,arriv comedi daniellapineda caitlinfitzgerald ...
4806,231617,"Signed, Sealed, Delivered",date loveatfirstsight benjaminhollingsworth de...
4807,126186,Shanghai Calling,"english specialist, when shanghai ambiti roman..."


In [19]:
vectorizer = CountVectorizer()
bagofwords = vectorizer.fit_transform(new_data['tags']).toarray()

In [20]:
similarity=cosine_similarity(bagofwords)

In [21]:
index = dataset['title'][dataset['title'] == 'Avatar'].index[0]
similarity[index]

array([1.        , 0.03898036, 0.03788857, ..., 0.03131121, 0.01160518,
       0.01084333])

In [22]:
def predict(movie):
    index = dataset['title'][dataset['title'] == movie].index[0]
    res = [list(dataset['title'])[i[0]] for i in sorted(enumerate(similarity[index]), key=lambda x: x[1],reverse=True)][1:10]
    return res

In [23]:
predict('Avatar')

['The Helix... Loaded',
 'Falcon Rising',
 'Small Soldiers',
 'Aliens',
 'Predators',
 'Beowulf',
 'Journey 2: The Mysterious Island',
 'Predator',
 'Titan A.E.']

In [24]:
with open('similarity.pkl','wb') as f:
    pickle.dump(similarity,f)
with open('titles.pkl','wb') as fb:
    pickle.dump(dict(zip(dataset['title'], dataset['movie_id'])),fb)