In [22]:
import pandas as pd
import numpy as np
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

In [2]:
# Show up to 50 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 50

In [3]:
# Location of the movies CSV, relative to this notebook.
data_path = '../../../data/movies_df.csv'

# Read the whole file into a DataFrame with pandas' default parsing options.
movies = pd.read_csv(filepath_or_buffer=data_path)

In [15]:
# Keep only the id and overview columns, exposing the TMDB id as 'ItemId'.
# The rename runs first so that this cell is idempotent: on a re-run 'tmdbId'
# is already gone, `rename` silently ignores the missing label, and the
# selection below still succeeds instead of raising KeyError.
movies = movies.rename(columns={'tmdbId': 'ItemId'})[['ItemId', 'overview']]

In [6]:
# Overview text for every movie; missing overviews are replaced by empty
# strings so the vectorizer receives a string for each row.
overview_text = movies['overview'].fillna('')

# TF-IDF representation of the overviews with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english')
overview_matrix = vectorizer.fit_transform(overview_text)

# Pairwise cosine similarity between all overviews, as a dense square matrix
# (one row and one column per row of `movies`, in the same order).
similarity_matrix = cosine_similarity(overview_matrix, dense_output=True)

In [16]:
def get_similar_movies(movie_id, top_n=1000):
    """Return the ids of the movies most similar to ``movie_id``.

    Reads the module-level ``movies`` frame (column ``'ItemId'``) and
    ``similarity_matrix``; row ``i`` of the matrix is taken to correspond to
    the ``i``-th row (by position) of ``movies``.

    Parameters
    ----------
    movie_id
        Value to look up in ``movies['ItemId']``. If it occurs more than
        once, the first occurrence is used.
    top_n : int, default 1000
        Maximum number of similar movies to return. Values below zero are
        treated as zero.

    Returns
    -------
    list
        ``ItemId`` values ordered by decreasing similarity. Ties keep the
        row order of ``movies``. The queried row itself is never included.

    Raises
    ------
    IndexError
        If ``movie_id`` is not present in ``movies['ItemId']``.
    """
    item_ids = movies['ItemId'].to_numpy()

    # Work with row *positions*, not index labels: the similarity matrix is
    # positional, so this stays correct even if `movies` has a non-default
    # index (e.g. after filtering rows).
    matches = np.flatnonzero(item_ids == movie_id)
    if matches.size == 0:
        raise IndexError(f"movie_id {movie_id!r} not found in movies['ItemId']")
    movie_idx = matches[0]

    scores = np.asarray(similarity_matrix[movie_idx])

    # Stable sort on the negated scores: descending similarity, with equal
    # scores kept in their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried row explicitly. Skipping "the first result" is wrong
    # whenever another row ties with it (identical overviews) or its
    # self-similarity is not the maximum (e.g. an all-zero TF-IDF row).
    ranked = ranked[ranked != movie_idx][:max(top_n, 0)]

    return list(item_ids[ranked])

In [17]:
# Map every movie id to the list of ids returned by get_similar_movies,
# showing a progress bar while iterating over the 'ItemId' column.
movie_dictionaries = {
    movie_id: get_similar_movies(movie_id)
    for movie_id in tqdm(movies['ItemId'], desc="Processing Movies")
}

Processing Movies: 100%|█████████████████████████████████████████████████████████| 26483/26483 [08:56<00:00, 49.39it/s]


In [20]:
def convert_int64(o):
    """``default`` hook for ``json.dump``: convert NumPy scalars to Python.

    ``json`` calls this for any object it cannot serialize on its own.
    NumPy booleans, integers (of any width, not only ``int64``) and floats
    are converted to the matching built-in type.

    Parameters
    ----------
    o
        The object ``json`` could not serialize.

    Returns
    -------
    bool, int or float
        The built-in equivalent of the NumPy scalar.

    Raises
    ------
    TypeError
        If ``o`` is not a supported NumPy scalar; this is how a ``default``
        hook tells ``json`` that the object is not serializable.
    """
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")

In [23]:
# Output location for the similarity lookup, relative to this notebook.
data_path = '../../../data/movies_similarity.json'

# Serialize the id -> similar-ids mapping. NumPy scalars inside the lists go
# through convert_int64. Note that JSON object keys are always strings, so
# non-string keys such as integers are written as strings.
with open(data_path, mode='w') as out_file:
    json.dump(obj=movie_dictionaries, fp=out_file, default=convert_int64)