In [25]:
import numpy as np
import re
import json
from utils.preprocess import preprocess
from similarity.tfidf import TFIDFCalculator
from similarity.gensim import GensimCalculator
from similarity.bert import BERTCalculator

In [2]:
# Load JSON File
# The encoding is stated explicitly: JSON text is UTF-8 by specification,
# while open() would otherwise fall back to the platform's locale encoding.
with open("../../data/movies_metadata.json", encoding="utf-8") as data_file:
    movie_data_str = data_file.read()

# Parsed metadata; the rest of the notebook indexes it by movie id and reads
# the "localized_title", "summary" and "cover_url" fields of each record.
movie_data = json.loads(movie_data_str)


In [3]:
# Configuration
# Number of similar movies kept for every movie in the exported results.
topK_movie: int = 5


In [4]:
# Create Movie ID
# Movie ids in the iteration order of movie_data. The corpora are built by
# walking this list, and ranked row indices are mapped back to ids through it.
movie_id_list = [movie_id for movie_id in movie_data]


In [5]:
# Building Vocabulary
def _title_and_plot(movie):
    r"""Return ``(title, plot)`` for one movie record.

    The plot is the second piece of the record's "summary" field when it is
    split on the "\nPlot: " marker, or "" when the marker does not occur.
    """
    pieces = movie["summary"].split("\nPlot: ")
    plot = pieces[1] if len(pieces) > 1 else ""
    return movie["localized_title"], plot


def _raw_tokens(text):
    """Keep only ASCII letters, digits and spaces, lower-case, then tokenize.

    ``str.split()`` is called without a separator so that an empty string and
    runs of spaces (left behind when punctuation between two spaces is
    removed) do not yield empty-string tokens.
    """
    return re.sub(r"[^a-zA-Z0-9 ]", "", text).lower().split()


# Each corpus holds one token list per movie, in movie_id_list order. Every
# document is the movie title followed by its plot.
preprocessed_corpus = []       # title: is_sw_remove=False, plot: preprocess() defaults
# NOTE(review): despite the "_nosw" suffix this variant passes
# is_sw_remove=False for both parts -- presumably "no stop-word removal";
# confirm against utils.preprocess.
preprocessed_corpus_nosw = []
raw_corpus = []                # regex-cleaned, lower-cased tokens only

for m_id in movie_id_list:
    movie_name, summary = _title_and_plot(movie_data[m_id])

    # The title is preprocessed identically for both preprocessed variants, so
    # it is computed once (assumes preprocess() is deterministic -- TODO confirm).
    title_tokens = preprocess(movie_name, is_sw_remove=False)

    preprocessed_corpus.append(title_tokens + preprocess(summary))
    preprocessed_corpus_nosw.append(
        title_tokens + preprocess(summary, is_sw_remove=False)
    )
    raw_corpus.append(_raw_tokens(movie_name) + _raw_tokens(summary))

# Space-joined string form of every document, for consumers that take text
# rather than token lists.
preprocessed_corpus_join = [' '.join(tokens) for tokens in preprocessed_corpus]
preprocessed_corpus_join_nosw = [
    ' '.join(tokens) for tokens in preprocessed_corpus_nosw
]
raw_corpus_join = [' '.join(tokens) for tokens in raw_corpus]


In [21]:
def generateSimilarMoviesJSON(result_json, method_name):
    """Attach cover URLs and titles to a similarity result and save it as JSON.

    Args:
        result_json: Mapping of movie id -> sequence of similar movie ids.
        method_name: Label inserted into the output file name
            (``../../data/similarMovies_<method_name>.json``).

    Returns:
        A dict keyed by movie id. Each value holds "similar_movie_ids" (the
        input sequence), "cover_urls" and "movies_name", the latter two being
        looked up in the module-level ``movie_data`` for every similar id.
    """
    output_json = {}
    for movie_id, similar_ids in result_json.items():
        covers = []
        titles = []
        for similar_id in similar_ids:
            record = movie_data[similar_id]
            covers.append(record["cover_url"])
            titles.append(record["localized_title"])

        output_json[movie_id] = {
            "similar_movie_ids": similar_ids,
            "cover_urls": covers,
            "movies_name": titles,
        }

    # Persist the enriched result next to the input data.
    target_path = f"../../data/similarMovies_{method_name}.json"
    with open(target_path, "w") as outfile:
        json.dump(output_json, outfile)

    return output_json


In [23]:
# Build the TF-IDF similarity matrix over the joined "_nosw" documents.
tfidf_calculator = TFIDFCalculator(preprocessed_corpus_join_nosw)
tfidf_calculator.buildSimilarityMatrix()

# Calculate Similar Movie Result
similar_movies_result_tfidf = {}
for row_idx in range(len(preprocessed_corpus_join_nosw)):
    query_id = movie_id_list[row_idx]

    # Negating the row makes argsort rank the highest similarities first.
    # One extra candidate is taken because the movie itself is filtered out below.
    negated_scores = -tfidf_calculator.sim_matrix[row_idx]
    candidate_rows = negated_scores.argsort()[:topK_movie + 1]

    neighbours = []
    for col_idx in candidate_rows:
        if col_idx != row_idx:
            neighbours.append(movie_id_list[col_idx])
    neighbours = neighbours[:topK_movie]

    assert query_id not in neighbours, "Movie ID in Similar Movies List"

    similar_movies_result_tfidf[query_id] = neighbours


In [24]:
# Export the TF-IDF result to JSON; the returned dict is discarded into `_`.
_ = generateSimilarMoviesJSON(
    result_json=similar_movies_result_tfidf,
    method_name="tfidf",
)

In [28]:
# Build the similarity matrix from the raw (regex-cleaned) token lists.
# NOTE(review): "word2vec-google-news-300" is presumably the name of a
# pretrained embedding model resolved inside GensimCalculator -- confirm.
gensim_calculator = GensimCalculator(raw_corpus, "word2vec-google-news-300")
gensim_calculator.buildSimilarityMatrix()

# Calculate Similar Movie Result
similar_movies_result_gensim = {}
# Iterate over the corpus the matrix was built from (raw_corpus) rather than
# an unrelated corpus variable that merely happens to have the same length.
for i in range(len(raw_corpus)):
    # Negating the row makes argsort rank the highest similarities first.
    # One extra candidate is taken because the movie itself is dropped below.
    similar_movies = (
        -gensim_calculator.sim_matrix[i]
    ).argsort()[:topK_movie + 1]
    similar_movies = [
        movie_id_list[j] for j in similar_movies if j != i
    ][:topK_movie]

    assert movie_id_list[i] not in similar_movies, "Movie ID in Similar Movies List"

    similar_movies_result_gensim[movie_id_list[i]] = similar_movies


In [29]:
# Export the gensim result to JSON; the returned dict is discarded into `_`.
_ = generateSimilarMoviesJSON(
    result_json=similar_movies_result_gensim,
    method_name="gensim",
)

In [26]:
# Build the BERT similarity matrix over the joined raw documents.
bert_calculator = BERTCalculator(raw_corpus_join)
bert_calculator.buildSimilarityMatrix()

# Calculate Similar Movie Result
similar_movies_result_bert = {}
for doc_idx in range(len(raw_corpus_join)):
    # Sort ascending on the negated row so the most similar documents come
    # first; keep one spare slot because the document itself is removed next.
    ranking = (-bert_calculator.sim_matrix[doc_idx]).argsort()
    shortlist = ranking[:topK_movie + 1]

    other_ids = [movie_id_list[k] for k in shortlist if k != doc_idx]
    top_matches = other_ids[:topK_movie]

    assert movie_id_list[doc_idx] not in top_matches, "Movie ID in Similar Movies List"

    similar_movies_result_bert[movie_id_list[doc_idx]] = top_matches


In [27]:
# Export the BERT result to JSON; the returned dict is discarded into `_`.
_ = generateSimilarMoviesJSON(
    result_json=similar_movies_result_bert,
    method_name="bert",
)