In [1]:
import numpy as np
import pickle
import sys
import re
sys.path.append("..")
from dataAggregation.json_processor import json_unzip
from utils.preprocess import preprocess


[nltk_data] Downloading package stopwords to C:\Users\Johnson
[nltk_data]     Lui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Johnson
[nltk_data]     Lui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Johnson
[nltk_data]     Lui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load Compressed Dataset
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so this file must come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("../../data/movies_review_json_zip.p", "rb") as dataset_file:
    zipped_dataset = pickle.load(dataset_file)

In [3]:
# Unzip the Dataset: expand the pickled payload with the project helper json_unzip.
# The result is indexed by movie id further down (e.g. movie_data[m_id]["summary"]).
movie_data = json_unzip(zipped_dataset)

In [4]:
# Configuration
# Number of similar movies kept per movie; used as the top-K cut-off when the
# similarity rankings are sliced in the cells below.
topK_movie: int = 5

In [5]:
# Create Movie ID
# IDs are taken in the iteration order of movie_data; later cells use positions
# in this list to translate row/column indices back to movie ids.
movie_id_list = [movie_id for movie_id in movie_data]


In [6]:
# Building Vocabulary
def _title_and_plot(m_id):
    r"""Return ``(title, plot)`` for the movie stored under ``m_id`` in ``movie_data``.

    The plot is the segment right after the first "\nPlot: " marker of the
    movie's "summary" field, or "" when the summary contains no such marker.
    """
    entry = movie_data[m_id]
    parts = entry["summary"].split("\nPlot: ")
    plot = parts[1] if len(parts) > 1 else ""
    return entry["localized_title"], plot


def _raw_tokens(text):
    """Lower-case ``text``, keep only ASCII letters, digits and spaces, and tokenise.

    Splitting on runs of whitespace (``split()``) instead of on a single space
    (``split(" ")``) avoids the empty-string tokens that an empty plot or
    consecutive spaces would otherwise produce.
    """
    return re.sub(r"[^a-zA-Z0-9 ]", "", text).lower().split()


# Extract title and plot once; the three corpora below differ only in how the
# text is tokenised.
movie_texts = [_title_and_plot(m_id) for m_id in movie_id_list]

# Title is preprocessed with is_sw_remove=False (presumably: keep stopwords);
# the plot goes through preprocess() with its default settings.
preprocessed_corpus = [
    preprocess(title, is_sw_remove=False) + preprocess(plot)
    for title, plot in movie_texts
]
preprocessed_corpus_join = [' '.join(tokens) for tokens in preprocessed_corpus]

# Same pipeline, but is_sw_remove=False is passed for the plot as well.
preprocessed_corpus_nosw = [
    preprocess(title, is_sw_remove=False) + preprocess(plot, is_sw_remove=False)
    for title, plot in movie_texts
]
preprocessed_corpus_join_nosw = [' '.join(tokens) for tokens in preprocessed_corpus_nosw]

# Regex-cleaned, lower-cased tokens that bypass the project preprocess() step.
raw_corpus = [
    _raw_tokens(title) + _raw_tokens(plot)
    for title, plot in movie_texts
]
raw_corpus_join = [' '.join(tokens) for tokens in raw_corpus]


##### Evaluation Function

In [7]:
from utils.evaluator import MetricsEvaluator

In [8]:
# Create Metrics Evaluator
# It is constructed from the full movie_data object; its evaluate() method is
# called on each method's similar-movie dictionary in the cells below.
evaluator = MetricsEvaluator(movie_data)

##### Random walk

In [9]:
from similarity.random import RandomCalculator

# Baseline: similar movies produced by the project's RandomCalculator
# (presumably topK_movie random picks per movie — confirm in similarity/random).
# NOTE(review): no random seed is set in this notebook, so this baseline and its
# DCG may vary between runs — confirm how RandomCalculator draws its samples.
random_calculator = RandomCalculator(movie_data, topK_movie)
similar_movies_result_random = random_calculator.generateSimilarMovies()


In [31]:
# Score the random baseline and report the mean and standard deviation of the
# values returned by the evaluator.
dcg_result = evaluator.evaluate(similar_movies_result_random)
dcg_mean = np.mean(dcg_result)
dcg_std = np.std(dcg_result)
print(
    "Random walk",
    f"Average DCG: {dcg_mean}",
    f"Std DCG: {dcg_std}",
    sep="\n",
)


Random walk
Average DCG: 1.6449281091741155
Std DCG: 1.0411774399440266


##### Jaccard Similarity

In [11]:
from similarity.jaccard import JaccardCalculator

# Create Jaccard Similarity Calculator from the tokenised corpus
jaccard_calculator = JaccardCalculator(preprocessed_corpus)
# Build Similarity Matrix (read afterwards as jaccard_calculator.sim_matrix)
jaccard_calculator.buildSimilarityMatrix()


In [12]:
# Calculate Similar Movie Result
# For every movie, collect the ids of its highest-scoring Jaccard neighbours.
similar_movies_result_jaccard = {}
for row_idx in range(len(preprocessed_corpus)):
    query_id = movie_id_list[row_idx]
    # Negating the row makes argsort list the highest scores first; one extra
    # candidate is taken because the movie itself is filtered out next.
    ranked = (-jaccard_calculator.sim_matrix[row_idx]).argsort()[:topK_movie + 1]

    neighbours = []
    for col_idx in ranked:
        if col_idx != row_idx:
            neighbours.append(movie_id_list[col_idx])
    neighbours = neighbours[:topK_movie]

    assert query_id not in neighbours, "Movie ID in Similar Movies List"

    similar_movies_result_jaccard[query_id] = neighbours



In [32]:
# Score the Jaccard recommendations and report the mean and standard deviation
# of the values returned by the evaluator.
dcg_result = evaluator.evaluate(similar_movies_result_jaccard)
dcg_mean = np.mean(dcg_result)
dcg_std = np.std(dcg_result)
print(
    "Jaccard Similarity",
    f"Average DCG: {dcg_mean}",
    f"Std DCG: {dcg_std}",
    sep="\n",
)


Jaccard Similarity
Average DCG: 2.338252519004315
Std DCG: 1.0358377806742958


##### TF-IDF Similarity

In [9]:
from similarity.tfidf import TFIDFCalculator


In [11]:
# TF-IDF similarity over the space-joined documents of preprocessed_corpus_join.
tfidf_calculator = TFIDFCalculator(preprocessed_corpus_join)
tfidf_calculator.buildSimilarityMatrix()

# Calculate Similar Movie Result
similar_movies_result_tfidf = {}
for row_idx in range(len(preprocessed_corpus_join)):
    query_id = movie_id_list[row_idx]
    # Negating the row makes argsort list the highest scores first; one extra
    # candidate is taken because the movie itself is filtered out next.
    ranked = (-tfidf_calculator.sim_matrix[row_idx]).argsort()[:topK_movie + 1]

    neighbours = []
    for col_idx in ranked:
        if col_idx != row_idx:
            neighbours.append(movie_id_list[col_idx])
    neighbours = neighbours[:topK_movie]

    assert query_id not in neighbours, "Movie ID in Similar Movies List"

    similar_movies_result_tfidf[query_id] = neighbours


In [12]:
# Score the TF-IDF recommendations (stopword-removed corpus) and report the
# mean and standard deviation of the values returned by the evaluator.
dcg_result = evaluator.evaluate(similar_movies_result_tfidf)
dcg_mean = np.mean(dcg_result)
dcg_std = np.std(dcg_result)
print(
    "TFIDF Similarity (With Stopword Removal)",
    f"Average DCG: {dcg_mean}",
    f"Std DCG: {dcg_std}",
    sep="\n",
)


TFIDF Similarity (With Stopword Removal)
Average DCG: 2.334625692622194
Std DCG: 1.0358093978760579


In [13]:
# TF-IDF similarity over the space-joined documents of preprocessed_corpus_join_nosw.
# Note: this rebinds tfidf_calculator and similar_movies_result_tfidf.
tfidf_calculator = TFIDFCalculator(preprocessed_corpus_join_nosw)
tfidf_calculator.buildSimilarityMatrix()

# Calculate Similar Movie Result
similar_movies_result_tfidf = {}
for row_idx in range(len(preprocessed_corpus_join_nosw)):
    query_id = movie_id_list[row_idx]
    # Negating the row makes argsort list the highest scores first; one extra
    # candidate is taken because the movie itself is filtered out next.
    ranked = (-tfidf_calculator.sim_matrix[row_idx]).argsort()[:topK_movie + 1]

    neighbours = []
    for col_idx in ranked:
        if col_idx != row_idx:
            neighbours.append(movie_id_list[col_idx])
    neighbours = neighbours[:topK_movie]

    assert query_id not in neighbours, "Movie ID in Similar Movies List"

    similar_movies_result_tfidf[query_id] = neighbours


In [14]:
# Score the TF-IDF recommendations (corpus without stopword removal) and report
# the mean and standard deviation of the values returned by the evaluator.
dcg_result = evaluator.evaluate(similar_movies_result_tfidf)
dcg_mean = np.mean(dcg_result)
dcg_std = np.std(dcg_result)
print(
    "TFIDF Similarity (Without Stopword Removal)",
    f"Average DCG: {dcg_mean}",
    f"Std DCG: {dcg_std}",
    sep="\n",
)


TFIDF Similarity (Without Stopword Removal)
Average DCG: 2.3517530416134167
Std DCG: 1.0351286699006752


##### Pre-Trained Word2Vec Embedding

In [9]:
from similarity.gensim import GensimCalculator

In [10]:
# Similarity computed by the project's GensimCalculator over raw_corpus (the
# regex-cleaned, lower-cased tokens), using the "word2vec-google-news-300" model name.
# NOTE(review): presumably the model is fetched through gensim's downloader on
# first use, which can be slow — confirm in GensimCalculator.
gensim_calculator = GensimCalculator(raw_corpus, "word2vec-google-news-300")
gensim_calculator.buildSimilarityMatrix()


In [11]:
# Calculate Similar Movie Result
# The loop bound is taken from raw_corpus, the corpus gensim_calculator was built
# from, rather than from an unrelated corpus variable that merely happens to have
# the same length.
similar_movies_result_gensim = {}
for i in range(len(raw_corpus)):
    # Negating the row makes argsort list the highest scores first; one extra
    # candidate is taken because the movie itself is filtered out next.
    similar_movies = (
        -gensim_calculator.sim_matrix[i]
    ).argsort()[:topK_movie + 1]
    similar_movies = [
        movie_id_list[j] for j in similar_movies if j != i
    ][:topK_movie]

    assert movie_id_list[i] not in similar_movies, "Movie ID in Similar Movies List"

    similar_movies_result_gensim[movie_id_list[i]] = similar_movies


In [12]:
# Score the word2vec-based recommendations and report the mean and standard
# deviation of the values returned by the evaluator.
dcg_result = evaluator.evaluate(similar_movies_result_gensim)
dcg_mean = np.mean(dcg_result)
dcg_std = np.std(dcg_result)
print(
    "GenSim Similarity (word2vec-google-news-300)",
    f"Average DCG: {dcg_mean}",
    f"Std DCG: {dcg_std}",
    sep="\n",
)


GenSim Similarity (word2vec-google-news-300)
Average DCG: 2.4507202544319235
Std DCG: 1.043982208164629


In [13]:
# Rebuild the similarity matrix with the "fasttext-wiki-news-subwords-300" model name.
# Note: this rebinds gensim_calculator and similar_movies_result_gensim.
gensim_calculator = GensimCalculator(raw_corpus, "fasttext-wiki-news-subwords-300")
gensim_calculator.buildSimilarityMatrix()

# Calculate Similar Movie Result
# The loop bound is taken from raw_corpus, the corpus gensim_calculator was built
# from, rather than from an unrelated corpus variable that merely happens to have
# the same length.
similar_movies_result_gensim = {}
for i in range(len(raw_corpus)):
    # Negating the row makes argsort list the highest scores first; one extra
    # candidate is taken because the movie itself is filtered out next.
    similar_movies = (
        -gensim_calculator.sim_matrix[i]
    ).argsort()[:topK_movie + 1]
    similar_movies = [
        movie_id_list[j] for j in similar_movies if j != i
    ][:topK_movie]

    assert movie_id_list[i] not in similar_movies, "Movie ID in Similar Movies List"

    similar_movies_result_gensim[movie_id_list[i]] = similar_movies




In [14]:
# Score the fastText-based recommendations and report the mean and standard
# deviation of the values returned by the evaluator.
dcg_result = evaluator.evaluate(similar_movies_result_gensim)
dcg_mean = np.mean(dcg_result)
dcg_std = np.std(dcg_result)
print(
    "GenSim Similarity (fasttext-wiki-news-subwords-300)",
    f"Average DCG: {dcg_mean}",
    f"Std DCG: {dcg_std}",
    sep="\n",
)


GenSim Similarity (fasttext-wiki-news-subwords-300)
Average DCG: 2.196003871343459
Std DCG: 1.0621610054556159


In [10]:
# Rebuild the similarity matrix with the "glove-twitter-200" model name.
# Note: this rebinds gensim_calculator and similar_movies_result_gensim.
gensim_calculator = GensimCalculator(raw_corpus, "glove-twitter-200")
gensim_calculator.buildSimilarityMatrix()

# Calculate Similar Movie Result
# The loop bound is taken from raw_corpus, the corpus gensim_calculator was built
# from, rather than from an unrelated corpus variable that merely happens to have
# the same length.
similar_movies_result_gensim = {}
for i in range(len(raw_corpus)):
    # Negating the row makes argsort list the highest scores first; one extra
    # candidate is taken because the movie itself is filtered out next.
    similar_movies = (
        -gensim_calculator.sim_matrix[i]
    ).argsort()[:topK_movie + 1]
    similar_movies = [
        movie_id_list[j] for j in similar_movies if j != i
    ][:topK_movie]

    assert movie_id_list[i] not in similar_movies, "Movie ID in Similar Movies List"

    similar_movies_result_gensim[movie_id_list[i]] = similar_movies


In [11]:
# Score the GloVe-based recommendations and report the mean and standard
# deviation of the values returned by the evaluator.
dcg_result = evaluator.evaluate(similar_movies_result_gensim)
dcg_mean = np.mean(dcg_result)
dcg_std = np.std(dcg_result)
print(
    "GenSim Similarity (glove-twitter-200)",
    f"Average DCG: {dcg_mean}",
    f"Std DCG: {dcg_std}",
    sep="\n",
)


GenSim Similarity (glove-twitter-200)
Average DCG: 2.257351379313323
Std DCG: 1.062587634837154
