Import all the required packages for our code

In [1]:
import glob
import itertools
import shutil
from multiprocessing import Pool
from os import cpu_count
from os.path import exists

from evaluate_recommender import evaluate_recommender, generate_gt, map_id_to_name, parse_json
qual_eval_folder = './evaluation'

Test the basic recommender using different distance metrics, tf-idf methods and disabling/enabling feedback weighting

Distance metrics:
- Euclidean distance: `sqrt(sum((x - y)^2))`
- Cosine distance: $1-\frac{x \cdot y}{||x||_2||y||_2}$
- Manhattan distance: `sum(|x - y|)`
- Chebyshev distance: `max(|x - y|)` (listed for reference; it is not part of the `metrics` list evaluated in the cell below)

Tf-idf methods:
- No tf-idf
- default tf-idf: `tf(t, d) * [log [n/df(t)] + 1]`
- smoothed tf-idf: `tf(t, d) * [log [(1+n)/(1+df(t))] + 1]`
- sublinear tf-idf: `[1 + log(tf(t, d))] * [log [n/df(t)] + 1]`
- smoothed sublinear tf-idf: `[1 + log(tf(t, d))] * [log [(1+n)/(1+df(t))] + 1]`

Feedback weighting: when enabled, the feature vectors of items that were reviewed negatively (dislikes) are transformed to negative values.

In [2]:
# Generate the ground-truth file only when it is missing, so re-running the
# notebook reuses the existing parquet file.
gt_file = './data/ground_truth.parquet'
if not exists(gt_file):
    generate_gt(gt_file)

metrics = ['euclidean', 'cosine', 'manhattan']
tfidf = [None, 'default', 'smooth', 'sublinear', 'smooth_sublinear']
combinations = list(itertools.product(metrics, tfidf))

# Each run is a list of (metric, tf-idf method, flag) argument tuples.
# The flag is presumably the feedback-weighting switch -- confirm against
# evaluate_recommender's signature.
#   run 1: every metric / tf-idf pair with the flag off
#   run 2: cosine distance with every tf-idf method and the flag on
runs = [
    [(metric, tfidf_method, False) for metric, tfidf_method in combinations],
    [('cosine', tfidf_method, True) for tfidf_method in tfidf],
]

# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to one worker so min() never compares None with an int.
available_cpus = cpu_count() or 1

for configs in runs:
    # Never start more workers than there are configurations to evaluate.
    with Pool(min(available_cpus, len(configs))) as pool:
        results = [
            pool.apply_async(evaluate_recommender, args=(*config, qual_eval_folder))
            for config in configs
        ]
        # Collect inside the `with` block: leaving it terminates the pool.
        output = [p.get() for p in results]
    for result in output:
        # Format the line directly instead of emitting a '\b' control
        # character, whose rendering depends on the front end.
        print(f'{result[0]} {result[1]} {result[2]}: {result[3]}')

euclidean None False: {'nDCG@k': 0.21217929154761547, 'recall@k': 0.025599964095272373}
euclidean default False: {'nDCG@k': 0.2365792569708007, 'recall@k': 0.029979453947888618}
euclidean smooth False: {'nDCG@k': 0.23675069163099288, 'recall@k': 0.029998350833202816}
euclidean sublinear False: {'nDCG@k': 0.2365792569708007, 'recall@k': 0.029979453947888618}
euclidean smooth_sublinear False: {'nDCG@k': 0.23675069163099288, 'recall@k': 0.029998350833202816}
cosine None False: {'nDCG@k': 0.3416964808768842, 'recall@k': 0.04017819006225621}
cosine default False: {'nDCG@k': 0.34589429604608735, 'recall@k': 0.040467501225920244}
cosine smooth False: {'nDCG@k': 0.3460184542501682, 'recall@k': 0.04046616806914695}
cosine sublinear False: {'nDCG@k': 0.34589429604608735, 'recall@k': 0.040467501225920244}
cosine smooth_sublinear False: {'nDCG@k': 0.3460184542501682, 'recall@k': 0.04046616806914695}
manhattan None False: {'nDCG@k': 0.21218879058172857, 'recall@k': 0.025591013030650714}
manhattan d

(Optional) Create a .zip archive of the generated qualitative evaluation files, so that the qualitative evaluation results can be shared through GitHub.

This step can be skipped if the file `evaluation.zip` is already present.

In [12]:
# Pack the qualitative evaluation files into `evaluation.zip` and remove the
# unpacked originals afterwards.
# The step is guarded on the source folder existing: once the folder has been
# removed, shutil.rmtree would raise on the missing path, so without the guard
# the cell could not be re-run (e.g. via Restart & Run All).
source_dir = f'{qual_eval_folder}/source'
if exists(source_dir):
    shutil.make_archive(f'{qual_eval_folder}/evaluation', 'zip', source_dir)
    shutil.rmtree(source_dir)

The qualitative evaluation results in `evaluation.zip` are provided in terms of item ids.
In order to be able to interpret the results, the ids are mapped to application names through the following code.

In [3]:
# Extract the archived evaluation files into the evaluation folder.
shutil.unpack_archive(f'{qual_eval_folder}/evaluation.zip', qual_eval_folder)

# Build an id -> application name lookup from the games metadata.
games = parse_json("./data/steam_games.json")
games = games[['id', 'app_name']]
# Bracket access avoids any clash between column names and object attributes.
mapping = dict(zip(games['id'], games['app_name']))

# Apply the lookup to every extracted CSV file. Sorting makes the processing
# order deterministic, since glob returns files in an OS-dependent order.
# NOTE(review): map_id_to_name presumably rewrites each file in place -- confirm.
for f in sorted(glob.glob(f'{qual_eval_folder}/*.csv')):
    map_id_to_name(mapping, f)

32135it [00:01, 21678.42it/s]


Reading 32135 rows.
