This file implements all hyper-parameter optimization experiments. We try to select the ideal item representation, weighting scheme and dimensionality reduction technique based on the results of all possible combinations.
All results are saved to pickle files within the `evaluation` folder of the repository.
For visualizations of these results, we refer to `visualizations.ipynb`.

In [5]:
from src.recommenders import ImprovedRecommender
from sklearn.decomposition import TruncatedSVD, PCA, IncrementalPCA, KernelPCA, SparsePCA
import pandas as pd
import numpy as np
from os import cpu_count
from multiprocessing import Pool
from itertools import combinations
import pickle

In [6]:
# Output folder for evaluation artefacts.
# NOTE(review): the experiment cells visible in this notebook hard-code
# './evaluation/...' instead of reading this variable — confirm it is still needed.
qual_eval_folder = './evaluation'
# Items file, passed as the first positional argument to ImprovedRecommender.
items_path = "./data/games.pkl"
# Common prefix of the interaction split files; the experiment cells append
# "<split index>_train.parquet", "_test.parquet" and "_val.parquet" to it.
data_path = "./data/interactions_splits_"
# Reviews file, passed to ImprovedRecommender through its `reviews_path` keyword.
reviews_path = "./data/reviews.parquet"

In [4]:
def average_results(results):
    """Average a list of metric dictionaries key by key.

    The set of keys is taken from the first dictionary in ``results``; every
    other dictionary must provide at least those keys (extra keys are ignored).

    Args:
        results: non-empty sequence of dicts mapping metric name -> number.

    Returns:
        dict mapping each key of ``results[0]`` to the arithmetic mean of that
        key's values over all dictionaries.

    Raises:
        IndexError: if ``results`` is empty.
        KeyError: if a later dictionary lacks a key present in the first one.
    """
    run_count = len(results)
    return {
        metric: sum(run[metric] for run in results) / run_count
        for metric in results[0]
    }


Experiment for evaluating different item representations

In [12]:
# Optional metadata columns; every non-empty subset of these is evaluated below.
information = ['specs', 'genres', 'early_access', 'publisher', 'developer']


def test_representation(columns, split):
    """Evaluate one item representation on a single data split.

    Args:
        columns: list of item columns handed to ImprovedRecommender as ``columns``.
        split: path prefix of the split; "_train.parquet", "_test.parquet" and
            "_val.parquet" are appended to locate the three files.

    Returns:
        Whatever ``ImprovedRecommender.evaluate(k=10)`` returns (the printed
        outputs of this notebook show a dict of metrics such as 'HR@10').
    """
    recommender = ImprovedRecommender(
        items_path,
        train_path=f"{split}_train.parquet",
        test_path=f"{split}_test.parquet",
        val_path=f"{split}_val.parquet",
        reviews_path=reviews_path,
        sparse=True,
        tfidf='smooth',
        normalize=True,
        columns=columns,
        weighting_scheme={},  # empty weighting scheme for this experiment
    )
    recommender.generate_recommendations(read_max=5000)
    return recommender.evaluate(k=10)

# Every non-empty subset of `information` (smallest subsets first, in
# itertools.combinations order), each extended with the 'tags' column.
representations = [
    list(subset) + ['tags']
    for size in range(1, len(information) + 1)
    for subset in combinations(information, size)
]

# Path prefixes of the three data splits used for averaging.
split_prefixes = [data_path + str(i) for i in range(3)]

results = []
for columns in representations:
    split_results = [test_representation(columns, split) for split in split_prefixes]
    results.append(average_results(split_results))
    print(columns, results[-1])

# Persist the evaluated column sets and their averaged metrics side by side;
# both lists share the same order.
with open('./evaluation/representations.pickle', 'wb') as handle:
    pickle.dump(representations, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('./evaluation/representations_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

5000it [00:00, 8446.30it/s]
5000it [00:10, 472.60it/s]
5000it [00:00, 8531.39it/s]
5000it [00:10, 474.53it/s]
5000it [00:00, 7721.89it/s]
5000it [00:10, 467.30it/s]


['specs', 'tags'] {'HR@10': 0.1902, 'nDCG@10': 0.051539207908820474, 'recall@10': 0.06717211479669648, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.06939227513227514}


5000it [00:00, 8516.16it/s]
5000it [00:09, 518.66it/s]
5000it [00:00, 8410.25it/s]
5000it [00:09, 521.11it/s]
5000it [00:00, 8665.30it/s]
5000it [00:09, 524.44it/s]


['genres', 'tags'] {'HR@10': 0.19013333333333335, 'nDCG@10': 0.04837447216211648, 'recall@10': 0.06477183772303335, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.06733669312169312}


5000it [00:00, 8675.92it/s]
5000it [00:09, 512.81it/s]
5000it [00:00, 8757.40it/s]
5000it [00:09, 507.66it/s]
5000it [00:00, 8127.73it/s]
5000it [00:10, 495.57it/s]


['early_access', 'tags'] {'HR@10': 0.19053333333333333, 'nDCG@10': 0.04850597681774197, 'recall@10': 0.06500904155128008, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.06756915343915343}


5000it [00:01, 3238.20it/s]
5000it [00:13, 374.62it/s]
5000it [00:01, 3148.72it/s]
5000it [00:12, 393.92it/s]
5000it [00:01, 3215.19it/s]
5000it [00:12, 393.09it/s]


['publisher', 'tags'] {'HR@10': 0.2614666666666667, 'nDCG@10': 0.07419843603846196, 'recall@10': 0.0944219852957892, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09824222222222223}


5000it [00:02, 2470.06it/s]
5000it [00:12, 400.92it/s]
5000it [00:02, 2469.48it/s]
5000it [00:12, 390.10it/s]
5000it [00:02, 2412.35it/s]
5000it [00:12, 389.72it/s]


['developer', 'tags'] {'HR@10': 0.2484666666666667, 'nDCG@10': 0.06991308061813012, 'recall@10': 0.08976327941508815, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.0932865873015873}


5000it [00:00, 8246.44it/s]
5000it [00:10, 472.12it/s]
5000it [00:00, 8235.60it/s]
5000it [00:10, 475.89it/s]
5000it [00:00, 7942.81it/s]
5000it [00:11, 441.32it/s]


['specs', 'genres', 'tags'] {'HR@10': 0.19013333333333335, 'nDCG@10': 0.051512375330560335, 'recall@10': 0.06709598426997455, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.06932291005291005}


5000it [00:00, 8264.45it/s]
5000it [00:10, 472.08it/s]
5000it [00:00, 8441.87it/s]
5000it [00:10, 471.72it/s]
5000it [00:00, 8385.41it/s]
5000it [00:10, 478.96it/s]


['specs', 'early_access', 'tags'] {'HR@10': 0.1902, 'nDCG@10': 0.051539207908820474, 'recall@10': 0.06717211479669648, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.06939227513227514}


5000it [00:01, 3126.66it/s]
5000it [00:13, 372.61it/s]
5000it [00:01, 3204.06it/s]
5000it [00:13, 377.57it/s]
5000it [00:01, 3172.88it/s]
5000it [00:13, 362.35it/s]


['specs', 'publisher', 'tags'] {'HR@10': 0.26313333333333333, 'nDCG@10': 0.07651871451299042, 'recall@10': 0.09571626259231343, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09928719576719576}


5000it [00:02, 2450.83it/s]
5000it [00:13, 361.73it/s]
5000it [00:02, 2364.78it/s]
5000it [00:13, 366.94it/s]
5000it [00:02, 2424.01it/s]
5000it [00:14, 355.39it/s]


['specs', 'developer', 'tags'] {'HR@10': 0.25279999999999997, 'nDCG@10': 0.07369298590099534, 'recall@10': 0.09162933459030874, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09512396825396825}


5000it [00:00, 8582.96it/s]
5000it [00:09, 515.76it/s]
5000it [00:00, 8402.83it/s]
5000it [00:09, 515.35it/s]
5000it [00:00, 8625.39it/s]
5000it [00:09, 502.72it/s]


['genres', 'early_access', 'tags'] {'HR@10': 0.19013333333333335, 'nDCG@10': 0.04837447216211648, 'recall@10': 0.06477183772303335, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.06733669312169312}


5000it [00:01, 3235.87it/s]
5000it [00:12, 391.71it/s]
5000it [00:01, 3053.43it/s]
5000it [00:12, 397.01it/s]
5000it [00:01, 3092.27it/s]
5000it [00:12, 386.27it/s]


['genres', 'publisher', 'tags'] {'HR@10': 0.26120000000000004, 'nDCG@10': 0.07417781192556498, 'recall@10': 0.094320090913855, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09813222222222222}


5000it [00:02, 2472.80it/s]
5000it [00:12, 397.85it/s]
5000it [00:02, 2443.52it/s]
5000it [00:12, 393.88it/s]
5000it [00:02, 2407.66it/s]
5000it [00:12, 387.72it/s]


['genres', 'developer', 'tags'] {'HR@10': 0.24793333333333334, 'nDCG@10': 0.06985427355294115, 'recall@10': 0.08965999632030504, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09318949735449734}


5000it [00:01, 3159.24it/s]
5000it [00:12, 392.94it/s]
5000it [00:01, 3188.85it/s]
5000it [00:12, 397.62it/s]
5000it [00:01, 3137.64it/s]
5000it [00:12, 392.00it/s]


['early_access', 'publisher', 'tags'] {'HR@10': 0.2614666666666667, 'nDCG@10': 0.07419843603846196, 'recall@10': 0.0944219852957892, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09824222222222223}


5000it [00:02, 2443.28it/s]
5000it [00:12, 386.36it/s]
5000it [00:02, 2465.74it/s]
5000it [00:12, 389.74it/s]
5000it [00:02, 2324.17it/s]
5000it [00:13, 384.59it/s]


['early_access', 'developer', 'tags'] {'HR@10': 0.2484666666666667, 'nDCG@10': 0.06991308061813012, 'recall@10': 0.08976327941508815, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.0932865873015873}


5000it [00:02, 2134.18it/s]
5000it [00:13, 372.11it/s]
5000it [00:02, 2140.26it/s]
5000it [00:13, 383.15it/s]
5000it [00:02, 2165.01it/s]
5000it [00:13, 374.47it/s]


['publisher', 'developer', 'tags'] {'HR@10': 0.2594666666666667, 'nDCG@10': 0.07601606741859933, 'recall@10': 0.09694517402355, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.10029952380952382}


5000it [00:00, 8221.29it/s]
5000it [00:10, 476.53it/s]
5000it [00:00, 8314.57it/s]
5000it [00:10, 467.58it/s]
5000it [00:00, 8223.52it/s]
5000it [00:10, 467.84it/s]


['specs', 'genres', 'early_access', 'tags'] {'HR@10': 0.19013333333333335, 'nDCG@10': 0.051512375330560335, 'recall@10': 0.06709598426997455, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.06932291005291005}


5000it [00:01, 3147.01it/s]
5000it [00:13, 362.60it/s]
5000it [00:01, 3078.46it/s]
5000it [00:13, 365.61it/s]
5000it [00:01, 3055.79it/s]
5000it [00:13, 358.59it/s]


['specs', 'genres', 'publisher', 'tags'] {'HR@10': 0.2630666666666666, 'nDCG@10': 0.07650533060930705, 'recall@10': 0.09571198603553689, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09927878306878307}


5000it [00:02, 2336.47it/s]
5000it [00:14, 355.91it/s]
5000it [00:02, 2252.43it/s]
5000it [00:13, 358.10it/s]
5000it [00:02, 2454.53it/s]
5000it [00:13, 364.31it/s]


['specs', 'genres', 'developer', 'tags'] {'HR@10': 0.25286666666666663, 'nDCG@10': 0.07365126912390114, 'recall@10': 0.09158171345928645, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09508650793650793}


5000it [00:01, 3112.44it/s]
5000it [00:13, 364.38it/s]
5000it [00:01, 3147.18it/s]
5000it [00:13, 370.43it/s]
5000it [00:01, 3140.68it/s]
5000it [00:13, 360.59it/s]


['specs', 'early_access', 'publisher', 'tags'] {'HR@10': 0.26313333333333333, 'nDCG@10': 0.07651871451299042, 'recall@10': 0.09571626259231343, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09928719576719576}


5000it [00:02, 2400.17it/s]
5000it [00:14, 355.94it/s]
5000it [00:02, 2357.10it/s]
5000it [00:13, 358.32it/s]
5000it [00:02, 2381.58it/s]
5000it [00:14, 357.09it/s]


['specs', 'early_access', 'developer', 'tags'] {'HR@10': 0.25279999999999997, 'nDCG@10': 0.07369298590099534, 'recall@10': 0.09162933459030874, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09512396825396825}


5000it [00:02, 2164.20it/s]
5000it [00:14, 344.91it/s]
5000it [00:02, 2158.60it/s]
5000it [00:14, 341.06it/s]
5000it [00:02, 2179.22it/s]
5000it [00:14, 356.25it/s]


['specs', 'publisher', 'developer', 'tags'] {'HR@10': 0.26366666666666666, 'nDCG@10': 0.0792388269982653, 'recall@10': 0.09852182418310478, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.1018154232804233}


5000it [00:01, 3151.97it/s]
5000it [00:12, 394.82it/s]
5000it [00:01, 3171.09it/s]
5000it [00:12, 392.66it/s]
5000it [00:01, 3140.57it/s]
5000it [00:12, 394.65it/s]


['genres', 'early_access', 'publisher', 'tags'] {'HR@10': 0.26120000000000004, 'nDCG@10': 0.07417781192556498, 'recall@10': 0.094320090913855, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09813222222222222}


5000it [00:02, 2401.02it/s]
5000it [00:12, 391.11it/s]
5000it [00:02, 2439.21it/s]
5000it [00:12, 387.98it/s]
5000it [00:02, 2350.88it/s]
5000it [00:13, 383.48it/s]


['genres', 'early_access', 'developer', 'tags'] {'HR@10': 0.24793333333333334, 'nDCG@10': 0.06985427355294115, 'recall@10': 0.08965999632030504, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09318949735449734}


5000it [00:02, 2179.56it/s]
5000it [00:13, 378.84it/s]
5000it [00:02, 2215.09it/s]
5000it [00:12, 392.17it/s]
5000it [00:02, 2215.62it/s]
5000it [00:13, 374.54it/s]


['genres', 'publisher', 'developer', 'tags'] {'HR@10': 0.259, 'nDCG@10': 0.07588120370769746, 'recall@10': 0.09660555468863653, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09995698412698413}


5000it [00:02, 2155.40it/s]
5000it [00:13, 371.63it/s]
5000it [00:02, 2192.12it/s]
5000it [00:13, 384.09it/s]
5000it [00:02, 2190.60it/s]
5000it [00:13, 384.37it/s]


['early_access', 'publisher', 'developer', 'tags'] {'HR@10': 0.2594666666666667, 'nDCG@10': 0.07601606741859933, 'recall@10': 0.09694517402355, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.10029952380952382}


5000it [00:01, 3124.37it/s]
5000it [00:13, 374.15it/s]
5000it [00:01, 3122.54it/s]
5000it [00:13, 374.98it/s]
5000it [00:01, 3118.81it/s]
5000it [00:13, 365.13it/s]


['specs', 'genres', 'early_access', 'publisher', 'tags'] {'HR@10': 0.2630666666666666, 'nDCG@10': 0.07650533060930705, 'recall@10': 0.09571198603553689, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09927878306878307}


5000it [00:02, 2272.72it/s]
5000it [00:13, 357.65it/s]
5000it [00:02, 2352.38it/s]
5000it [00:13, 360.18it/s]
5000it [00:02, 2412.98it/s]
5000it [00:13, 363.06it/s]


['specs', 'genres', 'early_access', 'developer', 'tags'] {'HR@10': 0.25286666666666663, 'nDCG@10': 0.07365126912390114, 'recall@10': 0.09158171345928645, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09508650793650793}


5000it [00:02, 2100.06it/s]
5000it [00:14, 351.00it/s]
5000it [00:02, 2147.95it/s]
5000it [00:14, 344.43it/s]
5000it [00:02, 2121.04it/s]
5000it [00:14, 340.63it/s]


['specs', 'genres', 'publisher', 'developer', 'tags'] {'HR@10': 0.26286666666666664, 'nDCG@10': 0.0790052078761835, 'recall@10': 0.09815592627909164, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.10144746031746033}


5000it [00:02, 2154.34it/s]
5000it [00:14, 350.59it/s]
5000it [00:02, 2116.92it/s]
5000it [00:14, 346.81it/s]
5000it [00:02, 2114.54it/s]
5000it [00:14, 343.76it/s]


['specs', 'early_access', 'publisher', 'developer', 'tags'] {'HR@10': 0.26366666666666666, 'nDCG@10': 0.0792388269982653, 'recall@10': 0.09852182418310478, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.1018154232804233}


5000it [00:02, 2157.93it/s]
5000it [00:13, 381.56it/s]
5000it [00:02, 2210.30it/s]
5000it [00:12, 390.10it/s]
5000it [00:02, 2169.08it/s]
5000it [00:13, 382.99it/s]


['genres', 'early_access', 'publisher', 'developer', 'tags'] {'HR@10': 0.259, 'nDCG@10': 0.07588120370769746, 'recall@10': 0.09660555468863653, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.09995698412698413}


5000it [00:02, 2121.00it/s]
5000it [00:14, 349.93it/s]
5000it [00:02, 2077.72it/s]
5000it [00:14, 349.85it/s]
5000it [00:02, 2104.36it/s]
5000it [00:14, 344.27it/s]


['specs', 'genres', 'early_access', 'publisher', 'developer', 'tags'] {'HR@10': 0.26286666666666664, 'nDCG@10': 0.0790052078761835, 'recall@10': 0.09815592627909164, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.10144746031746033}


Experiment for evaluating different weighting schemes

In [8]:
weighting_schemes = [{'playtime': playtime, 'sentiment': sentiment, 'reviews': reviews} for playtime in [True, False] for sentiment in ['rating', 'n_reviews', 'mixed', False] for reviews in [True, False]]
def test_weighting_scheme(rec, weighting_scheme):
    rec.set_weighting_scheme(weighting_scheme)
    rec.generate_recommendations(read_max=5000, silence=True)
    return rec.evaluate(k=10)

# Build one recommender per split up front; each is reused for every weighting
# scheme via test_weighting_scheme, which only swaps the scheme.
recs = []
for split in [data_path + str(i) for i in range(3)]:
    recs.append(
        ImprovedRecommender(
            items_path,
            train_path=f"{split}_train.parquet",
            test_path=f"{split}_test.parquet",
            val_path=f"{split}_val.parquet",
            reviews_path=reviews_path,
            sparse=True,
            tfidf='smooth',
            normalize=True,
        )
    )

results = []
for weighting_scheme in weighting_schemes:
    split_results = [test_weighting_scheme(rec, weighting_scheme) for rec in recs]
    results.append(average_results(split_results))
    print(weighting_scheme, results[-1])

# Persist the schemes and their averaged metrics; both lists share the same order.
with open('./evaluation/weighting_schemes.pickle', 'wb') as handle:
    pickle.dump(weighting_schemes, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('./evaluation/weighting_schemes_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

{'playtime': True, 'sentiment': 'rating', 'reviews': True} {'HR@10': 0.215, 'nDCG@10': 0.059321776717981445, 'recall@10': 0.07618113972795272, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.07930021164021163}
{'playtime': True, 'sentiment': 'rating', 'reviews': False} {'HR@10': 0.215, 'nDCG@10': 0.05934824475332367, 'recall@10': 0.07626506438449554, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.07936087301587301}
{'playtime': True, 'sentiment': 'n_reviews', 'reviews': True} {'HR@10': 0.21153333333333335, 'nDCG@10': 0.05810909331975015, 'recall@10': 0.07449825934288308, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.07763301587301587}
{'playtime': True, 'sentiment': 'n_reviews', 'reviews': False} {'HR@10': 0.21206666666666665, 'nDCG@10': 0.05838593333343447, 'recall@10': 0.0750858663085082, 'ideal_recall@10': 0.9429034701783022, 'nRecall@10': 0.07819534391534393}
{'playtime': True, 'sentiment': 'mixed', 'reviews': True} {'HR@10': 0.20593333333333333, 'nDCG@10': 

Experiment for evaluating different dimensionality reduction techniques

In [4]:
# Candidate reducers, all projecting to 300 components.
dim_reds = [
    TruncatedSVD(n_components=300, random_state=500),
    PCA(n_components=300, random_state=500, whiten=False),
    IncrementalPCA(n_components=300, whiten=False),
    KernelPCA(n_components=300),
]
# Human-readable names, positionally aligned with `dim_reds`.
labels = [
    "TruncatedSVD(n_components=300)",
    "PCA(n_components=300)",
    "IncrementalPCA(n_components=300)",
    "KernelPCA(n_components=300)",
]
# use_data = ['specs', 'genres', 'tags', 'early_access', 'publisher', 'developer']
results = []
for label, dim_red in zip(labels, dim_reds):
    split_results = []
    for split_index in range(3):
        split = data_path + str(split_index)
        rec = ImprovedRecommender(
            items_path,
            train_path=f"{split}_train.parquet",
            test_path=f"{split}_test.parquet",
            val_path=f"{split}_val.parquet",
            reviews_path=reviews_path,
            sparse=False,
            tfidf='smooth',
            normalize=True,
            dim_red=dim_red,
        )
        rec.generate_recommendations(read_max=5000, silence=True)
        split_results.append(rec.evaluate(k=10))
    results.append(average_results(split_results))
    print(label, results[-1])

# Persist the labels and their averaged metrics; both lists share the same order.
with open('./evaluation/dim_reduction.pickle', 'wb') as handle:
    pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('./evaluation/dim_reduction_results.pickle', 'wb') as handle:
    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)

5000it [00:00, 40143.87it/s]
5000it [02:00, 41.33it/s]
5000it [00:00, 34598.76it/s]
5000it [02:10, 38.31it/s]
5000it [00:00, 36323.63it/s]
5000it [02:01, 41.21it/s]
5000it [00:00, 32679.66it/s]
5000it [01:52, 44.61it/s]
5000it [00:00, 35696.63it/s]
5000it [02:06, 39.65it/s]
5000it [00:00, 33774.15it/s]
5000it [01:52, 44.53it/s]
5000it [00:00, 35954.40it/s]
5000it [02:01, 41.25it/s]
5000it [00:00, 30298.05it/s]
5000it [02:01, 41.30it/s]
5000it [00:00, 29749.96it/s]
5000it [02:09, 38.74it/s]
5000it [00:00, 26316.14it/s]
5000it [02:09, 38.50it/s]
5000it [00:00, 25895.40it/s]
5000it [02:05, 39.77it/s]
5000it [00:00, 23364.64it/s]
5000it [02:18, 36.20it/s]


Experiment for sparse PCA. This technique takes a very long time to produce the reduced item representations, which makes it impractical for our use case.

In [4]:
%%time
# Single run on split 0 only; the cell is timed by the %%time magic above.
dim_red = SparsePCA(n_components=300, n_jobs=8)
rec = ImprovedRecommender(
    items_path,
    train_path=f"{data_path}0_train.parquet",
    test_path=f"{data_path}0_test.parquet",
    val_path=f"{data_path}0_val.parquet",
    reviews_path=reviews_path,
    sparse=False,
    tfidf='smooth',
    normalize=True,
    dim_red=dim_red,
)
rec.generate_recommendations(read_max=5000)  # read_max=1000
# Last expression of the cell, so the metrics are shown as the cell output.
rec.evaluate(k=10)

5000it [00:00, 22409.64it/s]
5000it [03:02, 27.39it/s]


Wall time: 16min 37s


{'HR@10': 0.2302,
 'nDCG@10': 0.05956175198640447,
 'recall@10': 0.0714757177844029,
 'ideal_recall@10': 0.9429034701783021,
 'nRecall@10': 0.07638436507936508}

In [3]:
# Configuration for the PCA experiments below.
# NOTE: this cell rebinds `items_path`, `reviews_path` and `data_path`, which an
# earlier cell pointed at "./data/"; any cell run after this one reads "./data/v3/".

# Number of data splits iterated over (used as range(am_splits)).
am_splits = 5
# Passed to generate_recommendations as `amount`.
am_recommended = 20
# Cut-offs passed one by one to rec.evaluate(k=...).
k_values = [5, 10, 20]
data_basepath = "./data/v3/"
# Folder prefix under which the PCA experiment cells save their pickles.
quant_output_path = "./evaluation/v3/quantitative/"
# NOTE(review): not referenced by the cells visible in this notebook — confirm use.
qual_output_path = "./evaluation/v3/qualitative/"
items_path = data_basepath + "games.pkl.gz"
reviews_path = data_basepath + "reviews.parquet"
# Common prefix of the split files; cells append "<split>_train.parquet", etc.
data_path = data_basepath + "interactions_splits_"

In [1]:
def save_pickle(save_path: str, data:object) -> None:
    """Serialize ``data`` to ``save_path`` with the highest pickle protocol.

    The file is opened in plain binary write mode, so an existing file is
    overwritten and no compression is applied, whatever the file extension.

    Args:
        save_path: destination file path.
        data: any picklable object.
    """
    with open(save_path, "wb") as sink:
        pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL).dump(data)

In [4]:
def invert_dict(dicts:list) -> dict:
    """Turn a list of dicts into a dict of NumPy arrays, one array per key.

    For every key seen in any dictionary, the result holds a float array of
    length ``len(dicts)`` whose i-th entry is ``dicts[i][key]``. Positions
    whose dictionary lacks the key keep the initial value 0.

    Args:
        dicts: list of dictionaries with numeric values.

    Returns:
        dict mapping key -> ``np.ndarray`` of shape ``(len(dicts),)``; keys
        appear in order of first occurrence. Empty input yields ``{}``.
    """
    n_dicts = len(dicts)
    inverted = {}
    for position, entry in enumerate(dicts):
        for key, value in entry.items():
            # Allocate the zero-filled column the first time a key shows up.
            if key not in inverted:
                inverted[key] = np.zeros(n_dicts)
            inverted[key][position] = value
    return inverted

Experiment for PCA with different output sizes

In [5]:
# PCA grid: four output sizes, each without and then with whitening, in the
# same order as the entries of `labels`.
dim_reds = [
    PCA(n_components=n_components, random_state=500, whiten=whiten)
    for n_components in (200, 300, 400, 500)
    for whiten in (False, True)
]
labels = [
    "PCA nc=200", "PCA nc=200 whitened",
    "PCA nc=300", "PCA nc=300 whitened",
    "PCA nc=400", "PCA nc=400 Whitened",
    "PCA nc=500", "PCA nc=500 Whitened"]
# use_data = ['specs', 'genres', 'tags', 'early_access', 'publisher', 'developer']
results = []
for dim_red in dim_reds:
    evaluations = []
    for split in range(am_splits):
        rec = ImprovedRecommender(
            items_path,
            train_path=f"{data_path}{split}_train.parquet",
            test_path=f"{data_path}{split}_test.parquet",
            val_path=f"{data_path}{split}_val.parquet",
            reviews_path=reviews_path,
            sparse=False,
            tfidf="smooth",
            normalize=True,
            dim_red=dim_red,
        )
        rec.generate_recommendations(amount=am_recommended, read_max=5000, silence=True)
        # Merge the metric dicts of every cut-off into a single dict for this split.
        evaluation = {}
        for k in k_values:
            evaluation.update(rec.evaluate(val=False, k=k))
        evaluations.append(evaluation)
    # Regroup the per-split dicts into one array per metric (index = split).
    evaluations = invert_dict(evaluations)
    results.append(evaluations)


# Labels and results share the same order.
save_pickle(quant_output_path + f"ImprovedRec_PCA.pickle.gz", labels)
save_pickle(quant_output_path + f"ImprovedRec_PCA_results.pickle.gz", results)

results

[{'HR@5': array([0.1804, 0.1784, 0.1802, 0.1804, 0.178 ]),
  'nDCG@5': array([0.06248786, 0.0625275 , 0.06613434, 0.06455479, 0.06314145]),
  'recall@5': array([0.05730968, 0.05721413, 0.0615682 , 0.06079072, 0.05967371]),
  'ideal_recall@5': array([0.85512594, 0.85512594, 0.85512594, 0.85512594, 0.85512594]),
  'nRecall@5': array([0.06974667, 0.06860333, 0.07293   , 0.07199667, 0.07113667]),
  'HR@10': array([0.251 , 0.2562, 0.2562, 0.2592, 0.259 ]),
  'nDCG@10': array([0.0683486 , 0.06924959, 0.07303678, 0.07240726, 0.07144234]),
  'recall@10': array([0.08292479, 0.08263495, 0.0876824 , 0.08832863, 0.08789034]),
  'ideal_recall@10': array([0.94290347, 0.94290347, 0.94290347, 0.94290347, 0.94290347]),
  'nRecall@10': array([0.0879931 , 0.08743421, 0.0924881 , 0.09309143, 0.09290341]),
  'HR@20': array([0.355 , 0.3538, 0.3594, 0.3538, 0.3564]),
  'nDCG@20': array([0.08099801, 0.08141371, 0.08575252, 0.08462381, 0.083721  ]),
  'recall@20': array([0.12417113, 0.12181307, 0.12956648, 0.1

Experiment for PCA with different output sizes, using all data that is available in the Steam data set for the original item representations.

In [7]:
# PCA grid: four output sizes, each without and then with whitening, in the
# same order as the entries of `labels`.
dim_reds = [
    PCA(n_components=n_components, random_state=500, whiten=whiten)
    for n_components in (200, 300, 400, 500)
    for whiten in (False, True)
]
labels = [
    "PCA AllData nc=200", "PCA AllData nc=200 whitened",
    "PCA AllData nc=300", "PCA AllData nc=300 whitened",
    "PCA AllData nc=400", "PCA AllData nc=400 Whitened",
    "PCA AllData nc=500", "PCA AllData nc=500 Whitened"]
# Item columns handed to the recommender for this run.
columns = ['specs', 'genres', 'tags', 'early_access', 'publisher', 'developer']
results = []
for dim_red in dim_reds:
    evaluations = []
    for split in range(am_splits):
        rec = ImprovedRecommender(
            items_path,
            train_path=f"{data_path}{split}_train.parquet",
            test_path=f"{data_path}{split}_test.parquet",
            val_path=f"{data_path}{split}_val.parquet",
            reviews_path=reviews_path,
            sparse=False,
            tfidf="smooth",
            normalize=True,
            dim_red=dim_red,
            columns=columns,
        )
        rec.generate_recommendations(amount=am_recommended, read_max=5000, silence=True)
        # Merge the metric dicts of every cut-off into a single dict for this split.
        evaluation = {}
        for k in k_values:
            evaluation.update(rec.evaluate(val=False, k=k))
        evaluations.append(evaluation)
    # Regroup the per-split dicts into one array per metric (index = split).
    evaluations = invert_dict(evaluations)
    results.append(evaluations)


# Labels and results share the same order.
save_pickle(quant_output_path + f"ImprovedRec_PCA_alldata.pickle.gz", labels)
save_pickle(quant_output_path + f"ImprovedRec_PCA_alldata_results.pickle.gz", results)

results

[{'HR@5': array([0.1778, 0.1794, 0.1828, 0.1782, 0.1776]),
  'nDCG@5': array([0.06175334, 0.06231263, 0.06651293, 0.06353535, 0.06281053]),
  'recall@5': array([0.05691302, 0.05659321, 0.06178801, 0.05997881, 0.05909648]),
  'ideal_recall@5': array([0.85512594, 0.85512594, 0.85512594, 0.85512594, 0.85512594]),
  'nRecall@5': array([0.06903667, 0.06821667, 0.07322   , 0.07114   , 0.07072   ]),
  'HR@10': array([0.2502, 0.2552, 0.2562, 0.2584, 0.2574]),
  'nDCG@10': array([0.06762756, 0.06891715, 0.07299892, 0.07153927, 0.07082147]),
  'recall@10': array([0.08190123, 0.08194935, 0.08747443, 0.08758985, 0.08700981]),
  'ideal_recall@10': array([0.94290347, 0.94290347, 0.94290347, 0.94290347, 0.94290347]),
  'nRecall@10': array([0.0868973 , 0.08680643, 0.09217246, 0.09238468, 0.09198738]),
  'HR@20': array([0.354 , 0.3492, 0.36  , 0.3548, 0.3578]),
  'nDCG@20': array([0.08070514, 0.08069851, 0.08571034, 0.08418825, 0.08346659]),
  'recall@20': array([0.12426492, 0.12046479, 0.12880275, 0.1