In [1]:
from src.recommenders import ImprovedRecommender, PopBasedRecommender
from sklearn.decomposition import TruncatedSVD, PCA, IncrementalPCA, KernelPCA, SparsePCA
import pandas as pd
import re
import numpy as np

In [2]:
# Locations of the input data and of the evaluation output used by this notebook.
# `data_path` is only a prefix: the split suffix ("_train.parquet", ...) is appended where it is used.
data_path = "./data/interactions_splits_0"
items_path = "./data/games.pkl"
reviews_path = "./data/reviews.parquet"
qual_eval_folder = './evaluation'

Building on the results of the base recommender, we keep its best-performing combination of methods as the baseline for the further improvements below.
Concretely, this means we use cosine distance and normalized feature vectors, and we do not use feedback weighting.

The following cells run the basic recommender with a more in-depth evaluation and compare it to a popularity-based recommender.

In [3]:
# Popularity-based recommender, used as the reference point for the runs below.
rec = PopBasedRecommender(
    train_path=f"{data_path}_train.parquet",
    test_path=f"{data_path}_test.parquet",
    val_path=f"{data_path}_val.parquet",
)
rec.generate_recommendations()  # optional: read_max=1000
rec.evaluate(k=10)

{'HR@10': 0.2080642184904964,
 'nDCG@10': 0.042073496428109126,
 'recall@10': 0.05927747669954759,
 'ideal_recall@10': 0.9450448802719371,
 'nRecall@10': 0.06391801919759106}

In [3]:
# Content-based baseline on sparse, smoothed TF-IDF features with normalization.
# No dimensionality reduction is applied in this run, so `dim_red` is set to None
# explicitly (instead of building a reducer that is never handed to the recommender)
# and is passed through by name like in the other experiment cells.
dim_red = None
use_data = ['specs', 'genres', 'tags', 'early_access', 'publisher']
rec = ImprovedRecommender(
    items_path,
    train_path=f"{data_path}_train.parquet",
    test_path=f"{data_path}_test.parquet",
    val_path=f"{data_path}_val.parquet",
    reviews_path=reviews_path,
    sparse=True,
    tfidf='smooth',
    normalize=True,
    dim_red=dim_red,
    columns=use_data,
)
rec.generate_recommendations()  # optional: read_max=1000
rec.evaluate(k=10)

54190it [01:08, 794.68it/s] 
54190it [01:58, 457.15it/s] 


{'HR@10': 0.25630190071969,
 'nDCG@10': 0.07477908640105389,
 'recall@10': 0.09310825248820044,
 'ideal_recall@10': 0.9450448802719371,
 'nRecall@10': 0.0965415630482986}

In [4]:
# Same feature columns and no dimensionality reduction, with the run capped via read_max=1000.
use_data = ['specs', 'genres', 'tags', 'early_access', 'publisher']
dim_red = None
rec = ImprovedRecommender(
    items_path,
    train_path=f"{data_path}_train.parquet",
    test_path=f"{data_path}_test.parquet",
    val_path=f"{data_path}_val.parquet",
    reviews_path=reviews_path,
    sparse=True,
    tfidf='smooth',
    normalize=True,
    dim_red=dim_red,
    columns=use_data,
)
rec.generate_recommendations(read_max=1000)
rec.evaluate(k=10)

1000it [00:04, 221.86it/s]
1000it [00:02, 335.11it/s]


{'HR@10': 0.486,
 'nDCG@10': 0.08197619145181916,
 'recall@10': 0.015611512208878174,
 'ideal_recall@10': 0.210713794256582,
 'nRecall@10': 0.07659999999999999}

In [5]:
# TruncatedSVD with 300 components (seeded), dense features, and 'developer'
# added to the feature columns; run capped via read_max=1000.
use_data = ['specs', 'genres', 'tags', 'early_access', 'publisher', 'developer']
dim_red = TruncatedSVD(n_components=300, random_state=500)
rec = ImprovedRecommender(
    items_path,
    train_path=f"{data_path}_train.parquet",
    test_path=f"{data_path}_test.parquet",
    val_path=f"{data_path}_val.parquet",
    reviews_path=reviews_path,
    sparse=False,
    tfidf='smooth',
    normalize=True,
    dim_red=dim_red,
    columns=use_data,
)
rec.generate_recommendations(read_max=1000)
rec.evaluate(k=10)

1000it [00:01, 814.92it/s]
1000it [00:15, 64.90it/s]


{'HR@10': 0.388,
 'nDCG@10': 0.053807912634474465,
 'recall@10': 0.011021867277832427,
 'ideal_recall@10': 0.210713794256582,
 'nRecall@10': 0.0535}

In [6]:
# PCA with 300 components (seeded, no whitening) on dense features.
# Unlike the neighbouring experiments, no read_max is passed to generate_recommendations here.
use_data = ['specs', 'genres', 'tags', 'early_access', 'publisher', 'developer']
dim_red = PCA(n_components=300, random_state=500, whiten=False)
rec = ImprovedRecommender(
    items_path,
    train_path=f"{data_path}_train.parquet",
    test_path=f"{data_path}_test.parquet",
    val_path=f"{data_path}_val.parquet",
    reviews_path=reviews_path,
    sparse=False,
    tfidf='smooth',
    normalize=True,
    dim_red=dim_red,
    columns=use_data,
)
rec.generate_recommendations()
rec.evaluate(k=10)

54190it [00:49, 1094.89it/s]
54190it [14:49, 60.93it/s] 


{'HR@10': 0.2589038568001476,
 'nDCG@10': 0.07147831237560676,
 'recall@10': 0.08660981878035039,
 'ideal_recall@10': 0.9450448802719371,
 'nRecall@10': 0.0914564861436978}

In [8]:
# IncrementalPCA with 300 components (no whitening) on dense features; run capped via read_max=1000.
use_data = ['specs', 'genres', 'tags', 'early_access', 'publisher', 'developer']
dim_red = IncrementalPCA(n_components=300, whiten=False)
rec = ImprovedRecommender(
    items_path,
    train_path=f"{data_path}_train.parquet",
    test_path=f"{data_path}_test.parquet",
    val_path=f"{data_path}_val.parquet",
    reviews_path=reviews_path,
    sparse=False,
    tfidf='smooth',
    normalize=True,
    dim_red=dim_red,
    columns=use_data,
)
rec.generate_recommendations(read_max=1000)
rec.evaluate(k=10)

1000it [00:01, 596.60it/s]
1000it [00:22, 43.53it/s]


{'HR@10': 0.498,
 'nDCG@10': 0.0860154142835861,
 'recall@10': 0.015818049384715562,
 'ideal_recall@10': 0.210713794256582,
 'nRecall@10': 0.07840000000000001}

In [9]:
# KernelPCA with 300 components (library defaults otherwise) on dense features;
# run capped via read_max=1000.
use_data = ['specs', 'genres', 'tags', 'early_access', 'publisher', 'developer']
dim_red = KernelPCA(n_components=300)
rec = ImprovedRecommender(
    items_path,
    train_path=f"{data_path}_train.parquet",
    test_path=f"{data_path}_test.parquet",
    val_path=f"{data_path}_val.parquet",
    reviews_path=reviews_path,
    sparse=False,
    tfidf='smooth',
    normalize=True,
    dim_red=dim_red,
    columns=use_data,
)
rec.generate_recommendations(read_max=1000)
rec.evaluate(k=10)

1000it [00:01, 588.01it/s]
1000it [00:24, 40.95it/s]


{'HR@10': 0.498,
 'nDCG@10': 0.08612152746657255,
 'recall@10': 0.015818049384715562,
 'ideal_recall@10': 0.210713794256582,
 'nRecall@10': 0.07840000000000001}

In [10]:
# SparsePCA with 300 components on dense features; run capped via read_max=1000.
# The reducer is seeded with the same random_state as the TruncatedSVD and PCA
# experiments so that a re-run of this cell is reproducible.
# NOTE(review): the recorded execution of this cell ended in a KeyboardInterrupt,
# so expect a long runtime.
dim_red = SparsePCA(n_components=300, n_jobs=8, random_state=500)
use_data = ['specs', 'genres', 'tags', 'early_access', 'publisher', 'developer']
rec = ImprovedRecommender(
    items_path,
    train_path=f"{data_path}_train.parquet",
    test_path=f"{data_path}_test.parquet",
    val_path=f"{data_path}_val.parquet",
    reviews_path=reviews_path,
    sparse=False,
    tfidf='smooth',
    normalize=True,
    dim_red=dim_red,
    columns=use_data,
)
rec.generate_recommendations(read_max=1000)
rec.evaluate(k=10)

KeyboardInterrupt: 

In [5]:
def _parse_price(raw_price):
    """Convert a raw price entry to ``np.float32``; anything non-numeric becomes 0.

    Only values whose *entire* string form is a plain non-negative decimal
    number (e.g. ``"9.99"``, ``14.99``, ``"5"``) are converted. Using
    ``re.fullmatch`` with an escaped dot prevents strings that merely start
    with a digit from passing the check and then failing inside ``np.float32``.
    """
    if re.fullmatch(r"\d+(?:\.\d+)?", str(raw_price)):
        return np.float32(raw_price)
    return 0


def _normalize_tokens(labels):
    """Lower-case each label and drop every character outside ``[a-z0-9]``."""
    return [re.sub(r"[^a-z0-9]", "", label.lower()) for label in labels]


# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
items = pd.read_pickle(items_path)
items["price"] = items["price"].apply(_parse_price)
# The string "NA" marks a missing metascore; store it as a real NaN instead.
items["metascore"] = items["metascore"].apply(lambda m: m if m != "NA" else np.nan)
# Missing names become '' before splitting. The result is assigned back rather than
# filled in place, because an in-place fillna on a column selection is not guaranteed
# to reach the parent frame (and never does under pandas Copy-on-Write).
for column in ("developer", "publisher"):
    items[column] = items[column].fillna('').apply(lambda names: names.lower().split(','))
# Encode the early-access flag as a (possibly empty) token list like the other columns.
items["early_access"] = items["early_access"].apply(lambda x: ["earlyaccess"] if x else [])
for column in ("specs", "tags", "genres"):
    items[column] = items[column].apply(_normalize_tokens)

items

Unnamed: 0,publisher,genres,app_name,release_date,tags,discount_price,specs,price,early_access,id,developer,sentiment,metascore,users_count
0,[valve],[action],Counter-Strike: Global Offensive,2012-08-21,"[fps, multiplayer, shooter, action, teambased,...",,"[multiplayer, steamachievements, fullcontrolle...",14.99,[],730,[valve],Very Positive,83.0,42620
1,[valve],"[indie, simulation]",Garry's Mod,2006-11-29,"[sandbox, multiplayer, funny, moddable, buildi...",,"[singleplayer, multiplayer, coop, crossplatfor...",9.99,[],4000,[facepunch studios],Overwhelmingly Positive,,42157
2,[smartly dressed games],"[action, adventure, casual, freetoplay, indie]",Unturned,2017-07-07,"[freetoplay, survival, zombies, multiplayer, o...",,"[singleplayer, onlinemultiplayer, onlinecoop, ...",0.00,[],304930,[smartly dressed games],Very Positive,,37654
3,[valve],[action],Left 4 Dead 2,2009-11-16,"[zombies, coop, fps, multiplayer, action, onli...",,"[singleplayer, multiplayer, coop, steamachieve...",19.99,[],550,[valve],Overwhelmingly Positive,89.0,35990
4,[re-logic],"[action, adventure, indie, rpg]",Terraria,2011-05-16,"[sandbox, adventure, survival, 2d, multiplayer...",,"[singleplayer, multiplayer, onlinemultiplayer,...",9.99,[],105600,[re-logic],Overwhelmingly Positive,83.0,28551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7345,[projectile entertainment],[indie],Momentum,2016-08-11,"[indie, physics]",,"[singleplayer, steamachievements, fullcontroll...",9.99,[],462130,[projectile entertainment],8 user reviews,,1
7346,[nuligine],"[action, indie, racing, simulation]",MadOut Ice Storm,2015-09-04,"[racing, indie, action, simulation]",,"[singleplayer, fullcontrollersupport]",6.99,[],400500,[nuligine],Mixed,,1
7347,[like a boss llc],"[action, adventure, indie, strategy]",OCCHIO,2016-08-11,"[action, adventure, indie, strategy, puzzlepla...",,"[singleplayer, fullcontrollersupport]",1.99,[],513140,[acerio games],7 user reviews,,1
7348,[dxf games],"[action, indie, rpg]",Just a Cleric,2016-05-19,"[rpg, indie, action, pixelgraphics, difficult]",,"[singleplayer, steamachievements, steamtrading...",4.99,[],467330,[dxf games],Positive,,1


In [12]:
# Load the pickled interactions table and display it for inspection.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
interactions = pd.read_pickle('./data/interactions.pkl')
interactions

# Alternative view, currently disabled: the user-id table stored as parquet.
# user_ids = pd.read_parquet('./data/user_ids.parquet')
# user_ids

Unnamed: 0,user_id,steam_id,interactions
0,76561197981203305,76561197981203305,"[[0, 0, 0], [1, 0, 0], [2, 867, 0], [4, 122, 0..."
1,bosslucek,76561198029968002,"[[4, 574, 0], [6, 147, 0], [7, 0, 0], [8, 242,..."
2,icantwait,76561197971666535,"[[1, 1, 0], [4, 107, 0], [5, 0, 0], [6, 0, 0],..."
3,76561198067911521,76561198067911521,"[[0, 0, 0], [1, 61, 0], [3, 448, 0], [4, 0, 0]..."
4,kushziller,76561198021307778,"[[0, 1549, 0], [1, 24, 0], [3, 875, 0], [4, 19..."
...,...,...,...
54185,76561198056783123,76561198056783123,"[[45, 0, 0], [122, 1953, 0], [1304, 77, 0]]"
54186,76561197972619838,76561197972619838,"[[1, 118, 0], [4, 16031, 0], [27, 26, 0]]"
54187,76561197965631636,76561197965631636,"[[1, 0, 0], [93, 409, 0], [497, 0, 0]]"
54188,SlimShady9,76561198067952943,"[[19, 4043, 0], [32, 0, 0], [47, 0, 0]]"
