### Ensemble hybrid content-based and collaborative filtering model
KNNWithZscore on playtimes and TF-IDF/Cosine Similarity on metadata

Calculate similarities separately and then combines

It works but two models take up a lot of space when pickled

In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Reader, Dataset, KNNWithZScore
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# import our processed datasets
users_df = pd.read_csv('../data/steam_playtime_clean.csv')
games_df = pd.read_csv('../data/steam_app_metadata_clean.csv')
# gamesinfo_df = pd.read_csv('../data/steam_gameinfo.csv')

#### 3 Scaling methods
#### Test around to see which method or combination of methods is best
1. RobustScaler and StandardScaler
2. Just normalize
3. RobustScaler and Normalize?

In [3]:
# check datasets appid length
len(users_df['appid'].unique()) == len(games_df['appid'].unique() == len(games_df))

True

In [None]:
# first do robustscaler to minimize outliers
scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].array.reshape(-1,1))

In [6]:
# use StandardScaler to scale user playtimes
scaler = StandardScaler()
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].values.reshape(-1, 1))

In [4]:
# Normalize the targets between 0 and 1. Makes it easy to train.
min_playtime = min(users_df['playtime_forever'])
max_playtime = max(users_df['playtime_forever'])
users_df['playtime_forever'] = users_df['playtime_forever'].apply(lambda x: (x - min_playtime) / (max_playtime - min_playtime)).values

In [34]:
users_df

Unnamed: 0,appid,steam_id,playtime_forever
0,300,76561198015529005,0.000608
1,340,76561198015529005,0.000262
2,10180,76561198015529005,0.010735
3,10180,76561198015529005,0.131131
4,550,76561198015529005,0.002601
...,...,...,...
2245364,674020,76561197991159105,0.000015
2245365,368260,76561197991159105,0.009427
2245366,990080,76561197991159105,0.018515
2245367,493540,76561197991159105,0.000046


In [5]:
# instantiate surprise.Reader()
reader = Reader()

# make surprise dataset
data = Dataset.load_from_df(users_df[['steam_id', 'appid', 'playtime_forever']], reader)

In [6]:
# make a training and test set
trainset, testset = train_test_split(data, test_size=0.25)

### Collaborative Filtering: KNNWithZScore

#### GridSearch for KNN

In [7]:
# set up gridsearch param_grid for KNNWithZScore
knn_param_grid = {'k': [1000, 2000, 3000]}

In [8]:
grid_search = GridSearchCV(KNNWithZScore, knn_param_grid, cv=3, n_jobs=3, verbose=10)

In [9]:
grid_search.fit(data)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  7.9min


Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.


[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 24.3min


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...


[Parallel(n_jobs=3)]: Done   9 out of  12 | elapsed: 25.8min remaining:  8.6min


Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.


[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed: 34.8min finished


In [10]:
# print best params for grid search
print('Best params: ', grid_search.best_params)

Best params:  {'rmse': {'k': 3000}, 'mae': {'k': 1000}}


In [11]:
# print best score for grid search
print('Score: ', grid_search.best_score)

Score:  {'rmse': 0.9886158218452586, 'mae': 0.9872905251091363}


### Train KNN

In [13]:
k_value = grid_search.best_params['rmse']['k']

In [14]:
k_value

3000

In [15]:
# make knn model with best params from gridsearch
knn = KNNWithZScore(k=k_value, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [16]:
# fit the training data and test with our test set
predictions = knn.fit(trainset).test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [17]:
# get accuracy
accuracy.rmse(predictions)

RMSE: 0.9887


0.9886510060558482

In [18]:
# build a full trainset now
trainset = data.build_full_trainset()

In [19]:
# save model in pickle
pickle.dump(trainset, open('../models/trainset.pkl', 'wb'))

In [20]:
# instantiate new knn
knn = KNNWithZScore(k=k_value, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [21]:
# fit the knn
knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x7ff6623f3cd0>

In [22]:
# save model in pickle
pickle.dump(knn, open('../models/knn_model.pkl', 'wb'))

### Recommend KNN

In [23]:
# function that takes game title and returns the knn model inner id
def get_innerid(title):
    # get appid from games_df
    appid = games_df[games_df['name'] == title]['appid'].values[0]
    inner_id = trainset.to_inner_iid(appid)
    return inner_id

In [24]:
# function that takes knn model innerid and returns the game title
def get_title(inner_id):
    steam_id = trainset.to_raw_iid(inner_id)
    title = games_df[games_df['appid'] == steam_id].iloc[0]['name']
    return title

In [25]:
def get_knn_appid(inner_id):
    return trainset.to_raw_iid(inner_id)

In [26]:
def recommend_knn(title):
    '''Get k nearest neighbours using the model'''
    # get inner id for game
    inner_id = get_innerid(title)
    # get nearest neighbours
    neighbors = knn.get_neighbors(inner_id, k=10)
    # get game titles for those neighbours
    titles = [get_title(i) for i in neighbors]
    return titles

In [27]:
# test run
recommend_knn('LIMBO')

['Mark of the Ninja',
 'VVVVVV',
 'World of Goo',
 "Mirror's Edge™",
 'Dead Space',
 'ORION: Prelude',
 'Magicka',
 'Trine 2: Complete Story',
 'INSIDE',
 'F.E.A.R. 3']

### KNN Similarity Matrix

In [28]:
knn_similarities = knn.compute_similarities()

Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [52]:
def get_knn_similar(title, knn_sim = knn_similarities):
    '''Get k nearest neighbors using the similarity matrix'''
    
    idx = get_innerid(title)
    
    # get pairwise similarity scores of all games w.r.t to our game
    sim_scores = list(enumerate(knn_sim[idx]))
    
    # sort scores based on similarity
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # make dictionary with title as key and similarity score as value
    similar_games = {get_title(i[0]): i[1] for i in sorted_scores[1:]}

    return similar_games

In [30]:
get_knn_similar('LIMBO')

{'Mark of the Ninja': 0.8270509213987579,
 'VVVVVV': 0.8235271539319691,
 'World of Goo': 0.8231959832567497,
 "Mirror's Edge™": 0.8184542146833247,
 'Dead Space': 0.7965762606112908,
 'ORION: Prelude': 0.7853443560497188,
 'Magicka': 0.7791670352261109,
 'Trine 2: Complete Story': 0.7686020973785362,
 'INSIDE': 0.7651592572208243,
 'F.E.A.R. 3': 0.7624763911310805,
 'Rock of Ages': 0.7596923464643296,
 'Little Nightmares': 0.7578189386583029,
 'Scribblenauts Unlimited': 0.7487106438148136,
 'McPixel': 0.7445713729424711,
 'To the Moon': 0.7405275249772677,
 'Gone Home': 0.7386569119513423,
 'Tomb Raider': 0.7362503212578412,
 'Dear Esther': 0.7170404371258016,
 'Indie Game: The Movie': 0.7156302502155343,
 'Deadlight': 0.704006593446912}

### CONTENT-BASED Cosine Similarity for Game Metadata

In [31]:
# instantiate tfidfvectorizer
tfidf = TfidfVectorizer(max_features=1500, lowercase=False, min_df=5, ngram_range=(1,3))

In [32]:
# make description column into type string, otherwise tfidf cries
games_df['description_clean'] = games_df['description_clean'].astype(str)

In [33]:
# fit tfidfvectorizer to description column
tfidf_matrix = tfidf.fit_transform(games_df['description_clean'])

In [34]:
# games_df columns 5 to second last are the tags
game_tags = games_df.iloc[:, 5:-1]

In [35]:
# make matrix into dataframe
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

In [36]:
# concatentate tfidf_df and game_tags
tfidf_df = pd.concat([tfidf_df, game_tags], axis=1)

In [37]:
# make matrix for cosine sim
matrix = tfidf_df.values

In [38]:
matrix.shape

(19818, 1574)

In [39]:
# make a cosine similarity matrix, maybe linear kernel instead?
cosine_sim = cosine_similarity(matrix, matrix)

In [40]:
cosine_sim.shape

(19818, 19818)

In [41]:
# store cosine similarity matrix in pickle
pickle.dump(cosine_sim, open('../models/cosine_sim.pkl', 'wb'))

In [42]:
# Construct a reverse map of indices and game titles
indices = pd.Series(games_df.index, index=games_df['name'])

In [55]:
# recommend function for content
def recommend_content(title, sim_matrix = cosine_sim):
    '''Get similar games based on content using the cosine similarity matrix'''
    # get index for our game
    idx = indices[title]
    
    # get pairwise similarity scores of all games w.r.t to our game
    sim_scores = list(enumerate(sim_matrix[idx]))
    
    # sort scores based on similarity
    sorted_sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[0]]: i[1] for i in sorted_sim_scores[1:]}

    return content_similar_scores

In [56]:
recommend_content('LIMBO')

{'Jump King': 0.9090909090909093,
 '3000th Duel': 0.9090909090909093,
 'Driven Out': 0.9090909090909093,
 'Blasphemous': 0.8800214111431761,
 'Intruders: Hide and Seek': 0.875921833811626,
 'Hollow Knight': 0.8703882797784892,
 'Moonlighter': 0.8703882797784892,
 'Mute Crimson+': 0.8703882797784892,
 'Road 96 🛣️': 0.8703882797784892,
 'Celeste': 0.8703882797784892,
 'Jenny LeClue - Detectivu': 0.8703882797784892,
 'Valfaris': 0.8609166731619822,
 'GRIS': 0.8581163303210333,
 'Over the Alps': 0.8581163303210333,
 'Lost Ruins': 0.8581163303210333,
 'Super Meat Boy Forever': 0.8451443077823318,
 'Slime Rancher': 0.836242010007091,
 'CARRION': 0.836242010007091,
 'ScourgeBringer': 0.836242010007091,
 'Shovel Knight: Specter of Torment': 0.836242010007091,
 'A Plague Tale: Innocence': 0.825185161822059,
 'Giana Sisters: Twisted Dreams - Rise of the Owlverlord': 0.8244723325455283,
 'Discouraged Workers TEEN': 0.8220209678949034,
 'Narita Boy': 0.8213030264123858,
 'Life is Strange - Episode

### COMBINED AND WEIGHTED CONTENT RECOMMENDATIONS

In [45]:
def weighted_recommend_content(title, cosine_sim = cosine_sim, knn_sim = knn_similarities):
    '''Get weighted recommendations based on both matrices'''
    # COSINE STUFF
    content_similar_scores = recommend_content(title, cosine_sim)

    # KNN STUFF
    knn_similar_scores = get_knn_similar(title, knn_sim)

    # COMBINE STUFF
    weighted_scores = {}
    for key, value in content_similar_scores.items():
        weighted_scores[key] = value * 0.5 + knn_similar_scores[key] * 0.5

    sorted_weighted_scores = sorted(weighted_scores.items(), key=lambda x: x[1], reverse=True)

    return sorted_weighted_scores[1:21]

In [58]:
weighted_recommend_content('LIMBO')

[('Tomb Raider', 0.7597998865292408),
 ('Dead Space', 0.7500513656463696),
 ('INSIDE', 0.7424195217732956),
 ('Firewatch', 0.739262645396334),
 ('Deadlight', 0.733388332421693),
 ('VVVVVV', 0.7315646514994001),
 ('GRIS', 0.72186977041578),
 ('Trine 2: Complete Story', 0.7132768961691526),
 ("Hellblade: Senua's Sacrifice", 0.7124180321413996),
 ('Bayonetta', 0.7015690387003344),
 ('Little Nightmares', 0.698710543862567),
 ("Mirror's Edge™", 0.6985408004638303),
 ('DuckTales: Remastered', 0.6954420912156656),
 ('Gone Home', 0.6875102741574894),
 ('Brutal Legend', 0.6805154005221291),
 ('I Am Bread', 0.6740889968234429),
 ('Among the Sleep - Enhanced Edition', 0.6718513350132815),
 ('To the Moon', 0.6717751070663975),
 ('The Vanishing of Ethan Carter', 0.6716325605615432),
 ('Octodad: Dadliest Catch', 0.6676594858455362)]

In [59]:
def combined_recom(title):
    '''Returns two lists of recommended games based on both user and game data'''
    
    knn_recom = recommend_knn(title)
    
    cont_recom = recommend_content(title)
    
    print('Similar recommendations to', title, 'based on user data:')
    print(*knn_recom, sep='\n')
    print('\nSimilar recommendations to', title, 'based on game data:')
    print(*cont_recom, sep='\n')



In [60]:
combined_recom('LIMBO')

Similar recommendations to LIMBO based on user data:
Mark of the Ninja
VVVVVV
World of Goo
Mirror's Edge™
Dead Space
ORION: Prelude
Magicka
Trine 2: Complete Story
INSIDE
F.E.A.R. 3

Similar recommendations to LIMBO based on game data:
Jump King
3000th Duel
Driven Out
Blasphemous
Intruders: Hide and Seek
Hollow Knight
Moonlighter
Mute Crimson+
Road 96 🛣️
Celeste
Jenny LeClue - Detectivu
Valfaris
GRIS
Over the Alps
Lost Ruins
Super Meat Boy Forever
Slime Rancher
CARRION
ScourgeBringer
Shovel Knight: Specter of Torment
A Plague Tale: Innocence
Giana Sisters: Twisted Dreams - Rise of the Owlverlord
Discouraged Workers TEEN
Narita Boy
Life is Strange - Episode 1
Mad Max
My Friend Pedro
Spyro™ Reignited Trilogy
Katana ZERO
ENDER LILIES: Quietus of the Knights
Shadow of the Tomb Raider: Definitive Edition
The Stanley Parable
Control Ultimate Edition
Outer Wilds
Bayonetta
2064: Read Only Memories
Felix The Reaper
Path of Giants
Mutazione
Dig Dog
Sea Salt
LABYRINTH OF TOUHOU - GENSOKYO AND THE