## Video Game Recommendation System based on Steam data 
### Uses a hybrid content-based and collaborative filtering approach to recommend games?

In [1]:
# import libraries
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Reader, Dataset, KNNWithZScore
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# import our processed datasets
users_df = pd.read_csv('../data/steam_playtime_clean.csv')
games_df = pd.read_csv('../data/steam_app_metadata_clean.csv')
# gamesinfo_df = pd.read_csv('../data/steam_gameinfo.csv')

#### COLLABORATIVE FILTERING FOR USER PLAYTIME DATA

In [3]:
# get users_df unique id's
len(users_df['appid'].unique())

19546

In [4]:
len(games_df['appid'].unique())

19546

In [173]:
len(games_df)

19546

In [None]:
# first do robustscaler to minimize outliers
# scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
# users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].array.reshape(-1,1))

In [5]:
# use StandardScaler to scale user playtimes
scaler = StandardScaler()
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].values.reshape(-1, 1))

In [6]:
# instantiate surprise.Reader()
reader = Reader()

# make surprise dataset
data = Dataset.load_from_df(users_df[['steam_id', 'appid', 'playtime_forever']], reader)

In [7]:
# make a training and test set
trainset, testset = train_test_split(data, test_size=0.25)

### Collaborative Filtering: KNNWithZScore

#### GridSearch for KNN

In [9]:
# set up gridsearch param_grid for KNNWithZScore
knn_param_grid = {'k': [600, 800]}

In [10]:
grid_search = GridSearchCV(KNNWithZScore, knn_param_grid, cv=5, n_jobs=3)

In [11]:
grid_search.fit(data)

Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [12]:
# print best params for grid search
print('Best params: ', grid_search.best_params)

Best params:  {'rmse': {'k': 800}, 'mae': {'k': 800}}


In [13]:
# print best score for grid search
print('Score: ', grid_search.best_score)

Score:  {'rmse': 1.412431892579643, 'mae': 1.229177014477301}


In [8]:
# make knn model with best params from gridsearch
knn = KNNWithZScore(k=800, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [9]:
# fit the training data and test with our test set
predictions = knn.fit(trainset).test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [10]:
# get accuracy
accuracy.rmse(predictions)

RMSE: 1.3847


1.384702785666047

In [11]:
# build a full trainset now
trainset = data.build_full_trainset()

In [12]:
# save model in pickle
pickle.dump(trainset, open('../models/trainset.pkl', 'wb'))

In [13]:
# instantiate new knn
knn = KNNWithZScore(k=800, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [14]:
# fit the knn
knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x7f438c817ac0>

In [15]:
# save model in pickle
pickle.dump(knn, open('../models/knn_model.pkl', 'wb'))

In [16]:
# function that takes game title and returns the knn model inner id
def get_innerid(title):
    # get appid from games_df
    appid = games_df[games_df['name'] == title]['appid'].values[0]
    inner_id = trainset.to_inner_iid(appid)
    return inner_id

In [17]:
# function that takes knn model innerid and returns the game title
def get_title(inner_id):
    steam_id = trainset.to_raw_iid(inner_id)
    title = games_df[games_df['appid'] == steam_id].iloc[0]['name']
    return title

In [18]:
def get_knn_appid(inner_id):
    return trainset.to_raw_iid(inner_id)

In [239]:
get_knn_appid(3)

10180

In [19]:
# function that takes a game title and returns 10 nearest neighbours
def recommend_knn(title):
    # get inner id for game
    inner_id = get_innerid(title)
    # get nearest neighbours
    neighbors = knn.get_neighbors(inner_id, k=10)
    # get game titles for those neighbours
    titles = [get_title(i) for i in neighbors]
    return titles

In [104]:
# test run
recommend_knn('LIMBO')

['VVVVVV',
 'Mark of the Ninja',
 'World of Goo',
 "Mirror's Edge™",
 'Dead Space',
 'Magicka',
 'ORION: Prelude',
 'Trine 2: Complete Story',
 'INSIDE',
 'F.E.A.R. 3']

In [20]:
knn_similarities = knn.compute_similarities()

Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [21]:
# recommend function for content
def get_knn_similar(title, knn_sim = knn_similarities):
    
    idx = get_innerid(title)
    
    # get pairwise similarity scores of all games w.r.t to our game
    sim_scores = list(enumerate(knn_sim[idx]))
    
    # sort scores based on similarity
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # make dictionary with title as key and similarity score as value
    similar_games = {get_title(i[0]): i[1] for i in sorted_scores[1:21]}

    return similar_games

In [24]:
get_knn_similar('LIMBO')

{'VVVVVV': 0.826537924260695,
 'Mark of the Ninja': 0.8207203828284743,
 'World of Goo': 0.8169189658823532,
 "Mirror's Edge™": 0.8149737240823128,
 'Dead Space': 0.7956553871764342,
 'Magicka': 0.7831007093975789,
 'ORION: Prelude': 0.7813297734863306,
 'Trine 2: Complete Story': 0.7678644167066896,
 'INSIDE': 0.7573305086670962,
 'F.E.A.R. 3': 0.7521111064512996,
 'Little Nightmares': 0.7517670108739803,
 'Tomb Raider': 0.7473036992192646,
 'To the Moon': 0.7443995128555665,
 'Scribblenauts Unlimited': 0.7421048240746818,
 'Rock of Ages': 0.7405880407380657,
 'McPixel': 0.7403627580561097,
 'METAL GEAR SOLID V: GROUND ZEROES': 0.7393797240145948,
 'Gone Home': 0.7229923009738277,
 'Dear Esther': 0.7099877041017841,
 'Indie Game: The Movie': 0.701873491438399}

### CONTENT-BASED FILTERING FOR GAME DATA

In [25]:
# instantiate tfidfvectorizer
tfidf = TfidfVectorizer(max_features=1500, lowercase=False, min_df=5, ngram_range=(1,3))

In [26]:
games_df.shape

(19546, 80)

In [27]:
# make description column into type string, otherwise tfidf cries
games_df['description_clean'] = games_df['description_clean'].astype(str)

In [28]:
# fit tfidfvectorizer to description column
tfidf_matrix = tfidf.fit_transform(games_df['description_clean'])

In [29]:
# games_df columns 5 to second last are the tags
game_tags = games_df.iloc[:, 5:-1]

In [30]:
# make matrix into dataframe
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

In [31]:
# concatentate tfidf_df and game_tags
tfidf_df = pd.concat([tfidf_df, game_tags], axis=1)

In [32]:
# make matrix for cosine sim
matrix = tfidf_df.values

In [33]:
matrix.shape

(19546, 1574)

In [34]:
# make a cosine similarity matrix, maybe linear kernel instead?
cosine_sim = cosine_similarity(matrix, matrix)

In [35]:
cosine_sim.shape

(19546, 19546)

In [36]:
# store cosine similarity matrix in pickle
pickle.dump(cosine_sim, open('../models/cosine_sim.pkl', 'wb'))

In [37]:
# Construct a reverse map of indices and game titles
indices = pd.Series(games_df.index, index=games_df['name'])

In [42]:
# recommend function for content
def recommend_content(title, cosine_sim = cosine_sim):
    # get index for our game
    idx = indices[title]
    
    # get pairwise similarity scores of all games w.r.t to our game
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # sort scores based on similarity
    sorted_sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[0]]: i[1] for i in sorted_sim_scores[1:21]}

    return content_similar_scores

In [43]:
recommend_content('LIMBO')

{'Jump King': 0.9090909090909093,
 '3000th Duel': 0.9090909090909093,
 'Driven Out': 0.9090909090909093,
 'Blasphemous': 0.8802091791155916,
 'Intruders: Hide and Seek': 0.8759451887022192,
 'Hollow Knight': 0.8703882797784892,
 'Moonlighter': 0.8703882797784892,
 'Mute Crimson+': 0.8703882797784892,
 'Road 96 🛣️': 0.8703882797784892,
 'Celeste': 0.8703882797784892,
 'Jenny LeClue - Detectivu': 0.8703882797784892,
 'Valfaris': 0.860903571509737,
 'GRIS': 0.8581163303210333,
 'Over the Alps': 0.8581163303210333,
 'Lost Ruins': 0.8581163303210333,
 'Super Meat Boy Forever': 0.8452039315501134,
 'Slime Rancher': 0.836242010007091,
 'CARRION': 0.836242010007091,
 'ScourgeBringer': 0.836242010007091,
 'Shovel Knight: Specter of Torment': 0.836242010007091}

In [44]:
# get weighted recommendations based on both matrices
def weighted_recommend_content(title, cosine_sim = cosine_sim, knn_sim = knn_similarities):
    # COSINE STUFF
    # get index for our game
    cos_idx = indices[title]
    
    # get pairwise similarity scores of all games w.r.t to our game
    cos_sim_scores = list(enumerate(cosine_sim[cos_idx]))
    
    # make a dictionary with title as key and score as value
    content_similar_scores = {indices.index[i[0]]: i[1] for i in cos_sim_scores[1:]}

    # KNN STUFF
    knn_idx = get_innerid(title)
    
    # get pairwise similarity scores of all games w.r.t to our game
    knn_sim_scores = list(enumerate(knn_sim[knn_idx]))

    # get titles for scores
    knn_similar_scores = {get_title(i[0]): i[1] for i in knn_sim_scores[1:]}

    # COMBINE STUFF
    weighted_scores = {}
    for key, value in content_similar_scores.items():
        weighted_scores[key] = value * 0.5 + knn_similar_scores[key] * 0.5

    sorted_weighted_scores = sorted(weighted_scores.items(), key=lambda x: x[1], reverse=True)

    return sorted_weighted_scores[1:21]

In [46]:
weighted_recommend_content('LIMBO')

[('Mark of the Ninja', 0.7739965550506008),
 ('Tomb Raider', 0.7653265755099525),
 ('Dead Space', 0.7495909289289413),
 ('INSIDE', 0.7385207836033271),
 ('VVVVVV', 0.733070036663763),
 ('Firewatch', 0.7328702359864503),
 ('Deadlight', 0.7281439820025508),
 ('GRIS', 0.7181162538540189),
 ("Hellblade: Senua's Sacrifice", 0.7139682370499829),
 ('Trine 2: Complete Story', 0.7129080558332292),
 ('Bayonetta', 0.6998847417371078),
 ("Mirror's Edge™", 0.6968064722146547),
 ('Little Nightmares', 0.6956845799704058),
 ('DuckTales: Remastered', 0.6910819887071625),
 ('METAL GEAR SOLID V: GROUND ZEROES', 0.689490936540713),
 ('Gone Home', 0.6796779686687321),
 ('Brutal Legend', 0.6782627772972023),
 ('To the Moon', 0.6737111010055469),
 ('Machinarium', 0.6728687815184013),
 ('I Am Bread', 0.6701042199699982)]

In [49]:
# function for mapping between similarity matrix indices
def knn_index_to_cos_index(knn_index):
    return indices[get_title(knn_index)]

In [59]:
# create a copy of knn_similarities with cosine indices
knn_sim_ordered = knn_similarities.copy()

In [61]:
for i in range(len(knn_similarities)):
    for j in range(len(knn_similarities[i])):
        knn_sim_ordered[knn_index_to_cos_index(i)][knn_index_to_cos_index(j)] = knn_similarities[i][j]

In [48]:
# make a weighted average matrix of the the ordered knn and cosine matrix
weighted_sim = (knn_sim_ordered + cosine_sim) / 2

KeyboardInterrupt: 

In [None]:
recommend_content('LIMBO', weighted_sim)

In [None]:
# store weighted similarity matrix in pickle
pickle.dump(weighted_sim, open('../models/weighted_sim.pkl', 'wb'))

In [74]:
def combined_recom(title):
    '''DEPRECATED Returns two lists of recommended games based on both user and game data'''
    # get recommended games from knn
    knn_recom = recommend_knn(title)
    # get recommended games from content based
    cont_recom = recommend_content(title)
    # print knn_recom
    print('Similar recommendations to', title, 'based on user data:')
    print(*knn_recom, sep='\n')
    print('\nSimilar recommendations to', title, 'based on game data:')
    print(*cont_recom, sep='\n')

In [31]:
# test run for combined

title = 'LIMBO'
combined_recom(title)

Similar recommendations to DOOM based on user data:
HITMAN™
GTFO
HITMAN™ 2
Tom Clancy's Rainbow Six® Vegas 2
The Escapists 2
Ori and the Blind Forest: Definitive Edition
POSTAL 2
A Hat in Time
Serious Sam 3: BFE
Batman: Arkham City

Similar recommendations to DOOM based on game data:
Saints Row: The Third
DOOM Eternal
Psychonauts
Toukiden: Kiwami
Danganronpa Another Episode: Ultra Despair Girls
Crash Bandicoot™ N. Sane Trilogy
Monster Hunter: World
UNLOVED
NieR:Automata™
Dead Rising® 2
