## Video Game Recommendation System based on Steam data 
### Uses a hybrid content-based and collaborative filtering approach to recommend games?

In [169]:
# import libraries
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Reader, Dataset, KNNWithZScore
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from sklearn.metrics.pairwise import cosine_similarity

In [170]:
# import our processed datasets
users_df = pd.read_csv('../data/steam_playtime_clean.csv')
games_df = pd.read_csv('../data/steam_app_metadata_clean.csv')
# gamesinfo_df = pd.read_csv('../data/steam_gameinfo.csv')

#### COLLABORATIVE FILTERING FOR USER PLAYTIME DATA

In [171]:
# get users_df unique id's
len(users_df['appid'].unique())

19546

In [172]:
len(games_df['appid'].unique())

19546

In [173]:
len(games_df)

19546

In [None]:
# first do robustscaler to minimize outliers
# scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
# users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].array.reshape(-1,1))

In [174]:
# use StandardScaler to scale user playtimes
scaler = StandardScaler()
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].values.reshape(-1, 1))

In [175]:
# instantiate surprise.Reader()
reader = Reader()

# make surprise dataset
data = Dataset.load_from_df(users_df[['steam_id', 'appid', 'playtime_forever']], reader)

In [176]:
# make a training and test set
trainset, testset = train_test_split(data, test_size=0.25)

### Collaborative Filtering: KNNWithZScore

#### GridSearch for KNN

In [9]:
# set up gridsearch param_grid for KNNWithZScore
knn_param_grid = {'k': [600, 800]}

In [10]:
grid_search = GridSearchCV(KNNWithZScore, knn_param_grid, cv=5, n_jobs=3)

In [11]:
grid_search.fit(data)

Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [12]:
# print best params for grid search
print('Best params: ', grid_search.best_params)

Best params:  {'rmse': {'k': 800}, 'mae': {'k': 800}}


In [13]:
# print best score for grid search
print('Score: ', grid_search.best_score)

Score:  {'rmse': 1.412431892579643, 'mae': 1.229177014477301}


In [177]:
# make knn model with best params from gridsearch
knn = KNNWithZScore(k=800, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [178]:
# fit the training data and test with our test set
predictions = knn.fit(trainset).test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [179]:
# get accuracy
accuracy.rmse(predictions)

RMSE: 1.3885


1.3885300600603294

In [180]:
# build a full trainset now
trainset = data.build_full_trainset()

In [181]:
# save model in pickle
pickle.dump(trainset, open('../models/trainset.pkl', 'wb'))

In [182]:
# instantiate new knn
knn = KNNWithZScore(k=800, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [183]:
# fit the knn
knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x7fceb1e42d00>

In [184]:
# save model in pickle
pickle.dump(knn, open('../models/knn_model.pkl', 'wb'))

In [185]:
# function that takes game title and returns the inner id
def get_innerid(title):
    # get appid from games_df
    appid = games_df[games_df['name'] == title]['appid'].values[0]
    inner_id = trainset.to_inner_iid(appid)
    return inner_id

In [186]:
# function that takes innerid and returns the game title
def get_title(inner_id):
    steam_id = trainset.to_raw_iid(inner_id)
    title = games_df[games_df['appid'] == steam_id].iloc[0]['name']
    return title

In [187]:
# function that takes a game title and returns 10 nearest neighbours
def recommend_knn(title):
    # get inner id for game
    inner_id = get_innerid(title)
    # get nearest neighbours
    neighbors = knn.get_neighbors(inner_id, k=10)
    # get game titles for those neighbours
    titles = [get_title(i) for i in neighbors]
    return titles

In [188]:
knn_similarities = knn.compute_similarities()

Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [189]:
# recommend function for content
def get_knn_similar(title, knn_sim = knn_similarities):
    
    idx = get_innerid(title)
    
    # get pairwise similarity scores of all games w.r.t to our game
    sim_scores = list(enumerate(knn_sim[idx]))
    
    # sort scores based on similarity
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # get scores for 10 similar games, not including the game itself
    sorted_scores = sorted_scores[1:11]

    # get titles for sorted_scores
    game_titles = [get_title(i[0]) for i in sorted_scores]

    return game_titles

In [190]:
get_knn_similar('System Shock 2')

['Zeno Clash',
 'Gone Home',
 'LIMBO',
 'Brothers - A Tale of Two Sons',
 'Enclave',
 'Betrayer',
 'Deadlight',
 'Stealth Inc 2: A Game of Clones',
 'A Story About My Uncle',
 'The Ship: Murder Party']

In [191]:
# test run
recommend_knn('System Shock 2')

['Zeno Clash',
 'Gone Home',
 'LIMBO',
 'Brothers - A Tale of Two Sons',
 'Enclave',
 'Betrayer',
 'Deadlight',
 'Stealth Inc 2: A Game of Clones',
 'A Story About My Uncle',
 'The Ship: Murder Party']

### CONTENT-BASED FILTERING FOR GAME DATA

In [192]:
# instantiate tfidfvectorizer
tfidf = TfidfVectorizer(max_features=1500, lowercase=False, min_df=5, ngram_range=(1,3))

In [193]:
games_df.shape

(19546, 80)

In [194]:
# make description column into type string, otherwise tfidf cries
games_df['description_clean'] = games_df['description_clean'].astype(str)

In [195]:
# fit tfidfvectorizer to description column
tfidf_matrix = tfidf.fit_transform(games_df['description_clean'])

In [198]:
# games_df columns 5 to second last are the tags
game_tags = games_df.iloc[:, 5:-1]

In [66]:
# make matrix into dataframe
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

In [67]:
# concatentate tfidf_df and game_tags
tfidf_df = pd.concat([tfidf_df, game_tags], axis=1)

In [68]:
# make matrix for cosine sim
matrix = tfidf_df.values

In [69]:
matrix.shape

(19546, 1574)

In [70]:
# make a cosine similarity matrix, maybe linear kernel instead?
cosine_sim = cosine_similarity(matrix, matrix)

In [71]:
# store cosine similarity matrix in pickle
pickle.dump(cosine_sim, open('../models/cosine_sim.pkl', 'wb'))

In [72]:
# Construct a reverse map of indices and game titles
indices = pd.Series(games_df.index, index=games_df['name'])

In [73]:
# recommend function for content
def recommend_content(title, cosine_sim = cosine_sim):
    # get index for our game
    idx = indices[title]
    
    # get pairwise similarity scores of all games w.r.t to our game
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # sort scores based on similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # get scores for 10 similar games, not including the game itself
    sim_scores = sim_scores[1:11]
    
    # get the game indices
    game_indices = [i[0] for i in sim_scores]
    
    # return the titles
    return games_df['name'].iloc[game_indices].tolist()

In [75]:
# calculate weighted average for knn_similarities and cosine_sim
weighted_sim = (knn_similarities + cosine_sim) / 2


In [76]:
weighted_sim.shape

(19546, 19546)

In [74]:
# function that takes a game title and returns
def combined_recom(title):
    # get recommended games from knn
    knn_recom = recommend_knn(title)
    # get recommended games from content based
    cont_recom = recommend_content(title)
    # print knn_recom
    print('Similar recommendations to', title, 'based on user data:')
    print(*knn_recom, sep='\n')
    print('\nSimilar recommendations to', title, 'based on game data:')
    print(*cont_recom, sep='\n')
    #list(set(knn_recom) & set(cont_recom))

In [31]:
# test run for combined

title = input('Enter a game title: ')
combined_recom(title)

Similar recommendations to DOOM based on user data:
HITMAN™
GTFO
HITMAN™ 2
Tom Clancy's Rainbow Six® Vegas 2
The Escapists 2
Ori and the Blind Forest: Definitive Edition
POSTAL 2
A Hat in Time
Serious Sam 3: BFE
Batman: Arkham City

Similar recommendations to DOOM based on game data:
Saints Row: The Third
DOOM Eternal
Psychonauts
Toukiden: Kiwami
Danganronpa Another Episode: Ultra Despair Girls
Crash Bandicoot™ N. Sane Trilogy
Monster Hunter: World
UNLOVED
NieR:Automata™
Dead Rising® 2
