## Video Game Recommendation System based on Steam data 
### Uses a hybrid content-based and collaborative filtering approach to recommend games?

In [1]:
# import libraries
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from surprise import Reader, Dataset, KNNWithZScore
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# import our processed datasets
users_df = pd.read_csv('../data/steam_playtime_clean.csv')
games_df = pd.read_csv('../data/steam_app_metadata_clean.csv')
gamesinfo_df = pd.read_csv('../data/steam_gameinfo.csv')

#### COLLABORATIVE FILTERING FOR USER PLAYTIME DATA

In [3]:
# first do robustscaler to minimize outliers
scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].array.reshape(-1,1))

In [4]:
# use StandardScaler to scale user playtimes
scaler = StandardScaler()
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].values.reshape(-1, 1))

In [5]:
# instantiate surprise.Reader()
reader = Reader()

# make surprise dataset
data = Dataset.load_from_df(users_df[['steam_id', 'appid', 'playtime_forever']], reader)

In [6]:
# make a training and test set
trainset, testset = train_test_split(data, test_size=0.25)

### Collaborative Filtering: KNNWithZScore

In [None]:
# set up gridsearch param_grid for KNNWithZScore
knn_param_grid = {'k': [100, 200, 300, 400, 500]}

In [None]:
grid_search = GridSearchCV(KNNWithZScore, knn_param_grid, cv=5)

In [None]:
grid_search.fit(data)

In [None]:
# print best params for grid search
print('Best params: ', grid_search.best_params)

In [None]:
# print best score for grid search
print('Score: ', grid_search.best_score)

In [None]:
# make knn model with best params from gridsearch. i chose 300 and hardcoded it in
knn = KNNWithZScore(k=300, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [None]:
# fit the training data and test with our test set
predictions = knn.fit(trainset).test(testset)

In [None]:
# get accuracy
accuracy.rmse(predictions)

In [6]:
# build a full trainset now
trainset = data.build_full_trainset()

In [7]:
# save model in pickle
pickle.dump(trainset, open('../models/trainset.pkl', 'wb'))

In [7]:
# instantiate new knn
knn = KNNWithZScore(k=300, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [8]:
# fit the knn
knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x7fadf06a0790>

In [9]:
# save model in pickle
pickle.dump(knn, open('../models/knn_model.pkl', 'wb'))

In [9]:
# function that takes game title and returns the inner id
def get_appid(title):
    steam_id = gamesinfo_df[gamesinfo_df['name'] == title].iloc[0]['appid']
    inner_id = trainset.to_inner_iid(steam_id)
    return inner_id

In [10]:
# function that takes innerid and returns the game title
def get_title(appid):
    steam_id = trainset.to_raw_iid(appid)
    title = gamesinfo_df[gamesinfo_df['appid'] == steam_id].iloc[0]['name']
    return title

In [51]:
# function that takes a game title and returns 10 nearest neighbours
def recommend_knn(title):
    # get inner id for game
    inner_id = get_appid(title)
    # get nearest neighbours
    neighbors = knn.get_neighbors(inner_id, k=20)
    # get game titles for those neighbours
    titles = [get_title(i) for i in neighbors]
    return titles

In [33]:
# test run
recommend_knn('HITMAN™')

['HITMAN™ 2',
 'The Binding of Isaac',
 'Serious Sam 3: BFE',
 'Warhammer: Vermintide 2',
 'POSTAL 2',
 'Just Cause™ 3',
 'Half-Life 2: Episode One',
 'Half-Life 2: Episode Two',
 'Red Dead Redemption 2',
 'Watch_Dogs® 2']

### CONTENT-BASED FILTERING FOR GAME DATA

In [13]:
# make a matrix out of games_df without the appid column
matrix = games_df.drop(['appid'], axis=1).values

In [14]:
matrix.shape

(23045, 1572)

In [15]:
# make a cosine similarity matrix, maybe linear kernel instead?
cosine_sim = cosine_similarity(matrix, matrix)

In [13]:
# store cosine similarity matrix in pickle
pickle.dump(cosine_sim, open('../models/cosine_sim.pkl', 'wb'))

In [16]:
# Construct a reverse map of indices and game titles
indices = pd.Series(gamesinfo_df.index, index=gamesinfo_df['name'])

In [58]:
# recommend function for content
def recommend_content(title, cosine_sim = cosine_sim):
    # get index for our movie
    idx = indices[title]
    
    # get pairwise similarity scores of all movies w.r.t to our movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # sort scores based on similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # get scores for 10 similar movies, not including the game itself
    sim_scores = sim_scores[1:21]
    
    # get the game indices
    game_indices = [i[0] for i in sim_scores]
    
    # return the titles
    return gamesinfo_df['name'].iloc[game_indices].tolist()

In [52]:
idx = indices['HITMAN™']

In [53]:
sim_scores = list(enumerate(cosine_sim[idx]))

In [54]:
smtn = lambda sim_scores : sim_scores[1]

In [55]:
list(enumerate(cosine_sim[idx]))

[(0,
  array([0.38292411, 0.38136545, 0.1711322 , ..., 0.23000121, 0.43653039,
         0.60206619])),
 (1,
  array([0.38292411, 0.38136545, 0.1711322 , ..., 0.23000121, 0.43653039,
         0.60206619]))]

In [56]:
smtn(list(enumerate(cosine_sim[idx])))

(1,
 array([0.38292411, 0.38136545, 0.1711322 , ..., 0.23000121, 0.43653039,
        0.60206619]))

In [57]:
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [48]:
sim_scores

[(1,
  array([0.38292411, 0.38136545, 0.1711322 , ..., 0.23000121, 0.43653039,
         0.60206619])),
 (0,
  array([0.38292411, 0.38136545, 0.1711322 , ..., 0.23000121, 0.43653039,
         0.60206619]))]

In [29]:
'Hitman' in gamesinfo_df['name']

False

In [34]:
# test run for conten
recommend_content('HITMAN™')

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [59]:
# function that takes a game title and returns
def combined_recom(title):
    # get recommended games from knn
    knn_recom = recommend_knn(title)
    # get recommended games from content based
    cont_recom = recommend_content(title)
    # print knn_recom
    #print('Similar recommendations to', title, 'based on user data:')
    #print(*knn_recom, sep='\n')
    #print('\nSimilar recommendations to', title, 'based on game data:')
    #print(*cont_recom.tolist(), sep='\n')
    list(set(knn_recom) & set(cont_recom))

In [61]:
get_appid('DOOM')

563

In [62]:
# test run for combined
combined_recom('DOOM')