## Video Game Recommendation System based on Steam data 
### Uses a hybrid content-based and collaborative filtering approach to recommend games?

In [3]:
# import libraries
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from surprise import Reader, Dataset, KNNWithZScore
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# import our processed datasets
users_df = pd.read_csv('../data/steam_playtime_clean.csv')
games_df = pd.read_csv('../data/steam_app_metadata_clean.csv')
gamesinfo_df = pd.read_csv('../data/steam_gameinfo.csv')

#### COLLABORATIVE FILTERING FOR USER PLAYTIME DATA

In [5]:
# first do robustscaler to minimize outliers
scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].array.reshape(-1,1))

In [6]:
# use StandardScaler to scale user playtimes
scaler = StandardScaler()
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].values.reshape(-1, 1))

In [7]:
# instantiate surprise.Reader()
reader = Reader()

# make surprise dataset
data = Dataset.load_from_df(users_df[['steam_id', 'appid', 'playtime_forever']], reader)

In [8]:
# make a training and test set
trainset, testset = train_test_split(data, test_size=0.25)

### Collaborative Filtering: KNNWithZScore

In [36]:
# set up gridsearch param_grid for KNNWithZScore
knn_param_grid = {'k': [20, 50, 100, 200]}

In [37]:
grid_search = GridSearchCV(KNNWithZScore, knn_param_grid, cv=5)

In [38]:
grid_search.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [39]:
# print best params for grid search
print('Best params: ', grid_search.best_params)

Best params:  {'rmse': {'k': 200}, 'mae': {'k': 100}}


In [40]:
# print best score for grid search
print('Score: ', grid_search.best_score)

Score:  {'rmse': 1.4132492927756473, 'mae': 1.3111066234270337}


In [41]:
# make knn model with best params from gridsearch
knn = KNNWithZScore(k=200, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [42]:
# fit the training data and test with our test set
predictions = knn.fit(trainset).test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [43]:
# get accuracy
accuracy.rmse(predictions)

RMSE: 1.3098


1.3097653913199245

In [44]:
# build a full trainset now
trainset = data.build_full_trainset()

In [45]:
# save model in pickle
pickle.dump(trainset, open('../models/trainset.pkl', 'wb'))

In [46]:
# instantiate new knn
knn = KNNWithZScore(k=100, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [47]:
# fit the knn
knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithZScore at 0x7fb54030d790>

In [48]:
# save model in pickle
pickle.dump(knn, open('../models/knn_model.pkl', 'wb'))

In [49]:
# function that takes game title and returns the inner id
def get_appid(title):
    steam_id = gamesinfo_df[gamesinfo_df['name'] == title].iloc[0]['appid']
    inner_id = trainset.to_inner_iid(steam_id)
    return inner_id

In [50]:
# function that takes innerid and returns the game title
def get_title(appid):
    steam_id = trainset.to_raw_iid(appid)
    title = gamesinfo_df[gamesinfo_df['appid'] == steam_id].iloc[0]['name']
    return title

In [51]:
# function that takes a game title and returns 10 nearest neighbours
def recommend_knn(title):
    # get inner id for game
    inner_id = get_appid(title)
    # get nearest neighbours
    neighbors = knn.get_neighbors(inner_id, k=10)
    # get game titles for those neighbours
    titles = [get_title(i) for i in neighbors]
    return titles

In [53]:
# test run
recommend_knn('South Park™: The Fractured But Whole™')

['UNO',
 "Hellblade: Senua's Sacrifice",
 'South Park™: The Stick of Truth™',
 'Moonlighter',
 'HITMAN™',
 'Warhammer 40,000: Mechanicus',
 'Labyronia RPG',
 'Labyronia RPG 2',
 'Fallout: New Vegas',
 'Loop Hero']

### CONTENT-BASED FILTERING FOR GAME DATA

In [54]:
# make a matrix out of games_df without the appid column
matrix = games_df.drop(['appid'], axis=1).values

In [55]:
matrix.shape

(22838, 1573)

In [56]:
# make a cosine similarity matrix, maybe linear kernel instead?
cosine_sim = cosine_similarity(matrix, matrix)

ValueError: Input contains NaN.

In [None]:
# store cosine similarity matrix in pickle
pickle.dump(cosine_sim, open('../models/cosine_sim.pkl', 'wb'))

In [None]:
# Construct a reverse map of indices and game titles
indices = pd.Series(gamesinfo_df.index, index=gamesinfo_df['name'])

In [None]:
# recommend function for content
def recommend_content(title, cosine_sim = cosine_sim):
    # get index for our movie
    idx = indices[title]
    
    # get pairwise similarity scores of all movies w.r.t to our movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # sort scores based on similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # get scores for 10 similar movies, not including the game itself
    sim_scores = sim_scores[1:11]
    
    # get the game indices
    game_indices = [i[0] for i in sim_scores]
    
    # return the titles
    return gamesinfo_df['name'].iloc[game_indices].tolist()

In [None]:
# function that takes a game title and returns
def combined_recom(title):
    # get recommended games from knn
    knn_recom = recommend_knn(title)
    # get recommended games from content based
    cont_recom = recommend_content(title)
    # print knn_recom
    print('Similar recommendations to', title, 'based on user data:')
    print(*knn_recom, sep='\n')
    print('\nSimilar recommendations to', title, 'based on game data:')
    print(*cont_recom, sep='\n')
    #list(set(knn_recom) & set(cont_recom))

In [None]:
# test run for combined
while True:
    title = input('Enter a game title: ')
    combined_recom(title)