### Singular Value Decomposition for user playtimes

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from surprise import SVDpp, BaselineOnly
from surprise import Reader, Dataset
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
import pickle

In [6]:
users_df = pd.read_csv('../data/steam_playtime_clean.csv')
games_df = pd.read_csv('../data/steam_app_metadata.csv')

In [7]:
# random subset of users 25% of the data in case you want to run gridsearch faster
users_df = users_df.sample(frac=0.10, random_state=42)

In [4]:
# use StandardScaler to scale user playtimes
scaler = StandardScaler()
users_df['playtime_forever'] = scaler.fit_transform(users_df['playtime_forever'].values.reshape(-1, 1))

In [4]:
# use normalize instead of standardscaler
min_playtime = users_df['playtime_forever'].min()
max_playtime = users_df['playtime_forever'].max()
users_df['playtime_forever'] = (users_df['playtime_forever'] - min_playtime) / (max_playtime - min_playtime)

In [6]:
# instantiate surprise.Reader()
reader = Reader()

# make surprise dataset
data = Dataset.load_from_df(users_df[['steam_id', 'appid', 'playtime_forever']], reader)

In [7]:
# make a training and test set
trainset, testset = train_test_split(data, test_size=0.25)

### Hyperparameter tuning with gridsearch

In [10]:
# do gridsearch on svdpp
param_grid = {'n_factors': [150, 200], 'n_epochs': [40], 'lr_all': [0.005], 'reg_all': [0.2]}
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-2, joblib_verbose=5)

In [11]:
gs.fit(data)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   3 out of   6 | elapsed:  9.6min remaining:  9.6min
[Parallel(n_jobs=-2)]: Done   6 out of   6 | elapsed: 12.7min finished


In [12]:
gs.best_params

{'rmse': {'n_factors': 150, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.2},
 'mae': {'n_factors': 200, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.2}}

prev gs run

'rmse': {'n_factors': 150, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.2},

'mae': {'n_factors': 150, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.2}}

In [27]:
gs.best_estimator

{'rmse': <surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f4e20b3dc10>,
 'mae': <surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f4dc5ea7280>}

### SVD

In [9]:
svdpp = SVDpp(n_factors=150, n_epochs=40, lr_all=0.005, reg_all=0.2)

In [19]:
predictions = svdpp.fit(trainset).test(testset)

In [20]:
# test accuracy
print(accuracy.rmse(predictions), accuracy.mae(predictions))

RMSE: 0.9886
MAE:  0.9873
0.9886230537760803 0.9872876897261794


In [8]:
fullset = data.build_full_trainset()

In [10]:
svdpp.fit(fullset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7f1f232d9b50>

In [14]:
svdpp.predict(76561197960265728, 570)

Prediction(uid=76561197960265728, iid=570, r_ui=None, est=1, details={'was_impossible': False})

In [11]:
# pickle the svdpp model because it takes a long time to train
with open('../models/svdpp_model.pkl', 'wb') as f:
    pickle.dump(svdpp, f)

### Non-user based SVD
using https://alyssaq.github.io/2015/20150426-simple-movie-recommender-using-svd/

In [8]:
user_data = users_df[['steam_id', 'appid', 'playtime_forever']]
game_data = games_df[['appid', 'name', 'genres']]

In [9]:
# Create playtime matrix
playtime_mat = np.ndarray(
    shape=(np.max(user_data.appid.values), np.max(user_data.steam_id.values)),
    dtype=np.uint8)
playtime_mat[user_data.appid.values-1, user_data.steam_id.values-1] = user_data.playtime_forever.values


ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size.

In [None]:
# normalize matrix
normalized_mat = playtime_mat - np.asarray([(np.mean(playtime_mat, 1))]).T

In [None]:
# Compute SVD
A = normalized_mat.T / np.sqrt(playtime_mat.shape[0] - 1)
U, S, V = np.linalg.svd(A)

In [None]:
# Calculate cosine similarity
def top_cosine_similarity(user_data, appid, top_n=10):
    index = appid - 1
    query = user_data[index, :]
    magnitude = np.sqrt(np.einsum('ij,ij->j', user_data, user_data))
    similarity = np.dot(query, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

def print_similar_games(game_data, appid, top_indexes):
    print('Recommendations for {0}: \n'.format(
        game_data[game_data.appid == appid].name.values[0]))
    for id in top_indexes + 1:
        print(game_data[game_data.appid == id].name.values[0])

In [None]:
# test and select k principal components
k = 50
game_id = 10
top_n = 10

sliced = V.T[:, :k]
indexes = top_cosine_similarity(sliced, game_id, top_n)
print_similar_games(game_data, game_id, indexes)