## Keras model

### Implement the recommender model from the Keras website
#### Uses only playtime data and seems to do better with the IQR set; otherwise it recommended lots of Football Manager :P

In [None]:
%pip install pandas

In [None]:
%pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Import our processed datasets.
users_df = pd.read_csv('../data/steam_playtime_clean.csv')
# Open the pickle through a context manager so the file handle is closed
# as soon as loading finishes.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle
# files that this project produced itself.
with open('../data/steam_games_clean.pkl', 'rb') as games_file:
    game_df = pickle.load(games_file)

### Set up user data

In [None]:
# Preview the first rows of the playtime data.
users_df.head()

In [None]:
# Number of distinct Steam users in the playtime data.
users_df["steam_id"].unique().size

In [None]:
# Distinct Steam IDs, in order of first appearance.
user_ids = users_df["steam_id"].drop_duplicates().tolist()

In [None]:
# Steam ID -> contiguous integer index (position in user_ids).
user2user_encoded = dict(zip(user_ids, range(len(user_ids))))

In [None]:
# Integer index -> Steam ID (inverse of the user encoding).
userencoded2user = dict(enumerate(user_ids))

In [None]:
# Distinct app IDs that appear in the playtime data, in order of first appearance.
game_ids = users_df["appid"].drop_duplicates().tolist()

In [None]:
# App ID -> contiguous integer index (position in game_ids).
game2game_encoded = dict(zip(game_ids, range(len(game_ids))))

In [None]:
# Integer index -> app ID (inverse of the game encoding).
game_encoded2game = dict(enumerate(game_ids))

In [None]:
# Add the encoded user index as a new column.
users_df['user'] = users_df['steam_id'].map(user2user_encoded)

In [None]:
# Add the encoded game index as a new column.
users_df['game'] = users_df['appid'].map(game2game_encoded)

In [None]:
# Vocabulary sizes for the two embedding tables.
num_users, num_games = len(user2user_encoded), len(game_encoded2game)

In [None]:
# Store playtime as 32-bit floats.
users_df["playtime_forever"] = (
    users_df["playtime_forever"].to_numpy().astype(np.float32)
)

In [None]:
# Playtime bounds used later for min-max normalisation.
# Series.min()/max() run vectorised and skip NaN, whereas the builtin
# min()/max() iterate element by element in Python and give order-dependent
# results when NaN is present. float() keeps the bounds as plain Python floats.
min_playtime = float(users_df["playtime_forever"].min())
max_playtime = float(users_df["playtime_forever"].max())

In [None]:
# Summarise the dataset. The target here is playtime, not a rating, so the
# labels say so.
print(
    "Number of users: {}, Number of Games: {}, Min playtime: {}, Max playtime: {}".format(
        num_users, num_games, min_playtime, max_playtime
    )
)

In [None]:
# Shuffle all rows with a fixed seed so the split below is reproducible.
users_df = users_df.sample(frac=1, random_state=42)
x = users_df[["user", "game"]].values

# Normalize the targets between 0 and 1. Makes it easy to train.
playtime_range = max_playtime - min_playtime
if playtime_range == 0:
    # Every row has the same playtime, so min-max scaling is undefined.
    raise ValueError(
        "Cannot normalise playtime_forever: min and max playtime are equal"
    )
# Vectorised arithmetic instead of a per-row Python lambda; the cast to
# float64 keeps the targets in double precision.
y = (
    (users_df["playtime_forever"].astype(np.float64) - min_playtime)
    / playtime_range
).values

# Train on the first 90% of the shuffled rows and validate on the last 10%.
train_indices = int(0.9 * users_df.shape[0])
x_train, x_val = x[:train_indices], x[train_indices:]
y_train, y_val = y[:train_indices], y[train_indices:]

### Set up recommender net

In [None]:
# Length of each user and game embedding vector.
EMBEDDING_SIZE = 50


class RecommenderNet(keras.Model):
    """Embedding-based model scoring (user, game) pairs.

    Each user and each game gets an embedding vector plus a scalar bias.
    The score for a pair is ``sigmoid(dot(user, game) + user_bias + game_bias)``,
    so every output lies between 0 and 1.

    Args:
        num_users: Number of distinct encoded users (rows in the user tables).
        num_games: Number of distinct encoded games (rows in the game tables).
        embedding_size: Length of each user/game embedding vector.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, num_users, num_games, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_games = num_games
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        # One scalar bias per user.
        self.user_bias = layers.Embedding(num_users, 1)
        self.game_embedding = layers.Embedding(
            num_games,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        # One scalar bias per game.
        self.game_bias = layers.Embedding(num_games, 1)

    def call(self, inputs):
        """Score a batch of (user, game) index pairs.

        Args:
            inputs: Integer tensor whose column 0 holds encoded user indices
                and column 1 holds encoded game indices.

        Returns:
            Tensor of shape ``(batch, 1)`` with one sigmoid score per pair.
        """
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        game_vector = self.game_embedding(inputs[:, 1])
        game_bias = self.game_bias(inputs[:, 1])
        # Per-pair dot product. tf.tensordot(user_vector, game_vector, 2)
        # would contract over the batch axis as well and return one scalar
        # shared by every pair in the batch.
        dot_user_game = tf.reduce_sum(
            user_vector * game_vector, axis=1, keepdims=True
        )
        # Add all the components (including bias)
        x = dot_user_game + user_bias + game_bias
        # The sigmoid activation forces the rating to between 0 and 1
        return tf.nn.sigmoid(x)


# Build the network and configure training: Adam optimiser with
# binary cross-entropy loss.
model = RecommenderNet(
    num_users=num_users,
    num_games=num_games,
    embedding_size=EMBEDDING_SIZE,
)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
)


In [None]:
# Train for five epochs, evaluating on the held-out validation rows after each one.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=5,
    batch_size=64,
    verbose=1,
)


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot training loss, then validation loss, per epoch.
for curve in ("loss", "val_loss"):
    plt.plot(history.history[curve])
plt.title("model loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend(["train", "test"], loc="upper left")
plt.show()

In [None]:
# Let us get a user and see the top recommendations.
def recommend_games(user_id=None, top_n=10):
    """Print a user's most-played games and the model's top recommendations.

    Candidates are games listed in ``game_df`` that the model has an encoding
    for and that the user has no playtime row for. Both lists are printed in
    ranking order (highest playtime / highest predicted score first).

    Args:
        user_id: Steam ID of a user present in ``users_df``. When ``None``
            (the default) a random user is sampled on every call.
        top_n: Number of recommendations to print.

    Raises:
        ValueError: If ``user_id`` has no entry in ``user2user_encoded``.
    """
    if user_id is None:
        # Sample here rather than in the signature: a default expression is
        # evaluated once at definition time, which would pin every call to
        # the same user.
        user_id = users_df.steam_id.sample(1).iloc[0]

    user_encoder = user2user_encoded.get(user_id)
    if user_encoder is None:
        raise ValueError("Unknown user id: {}".format(user_id))

    games_played_by_user = users_df[users_df.steam_id == user_id]

    # Catalogue games the user has not played and the model knows about.
    unplayed_appids = game_df.loc[
        ~game_df["appid"].isin(games_played_by_user.appid.values), "appid"
    ]
    candidate_appids = [
        appid
        for appid in unplayed_appids.unique().tolist()
        if appid in game2game_encoded
    ]

    recommended_game_ids = []
    if candidate_appids:
        # One (user index, game index) row per candidate game.
        user_game_array = np.array(
            [[user_encoder, game2game_encoded[appid]] for appid in candidate_appids]
        )
        playtimes = model.predict(user_game_array).flatten()
        # Highest predicted score first.
        top_playtimes_indices = playtimes.argsort()[::-1][: max(top_n, 0)]
        recommended_game_ids = [candidate_appids[i] for i in top_playtimes_indices]

    # appid -> name lookup so names can be printed in ranking order.
    name_by_appid = dict(zip(game_df["appid"], game_df["name"]))

    def print_names(appids):
        # Skip app IDs that have no entry in the games catalogue.
        for appid in appids:
            name = name_by_appid.get(appid)
            if name is not None:
                print(name)

    print("Showing recommendations for user: {}".format(user_id))
    print("====" * 9)
    print("games with high playtimes from user")
    print("----" * 8)
    top_games_user = (
        games_played_by_user.sort_values(by="playtime_forever", ascending=False)
        .head(5)
        .appid.tolist()
    )
    print_names(top_games_user)

    print("----" * 8)
    print("Top {} game recommendations".format(top_n))
    print("----" * 8)
    print_names(recommended_game_ids)


In [None]:
# Show recommendations for a randomly sampled user.
recommend_games()