This project creates a neural matrix factorization model to recommend video games listed on the Steam store.

We train using user reviews on the Steam store obtained from [https://www.kaggle.com/datasets/antonkozyriev/game-recommendations-on-steam](https://www.kaggle.com/datasets/antonkozyriev/game-recommendations-on-steam). The data is partitioned into train, validation, and test sets, where the latter two consist only of reviews that are both a user's most recent review and positive.

In [None]:
import pandas as pd
import json
import random

def prepare_data(holdout_size=10000, seed=None):
    """Process the original data into training, validation, and testing CSVs.

    Reads ``./data/original/recommendations.csv`` and
    ``./data/original/games.csv``, replaces the provided user and game
    identifiers with zero-based enumerations, holds out a sample of users'
    most recent reviews (only those that are positive), and writes the results
    to ``./data/prepared/`` together with ``misc.json`` holding the user and
    game counts.

    Args:
        holdout_size: Maximum number of held-out reviews; half go to the
            validation set and the rest to the test set. Capped at the number
            of eligible reviews.
        seed: Optional ``random_state`` passed to every pandas ``sample`` call
            so that the split can be reproduced. ``None`` keeps the split
            random.
    """
    import os

    recommendations = pd.read_csv(
        "./data/original/recommendations.csv",
        usecols=['user_id', 'app_id', 'is_recommended', 'date'],
        # Parse dates so that "most recent" is a datetime comparison rather
        # than a comparison of raw strings.
        parse_dates=['date'],
    )
    # usecols does not control column order, so fix it explicitly; downstream
    # code reads the saved columns by position. The rows are also shuffled
    # (presumably so ties on the review date are broken at random).
    recommendations = recommendations[['user_id', 'app_id', 'is_recommended', 'date']].sample(
        frac=1, random_state=seed
    )
    games = pd.read_csv("./data/original/games.csv", usecols=['app_id', 'title'])

    # Pytorch embedding layers require inputs within a range, so we replace the
    # provided user and game identifiers with enumerations.
    users_in_rec = sorted(recommendations.user_id.unique())
    games_in_rec = sorted(recommendations.app_id.unique())
    user_map = {user_id: index for index, user_id in enumerate(users_in_rec)}
    game_map = {app_id: index for index, app_id in enumerate(games_in_rec)}
    recommendations['user_id'] = recommendations['user_id'].map(user_map).astype('int32')
    recommendations['app_id'] = recommendations['app_id'].map(game_map).astype('int32')

    # Keep the store's identifier, then order games so that reviewed games come
    # first in enumeration order; games without any review sort to the end.
    # NOTE(review): renumbering with range() only matches game_map if every
    # reviewed game is present in games.csv -- confirm for the dataset in use.
    games['external_app_id'] = games['app_id']
    games['app_id'] = games['app_id'].map(game_map).fillna(len(games)).astype('int32')
    games = games.sort_values('app_id', kind='stable')
    games['app_id'] = range(len(games))

    # Hold out some users' most recent review for validation and testing, if positive.
    latest = recommendations.loc[recommendations.groupby('user_id')['date'].idxmax()]
    eligible = latest.loc[latest.is_recommended]
    # Cap the sample size so small datasets do not make sample() raise.
    test_recs = eligible.sample(n=min(holdout_size, len(eligible)), random_state=seed)
    train_recs = recommendations.loc[recommendations.index.difference(test_recs.index)]
    train_recs = train_recs.drop(columns=['date'])
    test_recs = test_recs.drop(columns=['date'])
    train_recs['observed'] = True

    # Split the held-out reviews evenly between validation and test.
    valid_recs = test_recs.sample(frac=0.5, random_state=seed)
    test_recs = test_recs.loc[test_recs.index.difference(valid_recs.index)]

    os.makedirs("./data/prepared", exist_ok=True)
    valid_recs.to_csv("./data/prepared/valid_recs.csv", index=False)
    test_recs.to_csv("./data/prepared/test_recs.csv", index=False)
    train_recs.to_csv("./data/prepared/train_recs.csv", index=False)
    games.to_csv("./data/prepared/games.csv", index=False)
    with open("./data/prepared/misc.json", 'w') as file:
        json.dump({'num_users': len(users_in_rec), 'num_games': len(games_in_rec)}, file)

# Run the preprocessing step; it writes the prepared CSVs and misc.json under ./data/prepared/.
prepare_data()

Next, load the prepared data into PyTorch Dataset objects. The InteractionsDataset class is used for training. In addition to the observed data, it generates unobserved user-game interactions. A given game appears in an unobserved interaction with probability proportional to its number of occurrences in the explicit data. Since most user reviews are positive, this sampling scheme prevents introducing a large bias towards games with many reviews.

In [None]:
import torch
from torch.utils.data import DataLoader, RandomSampler
import random
import json
import pandas as pd

# Number of unobserved user-game pairs generated per observed review in training.
UNOBSERVED_RATIO = 4
# Number of user-game pairs ranked per validation/test example.
RANKING_LIST_SIZE = 100

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def reduce_memory(df):
    """Downcast every int64 column of ``df`` to int32 in place and return ``df``."""
    wide_columns = [name for name in df.columns if df[name].dtype == 'int64']
    for name in wide_columns:
        df[name] = df[name].astype('int32')
    return df

class InteractionsDataset(torch.utils.data.Dataset):
    """
    Dataset for training. Contains recommendations and user-game pairs for which
    there is no corresponding review.

    Keys below the number of observed rows return that row (after sorting by
    ``(user_id, app_id)``) as an int32 tensor. The remaining keys each return a
    freshly sampled unobserved pair ``[user, game, 0, 0]``, so the same key can
    yield a different pair on each access. The user is drawn uniformly; the
    game is drawn from the observed ``app_id`` column, i.e. proportionally to
    how often it was reviewed.

    Assumes ``interactions`` has the columns ``user_id, app_id, is_recommended,
    observed`` in that order, since rows are consumed by position downstream.
    """

    def __init__(self, interactions, num_users, unobserved_ratio=UNOBSERVED_RATIO):
        super().__init__()
        self.data = interactions.set_index(['user_id', 'app_id'], drop=False).sort_index()
        self.length = int(len(self.data) * (1+unobserved_ratio))
        self.num_users = num_users
        # Extract the app_id column once instead of on every negative sample.
        self._app_ids = self.data.app_id.to_numpy()
        # This line gets around memory problems seemingly related to https://github.com/pytorch/pytorch/issues/13246
        # (presumably by touching the index lookup before DataLoader workers start).
        if len(self._app_ids):
            (0, random.choice(self._app_ids)) in self.data.index

    def __len__(self):
        return self.length

    def __getitem__(self, key):
        # Raising IndexError past the end lets plain iteration terminate.
        if key >= self.length:
            raise IndexError(f"index {key} is out of range for dataset of length {self.length}")
        if key < len(self.data):
            return torch.tensor(self.data.iloc[key].to_numpy(dtype='int32'))
        # Rejection-sample until the pair is absent from the observed reviews.
        while True:
            rand_user = random.randrange(self.num_users)
            rand_game = random.choice(self._app_ids)
            if (rand_user, rand_game) not in self.data.index:
                return torch.tensor([rand_user, rand_game, 0, 0], dtype=torch.int32)

class RankingDataset(torch.utils.data.Dataset):
    """
    Dataset for validation or testing. Yields a recommendation along with a
    fixed number of unobserved user-game pairs.

    Each item is a tensor of ``size`` rows of ``[user_id, app_id]``. Row 0 is
    the held-out review; the remaining rows pair the same user with distinct
    games drawn uniformly from ``range(num_games)``, excluding the held-out
    game. The random games are redrawn on every access.

    Raises:
        ValueError: if ``size`` exceeds ``num_games``, because that many
            distinct games could never be collected.
    """

    def __init__(self, interactions, num_games, size=RANKING_LIST_SIZE):
        super().__init__()
        if size > num_games:
            raise ValueError(
                f"ranking list size ({size}) cannot exceed the number of games ({num_games})"
            )
        self.data = interactions
        self.num_games = num_games
        self.list_size = size

    def __len__(self):
        return len(self.data)

    def __getitem__(self, key):
        # Keys are positions in [0, len(self)), so index by position rather
        # than by label; this also works when the frame's index is not 0..n-1.
        target = self.data.iloc[key]
        candidates = [[target.user_id, target.app_id]]
        app_ids = {target.app_id}
        while len(candidates) < self.list_size:
            # Use the instance's game count, not a module-level global.
            game = random.randrange(self.num_games)
            if game not in app_ids:
                candidates.append([target.user_id, game])
                app_ids.add(game)
        return torch.tensor(candidates)

# Read the user and game counts recorded during data preparation; they size
# the embedding tables and the sampling ranges below.
num_users = num_games = 0
with open("./data/prepared/misc.json") as file:
    data = json.load(file)
num_users = data['num_users']
num_games = data['num_games']

In [None]:
from torch import nn

# Length of embedding vectors for the generalized matrix factorization and
# multilayer perceptron components of the model. A full user or game embedding
# is the concatenation of the two parts (GMF part first, MLP part second).
GMF_VECTOR_LENGTH = 16
MLP_VECTOR_LENGTH = 16

class NeuMF(nn.Module):
    """Computes a similarity score between user and game embedding vectors.

    The first ``gmf_vector_length`` entries of each embedding feed the
    generalized matrix factorization branch (element-wise product); the
    remaining entries are concatenated (game first, then user) and passed
    through a small MLP. Both branch outputs are concatenated, activated, and
    mapped to a single unnormalized score by a linear layer.
    """

    def __init__(self, gmf_vector_length=GMF_VECTOR_LENGTH, mlp_vector_length=MLP_VECTOR_LENGTH):
        super().__init__()
        self.gmf_vector_length = gmf_vector_length
        self.mlp_vector_length = mlp_vector_length
        mlp_out = self.mlp_vector_length // 2
        self.mlp = nn.Sequential(
            nn.Linear(2 * self.mlp_vector_length, self.mlp_vector_length),
            nn.SiLU(),
            nn.Linear(self.mlp_vector_length, mlp_out),
        )
        self.activation = nn.SiLU()
        self.predict = nn.Linear(self.gmf_vector_length + mlp_out, 1)

    def forward(self, user_embedding, game_embedding):
        """Score each (user, game) row pair.

        Args:
            user_embedding: Tensor of shape ``(batch, gmf + mlp)``.
            game_embedding: Tensor of shape ``(batch, gmf + mlp)``.

        Returns:
            Tensor of shape ``(batch,)`` holding one score per row.
        """
        split = self.gmf_vector_length
        mlp_in = torch.cat((game_embedding[:, split:], user_embedding[:, split:]), 1)
        gmf_out = game_embedding[:, :split] * user_embedding[:, :split]
        x = torch.cat((self.mlp(mlp_in), gmf_out), 1)
        x = self.activation(x)
        # Drop only the trailing feature axis: a bare squeeze() would also
        # remove the batch axis when the batch holds a single row.
        return self.predict(x).squeeze(-1)

The training loop uses binary cross entropy as the loss function. A positive recommendation is treated as the user liking the game with probability 1, and a negative recommendation as the user liking the game with probability 0. The probability score for an unobserved interaction is a tunable parameter, as is the weight these instances are given in the loss function.

For validation and testing, a positive recommendation is evaluated along with a fixed number of unobserved interactions for the same user. The resulting ranking of user-game pairs is scored using the [Normalized Discounted Cumulative Gain](https://en.wikipedia.org/wiki/Discounted_cumulative_gain).

In [None]:
# Number of interactions per training mini-batch.
BATCH_SIZE = 256
# Loss weight given to unobserved interactions (observed ones have weight 1).
UNOBSERVED_WEIGHT = 1
# Target probability assigned to an unobserved interaction.
UNOBSERVED_SCORE = 0.05
import torch.nn.functional as F
import math

def train_loop(dataloader, model, user_embedding, game_embedding, log_every=100000):
    """Train the scoring model and both embedding tables for one pass over ``dataloader``.

    Each batch row is ``[user_id, app_id, is_recommended, observed]``. The loss
    is binary cross entropy on the model's logits: observed rows use their
    recommendation flag as the target with weight 1, unobserved rows use
    ``UNOBSERVED_SCORE`` as the target with weight ``UNOBSERVED_WEIGHT``.

    Args:
        dataloader: Yields integer tensors of shape ``(batch, 4)``.
        model: Scoring module called as ``model(user_vectors, game_vectors)``.
        user_embedding: Sparse embedding table indexed by user id.
        game_embedding: Sparse embedding table indexed by game id.
        log_every: Print the loss every this many batches.

    NOTE(review): the optimizers are created on every call, so their moment
    estimates restart each time this function is called.
    """
    size = len(dataloader.dataset)
    model_opt = torch.optim.AdamW(model.parameters())
    user_opt = torch.optim.SparseAdam(user_embedding.parameters())
    game_opt = torch.optim.SparseAdam(game_embedding.parameters())
    optimizers = (model_opt, user_opt, game_opt)
    model.train()
    user_embedding.train()
    game_embedding.train()
    seen = 0
    for batch, X in enumerate(dataloader):
        X = X.to(device)
        observed = X[:, 3]
        # reshape(-1) keeps a one-row batch one-dimensional so that the
        # prediction and target shapes always match.
        pred = model(user_embedding(X[:, 0]), game_embedding(X[:, 1])).reshape(-1)
        target = (X[:, 2] + torch.logical_not(observed) * UNOBSERVED_SCORE).to(pred.dtype)
        # Cast explicitly: with an integer UNOBSERVED_WEIGHT this expression
        # would otherwise be an integer tensor.
        weight = (UNOBSERVED_WEIGHT + observed * (1 - UNOBSERVED_WEIGHT)).to(pred.dtype)
        loss = F.binary_cross_entropy_with_logits(pred, target, weight=weight)

        loss.backward()
        for optimizer in optimizers:
            optimizer.step()
        for optimizer in optimizers:
            optimizer.zero_grad()

        # Count the rows actually processed rather than assuming a batch size.
        seen += len(X)
        if batch % log_every == 0:
            print(f"Training Loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")

def test_loop(dataloader, model, user_embedding, game_embedding, k=None):
    """Calculate the average Normalized Discounted Cumulative Gain.

    Each item from ``dataloader`` is a list of ``[user_id, app_id]`` rows whose
    first row is the held-out positive review. The positive's rank is the
    number of rows scoring at least as high as it (ties count against it), and
    its gain is ``1 / log2(rank + 1)``.

    Args:
        dataloader: Yields one ranking list per iteration.
        model: Scoring module called as ``model(user_vectors, game_vectors)``.
        user_embedding: Embedding table indexed by user id.
        game_embedding: Embedding table indexed by game id.
        k: Optional cutoff; lists whose positive ranks below position ``k``
            contribute zero gain. ``None`` disables the cutoff.

    Returns:
        The mean gain over all lists, as a Python float.

    Raises:
        ValueError: if ``dataloader`` is empty.
    """
    num_lists = len(dataloader)
    if num_lists == 0:
        raise ValueError("test_loop requires a non-empty dataloader")
    model.eval()
    user_embedding.eval()
    game_embedding.eval()
    total_gain = 0
    with torch.no_grad():
        for X in dataloader:
            X = X.to(device)
            score = model(user_embedding(X[:, 0]), game_embedding(X[:, 1]))
            rank = (score >= score[0]).sum()
            if k is None or rank <= k:
                # log(2) / log(rank + 1) == 1 / log2(rank + 1)
                total_gain += math.log(2) / torch.log(rank + 1)
    # Convert so the result is a plain float whether or not any list scored.
    return float(total_gain) / num_lists
        

Below, we train the model until the validation score stops improving, which seems to be after four epochs.

In [None]:
# Fresh model and embedding tables; each embedding row holds the GMF part
# followed by the MLP part.
model = NeuMF().to(device)
user_embedding = nn.Embedding(num_users, GMF_VECTOR_LENGTH + MLP_VECTOR_LENGTH, sparse=True).to(device)
game_embedding = nn.Embedding(num_games, GMF_VECTOR_LENGTH + MLP_VECTOR_LENGTH, sparse=True).to(device)

valid_recs = reduce_memory(pd.read_csv("./data/prepared/valid_recs.csv"))
train_recs = reduce_memory(pd.read_csv("./data/prepared/train_recs.csv"))
train_dataset = InteractionsDataset(train_recs, num_users)
validation_dataset = RankingDataset(valid_recs, num_games)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=RandomSampler(train_dataset),
    num_workers=16,
    pin_memory=True,
)
# batch_size=None disables batching: each item is already a full ranking list.
validation_dataloader = DataLoader(validation_dataset, batch_size=None, pin_memory=True)

# Train one epoch at a time and stop at the first epoch whose validation score
# does not beat the best so far; weights are saved only after an improvement.
accuracy = 0
epoch = 0
while True:
    epoch += 1
    print(f"Epoch {epoch}")
    train_loop(train_dataloader, model, user_embedding, game_embedding)
    new_acc = test_loop(validation_dataloader, model, user_embedding, game_embedding)
    print(f"Average Validation Accuracy: {new_acc:>5f} \n")
    if not new_acc > accuracy:
        break
    accuracy = new_acc
    torch.save(model.state_dict(), "NeuMF_weights.pth")
    torch.save(user_embedding.state_dict(), "user_embedding_weights.pth")
    torch.save(game_embedding.state_dict(), "game_embedding_weights.pth")

We can call the function below to get recommendations for a given user.

In [None]:
# Rebuild the model and embedding tables, then restore the saved weights.
model = NeuMF().to(device)
user_embedding = nn.Embedding(num_users, GMF_VECTOR_LENGTH + MLP_VECTOR_LENGTH, sparse=True).to(device)
game_embedding = nn.Embedding(num_games, GMF_VECTOR_LENGTH + MLP_VECTOR_LENGTH, sparse=True).to(device)

# map_location lets weights saved on a GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load("NeuMF_weights.pth", map_location=device))
user_embedding.load_state_dict(torch.load("user_embedding_weights.pth", map_location=device))
game_embedding.load_state_dict(torch.load("game_embedding_weights.pth", map_location=device))

# Game table written by prepare_data: enumerated app_id, title, external_app_id.
games = reduce_memory(pd.read_csv("./data/prepared/games.csv"))

def recommend(user, model, user_embedding, game_embedding):
    """Return the titles of the top 20 recommended games for ``user``.

    Scores every game that has an embedding (the first ``num_games`` rows of
    the module-level ``games`` table) and that the user has not reviewed, then
    returns the titles of the 20 highest-scoring games, best first.

    NOTE(review): ``user`` is used both as the row of ``user_embedding`` and as
    the value matched against ``user_id`` in the original recommendations file.
    That is only consistent if the enumeration applied during data preparation
    left user ids unchanged -- confirm for the dataset in use.

    Args:
        user: User identifier (see the note above).
        model: Scoring module called as ``model(user_vectors, game_vectors)``.
        user_embedding: Embedding table indexed by user id.
        game_embedding: Embedding table indexed by game id.

    Returns:
        A numpy array of game titles ordered from highest to lowest score.
    """
    chunk_size = 256
    model.eval()
    user_embedding.eval()
    game_embedding.eval()
    with torch.no_grad():
        # Only evaluate games the user has not interacted with.
        played_games = reduce_memory(pd.read_csv("./data/original/recommendations.csv", usecols=['user_id', 'app_id']))
        played_games = set(played_games.loc[played_games.user_id == user].app_id.to_numpy())
        candidates = games.iloc[:num_games]
        candidates = candidates.loc[~candidates.external_app_id.isin(played_games)]
        candidate_ids = torch.tensor(candidates.app_id.to_numpy())
        if len(candidate_ids) == 0:
            return games.title.iloc[:0].to_numpy()

        u = user_embedding(torch.tensor([user], device=device))
        scores_list = []
        # Evaluate each unplayed game against the user, one chunk at a time.
        for start in range(0, len(candidate_ids), chunk_size):
            g = game_embedding(candidate_ids[start:start + chunk_size].to(device))
            # reshape(-1) keeps a one-game chunk one-dimensional for torch.cat.
            scores_list.append(model(u.expand(len(g), -1), g).reshape(-1).cpu())
        ranking = torch.argsort(torch.cat(scores_list, dim=0), descending=True)
        # argsort yields positions within candidate_ids, not app ids, so map
        # the positions back to app ids before looking up titles.
        top_ids = candidate_ids[ranking[:20]].tolist()
        # Index by app_id so the titles come back in ranking order.
        return games.set_index('app_id').loc[top_ids, 'title'].to_numpy()
        

Combine training and validation data for final model.

In [None]:
# Reload both splits; the validation reviews are marked as observed so they
# can be appended to the training interactions.
valid_recs = reduce_memory(pd.read_csv("./data/prepared/valid_recs.csv"))
train_recs = reduce_memory(pd.read_csv("./data/prepared/train_recs.csv"))

valid_recs['observed'] = True
train_dataset = InteractionsDataset(pd.concat([train_recs, valid_recs]), num_users)

# Start from freshly initialized weights for the final model.
model = NeuMF().to(device)
user_embedding = nn.Embedding(num_users, GMF_VECTOR_LENGTH + MLP_VECTOR_LENGTH, sparse=True).to(device)
game_embedding = nn.Embedding(num_games, GMF_VECTOR_LENGTH + MLP_VECTOR_LENGTH, sparse=True).to(device)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=RandomSampler(train_dataset),
    num_workers=16,
    pin_memory=True,
)

# Modules paired with the file each one is checkpointed to.
checkpoints = (
    (model, "NeuMF_weights.pth"),
    (user_embedding, "user_embedding_weights.pth"),
    (game_embedding, "game_embedding_weights.pth"),
)
# Train for four epochs, overwriting the saved weights after every epoch.
for epoch in range(1, 5):
    print(f"Epoch {epoch}")
    train_loop(train_dataloader, model, user_embedding, game_embedding)
    for module, path in checkpoints:
        torch.save(module.state_dict(), path)

Evaluating the model on the test set, we get a score of 0.4324, up from the score of approximately 0.2 that a newly initialized model yields.

In [None]:
# Rebuild the model and embedding tables, then restore the final weights.
model = NeuMF().to(device)
user_embedding = nn.Embedding(num_users, GMF_VECTOR_LENGTH + MLP_VECTOR_LENGTH, sparse=True).to(device)
game_embedding = nn.Embedding(num_games, GMF_VECTOR_LENGTH + MLP_VECTOR_LENGTH, sparse=True).to(device)

# map_location lets weights saved on a GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load("NeuMF_weights.pth", map_location=device))
user_embedding.load_state_dict(torch.load("user_embedding_weights.pth", map_location=device))
game_embedding.load_state_dict(torch.load("game_embedding_weights.pth", map_location=device))

# Evaluate on the held-out test split. The validation split must not be used
# here because the final model was trained on it.
test_recs = reduce_memory(pd.read_csv("./data/prepared/test_recs.csv"))
test_dataset = RankingDataset(test_recs, num_games)
test_dataloader = DataLoader(test_dataset, batch_size=None, pin_memory=True)
print(test_loop(test_dataloader, model, user_embedding, game_embedding))

## References
1. [Neural Collaborative Filtering Paper](https://arxiv.org/abs/1708.05031)
2. [Steam Recommendation Dataset](https://www.kaggle.com/datasets/antonkozyriev/game-recommendations-on-steam)