In [1]:
import os
import sys

sys.path.append("src")
os.chdir("..")

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from params import GENRES_SEQ_LEN, TAGS_SEQ_LEN

torch.manual_seed(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
dataset = torch.load("run_artifacts/preprocess/dataset.t")

In [4]:
import json

with open("run_artifacts/preprocess/mappings.json", "r") as f:
    MAPPINGS = json.load(f)

In [5]:
# Train test split
train_test_indices = torch.randperm(len(dataset))
train_size = 500
train_indices = train_test_indices[:train_size]
test_indices = train_test_indices[train_size:]

X_train, y_train = dataset[train_indices]
X_test, y_test = dataset[test_indices]

# The goal is predicting how much the items in test set are similar to train set
y_train = y_train[:, train_indices]
y_test = y_test[:, train_indices]

In [6]:
# Convert boolans to floats
edge_threshold = 0.01
y_train = y_train > edge_threshold
y_test = y_test > edge_threshold
y_train, y_test = y_train.float(), y_test.float()

In [7]:
X_train = X_train.to(device)
X_test = X_test.to(device)
y_train = y_train.to(device)
y_test = y_test.to(device)

In [8]:
# Note that the positive rate is very low
y_test.mean().item()

0.018260415643453598

In [9]:
# Helper for doing pack and unpacks in GRU
class GRU_subunit(nn.Module):
    def __init__(self, input_size, hidden_dim, num_layers=1):
        super().__init__()
        self.gru = nn.GRU(
            input_size,
            hidden_dim,
            num_layers,
            bidirectional=False,
            batch_first=True,
        )

    def forward(self, inputs, inputs_len):
        self.gru.flatten_parameters()
        x = torch.nn.utils.rnn.pack_padded_sequence(
            inputs,
            inputs_len.flatten().cpu().int(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, x = self.gru(x)  # returns output, final_hidden
        x = x[-1]
        return x

In [10]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.developer_emb_dim = 2  # dimension of the developer embedding
        self.genres_emb_dim = 10  # dimension of the genres embedding
        self.genres_out_dim = 10  # output dimension of the genres GRU layer
        self.tags_emb_dim = 24  # dimension of the tags embedding
        self.tags_out_dim = 24  # output dimension of the tags GRU layer

        # Developer layers
        self.developer_embedding = nn.Embedding(
            len(MAPPINGS["developer"]) + 1, self.developer_emb_dim
        )

        # Genres layers
        self.genres_embedding = nn.Embedding(
            len(MAPPINGS["Genres"]) + 1, self.genres_emb_dim
        )
        self.genres_gru = GRU_subunit(self.genres_emb_dim, self.genres_out_dim)

        # Tags layers
        self.tags_embedding = nn.Embedding(len(MAPPINGS["Tags"]) + 1, self.tags_emb_dim)
        self.tags_gru = GRU_subunit(self.tags_emb_dim, self.tags_out_dim)

        # Batch normalization layer and linear layers for concatenating the inputs and generating output
        # size(1) = 1(price) + developer + genres + tags + tfidf-fasttext embedding
        self.concat_batchnorm = nn.BatchNorm1d(
            1 + self.developer_emb_dim + self.genres_out_dim + self.tags_out_dim + 300
        )
        self.concat_linear_1 = nn.Linear(
            1 + self.developer_emb_dim + self.genres_out_dim + self.tags_out_dim + 300,
            32,
        )
        self.concat_relu = nn.ReLU()
        self.concat_linear_out = nn.Linear(32, train_size)

    def forward(self, x):
        # Splitting the input tensor into its components
        (
            price,
            developer,
            genres_seq,
            genres_len,
            tags_seq,
            tags_len,
            weighted_embeddings,
        ) = torch.split(x, [1, 1, GENRES_SEQ_LEN, 1, TAGS_SEQ_LEN, 1, 300], dim=1)

        # Processing the developer input
        developer = developer.long()
        developer = self.developer_embedding(developer)
        developer = developer[:, 0, :]

        # Processing the genres input
        genres = genres_seq.long()
        genres = self.genres_embedding(genres)
        genres = self.genres_gru(genres, genres_len)

        # Processing the tags input
        tags = tags_seq.long()
        tags = self.tags_embedding(tags)
        tags = self.tags_gru(tags, tags_len)

        # Concatenating the inputs and passing through the linear layers
        concat = torch.cat((price, developer, genres, tags, weighted_embeddings), 1)
        concat = self.concat_batchnorm(concat)
        concat = self.concat_linear_1(concat)
        concat = self.concat_relu(concat)
        concat = self.concat_linear_out(concat)

        # Returning the output tensor
        return concat

In [11]:
def multi_label_accuracy(predicted, target):
    intersection = (predicted * target).sum(dim=1)
    union = (predicted + target).clamp(0, 1).sum(dim=1)
    jaccard_similarity = intersection / union
    jaccard_similarity = torch.nan_to_num(jaccard_similarity, 0)
    return jaccard_similarity.mean(), intersection, union

In [12]:
def get_top_pct(output_tensor, top_pct):
    # Determine the number of probabilities to keep per row
    num_keep = int(output_tensor.shape[1] * top_pct)

    # Find the indices of the top probabilities per row
    top_indices = torch.topk(output_tensor, num_keep, dim=1)[1]

    # Create a mask to select the top probabilities per row
    mask = torch.zeros_like(output_tensor)
    mask.scatter_(1, top_indices, 1)

    # Apply the mask to select the top probabilities per row
    top_probs = output_tensor * mask

    # Set prediciton = 0 if true
    top_probs = (top_probs > 0).float()
    return top_probs

In [13]:
# Make the output positve rate be on average the train positive rate
top_pct = y_train.mean().item()

In [14]:
model = Model()
model = model.to(device)
criterion = nn.BCEWithLogitsLoss()
learning_rate = 1e-3
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
batch_size = 32
num_epochs = 50
# Training loop
for epoch in range(num_epochs):
    # Create a random permutation of indices for the batch sampling
    indices = torch.randperm(X_train.shape[0])

    for i in range(0, X_train.shape[0], batch_size):
        # Get the batch of data and labels
        batch_indices = indices[i : i + batch_size]
        batch_X, batch_y = X_train[batch_indices], y_train[batch_indices]

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print the progress
    if epoch % 5 == 0 or epoch == num_epochs - 1:
        with torch.no_grad():
            test_outputs = torch.sigmoid(model(X_test))
            predictions = get_top_pct(test_outputs, top_pct)
            accuracy, intersection, union = multi_label_accuracy(predictions, y_test)
            print(
                f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}",
                f"Test Jaccard Similarity: {accuracy * 100:.2f}%",
            )

Epoch [1/50], Loss: 0.6335 Test Jaccard Similarity: 0.73%
Epoch [6/50], Loss: 0.0994 Test Jaccard Similarity: 2.13%
Epoch [11/50], Loss: 0.0980 Test Jaccard Similarity: 4.91%
Epoch [16/50], Loss: 0.0758 Test Jaccard Similarity: 5.19%
Epoch [21/50], Loss: 0.0604 Test Jaccard Similarity: 5.55%
Epoch [26/50], Loss: 0.1102 Test Jaccard Similarity: 5.87%
Epoch [31/50], Loss: 0.0811 Test Jaccard Similarity: 6.24%
Epoch [36/50], Loss: 0.0629 Test Jaccard Similarity: 6.25%
Epoch [41/50], Loss: 0.0621 Test Jaccard Similarity: 6.30%
Epoch [46/50], Loss: 0.0410 Test Jaccard Similarity: 6.50%
Epoch [50/50], Loss: 0.0648 Test Jaccard Similarity: 6.75%


In [15]:
y_test.mean()

tensor(0.0183, device='cuda:0')

In [16]:
predictions.mean()

tensor(0.0220, device='cuda:0')

While an average similarity of 6.6% is very poor, this can be due to multiple factors:
- Our initial edge matrix has issues including its sparsity and the very low likelihood of the same user being in the recommendations dataset twice.
- The positive label rate is very sparse (1.8%) so finding positive labels is equally difficult
- Our training data is only 500 samples long but contains many features (dimension 300+)
- The  training methods and hyperparameters could be further tuned once data issues are solved

On the positive side, 6.6% is still about 3.5 times the average positive rate. A random guess (coin flip) would have a score of approximately 1.8%. And the loss is going down!