Wide and Deep Learning (https://arxiv.org/pdf/1606.07792.pdf)

- a "wide" component: a linear model over the sparse feature indices (implemented below as a lookup of one scalar weight per feature, summed over the fields)
  - wide linear layer to learn interaction & co-occurrence between features
- a "deep" component that uses fully-connected layers to learn the interactions between users and items
  - embedding based MLP to generalize to unseen item feature pairs
  - deep learning to model complex feature interactions
- The final output is the sigmoid of the sum of the wide output, the deep output, and a bias term, i.e. a probability in (0, 1).
- The idea of Wide and Deep Learning is to achieve both **memorization** and **generalization**
- Focus on ranking of items

<img src="model_illustration.png" style="width:700px;">

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, roc_auc_score, top_k_accuracy_score

In [2]:
class WideDeep(nn.Module):
    """Wide & Deep binary classifier over categorical feature indices.

    The wide part is a per-feature scalar weight (an ``nn.Embedding`` with
    output size 1) summed over the fields plus a global bias. The deep part
    embeds every field, concatenates the embeddings and feeds them through an
    MLP. Both logits are added and squashed with a sigmoid.

    Args:
        n_fields: number of index columns per sample.
        n_features: size of the shared index space (largest index + 1).
        mlp_dims: hidden layer widths of the MLP; only iterated, never mutated.
        embed_dim: embedding size per field for the deep part.
        dropout: dropout probability applied after every hidden layer.
    """

    def __init__(self, n_fields, n_features, mlp_dims=[256, 128, 64], embed_dim=64, dropout=0.2):
        super().__init__()
        # wide linear component: one learnable scalar per feature index + bias
        self.linear = nn.Embedding(n_features, 1)
        self.bias = nn.Parameter(torch.zeros((1,)))
        # deep mlp component: dense embedding per feature index
        self.embedding = nn.Embedding(n_features, embed_dim)
        # width of the concatenated field embeddings fed to the first Linear
        self.embed_out_dim = n_fields*embed_dim
        mlp_layers = []
        input_dim = self.embed_out_dim
        for dim in mlp_dims:
            mlp_layers.append(nn.Linear(input_dim, dim))
            mlp_layers.append(nn.BatchNorm1d(dim))
            mlp_layers.append(nn.ReLU())
            mlp_layers.append(nn.Dropout(dropout))
            input_dim = dim
        mlp_layers.append(nn.Linear(input_dim, 1)) # final output layer
        self.mlp = nn.Sequential(*mlp_layers)

    def forward(self, x):
        """Return probabilities of shape (batch, 1) for index tensor ``x`` of shape (batch, n_fields)."""
        # (batch, n_fields, embed_dim) -> (batch, n_fields * embed_dim)
        x_embed = self.embedding(x).view(-1, self.embed_out_dim)
        # wide logit: sum of per-field scalar weights -> (batch, 1)
        x = self.bias + self.linear(x).sum(dim=1) + self.mlp(x_embed)
        return torch.sigmoid(x)

    def predict(self, x):
        """Run inference without gradient tracking.

        Switches the module to eval mode (it stays in eval mode afterwards).
        The input is moved to the device the parameters live on, so a CPU
        tensor can be scored by a model that was trained on GPU; the result is
        returned on the device ``x`` came from.
        """
        self.eval()
        input_device = x.device
        model_device = self.bias.device
        with torch.no_grad():
            # call the module (not .forward) so registered hooks still run
            return self(x.to(model_device)).to(input_device)

In [3]:
def train(model,  dataloader, epochs=20, lr=0.001):
    """Train ``model`` with Adam and binary cross-entropy.

    Args:
        model: module whose output is a probability with the same shape as ``y``.
        dataloader: iterable yielding ``(x, y)`` tensor batches.
        epochs: number of passes over ``dataloader``.
        lr: Adam learning rate.

    Returns:
        ``(model, training_history)`` where ``training_history`` holds one
        Python float per epoch: the mean of the per-batch losses.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()
    training_history = []
    for epoch in range(epochs):
        running_loss = 0.0
        n_batches = 0
        for x, y in dataloader:
            # batches must live on the same device as the model
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            y_pred = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            # .item() detaches the value so the autograd graph is not kept alive
            running_loss += loss.item()
            n_batches += 1
        if n_batches == 0:
            raise ValueError("dataloader yielded no batches")
        epoch_loss = running_loss / n_batches
        training_history.append(epoch_loss)
        if epoch%10 == 0:
            print(f"Epoch {epoch}: {epoch_loss:.4f}")
    return model, training_history

# Data Preparation
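- X is an array of feature indices with shape (n_samples, n_attributes), where each feature index is mapped to a learned embedding (latent vector)
  - columns are the item-side label columns followed by the user-side label columns, in the order they are passed to `np.hstack` below
- y is an (n_samples, 1) array of binary ground-truth labels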

In [4]:
import sys
sys.path.append('..')
import utils

In [5]:
# Load the MovieLens tables through the project-local helper. `rating` is later
# read via its 'item_id', 'user_id' and 'rating' columns; `item` / `user` are
# presumably the item and user metadata tables — confirm against utils.
rating, item, user = utils.get_movielens()

In [6]:
# Label-encode the item and user tables. With return_df=False the helpers are
# assumed to return 2-D integer arrays (they are row-indexed with [rows, :]
# when X is built) — TODO confirm against utils.
item_label = utils.get_items_label_encoding(item, return_df=False)
user_label = utils.get_users_label_encoding(user, return_df=False)

In [7]:
# Build X by placing each rating's item-side and user-side label columns side by side.
# User indices are shifted past the largest item index so both sides share a
# single index space without collisions.
user_offset = 1 + item_label.max()
user_label_offset = user_offset + user_label
# item and user ids start at 1, hence the -1 when turning them into row positions
item_rows = rating['item_id'] - 1
user_rows = rating['user_id'] - 1
X = np.hstack((item_label[item_rows, :], user_label_offset[user_rows, :]))

In [13]:
# Binarise the ratings: a rating at or above the threshold becomes 1, anything below becomes 0.
threshold = 3
is_positive = rating['rating'].to_numpy() >= threshold
y = is_positive.astype(int).reshape(-1, 1)

Train test split

Here, for simplicity, we only use a random split, with 80% of the ratings as the train set and 20% as the test set. In practice, the split may be done per user, e.g. an 80/20 split of each user's rating/interaction history.

In [14]:
from sklearn.model_selection import train_test_split

# Random 80/20 split over individual ratings; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Wrap the training split as tensors: the index matrix keeps its dtype, the
# labels are cast to float32 as required by the BCE loss used in train().
features_train = torch.from_numpy(X_train)
labels_train = torch.from_numpy(y_train).float()
dataset = TensorDataset(features_train, labels_train)
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=128)

# Train Model

In [16]:
# Seed PyTorch's global RNG so that weight initialisation (and the batch
# shuffling / dropout that draw from it during training) is repeatable when
# the notebook is re-run from a fresh kernel on the same setup.
torch.manual_seed(42)

# The embedding tables need one row per distinct feature index: max index + 1.
# int(...) turns the numpy scalar into a plain Python int.
n_features = int(X.max()) + 1
n_fields = X.shape[1]
embed_dim = 30

model = WideDeep(n_fields=n_fields,
                 n_features=n_features,
                 mlp_dims=[128, 64],
                 embed_dim=embed_dim,
                 dropout=0.2)

In [17]:
# Fit for 100 epochs with Adam (lr=1e-3). train() prints the mean batch loss on
# every 10th epoch and returns the model together with the per-epoch loss history.
model, history = train(model, train_dataloader, epochs=100, lr=0.001)

Epoch 0: 0.9950
Epoch 10: 0.3451
Epoch 20: 0.3093
Epoch 30: 0.2741
Epoch 40: 0.2416
Epoch 50: 0.2189
Epoch 60: 0.1993
Epoch 70: 0.1838
Epoch 80: 0.1683
Epoch 90: 0.1572


In [18]:
# Score the held-out split. predict() switches the model to eval mode and runs
# the forward pass without gradient tracking, returning sigmoid probabilities.
y_pred = model.predict(torch.from_numpy(X_test))


In [23]:
# Threshold the predicted probabilities at 0.5 to get hard labels. sklearn's
# signature is (y_true, y_pred); .cpu() makes the conversion work even when the
# predictions live on a GPU.
accuracy_score(y_test, np.where(y_pred.cpu().numpy() > 0.5, 1, 0))


0.80695

In [24]:
# ROC AUC is computed on the raw probabilities (no thresholding). Convert the
# tensor to a NumPy array explicitly so this also works for GPU predictions.
roc_auc_score(y_test, y_pred.cpu().numpy())

0.7676053480984486