Wide and Deep Learning (https://arxiv.org/pdf/1606.07792.pdf)

- a "wide" component: a linear model over the sparse feature indices (one learned weight per user, item and item feature)
  - wide linear layer to learn interaction & co-occurrence between features
- a "deep" component that uses fully-connected layers to learn the interactions between users and items
  - embedding based MLP to generalize to unseen item feature pairs
  - deep learning to model complex feature interactions
- The final output is the sum of the outputs from the wide and deep components (plus a bias), passed through a sigmoid to give a probability.
- The idea of Wide and Deep Learning is to achieve both **memorization** and **generalization**
- Focus on ranking of items

<img src="model_illustration.png" style="width:700px;">

In [74]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, roc_auc_score, top_k_accuracy_score

In [63]:
class WideDeep(nn.Module):
    """Wide & Deep binary classifier over integer feature indices.

    The "wide" part is a linear model: every feature index owns one scalar
    weight (stored as a 1-dimensional embedding) and the weights of a
    sample's indices are summed together with a global bias. The "deep" part
    embeds every index, concatenates the per-field embeddings and feeds them
    through an MLP. Both logits are added and squashed with a sigmoid.

    Args:
        n_fields: number of feature indices per sample (columns of ``x``).
        n_features: size of the shared index vocabulary; every value in
            ``x`` must lie in ``[0, n_features)``.
        mlp_dims: hidden layer widths of the deep MLP, in order.
        embed_dim: embedding size used by the deep component.
        dropout: dropout probability applied after every hidden layer.
    """

    # mlp_dims defaults to a tuple so the default cannot be mutated and shared
    # between instances; any sequence of ints (e.g. a list) is still accepted.
    def __init__(self, n_fields, n_features, mlp_dims=(256, 128, 64), embed_dim=64, dropout=0.2):
        super().__init__()
        # wide linear component: one scalar weight per feature index + bias
        self.linear = nn.Embedding(n_features, 1)
        self.bias = nn.Parameter(torch.zeros((1,)))
        # deep mlp component: shared embedding table, fields concatenated
        self.embedding = nn.Embedding(n_features, embed_dim)
        self.embed_out_dim = n_fields * embed_dim
        mlp_layers = []
        input_dim = self.embed_out_dim
        for dim in mlp_dims:
            mlp_layers.extend([
                nn.Linear(input_dim, dim),
                nn.BatchNorm1d(dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            input_dim = dim
        mlp_layers.append(nn.Linear(input_dim, 1))  # final output layer
        self.mlp = nn.Sequential(*mlp_layers)

    def forward(self, x):
        """Return probabilities of shape ``(batch, 1)``.

        ``x`` is a ``(batch, n_fields)`` integer tensor of feature indices.
        """
        # Flatten per sample rather than view(-1, embed_out_dim): with the
        # wrong number of fields, view() could silently regroup rows into a
        # different batch size; this way the first Linear layer raises.
        x_embed = self.embedding(x).flatten(start_dim=1)
        wide_logit = self.linear(x).sum(dim=1)
        deep_logit = self.mlp(x_embed)
        return torch.sigmoid(self.bias + wide_logit + deep_logit)

    def predict(self, x):
        """Run ``forward`` in eval mode without tracking gradients.

        Note: the module is left in eval mode afterwards; call ``.train()``
        before resuming training.
        """
        self.eval()
        with torch.no_grad():
            return self.forward(x)

In [64]:
def train(model,  dataloader, epochs=20, lr=0.001):
    """Fit ``model`` with Adam and binary cross-entropy.

    Args:
        model: module whose forward pass returns probabilities in [0, 1]
            with the same shape as the targets (required by ``nn.BCELoss``).
        dataloader: iterable of ``(x, y)`` tensor batches that supports
            ``len()``.
        epochs: number of passes over ``dataloader``.
        lr: Adam learning rate.

    Returns:
        ``(model, training_history)``; ``training_history`` holds the mean
        batch loss of every epoch as a detached 0-dim CPU tensor.
    """
    device = (
        torch.device("cuda:0") if torch.cuda.is_available(
        ) else torch.device("cpu")
    )
    model.to(device)
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()
    training_history = []
    for epoch in range(epochs):
        epoch_loss = 0
        for x, y in dataloader:
            # the model was moved to `device`; the batches must follow it,
            # otherwise training fails with a device mismatch on GPU
            x, y = x.to(device), y.to(device)
            # call the module (not .forward) so registered hooks run
            y_pred = model(x)
            loss = criterion(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # detach: summing the raw loss would keep every batch's autograd
            # graph referenced for the whole epoch (and in the history)
            epoch_loss = epoch_loss + loss.detach()
        epoch_loss /= len(dataloader)
        training_history.append(epoch_loss.cpu())
        if epoch%10 == 0:
            print(f"Epoch {epoch}: {epoch_loss:.4f}")
    return model, training_history

# Data Preparation
- X is an array of feature indices of shape (n_samples, n_fields), where each feature index will be mapped to a latent factor
  - [user, item, item_features] (no user-side features are used in this notebook)
- y is a (n_samples, 1) array of binary ground-truth labels

In [26]:
import sys
sys.path.append('..')
import utils

In [27]:
# Load the two MovieLens tables through the project-local `utils` module.
# The cells below read movie['movieId'], movie['features'] and
# rating['userId'], rating['movieId'], rating['rating'].
movie, rating = utils.load_movielens()

Encode movie features

In [30]:
# construct an array of feature indices for each item
all_movie_features = {f for feat in movie['features'] for f in feat}
# Number the features in sorted order: plain set iteration order for strings
# changes between interpreter runs, which would make the ids (and therefore
# any saved model) non-reproducible.
# NOTE(review): assumes the feature values are mutually orderable (e.g. all strings).
feature_to_id = {f:ix for ix, f in enumerate(sorted(all_movie_features))}
movie_feat_id = movie['features'].apply(lambda x: [feature_to_id[f]+1 for f in x]).to_list() # +1 since 0 is the null feature

# since items have variable # of features, pad feature sequence with 0 
# dtype is forced to long so a movie with no features (empty list, which
# torch.tensor would otherwise make float) cannot change the padded dtype
features_enc = pad_sequence([torch.tensor(i, dtype=torch.long) for i in movie_feat_id], batch_first=True, padding_value=0)

Combine userId, movieId and features to get our `x`
- [userId, movieId, feature-1, ..., feature-n]

In [31]:
# Number of distinct users / movies. Both are used below as index offsets,
# which assumes the ids are contiguous and 0-based — TODO confirm against
# utils.load_movielens.
n_user = rating['userId'].nunique()
n_movies = movie['movieId'].nunique()

In [32]:
# join userid & movieid, and features
x = rating[['userId', 'movieId']].to_numpy()
features = features_enc[rating['movieId']]
x = np.hstack((x, features))
x

array([[   0,   30,   22, ...,    0,    0,    0],
       [   0,  833,   19, ...,    0,    0,    0],
       [   0,  859,   16, ...,    0,    0,    0],
       ...,
       [ 670, 4603,   13, ...,    0,    0,    0],
       [ 670, 4616,   22, ...,    0,    0,    0],
       [ 670, 4703,   22, ...,    0,    0,    0]])

In [33]:
# calculate offset, avoid duplicated feature indices
x[:, 1] += n_user
x[:, 2:] += n_user+n_movies
x

array([[   0,  701, 9818, ..., 9796, 9796, 9796],
       [   0, 1504, 9815, ..., 9796, 9796, 9796],
       [   0, 1530, 9812, ..., 9796, 9796, 9796],
       ...,
       [ 670, 5274, 9809, ..., 9796, 9796, 9796],
       [ 670, 5287, 9818, ..., 9796, 9796, 9796],
       [ 670, 5374, 9818, ..., 9796, 9796, 9796]])

In [35]:
threshold = 3
# binary target: 1 when the rating is strictly above the threshold, else 0
is_positive = rating['rating'].to_numpy() > threshold
y = np.where(is_positive, 1, 0).reshape(-1, 1)

Create train & test set
- train: first n-1 ratings per user (n is the number of ratings of the user)
- test: last/most recent rating per user

In [36]:
# a row is a user's last rating when the next row belongs to another user
# (the final row is always a last rating)
# NOTE(review): relies on rows being grouped by user and ordered oldest to
# newest — confirm in utils.load_movielens
test_ix = [i for i,v in enumerate(x[:-1, 0]) if v != x[i+1,0]] + [len(x)-1]
# membership is checked against a set: `i not in test_ix` on the list
# rescanned the whole list for every row
test_ix_lookup = set(test_ix)
train_ix = [i for i in range(len(x)) if i not in test_ix_lookup]

In [37]:
# materialise the split: test holds each user's last row, train everything else
x_train, x_test = x[train_ix], x[test_ix]
y_train, y_test = y[train_ix], y[test_ix]

In [38]:
# indices stay integer (they feed nn.Embedding); targets become float for BCELoss
x_train_tensor = torch.from_numpy(x_train)
y_train_tensor = torch.from_numpy(y_train).float()
dataset = data.TensorDataset(x_train_tensor, y_train_tensor)
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=128)

# Train Model

In [65]:
embed_dim = 20
# every sample carries one index per column of x
n_fields = x_train.shape[1]
# shared index space: users, then movies, then movie features,
# +1 for the null feature
n_features = n_user + n_movies + len(all_movie_features) + 1

model = WideDeep(
    n_fields=n_fields,
    n_features=n_features,
    mlp_dims=[128, 64],
    embed_dim=embed_dim,
    dropout=0.2,
)

In [66]:
# 200 epochs of Adam at lr=1e-3; train() prints the mean batch loss on every
# 10th epoch and returns the per-epoch losses as `history`
model, history = train(model, train_dataloader, epochs=200, lr=0.001)

Epoch 0: 0.7988
Epoch 10: 0.4995
Epoch 20: 0.4489
Epoch 30: 0.4072
Epoch 40: 0.3728
Epoch 50: 0.3430
Epoch 60: 0.3206
Epoch 70: 0.2984
Epoch 80: 0.2863
Epoch 90: 0.2683
Epoch 100: 0.2588
Epoch 110: 0.2529
Epoch 120: 0.2402
Epoch 130: 0.2306
Epoch 140: 0.2250
Epoch 150: 0.2203
Epoch 160: 0.2132
Epoch 170: 0.2101
Epoch 180: 0.2056
Epoch 190: 0.2015


In [67]:
# train() moves the model to the GPU when one is available, so feed the test
# inputs on the model's device and bring the predictions back to the CPU,
# where the numpy / sklearn calls below need them
model_device = next(model.parameters()).device
y_pred = model.predict(torch.from_numpy(x_test).to(model_device)).cpu()

In [76]:
# sklearn's signature is accuracy_score(y_true, y_pred); probabilities are
# turned into hard labels with a 0.5 cut-off
accuracy_score(y_test, np.where(y_pred.numpy()>0.5, 1,0))

0.6751117734724292

In [77]:
# ROC AUC is computed from the raw predicted probabilities (no thresholding)
roc_auc_score(y_test, y_pred)

0.6898539508671667