# 1. Setup

## 1.1 Import libraries

In [1]:
from IPython.display import clear_output

In [2]:
!pip install fbgemm-gpu
#!pip install fbgemm-gpu-cpu

!pip install torch
!pip install torchrec
#!pip install torchrec-cpu
!pip install pyre_extensions

#clear_output()



In [3]:
import pandas as pd
import numpy as np
import random

import torch
from torch.utils.data.dataset import IterableDataset
from torchrec.datasets.utils import Batch
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")

Device: 'cuda'


In [5]:
!git clone https://github.com/linafaik08/recommender_systems_dlrm.git

fatal: destination path 'recommender_systems_dlrm' already exists and is not an empty directory.


In [6]:
cd recommender_systems_dlrm/src

/content/recommender_systems_dlrm/src


In [7]:
from batch import *
from model import *

## 1.2. Generate data

In [8]:
n_rows = 10000

train_df = pd.DataFrame({
    "feat1": np.random.rand(n_rows),
    "feat2": np.random.rand(n_rows),
    "cat1": np.random.choice(["A","B","C"], n_rows),
    "cat2": [random.choice([["A","B","C"], ["A","B"],  ["A","C"], ["B","C"], ["B"], []]) for i in range(n_rows)],
    "label": np.random.choice([0,1], n_rows)
})

n_rows = int(n_rows*0.2)

test_df = pd.DataFrame({
    "feat1": np.random.rand(n_rows),
    "feat2": np.random.rand(n_rows),
    "cat1": np.random.choice(["A","B","C"], n_rows),
    "cat2": [random.choice([["A","B","C"], ["A","B"],  ["A","C"], ["B","C"], ["B"], []]) for i in range(n_rows)],
    "label": np.random.choice([0,1], n_rows)
})

n_rows = int(n_rows*0.2)

val_df = pd.DataFrame({
    "feat1": np.random.rand(n_rows),
    "feat2": np.random.rand(n_rows),
    "cat1": np.random.choice(["A","B","C"], n_rows),
    "cat2": [random.choice([["A","B","C"], ["A","B"],  ["A","C"], ["B","C"], ["B"], []]) for i in range(n_rows)],
    "label": np.random.choice([0,1], n_rows)
})

In [9]:
train_df.head()

Unnamed: 0,feat1,feat2,cat1,cat2,label
0,0.336064,0.451204,C,"[A, B, C]",0
1,0.096554,0.802365,A,[B],1
2,0.877572,0.822153,A,"[A, C]",1
3,0.072148,0.428623,B,"[A, C]",0
4,0.96617,0.258741,B,[B],1


In [10]:
cols_dense = ["feat1", "feat2"]
cols_sparse = ['cat1', 'cat2']

# 2. Features

## 2.1. Encoding sparse features

In [11]:
def encode(col, map_rev):
    return [map_rev.get(item,item) for item in col]

In [12]:
map_sparse = {}
map_sparse_rev = {}

In [13]:
for feat in cols_sparse:
    map_sparse[feat] = {i: c for i,c in enumerate(train_df[feat].explode().value_counts().index)}
    map_sparse_rev[feat] = {v:k for k, v in map_sparse[feat].items()}

    train_df[feat+'_enc'] = train_df[feat].apply(lambda x: encode(x, map_sparse_rev[feat]))
    test_df[feat+'_enc'] = test_df[feat].apply(lambda x: encode(x, map_sparse_rev[feat]))
    val_df[feat+'_enc'] = test_df[feat].apply(lambda x: encode(x, map_sparse_rev[feat]))

In [14]:
map_sparse

{'cat1': {0: 'C', 1: 'A', 2: 'B'}, 'cat2': {0: 'B', 1: 'C', 2: 'A'}}

## 2.2. Build data batches

In [15]:
train_data = RecBatch(
    data = train_df,
    cols_sparse = [c+'_enc' for c in cols_sparse],
    cols_dense = cols_dense,
    col_label = "label",
    batch_size = 100,
    num_generated_batches = 20,
    seed = 123,
    device = device
)

replace False


In [16]:
test_data = RecBatch(
    data = test_df,
    cols_sparse = [c+'_enc' for c in cols_sparse],
    cols_dense = cols_dense,
    col_label = "label",
    batch_size = 100,
    num_generated_batches = 3,
    seed = 123,
    device = device
)

replace False


In [17]:
val_data = RecBatch(
    data = val_df,
    cols_sparse = [c+'_enc' for c in cols_sparse],
    cols_dense = cols_dense,
    col_label = "label",
    batch_size = 100,
    num_generated_batches = None,
    seed = 123,
    device = device
)

### Check

In [18]:
i=0

batch = build_batch(val_data.batches[i], cols_sparse)
sparse_features = batch.sparse_features

In [19]:
sparse_features[val_data.cols_sparse[0]].to_dense()[:5]

[tensor([2], device='cuda:0'),
 tensor([0], device='cuda:0'),
 tensor([1], device='cuda:0'),
 tensor([1], device='cuda:0'),
 tensor([2], device='cuda:0')]

In [20]:
val_df.iloc[:5]

Unnamed: 0,feat1,feat2,cat1,cat2,label,cat1_enc,cat2_enc
0,0.445078,0.124188,B,"[A, B, C]",1,[2],[0]
1,0.672566,0.161864,B,"[A, B, C]",1,[0],[0]
2,0.461935,0.552659,B,"[A, C]",0,[1],"[2, 0, 1]"
3,0.849517,0.144644,A,"[A, B]",0,[1],"[2, 0]"
4,0.835506,0.339795,C,"[A, C]",0,[2],"[2, 1]"


In [21]:
sparse_features[val_data.cols_sparse[1]].to_dense()[:5]

[tensor([0], device='cuda:0'),
 tensor([0], device='cuda:0'),
 tensor([2, 0, 1], device='cuda:0'),
 tensor([2, 0], device='cuda:0'),
 tensor([2, 1], device='cuda:0')]

# 3. Training / Test

In [22]:
embedding_dim = 10
num_embeddings_per_feature = {c+'_enc': len(v) for c, v in map_sparse.items()}

dense_arch_layer_sizes = [512, 256, embedding_dim]
over_arch_layer_sizes = [512, 512, 256, 1]

adagrad = False
eps = 1e-8 # Epsilon for Adagrad optimizer

learning_rate = 0.01# 0.01 #15.0

In [23]:
model_dlrm = DLRMCustom(
        cols_dense, cols_sparse,
        embedding_dim, num_embeddings_per_feature,
        dense_arch_layer_sizes, over_arch_layer_sizes,
        adagrad, learning_rate, eps,
        device)

In [24]:
n_epochs = 100
e_patience = 10

In [25]:
scores = model_dlrm.train_test(train_data, test_data, n_epochs, e_patience, nb_batches = None)

 22%|██▏       | 22/100 [00:06<00:23,  3.29it/s, epoch=23, loss_train=0.693, losses_test=0.692, auc_train=0.513, auc_test=0.529]

Early stopping





In [26]:
results = pd.DataFrame(scores).T
results = results.reset_index().rename(columns = {'index':'epoch'})

plot_results(results)

# 5. Cross-Validation

In [27]:
batch_size = 500
num_generated_batches_train = 100
num_generated_batches_val = 10
n_epochs, e_patience = 100, 10

In [28]:
batch_size*num_generated_batches_train

50000

In [29]:
train_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

In [30]:
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold

seed = 123
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

test_aucs = {}

for i, (train_index, test_index) in enumerate(kfolds.split(train_df, train_df.label)):
    print("---"*10)
    print(f"Fold {i+1}/{kfolds.n_splits}")

    train_df_kf = train_df.iloc[train_index].reset_index(drop=True)
    test_df_kf = train_df.iloc[test_index].reset_index(drop=True)

    print('   Generate train data ...')

    train_data = RecBatch(
        data = train_df_kf,
        cols_sparse = [c+'_enc' for c in cols_sparse],
        cols_dense = cols_dense,
        col_label = "label",
        batch_size = batch_size,
        num_generated_batches = num_generated_batches_train,
        seed = seed,
        device= device
    )

    print('   Generate test data ...')

    test_data = RecBatch(
        data = test_df_kf,
        cols_sparse = [c+'_enc' for c in cols_sparse],
        cols_dense = cols_dense,
        col_label = "label",
        batch_size = batch_size,
        num_generated_batches = num_generated_batches_val,
        seed = seed,
        device= device
    )

    model_dlrm = DLRMCustom(
        cols_dense, cols_sparse,
        embedding_dim, num_embeddings_per_feature,
        dense_arch_layer_sizes, over_arch_layer_sizes,
        adagrad, learning_rate, eps,
        device)

    scores = model_dlrm.train_test(train_data, test_data, n_epochs, e_patience)
    scores = pd.DataFrame(scores).T.reset_index().rename(columns = {'index':'epoch'})

    print('   Scores:')
    print(scores.iloc[-5:])

    labels, preds, losses_test, auc_test = model_dlrm.evaluate(test_data)
    print("   Test AUC:", round(auc_test, 4))

    test_aucs[i] = auc_test


------------------------------
Fold 1/3
   Generate train data ...
replace True
   Generate test data ...
replace True


 62%|██████▏   | 62/100 [00:32<00:20,  1.90it/s, epoch=63, loss_train=0.685, losses_test=0.696, auc_train=0.571, auc_test=0.524]


Early stopping
   Scores:
    epoch  loss_train  loss_test  auc_train  auc_test
58     58    0.685878   0.697065     0.5659    0.5210
59     59    0.685507   0.697801     0.5671    0.5227
60     60    0.684978   0.695432     0.5697    0.5248
61     61    0.684526   0.697813     0.5715    0.5241
62     62    0.684680   0.695828     0.5710    0.5241
   Test AUC: 0.5241
------------------------------
Fold 2/3
   Generate train data ...
replace True
   Generate test data ...
replace True


 94%|█████████▍| 94/100 [00:44<00:02,  2.13it/s, epoch=95, loss_train=0.673, losses_test=0.719, auc_train=0.609, auc_test=0.5]


Early stopping
   Scores:
    epoch  loss_train  loss_test  auc_train  auc_test
90     90    0.674196   0.714838     0.6053    0.4925
91     91    0.672693   0.716547     0.6099    0.5107
92     92    0.673487   0.714668     0.6080    0.5035
93     93    0.675512   0.712997     0.6018    0.4999
94     94    0.672610   0.719446     0.6089    0.5002
   Test AUC: 0.5002
------------------------------
Fold 3/3
   Generate train data ...
replace True
   Generate test data ...
replace True


 94%|█████████▍| 94/100 [00:44<00:02,  2.13it/s, epoch=95, loss_train=0.659, losses_test=0.734, auc_train=0.642, auc_test=0.489]

Early stopping
   Scores:
    epoch  loss_train  loss_test  auc_train  auc_test
90     90    0.660853   0.732778     0.6366    0.4764
91     91    0.663043   0.731722     0.6314    0.4958
92     92    0.659055   0.737153     0.6399    0.4735
93     93    0.662437   0.729665     0.6325    0.4896
94     94    0.658766   0.734510     0.6419    0.4888
   Test AUC: 0.4888





In [31]:
test_aucs = pd.DataFrame.from_dict(test_aucs, orient='index').\
              reset_index().\
              rename(columns = {'index': 'epoch', 0:'val_auc'})

test_aucs

Unnamed: 0,epoch,val_auc
0,0,0.524127
1,1,0.500151
2,2,0.4888
