# Criteo data

In [1]:
# Number of dense columns (named DENSE_0 .. DENSE_12 in the next cell).
NUM_DENSE = 13
# Number of sparse columns (named SPARSE_0 .. SPARSE_25 in the next cell).
NUM_SPARSE = 26

In [2]:
dense_features = [f"DENSE_{i}" for i in range(NUM_DENSE)]
sparse_features = [f"SPARSE_{i}" for i in range(NUM_SPARSE)]

In [3]:
columns = ["labels"] + dense_features + sparse_features

In [4]:
# dtype map handed to pd.read_csv: the label and dense columns are read as
# float32, every sparse column as a pandas categorical.
column_types = {
    **dict.fromkeys(["labels"] + dense_features, 'float32'),
    **dict.fromkeys(sparse_features, 'category'),
}

In [5]:
day_to_load = 0

In [6]:
file_path = f"/data/day_{day_to_load}.gz"

In [7]:
import pandas as pd

chunk_size = 1000  # Rows per chunk; adjust this value based on your system's memory constraints
num_chunks = 1  # Number of leading chunks of the file to load

# Collect the chunks in a list and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously loaded row on each
# iteration. The `with` block closes the gzip stream even though reading stops
# before the end of the file.
with pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=columns,
    dtype=column_types,
    compression='gzip',
    chunksize=chunk_size,
) as reader:
    # zip() stops as soon as range(num_chunks) is exhausted, so no extra chunk
    # is parsed, and num_chunks is left untouched (re-running the cell is safe).
    chunks = [chunk for _, chunk in zip(range(num_chunks), reader)]

# NOTE(review): with num_chunks > 1 the per-chunk categorical columns may carry
# different category sets, in which case concat may fall back to object dtype —
# verify the dtypes before loading more than one chunk.
result_df = pd.concat(chunks, ignore_index=True)

# Display the resulting DataFrame
print(result_df.head())

   labels  DENSE_0  DENSE_1  DENSE_2  DENSE_3  DENSE_4  DENSE_5  DENSE_6  \
0     1.0      5.0    110.0      NaN     16.0      NaN      1.0      0.0   
1     0.0     32.0      3.0      5.0      NaN      1.0      0.0      0.0   
2     0.0      NaN    233.0      1.0    146.0      1.0      0.0      0.0   
3     0.0      NaN     24.0      NaN     11.0     24.0      NaN      0.0   
4     0.0     60.0    223.0      6.0     15.0      5.0      0.0      0.0   

   DENSE_7  DENSE_8  ...  SPARSE_16  SPARSE_17  SPARSE_18  SPARSE_19  \
0     14.0      7.0  ...   d20856aa   b8170bba   9512c20b   c38e2f28   
1     61.0      5.0  ...   d20856aa   a1eb1511   9512c20b   febfd863   
2     99.0      7.0  ...   d20856aa   628f1b8d   9512c20b   c38e2f28   
3     56.0      3.0  ...   1f7fc70b   a1eb1511   9512c20b        NaN   
4      1.0      8.0  ...   d20856aa   d9f758ff   9512c20b   c709ec07   

  SPARSE_20 SPARSE_21 SPARSE_22 SPARSE_23 SPARSE_24 SPARSE_25  
0  14f65a5d  25b1b089  d7c1fc0b  7caf609c  304

In [8]:
result_df.SPARSE_9

0      2e027dc1
1      7de9c0a9
2      2e027dc1
3           NaN
4      6da2367e
         ...   
995    937273d4
996    62da11e3
997    0c55e061
998    62da11e3
999    e241e533
Name: SPARSE_9, Length: 1000, dtype: category
Categories (294, object): ['003add7e', '0062a7ef', '00ef7eef', '01f2feb1', ..., 'fce75ba2', 'fe23372a', 'fea5bfef', 'feea2469']

In [9]:
from typing import NamedTuple, Mapping, Tuple
import torch

class ModelInput(NamedTuple):
    """Bundle of the two inputs of the model: dense features and sparse features."""
    # Dense feature tensor.
    dense_features: torch.Tensor
    # Sparse feature tensors keyed by string; later in this notebook the keys
    # are the sparse column names (e.g. "SPARSE_0").
    sparse_features: Mapping[str, torch.Tensor]

# Model Architecture

An implementation of a deep learning recommendation model (DLRM). The model input consists of dense and sparse features. The former is a vector of floating-point values. The latter is a list of sparse indices into embedding tables, which consist of vectors of floating-point values. The selected vectors are passed to MLP networks, denoted by triangles; in some cases the vectors interact with each other through operators (Ops).

In [13]:
# output:
#                     probability of a click
# model:                        |
#                              /\
#                             /__\
#                               |
#       _____________________> Op  <___________________
#     /                         |                      \
#    /\                        /\                      /\
#   /__\                      /__\           ...      /__\
#    |                          |                       |
#    |                         Op                      Op
#    |                    ____/__\_____           ____/__\____
#    |                   |_Emb_|____|__|    ...  |_Emb_|__|___|
# input:
# [ dense features ]     [sparse indices] , ..., [sparse indices]

In [11]:
import torch.nn as nn

# MLP

In [16]:
# Define the MLP model
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear."""

    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        super().__init__()
        # Sub-modules are registered in the order fc1, relu, fc2.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Apply fc1, the ReLU activation and fc2 in sequence and return the result."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Dense Arch

In [17]:
import torch.nn as nn

class DenseArch(nn.Module):
    """Dense tower: passes the dense feature vector through a single MLP."""

    def __init__(self, dense_feature_count: int, output_size: int) -> None:
        super().__init__()
        # The hidden layer is twice as wide as the output: D -> 2*O -> O
        hidden_size = output_size * 2
        self.mlp = MLP(dense_feature_count, hidden_size, output_size)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Map a B x D batch of dense features to a B x O tensor."""
        projected = self.mlp(inputs)
        return projected

In [18]:
# Let's test it

In [69]:
dense_tensor = torch.from_numpy(result_df[dense_features].fillna(0).values)
dense_tensor.size()

torch.Size([1000, 13])

In [26]:
output_size = 16

# Build the dense tower from output_size instead of repeating the literal 16,
# so the constant above actually controls the width of the layer.
dense_arch = DenseArch(len(dense_features), output_size)
dense_out = dense_arch(dense_tensor)

In [27]:
dense_out.size()

torch.Size([1000, 16])

In [28]:
dense_out

tensor([[ -21.8944,   42.7839,   13.1310,  ...,  -19.7198,   -1.6498,
           12.8695],
        [-162.7766,  464.6440,  188.7261,  ..., -186.6807,   22.6407,
          134.0165],
        [-166.5877,  451.4858,  162.3094,  ..., -183.4777,   -2.3576,
          132.6056],
        ...,
        [-220.7068,  624.7383,  244.8305,  ..., -250.1361,   19.7442,
          181.1371],
        [ -20.4149,   22.6927,    8.2683,  ...,   -9.8889,   -4.8041,
            6.0394],
        [-197.4798,  -22.9126,  -27.6376,  ...,  -24.3786, -197.4626,
           44.8815]], grad_fn=<AddmmBackward0>)

# Sparse Arch

In [158]:
from typing import List, Dict

class SparseArch(nn.Module):
    """Sparse tower: one embedding table and one MLP per sparse feature.

    Args:
        embedding_dimensions: Maps each sparse feature name to the
            ``(num_embeddings, embedding_dim)`` of its embedding table.
        output_size: Output width of every per-feature MLP; the hidden layer
            of each MLP is twice as wide.
    """

    def __init__(self, embedding_dimensions: Mapping[str, Tuple[int, int]], output_size: int) -> None:
        super().__init__()

        # Create Embedding layers for each sparse feature
        self.embeddings = nn.ModuleDict({
            feature_name: nn.Embedding(num_embeddings, embedding_dim)
            for feature_name, (num_embeddings, embedding_dim) in embedding_dimensions.items()
        })

        # Create MLP for each sparse feature (its input is that feature's embedding vector)
        self.mlps = nn.ModuleDict({
            feature_name: MLP(input_size=embedding_dim, hidden_size=output_size * 2, output_size=output_size)
            for feature_name, (_, embedding_dim) in embedding_dimensions.items()
        })

    def forward(self, inputs: Dict[str, torch.Tensor]) -> List[torch.Tensor]:
        """Embed every registered sparse feature and pass it through its MLP.

        The result follows the order in which the features were registered in
        ``__init__``, not the iteration order of ``inputs``. Consumers that
        concatenate the outputs therefore always see each feature at the same
        position, however the caller built the dict. Keys of ``inputs`` that
        were not registered are ignored.

        Args:
            inputs: Index tensor for each sparse feature, keyed by feature name.

        Returns:
            One output tensor per registered feature, in registration order.

        Raises:
            KeyError: If a registered feature is missing from ``inputs``.
        """
        return [
            self.mlps[feature](embedding(inputs[feature]))
            for feature, embedding in self.embeddings.items()
        ]

In [159]:
embedding_dimensions = {fn: (len(result_df[fn].dtype.categories) + 1, 16) for fn in sparse_features}

In [160]:
output_size = 16

In [161]:
sparse_arch = SparseArch(embedding_dimensions=embedding_dimensions, output_size=output_size)

In [162]:
import numpy as np
# Convert every categorical column to an int64 tensor of category codes.
# The codes are shifted by +1, presumably so that the -1 code pandas uses for
# missing values lands on index 0 (the embedding tables above are sized
# len(categories) + 1, which leaves room for it) — TODO confirm.
sparse_values = {fn: torch.from_numpy(result_df[fn].cat.codes.values.astype(np.int64)) + 1 for fn in sparse_features}

In [163]:
sparse_values["SPARSE_0"].size()

torch.Size([1000])

In [164]:
sparse_out = sparse_arch(sparse_values)

# Dense Sparse Interaction

In [165]:
class DenseSparseInteractionLayer(nn.Module):
    """Pairwise-product interaction over the concatenated dense and sparse outputs."""

    def forward(self, dense_out: torch.Tensor, sparse_out: List[torch.Tensor]) -> torch.Tensor:
        """Compute all pairwise products of the concatenated feature vector.

        Args:
            dense_out: Dense tower output, a 2-D tensor (batch x width).
            sparse_out: Sparse tower outputs, each a 2-D tensor with the same
                batch size. Any iterable of tensors is accepted.

        Returns:
            A ``B x (F * F)`` tensor, where ``F`` is the total width of the
            concatenated inputs: the flattened outer product of each sample's
            concatenated vector with itself.
        """
        # Unpacking (instead of `[dense_out] + sparse_out`) keeps the same
        # result for lists and additionally accepts tuples and other iterables.
        concat = torch.cat([dense_out, *sparse_out], dim=-1).unsqueeze(2)  # B x F x 1
        out = torch.bmm(concat, torch.transpose(concat, 1, 2))  # B x F x F
        return torch.flatten(out, 1)

In [166]:
class PredictionLayer(nn.Module):
    """Final MLP + sigmoid that turns the interaction features into a probability.

    Args:
        dense_out_size: Width of the dense tower output.
        sparse_out_sizes: Width of each sparse tower output.
        hidden_size: Hidden width of the prediction MLP.
    """

    def __init__(self, dense_out_size: int, sparse_out_sizes: List[int], hidden_size: int):
        super().__init__()
        # The MLP input is sized for the square of the total concatenated width.
        concat_size = sum(sparse_out_sizes) + dense_out_size
        self.mlp = MLP(input_size=concat_size * concat_size, hidden_size=hidden_size, output_size=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """Return the sigmoid of the MLP output (a tensor, one value in (0, 1) per row)."""
        mlp_out = self.mlp(inputs)
        return self.sigmoid(mlp_out)

In [167]:
dense_sparse_interaction_layer = DenseSparseInteractionLayer()
ds_out = dense_sparse_interaction_layer(dense_out, sparse_out)

In [168]:
ds_out.size()

torch.Size([1000, 186624])

In [169]:
prediction_layer = PredictionLayer(dense_out_size=output_size, sparse_out_sizes=[output_size]*len(sparse_features), hidden_size=16)

In [170]:
pred_out = prediction_layer(ds_out)
pred_out.size()

torch.Size([1000, 1])

# Model

In [201]:
from dataclasses import dataclass

# parameters
# dense_input_feature_size
# sparse_embedding_dimenstions
# dense_output_size
# sparse_output_size
# dense_hidden_size
# sparse_hidden_size
# prediction_hidden_size

@dataclass
class Parameters:
    """Hyper-parameters read by DLRM when it builds its sub-modules."""
    # Number of dense input features; passed to DenseArch as dense_feature_count.
    dense_input_feature_size: int
    # Sparse feature name -> (num_embeddings, embedding_dim); passed to SparseArch.
    # The "dimenstions" spelling is part of the field name that callers use.
    sparse_embedding_dimenstions: Mapping[str, Tuple[int, int]]
    # Output width of the dense tower.
    dense_output_size: int
    # Output width of every per-feature sparse MLP.
    sparse_output_size: int
    # NOTE(review): not read by DLRM.__init__ as written; DenseArch derives its
    # hidden width as output_size * 2.
    dense_hidden_size: int
    # NOTE(review): not read by DLRM.__init__ as written; SparseArch derives its
    # hidden width as output_size * 2.
    sparse_hidden_size: int
    # Hidden width of the prediction MLP.
    prediction_hidden_size: int

class DLRM(nn.Module):
    """Deep learning recommendation model: dense tower, sparse tower, interaction, prediction.

    Args:
        parameters: Sizes of the sub-modules. ``dense_hidden_size`` and
            ``sparse_hidden_size`` are not read here; the towers derive their
            hidden width from their output size.
    """

    def __init__(self, parameters: Parameters):
        super().__init__()
        self.dense_layer = DenseArch(dense_feature_count=parameters.dense_input_feature_size, output_size=parameters.dense_output_size)
        self.sparse_layer = SparseArch(embedding_dimensions=parameters.sparse_embedding_dimenstions, output_size=parameters.sparse_output_size)
        self.interaction_layer = DenseSparseInteractionLayer()
        # One sparse output (of width sparse_output_size) per embedding table.
        self.prediction_layer = PredictionLayer(
            dense_out_size=parameters.dense_output_size,
            sparse_out_sizes=[parameters.sparse_output_size]*len(parameters.sparse_embedding_dimenstions),
            hidden_size=parameters.prediction_hidden_size
        )

    def forward(self, dense_features: torch.Tensor, sparse_features: Mapping[str, torch.Tensor]) -> torch.Tensor:
        """Run the full model and return the prediction layer's output tensor.

        Args:
            dense_features: Batch of dense feature vectors.
            sparse_features: Index tensor for each sparse feature, keyed by
                feature name.

        Returns:
            The sigmoid output of the prediction layer (a tensor, not a float).
        """
        dense_out = self.dense_layer(dense_features)
        sparse_out = self.sparse_layer(sparse_features)
        ds_out = self.interaction_layer(dense_out, sparse_out)
        return self.prediction_layer(ds_out)

In [202]:
# Hyper-parameters for the full model. Every embedding table gets
# len(categories) + 1 rows and 16-dimensional vectors.
# NOTE(review): dense_hidden_size and sparse_hidden_size are not read by
# DLRM.__init__ as written, so the two 32s below have no effect.
parameters = Parameters(
    dense_input_feature_size=len(dense_features),
    sparse_embedding_dimenstions={fn: (len(result_df[fn].dtype.categories) + 1, 16) for fn in sparse_features},
    dense_output_size=16,
    sparse_output_size=16,
    dense_hidden_size=32,
    sparse_hidden_size=32,
    prediction_hidden_size=32)

In [203]:
dlrm = DLRM(parameters=parameters)

In [204]:
# dlrm

In [198]:
model_input = ModelInput(dense_features=dense_tensor, sparse_features=sparse_values)

In [208]:
traced_model = torch.jit.trace(dlrm, [dense_tensor, sparse_values])

In [211]:
print(traced_model.code)

def forward(self,
    dense_features: Tensor,
    sparse_features: Dict[str, Tensor]) -> Tensor:
  prediction_layer = self.prediction_layer
  interaction_layer = self.interaction_layer
  sparse_layer = self.sparse_layer
  dense_layer = self.dense_layer
  input = sparse_features["SPARSE_0"]
  input0 = sparse_features["SPARSE_1"]
  input1 = sparse_features["SPARSE_2"]
  input2 = sparse_features["SPARSE_3"]
  input3 = sparse_features["SPARSE_4"]
  input4 = sparse_features["SPARSE_5"]
  input5 = sparse_features["SPARSE_6"]
  input6 = sparse_features["SPARSE_7"]
  input7 = sparse_features["SPARSE_8"]
  input8 = sparse_features["SPARSE_9"]
  input9 = sparse_features["SPARSE_10"]
  input10 = sparse_features["SPARSE_11"]
  input11 = sparse_features["SPARSE_12"]
  input12 = sparse_features["SPARSE_13"]
  input13 = sparse_features["SPARSE_14"]
  input14 = sparse_features["SPARSE_15"]
  input15 = sparse_features["SPARSE_16"]
  input16 = sparse_features["SPARSE_17"]
  input17 = sparse_features["SP

In [213]:
traced_model(dense_tensor, sparse_values)

tensor([[4.0291e-01],
        [9.5614e-01],
        [9.9924e-01],
        [0.0000e+00],
        [9.9613e-01],
        [4.7108e-01],
        [0.0000e+00],
        [0.0000e+00],
        [3.5301e-19],
        [4.7533e-01],
        [3.0257e-09],
        [2.8145e-08],
        [1.0000e+00],
        [1.0000e+00],
        [1.0000e+00],
        [4.7033e-01],
        [4.6534e-01],
        [7.9726e-01],
        [3.3282e-05],
        [1.8508e-06],
        [5.2667e-01],
        [1.0000e+00],
        [9.9998e-01],
        [3.8780e-01],
        [0.0000e+00],
        [3.9004e-01],
        [0.0000e+00],
        [1.0000e+00],
        [9.9031e-01],
        [0.0000e+00],
        [4.3205e-01],
        [1.0000e+00],
        [6.0877e-01],
        [5.2975e-01],
        [9.9989e-01],
        [4.3957e-01],
        [1.4588e-02],
        [9.9992e-01],
        [5.4316e-05],
        [5.4616e-35],
        [4.4739e-01],
        [4.6119e-01],
        [0.0000e+00],
        [9.9913e-01],
        [9.9304e-01],
        [4