官方已經在 GitHub 上提供了完整的 PyTorch 和 Caffe2 實現
 
這裡只能根據自己的一個粗淺的理解，梳理一個簡單的代碼版本

這裡略去了並行處理和一些複雜的優化，包括各個層各自特有的優化，以及使用 GPU 進行訓練

[官方實現](https://github.com/facebookresearch/dlrm)

In [1]:
import torch.nn as nn
import torch
class DLRM_Net(nn.Module):
    """Minimal single-device DLRM-style recommendation network.

    The model has three parts:

    * one ``nn.EmbeddingBag`` (sum pooling) per sparse feature,
    * a bottom MLP that projects the dense features,
    * a top MLP that consumes the bottom-MLP output concatenated with the
      pairwise dot products between the bottom-MLP output and every pooled
      embedding vector.

    Args:
        m_spa: Embedding dimension; must equal the bottom MLP output width.
        ln_emb: Number of rows of each embedding table.
        ln_bot: Layer widths of the bottom MLP (input width first).
        ln_top: Layer widths of the top MLP (input width first).
        sigmoid_top: Index of the top-MLP layer followed by a sigmoid
            instead of a ReLU.

    Raises:
        ValueError: If the layer widths are inconsistent with each other.
    """

    def __init__(self, m_spa=2, ln_emb=(4, 3, 2), ln_bot=(4, 3, 2),
                 ln_top=(8, 4, 2, 1), sigmoid_top=2):
        super().__init__()
        if len(ln_bot) == 0 or len(ln_top) == 0:
            raise ValueError("ln_bot and ln_top must not be empty")
        if int(ln_bot[-1]) != int(m_spa):
            raise ValueError(
                "bottom MLP output width (%d) must equal the embedding "
                "dimension (%d)" % (int(ln_bot[-1]), int(m_spa))
            )
        # The interaction keeps one dot product per unordered pair among
        # the dense vector and the len(ln_emb) pooled embedding vectors.
        num_vectors = len(ln_emb) + 1
        num_pairs = num_vectors * (num_vectors - 1) // 2
        expected_top_in = int(ln_bot[-1]) + num_pairs
        if int(ln_top[0]) != expected_top_in:
            raise ValueError(
                "top MLP input width must be %d, got %d"
                % (expected_top_in, int(ln_top[0]))
            )
        # Creation order (embeddings, bottom, top) is kept stable so that
        # seeded parameter initialisation is reproducible.
        self.emb_l = self.create_emb(m_spa, ln_emb)
        # -1 never matches a layer index, so the bottom MLP is ReLU-only.
        self.bot_l = self.create_mlp(ln_bot, -1)
        self.top_l = self.create_mlp(ln_top, sigmoid_top)
        self.loss_fn = torch.nn.MSELoss(reduction='mean')

    def create_mlp(self, ln, sigmoid_layer):
        """Build a fully connected stack from the layer widths ``ln``.

        Every ``nn.Linear`` is followed by ``nn.ReLU``, except the layer at
        index ``sigmoid_layer`` which is followed by ``nn.Sigmoid``.

        Returns:
            An ``nn.Sequential`` holding the layers in order.
        """
        layers = []
        for i, (n, m) in enumerate(zip(ln[:-1], ln[1:])):
            layers.append(nn.Linear(int(n), int(m), bias=True))
            layers.append(nn.Sigmoid() if i == sigmoid_layer else nn.ReLU())
        return torch.nn.Sequential(*layers)

    def create_emb(self, m, ln):
        """Create one sum-pooling ``nn.EmbeddingBag`` per entry of ``ln``.

        Args:
            m: Embedding dimension shared by all tables.
            ln: Number of rows of each table.

        Returns:
            An ``nn.ModuleList`` with the tables in the order of ``ln``.
        """
        return nn.ModuleList(
            nn.EmbeddingBag(int(n), int(m), mode="sum", sparse=True)
            for n in ln
        )

    def apply_emb(self, lS_o, lS_i, emb_l):
        """Look up and pool the sparse features.

        Args:
            lS_o: Per-table bag offsets (``lS_o[k]`` belongs to table ``k``).
            lS_i: Per-table flat index tensors.
            emb_l: Embedding tables, indexed like ``lS_i``.

        Returns:
            A list with one pooled embedding tensor per entry of ``lS_i``.
        """
        return [
            emb_l[k](sparse_index_group_batch, lS_o[k])
            for k, sparse_index_group_batch in enumerate(lS_i)
        ]

    def apply_mlp(self, x, layers):
        """Run ``x`` through the given layer stack."""
        return layers(x)

    def interact_features(self, x, ly):
        """Combine the dense vector with pairwise feature interactions.

        Args:
            x: Dense vectors, shape ``(batch_size, d)``.
            ly: Sequence of pooled embedding tensors, each ``(batch_size, d)``.

        Returns:
            ``x`` concatenated along dim 1 with the dot products of every
            unordered pair of distinct vectors (strictly lower triangle of
            the Gram matrix, row by row).
        """
        batch_size, d = x.shape
        T = torch.cat([x, *ly], dim=1).view((batch_size, -1, d))
        # Z[b, i, j] is the dot product of vectors i and j of sample b.
        Z = torch.bmm(T, torch.transpose(T, 1, 2))
        _, ni, nj = Z.shape
        # tril_indices always yields integer indices (also when there are
        # no pairs) and places them on the same device as Z.
        li, lj = torch.tril_indices(ni, nj, offset=-1, device=Z.device)
        Zflat = Z[:, li, lj]
        # concatenate dense features and interactions
        return torch.cat([x, Zflat], dim=1)

    def forward(self, dense_x, lS_o, lS_i):
        """Compute the model output for one batch.

        Args:
            dense_x: Dense input features fed to the bottom MLP.
            lS_o: Per-table bag offsets.
            lS_i: Per-table flat sparse indices.

        Returns:
            The output of the top MLP.
        """
        x = self.apply_mlp(dense_x, self.bot_l)
        ly = self.apply_emb(lS_o, lS_i, self.emb_l)
        z = self.interact_features(x, ly)
        return self.apply_mlp(z, self.top_l)

In [2]:
import numpy as np
'''
# here we only show the fake train code and the output and input shape
train code
parameters = dlrm.parameters()
optimizer = torch.optim.SGD(parameters,lr=0.01)
for j,inputBatch in enumerate(train_ld):
    x,lS_o,lS_i,T = inputBatch
    # forward
    Z = dlrm(
        x,
        lS_o,
        lS_i
    )
    E = dlrm.loss_fn(Z,T)
    # backward
    optimizer.zero_grad()
    E.backward()
    # apply the parameter update
    optimizer.step()
'''
# Synthetic inputs used only to exercise the model.
# Dense features: B samples with m_den random values each.
B = 20
m_den = 4
ln_emb = [4, 3, 2]
num_indices_per_lookup = 10
Xt = torch.tensor(np.random.rand(B, m_den).astype(np.float32))

# Sparse features: for every embedding table build B lookups ("bags").
# Each table gets a flat tensor of indices plus a tensor of offsets that
# marks where each bag starts inside the flat tensor.
lS_emb_offsets = []
lS_emb_indices = []
for table_rows in ln_emb:
    # a bag never asks for more indices than the table has rows
    max_bag_len = min(table_rows, num_indices_per_lookup)
    bag_starts = []
    flat_indices = []
    cursor = 0
    for _ in range(B):
        # number of indices to draw for this bag, at least 1
        draw = np.random.random(1)
        bag_len = int(np.round(max(1.0, draw[0] * max_bag_len)))
        # draw the row indices and drop duplicates (result is sorted)
        picks = np.random.random(bag_len)
        bag = np.unique(np.round(picks * (table_rows - 1)).astype(np.int64))
        # record where this bag starts, then append its indices
        bag_starts.append(cursor)
        flat_indices.extend(bag.tolist())
        # the next bag starts after the de-duplicated indices
        cursor += int(bag.size)
    lS_emb_offsets.append(torch.tensor(bag_starts))
    lS_emb_indices.append(torch.tensor(flat_indices))
print(lS_emb_offsets)
print(lS_emb_indices)

[tensor([ 0,  2,  4,  5,  7,  9, 10, 12, 13, 16, 17, 18, 19, 20, 22, 24, 26, 29,
        32, 34]), tensor([ 0,  1,  2,  3,  4,  5,  6,  8,  9, 11, 13, 14, 17, 19, 21, 22, 24, 26,
        27, 28]), tensor([ 0,  1,  2,  3,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 19,
        20, 21])]
[tensor([0, 2, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 2, 3, 3, 0, 2, 2, 0, 1, 0, 3,
        1, 2, 1, 2, 3, 0, 1, 2, 1, 3, 3]), tensor([2, 2, 1, 1, 1, 0, 1, 2, 1, 1, 2, 0, 1, 1, 0, 1, 2, 0, 1, 0, 1, 2, 0, 2,
        1, 2, 0, 0, 2]), tensor([0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1])]


In [3]:
# Instantiate the model and run a single forward pass on the fake batch.
dlrm = DLRM_Net()
Z_test = dlrm(
    dense_x=Xt,
    lS_o=lS_emb_offsets,
    lS_i=lS_emb_indices,
)
# bare expression so the notebook displays the result
Z_test

tensor([[0.5473],
        [0.5473],
        [0.5473],
        [0.5473],
        [0.5473],
        [0.5448],
        [0.5473],
        [0.5473],
        [0.5473],
        [0.5473],
        [0.5431],
        [0.5473],
        [0.5473],
        [0.5466],
        [0.5473],
        [0.5473],
        [0.5473],
        [0.5473],
        [0.5473],
        [0.5473]], grad_fn=<SigmoidBackward>)