# Deep Crossing

该模型使用 Field（特征域）的概念，将每一个 Field 的特征分别通过 Embedding 层嵌入到低维空间内。如果有 $n$ 个 Field，那么需要把这 $n$ 个 Field 经过 Embedding 后的结果拼接成一个向量，然后送入残差网络进行优化，得到最终的预测结果。

In [1]:
# build field dict

import os

BASEDIR = os.getcwd()

# Maps each field id (string) to
#   {'field': {feature: index, ...}, 'last_idx': highest index handed out}.
fields_dict = {}

with open(BASEDIR + '/assets/datasets/criteo_ctr/small_train.txt') as f:
    for raw in f:
        record = raw.strip('\n')
        # The first empty line ends the scan, exactly like end-of-file.
        if not record:
            break

        # Token 0 is the label; every following token is "field:feature:value".
        for token in record.split(' ')[1:]:
            field, feature, value = token.split(':')
            entry = fields_dict.get(field)
            if entry is None:
                # First sighting of this field: its first feature gets index 0.
                fields_dict[field] = {'field': {feature: 0}, 'last_idx': 0}
            elif feature not in entry['field']:
                # New feature of a known field: hand out the next free index.
                fresh_idx = entry['last_idx'] + 1
                entry['field'][feature] = fresh_idx
                entry['last_idx'] = fresh_idx

# Reserve one extra index per field under the key 'none'; it is used for
# samples that carry no feature for that field.
for entry in fields_dict.values():
    none_idx = entry['last_idx'] + 1
    entry['field']['none'] = none_idx
    entry['last_idx'] = none_idx

In [2]:
# Number of distinct fields collected in fields_dict.
field_cnt = len(fields_dict)
# Bare expression so the notebook cell echoes the value as its output.
field_cnt

18

In [None]:
# build train data
#
# X_train[k] holds one list per field position (field_cnt lists in total),
# each containing the feature indices seen for that field in sample k.
# y_train[k] is the integer label of sample k.

X_train = []
y_train = []
idx = 0
with open(BASEDIR + '/assets/datasets/criteo_ctr/small_train.txt') as f:
    for raw in f:
        record = raw.strip('\n')
        # The first empty line ends the scan, exactly like end-of-file.
        if not record:
            break

        label, *pairs = record.split(' ')
        y_train.append(int(label))

        # One (initially empty) bucket per field for the current sample.
        row = [[] for _ in range(field_cnt)]
        X_train.append(row)

        # Each remaining token is "field:feature:value"; the field id doubles
        # as the bucket position.
        for pair in pairs:
            field, feature, value = pair.split(':')
            row[int(field)].append(fields_dict[field]['field'][feature])

        # Fields absent from this sample receive that field's 'none' index.
        for pos, bucket in enumerate(row):
            if not bucket:
                bucket.append(fields_dict[str(pos)]['field']['none'])

        idx += 1

In [9]:
# build embedding layer

import torch
import torch.nn as nn
import torch.nn.functional as F


class ResidualBlock(nn.Module):
    """Two-layer fully connected residual unit.

    Computes ``relu(h2(relu(h1(x))) + x)``. Because the input is added back
    to the output of ``h2``, the output width equals the input width.

    Both linear layers are cast to double precision, so inputs must be
    ``torch.float64`` tensors whose last dimension is ``INPUT_DIMENSION``.

    Args:
        INPUT_DIMENSION: width of the input (and of the output).
        HIDDEN_DIMENSION: width of the intermediate layer.
    """

    def __init__(self, INPUT_DIMENSION, HIDDEN_DIMENSION):
        super(ResidualBlock, self).__init__()
        self.INPUT_DIMENSION = INPUT_DIMENSION
        self.HIDDEN_DIMENSION = HIDDEN_DIMENSION
        # The skip connection forces output width == input width. The previous
        # code read an undefined name here and raised NameError on construction.
        self.OUTPUT_DIMENSION = INPUT_DIMENSION
        self.h1 = nn.Linear(INPUT_DIMENSION, HIDDEN_DIMENSION, bias=True).double()
        self.h2 = nn.Linear(HIDDEN_DIMENSION, INPUT_DIMENSION, bias=True).double()

    def forward(self, x):
        """Apply the residual unit to ``x`` and return a tensor of the same shape."""
        residual = x
        out = F.relu(self.h1(x))
        out = self.h2(out)
        # Add the skip connection before the final non-linearity.
        out = F.relu(out + residual)
        return out


class DeepCrossing(nn.Module):
    """Deep Crossing network.

    Every field gets its own linear embedding layer; the embedded fields are
    concatenated, passed through three residual blocks and reduced to one
    sigmoid output.

    All layers are double precision to match ``ResidualBlock``, so inputs
    must be ``torch.float64`` tensors.

    Args:
        fields_dict: mapping ``field -> {'field': {feature: index, ...}, ...}``.
            The size of each inner ``'field'`` mapping is the input width of
            that field's embedding layer. Layers are created in the iteration
            order of ``fields_dict``.
        embedding_size: output width of every per-field embedding layer.
        hidden_size: hidden width of each residual block (defaults to 5, the
            value that used to be hard-coded).

    Raises:
        ValueError: if ``fields_dict`` is empty.
    """

    def __init__(self, fields_dict, embedding_size, hidden_size=5):
        super(DeepCrossing, self).__init__()

        if not fields_dict:
            raise ValueError("fields_dict must contain at least one field")

        self.fields_dict = fields_dict
        self.embedding_size = embedding_size

        # nn.ModuleList (not a plain list) registers the layers as sub-modules
        # so their weights show up in parameters() and follow .to()/.double().
        self.input_embeddings = nn.ModuleList([
            nn.Linear(len(spec['field']), self.embedding_size).double()
            for spec in self.fields_dict.values()
        ])

        # Width of the concatenated embeddings: one embedding per field.
        self.res_input_dim = len(self.input_embeddings) * self.embedding_size
        self.res1 = ResidualBlock(self.res_input_dim, hidden_size)
        self.res2 = ResidualBlock(self.res_input_dim, hidden_size)
        self.res3 = ResidualBlock(self.res_input_dim, hidden_size)
        self.fc = nn.Linear(self.res_input_dim, 1).double()

    def forward(self, x):
        """Run the network.

        Args:
            x: indexable collection with one tensor per embedding layer;
                ``x[k]`` is fed to the k-th layer, so its last dimension must
                equal the size of the k-th field's feature mapping. The
                tensors are concatenated along dim 1, so they are expected to
                be 2-D.

        Returns:
            Tensor with last dimension 1 holding sigmoid outputs in (0, 1).
        """
        embedding_input = [
            layer(x[idx]) for idx, layer in enumerate(self.input_embeddings)
        ]

        stack_input = torch.cat(embedding_input, 1)
        out = self.res1(stack_input)
        out = self.res2(out)
        out = self.res3(out)
        # torch.sigmoid replaces the deprecated F.sigmoid.
        out = torch.sigmoid(self.fc(out))

        return out