# Deep Crossing

该模型使用 Field 的概念，将每一个特征使用 Embedding 嵌入到低维度空间内。如果有 $n$ 个特征，那么需要把每一个维度 Embedding 后的结果拼接起来，最后送入残差网络，进行优化，得到最终的结果。

In [1]:
# build field dict

import os

BASEDIR = os.getcwd()

fields_dict = {}

with open(BASEDIR + '/assets/datasets/criteo_ctr/small_train.txt') as f:
    line = f.readline()
    line = line.strip('\n')

    while line:
        elems = line.split(' ')

        for i in range(1, len(elems)):
            field, feature, value = elems[i].split(':')
            if field not in fields_dict:
                fields_dict[field] = {}
                fields_dict[field]['field'] = {feature: 0}
                fields_dict[field]['last_idx'] = 0
            elif feature not in fields_dict[field]['field']:
                next_idx = fields_dict[field]['last_idx'] + 1
                fields_dict[field]['field'][feature] = next_idx
                fields_dict[field]['last_idx'] = next_idx
 
        line = f.readline()
        line = line.strip('\n')

for field in fields_dict.keys():
    next_idx = fields_dict[field]['last_idx'] + 1
    fields_dict[field]['field']['none'] = next_idx
    fields_dict[field]['last_idx'] = next_idx

field_cnt = len(fields_dict)
field_cnt

In [3]:
# build train data

X_train = []
y_train = []
idx = 0
with open(BASEDIR + '/assets/datasets/criteo_ctr/small_train.txt') as f:
    line = f.readline()
    line = line.strip('\n')

    while line:
        elems = line.split(' ')
        y_train.append(int(elems[0]))

        X_train.append([])
        for i in range(field_cnt):
            X_train[idx].append([])

        for i in range(1, len(elems)):
            field, feature, value = elems[i].split(':')
            field_idx = int(field)
            X_train[idx][field_idx].append(fields_dict[field]['field'][feature])

        for i in range(field_cnt):
            if len(X_train[idx][i]) == 0:
                X_train[idx][i].append(fields_dict[str(i)]['field']['none'])
       
        line = f.readline()
        line = line.strip('\n')
        idx += 1

In [4]:
# build embedding layer

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np


class ResidualBlock(nn.Module):
    def __init__(self, INPUT_DIMENSION, HIDDEN_DIMENSION):
        super(ResidualBlock, self).__init__()
        self.INPUT_DIMENSION = INPUT_DIMENSION
        self.h1 = nn.Linear(INPUT_DIMENSION, HIDDEN_DIMENSION, bias=True).double()
        self.h2 = nn.Linear(HIDDEN_DIMENSION, INPUT_DIMENSION, bias=True).double()

    def forward(self, x):
        residual = x
        out = F.relu(self.h1(x))
        out = self.h2(out)
        out = F.relu(out + residual)
        return out


class DeepCrossing(nn.Module):
    def __init__(self, fields_dict, embedding_size):
        super(DeepCrossing, self).__init__()

        self.fields_dict = fields_dict
        self.input_embeddings = []
        self.embedding_size = embedding_size

        for j in range(len(self.fields_dict)):
            self.input_embeddings.append(nn.Linear(len(self.fields_dict[str(j)]['field']), self.embedding_size).double())

        self.res_input_dim = len(self.fields_dict) * self.embedding_size
        self.res1 = ResidualBlock(self.res_input_dim, 5)
        self.res2 = ResidualBlock(self.res_input_dim, 5)
        self.res3 = ResidualBlock(self.res_input_dim, 5)
        self.fc = nn.Linear(self.res_input_dim, 1).double()

    def forward(self, x):
        embedding_input = []
        batch_size = len(x)
        for idx in range(len(self.input_embeddings)):
            fields_size = len(self.fields_dict[str(idx)]['field'])
            fields_idx_input = torch.zeros(fields_size, batch_size, dtype=dtype, device=device)
            for i in range(batch_size):
                for q in x[i][idx]:
                    fields_idx_input[q, i] = 1.0
            t = self.input_embeddings[idx](fields_idx_input.T)
            embedding_input.append(t)

        stack_input = torch.cat(embedding_input, 1)
        out = self.res1(stack_input)
        out = self.res2(out)
        out = self.res3(out)
        out = F.sigmoid(self.fc(out))

        return out

In [5]:
# PyTorch Version

import torch.optim as optim

dtype = torch.double
device = torch.device('cpu')

LEARNING_RATE = 1e-3

EPOCH = 10
PRINT_STEP = EPOCH / 10
N = len(y_train)

deepCrossing = DeepCrossing(fields_dict, 8)

BATCH_SIZE = 8
loss_fn = nn.BCELoss(size_average=True, reduce=True)
optimizer = optim.Adam(deepCrossing.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCH):
    start = 0
    end = start + BATCH_SIZE

    while start < N:
        optimizer.zero_grad()
        if end >= N:
            end = N

        X_batch = X_train[start:end]
        y_batch = torch.from_numpy(np.array(y_train[start:end], np.float)).reshape(-1, BATCH_SIZE)

        y_hat = deepCrossing(X_batch).reshape(-1, BATCH_SIZE)
        loss = loss_fn(y_hat, y_batch)

        loss.backward()
        optimizer.step()

        start = end
        end = start + BATCH_SIZE

    if epoch % PRINT_STEP == 0:
        print('EPOCH: %d, loss: %f' % (epoch, loss))


EPOCH: 0, loss: 0.561079
EPOCH: 1, loss: 0.560367
EPOCH: 2, loss: 0.560164
EPOCH: 3, loss: 0.559752
EPOCH: 4, loss: 0.558925
EPOCH: 5, loss: 0.558395
EPOCH: 6, loss: 0.557568
EPOCH: 7, loss: 0.556281
EPOCH: 8, loss: 0.554932
EPOCH: 9, loss: 0.553097
