In [1]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader, Dataset, TensorDataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import roc_auc_score, accuracy_score
import datetime
import warnings
warnings.filterwarnings('ignore')

## 构造数据

In [2]:
def create_criteo_dataset(file_path, test_size=0.3, random_state=None):
    """Load a Criteo-style CSV and return one-hot encoded train/test splits.

    Columns whose name starts with 'I' are treated as dense (numeric)
    features, columns whose name starts with 'C' as sparse (categorical)
    features, and the target is read from the 'label' column.

    Args:
        file_path: Path of the CSV file, passed to ``pd.read_csv``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``; pass an int for a
            reproducible split. The default ``None`` keeps the split random.

    Returns:
        ``((X_train, y_train), (X_test, y_test))`` where the ``X`` parts are
        float64 NumPy arrays and the ``y`` parts are pandas Series.

    Raises:
        KeyError: If the file has no 'label' column.
    """
    data = pd.read_csv(file_path)

    dense_features = [col for col in data.columns if col.startswith('I')]
    sparse_features = [col for col in data.columns if col.startswith('C')]

    # Fill missing values: 0 for dense columns, the sentinel '-1' for sparse ones.
    if dense_features:
        data[dense_features] = data[dense_features].fillna(0)
    if sparse_features:
        data[sparse_features] = data[sparse_features].fillna('-1')

    # One-hot encode. Request a float dtype explicitly: recent pandas emits
    # bool dummies, which would turn the mixed float/bool matrix into an
    # object array that torch.tensor() cannot consume.
    data = pd.get_dummies(data, dtype=np.float64)

    # Split BEFORE scaling so that the scaler never sees test rows.
    features = data.drop(['label'], axis=1)
    dense_idx = np.flatnonzero(features.columns.isin(dense_features))
    X = features.to_numpy(dtype=np.float64)
    y = data['label']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Min-max normalise the dense columns using training statistics only;
    # test values may therefore fall slightly outside [0, 1].
    if dense_idx.size:
        scaler = MinMaxScaler().fit(X_train[:, dense_idx])
        X_train[:, dense_idx] = scaler.transform(X_train[:, dense_idx])
        X_test[:, dense_idx] = scaler.transform(X_test[:, dense_idx])

    return (X_train, y_train), (X_test, y_test)

In [4]:
# Load the data (half of the rows held out for testing) and wrap the
# training half in a shuffled DataLoader.
(X_train, y_train), (X_test, y_test) = create_criteo_dataset('./train.txt', test_size=0.5)

train_dataset = TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train.values, dtype=torch.float32),
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Peek at the first batch: feature shape and the label vector.
for batch_features, batch_labels in train_loader:
    print(batch_features.shape, batch_labels)
    break

torch.Size([32, 13104]) tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.])


## 构造模型

In [5]:
class FMLayer(nn.Module):
    """Second-order factorization machine with a sigmoid output.

    Holds a linear layer mapping ``n`` inputs to a single value and an
    ``(n, k)`` parameter ``v`` of latent vectors used for the pairwise term.
    """

    def __init__(self, n=10, k=5):
        """
        n: number of input features
        k: dimension of each latent vector
        """
        super().__init__()
        self.n, self.k = n, k
        # Bias and first-order weights.
        self.linear = nn.Linear(n, 1)
        # Latent interaction matrix, re-initialised uniformly in [-0.1, 0.1].
        self.v = nn.Parameter(torch.randn(n, k))
        nn.init.uniform_(self.v, -0.1, 0.1)

    def forward(self, x):
        """Map ``x`` of shape (batch_size, n) to probabilities of shape (batch_size, 1)."""
        first_order = self.linear(x)
        # Pairwise term via the identity
        #   sum_{i<j} <v_i, v_j> x_i x_j = 0.5 * sum_f [(x v)_f^2 - (x^2 v^2)_f]
        square_of_sum = x.mm(self.v).pow(2)
        sum_of_square = x.pow(2).mm(self.v.pow(2))
        second_order = 0.5 * (square_of_sum - sum_of_square).sum(dim=1, keepdim=True)
        return torch.sigmoid(first_order + second_order)

In [24]:
class DNN_layer(nn.Module):
    """Fully connected ReLU network ending in a single sigmoid unit."""

    def __init__(self, hidden_units, input_dim, dropout=0.):
        """
        hidden_units: layer widths; each consecutive pair defines one Linear
            layer, so hidden_units[0] is the width of the network input
        input_dim: accepted for interface compatibility; not used here
        dropout: probability of the Dropout applied before the final layer
        """
        super().__init__()
        layer_dims = zip(hidden_units[:-1], hidden_units[1:])
        self.dnn_network = nn.ModuleList(
            [nn.Linear(in_dim, out_dim) for in_dim, out_dim in layer_dims]
        )
        self.dropout = nn.Dropout(dropout)
        self.final_linear = nn.Linear(hidden_units[-1], 1)

    def forward(self, x):
        """Run ``x`` through every hidden layer, then dropout, then the sigmoid head."""
        hidden = x
        for layer in self.dnn_network:
            hidden = F.relu(layer(hidden))
        # Dropout is applied once, on the last hidden activation only.
        logit = self.final_linear(self.dropout(hidden))
        return torch.sigmoid(logit)

In [52]:
# Build the FM model: one input per encoded feature column and
# 8-dimensional latent vectors.
n, k = X_train.shape[1], 8
fm_model = FMLayer(n=n, k=k)
fm_model

FMLayer(
  (linear): Linear(in_features=13104, out_features=1, bias=True)
)

In [53]:
# Smoke-test the FM model: run a single training batch through it and
# print the resulting probabilities.
for fea, label in train_loader:
    fm_out = fm_model(fea)
    print(fm_out)
    break

tensor([[0.5297],
        [0.4689],
        [0.4627],
        [0.4739],
        [0.4466],
        [0.5589],
        [0.4697],
        [0.5083],
        [0.5216],
        [0.4904],
        [0.4805],
        [0.4588],
        [0.5141],
        [0.4549],
        [0.4475],
        [0.5383],
        [0.6233],
        [0.5263],
        [0.5061],
        [0.5257],
        [0.4598],
        [0.4690],
        [0.4353],
        [0.5182],
        [0.4684],
        [0.4338],
        [0.4765],
        [0.4684],
        [0.4989],
        [0.5701],
        [0.4763],
        [0.4953]], grad_fn=<SigmoidBackward>)


## 训练和评估模型

In [54]:
epochs = 10
log_step_freq = 10


def auc(y_pred, y_true):
    """Return the ROC AUC of predicted scores against binary labels.

    Both arguments are tensors. They are detached from autograd and
    flattened to 1-D before being handed to ``roc_auc_score``, so (B, 1)
    and (B,) inputs are both accepted.
    """
    pred = y_pred.detach().cpu().numpy().ravel()
    y = y_true.detach().cpu().numpy().ravel()
    return roc_auc_score(y, pred)


loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(params=fm_model.parameters(), lr=0.001)
metric_func = auc
metric_name = 'auc'

print('fm start_training.........')

for epoch in range(1, epochs+1):

    # Training phase; the running sums are reset at the start of every epoch.
    fm_model.train()
    fm_loss_sum = 0.0
    fm_metric_sum = 0.0
    fm_metric_steps = 0  # number of batches on which the metric was defined

    for step, (features, labels) in enumerate(train_loader, 1):
        # Reset gradients.
        optimizer.zero_grad()

        # Forward pass. The model emits (batch, 1) while the labels are
        # (batch,); BCELoss requires identical shapes, so drop the last dim.
        predictions = fm_model(features).squeeze(-1)
        loss = loss_func(predictions, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Batch-level bookkeeping.
        fm_loss_sum += loss.item()
        try:
            metric = float(metric_func(predictions, labels))
        except ValueError:
            # AUC is undefined for a batch containing a single class.
            metric = float('nan')
        # Skip undefined values instead of reusing the previous batch's
        # metric (or hitting a NameError when the very first batch fails).
        if not np.isnan(metric):
            fm_metric_sum += metric
            fm_metric_steps += 1

        if step % log_step_freq == 0:
            fm_metric_avg = fm_metric_sum / fm_metric_steps if fm_metric_steps else float('nan')
            print(("[step=%d] loss: %.3f, " + metric_name + ": %.3f") % (step, fm_loss_sum/step, fm_metric_avg))

print('fm model Finished Training')

start_training.........
[step=10] loss: 0.670, auc: 0.508
[step=20] loss: 0.644, auc: 0.552
[step=30] loss: 0.625, auc: 0.569
[step=10] loss: 0.533, auc: 0.852
[step=20] loss: 0.514, auc: 0.865
[step=30] loss: 0.500, auc: 0.879
[step=10] loss: 0.439, auc: 0.972
[step=20] loss: 0.434, auc: 0.964
[step=30] loss: 0.434, auc: 0.948
[step=10] loss: 0.397, auc: 0.975
[step=20] loss: 0.386, auc: 0.973
[step=30] loss: 0.374, auc: 0.976
[step=10] loss: 0.326, auc: 0.990
[step=20] loss: 0.325, auc: 0.992
[step=30] loss: 0.317, auc: 0.989
[step=10] loss: 0.278, auc: 0.995
[step=20] loss: 0.265, auc: 0.995
[step=30] loss: 0.261, auc: 0.996
[step=10] loss: 0.207, auc: 1.000
[step=20] loss: 0.213, auc: 0.999
[step=30] loss: 0.210, auc: 0.998
[step=10] loss: 0.176, auc: 0.999
[step=20] loss: 0.169, auc: 0.999
[step=30] loss: 0.167, auc: 1.000
[step=10] loss: 0.141, auc: 1.000
[step=20] loss: 0.140, auc: 1.000
[step=30] loss: 0.134, auc: 1.000
[step=10] loss: 0.114, auc: 1.000
[step=20] loss: 0.109, a

In [55]:
# Initialise the DNN's "embedding" from the FM: instead of copying weights,
# transform the raw inputs with the learned interaction matrix v, so that each
# feature x_i is replaced by the k-vector x_i * v_i and the result is flattened.
#
# The transform runs under no_grad so the new tensors are not attached to the
# FM's autograd graph; no NumPy round-trip is needed to cut that link.
with torch.no_grad():
    new_X_train = torch.unsqueeze(torch.tensor(X_train).float(), dim=2)  # (N, n, 1)
    v = fm_model.v  # (n, k)
    new_X_train1 = torch.mul(new_X_train, v).reshape(-1, v.shape[0]*v.shape[1])  # (N, n*k)

# NumPy view of the same data (shares memory with new_X_train1); kept because
# later cells read its shape.
new_X_trains = new_X_train1.numpy()
new_train_dataset = TensorDataset(new_X_train1, torch.tensor(y_train.values).float())
new_train_loader = DataLoader(new_train_dataset, shuffle=True, batch_size=32)

# Peek at the first batch.
for b in new_train_loader:
    print(b[0].shape, b[1])
    break

torch.Size([32, 104832]) tensor([0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1.])


In [89]:
# Build the DNN: the first layer consumes the flattened FM-transformed
# features, followed by hidden layers of width 128, 64, 32 and 16.
input_dim = new_X_trains.shape[1]
print(input_dim)
hidden_units = [input_dim] + [128, 64, 32, 16]
dnn_model = DNN_layer(hidden_units=hidden_units, input_dim=input_dim)
dnn_model

104832


DNN_layer(
  (dnn_network): ModuleList(
    (0): Linear(in_features=104832, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=64, bias=True)
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): Linear(in_features=32, out_features=16, bias=True)
  )
  (dropout): Dropout(p=0.0)
  (final_linear): Linear(in_features=16, out_features=1, bias=True)
)

In [90]:
# Smoke-test the DNN: run a single transformed batch through it and
# print the resulting probabilities.
for fea, label in new_train_loader:
    dnn_out = dnn_model(fea)
    print(dnn_out)
    break

tensor([[0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432],
        [0.4432]], grad_fn=<SigmoidBackward>)


In [91]:
# Train the DNN on the FM-transformed features.
epochs = 10
log_step_freq = 10

loss_func1 = nn.BCELoss()
optimizer1 = torch.optim.Adam(params=dnn_model.parameters(), lr=0.0001)
metric_func = auc
metric_name = 'auc'

print('start_training.........')

for epoch in range(1, epochs+1):

    # Training phase; the running sums are reset at the start of every epoch.
    dnn_model.train()
    dnn_loss_sum = 0.0
    dnn_metric_sum = 0.0
    dnn_metric_steps = 0  # number of batches on which the metric was defined

    for step, (features, labels) in enumerate(new_train_loader, 1):
        # Reset gradients.
        optimizer1.zero_grad()

        # Forward pass. The model emits (batch, 1) while the labels are
        # (batch,); BCELoss requires identical shapes, so drop the last dim.
        predictions = dnn_model(features).squeeze(-1)
        loss = loss_func1(predictions, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer1.step()

        # Batch-level bookkeeping.
        dnn_loss_sum += loss.item()
        try:
            metric = float(metric_func(predictions, labels))
        except ValueError:
            # AUC is undefined for a batch containing a single class.
            metric = float('nan')
        # Skip undefined values instead of reusing the previous batch's
        # metric (or hitting a NameError when the very first batch fails).
        if not np.isnan(metric):
            dnn_metric_sum += metric
            dnn_metric_steps += 1

        if step % log_step_freq == 0:
            dnn_metric_avg = dnn_metric_sum / dnn_metric_steps if dnn_metric_steps else float('nan')
            print(("[step=%d] loss: %.3f, " + metric_name + ": %.3f") % (step, dnn_loss_sum/step, dnn_metric_avg))

print('dnn model Finished Training')

start_training.........
[step=10] loss: 0.643, auc: 0.592
[step=20] loss: 0.639, auc: 0.595
[step=30] loss: 0.633, auc: 0.597
[step=10] loss: 0.631, auc: 0.720
[step=20] loss: 0.630, auc: 0.753
[step=30] loss: 0.629, auc: 0.742
[step=10] loss: 0.628, auc: 0.794
[step=20] loss: 0.622, auc: 0.778
[step=30] loss: 0.620, auc: 0.779
[step=10] loss: 0.615, auc: 0.789
[step=20] loss: 0.608, auc: 0.823
[step=30] loss: 0.602, auc: 0.809
[step=10] loss: 0.594, auc: 0.851
[step=20] loss: 0.580, auc: 0.836
[step=30] loss: 0.579, auc: 0.828
[step=10] loss: 0.534, auc: 0.832
[step=20] loss: 0.543, auc: 0.852
[step=30] loss: 0.548, auc: 0.856
[step=10] loss: 0.528, auc: 0.897
[step=20] loss: 0.507, auc: 0.880
[step=30] loss: 0.502, auc: 0.874
[step=10] loss: 0.490, auc: 0.893
[step=20] loss: 0.483, auc: 0.889
[step=30] loss: 0.480, auc: 0.894
[step=10] loss: 0.455, auc: 0.947
[step=20] loss: 0.456, auc: 0.910
[step=30] loss: 0.447, auc: 0.928
[step=10] loss: 0.403, auc: 0.952
[step=20] loss: 0.415, a

## 用测试集评估

In [92]:
test_y = y_test.values

# Inference only: switch both models to eval mode and disable autograd so no
# graph is built over the whole test set.
fm_model.eval()
dnn_model.eval()

with torch.no_grad():
    # FM: probabilities, then hard 0/1 predictions at a 0.5 threshold.
    y_pred_fm_probs = fm_model(torch.tensor(X_test).float())
    y_pred_fm = (y_pred_fm_probs > 0.5).float()

    # FNN: apply the same FM-based transform that was used for the DNN's
    # training inputs, then score with the DNN.
    new_X_test = torch.unsqueeze(torch.tensor(X_test).float(), dim=2)
    v = fm_model.v
    new_X_test1 = torch.mul(new_X_test, v).reshape(-1, v.shape[0]*v.shape[1])
    y_pred_fnn_probs = dnn_model(new_X_test1.float())
    y_pred_fnn = (y_pred_fnn_probs > 0.5).float()

# The models emit (N, 1); flatten to 1-D to match test_y.
test_auc_fm = roc_auc_score(test_y, y_pred_fm_probs.numpy().ravel())
test_acc_fm = accuracy_score(test_y, y_pred_fm.numpy().ravel())
print('test_auc_fm:%.3f test_acc_fm:%.3f'%(test_auc_fm, test_acc_fm))

test_auc_fnn = roc_auc_score(test_y, y_pred_fnn_probs.numpy().ravel())
test_acc_fnn = accuracy_score(test_y, y_pred_fnn.numpy().ravel())
print('test_auc_fnn:%.3f, test_acc_fnn:%.3f'%(test_auc_fnn, test_acc_fnn))

test_auc_fm:0.630 test_acc_fm:0.795
test_auc_fnn:0.643, test_acc_fnn:0.802
