In [2]:
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

In [3]:
def sparseFeature(feat, feat_onehot_dim, embed_dim):
    return {'feat':feat, 'feat_onehot_dim':feat_onehot_dim, 'embed_dim':embed_dim}
def denseFeature(feat):
    return {'feat':feat}
def create_criteo_dataset(file_path, embed_dim=8, test_size=0.2):
    data = pd.read_csv(file_path)
    
    dense_features = ['I' + str(i) for i in range(1, 14)]
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    
    #缺失值填充
    data[dense_features] = data[dense_features].fillna(0)
    data[sparse_features] = data[sparse_features].fillna('-1')
    
    #归一化
    data[dense_features] = MinMaxScaler().fit_transform(data[dense_features])
    #LabelEncoding编码
    for col in sparse_features:
        data[col] = LabelEncoder().fit_transform(data[col]).astype(int)
    
    feature_columns = [[denseFeature(feat) for feat in dense_features]] + \
    [[sparseFeature(feat, data[feat].nunique(), embed_dim) for feat in sparse_features]]
    
    X = data.drop(['label'], axis=1).values
    y = data['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    return feature_columns, (X_train, y_train), (X_test, y_test)

In [6]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
batch_size = 10
num_workers = 4
lr = 1e-4
epochs = 20

In [23]:
set(y_train)

{0, 1}

In [31]:
feature_columns, (X_train, y_train), (X_test, y_test) = create_criteo_dataset('train.txt')
cols = ['col' + str(i) for i in range(X_train.shape[1])]
cols.append('label')

y_train = y_train.values.reshape(-1,1)
y_test = y_test.values.reshape(-1,1)
trains = np.concatenate((X_train, y_train), axis=1)
train_df = pd.DataFrame(trains, columns=cols)

tests = np.concatenate((X_test, y_test), axis=1)
test_df = pd.DataFrame(tests, columns=cols)

class MyDataset(Dataset):
    def __init__(self, df):
        self.df = df
        cols = df.columns.tolist()
        cols.remove('label')
        self.X = df.loc[:, cols].values
        self.y = df.loc[:, 'label'].values
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        data_x = self.X[idx]
        data_y = self.y[idx]
        return data_x, data_y

train_data = MyDataset(train_df)
test_data = MyDataset(test_df)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)

iter_x, iter_y = next(iter(train_loader))
print(iter_y)
print(iter_x.shape)

tensor([0., 0., 0., 0., 1., 0., 1., 0., 1., 0.], dtype=torch.float64)
torch.Size([10, 39])


In [33]:
print(feature_columns)

[[{'feat': 'I1'}, {'feat': 'I2'}, {'feat': 'I3'}, {'feat': 'I4'}, {'feat': 'I5'}, {'feat': 'I6'}, {'feat': 'I7'}, {'feat': 'I8'}, {'feat': 'I9'}, {'feat': 'I10'}, {'feat': 'I11'}, {'feat': 'I12'}, {'feat': 'I13'}], [{'feat': 'C1', 'feat_onehot_dim': 79, 'embed_dim': 8}, {'feat': 'C2', 'feat_onehot_dim': 252, 'embed_dim': 8}, {'feat': 'C3', 'feat_onehot_dim': 1293, 'embed_dim': 8}, {'feat': 'C4', 'feat_onehot_dim': 1043, 'embed_dim': 8}, {'feat': 'C5', 'feat_onehot_dim': 30, 'embed_dim': 8}, {'feat': 'C6', 'feat_onehot_dim': 7, 'embed_dim': 8}, {'feat': 'C7', 'feat_onehot_dim': 1164, 'embed_dim': 8}, {'feat': 'C8', 'feat_onehot_dim': 39, 'embed_dim': 8}, {'feat': 'C9', 'feat_onehot_dim': 2, 'embed_dim': 8}, {'feat': 'C10', 'feat_onehot_dim': 908, 'embed_dim': 8}, {'feat': 'C11', 'feat_onehot_dim': 926, 'embed_dim': 8}, {'feat': 'C12', 'feat_onehot_dim': 1239, 'embed_dim': 8}, {'feat': 'C13', 'feat_onehot_dim': 824, 'embed_dim': 8}, {'feat': 'C14', 'feat_onehot_dim': 20, 'embed_dim': 8},

In [79]:
class FFMLayer(nn.Module):
    def __init__(self, feature_columns, k, w_reg=1e-4, v_reg=1e-4):
        super(FFMLayer, self).__init__()
        self.dense_feature_columns, self.sparse_feature_columns = feature_columns
        self.k = k  #隐向量v的维度
        self.w_reg = w_reg  #一阶权重w的正则化项
        self.v_reg = v_reg  #二阶组合特征权重的正则化项
        
        #真实的特征维度是：类别型变量做了one hot之后的维度加连续型变量的维度
        self.feature_num = sum([feat['feat_onehot_dim'] for feat in self.sparse_feature_columns]) + len(self.dense_feature_columns)
        #域个数是原始特征的个数，一个特征属于一个域
        self.field_num = len(self.dense_feature_columns) + len(self.sparse_feature_columns)
        
        #一阶线性部分
        self.linear = nn.Linear(self.feature_num, 1)
        self.v = nn.Parameter(torch.randn(self.feature_num, self.field_num, k)) #二阶特征组合的交互矩阵
    def ffm_layer(self, inputs):
        #x的维度是(batch_size, 26):离散特征个数加连续特征个数，离散特征还没有做Onehot
        dense_input = inputs[:, :13]
        sparse_inputs = inputs[:, 13:]
        
        #做One hot编码 将连续特征和one hot后的特征拼接成为每个样本新的特征
        x = dense_input.to(dtype=torch.float32)
        for i in range(sparse_inputs.shape[1]):
            one_hot_value = F.one_hot(sparse_inputs[:,i].to(dtype=torch.int64), num_classes=int(self.sparse_feature_columns[i]['feat_onehot_dim']))
            x = torch.cat([x, one_hot_value.to(dtype=torch.float32)], 1)
        linear_part = self.linear(x)
        inter_part = 0
        #每维特征先跟自己的[field_num, k]相乘得到Vij*X  [None, 2291] x [2291, 39, 8] = [None, 39, 8]
        field_f = torch.tensordot(x, self.v, dims=1)
        #域之间两两相乘
        for i in range(self.field_num):
            for j in range(i+1, self.field_num):
                inter_part += torch.sum(torch.mul(field_f[:, i], field_f[:, j]), 1, keepdims=True)
        output = linear_part + inter_part
        output = output.reshape(-1)
        output = torch.sigmoid(output)
        return output.to(dtype=torch.float64)
    def forward(self, x):
        return self.ffm_layer(x)
    def fit(self, data, optimizer, epochs=100):
        #训练模型并输出测试集每一轮的loss
        criterion = F.binary_cross_entropy
        for epoch in range(epochs):
            for t, (batch_x, batch_y) in enumerate(data):
                batch_x = batch_x.to(device)
                batch_y = batch_y.to(device)
                total = self.forward(batch_x)
                loss = criterion(total, batch_y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            loader_test = DataLoader(test_data, batch_size=10, shuffle=False)
            
            r = self.test(loader_test)
            print('Epoch %d, loss=%.4f' % (epoch, r))
    def test(self, data):
        #测试集测试
        criterion = F.binary_cross_entropy
        all_loss = 0
        gt_labels = []
        pred_labels = []
        i = 0
        with torch.no_grad():
            for t, (batch_x, batch_y) in enumerate(data):
                batch_x = batch_x.to(device)
                batch_y = batch_y.to(device)
                pred = self.forward(batch_x)
                gt_label = batch_y.cpu().data.numpy()
                pred_proba = pred.cpu().data.numpy()
                gt_labels.append(gt_label)
                pred_labels.append(pred_proba)
                loss = criterion(pred, batch_y)
                all_loss += loss.item()
                i += 1
        gt_labels, pred_labels = np.concatenate(gt_labels), np.concatenate(pred_labels)
        pred_labels = pred_labels.reshape(len(pred_labels),)
        auc = roc_auc_score(gt_labels, pred_labels)
        print('auc:', auc, 'gt_lables:', gt_labels.shape, 'pred_labels:', pred_labels.shape)
        return all_loss / i

In [80]:
k = 8
ffm = FFMLayer(feature_columns, k)
ffm = ffm.to(device)
optimizer = optim.Adam(ffm.parameters(), lr=lr, weight_decay=0.0)
ffm.fit(train_loader, optimizer, epochs=20)

auc: 0.4523306544202067 gt_lables: (400,) pred_labels: (400,)
Epoch 0, loss=48.2720
auc: 0.45086107921928814 gt_lables: (400,) pred_labels: (400,)
Epoch 1, loss=48.5259
auc: 0.4523995407577497 gt_lables: (400,) pred_labels: (400,)
Epoch 2, loss=47.9334
auc: 0.4529735935706085 gt_lables: (400,) pred_labels: (400,)
Epoch 3, loss=47.6678
auc: 0.4529735935706085 gt_lables: (400,) pred_labels: (400,)
Epoch 4, loss=47.6678
auc: 0.4529735935706085 gt_lables: (400,) pred_labels: (400,)
Epoch 5, loss=47.6678
auc: 0.4529735935706085 gt_lables: (400,) pred_labels: (400,)
Epoch 6, loss=47.6678
auc: 0.4529735935706085 gt_lables: (400,) pred_labels: (400,)
Epoch 7, loss=47.6678
auc: 0.4529735935706085 gt_lables: (400,) pred_labels: (400,)
Epoch 8, loss=47.6678
auc: 0.4529735935706085 gt_lables: (400,) pred_labels: (400,)
Epoch 9, loss=47.6678
auc: 0.4529735935706085 gt_lables: (400,) pred_labels: (400,)
Epoch 10, loss=47.6678
auc: 0.4529735935706085 gt_lables: (400,) pred_labels: (400,)
Epoch 11, lo

In [82]:
ffm.test(test_loader)

auc: 0.4529735935706085 gt_lables: (400,) pred_labels: (400,)


47.667790233367285