In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

## 构造数据

In [10]:
data = pd.read_csv('train.txt')
columns = data.columns.tolist()
categorical_features = [col for col in columns if 'C' in col]
numeric_features = [col for col in columns if col not in categorical_features]
numeric_features.remove('label')
data[categorical_features] = data[categorical_features].fillna('-1')
data[numeric_features] = data[numeric_features].fillna(0.0)

le = LabelEncoder()
for fea in categorical_features:
    data[fea] = le.fit_transform(data[fea])
scaler = MinMaxScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features])

sparse_map = {}
for fea in categorical_features:
    sparse_map[fea] = data[fea].nunique()

feature_info = [numeric_features, categorical_features, sparse_map]
data_X = data.drop('label', axis=1)
data_y = data['label'].astype(np.float64)

trains, tests, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2)
trains['label'] = y_train
tests['label'] = y_test

print(trains.shape, y_train.shape, tests.shape, y_test.shape)
print(trains.head())

(1599, 40) (1599,) (400, 40) (400,)
            I1        I2        I3        I4        I5        I6        I7  \
1427  0.000000  0.003814  0.000118  0.022989  0.001697  0.012290  0.000603   
1648  0.178947  0.000254  0.001301  0.114943  0.000111  0.004528  0.010253   
307   0.000000  0.000127  0.000236  0.000000  0.040571  0.000000  0.000000   
1979  0.021053  0.002288  0.000118  0.022989  0.000007  0.000431  0.001206   
651   0.000000  0.184973  0.000946  0.034483  0.007000  0.034066  0.001206   

            I8        I9   I10  ...  C18  C19  C20   C21  C22  C23  C24  C25  \
1427  0.018282  0.021111  0.00  ...  309   23    3  1062    0    0  531    4   
1648  0.018282  0.002838  0.25  ...  174   23    2   911    0    3  655    1   
307   0.000000  0.018272  0.00  ...    2   23    3   472    0    0   67   27   
1979  0.001828  0.000355  0.25  ...  331    0    0   883    0    0  647    0   
651   0.023766  0.021820  0.00  ...  404  148    3  1182    0    1  175   18   

      C26  lab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
batch_size = 10
num_workers = 4
lr = 1e-4
epochs = 20
class MyDataset(Dataset):
    def __init__(self, df):
        self.df = df
        cols = df.columns.tolist()
        cols.remove('label')
        self.X = df.loc[:, cols].values
        self.y = df.loc[:, 'label'].values
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        data_x = self.X[idx]
        data_y = self.y[idx]
        return data_x, data_y

train_data = MyDataset(trains)
test_data = MyDataset(tests)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)

iter_x, iter_y = next(iter(train_loader))
print(iter_y)
print(iter_x.shape)

tensor([0., 0., 1., 0., 0., 1., 0., 1., 0., 0.], dtype=torch.float64)
torch.Size([10, 39])


## 搭建模型

### 定义残差块

In [99]:
class Residual_block(nn.Module):
    def __init__(self, hidden_unit, dim_stack):
        super(Residual_block, self).__init__()
        self.linear1 = nn.Linear(dim_stack, hidden_unit)
        self.linear2 = nn.Linear(hidden_unit, dim_stack)
        self.relu = nn.ReLU()
    def forward(self, x):
        orig_x = x.clone()
        x = self.linear1(x)
        x = self.linear2(x)
        output = self.relu(x + orig_x)
        return output

### 定义DeepCrossing模型

In [100]:
class DeepCrossing(nn.Module):
    def __init__(self, feature_info,  hidden_units, dropout=0., embed_dim=10, output_dim=1):
        #feature_info:特征信息(数值特征，类别特征，类别特征embedding映射)
        #hidden_unit 残差网络中的隐藏层神经元数量
        #embed_dim embedding维度
        #dropout:dropout层的失活比例
        super(DeepCrossing, self).__init__()
        self.dense_feas, self.sparse_feas, self.sparse_map = feature_info
        
        #embedding层，这里需要一个列表的形式，因为每个类别特征都需要embedding
        self.embed_layers = nn.ModuleDict({
            'embed_' + str(key):nn.Embedding(num_embeddings=val, embedding_dim=embed_dim) for key, val in self.sparse_map.items()
        })
        
        #统计embedding层的输出维度
        embed_dim_sum = sum([embed_dim] * len(self.sparse_feas))
        
        #stack layers的总维度
        dim_stack = len(self.dense_feas) + embed_dim_sum
        
        #残差层，可能会有多层
        self.res_layers = nn.ModuleList([
            Residual_block(unit, dim_stack) for unit in hidden_units
        ])

        #dropout层
        self.res_dropout = nn.Dropout(dropout)
        
        #线性层
        self.linear = nn.Linear(dim_stack, output_dim)
    def forward(self, x):
        dense_inputs, sparse_inputs = x[:, :13], x[:, 13:]
        sparse_inputs = sparse_inputs.long()  #需要转成长张量，这个是embedding的输入格式要求
        sparse_embeds = [self.embed_layers['embed_'+key](sparse_inputs[:, i]) for key, i in zip(self.sparse_map.keys(), range(sparse_inputs.shape[1]))]
        sparse_embed = torch.cat(sparse_embeds, dim=-1)
        
        stack = torch.cat([sparse_embed, dense_inputs], dim=-1)
        r = stack
        for res in self.res_layers:
            r = res(r)
        r = self.res_dropout(r)
        outputs = F.sigmoid(self.linear(r))
        outputs = outputs.squeeze(-1)
        return outputs

In [94]:
hidden_units = [256, 128, 64, 32]
net = DeepCrossing(feature_info, hidden_units)
net

DeepCrossing(
  (embed_layers): ModuleDict(
    (embed_C1): Embedding(79, 10)
    (embed_C10): Embedding(908, 10)
    (embed_C11): Embedding(926, 10)
    (embed_C12): Embedding(1239, 10)
    (embed_C13): Embedding(824, 10)
    (embed_C14): Embedding(20, 10)
    (embed_C15): Embedding(819, 10)
    (embed_C16): Embedding(1159, 10)
    (embed_C17): Embedding(9, 10)
    (embed_C18): Embedding(534, 10)
    (embed_C19): Embedding(201, 10)
    (embed_C2): Embedding(252, 10)
    (embed_C20): Embedding(4, 10)
    (embed_C21): Embedding(1204, 10)
    (embed_C22): Embedding(7, 10)
    (embed_C23): Embedding(12, 10)
    (embed_C24): Embedding(729, 10)
    (embed_C25): Embedding(33, 10)
    (embed_C26): Embedding(554, 10)
    (embed_C3): Embedding(1293, 10)
    (embed_C4): Embedding(1043, 10)
    (embed_C5): Embedding(30, 10)
    (embed_C6): Embedding(7, 10)
    (embed_C7): Embedding(1164, 10)
    (embed_C8): Embedding(39, 10)
    (embed_C9): Embedding(2, 10)
  )
  (res_layers): ModuleList(
    (0)

## 训练和评估模型

In [108]:
def train(epoch):
    #训练模型并输出测试集每一轮的loss
    criterion = nn.BCELoss(reduction='mean')
    for t, (batch_x, batch_y) in enumerate(train_loader):
        batch_x = batch_x.float().to(device)
        batch_y = batch_y.float().to(device)
        total = net.forward(batch_x)
        loss = criterion(total, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    r = test()
    print('Epoch %d, loss=%.4f' % (epoch, r))
def test():
    #测试集测试
    criterion = nn.BCELoss(reduction='mean')
    all_loss = 0
    gt_labels = []
    pred_labels = []
    i = 0
    with torch.no_grad():
        for t, (batch_x, batch_y) in enumerate(test_loader):
            batch_x = batch_x.float().to(device)
            batch_y = batch_y.float().to(device)
            pred = net.forward(batch_x)
            gt_label = batch_y.cpu().data.numpy()
            pred_proba = pred.cpu().data.numpy()
            gt_labels.append(gt_label)
            pred_labels.append(pred_proba)
            loss = criterion(pred, batch_y)
            all_loss += loss.item()
            i += 1
        gt_labels, pred_labels = np.concatenate(gt_labels), np.concatenate(pred_labels)
        pred_labels = pred_labels.reshape(-1)
        auc = roc_auc_score(gt_labels, pred_labels)
        print('auc:', auc, 'gt_labels:', gt_labels.shape, 'pred_labels:', pred_labels.shape)
    return all_loss / i
def predict(x):
    with torch.no_grad():
        x = torch.from_numpy(x)
        x = x.float().to(device)
        out1 = net.forward(x)
        out = out1.cpu().data.numpy()
        out[out>=0.5] = 1.0
        out[out<0.5] = 0.0
        return out

In [105]:
net.to(device)
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=0.0)
for epoch in range(epochs):
    train(epoch)



auc: 0.6140201451636794 gt_labels: (400,) pred_labels: (400,)
Epoch 0, loss=2.4239




auc: 0.6069100873944601 gt_labels: (400,) pred_labels: (400,)
Epoch 1, loss=2.5303




auc: 0.6168345430306621 gt_labels: (400,) pred_labels: (400,)
Epoch 2, loss=2.4241




auc: 0.6205747296696786 gt_labels: (400,) pred_labels: (400,)
Epoch 3, loss=2.0746




auc: 0.619982224855577 gt_labels: (400,) pred_labels: (400,)
Epoch 4, loss=2.1385




auc: 0.6209820767293734 gt_labels: (400,) pred_labels: (400,)
Epoch 5, loss=2.1779




auc: 0.6213894237890683 gt_labels: (400,) pred_labels: (400,)
Epoch 6, loss=2.2163




auc: 0.6215005184417124 gt_labels: (400,) pred_labels: (400,)
Epoch 7, loss=2.2516




auc: 0.621574581543475 gt_labels: (400,) pred_labels: (400,)
Epoch 8, loss=2.2841




auc: 0.6216486446452377 gt_labels: (400,) pred_labels: (400,)
Epoch 9, loss=2.3173




auc: 0.6217227077470004 gt_labels: (400,) pred_labels: (400,)
Epoch 10, loss=2.3476




auc: 0.6213894237890684 gt_labels: (400,) pred_labels: (400,)
Epoch 11, loss=2.3781




auc: 0.6214264553399497 gt_labels: (400,) pred_labels: (400,)
Epoch 12, loss=2.4066




auc: 0.621352392238187 gt_labels: (400,) pred_labels: (400,)
Epoch 13, loss=2.4351




auc: 0.6213894237890683 gt_labels: (400,) pred_labels: (400,)
Epoch 14, loss=2.4620




auc: 0.621463486890831 gt_labels: (400,) pred_labels: (400,)
Epoch 15, loss=2.4886




auc: 0.6211672344837802 gt_labels: (400,) pred_labels: (400,)
Epoch 16, loss=2.5151




auc: 0.6211302029328988 gt_labels: (400,) pred_labels: (400,)
Epoch 17, loss=2.5408




auc: 0.6212042660346616 gt_labels: (400,) pred_labels: (400,)
Epoch 18, loss=2.5667




auc: 0.621296844911865 gt_labels: (400,) pred_labels: (400,)
Epoch 19, loss=2.5911


In [109]:
#训练好的模型预测测试集
test_x = tests.drop('label', axis=1).values
test_pred = predict(test_x)
test_label = tests['label'].values
acc = accuracy_score(test_pred, test_label)
print('accuracy:',acc)

accuracy: 0.7225


