In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## 构造数据

In [23]:
data = pd.read_csv('train.txt')
columns = data.columns.tolist()
categorical_features = [col for col in columns if 'C' in col]
numeric_features = [col for col in columns if col not in categorical_features]
numeric_features.remove('label')
data[categorical_features] = data[categorical_features].fillna('-1')
data[numeric_features] = data[numeric_features].fillna(0)

data = pd.get_dummies(data, columns=categorical_features)
scaler = MinMaxScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features])

data_X = data.drop('label', axis=1)
data_y = data['label'].astype(np.float64)

trains, tests, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2)
trains['label'] = y_train
tests['label'] = y_test
print(trains.shape, y_train.shape, tests.shape, y_test.shape)
print(trains.head())

(1599, 13105) (1599,) (400, 13105) (400,)
            I1        I2        I3        I4        I5        I6        I7  \
64    0.010526  0.000381  0.000591  0.091954  0.000007  0.001725  0.000603   
917   0.000000  0.000381  0.000236  0.057471  0.004469  0.013583  0.016888   
290   0.000000  0.004704  0.000000  0.103448  0.039948  0.000000  0.000000   
1198  0.063158  0.000254  0.001774  0.045977  0.000000  0.000647  0.003619   
1991  0.000000  0.000254  0.000236  0.011494  0.015662  0.018111  0.000603   

            I8        I9   I10  ...  C26_fbe10aa8  C26_fcd456fa  C26_fcd5a3f4  \
64    0.014625  0.001419  0.25  ...             0             0             0   
917   0.034735  0.008338  0.00  ...             0             0             0   
290   0.074954  0.001774  0.00  ...             0             0             0   
1198  0.010969  0.000710  0.25  ...             0             0             0   
1991  0.001828  0.024304  0.00  ...             0             0             0   

  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [24]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
batch_size = 10
num_workers = 4
lr = 1e-4
epochs = 20
class MyDataset(Dataset):
    def __init__(self, df):
        self.df = df
        cols = df.columns.tolist()
        cols.remove('label')
        self.X = df.loc[:, cols].values
        self.y = df.loc[:, 'label'].values
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        data_x = self.X[idx]
        data_y = self.y[idx]
        return data_x, data_y

train_data = MyDataset(trains)
test_data = MyDataset(tests)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)

iter_x, iter_y = next(iter(train_loader))
print(iter_y)
print(iter_x.shape)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=torch.float64)
torch.Size([10, 13104])


## LSPLM模型

In [48]:
class LSPLM(nn.Module):
    def __init__(self, feature_num, m):
        super(LSPLM, self).__init__()
        self.m = m
        self.feature_num = feature_num
        self.softmax = nn.Sequential(nn.Linear(self.feature_num, self.m).double(),
                                    nn.Softmax(dim=1).double())
        self.logistic = nn.Sequential(nn.Linear(self.feature_num, self.m).double(),
                                     nn.Sigmoid())        
    def forward(self, x):
        logistic_out = self.logistic(x)
        softmax_out = self.softmax(x)
        combine_out = logistic_out.mul(softmax_out)
        return combine_out.sum(1)
    def fit(self, data, optimizer, epochs=100):
        #训练模型并输出测试集每一轮的loss
        criterion = nn.BCELoss(reduction='mean')
        for epoch in range(epochs):
            for t, (batch_x, batch_y) in enumerate(data):
                batch_x = batch_x.to(device)
                batch_y = batch_y.to(device)
                total = self.forward(batch_x)
                loss = criterion(total, batch_y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            loader_test = DataLoader(test_data, batch_size=batch_size, shuffle=False)
            r = self.test(loader_test)
            print('Epoch %d, loss=%.4f' % (epoch, r))
    def test(self, data):
        #测试集测试
        criterion = nn.BCELoss(reduction='mean')
        all_loss = 0
        gt_labels = []
        pred_labels = []
        i = 0
        with torch.no_grad():
            for t, (batch_x, batch_y) in enumerate(data):
                batch_x = batch_x.to(device)
                batch_y = batch_y.to(device)
                pred = self.forward(batch_x)
                gt_label = batch_y.cpu().data.numpy()
                pred_proba = pred.cpu().data.numpy()
                gt_labels.append(gt_label)
                pred_labels.append(pred_proba)
                loss = criterion(pred, batch_y)
                all_loss += loss.item()
                i += 1
        gt_labels, pred_labels = np.concatenate(gt_labels), np.concatenate(pred_labels)
        pred_labels = pred_labels.reshape(len(pred_labels),)
        auc = roc_auc_score(gt_labels, pred_labels)
        print('auc:', auc, 'gt_lables:', gt_labels.shape, 'pred_labels:', pred_labels.shape)
        return all_loss / i
    def predict(self, x):
        with torch.no_grad():
            #预测出X的标签
            x = torch.from_numpy(x)
            x = x.to(device)
            out1 = self.forward(x)
            out = out1.cpu().data.numpy()
            out[out >= 0.5] = 1.0
            out[out<0.5] = 0.0
            return out

In [49]:
m = 4
feature_num = data_X.shape[1]
ls_plm = LSPLM(feature_num, m)
ls_plm.to(device)
optimizer = optim.Adam(ls_plm.parameters(), lr=lr, weight_decay=0.0)
ls_plm.fit(train_loader, optimizer, epochs=30)

auc: 0.625 gt_lables: (400,) pred_labels: (400,)
Epoch 0, loss=0.6454
auc: 0.627039627039627 gt_lables: (400,) pred_labels: (400,)
Epoch 1, loss=0.6093
auc: 0.6263111888111889 gt_lables: (400,) pred_labels: (400,)
Epoch 2, loss=0.5815
auc: 0.6255099067599067 gt_lables: (400,) pred_labels: (400,)
Epoch 3, loss=0.5592
auc: 0.6285693473193473 gt_lables: (400,) pred_labels: (400,)
Epoch 4, loss=0.5410
auc: 0.6313738344988343 gt_lables: (400,) pred_labels: (400,)
Epoch 5, loss=0.5283
auc: 0.634833916083916 gt_lables: (400,) pred_labels: (400,)
Epoch 6, loss=0.5201
auc: 0.6400786713286714 gt_lables: (400,) pred_labels: (400,)
Epoch 7, loss=0.5148
auc: 0.6432837995337995 gt_lables: (400,) pred_labels: (400,)
Epoch 8, loss=0.5112
auc: 0.6487470862470863 gt_lables: (400,) pred_labels: (400,)
Epoch 9, loss=0.5085
auc: 0.6532634032634033 gt_lables: (400,) pred_labels: (400,)
Epoch 10, loss=0.5064
auc: 0.6579254079254079 gt_lables: (400,) pred_labels: (400,)
Epoch 11, loss=0.5046
auc: 0.6617497086

In [54]:
#训练好的模型预测测试集
ls_plm.test(test_loader)
test_x = tests.drop('label', axis=1).values
test_pred = ls_plm.predict(test_x)
test_label = tests['label'].values
from sklearn.metrics import accuracy_score
acc = accuracy_score(test_pred, test_label)
print('accuracy:',acc)

auc: 0.6840399184149184 gt_lables: (400,) pred_labels: (400,)
accuracy: 0.7775
