In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import time
import torch
import os
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import warnings
warnings.filterwarnings('ignore')
import random
train_df=pd.read_csv('dataset/track1_round1_train_20210222.csv',header=None)
test_df=pd.read_csv('dataset/track1_round1_testA_20210222.csv',header=None) 
train_df.columns=['report_ID','description','label']
test_df.columns=['report_ID','description']
train_df.drop(['report_ID'],axis=1,inplace=True)
test_df.drop(['report_ID'],axis=1,inplace=True)
new_des=[i.strip('|').strip() for i in train_df['description'].values]
new_label=[i.strip('|').strip() for i in train_df['label'].values]
train_df['description']=new_des
train_df['label']=new_label
new_des=[i.strip('|').strip() for i in test_df['description'].values]
test_df['description']=new_des
test_df['label'] = 0
def seed_torch(seed=1029):
    print("seed:{}".format(seed))
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed) # 为了禁止hash随机化，使得实验可复现
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True



In [2]:
import numpy as np
import pandas as pd
from bert4keras.backend import keras, search_layer, K
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import extend_with_gradient_accumulation
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.tokenizers import Tokenizer
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.callbacks import ReduceLROnPlateau
from keras.layers import *
from keras.optimizers import Adam
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

In [2]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text_lists = self.data['description']
        self.targets = self.data['label']
        self.max_len = max_len

    def __len__(self):
        return self.data.shape[0]
    
    def get_dumm(self,s):
        re=[0]*17
        if s=='':#标签为空 返回17个0
            return re
        else:
            tmp=[int(i) for i in s.split(' ')]
            for i in tmp:
                re[i]=1
        return re
    def __getitem__(self, index):
        
        text_lists = self.text_lists[index]
        inputs = self.tokenizer.encode_plus(
            text_lists,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        targets = self.get_dumm(self.targets[index])
        return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'targets': torch.tensor(targets, dtype=torch.float)
                }

In [3]:
tokenizer = BertTokenizer.from_pretrained('pre/vocab.txt')

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        
        self.config = BertConfig.from_pretrained('pretrain6/config.json', output_hidden_states=True)
        self.l1 = BertModel.from_pretrained('pretrain6/pytorch_model.bin', config=self.config)
        self.bilstm1 = torch.nn.LSTM(1536, 256, 1, bidirectional=True)
        self.l2 = torch.nn.Linear(512, 64)
        self.a1 = torch.nn.ReLU()
        self.l3 = torch.nn.Dropout(0.3)
        self.l4 = torch.nn.Linear(64, 17)
        self.l5 = torch.nn.Linear(1536,128)
        self.l51 = torch.nn.Linear(512,128)
        self.l6 = torch.nn.Linear(128,17)
       ######################################
       # 初始时间步和最终时间步的隐藏状态作为全连接层输入
        self.w_omega = torch.nn.Parameter(torch.Tensor(
            256 * 2, 256 * 2))
        self.u_omega = torch.nn.Parameter(torch.Tensor(256 * 2, 1))
        self.decoder = torch.nn.Linear(2*256, 17)

        torch.nn.init.uniform_(self.w_omega, -0.1, 0.1)
        torch.nn.init.uniform_(self.u_omega, -0.1, 0.1)

        
    def forward(self, ids, mask, token_type_ids):
        sequence_output, pooler_output, hidden_states= self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        #    bs sl hs    #bs hs
        # [bs, sl, 768]  [bs,768]
#=====================================================================
#         bs = len(sequence_output)
#         h12 = hidden_states[-1][:,0].view(1,bs,768)
#         h11 = hidden_states[-2][:,0].view(1,bs,768)
#         concat_hidden = torch.cat((h12,h11),2)   # 1 bs 768
#         x, _ = self.bilstm1(concat_hidden)# in 1536  out 512
#         x = x.permute(1, 0, 2)
#         # x形状是(batch_size, seq_len, 2 * num_hiddens)

#         # Attention过程
#         u = torch.tanh(torch.matmul(x, self.w_omega))
#        # u形状是(batch_size, seq_len, 2 * num_hiddens)
#         att = torch.matmul(u, self.u_omega)
#        # att形状是(batch_size, seq_len, 1)
#         att_score = torch.functional.F.softmax(att, dim=1)
#        # att_score形状仍为(batch_size, seq_len, 1)
#         scored_x = x * att_score
#        # scored_x形状是(batch_size, seq_len, 2 * num_hiddens)
#         # Attention过程结束
        
#         feat = torch.sum(scored_x, dim=1) #加权求和
#        # feat形状是(batch_size, 2 * num_hiddens)
#         outs = self.decoder(feat)
#         return outs
#=================================================================        
        
#=========================V_1=========================================  
        bs = len(sequence_output)
        h12 = hidden_states[-1][:,0].view(1,bs,768)
        h11 = hidden_states[-2][:,0].view(1,bs,768)
        concat_hidden = torch.cat((h12,h11),2)
        x=self.l5(concat_hidden.view(bs,1536))
        x=self.a1(x)
        x=self.l3(x)
        output=self.l6(x)
        return output
#=========================V_2=========================================  
#         bs = len(sequence_output)
#         h12 = hidden_states[-1][:,0].view(1,bs,768)
#         h11 = hidden_states[-2][:,0].view(1,bs,768)
#         concat_hidden = torch.cat((h12,h11),2)
#         x=self.l5(concat_hidden.view(bs,1536))
#         x=self.a1(x)
#         x=self.l51(x)
#         x=self.a1(x)
#         x=torch.nn.Dropout(0.1)(x)
#         output=self.l6(x)
#         return output
# if __name__=="__main__":
#     a=tokenizer("328 538 382 809 623 434 355 382 382 363 145",return_tensors='pt')
#     net=BERTClass()
#     logit=net(a['input_ids'],a['token_type_ids'],a['attention_mask'])
#     print(logit.shape)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [4]:
def evaluate_accuracy(data_iter, net, device=torch.device('cpu')):
    """Evaluate accuracy of a model on the given data set."""
    acc_sum, n = torch.tensor([0], dtype=torch.float32,device=device), 0
    y_pred_, y_true_ = [], []
    for data in data_iter:
        # If device is the GPU, copy the data to the GPU.
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        net.eval()
        y_hat_ = net(ids, mask, token_type_ids)
        with torch.no_grad():
            y_pred_+=y_hat_.sigmoid().detach().cpu().numpy().tolist()
            y_true_+=targets.cpu().numpy().tolist()
    val_auc = metrics.roc_auc_score(y_true_, y_pred_, multi_class='ovo')
    log_loss=1-((metrics.log_loss(y_true_, y_pred_))/(17*len(y_pred_)))
    return val_auc ,log_loss

In [5]:
from tqdm import tqdm_notebook as tqdm
def train(epoch,train_iter, test_iter, criterion, num_epochs, optimizer, device):
    print('training on', device)
    print(len(train_iter))
    net.to(device)
    best_auc = 0
    best_loss=0
   # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)  # 设置学习率下降策略
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=2e-06)  # 余弦退火
    for epoch in range(num_epochs):
        print('当前学习率：{}'.format(optimizer.param_groups[-1]['lr']))
        print("="*100)
        print("Epoch-->{}/{}".format(epoch+1,num_epochs))
        print("="*100)

        train_l_sum = torch.tensor([0.0], dtype=torch.float32, device=device)
        train_acc_sum = torch.tensor([0.0], dtype=torch.float32, device=device)
        n, start = 0, time.time()
        y_pred, y_true = [], []
        
        for data in tqdm(train_iter):
            net.train()
            optimizer.zero_grad()
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            y_hat = net(ids, mask, token_type_ids)
            #model=nn.DataParallel(model,device_ids=[0,1,2]) # multi-GPU
            loss = criterion(y_hat, targets)
            loss.backward()
            optimizer.step()
        val_auc ,log_loss = evaluate_accuracy(test_iter, net, device)
        print('epoch %d,  valid_auc %.9f,  valid_log %.9f, time %.1f sec' % (epoch + 1, val_auc,log_loss,time.time() - start))
        if log_loss > best_loss:
            print('正在保存当前最佳模型')
            best_loss = log_loss
            torch.save(net.state_dict(), 'model3/best.pth')
        scheduler.step()  # 更新学习率

In [6]:
seed_torch(2021)

seed:2021


In [9]:
net = BERTClass()
net.to(device)
# 超参数设置
lr, num_epochs = 1e-5, 60
criterion = torch.nn.BCEWithLogitsLoss()  # 选择损失函数
optimizer = torch.optim.Adam(net.parameters(), lr=lr)  # 选择优化器
# Creating the dataset and dataloader for the neural network
MAX_LEN = 256
train_size = 0.8
train_dataset = train_df.sample(frac=train_size,random_state=7)
valid_dataset = train_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# print("FULL  Dataset: {}".format(train_df.shape))
# print("TRAIN Dataset: {}".format(train_dataset.shape))
# print("VALID Dataset: {}".format(valid_dataset.shape))
# print("TEST  Dataset: {}".format(test_df.shape))

train_set = CustomDataset(train_dataset, tokenizer, 128)
valid_set = CustomDataset(valid_dataset, tokenizer, 128)
test_set = CustomDataset(test_df, tokenizer, 128)

RuntimeError: CUDA error: out of memory

In [None]:
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True}

valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True}

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False}

train_loader = DataLoader(train_set, **train_params)
valid_loader = DataLoader(valid_set, **valid_params)
test_loader = DataLoader(test_set, **test_params)

In [8]:
train(net,train_loader, valid_loader, criterion, num_epochs, optimizer, device)
#0.999954353   0.991

NameError: name 'train_loader' is not defined

In [10]:
test_df['label']=''
test_set = CustomDataset(test_df, tokenizer, 128)
test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False}
test_loader = DataLoader(test_set, **test_params)

In [11]:
def model_predict(net, test_iter):
    # 预测模型
    preds_list = []
    print('loading the best model')
    net.load_state_dict(torch.load('model3/best.pth'))
    net = net.to(device)
    print('begin predict.........')
    with torch.no_grad():
        for data in tqdm(test_iter):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            batch_preds = net(ids, mask, token_type_ids).sigmoid().detach().cpu().numpy()
            for preds in batch_preds:
                preds_list.append(preds) 
    for q,i in enumerate(preds_list):
        i=[str(x) for x in i]
        preds_list[q]=' '.join(i)
    return preds_list
preds_list = model_predict(net, test_loader)

loading the best model
begin predict.........


HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))




In [12]:
test_df=pd.read_csv('dataset/track1_round1_testA_20210222.csv',header=None) 
test_df.columns=['report_ID','description']
submit=test_df.copy()
print("test_df:{}".format(test_df.shape))
new_des=[i.strip('|').strip() for i in test_df['description'].values]
test_df['description']=new_des
sub_id=test_df['report_ID'].values
#
print(sub_id[0])
save_dir='submits/'
if not os.path.exists(save_dir): os.makedirs(save_dir)
str_w=''
current_time=time.localtime(time.time())
with open(save_dir+'submit{}_{}_{}.csv'.format(current_time.tm_year,current_time.tm_hour,current_time.tm_min),'w') as f:
    for i in range(len(sub_id)):
        str_w+=sub_id[i]+','+'|'+preds_list[i]+'\n'
    str_w=str_w.strip('\n')
    f.write(str_w)

test_df:(3000, 2)
0|


In [81]:
def train_model(model,criterion, optimizer, lr_scheduler=None):
    total_iters=len(trainloader)
    print('total_iters:{}'.format(total_iters))
    since = time.time()
    best_loss = 1e7
    best_epoch = 0
    #
    iters = len(trainloader)
    for epoch in range(1,max_epoch+1):
        model.train(True)
        begin_time=time.time()
        print('learning rate:{}'.format(optimizer.param_groups[-1]['lr']))
        print('Fold{}==>Epoch {}/{}'.format(fold+1,epoch, max_epoch))
        print('=' * 90)
        count=0
        train_loss = []
        for i, inputs in enumerate(trainloader):
            count+=1
            ids = inputs['ids'].to(device, dtype = torch.long)
            mask = inputs['mask'].to(device, dtype = torch.long)
            token_type_ids = inputs['token_type_ids'].to(device, dtype = torch.long)
            labels = inputs['targets'].to(device).float()
            #
            out_linear= model(ids, mask, token_type_ids)#前向传播（投料）
#             print(out_linear.size(),labels.size())
            loss = criterion(out_linear, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # 更新cosine学习率
            if lr_scheduler!=None:
                lr_scheduler.step(epoch + count / iters)
            if print_interval>0 and (i % print_interval == 0 or out_linear.size()[0] < train_batch_size):
                spend_time = time.time() - begin_time
                print(
                    ' Fold:{} Epoch:{}({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min:'.format(
                        fold+1,epoch, count, total_iters,
                        loss.item(), optimizer.param_groups[-1]['lr'],
                        spend_time / count * total_iters // 60 - spend_time // 60))
            #
            train_loss.append(loss.item())
        #lr_scheduler.step()
        val_auc,val_loss= val_model(model, criterion)
        print('valLogLoss: {:.4f} valAuc: {:.4f}'.format(val_loss,val_auc))
        model_out_path = model_save_dir+"/"+'fold_'+str(fold+1)+'_'+str(epoch) + '.pth'
        best_model_out_path = model_save_dir+"/"+'fold_'+str(fold+1)+'_best'+'.pth'
        #save the best model
        if val_loss < best_loss:
            best_loss = val_loss
            best_epoch=epoch
            torch.save(model.state_dict(), best_model_out_path)
            print('='*90)
            print("|save best epoch: {} best auc: {} best logloss: {}|".format(best_epoch,val_auc,val_loss))
            print('='*90)
            print('Fold{} Best logloss: {:.3f} Best epoch:{}'.format(fold+1,best_loss,best_epoch))
    time_elapsed = time.time() - since
    #print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    return best_loss

@torch.no_grad()
def val_model(model, criterion):
    dset_sizes=len(val_dataset)
    model.eval()
    running_loss = 0.0
    running_corrects = 0
    cont = 0
    outPre = []
    outLabel = []
    pres_list=[]
    labels_list=[]
    for i, inputs in enumerate(val_loader): 
            ids = inputs['ids'].to(device, dtype = torch.long)
            mask = inputs['mask'].to(device, dtype = torch.long)
            token_type_ids = inputs['token_type_ids'].to(device, dtype = torch.long)
            labels = inputs['targets'].to(device).float()
            outputs = model(ids, mask, token_type_ids)
            pres_list+=outputs.sigmoid().detach().cpu().numpy().tolist()
            labels_list+=labels.detach().cpu().numpy().tolist()
    val_auc = metrics.roc_auc_score(labels_list, pres_list, multi_class='ovo')
    log_loss=1-((metrics.log_loss(labels_list, pres_list))/(17*len(pres_list)))#
    return val_auc,log_loss

In [82]:
train_df=pd.read_csv('dataset/track1_round1_train_20210222.csv',header=None)
test_df=pd.read_csv('dataset/track1_round1_testA_20210222.csv',header=None) 
#
train_df.columns=['report_ID','description','label']
test_df.columns=['report_ID','description']
train_df.drop(['report_ID'],axis=1,inplace=True)
test_df.drop(['report_ID'],axis=1,inplace=True)
print("train_df:{},test_df:{}".format(train_df.shape,test_df.shape))
#
new_des=[i.strip('|').strip() for i in train_df['description'].values]
new_label=[i.strip('|').strip() for i in train_df['label'].values]
train_df['description']=new_des
train_df['label']=new_label
new_des=[i.strip('|').strip() for i in test_df['description'].values]
test_df['description']=new_des
#
#总共1w条训练数据里面包含2622条正常样本，这里的正常样本标签就使用[0,0,0,....,0,0]编码
print('无异常样本:',train_df[train_df['label']==''].shape[0])#2622

# iteration = all_size / batch_size  
# epoch = the number of train all_data  
model_save_dir ='ckpt/'
print_interval=-1
train_batch_size=16
val_batch_size=32
max_epoch=10
device = torch.device('cuda')
criterion = torch.nn.BCEWithLogitsLoss()
if not os.path.exists(model_save_dir): os.makedirs(model_save_dir)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021).split(np.arange(train_df.shape[0]), train_df.label.values)
kfold_best=[]
tokenizer = BertTokenizer.from_pretrained('pretrain/vocab.txt')
for fold, (trn_idx, val_idx) in enumerate(folds):
    #
    print('train fold {}'.format(fold+1))
    model=BERTClass()
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3 ,weight_decay=5e-4)
    #lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.2)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=3, T_mult=2, eta_min=1e-5, last_epoch=-1)

    train_dataset = CustomDataset(train_df,tokenizer,trn_idx,64)
    trainloader = DataLoader(train_dataset,
                            batch_size=train_batch_size,
                            shuffle=True,
                            num_workers=0)
    val_dataset = CustomDataset(train_df,tokenizer,val_idx,64)
    val_loader = DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            shuffle=False,
                            num_workers=4)

    best_loss=train_model(model,criterion, optimizer,lr_scheduler=lr_scheduler)
    kfold_best.append(best_loss)
print("local cv:",kfold_best,np.mean(kfold_best))

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


train_df:(10000, 2),test_df:(3000, 1)
无异常样本: 2622
train fold 1
total_iters:625
learning rate:0.001
Fold1==>Epoch 1/10
valLogLoss: 1.0000 valAuc: 0.5497
|save best epoch: 1 best auc: 0.5497449167251046 best logloss: 0.9999778043731692|
Fold1 Best logloss: 1.000 Best epoch:1
learning rate:0.00025750000000000013
Fold1==>Epoch 2/10
valLogLoss: 1.0000 valAuc: 0.5545
learning rate:0.001
Fold1==>Epoch 3/10
valLogLoss: 1.0000 valAuc: 0.4615
|save best epoch: 3 best auc: 0.4615045282823918 best logloss: 0.9999776275267709|
Fold1 Best logloss: 1.000 Best epoch:3
learning rate:0.0009336825748732972
Fold1==>Epoch 4/10


KeyboardInterrupt: 

In [None]:
def load_model(weight_path):
    print(weight_path)
    model=BERTClass()
    model.load_state_dict(torch.load(weight_path))
    model.to(device)
    model.eval()
    return model
test_dataset = CustomDataset(test_df,tokenizer,val_idx,128)
test_loader = DataLoader(test_dataset,
                    batch_size=val_batch_size,
                    shuffle=False,
                    num_workers=4)
@torch.no_grad()
def model_predict(net, test_iter):
    # 预测模型
    preds_list = []
    print('加载最优模型')
    net.load_state_dict(torch.load('model/best.pth'))
    net = net.to(device)
    print('inference测试集')
    
    for data in tqdm(test_iter):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        batch_preds = net(ids, mask, token_type_ids).sigmoid().detach().cpu().numpy()[0]
        for preds in batch_preds:
            preds_list.append(preds)           
    return preds_list
        for i in range(len(model_list)):
            model=model_list[i]
            outputs=model(text)
            outputs=outputs.sigmoid().detach().cpu().numpy()[0]
            if i==0:
                pres_fold=outputs/len(model_list)
            else:
                pres_fold+=outputs/len(model_list)
        #
        pres_fold=[str(p) for p in pres_fold]
        pres_fold=' '.join(pres_fold)
        pres_all.append(pres_fold)
    return pres_all

if __name__=="__main__":
    device=torch.device('cuda')
    model_list=[]
    for i in range(5):
        model_list.append(load_model('ckpt/fold_'+str(i+1)+'_best.pth'))
    #
    test_df=pd.read_csv('dataset/track1_round1_testA_20210222.csv',header=None) 
    #
    test_df.columns=['report_ID','description']
    submit=test_df.copy()
    print("test_df:{}".format(test_df.shape))
    new_des=[i.strip('|').strip() for i in test_df['description'].values]
    test_df['description']=new_des
    sub_id=test_df['report_ID'].values
    #
    print(sub_id[0])
    save_dir='submits/'
    if not os.path.exists(save_dir): os.makedirs(save_dir)
    pres_all=predict(new_des)
    str_w=''
    current_time=time.localtime(time.time())
    with open(save_dir+'submit_{}_{}.csv'.format(current_time.tm_hour,current_time.tm_min),'w') as f:
        for i in range(len(sub_id)):
            str_w+=sub_id[i]+','+'|'+pres_all[i]+'\n'
        str_w=str_w.strip('\n')
        f.write(str_w