In [1]:
import pandas as pd
import ast
import numpy as np
import re
from zhon.hanzi import stops
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
PRETRAINED_MODEL_NAME = "hfl/rbtl3" # RBTL3
df_train = pd.read_csv('./tbrain/tbrain_train.csv')
df_test = pd.read_csv('./tbrain/tbrain_test.csv')
df_train = df_train.fillna('[\'\]')
df_test = df_test.fillna('[\'\']')
print(df_train.shape)
print(df_test.shape)

(4426, 4)
(491, 4)


In [2]:
def clean_string(content):
#     cc = OpenCC('t2s')
    content = content.replace('\n','。').replace('\t','，').replace('!', '！').replace('?', '？')# erease white space cause English name error
    content = re.sub("[+\.\/_,$%●▼►^*(+\"\']+|[+——~@#￥%……&*（）★]", "",content)
    content = re.sub(r"[%s]+" %stops, "。",content)
#     content = cc.convert(content)
    return content

In [3]:
def find_all(name, content):
    # +1 for [CLS]
    pos_list = [m.start()+1 for m in re.finditer(name, content)]
    count = len(pos_list)
    return pos_list , count

In [4]:
def orgi_2_array(names, contents):
    x = []
    binary_y = []
    BIO_labels = []
    nFound_count = 0
    name_count = 0
    
    for i in range(len(contents)):
        content = contents[i]
        content = clean_string(content)

        # record names
        # name = names[i] # single
        name_list = names[i]
        names_label = ast.literal_eval(name_list) # string to list
        # debug
        

        # init pos label arr
        BIO_label = np.full((512), 2) # initial to all 2 (outside)
        
        # no AML person
        if(name_list == '[]'):
            binary_y.append(0)
            x.append(content)
            BIO_label[0] = 0 # first position 0(begin)
            BIO_labels.append(BIO_label)

        else:
            # initial position list
            start_pos = []
            end_pos = []

            # if (True): # single
            for name in names_label:
              temp, count = find_all(name, content)
              if(temp == []):
  #                 print(name + ' find error in data', i)
                  nFound_count += 1
                  continue
              for j in range(count):
                start_pos.append(temp[j])
                end_pos.append(temp[j] + len(name))

#                  01234
#                B 00100
#                I 00011
#                O 11000
            if (i == 6):
              print(start_pos)
              print(end_pos)
            for j in range(len(start_pos)):
                if(start_pos[j] < 512 and end_pos[j] < 512):
                    BIO_label[start_pos[j]] = 0
                    BIO_label[start_pos[j]+1 : end_pos[j]] = 1
            binary_y.append(1)
            x.append(content)
            BIO_labels.append(BIO_label)
            

    x = np.array(x)
    binary_y = np.array(binary_y)
    BIO_labels = np.array(BIO_labels)
    
    print('nFound: ', nFound_count)
    print('name_count:', name_count)
    print(x.shape)
    print(binary_y.shape)
#     print(begin_pos_labels.shape)
#     print(inside_pos_labels.shape)
#     print(outside_pos_labels.shape)
    print(BIO_labels.shape)
    return x, binary_y, BIO_labels

In [5]:
names =  df_train['name']
contents = np.array(df_train['full_content'].tolist())
train_x, train_binary_y, train_bio_labels = orgi_2_array(names, contents)

[1, 151, 187, 191, 195, 183, 71]
[4, 154, 190, 194, 198, 186, 74]
nFound:  0
name_count: 0
(4426,)
(4426,)
(4426, 512)


In [6]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class TrainDataset(Dataset):
    def __init__(self, input_dict, y , bio_labels):
        self.input_ids = input_dict['input_ids']
        self.token_type_ids = input_dict['token_type_ids']
        self.attention_mask = input_dict['attention_mask']
        self.y = y
        self.bio_labels = bio_labels
        
    def __getitem__(self,idx):
        inputid = self.input_ids[idx]
        tokentype = self.token_type_ids[idx]
        attentionmask = self.attention_mask[idx]
        bio_label = self.bio_labels[idx]
        y = self.y[idx]
        return inputid , tokentype , attentionmask, y , bio_label
    
    def __len__(self):
        return len(self.input_ids)
    
class TestDataset(Dataset):
    def __init__(self, input_dict):
        self.input_ids = input_dict['input_ids']
        self.token_type_ids = input_dict['token_type_ids']
        self.attention_mask = input_dict['attention_mask']
        
    def __getitem__(self,idx):
        inputid = self.input_ids[idx]
        tokentype = self.token_type_ids[idx]
        attentionmask = self.attention_mask[idx]
        return inputid , tokentype , attentionmask, 
    
    def __len__(self):
        return len(self.input_ids)

In [7]:
from transformers import XLNetTokenizer

PRETRAINED_MODEL_NAME = './chinese_xlnet_mid_pytorch/'


tokenizer = XLNetTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
# 把input轉換成bert格式
train_input_dict = tokenizer.batch_encode_plus(train_x, 
                                         add_special_tokens=True,
                                         max_length=512,
                                         return_special_tokens_mask=True,
                                         pad_to_max_length=True,
                                         return_tensors='pt',
                                         truncation=True)

In [8]:
""" model budling """
from transformers import BertModel, XLNetModel
import torch
import torch.nn as nn
from transformers.modeling_utils import SequenceSummary

class AMLPredictModel(nn.Module):
    def __init__(self,config):
        super(AMLPredictModel, self).__init__()
        self.bert = XLNetModel.from_pretrained(PRETRAINED_MODEL_NAME)
        self.bert.output_hidden_states = True
        self.classifier = nn.Sequential(
                        nn.Linear(config.hidden_size, 2),
        ) # binary classification
        self.BIO_classifier = nn.Sequential(
                        nn.Linear(config.hidden_size, 3),
        ) # BIO tagging
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(-1)
        
        self.sequence_summary = SequenceSummary(config)
        

        
    def forward(self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
#         position_ids=None,
#         head_mask=None,
#         inputs_embeds=None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
#             position_ids=position_ids,
#             head_mask=head_mask,
#             inputs_embeds=inputs_embeds
        )
        

       
        have_AML = outputs[0] # pooled cls (cls token through 1 linear and tanh)
        have_AML = self.sequence_summary(have_AML)
        have_AML = self.classifier(have_AML)
        
        BIO = self.BIO_classifier(outputs[0]) # 512*HIDDENSIZE word vectors
        BIO = self.softmax(BIO)
        
#         flag = 1
        # debug
#         if (flag):
#             flag = 0
#             print("forward output")
#             print(BIO)
#             print(BIO_out)
#             print(arg)
#             print("---")
        
        outputs = (have_AML, BIO) + outputs[2:]
        return outputs

In [9]:

def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    predictions_withoutmax = None
    binary_correct = 0
    total = 0
    bio_correct = 0
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0] # haveAML(binary classification)
            after_softmax = nn.functional.softmax(logits.data, dim=1)
            _, binary_pred = torch.max(after_softmax, 1)

            temp = outputs[1]
            bio_preds = torch.empty(temp.shape[0], 3, 512)
            
            for i in range(temp.shape[0]):  # run batchsize times
                arg = temp[i].argmax(1) # 3*512 into class label
                bio_preds[i] = arg

            bio_preds = np.array(bio_preds)

            # debug
            print("get pred")
            print("b_pred ", binary_pred)
            # print(binary_pred.shape)
            # print("-----")
            # print("b_label ", data[3])
            # print(data[3].shape)
            print("BIO_labels ", data[4])
            print(data[4].shape)
            # print("---")
            print("BIO_pred ",bio_preds)
            # print(bio_preds.shape)
            # break
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                binary_labels = data[3]
                total += binary_labels.size(0)
                binary_correct += (binary_pred == binary_labels).sum().item()
                bio_labels = data[4]
                bio_correct += (bio_preds == bio_labels).sum().item()
                # print(binary_correct)
                # break

                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = binary_pred
            else:
                predictions = torch.cat((predictions, binary_pred))
                
            if predictions_withoutmax is None:
                predictions_withoutmax = after_softmax
            else:
                predictions_withoutmax = torch.cat((predictions_withoutmax,after_softmax))
    
    if compute_acc:
        binary_acc = binary_correct / total
        bio_acc = bio_correct / total
        return predictions, binary_acc, bio_acc
    return predictions_withoutmax

In [10]:
""" model setting (training)"""
from transformers import AdamW , XLNetConfig


config = XLNetConfig.from_pretrained(PRETRAINED_MODEL_NAME, output_hidden_states=True)
BATCH_SIZE = 2
trainSet = TrainDataset(train_input_dict, train_binary_y, train_bio_labels)
trainLoader = DataLoader(trainSet, batch_size=BATCH_SIZE)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = AMLPredictModel(config)
optimizer = AdamW(model.parameters(), lr=1e-5) # AdamW = BertAdam
binary_loss_fct = nn.CrossEntropyLoss()
weight = torch.FloatTensor([500,450,1]).cuda()
BIO_loss_fct = nn.CrossEntropyLoss(weight=weight)

# high-level 顯示此模型裡的 modules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
#             print(_)
    else:
        print("{:15} {}".format(name, module))

device: cuda:0

name            module
----------------------
bert:word_embedding
bert:layer
bert:dropout
classifier      Sequential(
  (0): Linear(in_features=768, out_features=2, bias=True)
)
BIO_classifier  Sequential(
  (0): Linear(in_features=768, out_features=3, bias=True)
)
sigmoid         Sigmoid()
softmax         Softmax(dim=-1)
sequence_summary SequenceSummary(
  (summary): Linear(in_features=768, out_features=768, bias=True)
  (first_dropout): Identity()
  (last_dropout): Dropout(p=0.1, inplace=False)
)


In [11]:
""" training """
model = model.to(device)
model.train() ##########################

EPOCHS = 10
step = 0
for epoch in range(5, EPOCHS):
    running_loss = 0.0
    binary_running_loss = 0.0
    BIO_running_loss = 0.0
    for data in trainLoader:
    # data = testSet[21] # test model
    # if(True):
        
        tokens_tensors, segments_tensors, masks_tensors, \
        labels, BIO_label = [t.to(device) for t in data]

      # tokens_tensors, segments_tensors, masks_tensors, labels, BIO_label = data
      # tokens_tensors, segments_tensors, masks_tensors = data
      # tokens_tensors = tokens_tensors.reshape((1,512)).to(device)
      # segments_tensors = segments_tensors.reshape((1,512)).to(device)
      # masks_tensors = masks_tensors.reshape((1,512)).to(device)
      # labels = torch.tensor(labels).reshape((1)).to(device)
      # BIO_label = torch.tensor(BIO_label).reshape((1,512)).to(device)

      # 將參數梯度歸零
        optimizer.zero_grad()
      
      # forward pass
        outputs = model(input_ids=tokens_tensors, 
                      token_type_ids=segments_tensors, 
                      attention_mask=masks_tensors)

        BIO_pred = outputs[1]
        BIO_pred = torch.transpose(BIO_pred, 1, 2)
      
      # debug
      # print("epoch output")
      # BIO_label[0][0] = 500
      # BIO_label = BIO_label.squeeze()
      # BIO_pred = BIO_pred.squeeze()
      # print(BIO_label)
      # print(BIO_label.shape)
      # print(BIO_pred)
      # print(BIO_pred.shape)
      # print(outputs[0].shape)
      # print(labels.shape)
      # print(BIO_pred[0][0])
      # print(BIO_pred[0][1])
      # print(BIO_pred[0][2])
      # break


        binary_loss = binary_loss_fct(outputs[0], labels)
      # print(BIO_pred.shape)
      # print(BIO_label.shape)
        BIO_loss = BIO_loss_fct(BIO_pred, BIO_label)
      # print(binary_loss, BIO_loss)
        loss = binary_loss + BIO_loss
      # print(loss)
      # break
      
      # backward
      # loss.backward()
      # optimizer.step()

      # 紀錄當前 batch loss
        running_loss += loss.item()
        binary_running_loss += binary_loss.item()
        BIO_running_loss += BIO_loss.item()
      # if (step % 10 == 0):
        # print('step %d total_loss: %.3f binary_loss: %.3f BIO_loss: %.3f' %
        #     (step, running_loss, binary_running_loss, BIO_running_loss))
        step += 1
        
    CHECKPOINT_NAME = './model/XLNet_bio_EPOCHES_' + str(epoch) + '.pkl'
    torch.save(model, CHECKPOINT_NAME)
        
    # 計算分類準確率
    # _, binary_acc, bio_acc = get_predictions(model, trainLoader, compute_acc=True)

    print('[epoch %d] loss: %.3f, binary_loss: %.3f, bio_loss: %.3f' %
          (epoch + 1, running_loss, binary_running_loss, BIO_running_loss))

  "type " + obj.__name__ + ". It won't be checked "


[epoch 6] loss: 3850.026, binary_loss: 1171.979, bio_loss: 2678.046
[epoch 7] loss: 3846.388, binary_loss: 1169.730, bio_loss: 2676.658
[epoch 8] loss: 3837.620, binary_loss: 1160.967, bio_loss: 2676.653


KeyboardInterrupt: 

In [12]:
temp = df_test['name'].tolist()
ans = []
for i in range(len(temp)):
    t = ast.literal_eval(temp[i])
    if (len(t) == 0):
        t.append('')
    ans.append(t)
ans

[[''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 ['蔡開宇', '王宇正', '李訓成'],
 [''],
 ['張永泉', '郭明賓'],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 ['李瑞廷'],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 ['陳學敏', '牟孝儀'],
 ['許祈文'],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 ['黃世陽', '黃顯雄'],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 [''],
 ['黃淑頻', '呂建安'],
 [''],
 [''],
 ['章民強', '章啟光', '章啟明'],
 ['張建生', '林宏彬', '張宜豐', '陳正達', '黃志豪'],
 [''],
 [''],
 [''],
 [''],
 ['傅春生'],
 [''],
 ['莊錫根'],
 [''],
 [''],
 [''],
 [''],
 ['朱國榮', '劉慶珠']

In [13]:
names =  df_test['name']
contents = np.array(df_test['full_content'].tolist())
test_x, test_binary_y, test_bio_labels = orgi_2_array(names, contents)

nFound:  0
name_count: 0
(491,)
(491,)
(491, 512)


In [14]:
test_input_dict = tokenizer.batch_encode_plus(test_x, 
                                         add_special_tokens=True,
                                         max_length=512,
                                         return_special_tokens_mask=True,
                                         pad_to_max_length=True,
                                         return_tensors='pt',
                                         truncation=True)

In [15]:
def bio_2_string(tokens_tensors, have_AML, BIO_tagging, ckip_result):
    result = []
    if (have_AML.item() == 0):
        result.append('')
    else:
        for j in range(1, 512):
            if (BIO_tagging[j] == 0):
                start = j
                end = j + 1
                while (end < 512 and BIO_tagging[end] == 1):
                    end += 1
                if (end > start + 1):
                    s = tokenizer.decode(token_ids = tokens_tensors[start : end], skip_special_tokens = True)
                    s = s.replace(' ', '')
                    for k in range(len(ckip_result)):
                        found = s.find(ckip_result[k])
                        if (found != 1):
                            result.append(ckip_result[k])
    if (len(result) == 0):
        result.append('')
    return result

In [16]:
def get_predictions(model, testLoader, BATCH_SIZE):
  result = []
  total_count = 0 # 第n筆data
  with torch.no_grad():
    for data in testLoader:
      # 將所有 tensors 移到 GPU 上
      if next(model.parameters()).is_cuda:
        data = [t.to("cuda:0") for t in data if t is not None]
      
      # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
      # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
      tokens_tensors, segments_tensors, masks_tensors = data[:3]
      outputs = model(input_ids=tokens_tensors, 
                  token_type_ids=segments_tensors, 
                  attention_mask=masks_tensors)
      
      # print(tokens_tensors, tokens_tensors.shape)
      # print(outputs[0], outputs[0].shape)
      # print(outputs[1], outputs[1].shape)
      
      count = min(outputs[0].shape[0], BATCH_SIZE)
      for i in range(count):  # run batchsize times
        have_AML = outputs[0][i].argmax()
        BIO_pred = outputs[1][i].argmax(1) # 3*512 into class label
        text_token = tokens_tensors[i]
        ckip_names = df_test.loc[total_count, 'ckip_names']
        ckip_names_list = ast.literal_eval(ckip_names) # string to list
        r = bio_2_string(text_token, have_AML, BIO_pred, ckip_names_list)
        # print(r)
        result.append(r)
        total_count += 1
      
        # print(text_token, text_token.shape)
        # print(have_AML, have_AML.shape)
        # print(BIO_pred, BIO_pred.shape)
        # print("recover", tokenizer.decode(token_ids = tokens_tensors[0][1:5], skip_special_tokens = True))
      # break
    # print(result)
  return result

In [19]:
"""testing"""
import torch
from transformers import BertConfig , XLNetConfig

PRETRAINED_MODEL_NAME = './chinese_xlnet_mid_pytorch/'
config = XLNetConfig.from_pretrained(PRETRAINED_MODEL_NAME, output_hidden_states=True)
model = AMLPredictModel(config)
model = torch.load('./model/XLNet_bio_EPOCHES_.pkl')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

BATCH_SIZE = 4
testSet = TestDataset(test_input_dict)
testLoader = DataLoader(testSet, batch_size=BATCH_SIZE)


predictions = get_predictions(model, testLoader, BATCH_SIZE)

# pred = predictions.cpu().data.numpy()
# pred = np.argmax(pred, axis=1)
# accuracy = (pred == test_binary_y).mean()
# print('Your test accuracy is %.6f' % (accuracy * 100))


KeyboardInterrupt: 

In [None]:
def eval(pred, ans):
    if bool(pred) is not bool(ans):
        return 0
    elif not pred and not ans:
        return 1
    else:
        pred = set(pred)
        ans = set(ans)
        interaction_len = len(pred & ans)
        if interaction_len == 0:
            return 0

        pred_len = len(pred)
        ans_len = len(ans)
        return 2 / (pred_len / interaction_len + ans_len / interaction_len)


def eval_all(pred_list, ans_list):
    assert len(pred_list) == len(ans_list)
    return sum(eval(p, a) for p, a in zip(pred_list, ans_list)) / len(pred_list)

In [None]:
eval_all(predictions, ans)