In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer
from IPython.display import clear_output

import torch
from transformers import BertTokenizer
from IPython.display import clear_output

PRETRAINED_MODEL_NAME = "hfl/chinese-bert-wwm"  # 指定繁簡中文 BERT-BASE 預訓練模型
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

clear_output()
print("PyTorch 版本：", torch.__version__)
vocab = tokenizer.vocab
print("字典大小：", len(vocab))

PyTorch 版本： 1.5.0
字典大小： 21128


In [2]:
"""
實作一個可以用來讀取訓練 / 測試集的 Dataset，這是你需要徹底了解的部分。
此 Dataset 每次將 tsv 裡的一筆成對句子轉換成 BERT 相容的格式，並回傳 3 個 tensors：
- tokens_tensor：包含 [CLS]
- segments_tensor：可以用來識別兩個句子界限的 binary tensor
- label_tensor：將分類標籤轉換成類別索引的 tensor, 如果是測試集則回傳 None
"""
from torch.utils.data import Dataset
 

class NLPDataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer,maxLength, path):
        assert mode in ["train", "test", "all data"]  # 一般訓練你會需要 dev set
        self.mode = mode
        # 大數據你會需要用 iterator=True
        self.df = pd.read_csv(path, sep="\t").fillna("")
        self.len = len(self.df)
        self.maxLength = maxLength
        self.label_map = {'postive': 0, 'negtive': 1}
        self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        news_ID, text, label = self.df.iloc[idx, :].values
        label_tensor = torch.tensor(label)
        news_ID_tensor = torch.tensor(news_ID)   
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ["[CLS]"]
        tokens = self.tokenizer.tokenize(text)
        lens = self.maxLength-len(tokens)-1
        for I in range(lens):
            #padding 文章長度不足的部分
            tokens += ["[PAD]"]
        word_pieces += tokens
        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        segments_tensor = torch.tensor([1] * len(word_pieces), 
                                        dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor, news_ID_tensor)
    
    def __len__(self):
        return self.len
    

In [4]:
maxLength = 512

pathTrain = "D:/contest/E_SUN_Bank_NLP/bertNER-master/AML data/total/train_all_data_20200730.tsv"
pathTest = "D:/contest/E_SUN_Bank_NLP/bertNER-master/AML data/total/test_all_data_20200730.tsv"
pathAll = "D:/contest/E_SUN_Bank_NLP/bertNER-master/AML data/total/all_data_20200730.tsv"

trainset = NLPDataset("train", tokenizer=tokenizer, maxLength = maxLength, path = pathTrain)
testset = NLPDataset("test", tokenizer=tokenizer, maxLength = maxLength, path = pathTest)
allset = NLPDataset("all data", tokenizer=tokenizer, maxLength = maxLength, path = pathAll)

trainData = pd.read_csv(pathTrain, sep="\t").fillna("")
testData = pd.read_csv(pathTest, sep="\t").fillna("")
allDAta = pd.read_csv(pathAll, sep="\t").fillna("")

In [5]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
        
    if samples[0][3] is not None:
        news_ID_ids = torch.stack([s[3] for s in samples])
    else:
        news_ID_ids = None    
    # zero pad 到同一序列長度

    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)

    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids, news_ID_ids


# 初始化一個每次回傳 5 個訓練樣本的 DataLoader
# 利用 `collate_fn` 將 list of samples 合併成一個 mini-batch 是關鍵
BATCH_SIZE = 5
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

testloader = DataLoader(testset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

all_loader = DataLoader(allset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [6]:

from torch.autograd import Variable
def get_predictions(model, dataloader, dataName, size, compute_acc=False):
    predictions = None
    y_pred = []
    y_true = []
    pbar = pkbar.Pbar(name= dataName + ' predict~!!', target=size)
  
    with torch.no_grad():
        # 遍巡整個資料集
        i=0
        for data in dataloader:
            pbar.update(i)
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors, yTrue = data[:4]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            
            _, pred = torch.max(logits.data, 1)
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
#                 y_true += labels.size(0)
                if (y_true == []):
                    y_true = labels
                else:
                    y_true = torch.cat((y_true,labels), dim=0)
                if (y_pred == []):
                    y_pred = pred
                else:
                    y_pred = torch.cat((y_pred,pred), dim=0)
#                 correct += (pred == labels).sum().item()
#                 y_pred += pred 
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
            i+=1
        print("\n")
    
    if compute_acc:
        tp = (y_true * y_pred).sum().to(torch.float32)
        tn = ((1 - y_true) * (1 - y_pred)).sum().to(torch.float32)
        fp = ((1 - y_true) * y_pred).sum().to(torch.float32)
        fn = (y_true * (1 - y_pred)).sum().to(torch.float32)

        epsilon = 1e-7

        precision = tp / (tp + fp + epsilon)
        recall = tp / (tp + fn + epsilon)

        f1 = 2* (precision*recall) / (precision + recall + epsilon)  
        return predictions, f1
    return predictions

In [7]:
from transformers import BertForSequenceClassification
torch.cuda.empty_cache()
PRETRAINED_MODEL_NAME = "hfl/chinese-bert-wwm"
NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

# high-level 顯示此模型裡的 modules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=2, bias=True)


In [8]:
PATH = "model/model_512_20200722.pkl"
model = torch.load(PATH)

In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

device: cuda:0


In [10]:
import pkbar

startCount = 0
count = 0
bestTestAcc = 0
preTestAcc = 0
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=5e-7)

EPOCHS = 5000  # 幸運數字
pbar = pkbar.Pbar(name='Train~!!', target=len(trainData)/BATCH_SIZE)
for epoch in range(EPOCHS):
    
    running_loss = 0.0
    print('Epoch: %d/%d' % (epoch + 1, EPOCHS))
    i = 0
    for data in trainloader:

        pbar.update(i)
        tokens_tensors, segments_tensors, \
        masks_tensors, labels, news_ID = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()


        # 紀錄當前 batch loss
        running_loss += loss.item()
        i += 1
    print("\n")
        
    # 計算分類準確率
#     _, train_f1 = get_predictions(model, trainloader, "train", len(trainData)/BATCH_SIZE, compute_acc=True)
    _,test_f1 = get_predictions(model, testloader, "test", len(testData)/BATCH_SIZE, compute_acc=True)
#     train_acc = Variable(train_f1).cpu().numpy()
    test_acc = Variable(test_f1).cpu().numpy()
    print('[epoch %d] train loss: %.3f, test f1: %.3f'
          %(epoch + 1, running_loss, test_acc))

    if (test_acc > 0.8 and startCount == 0):
        startCount = 1
        bestTestAcc = test_acc
    if(startCount == 1 and bestTestAcc > test_acc):
        count += 1
    elif(startCount == 1 and bestTestAcc < test_acc):
        PATH = "model_512_20200730.pkl"
        torch.save(model, PATH)
        bestTestAcc = test_acc
        count = 0    
        print("refresh and save best model,train loss: %.3f, best test f1:%.3f"%( running_loss, bestTestAcc))

    elif(startCount == 1 and bestTestAcc < test_acc):
        PATH = "model_512_20200730.pkl"
        torch.save(model, PATH)
        bestTestAcc = test_acc
        count = 0    
        print("refresh and save best model, best test f1:%.3f"%( bestTestAcc))
    if (count > 50):
        print("break")
        break
#     print('[epoch %d] train loss: %.3f, train f1: %.3f, test f1: %.3f'
#           %(epoch + 1, running_loss, train_acc, test_acc))

#     if (test_acc > 0.8 and startCount == 0):
#         startCount = 1
#         bestTestAcc = test_acc
#     if(startCount == 1 and bestTestAcc > test_acc):
#         count += 1
#     elif(startCount == 1 and bestTestAcc < test_acc):
#         PATH = "model_512_20200729.pkl"
#         torch.save(model, PATH)
#         bestTestAcc = test_acc
#         count = 0    
#         print("refresh and save best model,train loss: %.3f,  train f1: %.3f, best test f1:%.3f"%( running_loss, train_acc, bestTestAcc))
#     if (count > 10):
#         print("break")
#         break

#     elif(startCount == 1 and bestTestAcc < test_acc):
#         PATH = "model_512_20200729.pkl"
#         torch.save(model, PATH)
#         bestTestAcc = test_acc
#         count = 0    
#         print("refresh and save best model, train f1: %.3f, best test f1:%.3f"%( train_acc, bestTestAcc))
#     if (count > 10):
#         print("break")
#         break


Train~!!
Epoch: 1/5000

test predict~!!

[epoch 1] train loss: 68.608, test f1: 0.932
Epoch: 2/5000

test predict~!!

[epoch 2] train loss: 42.542, test f1: 0.940
refresh and save best model,train loss: 42.542, best test f1:0.940
Epoch: 3/5000

test predict~!!

[epoch 3] train loss: 35.754, test f1: 0.935
Epoch: 4/5000

test predict~!!

[epoch 4] train loss: 33.251, test f1: 0.939
Epoch: 5/5000

test predict~!!

[epoch 5] train loss: 27.252, test f1: 0.952
refresh and save best model,train loss: 27.252, best test f1:0.952
Epoch: 6/5000

test predict~!!

[epoch 6] train loss: 24.204, test f1: 0.942
Epoch: 7/5000

test predict~!!

[epoch 7] train loss: 19.878, test f1: 0.955
refresh and save best model,train loss: 19.878, best test f1:0.955
Epoch: 8/5000

test predict~!!

[epoch 8] train loss: 14.824, test f1: 0.942
Epoch: 9/5000

test predict~!!

[epoch 9] train loss: 10.995, test f1: 0.950
Epoch: 10/5000

test predict~!!

[epoch 10] train loss: 8.516, test f1: 0.937
Epoch: 11/5000

tes