In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer
import transformers
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from torch.autograd import Variable
import datetime
# Name of the pretrained Chinese (Traditional / Simplified) BERT-BASE model to use.
PRETRAINED_MODEL_NAME = "hfl/chinese-bert-wwm"

# Load the tokenizer that belongs to that checkpoint and keep a handle on its vocabulary.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
vocab = tokenizer.vocab

# Report the PyTorch version and the vocabulary size.
print("PyTorch 版本：", torch.__version__)
print("字典大小：", len(vocab))

PyTorch 版本： 1.5.0
字典大小： 21128


In [2]:
# Print the CUDA, PyTorch and transformers versions this notebook runs against.
for _label, _value in (
    ("torch.version.cuda: ", torch.version.cuda),
    ("torch.__version__: ", torch.__version__),
    ("transformers.__version__: ", transformers.__version__),
):
    print(_label, _value)

torch.version.cuda:  10.2
torch.__version__:  1.5.0
transformers.__version__:  2.11.0


In [3]:

"""
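A Dataset for reading the training / test set; this is the part you need to understand thoroughly.
Each item converts one row of the tsv (news_ID, text, label) into a BERT-compatible format and returns 4 tensors:
- tokens_tensor: token ids of the text, starting with [CLS] and padded with [PAD]
- segments_tensor: segment ids, one per token (all ones in this implementation)
- label_tensor: the row's label as a tensor
- news_ID_tensor: the row's news_ID as a tensor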
"""
class NLPDataset(Dataset):
    """Dataset that converts one TSV row into BERT-ready tensors.

    Every row of the TSV file is unpacked as ``(news_ID, text, label)``, so the
    file must contain exactly those three columns in that order. Missing
    values are replaced by empty strings when the file is read.

    Each item is a 4-tuple ``(tokens_tensor, segments_tensor, label_tensor,
    news_ID_tensor)``:

    - ``tokens_tensor``: ids of ``[CLS]`` followed by the tokenized text,
      truncated / padded with ``[PAD]`` to exactly ``maxLength`` positions.
    - ``segments_tensor``: a ``torch.long`` tensor of ones with the same length.
    - ``label_tensor`` / ``news_ID_tensor``: the row's label and id as tensors.
    """

    def __init__(self, tokenizer, path, maxLength):
        """Read the preprocessed TSV file and store the conversion settings.

        Args:
            tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
            path: location of the tab-separated file to read.
            maxLength: total sequence length of every item, ``[CLS]`` included.
        """
        # For very large files, reading in chunks (iterator=True) may be needed.
        self.df = pd.read_csv(path, sep="\t").fillna("")
        self.len = len(self.df)
        # Not used by this class; kept unchanged in case other code reads it.
        self.label_map = {'postive': 0, 'negtive': 1}
        self.tokenizer = tokenizer
        self.articleLegth = maxLength

    def __getitem__(self, idx):
        """Return the tensors for row ``idx`` (see the class docstring)."""
        news_ID, text, label = self.df.iloc[idx, :].values
        label_tensor = torch.tensor(label)
        news_ID_tensor = torch.tensor(news_ID)

        # One position is reserved for [CLS]. Articles longer than the budget
        # are truncated: previously they were passed through at full length,
        # which exceeds maxLength (and the position limit of a 512-token BERT).
        body_budget = max(self.articleLegth - 1, 0)
        tokens = self.tokenizer.tokenize(text)[:body_budget]
        padding = ["[PAD]"] * (body_budget - len(tokens))
        # NOTE(review): no [SEP] token is appended; left as-is because the saved
        # model was presumably trained on this exact input format -- confirm.
        word_pieces = ["[CLS]"] + tokens + padding

        # Convert the whole token sequence into vocabulary indices.
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        segments_tensor = torch.tensor([1] * len(word_pieces),
                                       dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor, news_ID_tensor)

    def __len__(self):
        """Number of rows read from the TSV file."""
        return self.len

def create_mini_batch(samples):
    """Collate a list of dataset items into one batch.

    Each sample is indexed positionally: ``[0]`` token ids, ``[1]`` segment
    ids, ``[2]`` label (or None) and ``[3]`` news id (or None). Whether labels
    / ids are present is decided from the first sample only.

    Returns:
        ``(tokens_tensors, segments_tensors, masks_tensors, label_ids,
        news_ID_ids)``. The first three are batch-first tensors zero-padded
        to the longest sequence in the batch; ``masks_tensors`` is a
        ``torch.long`` tensor holding 1 wherever the token id is non-zero.
        ``label_ids`` / ``news_ID_ids`` are stacked tensors, or None.
    """
    first = samples[0]
    token_seqs, segment_seqs = [], []
    for sample in samples:
        token_seqs.append(sample[0])
        segment_seqs.append(sample[1])

    # Labels and news ids are optional; stack them only when present.
    label_ids = None if first[2] is None else torch.stack([sample[2] for sample in samples])
    news_ID_ids = None if first[3] is None else torch.stack([sample[3] for sample in samples])

    # Zero-pad every sequence up to the longest one in the batch.
    tokens_tensors = pad_sequence(token_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)

    # Attention mask: 1 on real tokens, 0 on zero padding, so that BERT
    # only attends to the non-padding positions.
    masks_tensors = (tokens_tensors != 0).long()

    return tokens_tensors, segments_tensors, masks_tensors, label_ids, news_ID_ids


def modelPredictions(model, dataloader, compute_acc=True):
    """Run ``model`` over ``dataloader``, print flagged samples and metrics.

    Every batch must provide, in order: token ids, segment ids, attention
    masks, labels and news ids (labels / news ids may be None). A sample is
    "flagged" -- printed and counted in ``wrongTotal`` -- when its prediction
    differs from its label or when it is predicted as class 1. The decoded
    text of flagged samples is produced with the module-level ``tokenizer``.

    The model is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards.

    Args:
        model: a ``torch.nn.Module`` called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask``; element 0 of its output
            is used as the logits.
        dataloader: iterable of batches as described above.
        compute_acc: when True, also print F1 / recall / precision, treating
            label 1 as the positive class.

    Raises:
        ValueError: if ``compute_acc`` is True and a batch carries no labels.
    """
    y_pred_batches = []
    y_true_batches = []
    news_ID_wrong = []
    wrongTotal = 0

    # Follow the model's own device instead of assuming "cuda:0".
    device = next(model.parameters()).device
    # Dropout must be disabled while predicting; remember the mode to restore it.
    was_training = model.training
    model.eval()

    print("start model predit!! ","\n")
    try:
        with torch.no_grad():
            for data in dataloader:
                # Move tensors to the model's device. None entries are kept in
                # place so the positional unpacking below stays aligned.
                data = [t.to(device) if t is not None else None for t in data]
                tokens_tensors, segments_tensors, masks_tensors, yTrue, news_ID = data[:5]
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)

                logits = outputs[0]
                _, pred = torch.max(logits, 1)

                if compute_acc:
                    if yTrue is None:
                        raise ValueError("compute_acc=True requires labels in every batch")
                    y_true_batches.append(yTrue)
                    y_pred_batches.append(pred)

                for I in range(tokens_tensors.shape[0]):
                    true_I = yTrue[I] if yTrue is not None else None
                    id_I = news_ID[I] if news_ID is not None else None
                    mismatch = true_I is not None and true_I != pred[I]
                    if not (mismatch or pred[I] == 1):
                        continue

                    if id_I is not None:
                        news_ID_wrong.append(int(id_I.item()))

                    # Rebuild the text from the ids and cut it at the first padding token.
                    tokens = tokenizer.convert_ids_to_tokens(tokens_tensors[I].tolist())
                    combined_text = "".join(tokens)
                    pad_at = combined_text.find("[PAD]")
                    if pad_at != -1:
                        combined_text = combined_text[:pad_at]

                    print(f""" true ：{true_I} pred  ：{pred[I]} news_ID  ：{id_I}
                    --------------------
                    """)
                    # Skip the leading "[CLS]" (5 characters) when showing the text.
                    print("context:", combined_text[5:])
                    print("\n")
                    wrongTotal += 1

        print("wrongTotal: ",wrongTotal)
        # np.asarray also handles the case where nothing was flagged.
        print("news_ID_wrong: ",np.asarray(news_ID_wrong, dtype=int))
    finally:
        model.train(was_training)

    if compute_acc:
        if y_true_batches:
            y_true = torch.cat(y_true_batches, dim=0)
            y_pred = torch.cat(y_pred_batches, dim=0)
        else:
            # Empty dataloader: all counts are zero and so are the metrics.
            y_true = torch.zeros(0, dtype=torch.long, device=device)
            y_pred = torch.zeros(0, dtype=torch.long, device=device)

        tp = (y_true * y_pred).sum().to(torch.float32)
        tn = ((1 - y_true) * (1 - y_pred)).sum().to(torch.float32)
        fp = ((1 - y_true) * y_pred).sum().to(torch.float32)
        fn = (y_true * (1 - y_pred)).sum().to(torch.float32)

        # Avoids division by zero when a denominator would otherwise be 0.
        epsilon = 1e-7

        precision = tp / (tp + fp + epsilon)
        recall = tp / (tp + fn + epsilon)

        f1 = 2* (precision*recall) / (precision + recall + epsilon)
        print("fi score ：%.3f, recall ：%.3f, precision  ：%.3f"%(f1.item(),recall.item(),precision.item()))
        
def mainDataProcess(BATCH_SIZE, path, maxLength):
    """Build a DataLoader over the TSV file at ``path``.

    The file is wrapped in an ``NLPDataset`` that uses the module-level
    ``tokenizer`` and ``maxLength``; batches of ``BATCH_SIZE`` items are
    collated with ``create_mini_batch``.
    """
    return DataLoader(
        NLPDataset(tokenizer=tokenizer, path=path, maxLength=maxLength),
        batch_size=BATCH_SIZE,
        collate_fn=create_mini_batch,
    )

In [None]:
if __name__ == '__main__':
    # Run the saved model over the full TSV file and time the prediction pass.
    BATCH_SIZE = 1
    path = "data/"
    maxLength = 512
    PATH = "model/model_512_20200730.pkl"
    dataloader = mainDataProcess(BATCH_SIZE, path + 'all_data_20200730.tsv', maxLength)

    # Resolve the device first so the checkpoint can be mapped onto it.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device:", device)

    # NOTE: torch.load unpickles arbitrary objects -- only load trusted files.
    # map_location lets a checkpoint that was saved on a GPU be restored on a
    # CPU-only machine, which the device fallback above explicitly allows for.
    model = torch.load(PATH, map_location=device)
    model = model.to(device)

    startTime = datetime.datetime.now()
    modelPredictions(model, dataloader)
    endTime = datetime.datetime.now()
    print("time:", endTime-startTime)
    