In [1]:
import torch
import torch.nn.functional as F
import pandas as pd
from transformers import XLNetTokenizer
# from IPython.display import clear_output
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


In [5]:
PRETRAINED_MODEL_NAME = "bert-base-chinese"  # 指定繁簡中文 BERT-BASE 預訓練模型
tokenizer = XLNetTokenizer.from_pretrained('./chinese_xlnet_mid_pytorch/')
# clear_output()
print("PyTorch 版本：", torch.__version__)

PyTorch 版本： 1.4.0


In [3]:
column_names = ['type','title','text']
df_unlabel = pd.read_csv('./udn_for_mct.tsv',sep='\t',names=column_names)
df_all = pd.read_csv('./all_after_mapping.tsv',sep='\t',names=column_names)
li = [df_unlabel,df_all]
df_combine = pd.concat(li)

In [4]:
texts = df_all['text'].tolist()
y = df_all['type'].values

In [5]:
y.shape

(35546,)

In [6]:
input_dict = tokenizer.batch_encode_plus(texts, 
                                         add_special_tokens=True,
                                         max_length=512,
                                         return_special_tokens_masks=True,
                                         pad_to_max_length=True,
                                         return_tensors='pt')

In [6]:
class XLNetDataset(Dataset):
    def __init__(self, input_dict,y):
        self.input_ids = input_dict['input_ids']
        self.token_type_ids = input_dict['token_type_ids']
        self.attention_mask = input_dict['attention_mask']
        self.y = y
    def __getitem__(self,idx):
        inputid = self.input_ids[idx]
        tokentype = self.token_type_ids[idx]
        attentionmask = self.attention_mask[idx]
        y = self.y[idx]
        return inputid , tokentype , attentionmask , y
    
    def __len__(self):
        return len(self.input_ids)
class TestDataset(Dataset):
    def __init__(self, input_dict):
        self.input_ids = input_dict['input_ids']
        self.token_type_ids = input_dict['token_type_ids']
        self.attention_mask = input_dict['attention_mask']
       
    def __getitem__(self,idx):
        inputid = self.input_ids[idx]
        tokentype = self.token_type_ids[idx]
        attentionmask = self.attention_mask[idx]
        return inputid , tokentype , attentionmask 
    
    def __len__(self):
        return len(self.input_ids)

        

In [8]:
BATCH_SIZE = 2
trainset = XLNetDataset(input_dict,y)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE)

In [9]:
from transformers import XLNetForSequenceClassification

NUM_LABELS = 7

model  = XLNetForSequenceClassification.from_pretrained('./chinese_xlnet_mid_pytorch/',num_labels=NUM_LABELS)

In [7]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    predictions_withoutmax = None
    correct = 0
    total = 0
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            after_softmax = F.softmax(logits.data, dim=1)
            _, pred = torch.max(after_softmax, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
                
            if predictions_withoutmax is None:
                predictions_withoutmax = after_softmax
            else:
                predictions_withoutmax = torch.cat((predictions_withoutmax,after_softmax))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions_withoutmax
    
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# print("device:", device)
# _, acc = get_predictions(model, trainloader, compute_acc=True)
# print("classification acc:", acc)

In [11]:
from torch.autograd import Variable

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

EPOCHS = 2
for epoch in range(EPOCHS):
    step = 0
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)
        # 將參數梯度歸零
        optimizer.zero_grad()

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()

        # 紀錄當前 batch loss
        running_loss += loss.item()
    torch.save(model, 'model_xlnet.pkl')
        
    # 計算分類準確率
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

device: cuda:0




[epoch 1] loss: 5887.066, acc: 0.896
[epoch 2] loss: 3457.409, acc: 0.923
[epoch 3] loss: 2189.507, acc: 0.913


In [3]:
model = torch.load('model_xlnet.pkl')
model.eval()

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e

In [4]:
column_names = ['type','title','text']
dftrain = pd.read_csv('./data_after_sep/train.tsv',sep='\t',names=column_names)
dftest = pd.read_csv('./data_after_sep/test.tsv',sep='\t',names=column_names)
dfdev = pd.read_csv('./data_after_sep/dev.tsv',sep='\t',names=column_names)
testans = dftest['type'].values
testans = np.array(testans)

In [8]:
column_names = ['type','title','text']
dftest = pd.read_csv('./data_after_sep/test.tsv',sep='\t',names=column_names)
texts = dftest['text'].tolist()
input_dict = tokenizer.batch_encode_plus(texts, 
                                         add_special_tokens=True,
                                         max_length=510,
                                         return_special_tokens_masks=True,
                                         pad_to_max_length=True,
                                         return_tensors='pt')


BATCH_SIZE = 32
testset = TestDataset(input_dict)
testloader = DataLoader(testset, batch_size=BATCH_SIZE)
# 用分類模型預測測試集
predictions = get_predictions(model, testloader)




In [9]:
pred = predictions.cpu().data.numpy()
pred = np.argmax(pred, axis=1)
accuracy = (pred == testans).mean()
print('Your test accuracy is %.6f' % (accuracy * 100))

Your test accuracy is 86.440000
