In [1]:
import torch
import random
import pandas as pd
import numpy as np
import torch.nn as nn
from transformers import AdamW, BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from IPython.display import clear_output

In [2]:
tags_vals = ['OTHER', 'TRANSFER']
tag2idx = {t: i for i, t in enumerate(tags_vals)}
tag2idx

{'OTHER': 0, 'TRANSFER': 1}

In [3]:
df = pd.read_csv("dataset_classification.csv")
df.columns = ['text', 'label']
df["label"] = df["label"].apply(lambda x: tag2idx[x])
df

Unnamed: 0,text,label
0,幫忙從台幣帳戶轉3560塊到父親的帳戶,1
1,幫轉8028到我的父親帳戶從台幣帳戶,1
2,我要轉5890從我薪轉戶到父親,1
3,要轉去我的父親戶頭8711塊從我台幣戶,1
4,請幫忙轉去我的父親從我的外幣戶696塊錢,1
...,...,...
108740,人們願意與他做生意有時商業事務通過電話即可辦理,0
108741,經過十幾年的努力他已成為世界最大的私人集裝箱船船主,0
108742,妻賢子孝家庭幸福,0
108743,希臘人將瓦西里斯與奧納西斯比較時總不忘補充一句他和奧納西斯不同他沒有改組家庭,0


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce RTX 3060'

In [5]:
model = BertForSequenceClassification.from_pretrained("bert-base-chinese")
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# tokenizer = BertTokenizer("bert-base-chinese-vocab.txt")
clear_output()

In [6]:
MAX_LEN = 0
for sent in df['text'].values.tolist():
    if(len(sent) > MAX_LEN):
        MAX_LEN = len(sent)
    
print("最長輸入長度:", MAX_LEN)

最長輸入長度: 70


In [7]:
batch_size = 32

In [8]:
class TransferDataset(Dataset):
    def __init__(self, df):
        self.dataset = df

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset.loc[idx, "text"]
        label = self.dataset.loc[idx, "label"]
        sample = {"text": text, "label": label}
        return sample

In [9]:
# df = df.sample(frac=1).reset_index(drop=True)
# df_train = df.head(int(0.8*len(df)))
# df_valid = df.tail(int(0.2*len(df)))
# len(df_train), len(df_valid)

In [10]:
trainset = TransferDataset(df)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)
validset = TransferDataset(df)
validloader = DataLoader(validset, batch_size=batch_size, shuffle=False, num_workers=2)

In [11]:
def convert_text_to_ids(tokenizer, text, max_len=70):
    if isinstance(text, str):
        tokenized_text = tokenizer.encode_plus(text, max_length=max_len, add_special_tokens=True)
        input_ids = tokenized_text["input_ids"]
        token_type_ids = tokenized_text["token_type_ids"]
    elif isinstance(text, list):
        input_ids = []
        token_type_ids = []
        for t in text:
            tokenized_text = tokenizer.encode_plus(t, max_length=max_len, add_special_tokens=True)
            input_ids.append(tokenized_text["input_ids"])
            token_type_ids.append(tokenized_text["token_type_ids"])
    else:
        print("Unexpected input")
    return input_ids, token_type_ids

def seq_padding(tokenizer, X):
    pad_id = tokenizer.convert_tokens_to_ids("[PAD]")
    if len(X) <= 1:
        return torch.tensor(X)
    L = [len(x) for x in X]
    ML = max(L)
    X = torch.Tensor([x + [pad_id] * (ML - len(x)) if len(x) < ML else x for x in X])
    return X

In [12]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            'weight_decay': 1e-2
        }, {
            'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }
]

#optimizer = AdamW(model.parameters(), lr=learning_rate)
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
criterion = nn.CrossEntropyLoss()
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [13]:
def train(model, iterator, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    epoch_acc = 0

    for i, batch in enumerate(iterator):
        text = batch["text"]
        label = batch["label"]
        
        input_ids, token_type_ids = convert_text_to_ids(tokenizer, text, MAX_LEN)
        input_ids = seq_padding(tokenizer, input_ids)
        token_type_ids = seq_padding(tokenizer, token_type_ids)
        
        label = label.unsqueeze(1)  # (batch_size, 1)
        input_ids, token_type_ids, label = input_ids.long(), token_type_ids.long(), label.long()
        
        optimizer.zero_grad()
        input_ids, token_type_ids, label = input_ids.to(device), token_type_ids.to(device), label.to(device)
        output = model(input_ids=input_ids, token_type_ids=token_type_ids, labels=label)
    
        y_pred_prob = output[1]
        y_pred_label = y_pred_prob.argmax(dim=1)
        
        # 计算loss, 这个 loss 和 output[0] 是一样的
        loss = criterion(y_pred_prob.view(-1, 2), label.view(-1))
        #loss = output[0]
        acc = ((y_pred_label == label.view(-1)).sum()).item()
    
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc
        
        if i % 200 == 0:
            print("current loss:", epoch_loss / (i+1), "\t", "current acc:", epoch_acc / ((i+1)*len(label)))
    return epoch_loss / len(iterator), epoch_acc / len(iterator.dataset.dataset)

In [14]:
def evaluate(model, iterator, criterion, device):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0

    with torch.no_grad():
        for _, batch in enumerate(iterator):
            text = batch["text"]
            label = batch["label"]
            
            input_ids, token_type_ids = convert_text_to_ids(tokenizer, text, MAX_LEN)
            input_ids = seq_padding(tokenizer, input_ids)
            token_type_ids = seq_padding(tokenizer, token_type_ids)

            label = label.unsqueeze(1)
            input_ids, token_type_ids, label = input_ids.long(), token_type_ids.long(), label.long()
            input_ids, token_type_ids, label = input_ids.to(device), token_type_ids.to(device), label.to(device)

            output = model(input_ids=input_ids, token_type_ids=token_type_ids, labels=label)
            y_pred_label = output[1].argmax(dim=1)
            
            loss = output[0]
            acc = ((y_pred_label == label.view(-1)).sum()).item()
            
            epoch_loss += loss.item()
            epoch_acc += acc
    return epoch_loss / len(iterator), epoch_acc / len(iterator.dataset.dataset)

In [15]:
epochs = 1
for i in range(epochs):
    train_loss, train_acc = train(model, trainloader, optimizer, criterion, device)
    print("train loss: ", train_loss, "\t", "train acc:", train_acc)
    
    valid_loss, valid_acc = evaluate(model, validloader, criterion, device)
    print("valid loss: ", valid_loss, "\t", "valid acc:", valid_acc)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


current loss: 0.7969380617141724 	 current acc: 0.3125
current loss: 0.08219996878740596 	 current acc: 0.974502487562189
current loss: 0.043424009123382776 	 current acc: 0.9865960099750624
current loss: 0.029627097342682446 	 current acc: 0.9908485856905158
current loss: 0.022333447182353076 	 current acc: 0.9931335830212235
current loss: 0.017918927688291158 	 current acc: 0.9945054945054945
current loss: 0.01575971455946231 	 current acc: 0.9952643630308077
current loss: 0.01353629006124896 	 current acc: 0.9959403997144897
current loss: 0.011861007136140347 	 current acc: 0.996447532792005
current loss: 0.010554910104166355 	 current acc: 0.9968420322043309
current loss: 0.00951490300060217 	 current acc: 0.9971576711644178
current loss: 0.0086634630347926 	 current acc: 0.9974159472966834
current loss: 0.007946293594290848 	 current acc: 0.997631195335277
current loss: 0.007338750511073343 	 current acc: 0.9978133410226836
current loss: 0.006817544473651112 	 current acc: 0.99796

In [16]:
torch.save(model, "model_classification_gpu_epoch_"+str(epochs)+"_batch_"+str(batch_size))
torch.save(model.module,"model_classification_gpu_epoch_"+str(epochs)+"_batch_"+str(batch_size)+".pkl")

In [17]:
model = torch.load("model_classification_gpu_epoch_1_batch_64")

In [18]:
def predict_classification(text):
    input_ids, token_type_ids = convert_text_to_ids(tokenizer, text)
    input_ids = seq_padding(tokenizer, input_ids)
    token_type_ids = seq_padding(tokenizer, token_type_ids)

    input_ids, token_type_ids = input_ids.long(), token_type_ids.long()
    input_ids, token_type_ids = input_ids.to(device), token_type_ids.to(device)

    model.eval()
    with torch.no_grad():
        output = model(input_ids=input_ids, token_type_ids=token_type_ids)

    logits = output.logits.detach().cpu().numpy()
    return(np.argmax(logits, axis=1)[0])

In [19]:
text = ['我想轉給父親的戶頭5063元從薪轉']
result = predict_classification(text)
print(result)

1
