### Training

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_metric
import torch.nn as nn
import torch
from tqdm.auto import tqdm
import random
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
from commom import load_jsonl, save_jsonl

from transformers import (
    AdamW,
    get_scheduler,
    BertTokenizer,
    AutoTokenizer,
    AutoModelForSequenceClassification
)

In [3]:
class NLIDataset(Dataset):
    def __init__(self, data_list, max_length=512, model_name="bert-base-multilingual-cased"):
        self.d_list = data_list
        self.len = len(self.d_list)
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.label2index = {
            'SPAM': 0,
            'EDM': 1,
            'HAM': 2
        }

    def __getitem__(self, index):
        data = self.d_list[index]
        context = data['context']
        label = data['label']
        
        processed_sample = dict()
        processed_sample['labels'] = torch.tensor(self.label2index[label])
        tokenized_input = self.tokenizer(context,
                                         max_length=self.max_length,
                                         padding='max_length', 
                                         truncation=True,
                                         return_tensors="pt")
        
        input_items = {key: val.squeeze() for key, val in tokenized_input.items()}
        processed_sample.update(input_items)
        return processed_sample

    def __len__(self):
        return self.len

In [4]:
train_list = load_jsonl('datasets/orig_dataset_50000/train.jsonl')
dev_list = load_jsonl('datasets/orig_dataset_50000/dev.jsonl')

Load Jsonl: datasets/orig_dataset_50000/train.jsonl


40000it [00:02, 15298.17it/s]


Load Jsonl: datasets/orig_dataset_50000/dev.jsonl


5000it [00:00, 14593.03it/s]


In [5]:
train_dataset = NLIDataset(train_list)
dev_dataset = NLIDataset(dev_list)

In [6]:
train_list[0]

{'index': '1114',
 'md5sum': '00dcfd29cf81813464e4507a47939c4c',
 'label': 'EDM',
 'context': '親愛的客戶/同業先進/協力廠商\u3000鈞鑑： 本公司為提昇服務品質及工作環境， KEYENCE台北總公司即將於5月24日遷移至新址，電話與傳真不變。 當日相關訂單收受及出貨安排皆不受影響， 如有造成其他不便之處，請您見諒。 2021年5月24日(星期一)起 【台北總公司】 電話：(02)2721-8080 (不變) 傳真：(02)2721-7770 (不變) 新址：104 台北市中山區中山北路二段42號12樓 (舊址：104台北市中山區南京東路三段168號8樓之1) 新竹、台中、高雄服務處地址及電話不變，若有其他問題， 也歡迎您來電或來信諮詢。非常感謝！ 免付費諮詢專線：0800-010-898 【新竹服務處】 電話：(03)668-6270 傳真：(03)668-6737 地址：300 新竹市東區慈雲路118號21樓之5 【台中服務處】 電話：(04) 2251-6602 傳真：(04) 2251-0031 新址：407台中市西屯區市政路402號21樓之1 【高雄服務處】 電話：(07) 333-2829 傳真：(07) 333-2919 新址：806 高雄市前鎮區成功二路25號9樓之2 www.keyence.com.tw KEYENCE TAIWAN Co. LTD. 台灣基恩斯股份有限公司敬上'}

In [9]:
train_dataset[0]

{'labels': tensor(1),
 'input_ids': tensor([  101,  7150,  3910,  5718,  3396,  3993,   120,  2773,  4671,  2431,
          7745,   120,  2679,  2594,  3690,  2890,  7923,  8028, 10078,  4476,
          2452,  2763,  5287,  4181,  4364,  4463,  2625,  2854,  7457,  2730,
          3584,  2259,  5576,  3139, 10064,   148, 11259, 14703, 50655, 43161,
          2757,  2650,  6277,  2452,  2763,  2697,  3452,  4336,   126,  4460,
         10233,  4348,  7778,  5965,  6623,  4333,  3040, 10064,  8299,  7207,
          6631,  2387,  5769,  2080,  7292,  1882,  5639,  4348,  5760,  8160,
          7169,  2928,  4280,  2738,  2730,  2527,  7421,  3378,  4151,  5719,
          2080,  2738,  3756,  8371, 10064,  3241,  4461,  7740,  3975,  2460,
          2196,  2080,  2299,  2120,  6946, 10064,  7238,  3869,  7143,  7242,
          1882, 67267,  3642,   126,  4460, 10233,  4348,   113,  4374,  4470,
          2072,   114,  7533,  1894,  2757,  2650,  6277,  2452,  2763,  1895,
          8299,  

In [10]:
# note: 3 classes
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=3)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.device_count() >1:
    model = nn.DataParallel(model,device_ids=[0])
model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [11]:
train_batch_size=40
learning_rate=2e-5 
train_epochs=5

optimizer = AdamW(model.parameters(), lr=learning_rate)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=train_batch_size)
print(len(train_dataloader))
print(len(dev_dataloader))

1000
125


In [12]:
for batch_index, batch_dict in enumerate(train_dataloader):
    print(batch_dict)
    break

{'labels': tensor([1, 1, 1, 2, 1, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 1,
        0, 1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 0, 0, 1, 0, 1]), 'input_ids': tensor([[  101,  6417,  8458,  ...,  2079,  2928,   102],
        [  101,  5318,  4941,  ...,  2146,  3767,   102],
        [  101, 67267,  3642,  ...,  2756,  2204,   102],
        ...,
        [  101,  3241,  5765,  ...,     0,     0,     0],
        [  101,   516, 35865,  ...,     0,     0,     0],
        [  101,   115,  3241,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [13]:
## 進度條
num_training_steps = train_epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))

## 設定warmup
lr_scheduler = get_scheduler(
  "linear",
  optimizer=optimizer,
  num_warmup_steps=10,
  num_training_steps=num_training_steps
)

## start training
for epoch in range(train_epochs):
    model.train()
    for batch_index, batch_dict in enumerate(train_dataloader):
        
        input_items = {key: val.to(device) for key, val in batch_dict.items()}
#         del input_items['token_type_ids'] ## bart不需要這個
        
        optimizer.zero_grad()
        outputs = model(**input_items)
        
        loss = outputs.loss
        if torch.cuda.device_count() >1: ##多GPU的情況要對loss求平均
            loss = loss.mean()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)
        
        if batch_index % 500 ==0:
            print('epoch: ', epoch, '  loss: ', loss)
            
    model.eval()
    predictions = []
    references = []
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(dev_dataloader):
            input_items = {key: val.to(device) for key, val in batch_dict.items()}
            outputs = model(**input_items)

            predictions += outputs.logits.argmax(dim=-1).tolist()
            references += batch_dict['labels'].tolist()

    accuracy = accuracy_score(references, predictions)
    f1 = f1_score(references, predictions,average='macro')
    print('acc: ', accuracy)
    print('f1: ', f1)
    
    ## save model
    save_path = 'datasets/orig_dataset_50000/epoch_' + str(epoch+1)
    if torch.cuda.device_count() >1:
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(save_path)
    else:
        model.save_pretrained(save_path)

  0%|          | 0/5000 [00:00<?, ?it/s]

epoch:  0   loss:  tensor(1.1109, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.1790, device='cuda:0', grad_fn=<NllLossBackward0>)
acc:  0.9886
f1:  0.9808894377913866
epoch:  1   loss:  tensor(0.1737, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  1   loss:  tensor(0.1687, device='cuda:0', grad_fn=<NllLossBackward0>)
acc:  0.9898
f1:  0.9830665476186331
epoch:  2   loss:  tensor(0.0018, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  2   loss:  tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward0>)
acc:  0.99
f1:  0.9837065244843096
epoch:  3   loss:  tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  3   loss:  tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward0>)
acc:  0.9908
f1:  0.9847784660667024
epoch:  4   loss:  tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  4   loss:  tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward0>)
acc:  0.9914
f1:  0.9861254911032896


# Test

In [14]:
test_list = load_jsonl('datasets/orig_dataset_50000/test.jsonl')
test_dataset = NLIDataset(test_list)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=512)

Load Jsonl: datasets/orig_dataset_50000/test.jsonl



0it [00:00, ?it/s][A
1663it [00:00, 16626.25it/s][A
3326it [00:00, 16404.06it/s][A
5000it [00:00, 16211.46it/s][A


In [15]:
# note: 3 classes
model_path = 'datasets/orig_dataset_50000/epoch_5'
# model_path = 'models/Mail_Classifier/epoch_5'
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [16]:
model.eval()
predictions = []
references = []
num_steps = len(test_dataloader)
progress_bar = tqdm(range(num_steps))

with torch.no_grad():
    for batch_index, batch_dict in enumerate(test_dataloader):
        input_items = {key: val.to(device) for key, val in batch_dict.items()}
        outputs = model(**input_items)

        predictions += outputs.logits.argmax(dim=-1).tolist()
        references += batch_dict['labels'].tolist()
        progress_bar.update(1)

accuracy = accuracy_score(references, predictions)
f1 = f1_score(references, predictions,average='macro')
print('acc: ', accuracy)
print('f1: ', f1)

  0%|          | 0/10 [00:00<?, ?it/s]

acc:  0.989
f1:  0.9804473102600001


In [17]:
# orig_50000
print(model_path)
print(classification_report(references, predictions))

datasets/orig_dataset_50000/epoch_5
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       570
           1       0.99      1.00      0.99      3334
           2       0.99      0.99      0.99      1096

    accuracy                           0.99      5000
   macro avg       0.98      0.98      0.98      5000
weighted avg       0.99      0.99      0.99      5000



## inference

In [1]:
import torch.nn as nn
import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
from commom import load_jsonl, save_jsonl

from transformers import (
    BartTokenizer,
    BartForSequenceClassification,
    BertTokenizer,
    AutoTokenizer,
    AutoModelForSequenceClassification
)

In [2]:
test_list = load_jsonl('datasets/test.jsonl')

Load Jsonl: datasets/test.jsonl


32440it [00:01, 17159.63it/s]


In [10]:
model = AutoModelForSequenceClassification.from_pretrained('models/Mail_Classifier_10/epoch_5', num_labels=3) 
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")  

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.device_count() >1:
    model = nn.DataParallel(model,device_ids=[0,1])
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [11]:
test_num = 10

# subject = test_list[test_num]['subject']
context = test_list[test_num]['context']
label = test_list[test_num]['label']

In [12]:
tokenized_input = tokenizer(context,
                            max_length=512,
                            truncation=True,
                            return_tensors="pt")
class_number = {'SPAM':0, 'EDM':1, 'HAM':2}
model.eval()
with torch.no_grad():
    input_items = {key: val.to(device) for key, val in tokenized_input.items()}
    del input_items['token_type_ids'] ## bart不需要這個
    
    outputs = model(**input_items)
    prediction = outputs.logits.argmax(dim=-1)
    print(type(int(prediction)))
    
    
    # print('主旨: ', subject)
    print('內文: ', context)
    print('label: ', class_number[label], label)
    print('predict: ', int(prediction), list(class_number.keys())[list(class_number.values()).index(int(prediction))])
    
    # if int(prediction) == 0:
    #     print('predict: ham')
    # else:
    #     print('predict: spam')

<class 'int'>
內文:  圖片 顯 現時 按此 下載 文宣 網頁 不想 收到 本行 信用卡 行銷 訊息 按此 登入 網路 銀行 信用卡 專區 進行 行銷 訊息 取消 訂閱 設定 電子 訊息 內容 包括 附件 合作金庫 銀行 股份 有限公司 傳送 電子 訊息 內容 機密性 經由 公司 授權 方可 利用 電子 訊息 指定 收件 任何人 公司 電子 訊息 內容 審閱 傳送 散佈 揭露 重製 指定 收件 通知 並請 刪除 電子 訊息 內容 謝謝您 合作 電子 訊息 內容 變更 網際網路 保證 電子 訊息 內容 完整性 公司 變更 修改 竄改 偽造 電子 訊息 內容 恕 不負 責任 網路 通訊 含有 電腦病毒 收件 應 自行 確認 郵件 內容 損害 公司 恕 負責
label:  1 EDM
predict:  1 EDM


## eval

In [13]:
from sklearn.metrics import classification_report

class_number = {'SPAM':0, 'EDM':1, 'HAM':2}

predictions = []
references = []

for batch_index, batch_dict in enumerate(test_list[:200]):
    input_items = {key: val.to(device) for key, val in tokenized_input.items()}
    outputs = model(**input_items)

    predictions += outputs.logits.argmax(dim=-1).tolist()

    references += [class_number[batch_dict['label']]]
    # print(reference[:10])
print(len(predictions))

accuracy = accuracy_score(references, predictions)
f1 = f1_score(references, predictions,average='macro')
print('acc: ', accuracy)
print('f1: ', f1)

print("----------")
classification_report(references, predictions, target_names=class_number)

200
acc:  0.725
f1:  0.2801932367149758
----------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n        SPAM       0.00      0.00      0.00        22\n         EDM       0.72      1.00      0.84       145\n         HAM       0.00      0.00      0.00        33\n\n    accuracy                           0.73       200\n   macro avg       0.24      0.33      0.28       200\nweighted avg       0.53      0.72      0.61       200\n'

In [None]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_metric
import torch.nn as nn
import torch
from tqdm.auto import tqdm
import random
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
from common import load_jsonl, save_jsonl

from transformers import (
    AdamW,
    get_scheduler,
    BartTokenizer,
    BartForSequenceClassification,
    BertTokenizer,
    AutoTokenizer,
    AutoModelForSequenceClassification
#     AutoModelForMaskedLM,
)

In [None]:
import torch
from transformers import BertConfig, BertForSequenceClassification, BertTokenizerFast

In [None]:
eval_list = load_jsonl('test-data.jsonl')
eval_dataset = NLIDataset(eval_list)
eval_dataset[0]
test_dataloader = DataLoader(eval_dataset, shuffle=True, batch_size=train_batch_size)

In [None]:
predictions = []
references = []

for batch_index, batch_dict in enumerate(test_dataloader):
    input_items = {key: val.to(device) for key, val in batch_dict.items()}
    outputs = model(**input_items)

    predictions += outputs.logits.argmax(dim=-1).tolist()
    references += batch_dict['labels'].tolist()

accuracy = accuracy_score(references, predictions)
f1 = f1_score(references, predictions,average='macro')
print('acc: ', accuracy)
print('f1: ', f1)

In [None]:
num = 3
print("the sentence is ")
print()
print(input_items[3])
outputs = model(**input_items[3])
predictions = outputs.logits.argmax(dim=-1).tolist()
print("predict : ", predictions)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
label_list = ["0", "1"]
tokenizer = BertTokenizerFast.from_pretrained("Transformers Trainer")
config = BertConfig.from_pretrained("Transformers Trainer", finetuning_task="cola")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

In [None]:
sentence = "The probable hostile German reaction is unfortunate."  # @param {type:"string"}
tokenized_input = tokenizer(sentence, return_tensors="pt").to(device)
outputs = model(**tokenized_input)
print(f"Prediction: {label_list[outputs.logits.argmax(dim=-1).item()]}")