## datasets prepare

In [8]:
# check datasets 

import csv
csv.field_size_limit(500 * 1024 * 1024)

data_list = []
class_number = {'SPAM':0, 'EDM':0, 'HAM':0, 'NOTE':0}

with open('spam_data_append_4class.csv', newline='', encoding='utf-8') as csvfile:
    # count = 0
   
    rows = csv.reader(csvfile)
    for i, row in enumerate(rows):
        if row[2] in class_number.keys():
            pass
        else:
            print(i)

In [9]:
# load datasets

import csv
csv.field_size_limit(500 * 1024 * 1024)

data_list = []
class_number = {'SPAM':0, 'EDM':0, 'HAM':0, 'NOTE':0}

with open('spam_data_append_4class.csv', newline='', encoding='utf-8') as csvfile:
    
    
   
    rows = csv.reader(csvfile)
    for i, row in enumerate(rows):
        # print(row[0])
        if i != 0:
            context = row[3].replace('Num','')
            class_number[row[2]]+=1
            data_list.append({'index': row[0], 
                              'md5sum': row[1],
                              'label':row[2], 
                              'context':context})

In [10]:
# number of data

len(data_list)

347700

In [11]:
# number of data for each class

print(class_number)

{'SPAM': 38053, 'EDM': 218647, 'HAM': 68952, 'NOTE': 22048}


In [12]:
data_list[3]

{'index': '4',
 'md5sum': '00010a27a02be1b98537cd22e44d40a4',
 'label': 'EDM',
 'context': 'Read email browser 再生能源 業者 農曆 五月 五日 端午 佳節 到來 古時 稱惡 惡日 這天 驅除 瘟疫 惡運 臺灣 近期 疫情 影響 藉由 機會 驅瘟 去疫 快速 回復 自由 活動 無拘無束 生活 能源 週 呼籲 居家 追劇 喫 肉糉 盡量 外出 常備 酒精 做好 消毒 防疫 臺灣 國際 智慧 能源 週 Energy Taiwan 實體 展     虛擬 展      Tel    Email emailAddress 訂閱 取消 訂閱 Subscribe Unsubscribe'}

In [13]:
# train : val : test = 8:1:1

from sklearn.model_selection import train_test_split

train_data, dev_test_data = train_test_split(data_list, random_state=777, train_size=0.8)
dev_data, test_data = train_test_split(dev_test_data, random_state=777, train_size=0.5)

In [14]:
print(len(train_data))
print(len(dev_data))
print(len(test_data))

278160
34770
34770


In [15]:
from common import save_jsonl

save_jsonl(train_data, 'datasets/train_4class.jsonl')
save_jsonl(dev_data, 'datasets/dev_4class.jsonl')
save_jsonl(test_data, 'datasets/test_4class.jsonl')

Save to Jsonl: datasets/train_4class.jsonl
Save to Jsonl: datasets/dev_4class.jsonl
Save to Jsonl: datasets/test_4class.jsonl


### Training

In [16]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [17]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_metric
import torch.nn as nn
import torch
from tqdm.auto import tqdm
import random
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
from common import load_jsonl, save_jsonl

from transformers import (
    AdamW,
    get_scheduler,
    BertTokenizer,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorForTokenClassification
)

from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

In [18]:
class NLIDataset(Dataset):
    def __init__(self, data_list, max_length=512, model_name="bert-base-multilingual-cased"):  #bert-base-multilingual-cased
        self.d_list = data_list
        self.len = len(self.d_list)
        self.max_length = max_length
        # self.tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="en_XX", tgt_lang="ro_RO")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.label2index = {
            'SPAM': 0,
            'EDM': 1,
            'HAM': 2,
            'NOTE': 3,
        }

    def __getitem__(self, index):
        data = self.d_list[index]
        context = data['context']
        label = data['label']
        
        processed_sample = dict()
        processed_sample['labels'] = torch.tensor(self.label2index[label])
        tokenized_input = self.tokenizer(context,
                                         max_length=self.max_length,
                                         padding='max_length', 
                                         truncation=True,
                                         return_tensors="pt")
        
        input_items = {key: val.squeeze() for key, val in tokenized_input.items()}
        processed_sample.update(input_items)
        return processed_sample

    def __len__(self):
        return self.len

In [19]:
train_list = load_jsonl('datasets/train_4class.jsonl')
dev_list = load_jsonl('datasets/dev_4class.jsonl')

Load Jsonl: datasets/train_4class.jsonl


278160it [00:14, 18849.33it/s]


Load Jsonl: datasets/dev_4class.jsonl


34770it [00:01, 18877.01it/s]


In [20]:
train_dataset = NLIDataset(train_list)
dev_dataset = NLIDataset(dev_list)

In [21]:
train_list[0]

{'index': '165966',
 'md5sum': '7d224f12c9aec8a4bbcd17755f6420f3',
 'label': 'EDM',
 'context': ' 空氣 清淨機 排行榜 出爐 秋冬 空污 新冠肺炎 疫情 流感病毒 因素 空氣 清淨機 居家 必備 家電 行政院 消保處  抽查 檢測 市售  款 空氣 清淨機 潔淨 空氣 提供 率 CASR 排行 建議 消費者 選購 節能 標章 CASR 數值 高 產品 潔淨 空氣 提供 率 CASR 完整 內容 送 真空 蟎 吸塵器 Winix 空氣 清淨機 ZERO S   購買   APP 限時 驚喜價 Coway 濾淨 力 空氣 清淨機 AP 1216L 送 濾網    特價   3M 濾淨 型 空氣 清淨機 進階版   特價   本週 主打 送 騎士 堡 半日券 CoClean 隨身 空氣 清淨機 兒童版   特價  車用 空氣 清淨機 首選 未來 實驗室 N7 負離子 空氣 清淨機   特價  掃地 拖地 吸塵 三合一 愛迪生 三合一 智能 掃地 拖地 吸塵 機器人   特價  蹣 吸塵 機 兩用 美國 Massey 紫外線 真空 除蹣機   特價  獨家 贈三大 好禮 伊萊克斯 完美 管家 吸塵器   特價   直立 手持 兩用 丹比 DANBY 強力 旋風 有線 吸塵器   特價   質感 女神 購物節 金額 下單 抽 SOGO 禮券 累積 消費 抽  dyson 累積  單再 抽  天 免費 下午茶 大禮包 生活 分享 購多 入組 再享  回饋 購物 想 取消 訂閱 點選'}

In [22]:
train_dataset[0]

{'labels': tensor(1),
 'input_ids': tensor([   101,   6006,   4859,   5061,   5048,   4741,   4151,   7069,   4689,
           2527,   5376,   5952,   2490,   6006,   4890,   4333,   2484,   6520,
           5272,   5656,   3878,   4982,   3911,   5667,   4837,   3000,   6195,
           6006,   4859,   5061,   5048,   4741,   3486,   3408,   3793,   2380,
           3408,   8299,   7069,   4285,   8222,   5010,   2312,   6946,   4055,
           4547,   4753,   5084,   3600,   2884,   4780,   6006,   4859,   5061,
           5048,   4741,   5182,   5048,   6006,   4859,   4181,   2286,   5476,
          92923,  11273,   4151,   7069,   3697,   7285,   5010,   7433,   6457,
           7779,   7461,   6104,   6546,   4720,   6040,  92923,  11273,   4310,
           2355,   8595,   5601,   2854,   5182,   5048,   6006,   4859,   4181,
           2286,   5476,  92923,  11273,   3380,   4307,   2447,   3410,   7719,
           5769,   6006,    100,   2800,   3135,   2970,  60987,  13274,  

In [23]:
# note: 5 classes
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=4)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.device_count() >1:
    model = nn.DataParallel(model,device_ids=[0])
model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [30]:
train_batch_size=40
learning_rate=2e-5 
train_epochs=5

optimizer = AdamW(model.parameters(), lr=learning_rate)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=train_batch_size)
print(len(train_dataloader))
print(len(dev_dataloader))

6954
870


In [31]:
for batch_index, batch_dict in enumerate(train_dataloader):
    print(batch_dict)
    break

{'labels': tensor([1, 0, 0, 3, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1,
        2, 1, 1, 1, 0, 1, 1, 2, 0, 2, 1, 1, 1, 2, 1, 1]), 'input_ids': tensor([[  101,  8148,  7290,  ...,     0,     0,     0],
        [  101,  2774,  5611,  ...,     0,     0,     0],
        [  101, 11520, 42492,  ...,     0,     0,     0],
        ...,
        [  101, 12222, 10883,  ...,     0,     0,     0],
        [  101,  8412,  5898,  ...,     0,     0,     0],
        [  101,  4040,  2406,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [32]:
## 進度條
num_training_steps = train_epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))

## 設定warmup
lr_scheduler = get_scheduler(
  "linear",
  optimizer=optimizer,
  num_warmup_steps=10,
  num_training_steps=num_training_steps
)

## start training
for epoch in range(train_epochs):
    model.train()
    for batch_index, batch_dict in enumerate(train_dataloader):
        
        input_items = {key: val.to(device) for key, val in batch_dict.items()}
        # del input_items['token_type_ids'] ## bart不需要這個
        
        optimizer.zero_grad()
        outputs = model(**input_items)
        
        loss = outputs.loss
        if torch.cuda.device_count() >1: ##多GPU的情況要對loss求平均
            loss = loss.mean()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)
        
        if batch_index % 500 ==0:
            print('epoch: ', epoch, '  loss: ', loss)
            
    model.eval()
    predictions = []
    references = []
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(dev_dataloader):
            input_items = {key: val.to(device) for key, val in batch_dict.items()}
            outputs = model(**input_items)

            predictions += outputs.logits.argmax(dim=-1).tolist()
            references += batch_dict['labels'].tolist()

    accuracy = accuracy_score(references, predictions)
    f1 = f1_score(references, predictions,average='macro')
    print('acc: ', accuracy)
    print('f1: ', f1)
    
    ## save model
    save_path = 'model/notice_4class_epoch_' + str(epoch+1)
    if torch.cuda.device_count() >1:
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(save_path)
    else:
        model.save_pretrained(save_path)

  0%|          | 0/34770 [00:00<?, ?it/s]

epoch:  0   loss:  tensor(0.3320, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0823, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0070, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0167, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0065, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.1025, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0055, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0049, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0047, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0478, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0513, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0333, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0034, device='cuda:0', g

# Test

In [33]:
from common import load_jsonl
test_list = load_jsonl('datasets/test_4class.jsonl')
test_dataset = NLIDataset(test_list)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=512)

Load Jsonl: datasets/test_4class.jsonl



0it [00:00, ?it/s][A
1921it [00:00, 19205.26it/s][A
3842it [00:00, 18814.02it/s][A
5724it [00:00, 18349.12it/s][A
7561it [00:00, 18339.80it/s][A
9396it [00:00, 18081.26it/s][A
11205it [00:00, 18048.27it/s][A
13136it [00:00, 18453.79it/s][A
14994it [00:00, 18488.27it/s][A
16844it [00:00, 18184.13it/s][A
18664it [00:01, 17958.65it/s][A
20636it [00:01, 18485.80it/s][A
22487it [00:01, 18315.68it/s][A
24321it [00:01, 18177.85it/s][A
26140it [00:01, 17926.95it/s][A
28057it [00:01, 18292.39it/s][A
29888it [00:01, 18159.59it/s][A
31735it [00:01, 18245.77it/s][A
34770it [00:01, 18240.73it/s][A


In [42]:
print(len(test_list))

34770


In [34]:
# note: 5 classes
model_path = 'model/notice_4class_epoch_5'
# model_path = 'models/Mail_Classifier/epoch_5'
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=4)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")   
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [35]:
model.eval()
predictions = []
references = []
num_steps = len(test_dataloader)
progress_bar = tqdm(range(num_steps))

with torch.no_grad():
    for batch_index, batch_dict in enumerate(test_dataloader):
        input_items = {key: val.to(device) for key, val in batch_dict.items()}
        outputs = model(**input_items)

        predictions += outputs.logits.argmax(dim=-1).tolist()
        references += batch_dict['labels'].tolist()
        progress_bar.update(1)

accuracy = accuracy_score(references, predictions)
f1 = f1_score(references, predictions,average='macro')
print('acc: ', accuracy)
print('f1: ', f1)

  0%|          | 0/68 [00:00<?, ?it/s]

acc:  0.9930974978429681
f1:  0.9898595974492835


In [36]:
print(model_path)
print(classification_report(references, predictions))

model/notice_4class_epoch_5
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3855
           1       1.00      1.00      1.00     22001
           2       0.99      0.99      0.99      6788
           3       0.99      1.00      1.00      2126

    accuracy                           0.99     34770
   macro avg       0.99      0.99      0.99     34770
weighted avg       0.99      0.99      0.99     34770



## find out class 3 (note)

In [37]:
import pandas as pd
import operator
class_number = {'SPAM':0, 'EDM':1, 'HAM':2, 'NOTE':3}

count = 0
wrong = 0
right = 0
which_class = 0

print(len(references))

for i, class_num in enumerate(references):
    if class_num == which_class:
        count += 1
        if predictions[i] != which_class:
            wrong += 1
            # print(test_list[i]['context'])
        else:
            right += 1
    
print('in testdatasets , there are %d %s messages ~'%(count, [k for k, v in class_number.items() if v == which_class][0]))
print('the accurate of %s is : %.2f'%([k for k, v in class_number.items() if v == which_class][0], (right - wrong)/count))
       

        
# print('in testdatasets , there are %d notice messages ~')

# print(test_list[i]['context'])
# print(predictions[i])

34770
in testdatasets , there are 3855 SPAM messages ~
the accurate of SPAM is : 0.95


In [38]:
import pandas as pd
import operator
class_number = {'SPAM':0, 'EDM':1, 'HAM':2, 'NOTE':3}

count = 0
wrong = 0
right = 0
which_class = 1

print(len(references))

for i, class_num in enumerate(references):
    if class_num == which_class:
        count += 1
        if predictions[i] != which_class:
            wrong += 1
            # print(test_list[i]['context'])
        else:
            right += 1
    
print('in testdatasets , there are %d %s messages ~'%(count, [k for k, v in class_number.items() if v == which_class][0]))
print('the accurate of %s is : %.2f'%([k for k, v in class_number.items() if v == which_class][0], (right - wrong)/count))
       

        
# print('in testdatasets , there are %d notice messages ~')

# print(test_list[i]['context'])
# print(predictions[i])

34770
in testdatasets , there are 22001 EDM messages ~
the accurate of EDM is : 0.99


In [39]:
import pandas as pd
import operator
class_number = {'SPAM':0, 'EDM':1, 'HAM':2, 'NOTE':3}

count = 0
wrong = 0
right = 0
which_class = 2

print(len(references))

for i, class_num in enumerate(references):
    if class_num == which_class:
        count += 1
        if predictions[i] != which_class:
            wrong += 1
            # print(test_list[i]['context'])
        else:
            right += 1
    
print('in testdatasets , there are %d %s messages ~'%(count, [k for k, v in class_number.items() if v == which_class][0]))
print('the accurate of %s is : %.2f'%([k for k, v in class_number.items() if v == which_class][0], (right - wrong)/count))
       

        
# print('in testdatasets , there are %d notice messages ~')

# print(test_list[i]['context'])
# print(predictions[i])

34770
in testdatasets , there are 6788 HAM messages ~
the accurate of HAM is : 0.99


In [40]:
import pandas as pd
import operator
class_number = {'SPAM':0, 'EDM':1, 'HAM':2, 'NOTE':3}

count = 0
wrong = 0
right = 0
which_class = 3

print(len(references))

for i, class_num in enumerate(references):
    if class_num == which_class:
        count += 1
        if predictions[i] != which_class:
            wrong += 1
            # print(test_list[i]['context'])
        else:
            right += 1
    
print('in testdatasets , there are %d %s messages ~'%(count, [k for k, v in class_number.items() if v == which_class][0]))
print('the accurate of %s is : %.2f'%([k for k, v in class_number.items() if v == which_class][0], (right - wrong)/count))
       

        
# print('in testdatasets , there are %d notice messages ~')

# print(test_list[i]['context'])
# print(predictions[i])

34770
in testdatasets , there are 2126 NOTE messages ~
the accurate of NOTE is : 1.00


## find out class 4 (hacker)

In [33]:
import pandas as pd
import operator
class_number = {'SPAM':0, 'HACKER':1}

count = 0
wrong = 0
right = 0
which_class = 1
wrong_list, pred_list, ref_list = [], [], []

print(len(references))

for i, class_num in enumerate(references):
    if class_num == which_class:
        count += 1
        if predictions[i] != which_class:
            print("this sentence %s is incorrect ! "%(i))
            wrong += 1
            wrong_list.append(test_list[i]['context'])
            pred_list.append(predictions[i])
            ref_list.append(references[i])
            # print(test_list[i]['context'])
        else:
            right += 1

output_dic = {}
output_dic['pred'] = pred_list
output_dic['label'] = ref_list
output_dic['context'] = wrong_list
output = pd.DataFrame(output_dic)
output.to_csv("wrong_list.csv", encoding = 'utf-8-sig')
    
print('in testdatasets , there are %d %s messages ~'%(count, [k for k, v in class_number.items() if v == which_class][0]))
print('the accurate of %s is : %.2f'%([k for k, v in class_number.items() if v == which_class][0], (right - wrong)/count))
       


3806
this sentence 17 is incorrect ! 
this sentence 75 is incorrect ! 
this sentence 210 is incorrect ! 
this sentence 224 is incorrect ! 
this sentence 237 is incorrect ! 
this sentence 259 is incorrect ! 
this sentence 261 is incorrect ! 
this sentence 275 is incorrect ! 
this sentence 319 is incorrect ! 
this sentence 320 is incorrect ! 
this sentence 339 is incorrect ! 
this sentence 440 is incorrect ! 
this sentence 453 is incorrect ! 
this sentence 457 is incorrect ! 
this sentence 478 is incorrect ! 
this sentence 503 is incorrect ! 
this sentence 590 is incorrect ! 
this sentence 607 is incorrect ! 
this sentence 710 is incorrect ! 
this sentence 731 is incorrect ! 
this sentence 747 is incorrect ! 
this sentence 837 is incorrect ! 
this sentence 878 is incorrect ! 
this sentence 933 is incorrect ! 
this sentence 1003 is incorrect ! 
this sentence 1016 is incorrect ! 
this sentence 1119 is incorrect ! 
this sentence 1120 is incorrect ! 
this sentence 1147 is incorrect ! 
this s

## predict from batch_data

In [65]:
import pandas as pd
import operator
class_number = {'SPAM':0, 'EDM':1, 'HAM':2, 'NOTE':3, 'HACKER':4}
model.eval()

softmax=torch.nn.Softmax()
csvfile = pd.read_csv('test_data/notice.csv')    # change the value up to the colab limit

predict_list = []
score_list = []
sm_score_list = []
for token in csvfile['context']:

    tokenized_input = tokenizer(token,
                                max_length=20,
                                truncation=True,
                                return_tensors="pt")
    with torch.no_grad():
        input_items = {key: val.to(device) for key, val in tokenized_input.items()}
#         del input_items['token_type_ids'] ## bart不需要這個

        outputs = model(**input_items)
        prediction = outputs.logits.argmax(dim=-1)

        prediction = int(prediction)
        sm = softmax(outputs.logits[0])

        predict_list.append(prediction)
        score_list.append(outputs.logits[0][prediction].item())
        sm_score_list.append(sm[prediction].item())
        
print(predict_list)




[3, 3, 3, 1, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


## predict from each sentence

In [64]:
test_num = 10

# subject = test_list[test_num]['subject']
context = test_list[test_num]['context']
label = test_list[test_num]['label']

tokenized_input = tokenizer(context,
                            max_length=512,
                            truncation=True,
                            return_tensors="pt")
class_number = {'SPAM':0, 'EDM':1, 'HAM':2, 'NOTE':3, 'KACKER':4}
model.eval()
with torch.no_grad():
    input_items = {key: val.to(device) for key, val in tokenized_input.items()}
    del input_items['token_type_ids'] ## bart不需要這個
    
    outputs = model(**input_items)
    prediction = outputs.logits.argmax(dim=-1)
    print(type(int(prediction)))
    
    
    # print('主旨: ', subject)
    print('內文: ', context)
    print('label: ', class_number[label], label)
    print('predict: ', int(prediction), list(class_number.keys())[list(class_number.values()).index(int(prediction))])
    
    # if int(prediction) == 0:
    #     print('predict: ham')
    # else:
    #     print('predict: spam')

<class 'int'>
內文:  EyeCloud  親愛的  林  淵博  設備  狀態  切換  資訊  設備  名稱  華府  總部  伺服器  型號  UR    隸屬  羣組  設備  狀態  切換  信件  EyeCloud  自動  發送  回覆  本信件  登入  EyeCloud  查看  管理員  謝謝  UrlText
label:  3 NOTE
predict:  3 NOTE
