## datasets prepare

In [1]:
# check datasets 

import csv
csv.field_size_limit(500 * 1024 * 1024)

data_list = []
class_number = {'SPAM':0, 'HACK':0}

with open('spam_data_append_2class.csv', newline='', encoding='utf-8') as csvfile:
    # count = 0
   
    rows = csv.reader(csvfile)
    for i, row in enumerate(rows):
        if row[2] in class_number.keys():
            pass
        else:
            print(i)

In [2]:
# load datasets

import csv
csv.field_size_limit(500 * 1024 * 1024)

data_list = []
class_number = {'SPAM':0, 'HACK':0}

with open('spam_data_append_2class.csv', newline='', encoding='utf-8') as csvfile:
    
    
   
    rows = csv.reader(csvfile)
    for i, row in enumerate(rows):
        # print(row[0])
        if i != 0:
            context = row[3].replace('Num','')
            class_number[row[2]]+=1
            data_list.append({'index': row[0], 
                              'md5sum': row[1],
                              'label':row[2], 
                              'context':context})

In [3]:
# number of data

len(data_list)

38053

In [4]:
# number of data for each class

print(class_number)

{'SPAM': 36796, 'HACK': 1257}


In [5]:
data_list[3]

{'index': '4',
 'md5sum': '000a972374fb1e149c350018a6522ca4',
 'label': 'SPAM',
 'context': 'recalculate house payment ready advertisement quicken loan receive high score j power   tied  primary mortgage origination   primary mortgage servicer study customer satisfaction mortgage sale experience mortgage servicer company respectively visit urltext quicken loan llc nmls  urltext equal housing lender license  state al license mc  control  ar tx  woodward ave detroit mi      az  n central ave ste  phoenix az  mortgage banker license bk  ca license dept business oversight ca residential mortgage lending act finance lender law co regulate division real estate ga residential mortgage licensee  il residential mortgage licensee  dept financial professional regulation k license mortgage company mc  mortgage lender license ml  supervise lender license mn offer rate lock agreement m license ms dept bank consumer finance nh license nh banking dept 6743mb nv license  nj new jersey quicken loan llc 

In [6]:
# train : val : test = 8:1:1

from sklearn.model_selection import train_test_split

train_data, dev_test_data = train_test_split(data_list, random_state=777, train_size=0.8)
dev_data, test_data = train_test_split(dev_test_data, random_state=777, train_size=0.5)

In [7]:
print(len(train_data))
print(len(dev_data))
print(len(test_data))

30442
3805
3806


In [8]:
from common import save_jsonl

save_jsonl(train_data, 'datasets/train_2class.jsonl')
save_jsonl(dev_data, 'datasets/dev_2class.jsonl')
save_jsonl(test_data, 'datasets/test_2class.jsonl')

Save to Jsonl: datasets/train_2class.jsonl
Save to Jsonl: datasets/dev_2class.jsonl
Save to Jsonl: datasets/test_2class.jsonl


### Training

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_metric
import torch.nn as nn
import torch
from tqdm.auto import tqdm
import random
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
from common import load_jsonl, save_jsonl

from transformers import (
    AdamW,
    get_scheduler,
    BertTokenizer,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorForTokenClassification
)

from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

In [3]:
class NLIDataset(Dataset):
    def __init__(self, data_list, max_length=512, model_name="facebook/mbart-large-50"):  #bert-base-multilingual-cased
        self.d_list = data_list
        self.len = len(self.d_list)
        self.max_length = max_length
        self.tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="en_XX", tgt_lang="ro_RO")
        self.label2index = {
            'SPAM': 0,
            'HACK': 1,
        }

    def __getitem__(self, index):
        data = self.d_list[index]
        context = data['context']
        label = data['label']
        
        processed_sample = dict()
        processed_sample['labels'] = torch.tensor(self.label2index[label])
        tokenized_input = self.tokenizer(context,
                                         max_length=self.max_length,
                                         padding='max_length', 
                                         truncation=True,
                                         return_tensors="pt")
        
        input_items = {key: val.squeeze() for key, val in tokenized_input.items()}
        processed_sample.update(input_items)
        return processed_sample

    def __len__(self):
        return self.len

In [12]:
train_list = load_jsonl('datasets/train_2class.jsonl')
dev_list = load_jsonl('datasets/dev_2class.jsonl')

Load Jsonl: datasets/train_2class.jsonl


30442it [00:01, 15672.34it/s]


Load Jsonl: datasets/dev_2class.jsonl


3805it [00:00, 15778.12it/s]


In [13]:
torch.__version__

'1.11.0'

In [14]:
train_dataset = NLIDataset(train_list)
dev_dataset = NLIDataset(dev_list)

In [15]:
train_list[0]

{'index': '1325',
 'md5sum': '08fb87703f57f0d763001b73d97a7274',
 'label': 'SPAM',
 'context': 'hello sale greeting susan environmental product corporation need product idea company resume work yes please respond immediately enable u place order without delay immediate reply highly appreciated hop hear soonest thanks regard susan oniel purchasing manager environmental product corporation  great hill road naugatuck ct  p    c    email emailaddress'}

In [16]:
train_dataset[0]

{'labels': tensor(0),
 'input_ids': tensor([250004,  33600,     31,  11473,   3514,     13,   1916,   1817,     66,
         156444,  12996, 216487,   3871,  12996,   6528,  14380, 138755,   4488,
          72272,  22936,  35644, 109312,     22,   2886,     75,   3687,  12989,
          15490,      8,   5259, 168894,  75836, 103210,  77947,     71,  72720,
          36802,  33662,    525,  45458,  28601,   1817,     66,     98,  11896,
           7398,  28661,    214,  31095, 156444,  12996, 216487,   6782, 130473,
          33816,     24,     34,   8554,  75554, 108963,    915,    501,   3340,
           3340,    712, 107421,      2,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,  

In [17]:
# note: 5 classes
model = AutoModelForSequenceClassification.from_pretrained("facebook/mbart-large-50", num_labels=2)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.device_count() >1:
    model = nn.DataParallel(model,device_ids=[0])
model.to(device)

Some weights of the model checkpoint at facebook/mbart-large-50 were not used when initializing MBartForSequenceClassification: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing MBartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MBartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MBartForSequenceClassification were not initialized from the model checkpoint at facebook/mbart-large-50 and are newly initialized: ['classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a 

MBartForSequenceClassification(
  (model): MBartModel(
    (shared): Embedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): Embedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=

In [18]:
train_batch_size=16
learning_rate=2e-5 
train_epochs=8

optimizer = AdamW(model.parameters(), lr=learning_rate)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=train_batch_size)
print(len(train_dataloader))
print(len(dev_dataloader))

1903
238


In [19]:
for batch_index, batch_dict in enumerate(train_dataloader):
    print(batch_dict)
    break

{'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]), 'input_ids': tensor([[250004,  11341,   4927,  ...,      1,      1,      1],
        [250004,    636,   4503,  ...,  20781,      6,      2],
        [250004,      6,  16682,  ...,      1,      1,      1],
        ...,
        [250004, 206873, 125072,  ...,      1,      1,      1],
        [250004,  33600,     31,  ...,      1,      1,      1],
        [250004,  33600,     31,  ...,      1,      1,      1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [20]:
## 進度條
num_training_steps = train_epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))

## 設定warmup
lr_scheduler = get_scheduler(
  "linear",
  optimizer=optimizer,
  num_warmup_steps=10,
  num_training_steps=num_training_steps
)

## start training
for epoch in range(train_epochs):
    model.train()
    for batch_index, batch_dict in enumerate(train_dataloader):
        
        input_items = {key: val.to(device) for key, val in batch_dict.items()}
        # del input_items['token_type_ids'] ## bart不需要這個
        
        optimizer.zero_grad()
        outputs = model(**input_items)
        
        loss = outputs.loss
        if torch.cuda.device_count() >1: ##多GPU的情況要對loss求平均
            loss = loss.mean()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)
        
        if batch_index % 500 ==0:
            print('epoch: ', epoch, '  loss: ', loss)
            
    model.eval()
    predictions = []
    references = []
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(dev_dataloader):
            input_items = {key: val.to(device) for key, val in batch_dict.items()}
            outputs = model(**input_items)

            predictions += outputs.logits.argmax(dim=-1).tolist()
            references += batch_dict['labels'].tolist()

    accuracy = accuracy_score(references, predictions)
    f1 = f1_score(references, predictions,average='macro')
    print('acc: ', accuracy)
    print('f1: ', f1)
    
    ## save model
    save_path = 'model/notice_2class_epoch_' + str(epoch+1)
    if torch.cuda.device_count() >1:
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(save_path)
    else:
        model.save_pretrained(save_path)

  0%|          | 0/15224 [00:00<?, ?it/s]

epoch:  0   loss:  tensor(0.8331, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.2957, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.3112, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward0>)
acc:  0.990275952693824
f1:  0.9194317732449506
epoch:  1   loss:  tensor(0.0024, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  1   loss:  tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  1   loss:  tensor(0.0052, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  1   loss:  tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward0>)
acc:  0.9908015768725361
f1:  0.9284513525608963
epoch:  2   loss:  tensor(0.0188, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  2   loss:  tensor(0.0059, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  2   loss:  tensor(0.0016, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  2   loss:  tensor(0.0008, 

# Test

In [4]:
from common import load_jsonl
test_list = load_jsonl('datasets/test_2class.jsonl')
test_dataset = NLIDataset(test_list)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=512)

Load Jsonl: datasets/test_2class.jsonl


3806it [00:00, 16034.94it/s]


In [5]:
# note: 5 classes
model_path = 'model/notice_2class_epoch_7'
# model_path = 'models/Mail_Classifier/epoch_5'
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")  

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

MBartForSequenceClassification(
  (model): MBartModel(
    (shared): Embedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): Embedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=

In [6]:
model.eval()
predictions = []
references = []
num_steps = len(test_dataloader)
progress_bar = tqdm(range(num_steps))

with torch.no_grad():
    for batch_index, batch_dict in enumerate(test_dataloader):
        input_items = {key: val.to(device) for key, val in batch_dict.items()}
        outputs = model(**input_items)

        predictions += outputs.logits.argmax(dim=-1).tolist()
        references += batch_dict['labels'].tolist()
        progress_bar.update(1)

accuracy = accuracy_score(references, predictions)
f1 = f1_score(references, predictions,average='macro')
print('acc: ', accuracy)
print('f1: ', f1)

  0%|          | 0/8 [00:00<?, ?it/s]

acc:  0.9952706253284288
f1:  0.96127916440586


In [7]:
print(model_path)
print(classification_report(references, predictions))

model/notice_2class_epoch_7
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3680
           1       0.97      0.88      0.93       126

    accuracy                           1.00      3806
   macro avg       0.98      0.94      0.96      3806
weighted avg       1.00      1.00      1.00      3806



## find out class 3 (note)

In [8]:
import pandas as pd
import operator
class_number = {'SPAM':0, 'EDM':1, 'HAM':2, 'NOT':3, 'HACKER':4}

count = 0
wrong = 0
right = 0
which_class = 0

print(len(references))

for i, class_num in enumerate(references):
    if class_num == which_class:
        count += 1
        if predictions[i] != which_class:
            wrong += 1
            # print(test_list[i]['context'])
        else:
            right += 1
    
print('in testdatasets , there are %d %s messages ~'%(count, [k for k, v in class_number.items() if v == which_class][0]))
print('the accurate of %s is : %.2f'%([k for k, v in class_number.items() if v == which_class][0], (right - wrong)/count))
       

        
# print('in testdatasets , there are %d notice messages ~')

# print(test_list[i]['context'])
# print(predictions[i])

3806
in testdatasets , there are 3680 SPAM messages ~
the accurate of SPAM is : 1.00


## find out class 4 (hacker)

In [10]:
import pandas as pd
import operator
class_number = {'SPAM':0, 'HACKER':1}

count = 0
wrong = 0
right = 0
which_class = 1
wrong_list, pred_list, ref_list = [], [], []

print(len(references))

for i, class_num in enumerate(references):
    if class_num == which_class:
        count += 1
        if predictions[i] == which_class:
            print("this sentence %s is incorrect ! "%(i))
            wrong += 1
            wrong_list.append(test_list[i]['context'])
            pred_list.append(predictions[i])
            ref_list.append(references[i])
            # print(test_list[i]['context'])
        else:
            right += 1

output_dic = {}
output_dic['pred'] = pred_list
output_dic['label'] = ref_list
output_dic['context'] = wrong_list
output = pd.DataFrame(output_dic)
output.to_csv("wrong_list.csv", encoding = 'utf-8-sig')
    
print('in testdatasets , there are %d %s messages ~'%(count, [k for k, v in class_number.items() if v == which_class][0]))
print('the accurate of %s is : %.2f'%([k for k, v in class_number.items() if v == which_class][0], (right - wrong)/count))
       


3806
this sentence 17 is incorrect ! 
this sentence 75 is incorrect ! 
this sentence 210 is incorrect ! 
this sentence 224 is incorrect ! 
this sentence 237 is incorrect ! 
this sentence 259 is incorrect ! 
this sentence 261 is incorrect ! 
this sentence 275 is incorrect ! 
this sentence 319 is incorrect ! 
this sentence 320 is incorrect ! 
this sentence 339 is incorrect ! 
this sentence 440 is incorrect ! 
this sentence 453 is incorrect ! 
this sentence 457 is incorrect ! 
this sentence 478 is incorrect ! 
this sentence 503 is incorrect ! 
this sentence 590 is incorrect ! 
this sentence 607 is incorrect ! 
this sentence 710 is incorrect ! 
this sentence 731 is incorrect ! 
this sentence 747 is incorrect ! 
this sentence 837 is incorrect ! 
this sentence 878 is incorrect ! 
this sentence 933 is incorrect ! 
this sentence 1003 is incorrect ! 
this sentence 1016 is incorrect ! 
this sentence 1119 is incorrect ! 
this sentence 1120 is incorrect ! 
this sentence 1147 is incorrect ! 
this s

## predict from batch_data

In [65]:
import pandas as pd
import operator
class_number = {'SPAM':0, 'EDM':1, 'HAM':2, 'NOTE':3, 'HACKER':4}
model.eval()

softmax=torch.nn.Softmax()
csvfile = pd.read_csv('test_data/notice.csv')    # change the value up to the colab limit

predict_list = []
score_list = []
sm_score_list = []
for token in csvfile['context']:

    tokenized_input = tokenizer(token,
                                max_length=20,
                                truncation=True,
                                return_tensors="pt")
    with torch.no_grad():
        input_items = {key: val.to(device) for key, val in tokenized_input.items()}
#         del input_items['token_type_ids'] ## bart不需要這個

        outputs = model(**input_items)
        prediction = outputs.logits.argmax(dim=-1)

        prediction = int(prediction)
        sm = softmax(outputs.logits[0])

        predict_list.append(prediction)
        score_list.append(outputs.logits[0][prediction].item())
        sm_score_list.append(sm[prediction].item())
        
print(predict_list)




[3, 3, 3, 1, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


## predict from each sentence

In [64]:
test_num = 10

# subject = test_list[test_num]['subject']
context = test_list[test_num]['context']
label = test_list[test_num]['label']

tokenized_input = tokenizer(context,
                            max_length=512,
                            truncation=True,
                            return_tensors="pt")
class_number = {'SPAM':0, 'EDM':1, 'HAM':2, 'NOTE':3, 'KACKER':4}
model.eval()
with torch.no_grad():
    input_items = {key: val.to(device) for key, val in tokenized_input.items()}
    del input_items['token_type_ids'] ## bart不需要這個
    
    outputs = model(**input_items)
    prediction = outputs.logits.argmax(dim=-1)
    print(type(int(prediction)))
    
    
    # print('主旨: ', subject)
    print('內文: ', context)
    print('label: ', class_number[label], label)
    print('predict: ', int(prediction), list(class_number.keys())[list(class_number.values()).index(int(prediction))])
    
    # if int(prediction) == 0:
    #     print('predict: ham')
    # else:
    #     print('predict: spam')

<class 'int'>
內文:  EyeCloud  親愛的  林  淵博  設備  狀態  切換  資訊  設備  名稱  華府  總部  伺服器  型號  UR    隸屬  羣組  設備  狀態  切換  信件  EyeCloud  自動  發送  回覆  本信件  登入  EyeCloud  查看  管理員  謝謝  UrlText
label:  3 NOTE
predict:  3 NOTE
