### 資料前處理(處理過不用再跑)

In [1]:
# coding=gbk
import os
import codecs
import re
import base64
import pandas as pd
from collections import defaultdict
from tqdm.auto import tqdm

In [21]:
# 去掉非中文字符
def clean_str(string):
    string = re.sub(r"[^\u4e00-\u9fff]", " ", string)
    string = re.sub(r"\s{2,}", " ", string)  # 能直接变成一个字符串 可以在txt中写成一行
    return string.strip()
 
def Index_File():
    """index文件 路径--标签 对照表"""
    index_file = 'trec06c/full/index'
    f = codecs.open(index_file, 'r', 'gbk', errors='ignore')
    table = defaultdict(list)
    for line in f:
        label, path = line.strip().split()
        if label == 'spam': # 是垃圾邮件
            label = 1
        else:
            label = 0
        table['label'].append(label)
        table['path'].append(path)
    table = pd.DataFrame(data=table)
    return table
 
#提取四个特征
 
def From_email(email):
    # 发件人
    # 先提取From后的所有内容
    try:
        From_raw = re.search(r'From: (.*)', email).group(1)
    except:
        From_raw = ''
    From = ''
    # 先看看有没有加密部分 有加密部分就给他解密
    name = re.search(r'=\?GB2312\?B\?(.*)\?=', From_raw, re.I)  # name保存加密部分
    if name is None:  # 没有加密部分
        name = ''
        # 没有加密部分 就保留串的所有内容
        From = From_raw
    else:  # 有加密部分
        name = name.group(1)
        try:
            name = base64.b64decode(name).decode('gb2312')
        except:
            try:
                name = base64.b64decode(name).decode('gbk')
            except:
                name = ''
        From = name + re.search(r'\?=(.*)', From_raw).group(1)
    # print('From: ', From)
    return From
 
def To_email(email):
    # 收件人
    To = re.search(r'^To: (.*)', email, re.M | re.I).group(1)  # re.M 从每行文本开头的位置开始匹配
    # print('To: ', To)
    return To
 
def Subject_email(email):
    # 主题
    Subject = re.search(r'=\?gb2312\?B\?(.*)\?=', email)
    if Subject is None:
        Subject = ''
    else:  # subject 有内容
        Subject = Subject.group(1)
        Subject = base64.b64decode(Subject)  # 解密
        try:
            Subject = Subject.decode('gb2312')  # 解码
        except:
            try:
                Subject = Subject.decode('gbk')  # 解码
            except:
                Subject = ''
    # print('Subject: ', Subject)
    return Subject
 
def zhengwen_email(email):
    # 正文
    zhengwen = re.search(r'\n\n(.*)', email, re.S).group(1)
    zhengwen = clean_str(zhengwen) # 剔除了非中文字符
    # print('正文: \n', zhengwen)
    return zhengwen
 
def data_process():
    # 获取 路径--标签 对照表
    table = Index_File()
    spam = 0
    ham = 0
    path = 'trec06c/data'
    mail_list = []
    
    dirs = os.listdir(path)# ['000','001',...]
    progress_bar = tqdm(range(len(table)))
    
    for dir in dirs: # 文件夹
        dir_path = path + '/' + dir
        files = os.listdir(dir_path) # ['000','001',...]
        for file in files: # 数据文件
            file_path = dir_path + '/' + file
            f = codecs.open(file_path, 'r', 'gbk', errors='ignore')
            email = '' # 存储一封邮件的所有内容
            for line in f: # 每一行
                email += line
            index = '../data/' + dir + '/' + file
            # print(index)
            # 发件人
            From = From_email(email)
            # 收件人
            To = To_email(email)
            # 主题
            Subject = Subject_email(email)
            # 正文
            zhengwen = zhengwen_email(email)
 
            # print('*'*100)
            f.close()            
 
            flag = table[table['path'] == index]['label'].values[0]
            if flag == 1:
                mail = {'from': From, 'to': To, 'subject': Subject, 'context': zhengwen, 'label': 'spam'}
                spam += 1
            elif flag == 0:
                mail = {'from': From, 'to': To, 'subject': Subject, 'context': zhengwen, 'label': 'ham'}
                ham += 1
            
            mail_list.append(mail)
            progress_bar.update(1)
            
    return mail_list

In [22]:
mail_list = data_process()

  0%|          | 0/64620 [00:00<?, ?it/s]

In [23]:
mail_list[2]

{'from': '李倩娜 <mo@163.com>',
 'to': 'chi@ccert.edu.cn',
 'subject': '打造自己的家园',
 'context': '',
 'label': 'spam'}

In [24]:
from common import save_jsonl
save_jsonl(mail_list, 'trec06c/full-data.jsonl')

Save to Jsonl: trec06c/full-data.jsonl


In [1]:
import random
from common import save_jsonl

random.shuffle(mail_list)

## 切成訓練資料跟測試資料
cut_lower = int(len(mail_list) * 0.75)
cut_upper = int(len(mail_list) * 0.90)
save_jsonl(mail_list[:cut_lower], 'trec06c/train-dataset.jsonl')
save_jsonl(mail_list[cut_lower:cut_upper], 'trec06c/val-dataset.jsonl')
save_jsonl(mail_list[cut_upper:], 'trec06c/test-dataset.jsonl')

NameError: name 'mail_list' is not defined

### Training

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [4]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_metric
import torch.nn as nn
import torch
from tqdm.auto import tqdm
import random
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
from common import load_jsonl, save_jsonl

from transformers import (
    AdamW,
    get_scheduler,
    BartTokenizer,
    BartForSequenceClassification,
    BertTokenizer,
    AutoTokenizer,
    AutoModelForSequenceClassification
#     AutoModelForMaskedLM,
)

In [5]:
class NLIDataset(Dataset):
    # def __init__(self, data_list, max_length=512, model_name='fnlp/bart-large-chinese'):
    def __init__(self, data_list, max_length=512, model_name="bert-base-multilingual-cased"):
        self.d_list = data_list
        self.len = len(self.d_list)
        self.max_length = max_length
#         self.tokenizer = BertTokenizer.from_pretrained(model_name)  ##模型作者指定用BertTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.label2index = {
            'ham': 0,
            'spam': 1
        }

    def __getitem__(self, index):
        data = self.d_list[index]
        subject = data['subject']
        context = data['context']
        label = data['label']
        
        processed_sample = dict()
        processed_sample['labels'] = torch.tensor(self.label2index[label])
        tokenized_input = self.tokenizer(subject, context,
                                                  max_length=self.max_length,
                                                  padding='max_length', 
                                                  truncation=True,
                                                  return_tensors="pt")
        
        input_items = {key: val.squeeze() for key, val in tokenized_input.items()}
        processed_sample.update(input_items)
        return processed_sample

    def __len__(self):
        return self.len

In [6]:
train_list = load_jsonl('trec06c/train-dataset.jsonl')
eval_list = load_jsonl('trec06c/val-dataset.jsonl')

Load Jsonl: trec06c/train-dataset.jsonl


48465it [00:02, 16739.11it/s]


Load Jsonl: trec06c/val-dataset.jsonl


9693it [00:00, 16305.18it/s]


In [7]:
train_dataset = NLIDataset(train_list)
eval_dataset = NLIDataset(eval_list)

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [8]:
train_list[0]

{'from': 'ling@cernet.com',
 'to': 'hong@ccert.edu.cn',
 'subject': 'Re: 大家觉得建立一个IT职业者联盟如何',
 'context': '你看人 都有那种组织的 定期弄些劳资谈判什么的 缓和劳资矛盾 而我们所处的 行业 唉 我感觉是一个比较尴尬的行业 一方面说待遇好 一方面又是有种吃青春饭的感觉 没有保障 当然我所说的是大多数人 那么我们是否有必要建立一个类似工会一样的组织来维护我们这个行业的就业人员的共同利益呢 只是我们这个是一个行业工会 全 行业的员工都可以成为联盟的成员 至于联盟的权利 那就很多了 最基础的一个我觉得就是帮助建立一个合理的劳资合同 比如可以确立最低劳资保障 最低合同年限等 另外还可以有就业经验交流之类的活动 具体还要集思广益',
 'label': 'ham'}

In [9]:
train_dataset[0]

{'labels': tensor(0),
 'input_ids': tensor([  101, 20304,   131,  3197,  3408,  7162,  3775,  3697,  6033,  2072,
          2102, 26956,  6478,  2090,  6457,  6479,  5749,  3241,  2253,   102,
          2262,  5765,  2179,  7838,  4461,  7802,  5953,  6328,  6331,  5718,
          3388,  4470,  3705,  2159,  2608,  7507,  7366,  2549,  2181,  2118,
          5718,  6374,  2833,  2608,  7507,  5813,  5762,  6459,  3976,  2206,
          3999,  3182,  5718,  7069,  2090,   100,  3976,  3911,  7162,  4380,
          2072,  2102,  4839,  7653,  3473,  3470,  5718,  7069,  2090,  2072,
          4335,  8335,  7354,  3765,  7754,  3240,  2072,  4335,  8335,  2728,
          4380,  4461,  5953,  2767,  8325,  4376,  8489,  5718,  3911,  7162,
          4917,  4461,  2312,  8253,  3740,  5322,  3976,  3999,  7354,  5718,
          4380,  3197,  3191,  4305,  2179,  7802,  2118,  3976,  2206,  4380,
          2789,  4461,  3793,  7139,  3697,  6033,  2072,  2102,  6141,  2240,
          3584,  

In [10]:
# model = BartForSequenceClassification.from_pretrained("fnlp/bart-large-chinese", num_labels=2) ## 2分類
# model = AutoModelForMaskedLM.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.device_count() >1:
    model = nn.DataParallel(model,device_ids=[0])
model.to(device)

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [11]:
train_batch_size=5
learning_rate=2e-5 
train_epochs=5

optimizer = AdamW(model.parameters(), lr=learning_rate)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
eval_dataloader = DataLoader(eval_dataset, shuffle=True, batch_size=train_batch_size)
print(len(train_dataloader))
print(len(eval_dataloader))

9693
1939


In [12]:
for batch_index, batch_dict in enumerate(train_dataloader):
    print(batch_dict)
    break

{'labels': tensor([1, 1, 1, 1, 1]), 'input_ids': tensor([[ 101, 4037, 7507,  ...,    0,    0,    0],
        [ 101, 7535, 6309,  ..., 2196, 2206,  102],
        [ 101, 5410, 2211,  ...,    0,    0,    0],
        [ 101, 2090, 2599,  ...,    0,    0,    0],
        [ 101,  102, 3620,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 1,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [13]:
## 進度條
num_training_steps = train_epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))

## 設定warmup
lr_scheduler = get_scheduler(
  "linear",
  optimizer=optimizer,
  num_warmup_steps=10,
  num_training_steps=num_training_steps
)

## start training
for epoch in range(train_epochs):
    model.train()
    for batch_index, batch_dict in enumerate(train_dataloader):
        
        input_items = {key: val.to(device) for key, val in batch_dict.items()}
#         del input_items['token_type_ids'] ## bart不需要這個
        
        optimizer.zero_grad()
        outputs = model(**input_items)
        
        loss = outputs.loss
        if torch.cuda.device_count() >1: ##多GPU的情況要對loss求平均
            loss = loss.mean()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)
        
        if batch_index % 500 ==0:
            print('epoch: ', epoch, '  loss: ', loss)
            
    model.eval()
    predictions = []
    references = []
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(eval_dataloader):
            input_items = {key: val.to(device) for key, val in batch_dict.items()}
            outputs = model(**input_items)

            predictions += outputs.logits.argmax(dim=-1).tolist()
            references += batch_dict['labels'].tolist()

    accuracy = accuracy_score(references, predictions)
    f1 = f1_score(references, predictions,average='macro')
    print('acc: ', accuracy)
    print('f1: ', f1)
    
    ## save model
    save_path = 'model/epoch_' + str(epoch+1)
    if torch.cuda.device_count() >1:
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(save_path)
    else:
        model.save_pretrained(save_path)

  0%|          | 0/48465 [00:00<?, ?it/s]

epoch:  0   loss:  tensor(0.6812, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0072, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0019, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0031, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0485, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward0>)
epoch:  0   loss:  tensor(0.0016, device='cuda:0', g

acc:  0.9993809965954813
f1:  0.9993027558396207


## inference

In [37]:
import torch.nn as nn
import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
from common import load_jsonl, save_jsonl

from transformers import (
    BartTokenizer,
    BartForSequenceClassification,
    BertTokenizer,
    AutoTokenizer,
    AutoModelForSequenceClassification
)

In [38]:
test_list = load_jsonl('test-data.jsonl')

Load Jsonl: test-data.jsonl





0it [00:00, ?it/s][A[A[A


2683it [00:00, 26815.77it/s][A[A[A


5371it [00:00, 26846.33it/s][A[A[A


8056it [00:00, 26769.73it/s][A[A[A


12924it [00:00, 26328.51it/s][A[A[A


In [40]:
model = AutoModelForSequenceClassification.from_pretrained('model/epoch_5', num_labels=2) ## 2分類
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")  ##模型作者指定用BertTokenizer

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.device_count() >1:
    model = nn.DataParallel(model,device_ids=[0,1])
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [45]:
test_num = 10

subject = test_list[test_num]['subject']
context = test_list[test_num]['context']
label = test_list[test_num]['label']

In [46]:
tokenized_input = tokenizer(subject, context,
                            max_length=512,
                            truncation=True,
                            return_tensors="pt")

model.eval()
with torch.no_grad():
    input_items = {key: val.to(device) for key, val in tokenized_input.items()}
    del input_items['token_type_ids'] ## bart不需要這個
    
    outputs = model(**input_items)
    prediction = outputs.logits.argmax(dim=-1)
    
    
    print('主旨: ', subject)
    print('內文: ', context)
    print('label: ', label)
    if int(prediction) == 0:
        print('predict: ham')
    else:
        print('predict: spam')

主旨:  商业联络！
內文:  尊敬的负责人 经理 财务 您好 我是深圳伟和商贸发展有限公司 在我公司持续稳定的发展中实业雄厚 有着 一定的社会关系 在我公司每月进项多销项少 每月有余额发票 对外联系些业务 贵公司在平时作帐及销售方面是需要用些票据可在我公司代开 如商品销售 地税 建筑安装 其它服务 运输 广告发票等 收费 左右 还可代办海关缴款书 在我公司成立多年一直坚持以信用 所开绝对真票 更希望贵公司共同合作快乐 本公司的承诺 以最优惠的价格和最及时的服务满足你的要求 提供到位 如 贵公司有些担心 可在电脑网上查证或确认后再付款 以真诚的服务合作一次 必成永久的朋友 欢迎来电联系 联 系 人 杨 生 电 话
label:  spam
predict: spam


## eval

In [14]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_metric
import torch.nn as nn
import torch
from tqdm.auto import tqdm
import random
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
from common import load_jsonl, save_jsonl

from transformers import (
    AdamW,
    get_scheduler,
    BartTokenizer,
    BartForSequenceClassification,
    BertTokenizer,
    AutoTokenizer,
    AutoModelForSequenceClassification
#     AutoModelForMaskedLM,
)

In [15]:
import torch
from transformers import BertConfig, BertForSequenceClassification, BertTokenizerFast

In [16]:
eval_list = load_jsonl('test-data.jsonl')
eval_dataset = NLIDataset(eval_list)
eval_dataset[0]
test_dataloader = DataLoader(eval_dataset, shuffle=True, batch_size=train_batch_size)

Load Jsonl: test-data.jsonl



0it [00:00, ?it/s][A
1645it [00:00, 16417.49it/s][A
3417it [00:00, 17177.40it/s][A
5268it [00:00, 17785.21it/s][A
7124it [00:00, 18073.65it/s][A
8932it [00:00, 17672.30it/s][A
10738it [00:00, 17801.64it/s][A
12924it [00:00, 17537.28it/s][A


In [17]:
predictions = []
references = []

for batch_index, batch_dict in enumerate(test_dataloader):
    input_items = {key: val.to(device) for key, val in batch_dict.items()}
    outputs = model(**input_items)

    predictions += outputs.logits.argmax(dim=-1).tolist()
    references += batch_dict['labels'].tolist()

accuracy = accuracy_score(references, predictions)
f1 = f1_score(references, predictions,average='macro')
print('acc: ', accuracy)
print('f1: ', f1)

acc:  0.9996904982977406
f1:  0.9996526132173413


In [18]:
num = 3
print("the sentence is ")
print()
print(input_items[3])
outputs = model(**input_items[3])
predictions = outputs.logits.argmax(dim=-1).tolist()
print("predict : ", predictions)

the sentence is 



KeyError: 3

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
label_list = ["0", "1"]
tokenizer = BertTokenizerFast.from_pretrained("Transformers Trainer")
config = BertConfig.from_pretrained("Transformers Trainer", finetuning_task="cola")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

In [None]:
sentence = "The probable hostile German reaction is unfortunate."  # @param {type:"string"}
tokenized_input = tokenizer(sentence, return_tensors="pt").to(device)
outputs = model(**tokenized_input)
print(f"Prediction: {label_list[outputs.logits.argmax(dim=-1).item()]}")