In [1]:
"""
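Remember to change the model name for every new training run so that each run's model parameter files are saved separately.
The dataset id selects one of the three datasets.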
"""

m_name = 'berttextcnn_crwe_z1' # model name (abbreviation used in names: bbc = 'bert-base-chinese')
dsid = 2 # dataset id: index into ['/bq_corpus','/lcqmc','/paws-x-zh']; the three datasets of the Qianyan text-similarity competition are scored separately

In [None]:
# A truthy `debug` selects the shorter schedule below and, in a later cell,
# a train/validation split carved out of the training file.
debug = 0
seed = 225

# Model hyperparameters
device = 'cuda'
bert_model = 'hfl/chinese-roberta-wwm-ext'  # alternative: 'bert-base-chinese'
freeze_bert = False
maxlen = 128          # length every tokenized sentence pair is padded/truncated to
finetune_units = 768
dropout_rate = 0.1

# Training hyperparameters
bs = 16               # DataLoader batch size
lr = 2e-5             # 1e-3 was also tried
# (epochs, scheduler warm-up steps): short schedule when debugging, full one otherwise
epochs, num_warmup_steps = (4, 0) if debug else (8, 2)
es_counts_MAX = 3

# Post-processing hyperparameter
thres = 0.5           # sigmoid outputs above this value are predicted as class 1

In [None]:
# Mount Google Drive at /content/drive; the dataset and model folders used
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install Hugging Face transformers, which the import cell below relies on
# for the tokenizer, the BERT model, the optimizer and the LR scheduler.
!pip install transformers

In [None]:
from scipy.spatial import distance
from scipy.spatial.distance import cosine
import nltk
from scipy.stats import pearsonr

import sys
import os
import numpy as np 
import pandas as pd
import copy
import matplotlib.pyplot as plt
import random
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset

import torch
from torch import nn
import torch.nn.functional as F
import transformers
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

#float16和float32自动混合精度加速计算，官方文档：https://pytorch.org/docs/stable/amp.html
from torch.cuda.amp import autocast
from torch.cuda.amp import GradScaler

In [None]:
def set_seed(seed = 42):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: integer used for every random number generator.

    Returns:
        The same ``seed`` value, so the call can be used in an assignment.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Every seeding function takes the seed as its only argument.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    return seed

# Apply the configured seed to all RNGs; set_seed returns it unchanged.
seed = set_seed(seed)

In [None]:
# PATH Info
CURR_PATH = os.getcwd()
ROOT_PATH = CURR_PATH + '/drive/MyDrive/Baidu_Qianyan'

In [None]:
def mkdir(path):
    """Create ``path`` (with any missing parents) unless it already exists.

    Prints which of the two cases happened. The "new folder" message reports
    the global ``m_name`` rather than ``path``.
    """
    if os.path.exists(path):
        print('---  Model Dir Exsiting!  ---')
        return
    # makedirs also creates intermediate directories that do not exist yet
    os.makedirs(path)
    print('---  New Model Folder: {}  ---'.format(m_name))

def read_tsv(input_file):
    """Read a UTF-8 tab-separated file into a DataFrame of strings.

    Each line is stripped of surrounding whitespace and split on tabs. Lines
    that yield exactly one field (blank lines included) are dropped.

    Args:
        input_file: path of the file to read.

    Returns:
        A DataFrame with one row per kept line and default integer columns.
    """
    with open(input_file, "r", encoding="utf-8") as handle:
        split_rows = (raw.strip().split("\t") for raw in handle)
        kept_rows = [fields for fields in split_rows if len(fields) != 1]
    return pd.DataFrame(kept_rows)

# One sub-folder per dataset; `dsid` from the configuration cell picks one.
DATASET_PATH = ['/bq_corpus','/lcqmc','/paws-x-zh']
dataset_path = DATASET_PATH[dsid]
ROOT_PATH = '/content/drive/MyDrive/Baidu_Qianyan'
DATA_PATH = ['/train.tsv','/dev.tsv','/test.tsv']
# Checkpoints of this run go to <root>/model/<dataset>/<model name>.
MODEL_SAVE_PATH = ROOT_PATH + '/model' + dataset_path + '/' + m_name
mkdir(MODEL_SAVE_PATH)

# Read each split exactly once, in the order listed in DATA_PATH.
# read_tsv is used instead of pd.read_csv on purpose: according to the
# original author's note, bq_corpus has a malformed row (around line 20746)
# that the read_csv-based loader could not read.
train, dev, test = (
    read_tsv(ROOT_PATH + dataset_path + data_path) for data_path in DATA_PATH
)

# read_tsv yields string cells; column 2 is the label of the labelled splits.
train[2] = train[2].astype(int)
dev[2] = dev[2].astype(int)

In [None]:
# Summary of the training split: columns, non-null counts and dtypes.
train.info()

In [None]:
# Descriptive statistics of the training split.
train.describe()

In [None]:
# Column names for the splits (a list, despite the variable's name).
cols_dict = ['sentence_a', 'sentence_b', 'similarity']
# The labelled splits carry all three columns ...
train.columns, dev.columns = cols_dict, cols_dict
# ... while the test split only gets the two sentence columns.
test.columns = cols_dict[:-1]

In [None]:
# Peek at the first rows after the columns were renamed.
train.head()

In [None]:
# Character length of each sentence, added to `train` as two new columns.
train['len_a'] = train['sentence_a'].map(len)
train['len_b'] = train['sentence_b'].map(len)
train.describe()

In [None]:
if not debug:
    # Full run: train on the training file, validate on the dev file.
    df_train, df_val = train, dev
else:
    # Debug run: first 2000 training rows validate, rows 2000-19999 train
    # (re-indexed from 0).
    df_val = train.iloc[:2000, :]
    df_train = train.iloc[2000:20000, :].reset_index(drop=True)
df_val.head()

In [None]:
# Peek at the frame that will be used for training.
df_train.head()

In [None]:
class LoadDataset(Dataset):
    """Sentence-pair dataset that tokenizes DataFrame rows on access.

    Args:
        data: DataFrame with ``sentence_a`` and ``sentence_b`` columns, plus a
            ``similarity`` column when ``with_labels`` is true.
        maxlen: length every encoded pair is padded or truncated to.
        with_labels: whether ``__getitem__`` also returns the label.
        bert_model: name or path given to ``BertTokenizer.from_pretrained``.
    """

    def __init__(self, data, maxlen, with_labels=True, bert_model='bert-base-chinese'):
        self.data = data
        self.tokenizer = BertTokenizer.from_pretrained(bert_model, output_loading_info=False)
        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded pair at position ``index``.

        Returns:
            ``(token_ids, attention_mask, token_type_ids)``, each with the
            batch dimension added by ``return_tensors='pt'`` squeezed away,
            followed by the label when ``with_labels`` is true.
        """
        # Samplers hand out positions 0..len-1, so rows are looked up by
        # position (.iloc) instead of by index label (.loc); a frame whose
        # index is not 0..n-1 is then read correctly.
        sent1 = str(self.data['sentence_a'].iloc[index])
        sent2 = str(self.data['sentence_b'].iloc[index])

        # Encode both sentences as one pair, padded/truncated to maxlen.
        encoded = self.tokenizer(sent1, sent2, padding='max_length', truncation=True,
                                 max_length=self.maxlen, return_tensors='pt')
        token_ids1 = encoded['input_ids'].squeeze(0)
        attn_masks1 = encoded['attention_mask'].squeeze(0)
        token_type_ids1 = encoded['token_type_ids'].squeeze(0)

        if self.with_labels:
            label = self.data['similarity'].iloc[index]
            return token_ids1, attn_masks1, token_type_ids1, label
        return token_ids1, attn_masks1, token_type_ids1

In [None]:
def val_lossF(net, device, criterion, dataloader):
    """Evaluate ``net`` on ``dataloader`` without tracking gradients.

    Args:
        net: model called as ``net(token_ids, attn_masks, token_type_ids)``.
        device: device every batch tensor is moved to.
        criterion: loss called with the model output and the float labels.
        dataloader: yields ``(token_ids, attn_masks, token_type_ids, labels)``.

    Returns:
        ``(mean of the per-batch losses, accuracy)``. Predictions are the
        sigmoid of the model output thresholded at the global ``thres``.
        The network is left in eval mode.
    """
    net.eval()
    loss_sum = 0
    n_batches = 0
    predictions, targets = [], []

    with torch.no_grad():
        for batch in dataloader:
            token_ids1, attn_masks1, token_type_ids1, labels = (part.to(device) for part in batch)

            logits = net(token_ids1, attn_masks1, token_type_ids1)
            loss_sum += criterion(logits, labels.float()).item()
            n_batches += 1

            # probabilities -> hard 0/1 decisions
            probabilities = logits.sigmoid().cpu().numpy()
            predictions.extend(np.where(probabilities > thres, 1, 0).tolist())
            targets.extend(labels.cpu().numpy().tolist())
        val_metric = accuracy_score(predictions, targets)
    return loss_sum / n_batches, val_metric

In [None]:
class BertTextCNN(nn.Module):
    """BERT encoder followed by a TextCNN head that emits one logit per input.

    Args:
        dropout_rate: dropout probability used before and after the CNN head.
        finetune_units: accepted for interface compatibility; not used.
        bert_model: name or path given to ``BertModel.from_pretrained``.
        freeze_bert: when true, the encoder's parameters get
            ``requires_grad = False`` so only the CNN head is trained.
    """

    def __init__(self, dropout_rate=0.2, finetune_units=768, bert_model='bert-base-chinese', freeze_bert=False):
        super(BertTextCNN, self).__init__()
        self.bert_layer1 = BertModel.from_pretrained(bert_model, output_loading_info=False)
        # Take the encoder width from the loaded checkpoint instead of a
        # hard-coded table of model names, so any BERT checkpoint works.
        self.hidden_size = self.bert_layer1.config.hidden_size

        if freeze_bert:
            # The encoder attribute is `bert_layer1`.
            for p in self.bert_layer1.parameters():
                p.requires_grad = False

        self.dropout0 = nn.Dropout(p=dropout_rate)
        # TextCNN head: one Conv2d per window size, each spanning the full
        # hidden dimension so it slides along the token axis only.
        channel_num = 1
        filter_num = 128
        filter_sizes = [2, 3, 4]
        pool_way = 'avg'
        self.convs = nn.ModuleList(
            [nn.Conv2d(channel_num, filter_num, (size, self.hidden_size)) for size in filter_sizes])
        self.pool_way = pool_way
        self.dropout1 = nn.Dropout(p=dropout_rate)
        self.fc = nn.Linear(len(filter_sizes) * filter_num, 1)

    @autocast()
    def forward(self,  token_ids1, attn_masks1, token_type_ids1):
        """Return one raw logit per example (sigmoid is applied by the caller).

        The method always runs under ``autocast`` because of the decorator.
        """
        vecs1 = self.bert_layer1(token_ids1, attn_masks1, token_type_ids1)
        # vecs1[0]: first element of the encoder output (per-token hidden states)
        x = self.dropout0(vecs1[0])
        # Conv2d needs a 4-D input, so add a channel dimension.
        x = x.unsqueeze(1)
        # Each conv collapses the hidden dimension to size 1; squeeze it away.
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Pool over the whole remaining length -> one value per filter.
        if self.pool_way == 'max':
            x = [F.max_pool1d(item, item.size(2)).squeeze(2) for item in x]
        elif self.pool_way == 'avg':
            x = [F.avg_pool1d(item, item.size(2)).squeeze(2) for item in x]
        x = torch.cat(x, 1)
        x = self.dropout1(x)
        x = self.fc(x)

        return x.squeeze(-1)

In [None]:
# Fall back to the CPU when CUDA is unavailable; from here on `device` is a
# torch.device rather than the string from the configuration cell.
device = torch.device("cpu" if not torch.cuda.is_available() else device)
net = BertTextCNN(
    bert_model=bert_model,
    freeze_bert=freeze_bert,
    dropout_rate=dropout_rate,
    finetune_units=finetune_units,
)
net.to(device)

In [None]:
# bert_model must be passed by keyword: the third positional parameter of
# LoadDataset is `with_labels`, so a positional call would leave the tokenizer
# on its default checkpoint instead of the configured one.
train_set = LoadDataset(df_train, maxlen, bert_model=bert_model)
val_set = LoadDataset(df_val, maxlen, bert_model=bert_model)
# Shuffle the training data every epoch; validation order does not matter.
train_loader = DataLoader(train_set, batch_size=bs, shuffle=True)
val_loader = DataLoader(val_set, batch_size=bs)

# The network emits raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
num_training_steps = epochs * len(train_loader)  # total number of optimizer steps
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
scaler = GradScaler()  # loss scaling for mixed-precision training

# np.inf instead of np.Inf: the capitalised alias was removed in NumPy 2.0.
best_loss = np.inf
best_metric = -np.inf
best_ep = 1
iters = []
train_losses = []
val_losses = []
val_metrics = []
# Early stopping is not active: es_count is never incremented and
# es_counts_MAX from the configuration cell is not consulted.
es_count = 0
for ep in range(epochs):
    # val_lossF leaves the network in eval mode, so re-enable training mode
    # at the start of every epoch.
    net.train()
    running_loss = 0.0
    for token_ids1, attn_masks1, token_type_ids1, labels in tqdm(train_loader):
        token_ids1, attn_masks1, token_type_ids1 = token_ids1.to(device), attn_masks1.to(device), token_type_ids1.to(device)
        labels = labels.to(device)
        opti.zero_grad()
        with autocast():
            output = net(token_ids1, attn_masks1, token_type_ids1)
            loss = criterion(output, labels.float())
        scaler.scale(loss).backward()
        scaler.step(opti)
        scaler.update()
        lr_scheduler.step()
        # Accumulate a plain float so no autograd graph or GPU tensor is kept.
        running_loss += loss.item()

    # Mean training loss over the epoch, as a Python float that can be
    # printed and plotted directly.
    train_loss = running_loss / max(len(train_loader), 1)
    val_loss, val_metric = val_lossF(net, device, criterion, val_loader)
    print("Epoch {} complete! Train Loss : {} , Validation Loss : {} , Validation Metric - Accuracy : {} ".format(ep+1, train_loss, val_loss, val_metric))
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    val_metrics.append(val_metric)

    # Checkpoint whenever validation accuracy improves; path_to_model always
    # names the most recent (i.e. best) checkpoint for the inference cell.
    if val_metric > best_metric:
        print("Best validation metric improved from {} to {}".format(best_metric, val_metric))
        best_metric = val_metric
        best_ep = ep + 1
        path_to_model='ep_{}_val_metric_{}.pt'.format(best_ep, round(best_metric, 4))
        # state_dict() can be saved directly; no deep copy of the model needed.
        torch.save(net.state_dict(), MODEL_SAVE_PATH + '/' + path_to_model)
        print("The model has been saved in {}".format(path_to_model))

# Drop the last loss tensor and release cached GPU memory.
del loss
torch.cuda.empty_cache()

In [None]:
# float() turns every recorded training loss into a plain number; it also
# unwraps single-element torch tensors, which matplotlib cannot plot directly.
train_curve = [float(value) for value in train_losses]
# One x position per recorded epoch, numbered from 1 like the training log.
epoch_axis = range(1, len(train_curve) + 1)

fig, ax = plt.subplots()
# Each series is drawn once, so the legend shows the style actually plotted.
ax.plot(epoch_axis, train_curve, 'bo-', label='train_loss')
ax.plot(epoch_axis, val_losses, 'r+-', label='validation_loss')
ax.plot(epoch_axis, val_metrics, 'g^-', label='validation_metric')
ax.set_title('Loss')
ax.set_xlabel('epoch')
ax.set_ylabel('loss & metric')
ax.legend()
plt.show()
print('train loss = ', train_curve)
print('val loss = ', val_losses)
print('val metric = ', val_metrics)


In [None]:
# Rebuild the architecture and restore the checkpoint named by path_to_model
# (the last one written by the training loop).
net = BertTextCNN(dropout_rate=dropout_rate, finetune_units=finetune_units,bert_model=bert_model)
# map_location lets a checkpoint saved from GPU tensors load on whatever
# `device` is now, including a CPU-only runtime.
net.load_state_dict(torch.load(MODEL_SAVE_PATH + '/' + path_to_model, map_location=device))
net.to(device)

# The test split has no labels, so the dataset yields three tensors per item.
test_set = LoadDataset(test, maxlen, with_labels=False, bert_model = bert_model)
test_loader = DataLoader(test_set, batch_size=bs)

net.eval()
results = []
with torch.no_grad():
    for token_ids1, attn_masks1, token_type_ids1 in tqdm(test_loader):
        token_ids1, attn_masks1, token_type_ids1 = token_ids1.to(device), attn_masks1.to(device), token_type_ids1.to(device)
        output = net(token_ids1, attn_masks1, token_type_ids1)
        # logits -> probabilities -> hard 0/1 predictions at the `thres` cut-off
        output = output.sigmoid().cpu().numpy()
        output = np.where(output>thres, 1, 0)
        results += output.tolist()

# The loader is not shuffled, so predictions line up with the rows of `test`.
test['similarity'] = results

In [None]:
# Save the predictions inside the model folder, named after the dataset:
# <MODEL_SAVE_PATH>/<dataset>_preds.csv. The variable dataset_path is used
# here, not the literal text 'dataset_path'.
test.to_csv(MODEL_SAVE_PATH + dataset_path + '_preds.csv')
test.head()