In [238]:
!pip install transformers



In [239]:
import numpy as np
import pandas as pd
import random
import time
import re
import os
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import transformers as tfs
from transformers import AutoModel, BertTokenizerFast, RobertaTokenizer, RobertaModel
from transformers import AdamW
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from sklearn.utils import shuffle




In [240]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

SEED = 1234
batch_size = 256
epochs = 1

DROPOUT = 0.5
ALPHA = 0.3
GAMMA = 2
TEMP_1 = 0.12
TEMP_2 = 0.05

#train_path = "./drive/MyDrive/DCL/preprocess_train.csv"
#dev_path = "./drive/MyDrive/DCL/preprocess_dev.csv"
#test_path = "./drive/MyDrive/DCL/preprocess_test.csv"

# train_path = "./SemEval_Task5/relabel_train.csv"
# dev_path = "./SemEval_Task5/relabel_dev.csv"
# test_path = "./SemEval_Task5/relabel_test.csv"

#train_path = "./drive/MyDrive/hateval/hateval2019_en_train_preprocessed.csv"
#dev_path = "./drive/MyDrive/hateval/hateval2019_en_dev_preprocessed.csv"
#test_path = "./drive/MyDrive/hateval/hateval2019_en_test_preprocessed.csv"

#train_path = "./drive/MyDrive/davidson_dataset/davidson_preprocessed_train.csv"
#dev_path = "./drive/MyDrive/davidson_dataset/davidson_preprocessed_dev.csv"
#test_path = "./drive/MyDrive/davidson_dataset/davidson_preprocessed_test.csv"
train_path = "./drive/MyDrive/HatEval/preprocess_train.csv"
dev_path = "./drive/MyDrive/HatEval/preprocess_dev.csv"
test_path = "./drive/MyDrive/HatEval/preprocess_test.csv"

In [241]:
def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
setup_seed(SEED)

def sen_lable(sen_score):
    if sen_score < 0:
        return -1
    elif sen_score > 0:
        return 1
    else:
        return 0

# 小写字母
def lower_text(text):
    if type(text) != str:
        return text
    return (text.lower())

# 数据集
def build_dataset(path, batch_size=64, if_train=False, if_test=False):
    train_df_ = pd.read_csv(path)
    train_df_['label'] = train_df_['label'].astype('int')
    # train_df_['HS'] = train_df_['HS'].astype('int')
    train_df_ = train_df_.dropna()
    if not if_test:
        train_df_ = shuffle(train_df_)
    text = train_df_["tweet"].values
    label = train_df_["label"].values
    #if_poison = train_df_["if_poison"].values

    batch_train_inputs, batch_train_targets = [], []
    # 如果使用对比损失，batch_size折半，生成batch_size个新样本
    if if_train:
        batch_size = int(batch_size/2)
    batch_count = int(round(len(text) / batch_size))
    #batch_count = int(math.ceil(len(text) / batch_size))
    for i in range(batch_count):
        if (len(text) > (i+1)*batch_size):
            batch_train_inputs.append(text[i*batch_size : (i+1)*batch_size])
            batch_train_targets.append(label[i*batch_size : (i+1)*batch_size])
        else:
            batch_train_inputs.append(text[i*batch_size:])
            batch_train_targets.append(label[i*batch_size:])

    return batch_count, [batch_train_inputs, batch_train_targets]

def copy_label(labels):
    labels = labels.unsqueeze(1)
    labels = torch.cat((labels, labels), dim=1).reshape(-1, 1).squeeze(1)
    return labels



In [242]:
train_batch, train_data = build_dataset(train_path, batch_size, if_train=True)
val_batch, val_data = build_dataset(dev_path, batch_size)
test_batch, test_data = build_dataset(test_path, batch_size, if_test=True)

# import BERT-base pretrained model and tokenizer
#bert = AutoModel.from_pretrained('bert-base-cased')
#tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
# import roberta
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# bert = RobertaModel.from_pretrained('roberta-base')

In [243]:
# model
class BERT_Arch(nn.Module):
    def __init__(self):
        super(BERT_Arch, self).__init__()
        model_class, tokenizer_class, pretrained_weights = (tfs.BertModel, tfs.BertTokenizer, 'bert-base-cased')
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        self.bert = model_class.from_pretrained(pretrained_weights)
        self.dim = 768
        self.dense = nn.Linear(self.dim, 1)
        self.dropout = nn.Dropout(0.5)
        #self.dropout_2 = nn.Dropout(0.7)

    def forward(self, batch_sentences, if_train=False):
        batch_tokenized = self.tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=True,
                                max_length=45, padding='max_length', truncation=True)      #tokenize、add special token、pad
        input_ids = torch.tensor(batch_tokenized['input_ids']).to(device)
        attention_mask = torch.tensor(batch_tokenized['attention_mask']).to(device)

        bert_output = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)
        #bert_cls_hidden_state = bert_output[0][:,0,:]   #提取[CLS]对应的隐藏状态
        bert_cls_hidden_state = bert_output[1]  #提取pooled后的句向量
        torch.cuda.empty_cache()

        # hidden_states = bert_output[2][-1]
        # bert_cls_hidden_state = torch.mean(hidden_states, 1)

        # 如果使用无监督对比学习，利用dropout生成句向量的正样本，样本总量*2
        if if_train:
            bert_cls_hidden_state_copy = self.dropout(bert_cls_hidden_state)
            bert_cls_hidden_state = torch.cat((bert_cls_hidden_state, bert_cls_hidden_state_copy), dim=1).reshape(-1, self.dim)
        else:
            bert_cls_hidden_state = self.dropout(bert_cls_hidden_state)

        linear_output = self.dense(bert_cls_hidden_state)
        linear_output = linear_output.squeeze(1)

        return bert_cls_hidden_state, linear_output

In [244]:
# Focal Loss
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.4, gamma=2, size_average=True):
        super(FocalLoss, self).__init__()
        self.alpha = torch.tensor(alpha)
        self.gamma = gamma
        self.size_average = size_average

    def forward(self, pred, target):

        device = target.device
        self.alpha = self.alpha.to(device)

        pred = nn.Sigmoid()(pred)
        pred = pred.view(-1, 1)
        target = target.view(-1, 1)
        pred = torch.cat((1-pred, pred), dim=1)

        class_mask = torch.zeros(pred.shape[0], pred.shape[1]).to(device)
        class_mask.scatter_(1, target.view(-1, 1).long(), 1.)
        probs = (pred * class_mask).sum(dim=1).view(-1, 1)
        probs = probs.clamp(min=0.0001, max=1.0)

        log_p = probs.log()
        alpha = torch.ones(pred.shape[0], pred.shape[1]).to(device)
        alpha[:, 0] = alpha[:, 0] * (1 - self.alpha)
        alpha[:, 1] = alpha[:, 1] * self.alpha
        alpha = (alpha * class_mask).sum(dim=1).view(-1, 1)

        batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p

        if self.size_average:
            loss = batch_loss.mean()
        else:
            loss = batch_loss.sum()

        return loss

In [245]:
# Unsupervised Contrastive loss
def simcse_loss(batch_emb):
    # 构造标签
    batch_size = batch_emb.size(0)
    y_true = torch.cat([torch.arange(1, batch_size, step=2, dtype=torch.long).unsqueeze(1),
                        torch.arange(0, batch_size, step=2, dtype=torch.long).unsqueeze(1)],
                       dim=1).reshape([batch_size,]).to(device)

    # 计算score和loss
    norm_emb = F.normalize(batch_emb, dim=1, p=2)
    sim_score = torch.matmul(norm_emb, norm_emb.transpose(0,1))  # 句向量点积
    sim_score = sim_score - (torch.eye(batch_size) * 1e12).to(device)
    sim_score = sim_score / TEMP_1  # 温度系数为 0.05，也就是乘以20

    loss_func = nn.CrossEntropyLoss()
    loss = loss_func(sim_score, y_true)

    return loss

# Supervised Contrastive loss
def sup_simcse_loss(batch_emb, label):
    n = batch_emb.size(0)

    similarity_matrix = F.cosine_similarity(batch_emb.unsqueeze(1), batch_emb.unsqueeze(0), dim=2)
    mask = torch.ones_like(similarity_matrix) * (label.expand(n, n).eq(label.expand(n, n).t()))

    mask_no_sim = torch.ones_like(mask) - mask
    mask_dui_jiao_0 = ((torch.ones(n,n) - torch.eye(n,n)) * 1e12).to(device)
    similarity_matrix = torch.exp(similarity_matrix/TEMP_2)
    #print(similarity_matrix)
    similarity_matrix = similarity_matrix * mask_dui_jiao_0

    sim = mask*similarity_matrix
    no_sim = similarity_matrix - sim
    no_sim_sum = torch.sum(no_sim , dim=1)
    no_sim_sum_expend = no_sim_sum.repeat(n, 1).T

    sim_sum  = sim + no_sim_sum_expend
    loss = torch.div(sim , sim_sum)
    loss = mask_no_sim + loss + (torch.eye(n, n)/1e12).to(device)
    loss = -torch.log(loss)
    loss = torch.sum(torch.sum(loss, dim=1))/(2*n)
    #loss = loss/100
    return loss

model = BERT_Arch().to(device)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(),lr = 1e-4)
#optimizer = optim.SGD(model.parameters(), lr = 1e-1)

#criteon = nn.NLLLoss(weight=weights)
#criteon = nn.BCEWithLogitsLoss()
##criteon = nn.NLLLoss()
criteon = FocalLoss(ALPHA, GAMMA)

In [246]:
def binary_acc(preds, y):
    preds = torch.round(torch.sigmoid(preds))
    correct = torch.eq(preds, y).float()
    acc = correct.sum() / len(correct)
    return preds, acc

# def pred_lable(preds):
#     _, preds = torch.max(preds, 1)
#     pred_lables = []
#     return pred_lables

# 训练函数
def train(model, batch_count, batch_data, optimizer, criteon):
    batch_train_inputs, batch_train_targets = batch_data[0], batch_data[1]
    avg_loss = []
    avg_acc = []
    model.train()
    step = 0

    for i in tqdm(range(batch_count)):
        inputs = batch_train_inputs[i]
        labels = torch.tensor(batch_train_targets[i]).to(device)
        labels = copy_label(labels)
        # print(inputs[:20])
        # break
        emb, pred = model(inputs, if_train=True)
        #emb, pred = model(inputs)

        loss = criteon(pred, labels.float())
        loss_sim = simcse_loss(emb)
        loss_supsim = sup_simcse_loss(emb, labels)

        _, acc = binary_acc(pred, labels)
        avg_loss.append(loss.item())
        avg_acc.append(acc.item())

        loss = loss + (loss_sim + loss_supsim)
        #loss = loss + loss_supsim
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step += 1

    avg_loss = np.array(avg_loss).mean()
    avg_acc = np.array(avg_acc).mean()
    return avg_loss, avg_acc

# 评估函数
def eval(model, batch_count, batch_data, criteon):
    batch_train_inputs, batch_train_targets = batch_data[0], batch_data[1]
    avg_loss = []
    avg_acc = []
    total_labels = []
    total_preds = []
    model.eval()

    with torch.no_grad():
        for i in tqdm(range(batch_count)):
            inputs = batch_train_inputs[i]
            labels = torch.tensor(batch_train_targets[i]).to(device)
            emb, pred = model(inputs, False)

            loss = criteon(pred, labels.float())
            preds, acc = binary_acc(pred, labels)
            avg_loss.append(loss.item())
            avg_acc.append(acc.item())

            pred_lables = preds.detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()
            total_preds.extend(pred_lables)
            total_labels.extend(labels)

    print(total_labels[100:120])
    print(total_preds[100:120])
    avg_loss = np.array(avg_loss).mean()
    avg_acc = np.array(avg_acc).mean()
    print(classification_report(total_labels, total_preds, digits = 4))
    return avg_loss, avg_acc

In [247]:
best_valid_acc = float('-inf')

for epoch in range(epochs):
    start_time = time.time()

    train_loss, train_acc = train(model, train_batch, train_data, optimizer, criteon)
    dev_loss, dev_acc = eval(model, val_batch, val_data, criteon)

    end_time = time.time()

    epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

    if dev_acc > best_valid_acc:          #只要模型效果变好，就保存
        best_valid_acc = dev_acc
        torch.save(model.state_dict(), 'wordavg-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs:.2f}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val Loss: {dev_loss:.3f} |  Val Acc: {dev_acc*100:.2f}%')

#用保存的模型参数预测数据
model.load_state_dict(torch.load("wordavg-model.pt"))
test_loss, test_acc = eval(model, test_batch, test_data, criteon)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

# from transformers_interpret import SequenceClassificationExplainer
# cls_explainer = SequenceClassificationExplainer(
#     model,
#     tokenizer)
# word_attributions = cls_explainer("I love you, I like you")
# print(word_attributions)

100%|██████████| 70/70 [01:13<00:00,  1.05s/it]
100%|██████████| 4/4 [00:02<00:00,  1.35it/s]


[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0]
[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
              precision    recall  f1-score   support

           0     0.6971    0.9213    0.7937       572
           1     0.8148    0.4637    0.5910       427

    accuracy                         0.7257       999
   macro avg     0.7560    0.6925    0.6924       999
weighted avg     0.7474    0.7257    0.7071       999

Epoch: 01 | Epoch Time: 1.0m 16.72s
	Train Loss: 0.081 | Train Acc: 60.02%
	 Val Loss: 0.059 |  Val Acc: 72.66%


100%|██████████| 11/11 [00:08<00:00,  1.26it/s]

[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1]
[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0]
              precision    recall  f1-score   support

           0     0.7509    0.6751    0.7110      1625
           1     0.6071    0.6915    0.6466      1180

    accuracy                         0.6820      2805
   macro avg     0.6790    0.6833    0.6788      2805
weighted avg     0.6904    0.6820    0.6839      2805

Test Loss: 0.077 |  Test Acc: 68.21%



