In [None]:
# Install the Hugging Face `transformers` package imported by the cells below.
!pip install transformers

# 微调分类器

## 加载数据集

In [2]:
# Mount Google Drive at /content/drive; the dataset CSVs and the saved
# checkpoint used later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import torch
import transformers
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split

# Seed torch before the model is created so the randomly initialised
# classification head and the DataLoader shuffling are repeatable.
SEED = 42
torch.manual_seed(SEED)

# Select the compute device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Load the pretrained tokenizer and model (each loaded exactly once)
model_name = 'bert-base-uncased'  # name of the pretrained checkpoint
tokenizer = transformers.BertTokenizer.from_pretrained(model_name)
model = transformers.BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # two target classes

# Training hyper-parameters.
# torch.optim.AdamW replaces the deprecated transformers.AdamW;
# weight_decay=0.0 keeps the default the old optimizer used.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
epochs = 100
batch_size = 32

# Load the dataset
remote = True
columns = ['RequirementText', 'NFR']
if remote:
    data_dir = '/content/drive/MyDrive/ModelDebug/classifier'
    os.chdir(data_dir)
    print(os.getcwd())
    df = pd.read_csv(os.path.join(data_dir, 'textNFR.csv'), usecols=columns)
    # Additional datasets; loaded for experiments but not part of the default split below.
    df1 = pd.read_csv(os.path.join(data_dir, 'new_dataset1.csv'), usecols=columns)
    df2 = pd.read_csv(os.path.join(data_dir, 'new_dataset2.csv'), usecols=columns)
    df3 = pd.read_csv(os.path.join(data_dir, 'new_dataset3.csv'), usecols=columns)
else:
    os.chdir('../classifier')
    df = pd.read_csv('textNFR.csv', usecols=columns)

# A fixed random_state makes the 80/20 split identical on every run.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=SEED)


def _tokenize(frame):
    """Tokenize the RequirementText column of `frame`, padded to the longest row and truncated at 512 tokens."""
    return tokenizer(
        frame["RequirementText"].tolist(),
        padding=True,
        truncation=True,
        max_length=512,
    )


def _to_dataset(encoded, frame):
    """Bundle input ids, attention masks and the NFR labels of `frame` into a TensorDataset."""
    return torch.utils.data.TensorDataset(
        torch.tensor(encoded["input_ids"]),
        torch.tensor(encoded["attention_mask"]),
        torch.tensor(frame["NFR"].tolist()),
    )


train_tokenized = _tokenize(train_data)
test_tokenized = _tokenize(test_data)

train_dataset = _to_dataset(train_tokenized, train_data)
test_dataset = _to_dataset(test_tokenized, test_data)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation does not depend on sample order, so the test loader is not shuffled.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

/content/drive/MyDrive/ModelDebug/classifier


## 提前停止

In [4]:
class EarlyStopping:
    """Signal that training should stop once the validation loss stops improving.

    Call the instance with the current validation loss after every epoch and
    check ``early_stop`` afterwards.

    Args:
        patience (int): Number of consecutive non-improving calls tolerated
            before ``early_stop`` becomes True. Default: 5.
        verbose (bool): If True, prints the patience counter every time the
            loss fails to improve. Default: False.
        delta (float): Minimum decrease of the loss (relative to the best loss
            seen so far) that counts as an improvement. Default: 0.

    Attributes:
        counter (int): Consecutive calls without improvement.
        best_score (float | None): Negated best loss seen so far.
        early_stop (bool): True once ``counter`` has reached ``patience``.
        val_loss_min (float): Lowest loss recorded so far.
    """

    def __init__(self, patience=5, verbose=False, delta=0):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # float("inf") instead of np.Inf: the latter was removed in NumPy 2.0.
        self.val_loss_min = float("inf")

    def __call__(self, val_loss):
        # Work on the negated loss so that "larger is better".
        score = -val_loss

        improved = self.best_score is None or score >= self.best_score + self.delta
        if improved:
            # Also record the very first loss as the current minimum.
            self.best_score = score
            self.val_loss_min = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.verbose:
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
        if self.counter >= self.patience:
            self.early_stop = True


early_stopping = EarlyStopping(patience=5, verbose=True)

## train

In [None]:
from sklearn.metrics import recall_score, f1_score


def evaluate(model, loader, device):
    """Run `model` over every batch of `loader` in eval mode without gradients.

    Each batch is expected to be an (input_ids, attention_mask, labels) triple.

    Returns:
        tuple: (mean per-batch loss, list of predicted class ids, list of true labels).
    """
    model.eval()  # disable dropout so the metrics are deterministic
    loss_sum = 0.0
    predictions = []
    targets = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss_sum += outputs.loss.item()
            predictions.extend(outputs.logits.argmax(dim=1).cpu().tolist())
            targets.extend(labels.cpu().tolist())
    return loss_sum / len(loader), predictions, targets


model.to(device)

# Set to True to stop once the test loss has not improved for `patience` epochs.
USE_EARLY_STOPPING = False
if USE_EARLY_STOPPING:
    # Fresh instance so re-running this cell does not inherit an old counter.
    early_stopping = EarlyStopping(patience=5, verbose=True)

# Per-epoch recall, accuracy and F1 history
train_recall_list = []
test_recall_list = []
test_accuracy_list = []
test_f1_list = []

for epoch in range(epochs):
    # ---- optimisation pass ----
    model.train()
    train_loss = 0
    for batch in train_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    train_loss = train_loss / len(train_loader)

    # ---- training-set recall (measured in eval mode, i.e. without dropout) ----
    _, train_preds, train_labels = evaluate(model, train_loader, device)
    train_recall = recall_score(train_labels, train_preds, average='macro')
    train_recall_list.append(train_recall)

    # ---- test-set loss, accuracy, recall and F1 from a single pass ----
    test_loss, test_preds, test_labels = evaluate(model, test_loader, device)
    correct = sum(p == t for p, t in zip(test_preds, test_labels))
    accuracy = correct / len(test_labels)
    test_recall = recall_score(test_labels, test_preds, average='macro')
    test_f1 = f1_score(test_labels, test_preds, average='macro')
    test_recall_list.append(test_recall)
    test_accuracy_list.append(accuracy)
    test_f1_list.append(test_f1)

    print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f} | Test Loss: {test_loss:.4f} | Test Accuracy: {accuracy:.4f} | Test Recall: {test_recall:.4f} | Test F1 Score: {test_f1:.4f}")

    if USE_EARLY_STOPPING:
        early_stopping(test_loss)
        if early_stopping.early_stop:
            print("Early stopping")
            break

# Report the metrics averaged over all completed epochs (skipped when no epoch ran).
if test_accuracy_list:
    print(f"Average Test Accuracy: {sum(test_accuracy_list)/len(test_accuracy_list):.4f} | Average Test Recall: {sum(test_recall_list)/len(test_recall_list):.4f} | Average Test F1: {sum(test_f1_list)/len(test_f1_list):.4f}")


## baseline

## debugged model


In [10]:
# Persist the fine-tuned model weights together with the optimizer state.
checkpoint_path = (
    '/content/drive/MyDrive/ModelDebug/classifier/baseline.pt'
    if remote
    else 'model.pt'
)
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}
torch.save(checkpoint, checkpoint_path)

# 模型优化

In [15]:
# Reload the saved baseline checkpoint (a dict with 'model_state_dict' and
# 'optimizer_state_dict'). map_location keeps the load working on a CPU-only
# runtime even though the checkpoint was written from a CUDA model.
baseline = torch.load('./baseline.pt', map_location=device)

# 问题
## 过拟合
模型从第一个 epoch 起就出现了过拟合：train loss = 0.003，而 test loss = 0.6。
这说明数据集规模太小，可以通过我们的数据增强方法生成新的数据来缓解过拟合现象。
# 实验数据
## epoch=200,batchsize=4,no earlystopping,new dataset
Average Test Accuracy: 0.8991 | Average Test Recall: 0.8999 | Average Test F1: 0.8978
## epoch=200,batchsize=32,earlystopping,new dataset
Average Test Accuracy: 0.8207 | Average Test Recall: 0.8057 | Average Test F1: 0.8023
## epoch=200,batchsize=32,earlystopping, old dataset
Average Test Accuracy: 0.8967 | Average Test Recall: 0.8907 | Average Test F1: 0.8941
## epoch=200,batchsize=32,earlystopping, old dataset+new dataset
Average Test Accuracy: 0.9107 | Average Test Recall: 0.9063 | Average Test F1: 0.9080