In [2]:
import os
import torch
from tqdm.auto import tqdm
import warnings
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import pandas as pd

# 忽略特定的警告
warnings.filterwarnings("ignore", message="Be aware, overflowing tokens are not returned*")

# 加载数据
new_file_path = 'dataprocessed_10_to_13.csv'
df = pd.read_csv(new_file_path, encoding='latin-1')
df = df[['bug_id', 'summary', 'who','description']]
label_dict = {label: idx for idx, label in enumerate(df['who'].unique())}
df['label'] = df['who'].replace(label_dict)

  df['label'] = df['who'].replace(label_dict)


In [17]:


# 合并bug_id和summary作为模型的输入
df['text_input'] = df['bug_id'].astype(str) + " " + df['summary'] +" "+ df['description']  # 使用空格作为分隔符

X_train, X_val, y_train, y_val = train_test_split(df.index.values, df.label.values, test_size=0.15, random_state=42, stratify=df.label.values)
df['data_type'] = ['not_set']*df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# 对训练和验证数据的合并文本进行编码
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].text_input.values,  # 使用合并后的文本
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=512, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text_input.values,  # 使用合并后的文本
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=512, 
    return_tensors='pt'
)

# 准备Tensor数据
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

# 定义DataLoader
batch_size = 2
train_loader = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
val_loader = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=32)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [18]:
# 初始化模型
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict), output_attentions=False, output_hidden_states=False)
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
# 加载模型
# 计算层数
num_transformer_layers = len(model.bert.encoder.layer)
print(f'The BERT model has {num_transformer_layers} transformer layers.')
print(model)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The BERT model has 12 transformer layers.
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             



In [1]:
import pandas as pd
from transformers import BertTokenizer

# 加载数据集
df = pd.read_csv('filtered_bug_raw_10_to_13.csv', encoding='latin-1')
df['text_input'] = df['bug_id'].astype(str) + " " + df['summary']+df['description']

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# 对所有合并后的文本进行编码
encoded_inputs = tokenizer.batch_encode_plus(df['text_input'].tolist(), add_special_tokens=True, truncation=False, padding=False)

# 计算所有编码后的长度
lengths = [len(input_ids) for input_ids in encoded_inputs['input_ids']]

# 计算平均长度、中位数、最大和最小长度
average_length = sum(lengths) / len(lengths)
median_length = sorted(lengths)[len(lengths) // 2]
max_length = max(lengths)
min_length = min(lengths)

print(f"Average length: {average_length}")
print(f"Median length: {median_length}")
print(f"Max length: {max_length}")
print(f"Min length: {min_length}")


Average length: 110.25808444902162
Median length: 86
Max length: 512
Min length: 18


In [20]:
# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

checkpoint_path = 'model_checkpoint_bert3.pth'

# 检查是否有可用的检查点
if os.path.isfile(checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    print(f'Resuming training from epoch {start_epoch}')
else:
    start_epoch = 0
    print('Starting training from scratch')



Starting training from scratch


In [21]:


# 训练和验证循环
num_epochs = 130  # 这里设置一个小一点的数，以便于测试，您可以根据需要调整
for epoch in range(start_epoch, num_epochs):
    model.train()
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix(loss=loss.item())

    torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, checkpoint_path)
    model.eval()
    correct = 0
    total = 0
    val_progress_bar = tqdm(val_loader, desc="Validating")

    for batch in val_progress_bar:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        logits = outputs[0]
        _, predicted = torch.max(logits, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Accuracy after epoch {epoch + 1}: {accuracy:.2f}%')


Epoch 1:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 1: 2.59%


Epoch 2:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 2: 17.24%


Epoch 3:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 3: 37.93%


Epoch 4:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 4: 63.79%


Epoch 5:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 5: 76.72%


Epoch 6:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 6: 82.76%


Epoch 7:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 7: 85.34%


Epoch 8:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 8: 89.66%


Epoch 9:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 9: 87.93%


Epoch 10:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 10: 88.79%


Epoch 11:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 11: 88.79%


Epoch 12:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 12: 90.52%


Epoch 13:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 13: 88.79%


Epoch 14:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 14: 89.66%


Epoch 15:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 15: 89.66%


Epoch 16:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 16: 90.52%


Epoch 17:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 17: 88.79%


Epoch 18:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 18: 87.93%


Epoch 19:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 19: 89.66%


Epoch 20:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 20: 89.66%


Epoch 21:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 21: 89.66%


Epoch 22:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 22: 89.66%


Epoch 23:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 23: 89.66%


Epoch 24:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 24: 89.66%


Epoch 25:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 25: 89.66%


Epoch 26:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 26: 89.66%


Epoch 27:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 27: 89.66%


Epoch 28:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 28: 89.66%


Epoch 29:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 29: 89.66%


Epoch 30:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 30: 89.66%


Epoch 31:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 31: 89.66%


Epoch 32:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 32: 89.66%


Epoch 33:   0%|          | 0/327 [00:00<?, ?it/s]

Validating:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy after epoch 33: 89.66%


Epoch 34:   0%|          | 0/327 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [14]:

# 冻结所有预训练层的参数
for param in model.bert.parameters():
    param.requires_grad = False

num_epochs = 113  # 训练周期数
num_layers = len(model.bert.encoder.layer)  # BERT模型的层数
layers_unfrozen = 0  # 已解冻的层数

base_lr = 1e-5  # 基础学习率
unfrozen_lr = 2e-5  # 新解冻层的学习率

# 初始化优化器
optimizer = AdamW(model.parameters(), lr=base_lr, eps=1e-8)

for epoch in range(num_epochs):
    if epoch % 10 == 0 and layers_unfrozen < num_layers:
        # 解冻下一层
        layer = model.bert.encoder.layer[layers_unfrozen]
        for param in layer.parameters():
            param.requires_grad = True
        
        layers_unfrozen += 1
        print(f'Unfrozen layer {layers_unfrozen} at epoch {epoch+1}')
        
        # 更新优化器参数组，为新解冻的层使用更高的学习率
        optimizer_grouped_parameters = [
            {'params': [p for p in model.bert.parameters() if p.requires_grad], 'lr': unfrozen_lr},
            {'params': [p for p in model.classifier.parameters() if p.requires_grad], 'lr': base_lr}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, eps=1e-8)
    
    # 训练逻辑
    model.train()
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix(loss=loss.item())
    
    # 验证逻辑
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, predicted = torch.max(logits, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print(f'Accuracy after epoch {epoch + 1}: {accuracy:.2f}%')
    
    # 保存检查点
    torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, checkpoint_path)


Unfrozen layer 1 at epoch 1




Epoch 1:   0%|          | 0/327 [00:00<?, ?it/s]

RuntimeError: The expanded size of the tensor (800) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [2, 800].  Tensor sizes: [1, 512]