In [5]:
import os
import torch
from tqdm.auto import tqdm
import warnings
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import pandas as pd

# 忽略特定的警告
warnings.filterwarnings("ignore", message="Be aware, overflowing tokens are not returned*")

# 加载新的数据集
new_file_path = './dataset/Eclipse_morethan10_processed_again.csv'
# 指定需要提取的列
columns_to_extract = ['bug_id', 'product', 'abstracts', 'description', 'component', 'severity', 'priority', 'developer',  'status']
# columns_to_extract = [ 'description', 'developer']
df = pd.read_csv(new_file_path, usecols=columns_to_extract, encoding='latin-1')

# 将developer列作为标签
label_dict = {label: idx for idx, label in enumerate(df['developer'].unique())}
df['label'] = df['developer'].replace(label_dict)

  df['label'] = df['developer'].replace(label_dict)


In [6]:
# 合并文本信息为模型的输入，除了developer列
# df['text_input'] = df[['bug_id', 'product', 'abstracts', 'description', 'component', 'severity', 'priority',  'status']].astype(str).agg(' '.join, axis=1)
df['text_input'] =   df['abstracts'].astype(str)+ " " + df['description'].astype(str)  # 使用空格作为分隔符

# df['text_input'] = df[[ 'product',   'component', 'severity', 'priority',  'status','description']].astype(str).agg(' '.join, axis=1)
X_train, X_val, y_train, y_val = train_test_split(df.index.values, df.label.values, test_size=0.15, random_state=42, stratify=df.label.values)
df['data_type'] = ['not_set']*df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


In [7]:

# 对训练和验证数据的合并文本进行编码
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].text_input.values,  # 使用合并后的文本
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length',  # 更新pad_to_max_length为padding
    max_length=512, 
    truncation=True,  # 明确启用截断
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text_input.values,  # 使用合并后的文本
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length',  # 更新pad_to_max_length为padding
    max_length=512, 
    truncation=True,  # 明确启用截断
    return_tensors='pt'
)

# 准备Tensor数据
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

# 定义DataLoader
batch_size = 4
train_loader = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
val_loader = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=32)

In [8]:
# # 初始化模型
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict), output_attentions=False, output_hidden_states=False)
# optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
# # 加载模型
# # 计算层数
# num_transformer_layers = len(model.bert.encoder.layer)
# print(f'The BERT model has {num_transformer_layers} transformer layers.')
# print(model)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict), output_attentions=False, output_hidden_states=False)
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
# 加载模型
# 计算层数
num_transformer_layers = len(model.bert.encoder.layer)
print(f'The BERT model has {num_transformer_layers} transformer layers.')
print(model)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The BERT model has 12 transformer layers.
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [9]:
# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

checkpoint_path = 'model_checkpoint__eclipseprocessed_again_withabstract+description_normal.pth'

# 检查是否有可用的检查点
if os.path.isfile(checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    print(f'Resuming training from epoch {start_epoch}')
else:
    start_epoch = 0
    print('Starting training from scratch')



Resuming training from epoch 2


In [10]:
import pymysql
from datetime import datetime

# 数据库连接信息
host = '38.147.173.234'
user = 'root'
password = '123456'
db = 'training_statistics_db'

# 模型名称，根据实际情况手动设置
model_name = 'bert-base-cased'
# 学习率和可选特性，根据实际情况手动设置
learning_rate = 1e-5  # 示例学习率
optional_feature = 'abstract+descrition'  # 示例可选特性
dataset = new_file_path
num_epochs=15

for epoch in range(start_epoch, num_epochs):
    model.train()
    start_time = datetime.now()
    # ...训练过程...
    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()
        
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix(loss=loss.item())
    torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, checkpoint_path)
    model.eval()
    correct_topk = {k: 0 for k in range(1, 11)}
    total = 0
    val_progress_bar = tqdm(val_loader, desc="Validating")
    
    for batch in val_progress_bar:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs[0]
        total += labels.size(0)
        
        # 计算top1到top10的正确率
        _, predicted_topk = torch.topk(logits, k=10, dim=1)
        labels_expanded = labels.unsqueeze(1)
        for k in range(1, 11):
            correct_topk[k] += (predicted_topk[:, :k] == labels_expanded).any(dim=1).sum().item()
                
    # 打印每个topK的准确率
    top10accuracy = []  # 初始化存储Top1到Top10准确率的数组

    for k in range(1, 11):
        accuracy = 100 * correct_topk[k] / total
        top10accuracy.append(accuracy)  # 将计算出的准确率添加到数组中
        print(f'Accuracy after epoch {epoch + 1}: Top{k}: {accuracy:.2f}%')
        print(top10accuracy)
    import pandas as pd
    import os

    # 定义数据字典，用于创建DataFrame
    data = {
            'epoch': [epoch],
            'start_time': [start_time],
            'end_time': [end_time],
            'duration': [duration],
            'user_id': [1],
            'model': [model_name],
            'top1_accuracy': [top10accuracy[0]],
            'top2_accuracy': [top10accuracy[1]],
            'top3_accuracy': [top10accuracy[2]],
            'top4_accuracy': [top10accuracy[3]],
            'top5_accuracy': [top10accuracy[4]],
            'top6_accuracy': [top10accuracy[5]],
            'top7_accuracy': [top10accuracy[6]],
            'top8_accuracy': [top10accuracy[7]],
            'top9_accuracy': [top10accuracy[8]],
            'top10_accuracy': [top10accuracy[9]],
            'optional_feature': [optional_feature],
            'learning_rate': [learning_rate],
            'dataset': [dataset]
    }

        # 创建DataFrame
    df = pd.DataFrame(data)

        # 检查train.csv文件是否存在来决定是否添加表头
    file_exists = os.path.isfile('train.csv')

        # 如果文件存在，不写入表头，模式为追加；如果文件不存在，写入表头，模式为写入
    df.to_csv('train.csv', mode='a', header=not file_exists, index=False)

    print(f'Epoch {epoch + 1} training data inserted into train.csv.')

Epoch 3:   0%|          | 0/8599 [00:00<?, ?it/s]

Validating:   0%|          | 0/190 [00:00<?, ?it/s]

Accuracy after epoch 3: Top1: 39.49%
[39.489291598023065]
Accuracy after epoch 3: Top2: 53.82%
[39.489291598023065, 53.82207578253707]
Accuracy after epoch 3: Top3: 62.03%
[39.489291598023065, 53.82207578253707, 62.026359143327845]
Accuracy after epoch 3: Top4: 66.80%
[39.489291598023065, 53.82207578253707, 62.026359143327845, 66.80395387149917]
Accuracy after epoch 3: Top5: 71.07%
[39.489291598023065, 53.82207578253707, 62.026359143327845, 66.80395387149917, 71.07084019769357]
Accuracy after epoch 3: Top6: 73.92%
[39.489291598023065, 53.82207578253707, 62.026359143327845, 66.80395387149917, 71.07084019769357, 73.92092257001647]
Accuracy after epoch 3: Top7: 76.33%
[39.489291598023065, 53.82207578253707, 62.026359143327845, 66.80395387149917, 71.07084019769357, 73.92092257001647, 76.32619439868205]
Accuracy after epoch 3: Top8: 78.22%
[39.489291598023065, 53.82207578253707, 62.026359143327845, 66.80395387149917, 71.07084019769357, 73.92092257001647, 76.32619439868205, 78.22075782537067

Epoch 4:   0%|          | 0/8599 [00:00<?, ?it/s]

Validating:   0%|          | 0/190 [00:00<?, ?it/s]

Accuracy after epoch 4: Top1: 42.14%
[42.14168039538715]
Accuracy after epoch 4: Top2: 57.35%
[42.14168039538715, 57.34761120263592]
Accuracy after epoch 4: Top3: 65.17%
[42.14168039538715, 57.34761120263592, 65.17298187808896]
Accuracy after epoch 4: Top4: 70.00%
[42.14168039538715, 57.34761120263592, 65.17298187808896, 70.0]
Accuracy after epoch 4: Top5: 73.81%
[42.14168039538715, 57.34761120263592, 65.17298187808896, 70.0, 73.80560131795717]
Accuracy after epoch 4: Top6: 76.74%
[42.14168039538715, 57.34761120263592, 65.17298187808896, 70.0, 73.80560131795717, 76.73805601317957]
Accuracy after epoch 4: Top7: 78.86%
[42.14168039538715, 57.34761120263592, 65.17298187808896, 70.0, 73.80560131795717, 76.73805601317957, 78.86326194398681]
Accuracy after epoch 4: Top8: 80.59%
[42.14168039538715, 57.34761120263592, 65.17298187808896, 70.0, 73.80560131795717, 76.73805601317957, 78.86326194398681, 80.59308072487644]
Accuracy after epoch 4: Top9: 82.24%
[42.14168039538715, 57.34761120263592, 6

Epoch 5:   0%|          | 0/8599 [00:00<?, ?it/s]

Validating:   0%|          | 0/190 [00:00<?, ?it/s]

Accuracy after epoch 5: Top1: 45.11%
[45.10708401976936]
Accuracy after epoch 5: Top2: 59.46%
[45.10708401976936, 59.456342668863265]
Accuracy after epoch 5: Top3: 67.22%
[45.10708401976936, 59.456342668863265, 67.2158154859967]
Accuracy after epoch 5: Top4: 72.49%
[45.10708401976936, 59.456342668863265, 67.2158154859967, 72.48764415156508]
Accuracy after epoch 5: Top5: 76.33%
[45.10708401976936, 59.456342668863265, 67.2158154859967, 72.48764415156508, 76.32619439868205]
Accuracy after epoch 5: Top6: 78.71%
[45.10708401976936, 59.456342668863265, 67.2158154859967, 72.48764415156508, 76.32619439868205, 78.71499176276771]
Accuracy after epoch 5: Top7: 80.71%
[45.10708401976936, 59.456342668863265, 67.2158154859967, 72.48764415156508, 76.32619439868205, 78.71499176276771, 80.70840197693575]
Accuracy after epoch 5: Top8: 82.41%
[45.10708401976936, 59.456342668863265, 67.2158154859967, 72.48764415156508, 76.32619439868205, 78.71499176276771, 80.70840197693575, 82.40527182866556]
Accuracy af

Epoch 6:   0%|          | 0/8599 [00:00<?, ?it/s]

Validating:   0%|          | 0/190 [00:00<?, ?it/s]

Accuracy after epoch 6: Top1: 45.60%
[45.601317957166394]
Accuracy after epoch 6: Top2: 60.48%
[45.601317957166394, 60.477759472817134]
Accuracy after epoch 6: Top3: 67.92%
[45.601317957166394, 60.477759472817134, 67.92421746293246]
Accuracy after epoch 6: Top4: 72.97%
[45.601317957166394, 60.477759472817134, 67.92421746293246, 72.9654036243822]
Accuracy after epoch 6: Top5: 76.49%
[45.601317957166394, 60.477759472817134, 67.92421746293246, 72.9654036243822, 76.49093904448105]
Accuracy after epoch 6: Top6: 79.08%
[45.601317957166394, 60.477759472817134, 67.92421746293246, 72.9654036243822, 76.49093904448105, 79.07742998352553]
Accuracy after epoch 6: Top7: 81.15%
[45.601317957166394, 60.477759472817134, 67.92421746293246, 72.9654036243822, 76.49093904448105, 79.07742998352553, 81.15321252059309]
Accuracy after epoch 6: Top8: 82.88%
[45.601317957166394, 60.477759472817134, 67.92421746293246, 72.9654036243822, 76.49093904448105, 79.07742998352553, 81.15321252059309, 82.8830313014827]
Acc

Epoch 7:   0%|          | 0/8599 [00:00<?, ?it/s]

Validating:   0%|          | 0/190 [00:00<?, ?it/s]

Accuracy after epoch 7: Top1: 45.95%
[45.94728171334432]
Accuracy after epoch 7: Top2: 60.49%
[45.94728171334432, 60.494233937397034]
Accuracy after epoch 7: Top3: 68.17%
[45.94728171334432, 60.494233937397034, 68.17133443163097]
Accuracy after epoch 7: Top4: 72.88%
[45.94728171334432, 60.494233937397034, 68.17133443163097, 72.8830313014827]
Accuracy after epoch 7: Top5: 76.44%
[45.94728171334432, 60.494233937397034, 68.17133443163097, 72.8830313014827, 76.44151565074135]
Accuracy after epoch 7: Top6: 79.28%
[45.94728171334432, 60.494233937397034, 68.17133443163097, 72.8830313014827, 76.44151565074135, 79.27512355848435]
Accuracy after epoch 7: Top7: 81.40%
[45.94728171334432, 60.494233937397034, 68.17133443163097, 72.8830313014827, 76.44151565074135, 79.27512355848435, 81.4003294892916]
Accuracy after epoch 7: Top8: 83.10%
[45.94728171334432, 60.494233937397034, 68.17133443163097, 72.8830313014827, 76.44151565074135, 79.27512355848435, 81.4003294892916, 83.09719934102142]
Accuracy aft

Epoch 8:   0%|          | 0/8599 [00:00<?, ?it/s]

Validating:   0%|          | 0/190 [00:00<?, ?it/s]

Accuracy after epoch 8: Top1: 46.16%
[46.16144975288303]
Accuracy after epoch 8: Top2: 60.44%
[46.16144975288303, 60.44481054365733]
Accuracy after epoch 8: Top3: 68.29%
[46.16144975288303, 60.44481054365733, 68.28665568369028]
Accuracy after epoch 8: Top4: 72.97%
[46.16144975288303, 60.44481054365733, 68.28665568369028, 72.9654036243822]
Accuracy after epoch 8: Top5: 76.54%
[46.16144975288303, 60.44481054365733, 68.28665568369028, 72.9654036243822, 76.54036243822075]
Accuracy after epoch 8: Top6: 79.23%
[46.16144975288303, 60.44481054365733, 68.28665568369028, 72.9654036243822, 76.54036243822075, 79.22570016474465]
Accuracy after epoch 8: Top7: 81.25%
[46.16144975288303, 60.44481054365733, 68.28665568369028, 72.9654036243822, 76.54036243822075, 79.22570016474465, 81.25205930807249]
Accuracy after epoch 8: Top8: 82.73%
[46.16144975288303, 60.44481054365733, 68.28665568369028, 72.9654036243822, 76.54036243822075, 79.22570016474465, 81.25205930807249, 82.73476112026358]
Accuracy after ep

Epoch 9:   0%|          | 0/8599 [00:00<?, ?it/s]

Validating:   0%|          | 0/190 [00:00<?, ?it/s]

Accuracy after epoch 9: Top1: 46.79%
[46.787479406919275]
Accuracy after epoch 9: Top2: 60.61%
[46.787479406919275, 60.609555189456344]
Accuracy after epoch 9: Top3: 68.11%
[46.787479406919275, 60.609555189456344, 68.10543657331137]
Accuracy after epoch 9: Top4: 72.97%
[46.787479406919275, 60.609555189456344, 68.10543657331137, 72.9654036243822]
Accuracy after epoch 9: Top5: 76.51%
[46.787479406919275, 60.609555189456344, 68.10543657331137, 72.9654036243822, 76.50741350906095]
Accuracy after epoch 9: Top6: 79.31%
[46.787479406919275, 60.609555189456344, 68.10543657331137, 72.9654036243822, 76.50741350906095, 79.30807248764415]
Accuracy after epoch 9: Top7: 81.10%
[46.787479406919275, 60.609555189456344, 68.10543657331137, 72.9654036243822, 76.50741350906095, 79.30807248764415, 81.10378912685337]
Accuracy after epoch 9: Top8: 82.70%
[46.787479406919275, 60.609555189456344, 68.10543657331137, 72.9654036243822, 76.50741350906095, 79.30807248764415, 81.10378912685337, 82.70181219110378]
Ac

Epoch 10:   0%|          | 0/8599 [00:00<?, ?it/s]

Validating:   0%|          | 0/190 [00:00<?, ?it/s]

Accuracy after epoch 10: Top1: 47.25%
[47.24876441515651]
Accuracy after epoch 10: Top2: 60.56%
[47.24876441515651, 60.560131795716636]
Accuracy after epoch 10: Top3: 68.12%
[47.24876441515651, 60.560131795716636, 68.12191103789127]
Accuracy after epoch 10: Top4: 73.28%
[47.24876441515651, 60.560131795716636, 68.12191103789127, 73.27841845140033]
Accuracy after epoch 10: Top5: 76.69%
[47.24876441515651, 60.560131795716636, 68.12191103789127, 73.27841845140033, 76.68863261943987]
Accuracy after epoch 10: Top6: 79.32%
[47.24876441515651, 60.560131795716636, 68.12191103789127, 73.27841845140033, 76.68863261943987, 79.32454695222405]
Accuracy after epoch 10: Top7: 81.42%
[47.24876441515651, 60.560131795716636, 68.12191103789127, 73.27841845140033, 76.68863261943987, 79.32454695222405, 81.4168039538715]
Accuracy after epoch 10: Top8: 83.08%
[47.24876441515651, 60.560131795716636, 68.12191103789127, 73.27841845140033, 76.68863261943987, 79.32454695222405, 81.4168039538715, 83.08072487644152]

Epoch 11:   0%|          | 0/8599 [00:00<?, ?it/s]

Validating:   0%|          | 0/190 [00:00<?, ?it/s]

Accuracy after epoch 11: Top1: 47.27%
[47.26523887973641]
Accuracy after epoch 11: Top2: 60.23%
[47.26523887973641, 60.23064250411861]
Accuracy after epoch 11: Top3: 67.64%
[47.26523887973641, 60.23064250411861, 67.64415156507414]
Accuracy after epoch 11: Top4: 72.47%
[47.26523887973641, 60.23064250411861, 67.64415156507414, 72.47116968698518]
Accuracy after epoch 11: Top5: 75.58%
[47.26523887973641, 60.23064250411861, 67.64415156507414, 72.47116968698518, 75.58484349258649]
Accuracy after epoch 11: Top6: 78.14%
[47.26523887973641, 60.23064250411861, 67.64415156507414, 72.47116968698518, 75.58484349258649, 78.13838550247117]
Accuracy after epoch 11: Top7: 80.05%
[47.26523887973641, 60.23064250411861, 67.64415156507414, 72.47116968698518, 75.58484349258649, 78.13838550247117, 80.0494233937397]
Accuracy after epoch 11: Top8: 81.42%
[47.26523887973641, 60.23064250411861, 67.64415156507414, 72.47116968698518, 75.58484349258649, 78.13838550247117, 80.0494233937397, 81.4168039538715]
Accurac

Epoch 12:   0%|          | 0/8599 [00:00<?, ?it/s]

Validating:   0%|          | 0/190 [00:00<?, ?it/s]

Accuracy after epoch 12: Top1: 47.89%
[47.89126853377265]
Accuracy after epoch 12: Top2: 61.09%
[47.89126853377265, 61.08731466227348]
Accuracy after epoch 12: Top3: 68.40%
[47.89126853377265, 61.08731466227348, 68.40197693574959]
Accuracy after epoch 12: Top4: 72.59%
[47.89126853377265, 61.08731466227348, 68.40197693574959, 72.58649093904448]
Accuracy after epoch 12: Top5: 76.01%
[47.89126853377265, 61.08731466227348, 68.40197693574959, 72.58649093904448, 76.01317957166393]
Accuracy after epoch 12: Top6: 78.45%
[47.89126853377265, 61.08731466227348, 68.40197693574959, 72.58649093904448, 76.01317957166393, 78.45140032948929]
Accuracy after epoch 12: Top7: 80.43%
[47.89126853377265, 61.08731466227348, 68.40197693574959, 72.58649093904448, 76.01317957166393, 78.45140032948929, 80.42833607907743]
Accuracy after epoch 12: Top8: 81.93%
[47.89126853377265, 61.08731466227348, 68.40197693574959, 72.58649093904448, 76.01317957166393, 78.45140032948929, 80.42833607907743, 81.92751235584844]
Accu

Epoch 13:   0%|          | 0/8599 [00:00<?, ?it/s]

Validating:   0%|          | 0/190 [00:00<?, ?it/s]

Accuracy after epoch 13: Top1: 47.00%
[47.00164744645799]
Accuracy after epoch 13: Top2: 60.26%
[47.00164744645799, 60.26359143327842]
Accuracy after epoch 13: Top3: 68.11%
[47.00164744645799, 60.26359143327842, 68.10543657331137]
Accuracy after epoch 13: Top4: 72.11%
[47.00164744645799, 60.26359143327842, 68.10543657331137, 72.10873146622735]
Accuracy after epoch 13: Top5: 75.35%
[47.00164744645799, 60.26359143327842, 68.10543657331137, 72.10873146622735, 75.35420098846788]
Accuracy after epoch 13: Top6: 77.83%
[47.00164744645799, 60.26359143327842, 68.10543657331137, 72.10873146622735, 75.35420098846788, 77.82537067545304]
Accuracy after epoch 13: Top7: 79.70%
[47.00164744645799, 60.26359143327842, 68.10543657331137, 72.10873146622735, 75.35420098846788, 77.82537067545304, 79.70345963756178]
Accuracy after epoch 13: Top8: 81.14%
[47.00164744645799, 60.26359143327842, 68.10543657331137, 72.10873146622735, 75.35420098846788, 77.82537067545304, 79.70345963756178, 81.13673805601319]
Accu

Epoch 14:   0%|          | 0/8599 [00:00<?, ?it/s]

Validating:   0%|          | 0/190 [00:00<?, ?it/s]

Accuracy after epoch 14: Top1: 47.10%
[47.1004942339374]
Accuracy after epoch 14: Top2: 59.74%
[47.1004942339374, 59.73640856672158]
Accuracy after epoch 14: Top3: 66.99%
[47.1004942339374, 59.73640856672158, 66.9851729818781]
Accuracy after epoch 14: Top4: 71.86%
[47.1004942339374, 59.73640856672158, 66.9851729818781, 71.86161449752883]
Accuracy after epoch 14: Top5: 74.70%
[47.1004942339374, 59.73640856672158, 66.9851729818781, 71.86161449752883, 74.69522240527183]
Accuracy after epoch 14: Top6: 77.25%
[47.1004942339374, 59.73640856672158, 66.9851729818781, 71.86161449752883, 74.69522240527183, 77.2487644151565]
Accuracy after epoch 14: Top7: 79.24%
[47.1004942339374, 59.73640856672158, 66.9851729818781, 71.86161449752883, 74.69522240527183, 77.2487644151565, 79.24217462932455]
Accuracy after epoch 14: Top8: 80.97%
[47.1004942339374, 59.73640856672158, 66.9851729818781, 71.86161449752883, 74.69522240527183, 77.2487644151565, 79.24217462932455, 80.97199341021417]
Accuracy after epoch 

Epoch 15:   0%|          | 0/8599 [00:00<?, ?it/s]

Validating:   0%|          | 0/190 [00:00<?, ?it/s]

Accuracy after epoch 15: Top1: 46.51%
[46.50741350906095]
Accuracy after epoch 15: Top2: 58.96%
[46.50741350906095, 58.96210873146623]
Accuracy after epoch 15: Top3: 66.71%
[46.50741350906095, 58.96210873146623, 66.70510708401977]
Accuracy after epoch 15: Top4: 70.87%
[46.50741350906095, 58.96210873146623, 66.70510708401977, 70.87314662273477]
Accuracy after epoch 15: Top5: 73.92%
[46.50741350906095, 58.96210873146623, 66.70510708401977, 70.87314662273477, 73.92092257001647]
Accuracy after epoch 15: Top6: 76.19%
[46.50741350906095, 58.96210873146623, 66.70510708401977, 70.87314662273477, 73.92092257001647, 76.19439868204283]
Accuracy after epoch 15: Top7: 78.15%
[46.50741350906095, 58.96210873146623, 66.70510708401977, 70.87314662273477, 73.92092257001647, 76.19439868204283, 78.15485996705107]
Accuracy after epoch 15: Top8: 79.80%
[46.50741350906095, 58.96210873146623, 66.70510708401977, 70.87314662273477, 73.92092257001647, 76.19439868204283, 78.15485996705107, 79.80230642504118]
Accu

然后我们再来一遍同样一模一样的操作

In [11]:
import os
import torch
from tqdm.auto import tqdm
import warnings
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import pandas as pd

# 忽略特定的警告
warnings.filterwarnings("ignore", message="Be aware, overflowing tokens are not returned*")

# 加载新的数据集
new_file_path = './dataset/Eclipse_total_processed_jump_final_deleted_lowdeveloper_again.csv'
# 指定需要提取的列
columns_to_extract = ['bug_id', 'product', 'abstracts', 'description', 'component', 'severity', 'priority', 'developer',  'status']
# columns_to_extract = [ 'description', 'developer']
df = pd.read_csv(new_file_path, usecols=columns_to_extract, encoding='latin-1')

# 将developer列作为标签
label_dict = {label: idx for idx, label in enumerate(df['developer'].unique())}
df['label'] = df['developer'].replace(label_dict)

  df['label'] = df['developer'].replace(label_dict)


In [12]:
# 合并文本信息为模型的输入，除了developer列
# df['text_input'] = df[['bug_id', 'product', 'abstracts', 'description', 'component', 'severity', 'priority',  'status']].astype(str).agg(' '.join, axis=1)
df['text_input'] =   df['abstracts'].astype(str)+ " " + df['description'].astype(str)  # 使用空格作为分隔符

# df['text_input'] = df[[ 'product',   'component', 'severity', 'priority',  'status','description']].astype(str).agg(' '.join, axis=1)
X_train, X_val, y_train, y_val = train_test_split(df.index.values, df.label.values, test_size=0.15, random_state=42, stratify=df.label.values)
df['data_type'] = ['not_set']*df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


In [13]:

# 对训练和验证数据的合并文本进行编码
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].text_input.values,  # 使用合并后的文本
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length',  # 更新pad_to_max_length为padding
    max_length=512, 
    truncation=True,  # 明确启用截断
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text_input.values,  # 使用合并后的文本
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length',  # 更新pad_to_max_length为padding
    max_length=512, 
    truncation=True,  # 明确启用截断
    return_tensors='pt'
)

# 准备Tensor数据
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

# 定义DataLoader
batch_size = 4
train_loader = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
val_loader = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=32)

In [14]:
# # 初始化模型
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict), output_attentions=False, output_hidden_states=False)
# optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
# # 加载模型
# # 计算层数
# num_transformer_layers = len(model.bert.encoder.layer)
# print(f'The BERT model has {num_transformer_layers} transformer layers.')
# print(model)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict), output_attentions=False, output_hidden_states=False)
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
# 加载模型
# 计算层数
num_transformer_layers = len(model.bert.encoder.layer)
print(f'The BERT model has {num_transformer_layers} transformer layers.')
print(model)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The BERT model has 12 transformer layers.
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             



In [15]:
# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

checkpoint_path = 'model_checkpoint__eclipseprocessed_again_withabstract+description_jump.pth'

# 检查是否有可用的检查点
if os.path.isfile(checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    print(f'Resuming training from epoch {start_epoch}')
else:
    start_epoch = 0
    print('Starting training from scratch')



Starting training from scratch


In [16]:
from datetime import datetime

# 数据库连接信息
host = '38.147.173.234'
user = 'root'
password = '123456'
db = 'training_statistics_db'

# 模型名称，根据实际情况手动设置
model_name = 'bert-base-cased'
# 学习率和可选特性，根据实际情况手动设置
learning_rate = 1e-5  # 示例学习率
optional_feature = 'abstract+descrition(hasbeenprocessedbyjump)'  # 示例可选特性
dataset = new_file_path
num_epochs=15

for epoch in range(start_epoch, num_epochs):
    model.train()
    start_time = datetime.now()
    # ...训练过程...
    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds()
        
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix(loss=loss.item())
    torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, checkpoint_path)
    model.eval()
    correct_topk = {k: 0 for k in range(1, 11)}
    total = 0
    val_progress_bar = tqdm(val_loader, desc="Validating")
    
    for batch in val_progress_bar:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs[0]
        total += labels.size(0)
        
        # 计算top1到top10的正确率
        _, predicted_topk = torch.topk(logits, k=10, dim=1)
        labels_expanded = labels.unsqueeze(1)
        for k in range(1, 11):
            correct_topk[k] += (predicted_topk[:, :k] == labels_expanded).any(dim=1).sum().item()
                
    # 打印每个topK的准确率
    top10accuracy = []  # 初始化存储Top1到Top10准确率的数组

    for k in range(1, 11):
        accuracy = 100 * correct_topk[k] / total
        top10accuracy.append(accuracy)  # 将计算出的准确率添加到数组中
        print(f'Accuracy after epoch {epoch + 1}: Top{k}: {accuracy:.2f}%')
        print(top10accuracy)
    import pandas as pd
    import os

    # 定义数据字典，用于创建DataFrame
    data = {
            'epoch': [epoch],
            'start_time': [start_time],
            'end_time': [end_time],
            'duration': [duration],
            'user_id': [1],
            'model': [model_name],
            'top1_accuracy': [top10accuracy[0]],
            'top2_accuracy': [top10accuracy[1]],
            'top3_accuracy': [top10accuracy[2]],
            'top4_accuracy': [top10accuracy[3]],
            'top5_accuracy': [top10accuracy[4]],
            'top6_accuracy': [top10accuracy[5]],
            'top7_accuracy': [top10accuracy[6]],
            'top8_accuracy': [top10accuracy[7]],
            'top9_accuracy': [top10accuracy[8]],
            'top10_accuracy': [top10accuracy[9]],
            'optional_feature': [optional_feature],
            'learning_rate': [learning_rate],
            'dataset': [dataset]
    }

        # 创建DataFrame
    df = pd.DataFrame(data)

        # 检查train.csv文件是否存在来决定是否添加表头
    file_exists = os.path.isfile('train.csv')

        # 如果文件存在，不写入表头，模式为追加；如果文件不存在，写入表头，模式为写入
    df.to_csv('train.csv', mode='a', header=not file_exists, index=False)

    print(f'Epoch {epoch + 1} training data inserted into train.csv.')

Epoch 1:   0%|          | 0/3536 [00:00<?, ?it/s]

Validating:   0%|          | 0/78 [00:00<?, ?it/s]

Accuracy after epoch 1: Top1: 21.43%
[21.434294871794872]
Accuracy after epoch 1: Top2: 30.93%
[21.434294871794872, 30.92948717948718]
Accuracy after epoch 1: Top3: 37.66%
[21.434294871794872, 30.92948717948718, 37.66025641025641]
Accuracy after epoch 1: Top4: 41.39%
[21.434294871794872, 30.92948717948718, 37.66025641025641, 41.38621794871795]
Accuracy after epoch 1: Top5: 44.79%
[21.434294871794872, 30.92948717948718, 37.66025641025641, 41.38621794871795, 44.791666666666664]
Accuracy after epoch 1: Top6: 47.64%
[21.434294871794872, 30.92948717948718, 37.66025641025641, 41.38621794871795, 44.791666666666664, 47.63621794871795]
Accuracy after epoch 1: Top7: 50.84%
[21.434294871794872, 30.92948717948718, 37.66025641025641, 41.38621794871795, 44.791666666666664, 47.63621794871795, 50.84134615384615]
Accuracy after epoch 1: Top8: 54.13%
[21.434294871794872, 30.92948717948718, 37.66025641025641, 41.38621794871795, 44.791666666666664, 47.63621794871795, 50.84134615384615, 54.12660256410256]


Epoch 2:   0%|          | 0/3536 [00:00<?, ?it/s]

Validating:   0%|          | 0/78 [00:00<?, ?it/s]

Accuracy after epoch 2: Top1: 28.89%
[28.88621794871795]
Accuracy after epoch 2: Top2: 39.50%
[28.88621794871795, 39.50320512820513]
Accuracy after epoch 2: Top3: 45.83%
[28.88621794871795, 39.50320512820513, 45.833333333333336]
Accuracy after epoch 2: Top4: 50.88%
[28.88621794871795, 39.50320512820513, 45.833333333333336, 50.881410256410255]
Accuracy after epoch 2: Top5: 55.09%
[28.88621794871795, 39.50320512820513, 45.833333333333336, 50.881410256410255, 55.08814102564103]
Accuracy after epoch 2: Top6: 58.45%
[28.88621794871795, 39.50320512820513, 45.833333333333336, 50.881410256410255, 55.08814102564103, 58.45352564102564]
Accuracy after epoch 2: Top7: 60.98%
[28.88621794871795, 39.50320512820513, 45.833333333333336, 50.881410256410255, 55.08814102564103, 58.45352564102564, 60.9775641025641]
Accuracy after epoch 2: Top8: 63.58%
[28.88621794871795, 39.50320512820513, 45.833333333333336, 50.881410256410255, 55.08814102564103, 58.45352564102564, 60.9775641025641, 63.58173076923077]
Acc

Epoch 3:   0%|          | 0/3536 [00:00<?, ?it/s]

Validating:   0%|          | 0/78 [00:00<?, ?it/s]

Accuracy after epoch 3: Top1: 32.29%
[32.291666666666664]
Accuracy after epoch 3: Top2: 46.27%
[32.291666666666664, 46.27403846153846]
Accuracy after epoch 3: Top3: 54.05%
[32.291666666666664, 46.27403846153846, 54.04647435897436]
Accuracy after epoch 3: Top4: 59.33%
[32.291666666666664, 46.27403846153846, 54.04647435897436, 59.3349358974359]
Accuracy after epoch 3: Top5: 63.42%
[32.291666666666664, 46.27403846153846, 54.04647435897436, 59.3349358974359, 63.42147435897436]
Accuracy after epoch 3: Top6: 66.55%
[32.291666666666664, 46.27403846153846, 54.04647435897436, 59.3349358974359, 63.42147435897436, 66.54647435897436]
Accuracy after epoch 3: Top7: 69.23%
[32.291666666666664, 46.27403846153846, 54.04647435897436, 59.3349358974359, 63.42147435897436, 66.54647435897436, 69.23076923076923]
Accuracy after epoch 3: Top8: 71.71%
[32.291666666666664, 46.27403846153846, 54.04647435897436, 59.3349358974359, 63.42147435897436, 66.54647435897436, 69.23076923076923, 71.71474358974359]
Accuracy 

Epoch 4:   0%|          | 0/3536 [00:00<?, ?it/s]

Validating:   0%|          | 0/78 [00:00<?, ?it/s]

Accuracy after epoch 4: Top1: 35.42%
[35.416666666666664]
Accuracy after epoch 4: Top2: 50.76%
[35.416666666666664, 50.76121794871795]
Accuracy after epoch 4: Top3: 58.57%
[35.416666666666664, 50.76121794871795, 58.57371794871795]
Accuracy after epoch 4: Top4: 64.62%
[35.416666666666664, 50.76121794871795, 58.57371794871795, 64.62339743589743]
Accuracy after epoch 4: Top5: 68.51%
[35.416666666666664, 50.76121794871795, 58.57371794871795, 64.62339743589743, 68.50961538461539]
Accuracy after epoch 4: Top6: 71.96%
[35.416666666666664, 50.76121794871795, 58.57371794871795, 64.62339743589743, 68.50961538461539, 71.9551282051282]
Accuracy after epoch 4: Top7: 73.80%
[35.416666666666664, 50.76121794871795, 58.57371794871795, 64.62339743589743, 68.50961538461539, 71.9551282051282, 73.79807692307692]
Accuracy after epoch 4: Top8: 75.28%
[35.416666666666664, 50.76121794871795, 58.57371794871795, 64.62339743589743, 68.50961538461539, 71.9551282051282, 73.79807692307692, 75.28044871794872]
Accurac

Epoch 5:   0%|          | 0/3536 [00:00<?, ?it/s]

Validating:   0%|          | 0/78 [00:00<?, ?it/s]

Accuracy after epoch 5: Top1: 38.18%
[38.181089743589745]
Accuracy after epoch 5: Top2: 52.48%
[38.181089743589745, 52.48397435897436]
Accuracy after epoch 5: Top3: 61.02%
[38.181089743589745, 52.48397435897436, 61.017628205128204]
Accuracy after epoch 5: Top4: 66.95%
[38.181089743589745, 52.48397435897436, 61.017628205128204, 66.94711538461539]
Accuracy after epoch 5: Top5: 70.27%
[38.181089743589745, 52.48397435897436, 61.017628205128204, 66.94711538461539, 70.2724358974359]
Accuracy after epoch 5: Top6: 73.80%
[38.181089743589745, 52.48397435897436, 61.017628205128204, 66.94711538461539, 70.2724358974359, 73.79807692307692]
Accuracy after epoch 5: Top7: 75.92%
[38.181089743589745, 52.48397435897436, 61.017628205128204, 66.94711538461539, 70.2724358974359, 73.79807692307692, 75.92147435897436]
Accuracy after epoch 5: Top8: 77.72%
[38.181089743589745, 52.48397435897436, 61.017628205128204, 66.94711538461539, 70.2724358974359, 73.79807692307692, 75.92147435897436, 77.72435897435898]
Ac

Epoch 6:   0%|          | 0/3536 [00:00<?, ?it/s]

Validating:   0%|          | 0/78 [00:00<?, ?it/s]

Accuracy after epoch 6: Top1: 38.78%
[38.782051282051285]
Accuracy after epoch 6: Top2: 53.21%
[38.782051282051285, 53.205128205128204]
Accuracy after epoch 6: Top3: 61.70%
[38.782051282051285, 53.205128205128204, 61.69871794871795]
Accuracy after epoch 6: Top4: 67.03%
[38.782051282051285, 53.205128205128204, 61.69871794871795, 67.02724358974359]
Accuracy after epoch 6: Top5: 71.43%
[38.782051282051285, 53.205128205128204, 61.69871794871795, 67.02724358974359, 71.43429487179488]
Accuracy after epoch 6: Top6: 75.32%
[38.782051282051285, 53.205128205128204, 61.69871794871795, 67.02724358974359, 71.43429487179488, 75.32051282051282]
Accuracy after epoch 6: Top7: 77.56%
[38.782051282051285, 53.205128205128204, 61.69871794871795, 67.02724358974359, 71.43429487179488, 75.32051282051282, 77.56410256410257]
Accuracy after epoch 6: Top8: 79.45%
[38.782051282051285, 53.205128205128204, 61.69871794871795, 67.02724358974359, 71.43429487179488, 75.32051282051282, 77.56410256410257, 79.4471153846153

Epoch 7:   0%|          | 0/3536 [00:00<?, ?it/s]

Validating:   0%|          | 0/78 [00:00<?, ?it/s]

Accuracy after epoch 7: Top1: 39.70%
[39.70352564102564]
Accuracy after epoch 7: Top2: 54.61%
[39.70352564102564, 54.607371794871796]
Accuracy after epoch 7: Top3: 63.70%
[39.70352564102564, 54.607371794871796, 63.70192307692308]
Accuracy after epoch 7: Top4: 69.39%
[39.70352564102564, 54.607371794871796, 63.70192307692308, 69.39102564102564]
Accuracy after epoch 7: Top5: 73.24%
[39.70352564102564, 54.607371794871796, 63.70192307692308, 69.39102564102564, 73.23717948717949]
Accuracy after epoch 7: Top6: 76.08%
[39.70352564102564, 54.607371794871796, 63.70192307692308, 69.39102564102564, 73.23717948717949, 76.08173076923077]
Accuracy after epoch 7: Top7: 78.12%
[39.70352564102564, 54.607371794871796, 63.70192307692308, 69.39102564102564, 73.23717948717949, 76.08173076923077, 78.125]
Accuracy after epoch 7: Top8: 79.69%
[39.70352564102564, 54.607371794871796, 63.70192307692308, 69.39102564102564, 73.23717948717949, 76.08173076923077, 78.125, 79.6875]
Accuracy after epoch 7: Top9: 80.97%


Epoch 8:   0%|          | 0/3536 [00:00<?, ?it/s]

Validating:   0%|          | 0/78 [00:00<?, ?it/s]

Accuracy after epoch 8: Top1: 39.86%
[39.86378205128205]
Accuracy after epoch 8: Top2: 54.85%
[39.86378205128205, 54.84775641025641]
Accuracy after epoch 8: Top3: 64.14%
[39.86378205128205, 54.84775641025641, 64.1426282051282]
Accuracy after epoch 8: Top4: 69.35%
[39.86378205128205, 54.84775641025641, 64.1426282051282, 69.35096153846153]
Accuracy after epoch 8: Top5: 73.48%
[39.86378205128205, 54.84775641025641, 64.1426282051282, 69.35096153846153, 73.4775641025641]
Accuracy after epoch 8: Top6: 76.48%
[39.86378205128205, 54.84775641025641, 64.1426282051282, 69.35096153846153, 73.4775641025641, 76.4823717948718]
Accuracy after epoch 8: Top7: 78.41%
[39.86378205128205, 54.84775641025641, 64.1426282051282, 69.35096153846153, 73.4775641025641, 76.4823717948718, 78.40544871794872]
Accuracy after epoch 8: Top8: 79.97%
[39.86378205128205, 54.84775641025641, 64.1426282051282, 69.35096153846153, 73.4775641025641, 76.4823717948718, 78.40544871794872, 79.96794871794872]
Accuracy after epoch 8: T

Epoch 9:   0%|          | 0/3536 [00:00<?, ?it/s]

KeyboardInterrupt: 