In [1]:
import os
import torch
from tqdm.auto import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
# 接下来，您可以使用与BERT相同的训练和验证循环逻辑，但确保所有的模型、数据和优化器都已经切换到XLNet。

In [2]:
# # 加载数据
# new_file_path = './dataset/Eclipse_morethan10_processed_again.csv'
# # 指定需要提取的列
# columns_to_extract = ['bug_id', 'product', 'abstracts', 'description', 'component', 'severity', 'priority', 'developer',  'status']
# # columns_to_extract = [ 'description', 'developer']
# df = pd.read_csv(new_file_path, usecols=columns_to_extract, encoding='latin-1')

# # 将developer列作为标签
# label_dict = {label: idx for idx, label in enumerate(df['developer'].unique())}
# print(f' the number of label is {len(label_dict)}')
# df['label'] = df['developer'].replace(label_dict).infer_objects()
# # 合并bug_id和summary作为模型的输入
# df['text_input'] =  df['bug_id'].astype(str) + " " + df['component'].astype(str)+ " " + df['abstracts'].astype(str)  # 使用空格作为分隔符
# X_train, X_val, y_train, y_val = train_test_split(df.index.values, df.label.values, test_size=0.15, random_state=42, stratify=df.label.values)
# df['data_type'] = ['not_set']*df.shape[0]
# df.loc[X_train, 'data_type'] = 'train'
# df.loc[X_val, 'data_type'] = 'val'

In [3]:
# Load the preprocessed bug reports and keep only the columns used below.
new_file_path = 'dataprocessed_description_more_than10.csv'
df = pd.read_csv(new_file_path, encoding='latin-1')
# .copy() makes the column subset an independent frame, so the column
# assignments below cannot trigger pandas' SettingWithCopyWarning.
df = df[['bug_id', 'summary', 'who', 'description']].copy()

# Encode every distinct value of `who` as an integer class id, numbered in
# order of first appearance.
label_dict = {label: idx for idx, label in enumerate(df['who'].unique())}
# Series.map is the direct value -> id lookup. The previous
# Series.replace(label_dict) call emitted a pandas warning on this line and
# is much slower for a dictionary with this many keys.
df['label'] = df['who'].map(label_dict)
print(f' the number of label is {len(label_dict)}')

# The model input is the bug description alone (bug_id and summary are loaded
# but not concatenated into the text).
df['text_input'] = df['description']

# Stratified 85/15 split over the row index so every label keeps the same
# proportion in both splits; random_state pins the split between runs.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=42,
    stratify=df.label.values,
)

# Tag each row with the split it belongs to; later cells filter on this column.
df['data_type'] = 'not_set'
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

 the number of label is 1059


  df['label'] = df['who'].replace(label_dict)


In [4]:
# Load the XLNet tokenizer that matches the 'xlnet-base-cased' checkpoint.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

In [5]:
# Tokenizer options shared by both splits: add the model's special tokens,
# pad or truncate every sequence to exactly 512 tokens, and return PyTorch
# tensors together with the attention mask.
encode_options = dict(
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

# Raw texts of each split, selected through the `data_type` tag column.
train_texts = df.loc[df['data_type'] == 'train', 'text_input'].values
val_texts = df.loc[df['data_type'] == 'val', 'text_input'].values

encoded_data_train = tokenizer.batch_encode_plus(train_texts, **encode_options)
encoded_data_val = tokenizer.batch_encode_plus(val_texts, **encode_options)

In [6]:
# Build one TensorDataset per split from (input ids, attention mask, label).

# --- training split ---
train_rows = df.loc[df['data_type'] == 'train']
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows['label'].values)
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)

# --- validation split ---
val_rows = df.loc[df['data_type'] == 'val']
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows['label'].values)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [7]:

# Data loaders: training batches are drawn in random order, validation
# batches in dataset order.
batch_size = 2

train_sampler = RandomSampler(dataset_train)
train_loader = DataLoader(
    dataset_train,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation uses a larger fixed batch of 32.
val_sampler = SequentialSampler(dataset_val)
val_loader = DataLoader(
    dataset_val,
    batch_size=32,
    sampler=val_sampler,
)



In [8]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# XLNet with a classification head sized to one output per label.
n_classes = len(label_dict)
model = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased",
    num_labels=n_classes,
)

# NOTE(review): this is the AdamW re-exported by `transformers`; confirm the
# installed transformers version still ships it before upgrading.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

# Move the model to the selected device; left as the final expression so the
# notebook displays the module structure.
model.to(device)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [9]:
checkpoint_path = 'model_checkpoint_bert_morethan10_xlnet_description_ljy.pth'

# Resume from the checkpoint written at the end of each training epoch, if
# one exists; otherwise start at epoch 0.
if os.path.isfile(checkpoint_path):
    # map_location remaps the saved tensors onto the device selected for this
    # session, so a checkpoint written on a GPU machine still loads when only
    # a CPU (or a different GPU index) is available.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # The checkpoint stores the index of the last finished epoch.
    start_epoch = checkpoint['epoch'] + 1
    print(f'Resuming training from epoch {start_epoch}')
else:
    start_epoch = 0
    print('Starting training from scratch')

Resuming training from epoch 6


In [10]:
import pymysql
from datetime import datetime

# Database connection settings. The password is read from the environment so
# it is not stored in the notebook; host, user and schema can be overridden
# the same way and otherwise keep their previous values.
host = os.environ.get('TRAINING_DB_HOST', '38.147.173.234')
user = os.environ.get('TRAINING_DB_USER', 'root')
password = os.environ['TRAINING_DB_PASSWORD']  # KeyError if unset: export it before running
db = os.environ.get('TRAINING_DB_NAME', 'training_statistics_db')

# Run metadata written alongside every epoch row; set these by hand per run.
model_name = 'xlnet-base-cased'
learning_rate = 1e-5  # logged value only; keep in sync with the optimizer's lr
optional_feature = 'feature_example'
dataset = 'Eclipse_19w.csv'

num_epochs = 120

# One row per epoch: timing, run metadata and top-1 .. top-10 accuracy.
INSERT_EPOCH_SQL = """
INSERT INTO training_info (epoch, start_time, end_time, duration, user_id, model,
top1_accuracy, top2_accuracy, top3_accuracy, top4_accuracy, top5_accuracy,
top6_accuracy, top7_accuracy, top8_accuracy, top9_accuracy, top10_accuracy,
optional_feature, learning_rate, dataset)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""


def _train_one_epoch(model, loader, optimizer, device, epoch):
    """Run one optimisation pass over `loader`, updating `model` in place.

    Each batch is expected to be (input_ids, attention_mask, labels), as
    produced by the TensorDataset built earlier in the notebook.
    """
    model.train()
    progress_bar = tqdm(loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]  # with labels supplied, the first output is the loss
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix(loss=loss.item())


def _evaluate_topk(model, loader, device, max_k=10):
    """Return the list [top-1 %, ..., top-`max_k` %] of accuracies over `loader`.

    Raises:
        ValueError: if `loader` yields no samples (accuracy would be undefined).
    """
    model.eval()
    correct_topk = {k: 0 for k in range(1, max_k + 1)}
    total = 0
    for batch in tqdm(loader, desc="Validating"):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs[0]  # without labels, the first output is the logits
        total += labels.size(0)

        # A sample counts as a top-k hit when its label is among the k
        # highest-scoring classes.
        _, predicted_topk = torch.topk(logits, k=max_k, dim=1)
        labels_expanded = labels.unsqueeze(1)
        for k in range(1, max_k + 1):
            correct_topk[k] += (predicted_topk[:, :k] == labels_expanded).any(dim=1).sum().item()

    if total == 0:
        raise ValueError('validation loader produced no samples')
    return [100 * correct_topk[k] / total for k in range(1, max_k + 1)]


# Connect to the statistics database.
connection = pymysql.connect(host=host,
                             user=user,
                             password=password,
                             database=db,
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
try:
    for epoch in range(start_epoch, num_epochs):
        # Time the training pass itself: the end timestamp is taken after the
        # pass, not before it, so `duration` is no longer ~0.
        start_time = datetime.now()
        _train_one_epoch(model, train_loader, optimizer, device, epoch)
        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()

        # Checkpoint before validating so a crash during evaluation does not
        # lose the epoch that was just trained.
        torch.save({'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict()},
                   checkpoint_path)

        topk_accuracies = _evaluate_topk(model, val_loader, device, max_k=10)
        for k, accuracy in enumerate(topk_accuracies, start=1):
            print(f'Accuracy after epoch {epoch + 1}: Top{k}: {accuracy:.2f}%')

        # The connection sits idle for the whole epoch; re-establish it if the
        # server dropped it in the meantime.
        connection.ping(reconnect=True)
        with connection.cursor() as cursor:
            # Unpack the ten accuracies into the top1..top10 columns. The
            # previous code unpacked a single float, which raises TypeError.
            cursor.execute(INSERT_EPOCH_SQL,
                           (epoch, start_time, end_time, duration, 1, model_name,
                            *topk_accuracies, optional_feature, learning_rate, dataset))
        connection.commit()
        print(f'Epoch {epoch + 1} training data inserted into database.')
finally:
    # close() raises on an already-closed connection, which would mask the
    # original error, so only close while the connection is still open.
    if connection.open:
        connection.close()

Epoch 7:   0%|          | 0/82974 [00:00<?, ?it/s]

Validating:   0%|          | 0/916 [00:00<?, ?it/s]

Accuracy after epoch 7: Top1: 81.77%, Top5: 93.84%, Top10: 96.12%


Epoch 8:   0%|          | 0/82974 [00:00<?, ?it/s]

Validating:   0%|          | 0/916 [00:00<?, ?it/s]

Accuracy after epoch 8: Top1: 81.61%, Top5: 93.92%, Top10: 96.17%


Epoch 9:   0%|          | 0/82974 [00:00<?, ?it/s]

Validating:   0%|          | 0/916 [00:00<?, ?it/s]

Accuracy after epoch 9: Top1: 82.01%, Top5: 94.11%, Top10: 96.12%


Epoch 10:   0%|          | 0/82974 [00:00<?, ?it/s]

Validating:   0%|          | 0/916 [00:00<?, ?it/s]

Accuracy after epoch 10: Top1: 81.93%, Top5: 94.13%, Top10: 96.19%


Epoch 11:   0%|          | 0/82974 [00:00<?, ?it/s]

Validating:   0%|          | 0/916 [00:00<?, ?it/s]

Accuracy after epoch 11: Top1: 81.94%, Top5: 93.96%, Top10: 96.00%


Epoch 12:   0%|          | 0/82974 [00:00<?, ?it/s]

KeyboardInterrupt: 