In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

# Load the bug-report dataset. 'developer' is the prediction target; every
# other extracted column contributes to the model's text input.
new_file_path = './data_开发者阈值10_词频阈值10/Mozilla_total_10_10.csv'
columns_to_extract = ['bug_id', 'product', 'abstracts', 'description', 'component', 'severity', 'priority', 'history', 'status', 'developer']
df = pd.read_csv(new_file_path, usecols=columns_to_extract, encoding='latin-1')

# All extracted columns except the target, in the order they are listed above.
text_columns = [name for name in columns_to_extract if name != 'developer']

# Stringify every field and join the fields of each row with single spaces.
df['text_input'] = df[text_columns].astype(str).agg(' '.join, axis=1)

# Integer-encode the developer column to obtain the class labels.
label_encoder = LabelEncoder().fit(df['developer'])
df['label'] = label_encoder.transform(df['developer'])


In [9]:
# Stratified 85/15 train/validation split over the row index, so each label
# keeps the same proportion in both subsets. random_state pins the split.
#
# df['text_input'] is intentionally not rebuilt here: the loading cell already
# creates it with the identical expression, and the row-wise join is slow.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=42,
    stratify=df.label.values,
)

# Tag every row with the subset it belongs to; 'not_set' is the placeholder
# that the two assignments below overwrite.
df['data_type'] = 'not_set'
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

# The split was made over the full index, so no placeholder may survive.
assert (df['data_type'] != 'not_set').all(), "some rows were not assigned to train/val"

In [10]:

from tqdm.auto import tqdm

# Assumes df has already been loaded and preprocessed by the earlier cells.

# Tokenizer of the 'xlnet-base-cased' checkpoint.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')


def _encode_texts(texts):
    """Tokenize ``texts`` with ``tokenizer``.

    Special tokens are added, every sequence is padded or truncated to
    exactly 512 tokens, and the result is returned as PyTorch tensors
    together with the attention mask.
    """
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        max_length=512,
        truncation=True,
        return_tensors='pt',
    )


# Select each subset once and reuse it for both the texts and the labels.
train_rows = df[df.data_type == 'train']
val_rows = df[df.data_type == 'val']

# Encode the combined text of the training and validation rows.
encoded_data_train = _encode_texts(train_rows.text_input.values)
encoded_data_val = _encode_texts(val_rows.text_input.values)

# Tensors for the training subset.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows.label.values)

# Tensors for the validation subset.
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows.label.values)

# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset_train = torch.utils.data.TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = torch.utils.data.TensorDataset(input_ids_val, attention_masks_val, labels_val)



In [11]:
# Build the DataLoaders: training batches are drawn in random order,
# validation batches sequentially.
batch_size = 2
train_loader = DataLoader(dataset_train, sampler=torch.utils.data.RandomSampler(dataset_train), batch_size=batch_size)
val_loader = DataLoader(dataset_val, sampler=torch.utils.data.SequentialSampler(dataset_val), batch_size=32)

# Initialise XLNet with one output logit per distinct label in df.
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=len(np.unique(df.label.values)))

# Use PyTorch's AdamW. The AdamW class exported by transformers is deprecated
# and its own warning recommends torch.optim.AdamW. weight_decay=0.0 is passed
# explicitly because the transformers class defaulted to 0.0 whereas
# torch.optim.AdamW defaults to 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.0)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [12]:
# Number of distinct encoded labels, i.e. the size of the classifier output.
num_label_classes = df['label'].nunique()
print(num_label_classes)

229


In [6]:
import statistics

import pandas as pd
from transformers import BertTokenizer

# Token-length survey of the combined text input.
#
# This cell uses its own names (survey_columns, survey_df, bert_tokenizer)
# instead of rebinding the notebook-wide columns_to_extract / df / tokenizer.
# The frame loaded here has no 'developer' or 'label' column and the tokenizer
# is BERT's, so overwriting the shared names would silently break any
# XLNet-pipeline cell that is (re-)run after this one.
new_file_path = './data_开发者阈值10_词频阈值10/Mozilla_total_10_10.csv'
survey_columns = ['bug_id', 'product', 'abstracts', 'description', 'component', 'severity', 'priority', 'history', 'status']
survey_df = pd.read_csv(new_file_path, usecols=survey_columns, encoding='latin-1')

# Combine the text fields of each row into one space-separated string.
survey_df['text_input'] = survey_df[survey_columns].astype(str).agg(' '.join, axis=1)

# NOTE(review): lengths are measured with the BERT tokenizer while the model
# cells tokenize with 'xlnet-base-cased'; the two vocabularies may split text
# differently - confirm these numbers are representative before relying on them.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Encode every combined text without padding.
# NOTE(review): truncation=True with max_length=512 caps every length at 512,
# so the reported maximum cannot exceed 512 and does not show how many texts
# were actually truncated.
encoded_inputs = bert_tokenizer.batch_encode_plus(
    survey_df['text_input'].tolist(),
    add_special_tokens=True,
    truncation=True,
    padding=False,
    max_length=512,
)

# Token count of every encoded text.
lengths = [len(input_ids) for input_ids in encoded_inputs['input_ids']]

# Summary statistics. statistics.median averages the two middle values for an
# even number of texts, whereas indexing sorted(lengths)[n // 2] returned the
# upper of the two.
average_length = sum(lengths) / len(lengths)
median_length = statistics.median(lengths)
max_length = max(lengths)
min_length = min(lengths)

print(f"Average length: {average_length}")
print(f"Median length: {median_length}")
print(f"Max length: {max_length}")
print(f"Min length: {min_length}")

Average length: 110.25808444902162
Median length: 86
Max length: 512
Min length: 18


In [13]:
import os

# File that holds the most recent training checkpoint.
checkpoint_path = 'xlnet_model_checkpoint.pth'

# Resume from the checkpoint when one exists, otherwise start at epoch 0.
if os.path.isfile(checkpoint_path):
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU can still be resumed on a CPU-only machine.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # The stored epoch is the last completed one; continue with the next.
    start_epoch = checkpoint['epoch'] + 1
    print(f'Resuming training from epoch {start_epoch}')
else:
    start_epoch = 0
    print('Starting training from scratch')

num_epochs = 113  # adjust as needed
for epoch in range(start_epoch, num_epochs):
    # ---- training pass ----
    model.train()
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        optimizer.zero_grad()
        # Each batch is an (input_ids, attention_mask, labels) triple.
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device, dtype=torch.long)
        # Passing labels makes the model return the loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix(loss=loss.item())

    # ---- checkpoint ----
    # Write to a temporary file first and then rename it over the real path.
    # If the run is interrupted while saving, the previous checkpoint stays
    # intact instead of being left half-written.
    tmp_checkpoint_path = checkpoint_path + '.tmp'
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, tmp_checkpoint_path)
    os.replace(tmp_checkpoint_path, checkpoint_path)

    # ---- validation pass: top-1 accuracy ----
    model.eval()
    correct = 0
    total = 0
    val_progress_bar = tqdm(val_loader, desc="Validating")
    for batch in val_progress_bar:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device, dtype=torch.long)
        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        _, predicted = torch.max(logits, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Accuracy after epoch {epoch + 1}: {accuracy:.2f}%')

Starting training from scratch


Epoch 1:   0%|          | 0/6190 [00:00<?, ?it/s]

Validating:   0%|          | 0/69 [00:00<?, ?it/s]

Accuracy after epoch 1: 79.45%


Epoch 2:   0%|          | 0/6190 [00:00<?, ?it/s]

Validating:   0%|          | 0/69 [00:00<?, ?it/s]

Accuracy after epoch 2: 83.89%


Epoch 3:   0%|          | 0/6190 [00:00<?, ?it/s]

Validating:   0%|          | 0/69 [00:00<?, ?it/s]

Accuracy after epoch 3: 84.58%


Epoch 4:   0%|          | 0/6190 [00:00<?, ?it/s]

KeyboardInterrupt: 