In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from data_splitting import X_train, y_train, X_val, y_val, X_test, y_test

ModuleNotFoundError: No module named 'pandas'

In [2]:
# Choose the compute device: CUDA when PyTorch reports a usable GPU, otherwise the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load the pretrained tokenizer and sequence-classification model, and move the model to `device`.
# NOTE(review): this comment previously said "HateBERT", but the checkpoint named below is a
# DistilBERT model fine-tuned on SST-2 — confirm which checkpoint is actually intended.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
num_labels = 2  # two classes: hate or non-hate

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  state_dict = torch.load(resolved_archive_file, map_location="cpu")


In [4]:
# X_train = pd.DataFrame(X_train)
# X_val = pd.DataFrame(X_val)
# X_test = pd.DataFrame(X_test)

In [5]:
# Preview only the first few training texts; displaying the whole array floods the notebook output
X_train[:3]

array(["GalaxyGamer: Hey, what's up? You ready to dive into this new game update? ShadowSlayer: Yeah, let's do it! Just need to blow off some steam after dealing with my annoying boss all day. You know how it is. GalaxyGamer: Totally get that. Work's been crazy for me too. But hey, at least we can escape here for a bit. By the way, did you catch that new sci-fi series everyone's talking about? ShadowSlayer: Nah, man. I don't waste my time on that garbage. It's just more brain-dead content for sheep. People eat that stuff up and think they're all cultured or something. GalaxyGamer: Oh, I mean, I thought it was pretty entertaining. But hey, each to their own, right? Anyway, let's see if we can conquer this raid today. ShadowSlayer: Yeah, let's crush it. Just hope we don't get stuck with any noobs. Can't stand when people act like they know what they're doing but just drag the team down. Like, just uninstall already. GalaxyGamer: Yeah, I hear you. We definitely need a good team to make it

In [6]:
# Read the merged conversations file, then keep only the message text and its label
conversations_raw = pd.read_csv("../Data/merged_all_conversations.csv")
data = conversations_raw[['message', 'label']]

In [7]:
from datasets import Dataset, DatasetDict


def _frame_for_split(texts, labels):
    """Pair texts with labels in a DataFrame and drop rows missing a value in either column."""
    return pd.DataFrame({"text": texts, "labels": labels}).dropna()


# Build one DataFrame per split
train_df = _frame_for_split(X_train, y_train)
val_df = _frame_for_split(X_val, y_val)
test_df = _frame_for_split(X_test, y_test)

# Convert each DataFrame to a Hugging Face Dataset (the pandas index is not kept)
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Bundle the three splits under their conventional names and display the summary
splits_by_name = {
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
}
dataset = DatasetDict(splits_by_name)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2074
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 259
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 260
    })
})

In [8]:
# Sanity check: tokenize the first training example, truncated/padded to max_length=512
tokenizer(dataset['train'][0]['text'], truncation=True, padding="max_length", max_length=512)

{'input_ids': [101, 9088, 16650, 2099, 1024, 4931, 1010, 2054, 1005, 1055, 2039, 1029, 2017, 3201, 2000, 11529, 2046, 2023, 2047, 2208, 10651, 1029, 6281, 24314, 1024, 3398, 1010, 2292, 1005, 1055, 2079, 2009, 999, 2074, 2342, 2000, 6271, 2125, 2070, 5492, 2044, 7149, 2007, 2026, 15703, 5795, 2035, 2154, 1012, 2017, 2113, 2129, 2009, 2003, 1012, 9088, 16650, 2099, 1024, 6135, 2131, 2008, 1012, 2147, 1005, 1055, 2042, 4689, 2005, 2033, 2205, 1012, 2021, 4931, 1010, 2012, 2560, 2057, 2064, 4019, 2182, 2005, 1037, 2978, 1012, 2011, 1996, 2126, 1010, 2106, 2017, 4608, 2008, 2047, 16596, 1011, 10882, 2186, 3071, 1005, 1055, 3331, 2055, 1029, 6281, 24314, 1024, 20976, 1010, 2158, 1012, 1045, 2123, 1005, 1056, 5949, 2026, 2051, 2006, 2008, 13044, 1012, 2009, 1005, 1055, 2074, 2062, 4167, 1011, 2757, 4180, 2005, 8351, 1012, 2111, 4521, 2008, 4933, 2039, 1998, 2228, 2027, 1005, 2128, 2035, 3226, 2094, 2030, 2242, 1012, 9088, 16650, 2099, 1024, 2821, 1010, 1045, 2812, 1010, 1045, 2245, 2009, 200

In [9]:
# Display the training split's summary (its features and row count)
dataset['train']

Dataset({
    features: ['text', 'labels'],
    num_rows: 2074
})

In [10]:
# Tokenize the text data
def tokenize_function(batch):
    """Tokenize a batch of examples for the sequence classifier.

    Args:
        batch: Mapping with a 'text' entry holding the raw strings of the batch,
            as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch; the map calls below add its fields
        (``input_ids`` and ``attention_mask``) as new dataset columns.
    """
    # Pad every example to the fixed max_length instead of `padding=True`: the latter pads
    # only to the longest sequence seen in each call to this function, so different map
    # batches could end up with different lengths and the default DataLoader collation
    # would then be unable to stack them. `return_tensors` is omitted because the tensor
    # conversion is handled later via `set_format`.
    return tokenizer(batch['text'], truncation=True, padding="max_length", max_length=512)

# Tokenize every split in batches
train_dataset = dataset['train'].map(tokenize_function, batched=True)
val_dataset = dataset['validation'].map(tokenize_function, batched=True)
test_dataset = dataset['test'].map(tokenize_function, batched=True)


print(train_dataset)

Map: 100%|██████████| 2074/2074 [00:00<00:00, 3996.49 examples/s]
Map: 100%|██████████| 259/259 [00:00<00:00, 3601.29 examples/s]
Map: 100%|██████████| 260/260 [00:00<00:00, 3660.14 examples/s]

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 2074
})





In [11]:
# Drop the raw 'text' column from the train and validation splits
train_dataset, val_dataset = (
    split.remove_columns(['text']) for split in (train_dataset, val_dataset)
)

In [15]:
# Expose only the model inputs and labels of each split, returned as PyTorch tensors
tensor_columns = ["input_ids", "attention_mask", "labels"]
for split in (train_dataset, val_dataset, test_dataset):
    # set_format updates the dataset object in place
    split.set_format(type="torch", columns=tensor_columns)

In [24]:
# # Convert to a PyTorch dataset (disabled)
# def convert_to_torch(data):
#     return {key: torch.tensor(val) for key, val in data.items() if key in ["input_ids", "attention_mask", "labels"]}
#
# train_dataset = list(map(convert_to_torch, train_dataset))
# val_dataset = list(map(convert_to_torch, val_dataset))
# test_dataset = list(map(convert_to_torch, test_dataset))


# Build the PyTorch DataLoaders
from torch.utils.data import DataLoader

BATCH_SIZE = 8  # examples per batch, shared by both loaders

# Only the training loader shuffles its examples
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [17]:
# Inspect the Python type of the first training example's input_ids
type(train_dataset[0]['input_ids'])

torch.Tensor

In [None]:
# Pull one batch from the training loader and report the type of each field
batch = next(iter(train_dataloader))
field_types = {name: type(value) for name, value in batch.items()}
print(field_types)

{'labels': <class 'torch.Tensor'>, 'input_ids': <class 'torch.Tensor'>, 'attention_mask': <class 'torch.Tensor'>}


In [25]:
# Sanity checks: inspect one training batch, then the label balance of each split

batch = next(iter(train_dataloader), None)
if batch is not None:
    batch_labels = batch["labels"]
    print(batch.keys())  # make sure `labels` is present
    print("Batch labels:", batch_labels)  # show this batch's labels
    print("Unique labels:", torch.unique(batch_labels))  # distinct label values in the batch

import numpy as np

# Collect every example's label per split
train_labels, val_labels = (
    np.array([example["labels"] for example in split])
    for split in (train_dataset, val_dataset)
)

# Count how many examples carry each label value
print("Train label distribution:", np.bincount(train_labels))
print("Val label distribution:", np.bincount(val_labels))

dict_keys(['labels', 'input_ids', 'attention_mask'])
Batch labels: tensor([1, 0, 1, 0, 1, 1, 1, 1])
Unique labels: tensor([0, 1])
Train label distribution: [ 940 1134]
Val label distribution: [117 142]


In [26]:
# Print the number of examples in the training split, then in the validation split
for split in (train_dataset, val_dataset):
    print(len(split))

2074
259


In [29]:
# Number of training epochs
epoch_num = 15

# Set up the optimizer (AdamW) and a linear learning-rate scheduler.
# The linear schedule decays the learning rate over the course of training.
from transformers import get_scheduler
from torch.optim import AdamW

# Every model parameter is handed to the optimizer, so use a small fine-tuning learning rate.
# The previous value of 0.01 is orders of magnitude above the usual range for fine-tuning a
# pretrained transformer (around 2e-5 to 5e-5) and would likely wreck the pretrained weights.
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_dataloader) * epoch_num  # batches per epoch x number of epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


In [None]:
# Fine-tune the model, recording train/validation accuracy after every epoch.
# NOTE(review): the original header said only the classifier layer is fine-tuned, but nothing
# in this cell freezes any parameters — confirm which behaviour is intended.
from tqdm import tqdm
from matplotlib import pyplot as plt

# Per-epoch accuracy history, used for the plot at the end
train_accuracies = []
val_accuracies = []
epochs = epoch_num  # number of training epochs

# Training loop
for epoch in range(epochs):

    model.train()

    correct_train = 0
    total_train = len(train_dataset)
    loop = tqdm(train_dataloader, leave=True)
    loop.set_description(f"Epoch {epoch+1}")

    for batch in loop:
        # Move every tensor of the batch to the training device
        batch = {key: val.to(device) for key, val in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)  # the batch includes `labels`, so the output carries a loss
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Advance the learning-rate schedule once per optimizer update. This call was missing
        # from the active loop (it only existed in an old commented-out copy), so the
        # scheduler was created but never applied.
        lr_scheduler.step()

        # Accumulate the number of correct training predictions
        preds = torch.argmax(outputs.logits, dim=-1)
        correct_train += (preds == batch["labels"]).sum().item()

        loop.set_postfix(loss=loss.item())

    train_acc = correct_train / total_train
    train_accuracies.append(train_acc)

    # Validation accuracy (no gradient tracking, model in eval mode)
    model.eval()
    correct_val = 0
    total_val = len(val_dataset)

    with torch.no_grad():
        for batch in val_dataloader:
            batch = {key: val.to(device) for key, val in batch.items()}

            outputs = model(**batch)

            preds = torch.argmax(outputs.logits, dim=-1)
            correct_val += (preds == batch["labels"]).sum().item()

    val_acc = correct_val / total_val
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}/{epochs} - Train Acc: {train_acc:.4f} - Val Acc: {val_acc:.4f}")

# Plot the training and validation accuracy curves
plt.figure(figsize=(8, 5))
plt.plot(range(1, epochs+1), train_accuracies, label="Train Accuracy", marker='o')
plt.plot(range(1, epochs+1), val_accuracies, label="Validation Accuracy", marker='s')
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Train vs Validation Accuracy")
plt.legend()
plt.grid(True)
plt.show()

  7%|▋         | 18/260 [00:04<01:01,  3.90it/s]