In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import pickle
import torch
from torch import tensor
from torch.utils.data import DataLoader, TensorDataset, Dataset
from transformers import (
    BertTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
    BertForSequenceClassification,
)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
class DataProcessing(Dataset):
    """Turn a raw text DataFrame into DataLoaders that a BERT classifier can consume.

    The usual entry point is :meth:`get_dataloader`. After it has run, the
    instance can also be used as a map-style ``Dataset`` over the preprocessed
    rows (``len(obj)`` / ``obj[i]``).

    Args:
        model_path: Name or path handed to ``BertTokenizer.from_pretrained``.
    """

    def __init__(self, model_path: str):
        self.model_path = model_path
        # The attributes below are filled in by get_dataloader(); they are
        # declared here so that using the object too early fails clearly.
        self.data = None
        self.max_length = None
        self.label_encoder = None
        self._tokenizer = None

    def _get_tokenizer(self):
        """Load the tokenizer on first use and reuse it on later calls."""
        if getattr(self, "_tokenizer", None) is None:
            self._tokenizer = BertTokenizer.from_pretrained(self.model_path)
        return self._tokenizer

    def _preprocess_data(
        self,
        data: pd.DataFrame,
        text_col: int,
        label_col: int,
        label_count_least: int,
        random_seed: int,
    ):
        """Clean the raw frame and, for labelled data, balance the classes.

        Args:
            data: Raw input frame.
            text_col: Positional index of the text column.
            label_col: Positional index of the label column.
            label_count_least: Minimum number of rows a label needs to be kept.
            random_seed: Seed used when down-sampling each label.

        Returns:
            For frames with fewer than two columns: the text column as a Series.
            Otherwise a DataFrame with ``text`` and integer-encoded ``label``
            columns holding the same number of rows for every kept label.

        Raises:
            ValueError: If no label reaches ``label_count_least`` rows.

        Note:
            Duplicate and NaN rows are dropped first, so the result can have
            fewer rows than the input.
        """
        data = data.drop_duplicates().dropna()

        # Unlabelled (single-column) input: nothing to encode or balance.
        if len(data.columns) < 2:
            return data.iloc[:, text_col]

        # Pick both columns by position in one step, so that an existing column
        # that happens to be called "text"/"label" cannot be overwritten before
        # it has been read.
        data = pd.DataFrame(
            {
                "text": data.iloc[:, text_col].to_numpy(),
                "label": data.iloc[:, label_col].to_numpy(),
            }
        )

        # Add label_count_least copies of the wake-word sample.
        extra_rows = max(label_count_least, 0)
        if extra_rows > 0:
            wake_word_df = pd.DataFrame(
                {"text": ["笨笨同学"] * extra_rows, "label": ["唤醒词"] * extra_rows}
            )
            data = pd.concat([data, wake_word_df], ignore_index=True)

        # Keep labels with at least label_count_least rows. The comparison is
        # inclusive: the wake-word class added above has exactly that many rows
        # and a strict ">" would throw it away again.
        label_value_count = data["label"].value_counts()
        kept_labels = label_value_count[label_value_count >= label_count_least].index
        data = data[data["label"].isin(kept_labels)].copy()
        if data.empty:
            raise ValueError(
                f"No label has at least {label_count_least} rows; nothing left to train on"
            )

        # Encode labels as integers; the fitted encoder is kept for later export.
        self.label_encoder = LabelEncoder()
        data["label"] = self.label_encoder.fit_transform(data["label"])

        # Down-sample every label to the size of the rarest one.
        min_label_count = data["label"].value_counts().min()
        sampled_parts = [
            data[data["label"] == label].sample(
                n=min_label_count, random_state=random_seed
            )
            for label in data["label"].unique()
        ]
        return pd.concat(sampled_parts)

    def __len__(self):
        """Number of preprocessed rows (0 before get_dataloader() has run)."""
        return 0 if self.data is None else len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at positional ``index`` of the preprocessed data.

        Returns:
            ``(input_ids, attention_mask, token_type_ids[, label])`` for that
            single row, without the leading batch dimension.

        Raises:
            IndexError: If no data has been loaded or ``index`` is out of range.
        """
        if self.data is None:
            raise IndexError("No data loaded; call get_dataloader() first")
        # Positional lookup: the index of the sampled frame is not contiguous.
        row = self.data.iloc[[index]]
        return tuple(t.squeeze(0) for t in self._tokenize(self.max_length, row))

    def _tokenize(self, max_length, data):
        """Tokenize ``data`` into fixed-length tensors.

        Args:
            max_length: Length every sequence is padded/truncated to.
            data: A DataFrame with ``text`` and ``label`` columns, or any
                iterable of texts (no labels).

        Returns:
            ``(input_ids, attention_mask, token_type_ids, labels)`` for a
            DataFrame, otherwise the first three tensors only.
        """
        has_labels = isinstance(data, pd.DataFrame)
        if has_labels:
            text = list(data["text"].values)
            label = list(data["label"].values)
        else:
            text = list(data)

        tokenized_sentence = self._get_tokenizer()(
            text,
            return_tensors="pt",  # return PyTorch tensors
            max_length=max_length,  # maximum sequence length
            padding="max_length",  # pad every sequence up to max_length
            truncation=True,
        )

        tokenized_ids = tokenized_sentence["input_ids"]
        tokenized_mask = tokenized_sentence["attention_mask"]
        tokenized_type_ids = tokenized_sentence["token_type_ids"]

        if has_labels:
            return (
                tokenized_ids,
                tokenized_mask,
                tokenized_type_ids,
                tensor(label).long(),
            )
        return tokenized_ids, tokenized_mask, tokenized_type_ids

    def _to_dataloader(self, batch_size, max_length, data):
        """Tokenize ``data`` and wrap the tensors in an ordered DataLoader."""
        tensors = self._tokenize(max_length, data)
        dataset = TensorDataset(*tensors)
        # The tensors already live in memory, so no worker process is needed.
        return DataLoader(
            dataset=dataset, batch_size=batch_size, shuffle=False, num_workers=0
        )

    def get_dataloader(
        self,
        data: pd.DataFrame,
        text_loc: int = 0,
        label_loc: int = 1,
        label_count_least: int = 150,
        random_seed: int = 42,
        batch_size: int = 5,
        test_size: float = 0,
        max_length: int = 100,
    ):
        """Preprocess ``data`` and return DataLoader(s) for it.

        Args:
            data: Input frame.
            text_loc: Positional index of the text column.
            label_loc: Positional index of the label column.
            label_count_least: Minimum number of rows a label needs to be kept.
            random_seed: Seed for sampling and for the train/test split.
            batch_size: Batch size of the returned loader(s).
            test_size: Test split ratio. When it is 0 a single loader is
                returned, otherwise a ``(train, test)`` pair.
            max_length: Length sequences are padded/truncated to.

        Returns:
            One DataLoader, or a ``(train_loader, test_loader)`` tuple.
        """
        data = self._preprocess_data(
            data, text_loc, label_loc, label_count_least, random_seed
        )
        # Remember what was prepared so __len__/__getitem__ have something to serve.
        self.data = data
        self.max_length = max_length

        if test_size == 0:
            return self._to_dataloader(batch_size, max_length, data)

        train_data, test_data = train_test_split(
            data, test_size=test_size, random_state=random_seed
        )
        train_dataloder = self._to_dataloader(batch_size, max_length, train_data)
        test_dataloder = self._to_dataloader(batch_size, max_length, test_data)
        return train_dataloder, test_dataloder

    def _require_label_encoder(self):
        """Return the fitted encoder, or fail with an explanatory message."""
        encoder = getattr(self, "label_encoder", None)
        if encoder is None:
            # AttributeError is what callers got before, when the attribute
            # simply did not exist yet.
            raise AttributeError(
                "label_encoder is not fitted; call get_dataloader() on labelled data first"
            )
        return encoder

    def get_label_count(self):
        """Return the number of distinct labels seen by the fitted encoder."""
        return self._require_label_encoder().classes_.shape[0]

    def save_label_encoder(self, path: str = "output/label_encoder.pkl"):
        """Pickle the fitted label encoder to ``path``, creating its directory."""
        encoder = self._require_label_encoder()
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(path, "wb") as f:
            pickle.dump(encoder, f)

In [None]:
class ClassifyModel:
    """Fine-tune, save and run a BERT sequence-classification model."""

    def __init__(self):
        # Set by train_process(); None until a model has been trained.
        self.model = None

    def flat_accuracy(self, preds, labels):
        """Return the accuracy of the arg-max of ``preds`` against ``labels``.

        Args:
            preds: 2-D array of per-class scores (arg-max is taken over axis 1).
            labels: Array of true class ids.
        """
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return accuracy_score(labels_flat, pred_flat)

    def train_process(
        self,
        model_path: str,
        num_labels: int,
        epochs: int,
        train_dataloader: DataLoader,
        valid_dataloader: DataLoader,
        learning_rate: float = 5e-5,
        device: str = "cpu",
    ):
        """Fine-tune a pretrained BERT classifier and report per-epoch metrics.

        Batches are expected to be indexable with ``input_ids`` at position 0,
        ``attention_mask`` at position 1 and labels at position 3.

        Args:
            model_path: Name or path of the pretrained model to start from.
            num_labels: Number of output classes.
            epochs: Number of passes over ``train_dataloader``.
            train_dataloader: Batches used for optimisation.
            valid_dataloader: Batches used for the validation loss/accuracy.
            learning_rate: Initial learning rate (decayed linearly to 0).
            device: Device the model and every batch are moved to.
        """
        self.model = BertForSequenceClassification.from_pretrained(
            model_path,
            num_labels=num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )
        # Model and batches must be on the same device; following `device`
        # (rather than "CUDA if available") keeps them in agreement.
        self.model.to(device)

        total_steps = len(train_dataloader) * epochs
        # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
        # weight_decay are set to the values the transformers version defaulted to.
        optimizer = torch.optim.AdamW(
            self.model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps
        )

        for epoch in range(epochs):
            self.model.train()
            train_loss, valid_loss = 0, 0
            valid_correct, valid_seen = 0.0, 0

            train_pbar = tqdm(
                train_dataloader,
                total=len(train_dataloader),
                desc=f"Epoch {epoch+1}/{epochs}",
            )
            for batch in train_pbar:
                self.model.zero_grad()
                input_ids = batch[0].to(device)
                attention_mask = batch[1].to(device)
                labels = batch[3].to(device)
                outputs = self.model(
                    input_ids, attention_mask=attention_mask, labels=labels
                )
                loss = outputs[0]
                train_loss += loss.item()
                loss.backward()
                # Clip only after backward(): before it there are no fresh
                # gradients to clip.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

            self.model.eval()
            valid_pbar = tqdm(
                valid_dataloader,
                total=len(valid_dataloader),
                desc=f"Epoch {epoch+1}/{epochs}",
            )
            # No gradients are needed for evaluation.
            with torch.no_grad():
                for batch in valid_pbar:
                    input_ids = batch[0].to(device)
                    attention_mask = batch[1].to(device)
                    labels = batch[3].to(device)
                    outputs = self.model(
                        input_ids, attention_mask=attention_mask, labels=labels
                    )
                    valid_loss += outputs[0].item()
                    logits = outputs[1].detach().cpu().numpy()
                    label_ids = labels.to("cpu").numpy()
                    # Weight each batch by its size so a short final batch
                    # does not skew the epoch accuracy.
                    n_samples = label_ids.shape[0]
                    valid_correct += self.flat_accuracy(logits, label_ids) * n_samples
                    valid_seen += n_samples

            # max(..., 1) keeps the report alive when a loader is empty.
            n_train_batches = max(len(train_dataloader), 1)
            n_valid_batches = max(len(valid_dataloader), 1)
            print(f"Train Epoch: {epoch+1}")
            print(f"Training Loss: {train_loss/n_train_batches:.3f}")
            print(f"Validation Loss: {valid_loss/n_valid_batches:.3f}")
            print(f"Validation Accuracy: {valid_correct/max(valid_seen, 1):.3f}")
            print("\n")

    def save_model(self, path):
        """Serialise the whole trained model object to ``path`` with ``torch.save``.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        if self.model is None:
            raise RuntimeError("No model to save; call train_process() first")
        torch.save(self.model, path)

    def predict(self, path, dataloader: DataLoader, device: str = "cpu"):
        """Load a model saved by :meth:`save_model` and predict class ids.

        Args:
            path: File written by ``torch.save(model, path)``.
            dataloader: Batches with ``input_ids`` at position 0 and
                ``attention_mask`` at position 1.
            device: Device the model is mapped to and batches are moved to.

        Returns:
            1-D numpy array with one predicted class id per input row (empty
            when the dataloader yields no batches).
        """
        target = torch.device(device)
        # SECURITY: this unpickles a complete model object, which can execute
        # arbitrary code. Only load files from a trusted source.
        try:
            # Newer torch defaults to weights_only=True, which rejects a fully
            # pickled model, so the full-object load is requested explicitly.
            model = torch.load(path, map_location=target, weights_only=False)
        except TypeError:
            # Older torch versions do not know the weights_only argument.
            model = torch.load(path, map_location=target)
        model.eval()

        batch_logits = []
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch[0].to(device)
                attention_mask = batch[1].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                batch_logits.append(outputs[0].detach().cpu().numpy())

        if not batch_logits:
            return np.array([], dtype=np.int64)

        return np.argmax(np.concatenate(batch_logits, axis=0), axis=1)

In [None]:
import configparser
import platform

# Build the path with os.path.join so it is valid on Linux as well; a literal
# backslash is only a separator on Windows (and "\c" is an invalid escape).
config_path = os.path.join("04_aiui_text_cls", "config.ini")

conf = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse, so an empty result means the config was not loaded.
if not conf.read(config_path, encoding="utf-8"):
    raise FileNotFoundError(f"Config file not found or unreadable: {config_path}")

# Base directory differs per operating system.
base_path = conf.get(
    "Path", "base_window_path" if platform.system() == "Windows" else "base_linux_path"
)

# Data configuration
train_data_path = os.path.join(base_path, conf.get("DataConfig", "train_data_file"))
test_data_path = os.path.join(base_path, conf.get("DataConfig", "test_data_file"))
text_loc = conf.getint("DataConfig", "text_loc")
label_loc = conf.getint("DataConfig", "label_loc")
label_count_least = conf.getint("DataConfig", "label_count_least")
random_seed = conf.getint("DataConfig", "random_seed")
test_size = conf.getfloat("DataConfig", "test_size")
max_length = conf.getint("DataConfig", "max_length")
batch_size = conf.getint("DataConfig", "batch_size")
label_encode_path = os.path.join(base_path, conf.get("DataConfig", "label_encode_path"))

# Model configuration
bert_model_path = os.path.join(base_path, conf.get("ModelConfig", "bert_model_path"))
epochs = conf.getint("ModelConfig", "epochs")
learning_rate = conf.getfloat("ModelConfig", "learning_rate")
# Train on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
output_model_path = os.path.join(
    base_path, conf.get("ModelConfig", "output_model_path")
)

In [None]:
# Train the model

# get_dataloader() returns a (train, valid) pair only for a non-zero test_size;
# with 0 it returns a single loader and the unpacking below would misbehave.
if test_size == 0:
    raise ValueError("test_size must be non-zero: training needs a validation split")

# Data processing
train_data = pd.read_csv(train_data_path, sep=";", header=0)
DP = DataProcessing(bert_model_path)
train_dataloder, valid_dataloader = DP.get_dataloader(
    data=train_data,
    text_loc=text_loc,
    label_loc=label_loc,
    label_count_least=label_count_least,
    random_seed=random_seed,
    batch_size=batch_size,
    test_size=test_size,
    max_length=max_length,
)

# Save the label mapping
DP.save_label_encoder(label_encode_path)

# Model training
num_labels = DP.get_label_count()
classify_model = ClassifyModel()
classify_model.train_process(
    model_path=bert_model_path,
    num_labels=num_labels,
    epochs=epochs,
    train_dataloader=train_dataloder,
    valid_dataloader=valid_dataloader,
    learning_rate=learning_rate,
    device=device,
)

# Save the fine-tuned model together with its tokenizer
tokenizer = BertTokenizer.from_pretrained(bert_model_path)
classify_model.model.save_pretrained(output_model_path)
tokenizer.save_pretrained(output_model_path)

In [None]:
from utils import TextCls

# Evaluate the classifier on the held-out test file
TC = TextCls()
data = pd.read_csv(test_data_path, sep=";", header=0)
data["predict"] = TC.htw_text_cls(list(data["text"]))

# The aggregation below needs an "is_correct" column; derive it from the
# prediction unless the CSV already provides one.
# NOTE(review): assumes htw_text_cls returns labels in the same representation
# as the "label" column — confirm against utils.TextCls.
if "is_correct" not in data.columns:
    data["is_correct"] = data["predict"] == data["label"]

# Per-label sample count and number of correct predictions
result = (
    data.groupby(["label"]).agg({"text": "count", "is_correct": "sum"}).reset_index()
)
result["correct_rate"] = round(result["is_correct"] / result["text"] * 100, 2)
print(
    "avg_accucy: ", round(data["is_correct"].sum() / data["text"].count() * 100, 2), "%"
)
# Worst-performing labels first
result.sort_values(by="correct_rate", ascending=True)