In [23]:
# Mount Google Drive at /content/drive inside the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
!pip install datasets



In [25]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
import os
from transformers import AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, classification_report

# Use the GPU when one is visible and fall back to the CPU otherwise, so the
# notebook still runs on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

### 载入数据

In [26]:
# Paths of the plain-text source files on the mounted Drive
file_path_true = "/content/drive/My Drive/fake news/test_nonrumor.txt"  # genuine (non-rumor) samples
file_path_fake = "/content/drive/My Drive/fake news/test_rumor.txt"  # fake (rumor) samples

# Read each file and split it into a list with one entry per line
# NOTE(review): splitlines() keeps blank lines as empty strings, so any blank
# line in the files becomes an empty-text sample.
with open(file_path_true, 'r', encoding='utf-8') as f:
    data_true = f.read().splitlines()

with open(file_path_fake, 'r', encoding='utf-8') as f:
    data_fake = f.read().splitlines()

# Build one labelled DataFrame per class; the scalar label is broadcast to every row
# NOTE(review): labels are the strings 'true' / 'fake'; presumably they must be
# mapped to integer class ids before being fed to a classifier — confirm.
df_true = pd.DataFrame({'text': data_true, 'label': 'true'})
df_fake = pd.DataFrame({'text': data_fake, 'label': 'fake'})

# Concatenate both classes into a single frame with a fresh 0..n-1 index
df = pd.concat([df_true, df_fake], ignore_index=True)

# Convert the frame into a Hugging Face `datasets` Dataset object
dataset = Dataset.from_pandas(df)

In [27]:
# Column names of the dataset and training hyper-parameters.
text_column = "text"
# The DataFrame built from the text files stores its labels in the 'label'
# column, so the label column name has to be "label".
label_column = "label"
# NOTE(review): bert-base-uncased is normally limited to 512 positions; confirm
# that 640 is intended before using this value for padding / truncation.
max_length = 640
lr = 0.01
num_epochs = 10
batch_size = 32

In [28]:
import math
from typing import List, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F


class LinearLoRA(nn.Module):
    """
    Linear layer whose frozen weight is augmented by a trainable low-rank update.

    The forward pass returns ``pretrained(x) + (lora_alpha / r) * lora_B(lora_A(dropout(x)))``.

    Args:
        in_dim: int = Input feature dimension of the linear layer
        out_dim: int = Output feature dimension of the linear layer
        r: int = Rank of the low-rank matrices A and B (must be 1 or greater)
        lora_alpha: int = Numerator of the scaling constant alpha / r
        lora_dropout: float = Dropout probability (between 0 and 1) applied to the input of the low-rank branch
    """

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        r: int = 8,
        lora_alpha: int = 16,
        lora_dropout: float = 0.,
    ):
        super().__init__()
        self.r, self.lora_alpha = r, lora_alpha
        self.lora_dropout = nn.Dropout(p=lora_dropout)

        # A rank below 1 is rejected before any low-rank matrix is built
        assert r > 0, "Variable 'r' is not greater than zero. Choose a rank of 1 or greater."

        # Stand-in for the original linear layer. Only its weight is frozen here;
        # the real weight / bias values are copied in by code outside this class.
        self.pretrained = nn.Linear(in_dim, out_dim, bias=True)
        self.pretrained.weight.requires_grad_(False)

        # Down-projection A (in_dim -> r), Kaiming-uniform initialised as in the Hugging Face PEFT library
        self.lora_A = nn.Linear(in_dim, r, bias=False)
        nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))

        # Up-projection B (r -> out_dim), zero initialised so the low-rank branch starts at zero
        self.lora_B = nn.Linear(r, out_dim, bias=False)
        nn.init.zeros_(self.lora_B.weight)

        # Constant factor applied to the low-rank branch
        self.scaling = lora_alpha / r

    def forward(self, x):
        base = self.pretrained(x)
        update = self.lora_B(self.lora_A(self.lora_dropout(x)))
        return base + update * self.scaling


def freeze_model(model):
    """Turns off gradients for every parameter whose name contains neither "lora" nor "classifier"."""
    for param_name, parameter in model.named_parameters():
        keep_as_is = "lora" in param_name or "classifier" in param_name
        if keep_as_is:
            # LoRA / classifier parameters are left untouched (not re-enabled either)
            continue
        parameter.requires_grad = False


def create_lora(module, r, lora_dropout, lora_alpha):
    """
    Converts a linear module to a LoRA linear module.

    Args:
        module: torch.nn.Linear = The linear layer whose weight (and bias, if present) is copied
        r: int = Rank of the low-rank matrices
        lora_dropout: float = Dropout probability of the low-rank branch
        lora_alpha: int = Numerator of the scaling constant alpha / r

    Returns:
        A LinearLoRA module, placed on the same device / dtype as `module`, whose
        `pretrained` layer holds a copy of `module`'s parameters.
    """
    k, d = module.weight.shape  # pytorch nn.Linear weights are transposed, that is why shape is (k, d) and not (d, k)
    lora = LinearLoRA(d, k, r, lora_dropout=lora_dropout, lora_alpha=lora_alpha)
    # Keep the replacement on the device / dtype of the layer it replaces, so a model
    # that was already moved to the GPU or cast to another precision stays consistent.
    lora = lora.to(device=module.weight.device, dtype=module.weight.dtype)
    with torch.no_grad():
        lora.pretrained.weight.copy_(module.weight)
        if module.bias is not None:
            lora.pretrained.bias.copy_(module.bias)
        else:
            # Source layer was built with bias=False: use a frozen all-zero bias
            # so the copied layer produces the same output as the source.
            lora.pretrained.bias.zero_()
            lora.pretrained.bias.requires_grad = False
    return lora


def add_lora_layers(
    model,
    module_names: Tuple=("query", "value"),
    r: int=8,
    lora_alpha: float=16,
    lora_dropout: float=0.1,
    ignore_layers: List[int]=[]
):
    """
        Replaces chosen linear modules with LoRA equivalents (the model is modified in place).

        Every nn.Dropout found in `model` has its probability set to 0 first.

        Args:
            model: torch.nn.Module = The PyTorch model to be used
            module_names: Tuple = A tuple containing the names of the linear layers to replace
                Ex. ("query") to replace the linear modules with "query" in the name --> bert.encoder.layer.0.attention.self.query
            r: int = An integer representing the rank of the low-rank approximated matrices
            lora_alpha: int = An integer representing the numerator of the scaling constant alpha / r
            lora_dropout: float = A float between 0 and 1 representing the dropout probability
            ignore_layers: list = A list with the indices of all BERT layers NOT to add LoRA modules
                (any child module whose name equals one of these indices is skipped)
        """
    module_types: Tuple=(nn.Linear,)
    # Child module names are strings ("0", "1", ...), so compare against stringified
    # indices. Computed once per call instead of once per child.
    ignore_layers_str = [str(i) for i in (ignore_layers or [])]

    # disable dropout in frozen layers
    for module in model.modules():
        if isinstance(module, nn.Dropout):
            module.p = 0.0
    # replace chosen linear modules with lora modules
    for name, module in model.named_children():
        if isinstance(module, module_types) and name in module_names:
            temp_lora = create_lora(module, r=r, lora_dropout=lora_dropout, lora_alpha=lora_alpha)
            setattr(model, name, temp_lora)
        elif name not in ignore_layers_str:
            # Recurse with keyword arguments: passing these positionally in the wrong
            # order would swap lora_alpha and lora_dropout on every recursion level.
            add_lora_layers(
                module,
                module_names=module_names,
                r=r,
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout,
                ignore_layers=ignore_layers,
            )


def unfreeze_model(model):
    """Marks every parameter of the model as trainable (requires_grad = True)."""
    for parameter in model.parameters():
        parameter.requires_grad_(True)


def create_linear(module):
    """
    Converts a LoRA linear module back to a linear module.

    The low-rank update is folded into the weight:
    ``W = pretrained.weight + (lora_B.weight @ lora_A.weight) * scaling``;
    the bias is copied from ``pretrained.bias``.

    Args:
        module: A LoRA module exposing `pretrained`, `lora_A`, `lora_B` and `scaling`

    Returns:
        torch.nn.Linear on the same device / dtype as `module.pretrained.weight`
    """
    base_weight = module.pretrained.weight
    k, d = base_weight.shape  # pytorch nn.Linear weights are transposed, that is why variables are k, d and not d, k
    # A freshly built nn.Linear is float32 on the CPU; move it to where the LoRA layer
    # lives so the merged layer can be dropped into the model without a device / dtype mismatch.
    linear = nn.Linear(d, k, bias=True).to(device=base_weight.device, dtype=base_weight.dtype)

    with torch.no_grad():
        merged_weight = base_weight + (module.lora_B.weight @ module.lora_A.weight) * module.scaling
        linear.weight.copy_(merged_weight)
        linear.bias.copy_(module.pretrained.bias)

    return linear


def merge_lora_layers(model, module_names: Tuple=("query", "value"), dropout=0.1):
    """
        Replaces LoRA modules with their merged plain linear equivalents (the model is modified in place).

        Every nn.Dropout found in `model` has its probability set to `dropout`.

        Args:
            model: torch.nn.Module = The PyTorch model to be used
            module_names: Tuple = A tuple containing the names of the LoRA layers to replace
                Ex. ("query") to replace the LoRA modules with "query" in the name --> bert.encoder.layer.0.attention.self.query
            dropout: float = A float between 0 and 1 representing the dropout probability
        """
    # enable dropout in frozen layers
    for module in model.modules():
        if isinstance(module, nn.Dropout):
            module.p = dropout
    # replace chosen lora modules with merged linear modules
    for name, module in model.named_children():
        if name in module_names and hasattr(module, "pretrained"):
            temp_linear = create_linear(module)
            setattr(model, name, temp_linear)
        else:
            # Forward the caller's dropout value; a hard-coded value here would
            # overwrite the probability set above in every nested module.
            merge_lora_layers(module, module_names=module_names, dropout=dropout)


In [29]:
!pip install transformers




In [30]:
from transformers import BertPreTrainedModel, BertModel, BertConfig
from torch import nn
import torch
from torch.utils.data import DataLoader, TensorDataset



class BertForSequenceClassificationWithLoRA(BertPreTrainedModel):
    """
    BERT encoder followed by dropout and a classification head.

    The head is a LinearLoRA layer when a non-empty `lora_config` is given and a
    plain nn.Linear otherwise. `forward` returns the raw logits tensor.

    Args:
        config: BertConfig providing hidden_size, num_labels and hidden_dropout_prob
        lora_config: optional dict of keyword arguments forwarded to LinearLoRA
            (an empty dict or None selects the plain nn.Linear head)
        pretrained_model: optional BertModel used as the encoder instead of a
            freshly initialised BertModel(config)
    """

    def __init__(self, config, lora_config=None, pretrained_model=None):
        super().__init__(config)
        if pretrained_model is None:
            self.bert = BertModel(config)
        else:
            self.bert = pretrained_model
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # lora_config is assumed to hold the keyword arguments LinearLoRA needs.
        # NOTE(review): LinearLoRA freezes its `pretrained` weight, which is randomly
        # initialised for this head, so only the low-rank update (and the bias) can
        # be trained — confirm this is intended.
        self.classifier = (
            LinearLoRA(config.hidden_size, config.num_labels, **lora_config)
            if lora_config
            else nn.Linear(config.hidden_size, config.num_labels)
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return classification logits computed from the hidden state of the first token position."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # First position of the last hidden state (the [CLS] token under the standard BERT tokenizer)
        pooled_output = outputs.last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

    # Helper that loads a pretrained encoder and attaches the classification head
    @classmethod
    def from_pretrained_with_lora(cls, model_name, num_labels, lora_config=None, **kwargs):
        """
        Build the model from a pretrained BERT checkpoint.

        Args:
            model_name: checkpoint name or path passed to `from_pretrained`
            num_labels: number of output classes of the classification head
            lora_config: optional dict of keyword arguments for the LinearLoRA head
            **kwargs: forwarded to both BertConfig.from_pretrained and BertModel.from_pretrained
        """
        # num_labels must be written into the config: the head size is read from
        # config.num_labels, which would otherwise stay at the checkpoint's default.
        config = BertConfig.from_pretrained(model_name, num_labels=num_labels, **kwargs)
        pretrained_model = BertModel.from_pretrained(model_name, **kwargs)
        model = cls(config, lora_config=lora_config, pretrained_model=pretrained_model)
        return model

# Usage example
lora_config = {'r': 8}  # example LoRA configuration; the dict is unpacked into LinearLoRA as keyword arguments
model = BertForSequenceClassificationWithLoRA.from_pretrained_with_lora("bert-base-uncased", num_labels=2, lora_config=lora_config)



In [43]:
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report
import torch

# 假设你有一个简单的Dataset类
import torch
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Map-style dataset over three parallel, index-aligned containers."""

    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids, self.attention_mask, self.labels = input_ids, attention_mask, labels

    def __len__(self):
        # The sample count is taken from input_ids alone
        return len(self.input_ids)

    def __getitem__(self, idx):
        # One dict per sample, keyed by field name
        fields = ('input_ids', 'attention_mask', 'labels')
        return {field: getattr(self, field)[idx] for field in fields}

# Example data (for illustration only; replace it with your real data)
# Each sample is assumed to be 32 token IDs
# NOTE(review): these tensors are random, so any metric computed on them only
# reflects chance; tokenize the real text dataset before drawing conclusions.
input_ids = torch.randint(0, 100, (6000, 32))  # 6000 samples, 32 token IDs each
attention_mask = torch.ones(6000, 32, dtype=torch.long)  # 6000 x 32 attention mask; all ones means no padding
labels = torch.randint(0, 2, (6000,))  # labels for the 6000 samples

# Create the Dataset instance
# NOTE(review): this rebinds `dataset`, which earlier held the Hugging Face
# dataset built from the text files.
dataset = MyDataset(input_ids, attention_mask, labels)

# Create the DataLoader instance
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# This dataloader can now be passed to the evaluate function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def evaluate(model, dataloader, device):
    """
    Run the model over a dataloader without gradients and collect predictions.

    Args:
        model: module called as ``model(input_ids, attention_mask=...)``. It may return
            a logits tensor, a tuple whose first element is the logits, or an
            object exposing a ``.logits`` attribute.
        dataloader: iterable of dict batches with the keys 'input_ids',
            'attention_mask' and 'labels'
        device: device the model and every batch are moved to

    Returns:
        (total_preds, total_labels): two flat lists holding the predicted class
        index and the label of every sample, in iteration order.

    Note:
        The model is switched to eval mode and moved to `device`; neither is undone.
    """
    model.eval()
    model.to(device)  # make sure the model lives on the requested device
    total_preds = []
    total_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            if isinstance(outputs, tuple):
                # Tuple output (e.g. logits plus extra values): logits come first
                logits = outputs[0]
            else:
                # Output objects carry the logits in `.logits`; a bare tensor has
                # no such attribute and is used as the logits directly.
                logits = getattr(outputs, "logits", outputs)
            preds = logits.argmax(dim=1)
            total_preds.extend(preds.cpu().numpy())
            total_labels.extend(labels.cpu().numpy())
    return total_preds, total_labels

# Pick the device first: GPU when available, CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Run the evaluation loop on that device
total_preds, total_labels = evaluate(model, dataloader, device)

# Compute the accuracy and the per-class report (requires sklearn.metrics)
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_true=total_labels, y_pred=total_preds)
report = classification_report(y_true=total_labels, y_pred=total_preds)
print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.49983333333333335
              precision    recall  f1-score   support

           0       0.50      1.00      0.67      3001
           1       0.46      0.00      0.01      2999

    accuracy                           0.50      6000
   macro avg       0.48      0.50      0.34      6000
weighted avg       0.48      0.50      0.34      6000

