In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import DataLoader
import numpy as np
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Sequence, WhitespaceSplit, Punctuation, Digits
import re
import jieba
import pickle
import torch.utils.checkpoint as checkpoint

训练tokenizers

In [43]:
def load_data(xml_text):
    """从XML格式文本中加载句子和标签"""
    import re
    
    sentences = []
    labels = []
    
    # 使用正则表达式提取每个review
    pattern = r'<review id="\d+"\s+label="(\d)">(.*?)</review>'
    matches = re.findall(pattern, xml_text, re.DOTALL)
    
    for label, text in matches:
        sentences.append(text.strip())
        labels.append(int(label))
    
    return sentences, labels
def load_data_from_file(file_path):
    """从XML文件加载数据"""
    with open(file_path, 'r', encoding='utf8',errors='ignore') as f:
        xml_text = f.read()
    return load_data(xml_text)
sentences_cn,labels_cn=load_data_from_file("Sentiment Classification with Deep Learning/test.label.cn.txt")
sentences_en,labels_en=load_data_from_file("Sentiment Classification with Deep Learning/test.label.en.txt")
sentences=sentences_cn+sentences_en
labels=labels_cn+labels_en
sentences_ids=[]
for i in sentences:
    sentences_ids.append(tokenizer.encode(i, add_special_tokens=False).ids)
def test(model,sentences_ids,labels,loss_F,device="cuda"):
    labels_tensor=[]
    for i in labels:
        if i==1:
            labels_tensor.append(torch.tensor([[1.,0.]],device=device))
        else:
            labels_tensor.append(torch.tensor([[0.,1.]],device=device))
    total_loss=0
    for i in range(len(sentences_ids)):
        model.eval()
        model.to(device=device)
        _,out=model.forward(torch.tensor([sentences_ids[i]]).to(device=device))
        total_loss+=(loss_F(out,labels_tensor[i]).to(device=device)).mean().cpu()/len(sentences_ids)
    return float(total_loss)

In [6]:
import re
def load_reviews(file_path):
    """加载评论数据并提取句子"""
    reviews = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    
    # 使用正则表达式提取所有<review>标签内的内容
    pattern = r'<review[^>]*>(.*?)</review>'
    matches = re.findall(pattern, content, re.DOTALL)
    
    # 清理提取的文本（去除首尾空白，合并换行）
    for match in matches:
        # 去除空白字符，并合并多行
        cleaned_text = match.strip()
        reviews.append(cleaned_text)
    
    return reviews

In [7]:
data1=load_reviews("./evaltask2_sample_data/cn_sample_data/sample.positive.txt")
data_t=list(torch.tensor([[1,0]]).repeat(len(data1),1))
print(len(data1))
data1=data1+load_reviews("./evaltask2_sample_data/cn_sample_data/sample.negative.txt")
data_t=data_t+list(torch.tensor([[0,1]]).repeat(len(data1)-len(data_t),1))
print(len(data1))
data1=data1+load_reviews("./evaltask2_sample_data/en_sample_data/sample.positive.txt")
data_t=data_t+list(torch.tensor([[1,0]]).repeat(len(data1)-len(data_t),1))
print(len(data1))
data1=data1+load_reviews("./evaltask2_sample_data/en_sample_data/sample.negative.txt")
data_t=data_t+list(torch.tensor([[0,1]]).repeat(len(data1)-len(data_t),1))
print(len(data1))

5000
10000
14987
19985


In [None]:
f=open("cn_Whitespace.txt","a",encoding="utf8")
for i in data1:
    seg_list = jieba.lcut(i)
    f.write(" ".join(seg_list)+"\n")
f.close()

f=open("en_Whitespace.txt","a",encoding="utf8")
for i in data2:
    f.write(i.replace("\n", "")+"\n")
f.close()

In [20]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))  # 使用BPE算法
tokenizer.pre_tokenizer = Sequence([
    WhitespaceSplit(),
    Punctuation(behavior="isolated"),
    Digits(individual_digits=False),
])

trainer = BpeTrainer(
    vocab_size=30000,  # 目标词表大小
    special_tokens=["[UNK]","[PAD]"], # 特殊标记
    min_frequency=10  # 最小出现频次
)

files = ["cn_Whitespace.txt","en_Whitespace.txt"]  # 你的训练文本路径
tokenizer.train(files, trainer)

tokenizer.save("./my_tokenizer.json")  # 保存
# 加载
tokenizer = Tokenizer.from_file("./my_tokenizer.json")

构建训练集

In [2]:
tokenizer = Tokenizer.from_file("./my_tokenizer.json")

In [8]:
from tqdm import tqdm
def bucket_and_pad_with_dynamic_buckets(texts0, tokenizer, pad_id=1, min_bucket=8):
    """
    动态计算桶大小并分桶填充
    """
    data_t=texts0[1]
    texts=texts0[0]
    
    # 计算最大长度
    max_len = max(len(tokenizer.encode(text, add_special_tokens=False)) for text in tqdm(texts))
    
    # 生成桶大小（2的幂次方，直到大于最大长度）
    bucket_sizes = []
    size = min_bucket
    while size < max_len:
        bucket_sizes.append(size)
        size *= 2
    bucket_sizes.append(max(size, max_len))
    
    # 分桶填充
    buckets = {size: [] for size in bucket_sizes}
    
    for idx,text in tqdm(enumerate(texts)):
        tokens = tokenizer.encode(text, add_special_tokens=False).ids
        token_len = len(tokens)
        
        # 选择桶大小
        for size in bucket_sizes:
            if token_len <= size:
                padded = tokens + [pad_id] * (size - token_len)
                buckets[size].append([padded,data_t[idx]])
                break
    
    return buckets, bucket_sizes

In [9]:
bucket,bucket_sizes=bucket_and_pad_with_dynamic_buckets((data1,data_t),tokenizer)

100%|██████████████████████████████████████████████████████████████████████████| 19985/19985 [00:02<00:00, 8520.97it/s]
19985it [00:02, 8369.46it/s] 


In [10]:
tokenizer.encode('请问这机不是有个遥控器的吗？', add_special_tokens=False).tokens

['请问', '这机', '不是', '有个', '遥', '控器', '的', '吗', '？']

In [11]:
bucket_sizes

[8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]

In [None]:
###删除过长序列
bucket.pop(8192)

In [13]:
import random

def create_batches_from_buckets_with_shuffle(buckets, batch_size=64, pad_token_id=1, shuffle=True):
    all_batches = []
    
    for bucket_size, texts_in_bucket in buckets.items():
        if not texts_in_bucket:
            continue
        
        # 如果需要，打乱当前桶内的文本顺序
        if shuffle:
            random.shuffle(texts_in_bucket)
        
        # 按batch_size分组
        for i in range(0, len(texts_in_bucket), batch_size):
            batch_texts = texts_in_bucket[i:i+batch_size]
            # 创建attention mask
            attention_masks = [
                [1 if token != pad_token_id else 0 for token in text_tokens[0]]
                for text_tokens in batch_texts
            ]
            
            all_batches.append((batch_texts, attention_masks))
    
    # 如果需要，打乱所有batch的顺序
    if shuffle:
        random.shuffle(all_batches)
    
    return all_batches

In [14]:
all_batches = create_batches_from_buckets_with_shuffle(bucket)

保存构建好的训练集

In [17]:
import pickle
with open('file.pkl', 'wb') as f:
    pickle.dump(all_batches, f)  # 序列化到文件

训练

In [3]:
import pickle
with open('file.pkl', 'rb') as f:
    all_batches = pickle.load(f)  # 从文件反序列化


In [4]:
from torch.utils.data import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm
import torch.utils.checkpoint as checkpoint

class DINOImageDataset(Dataset):
    """DINO训练的自定义数据集"""
    def __init__(self, all_batches, transform=None):
        super(DINOImageDataset, self).__init__()
        self.data = all_batches
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):

        return self.data[idx]
train_loader=DINOImageDataset(all_batches)

In [5]:
class ImprovedLSTMModel(nn.Module):
    def __init__(self, input_size=128, hidden_size=96, num_layers=2, 
                 output_size=256, dropout=0., vocab_size=30000, embedding_dim=128):
        super(ImprovedLSTMModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim,padding_idx=1)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # 添加嵌入层的dropout
        self.embedding_dropout = nn.Dropout(dropout)
        
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=True
        )
        
        # 添加更多全连接层
        self.fc1 = nn.Linear(hidden_size*4, 512)
        self.silu = nn.SiLU()
        self.layer_norm = nn.LayerNorm(512)
        self.fc2 = nn.Linear(512, output_size)
        
        self.fc3 = nn.Linear(output_size, 512)
        self.layer_norm = nn.LayerNorm(512)
        self.fc4 = nn.Linear(512, 2)
        self.softmax = nn.Softmax(dim = -1)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.embedding_dropout(x)
        
        batch_size = x.size(0)

        
        lstm_out, (hidden, cell) = checkpoint.checkpoint(self.lstm,x,use_reentrant=False)
        hidden_final = torch.cat((hidden[-2, :, :], hidden[-1, :, :],F.adaptive_max_pool1d(lstm_out.permute(0, 2, 1), output_size=1).squeeze(2)), dim=1)
        # 更多非线性变换
        out = self.fc1(hidden_final)
        out = self.silu(out)
        out = self.layer_norm(out)
        out = self.fc2(out)

        out0 = self.fc3(out)
        out0 = self.silu(out0)
        out0 = self.layer_norm(out0)
        out0 = self.fc4(out0)
        out0 = self.softmax(out0)
        return F.normalize(out),out0

In [6]:
# ============== 1. 定义核心组件 ==============

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import DataLoader
import numpy as np
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Sequence, WhitespaceSplit, Punctuation, Digits
import re
import jieba
import pickle
import torch.utils.checkpoint as checkpoint
class MCRLoss(nn.Module):
    def __init__(self, ncrops, reduce_cov=0, expa_type=0, eps=0.1, coeff=0.2):
        super().__init__()
        self.ncrops = ncrops
        self.eps = eps
        self.coeff = coeff
        self.reduce_cov = reduce_cov
        self.expa_type = expa_type

    def forward(self, student_feat, teacher_feat):
        """
        Expansion Loss and Compression Loss between features of the teacher and student networks.
        """
        student_feat = student_feat.view(self.ncrops, -1, student_feat.shape[-1])
        teacher_feat = teacher_feat.view(2, -1, teacher_feat.shape[-1])
        
        comp_loss = self.calc_compression(student_feat, teacher_feat)
        if self.expa_type == 0: # only compute expansion on global views
            expa_loss = self.calc_expansion(student_feat[:len(teacher_feat)])
        elif self.expa_type == 1:
            expa_loss = self.calc_expansion((student_feat[:len(teacher_feat)]+teacher_feat)/2)
        loss = - self.coeff * comp_loss - expa_loss
        return loss, comp_loss.detach(), expa_loss.detach()
    
    def calc_compression(self, student_feat_list, teacher_feat_list):
        """
        Compute compression loss between student and teacher features.
        """
        # Convert lists of tensors to a single tensor for vectorized operations
        
        sim = F.cosine_similarity(teacher_feat_list.unsqueeze(1), student_feat_list.unsqueeze(0), dim=-1)
        sim.view(-1, sim.shape[-1])[:: (len(student_feat_list) + 1), :].fill_(0)  # Trick to fill diagonal
        
        n_loss_terms = len(teacher_feat_list)* len(student_feat_list) - min(len(teacher_feat_list), len(student_feat_list))
        # Sum the cosine similarities
        comp_loss = sim.mean(2).sum()/n_loss_terms
        # global_comp_loss = (sim[:, :len(teacher_feat_list)].mean(2).sum()).detach_().div_(len(teacher_feat_list))
        return comp_loss
    
    def calc_expansion(self, feat_list) -> torch.Tensor:
        """
        Compute expansion loss using Coding Rate estimation.
        """
        cov_list = []
        num_views = len(feat_list)
        m, p = feat_list[0].shape
        
        cov_list = [W.T.matmul(W) for W in feat_list]
        cov_list = torch.stack(cov_list)
        N=1

        scalar = p / (m * N * self.eps)
        I = torch.eye(p, device=cov_list[0].device)
        loss:torch.Tensor = 0
        for i in range(num_views):
            loss += torch.linalg.cholesky_ex(I + scalar * cov_list[i])[0].diagonal().log().sum()
        loss /= num_views
        loss *= (p+N*m)/(p*N*m) # the balancing factor gamma, you can also use the next line. This is ultimately a heuristic, so feel free to experiment.
        # loss *= ((self.eps * N * m) ** 0.5 / p)
        return loss
def monitor_features(feats):
    # 特征范数（应该接近1，因为L2归一化）
    norms = torch.norm(feats, dim=1).mean().item()
    
    # 特征间的平均余弦相似度（应该适中，不是0也不是1）
    sim_matrix = feats @ feats.T
    avg_sim = (sim_matrix.sum() - feats.shape[0]) / (feats.shape[0] * (feats.shape[0]-1))
    
    # 特征协方差矩阵的秩（应该接近特征维度）
    cov = feats.T @ feats / feats.shape[0]
    rank = torch.linalg.matrix_rank(cov).item()
    
    return norms, avg_sim, rank
# ============== 2. 数据增强 ==============

def generate_sorted_random(k, min_val, max_val, device='cpu'):
    n = max_val - min_val
    if k > n:
        raise ValueError(f"k={k}不能大于范围大小{n}")
    indices = torch.randperm(n, device=device)[:k]
    sorted_indices = torch.sort(indices).values + min_val
    return sorted_indices
def remove_elements(tensor, indices_to_remove):
    mask = torch.ones(len(tensor), dtype=torch.bool)
    mask[indices_to_remove] = False
    return tensor[mask]
#def random_remove_elements_batch_vectorized(tensor, mask, remove_ratio=0.2, seed=None, device='cuda'):
def random_remove_elements_batch_vectorized(tensor, mask, remove_ratio=0.2, seed=None, device='cuda'):
    """
    更快的批量随机移除元素版本，使用完全的向量化操作
    """
    if device == 'cuda' and torch.cuda.is_available():
        tensor = tensor.to(device)
        mask = mask.to(device)
    
    batch_size, seq_len = tensor.shape[:2]
    remove_num = int(seq_len * remove_ratio)
    
    if seed is not None:
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
    mask=mask>0
    # 生成随机矩阵用于选择要移除的位置
    random_matrix = torch.rand(batch_size, seq_len, device=device)
    
    # 将无效位置（mask=False）的随机值设为很大，确保不会被选为最小
    random_matrix = random_matrix.masked_fill(~mask, float('inf'))
    
    # 对于每个样本，选择remove_num个随机值最小的位置
    _, remove_indices = torch.topk(random_matrix, remove_num, dim=1, largest=False)
    
    # 创建移除掩码
    remove_mask = torch.zeros((batch_size, seq_len), dtype=torch.bool, device=device)
    
    # 使用scatter_快速设置移除位置
    batch_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand(-1, remove_num)
    remove_mask.scatter_(1, remove_indices, True)
    
    # 保留掩码 = 有效掩码 AND 非移除掩码
    keep_mask = mask & (~remove_mask)
    
    # 重新组织张量
    # 计算每个样本保留的元素数量
    keep_counts = keep_mask.sum(dim=1)
    max_keep = keep_counts.max().item()
    
    # 创建结果张量
    if len(tensor.shape) == 2:
        result = torch.ones((batch_size, max_keep), dtype=tensor.dtype, device=device)
    else:
        result = torch.ones((batch_size, max_keep, *tensor.shape[2:]), 
                           dtype=tensor.dtype, device=device)
    
    # 填充结果
    for i in range(batch_size):
        keep_indices = torch.nonzero(keep_mask[i], as_tuple=True)[0]
        result[i, :len(keep_indices)] = tensor[i][keep_indices]
    
    return result
def build_cropped_text(text, mask, crop_rate, device='cuda'):
    if device == 'cuda' and torch.cuda.is_available():
        text = text.cuda()
        mask = mask.cuda()
    return [random_remove_elements_batch_vectorized(text, mask, rate, device=device) for rate in crop_rate]


class twin_model:
    def __init__(self, student, teacher, device):
        self.student = student.to(device)
        self.teacher = teacher.to(device)
        for p in self.teacher.parameters():
            p.requires_grad = False
        self.loss_F=MCRLoss(9)

    def update_teacher(self, momentum):
        """教师模型EMA更新"""
        with torch.no_grad():
            # 获取学生和教师模型的参数
            student_params = dict(self.student.named_parameters())
            teacher_params = dict(self.teacher.named_parameters())
            
            # 应用EMA更新：teacher = momentum * teacher + (1 - momentum) * student
            for name in teacher_params:
                teacher_params[name].data.mul_(momentum).add_(
                    student_params[name].data, alpha=1 - momentum
                )
    def student_forward(self,x):
        return self.student(x)
    def teacher_forward(self,x):
        return self.teacher(x)
    def cal_loss(self,student,teacher):
        return self.loss_F.forward(student,teacher)
def load_data(xml_text):
    """从XML格式文本中加载句子和标签"""
    import re
    
    sentences = []
    labels = []
    
    # 使用正则表达式提取每个review
    pattern = r'<review id="\d+"\s+label="(\d)">(.*?)</review>'
    matches = re.findall(pattern, xml_text, re.DOTALL)
    
    for label, text in matches:
        sentences.append(text.strip())
        labels.append(int(label))
    
    return sentences, labels
def load_data_from_file(file_path):
    """从XML文件加载数据"""
    with open(file_path, 'r', encoding='utf8',errors='ignore') as f:
        xml_text = f.read()
    return load_data(xml_text)


def test(model,sentences_ids,labels,loss_F,device="cuda"):
    labels_tensor=[]
    for i in labels:
        if i==1:
            labels_tensor.append(torch.tensor([[1.,0.]],device=device))
        else:
            labels_tensor.append(torch.tensor([[0.,1.]],device=device))
    total_loss=0
    model.eval()
    model.to(device=device)
    for i in range(len(sentences_ids)):
        with torch.no_grad():

            _,out=model.forward(torch.tensor([sentences_ids[i]]).to(device=device))
            total_loss+=(loss_F(out,labels_tensor[i]).to(device=device)).mean().cpu()/len(sentences_ids)
    return float(total_loss)

def train_dino_simple(
    twin_model,
    train_dataset,
    epochs=100,
    batch_size=64,
    lr=1e-4,
    weight_decay=0.,
    momentum_schedule=None,
    device="cuda"
):
    """
    简化版DINO训练
    """
    device=device
    # 初始化训练器
    
    # 初始化优化器
    optimizer = torch.optim.AdamW(
        twin_model.student.parameters(),
        lr=lr,
    )
    tokenizer = Tokenizer.from_file("./my_tokenizer.json")
    target_loss_F=nn.CrossEntropyLoss()
    # 训练循环
    sentences_cn,labels_cn=load_data_from_file("Sentiment Classification with Deep Learning/test.label.cn.txt")
    sentences_en,labels_en=load_data_from_file("Sentiment Classification with Deep Learning/test.label.en.txt")
    sentences=sentences_cn+sentences_en
    labels=labels_cn+labels_en
    sentences_ids=[]
    for i in sentences:
        sentences_ids.append(tokenizer.encode(i, add_special_tokens=False).ids)
    print("test_loss : "+str(test(twin_model.student,sentences_ids,labels,target_loss_F)))
    for epoch in range(1,epochs+1):
        twin_model.student.train()
        total_loss = 0
        total_comp_loss =0
        total_expa_loss =0
        if momentum_schedule is not None:
            momentum = momentum_schedule[epoch]
        else:
            momentum = 0.996
        T=tqdm(train_loader,total = len(train_loader))
        e_loss=[]
        e_tar_loss=[]
        e_norms=[]
        e_avg_sim=[]
        e_rank=[]
        for batch_idx, batch in enumerate(train_loader):
            batch=list(batch)
            batch[0],target=(zip(*batch[0]))
            batch=torch.tensor(batch)
            T.set_description(f'Epoch [{batch_idx+1}/{len(train_loader)}]')
            # 训练步骤
            global_input=[batch[0]]+build_cropped_text(batch[0],batch[1],[0.05])
            local_input=build_cropped_text(batch[0],batch[1],[0.1,0.15,0.2,0.25,0.3,0.35,0.4])
            
            teacher_input=global_input
            student_input=global_input+local_input
            
            teacher_outputs = []
            
            with torch.no_grad():
                for i in range(len(teacher_input)):
                    teacher_out,_ = twin_model.teacher_forward(teacher_input[i].to(device))
                    teacher_outputs.append(F.normalize(teacher_out, p=2, dim=-1))
                teacher_outputs = torch.cat(teacher_outputs, dim=0)
            # ============ 学生模型前向传播 ============
            # 学生模型处理所有裁剪
            tar_loss=0
            student_outputs = []
            for i in range(len(student_input)):
                student_out,cla_out = twin_model.student_forward(student_input[i].to(device))
                tar_loss+=target_loss_F(cla_out,torch.stack(target).to(device=device,dtype=torch.float))*(1.5 + (0.2 - 1.5) * (i / len(student_input)))/len(student_input)
                student_outputs.append(F.normalize(student_out, p=2, dim=-1))
            student_outputs = torch.cat(student_outputs, dim=0)  # [(2+n_local)*batch_size, out_dim]
            # ============ 计算损失 ============
            # 重复教师目标以匹配学生输出的数量
            
            loss,comp_loss,expa_loss=twin_model.cal_loss(student_outputs,teacher_outputs)
            loss=loss*np.sin((epoch / epochs)*np.pi/2)**2+tar_loss*np.cos((epoch / epochs)*np.pi/2)**2
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            test_tensor=student_outputs.clone().detach()
            norms, avg_sim, rank = monitor_features(test_tensor.cpu())
            
            twin_model.update_teacher(momentum)
            total_loss += loss
            total_comp_loss += comp_loss
            total_expa_loss += expa_loss
            e_tar_loss.append(float(tar_loss.detach()))
            e_loss.append(float(loss.detach()))
            e_norms  .append(norms)
            e_avg_sim.append(avg_sim)
            e_rank   .append(rank)
            
            T.set_postfix(loss=np.mean(e_loss[(batch_idx-50) if batch_idx>50 else 0:batch_idx]),
                          norms=np.mean(e_norms  [(batch_idx-50) if batch_idx>50 else 0:batch_idx]),
                          avg_sim=np.mean(e_avg_sim[(batch_idx-50) if batch_idx>50 else 0:batch_idx]),
                          rank=np.mean(e_rank   [(batch_idx-50) if batch_idx>50 else 0:batch_idx]),
                          tar_loss=np.mean(e_tar_loss   [(batch_idx-50) if batch_idx>50 else 0:batch_idx]),
                         )
            T.update(1)
        T.close()
        # 打印epoch统计
        avg_loss = total_loss / len(train_loader)
        avg_comp_loss = total_comp_loss / len(train_loader)
        avg_expa_loss = total_expa_loss / len(train_loader)
        

        print(f"Epoch {epoch} completed. Avg Loss: {avg_loss:.4f}, Avg comp Loss: {avg_comp_loss:.4f}, Avg expa Loss: {avg_expa_loss:.4f}, Momentum: {momentum:.4f}")
        if (epoch) % 5 == 0:
            print("test_loss : "+str(test(twin_model.student,sentences_ids,labels,target_loss_F)))
        # 保存检查点
        if (epoch) % 10 == 0:
            torch.save({
                'student_state_dict': twin_model.student.state_dict(),
                'teacher_state_dict': twin_model.teacher.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch,
                'loss': avg_loss,
            }, f'checkpoint_epoch_{epoch}.pth')
    
    return twin_model.student, twin_model.teacher



student_model = ImprovedLSTMModel(num_layers=2)
teacher_model = ImprovedLSTMModel(num_layers=2)


#teacher_model.load_state_dict(student_model.state_dict())
twin_model=twin_model(student_model,teacher_model,"cuda")


device="cuda"


In [None]:
trained_student, trained_teacher = train_dino_simple(
    twin_model,
    train_loader,
    epochs=200,
    batch_size=1
)

test_loss : 0.7025463581085205


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Epoch [318/318]: 100%|█| 318/318 [01:00<00:00,  5.27it/s, avg_sim=0.647, loss=0.583, norms=1, rank=150, tar_loss=0.584]


Epoch 1 completed. Avg Loss: 0.6104, Avg comp Loss: 0.3384, Avg expa Loss: 0.6227, Momentum: 0.9960


Epoch [318/318]: 100%|██| 318/318 [01:02<00:00,  5.12it/s, avg_sim=0.363, loss=0.525, norms=1, rank=185, tar_loss=0.53]


Epoch 2 completed. Avg Loss: 0.5502, Avg comp Loss: 0.7191, Avg expa Loss: 0.8909, Momentum: 0.9960


Epoch [318/318]: 100%|█| 318/318 [01:01<00:00,  5.15it/s, avg_sim=0.209, loss=0.485, norms=1, rank=223, tar_loss=0.499]


Epoch 3 completed. Avg Loss: 0.5080, Avg comp Loss: 0.7709, Avg expa Loss: 1.2114, Momentum: 0.9960


Epoch [318/318]: 100%|█| 318/318 [01:01<00:00,  5.19it/s, avg_sim=0.125, loss=0.451, norms=1, rank=237, tar_loss=0.478]


Epoch 4 completed. Avg Loss: 0.4817, Avg comp Loss: 0.7257, Avg expa Loss: 1.4734, Momentum: 0.9960


Epoch [318/318]: 100%|██| 318/318 [01:01<00:00,  5.17it/s, avg_sim=0.069, loss=0.404, norms=1, rank=248, tar_loss=0.45]


Epoch 5 completed. Avg Loss: 0.4343, Avg comp Loss: 0.6975, Avg expa Loss: 1.7593, Momentum: 0.9960
test_loss : 0.5496216416358948


Epoch [318/318]: 100%|█| 318/318 [01:01<00:00,  5.21it/s, avg_sim=0.0559, loss=0.379, norms=1, rank=250, tar_loss=0.446


Epoch 6 completed. Avg Loss: 0.3962, Avg comp Loss: 0.6891, Avg expa Loss: 1.8865, Momentum: 0.9960


Epoch [318/318]: 100%|█| 318/318 [01:01<00:00,  5.15it/s, avg_sim=0.0561, loss=0.338, norms=1, rank=250, tar_loss=0.431


Epoch 7 completed. Avg Loss: 0.3575, Avg comp Loss: 0.6820, Avg expa Loss: 1.9529, Momentum: 0.9960


Epoch [318/318]: 100%|█| 318/318 [01:01<00:00,  5.17it/s, avg_sim=0.042, loss=0.293, norms=1, rank=251, tar_loss=0.414]


Epoch 8 completed. Avg Loss: 0.3225, Avg comp Loss: 0.6751, Avg expa Loss: 2.0029, Momentum: 0.9960


Epoch [318/318]: 100%|█| 318/318 [01:01<00:00,  5.21it/s, avg_sim=0.0324, loss=0.247, norms=1, rank=251, tar_loss=0.402


Epoch 9 completed. Avg Loss: 0.2689, Avg comp Loss: 0.6751, Avg expa Loss: 2.0715, Momentum: 0.9960


Epoch [318/318]: 100%|█| 318/318 [01:01<00:00,  5.16it/s, avg_sim=0.0416, loss=0.207, norms=1, rank=251, tar_loss=0.398


Epoch 10 completed. Avg Loss: 0.2189, Avg comp Loss: 0.6675, Avg expa Loss: 2.1047, Momentum: 0.9960
test_loss : 0.5365180373191833


Epoch [318/318]: 100%|█| 318/318 [01:01<00:00,  5.19it/s, avg_sim=0.0219, loss=0.147, norms=1, rank=251, tar_loss=0.377


Epoch 11 completed. Avg Loss: 0.1707, Avg comp Loss: 0.6610, Avg expa Loss: 2.1278, Momentum: 0.9960


Epoch [318/318]: 100%|█| 318/318 [01:00<00:00,  5.22it/s, avg_sim=0.0298, loss=0.108, norms=1, rank=251, tar_loss=0.381


Epoch 12 completed. Avg Loss: 0.1188, Avg comp Loss: 0.6554, Avg expa Loss: 2.1527, Momentum: 0.9960


Epoch [318/318]: 100%|█| 318/318 [01:00<00:00,  5.24it/s, avg_sim=0.027, loss=0.0662, norms=1, rank=251, tar_loss=0.386


Epoch 13 completed. Avg Loss: 0.0759, Avg comp Loss: 0.6460, Avg expa Loss: 2.1585, Momentum: 0.9960


Epoch [318/318]: 100%|█| 318/318 [01:01<00:00,  5.18it/s, avg_sim=0.0212, loss=0.00114, norms=1, rank=251, tar_loss=0.3


Epoch 14 completed. Avg Loss: 0.0156, Avg comp Loss: 0.6462, Avg expa Loss: 2.1807, Momentum: 0.9960


Epoch [318/318]: 100%|█| 318/318 [01:00<00:00,  5.22it/s, avg_sim=0.0216, loss=-0.053, norms=1, rank=251, tar_loss=0.36


Epoch 15 completed. Avg Loss: -0.0397, Avg comp Loss: 0.6388, Avg expa Loss: 2.1867, Momentum: 0.9960
test_loss : 0.5598496794700623


Epoch [318/318]: 100%|█| 318/318 [01:00<00:00,  5.23it/s, avg_sim=0.0273, loss=-0.0972, norms=1, rank=251, tar_loss=0.3


Epoch 16 completed. Avg Loss: -0.0915, Avg comp Loss: 0.6355, Avg expa Loss: 2.1963, Momentum: 0.9960


Epoch [254/318]:  80%|▊| 253/318 [00:51<00:14,  4.35it/s, avg_sim=0.0194, loss=-0.169, norms=1, rank=254, tar_loss=0.35

In [13]:
net=twin_model.student
net.eval()

ImprovedLSTMModel(
  (embedding): Embedding(30000, 128, padding_idx=1)
  (embedding_dropout): Dropout(p=0.0, inplace=False)
  (lstm): LSTM(128, 96, num_layers=2, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=384, out_features=512, bias=True)
  (silu): SiLU()
  (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=512, bias=True)
  (fc4): Linear(in_features=512, out_features=2, bias=True)
  (softmax): Softmax(dim=-1)
)

测试代码，把文件都丢到目录下就可以运行了

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
from tokenizers import Tokenizer
import re
import pickle
import torch.utils.checkpoint as checkpoint
class ImprovedLSTMModel(nn.Module):
    def __init__(self, input_size=128, hidden_size=96, num_layers=2, 
                 output_size=256, dropout=0., vocab_size=30000, embedding_dim=128):
        super(ImprovedLSTMModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim,padding_idx=1)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # 添加嵌入层的dropout
        self.embedding_dropout = nn.Dropout(dropout)
        
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=True
        )
        
        # 添加更多全连接层
        self.fc1 = nn.Linear(hidden_size*4, 512)
        self.silu = nn.SiLU()
        self.layer_norm = nn.LayerNorm(512)
        self.fc2 = nn.Linear(512, output_size)
        
        self.fc3 = nn.Linear(output_size, 512)
        self.layer_norm = nn.LayerNorm(512)
        self.fc4 = nn.Linear(512, 2)
        self.softmax = nn.Softmax(dim = -1)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.embedding_dropout(x)
        
        batch_size = x.size(0)

        
        lstm_out, (hidden, cell) = checkpoint.checkpoint(self.lstm,x,use_reentrant=False)
        hidden_final = torch.cat((hidden[-2, :, :], hidden[-1, :, :],F.adaptive_max_pool1d(lstm_out.permute(0, 2, 1), output_size=1).squeeze(2)), dim=1)
        # 更多非线性变换
        out = self.fc1(hidden_final)
        out = self.silu(out)
        out = self.layer_norm(out)
        out = self.fc2(out)

        out0 = self.fc3(out)
        out0 = self.silu(out0)
        out0 = self.layer_norm(out0)
        out0 = self.fc4(out0)
        out0 = self.softmax(out0)
        return F.normalize(out),out0

In [4]:
from tqdm import tqdm
def load_checkpoint(model,path):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['student_state_dict'])
    return model
def load_reviews(file_path):
    reviews = []
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    pattern = r'<review[^>]*>(.*?)</review>'
    matches = re.findall(pattern, content, re.DOTALL)
    for match in matches:
        cleaned_text = match.strip()
        reviews.append(cleaned_text)
    return reviews
net = ImprovedLSTMModel(num_layers=2)
net=load_checkpoint(net,"checkpoint_epoch_200 - 副本.pth").to("cuda")
net.eval()
data_p=load_reviews("./evaltask2_sample_data/cn_sample_data/sample.positive.txt")
data_n=load_reviews("./evaltask2_sample_data/cn_sample_data/sample.negative.txt")
data_p=data_p+load_reviews("./evaltask2_sample_data/en_sample_data/sample.positive.txt")
data_n=data_n+load_reviews("./evaltask2_sample_data/en_sample_data/sample.negative.txt")
tokenizer = Tokenizer.from_file("./my_tokenizer.json")
feature_p=None
for i in tqdm(data_p):
    if feature_p==None:
        with torch.no_grad():
            feature_p,_=net(torch.tensor([tokenizer.encode(i, add_special_tokens=False).ids]).to("cuda"))
    else:
        with torch.no_grad():
            feature_p=torch.cat([feature_p,net(torch.tensor([tokenizer.encode(i, add_special_tokens=False).ids]).to("cuda"))[0]])
feature_n=None
for i in tqdm(data_n):
    if feature_n==None:
        with torch.no_grad():
            feature_n,_=net(torch.tensor([tokenizer.encode(i, add_special_tokens=False).ids]).to("cuda"))
    else:
        with torch.no_grad():
            feature_n=torch.cat([feature_n,net(torch.tensor([tokenizer.encode(i, add_special_tokens=False).ids]).to("cuda"))[0]])
feature_p=feature_p.cpu().numpy()
feature_n=feature_n.cpu().numpy()
with open('feature_p.pkl', 'wb') as f:
    pickle.dump(feature_p, f)  # 序列化到文件
with open('feature_n.pkl', 'wb') as f:
    pickle.dump(feature_n, f)  # 序列化到文件

100%|█████████████████████████████████████████████████████████████████████████████| 9987/9987 [00:10<00:00, 929.26it/s]
100%|████████████████████████████████████████████████████████████████████████████| 9998/9998 [00:09<00:00, 1024.42it/s]


In [6]:
def load_checkpoint(model,path):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['student_state_dict'])
    return model
def load_data(xml_text):
    """从XML格式文本中加载句子和标签"""
    import re
    sentences = []
    labels = []
    pattern = r'<review id="\d+"\s+label="(\d)">(.*?)</review>'
    matches = re.findall(pattern, xml_text, re.DOTALL)
    for label, text in matches:
        sentences.append(text.strip())
        labels.append(int(label))
    return sentences, labels
def load_data_from_file(file_path):
    """从XML文件加载数据"""
    with open(file_path, 'r', encoding='utf8',errors='ignore') as f:
        xml_text = f.read()
    return load_data(xml_text)
net = ImprovedLSTMModel(num_layers=2)
net=load_checkpoint(net,"checkpoint_epoch_200 - 副本.pth").to("cuda")
net.eval()
sentences_cn,labels_cn=load_data_from_file("Sentiment Classification with Deep Learning/test.label.cn.txt")
sentences_en,labels_en=load_data_from_file("Sentiment Classification with Deep Learning/test.label.en.txt")
sentences=sentences_cn+sentences_en
labels=labels_cn+labels_en
sentences_ids=[]
for i in sentences:
    sentences_ids.append(tokenizer.encode(i, add_special_tokens=False).ids)
sentences_feature=[]
with torch.no_grad():
    for i in tqdm(sentences_ids):
        feature,_=net(torch.tensor([i]).to("cuda"))
        sentences_feature.append(feature[0].cpu().numpy())
    with open('sentences_feature.pkl', 'wb') as f:
        pickle.dump(sentences_feature, f)  # 序列化到文件
    with open('labels.pkl', 'wb') as f:
        pickle.dump(labels, f)  # 序列化到文件

100%|████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:04<00:00, 1136.71it/s]


In [27]:
from sklearn.metrics import classification_report, accuracy_score
def load_data(xml_text):
    import re
    sentences = []
    labels = []
    pattern = r'<review id="\d+"\s+label="(\d)">(.*?)</review>'
    matches = re.findall(pattern, xml_text, re.DOTALL)
    for label, text in matches:
        sentences.append(text.strip())
        labels.append(int(label))
    return sentences, labels
def load_data_from_file(file_path):
    with open(file_path, 'r', encoding='utf8',errors='ignore') as f:
        xml_text = f.read()
    return load_data(xml_text)
def test(model,sentences_ids,labels,device="cuda"):
    labels_tensor=[]
    pred=[]
    for i in labels:
        if i==1:
            labels_tensor.append(0)
        else:
            labels_tensor.append(1)
    total_loss=0
    model.eval()
    model.to(device=device)
    for i in range(len(sentences_ids)):
        with torch.no_grad():
            _,out=model.forward(torch.tensor([sentences_ids[i]]).to(device=device))
            pred.append(torch.argmax(out.cpu()))
    print(f"准确率 (Accuracy): {accuracy_score(labels_tensor, pred):.4f}")
    print("\n分类详情:")
    print(classification_report(labels_tensor, pred))

In [28]:
sentences_cn,labels_cn=load_data_from_file("Sentiment Classification with Deep Learning/test.label.cn.txt")
sentences_en,labels_en=load_data_from_file("Sentiment Classification with Deep Learning/test.label.en.txt")
sentences=sentences_cn+sentences_en
labels=labels_cn+labels_en
sentences_ids=[]
for i in sentences:
    sentences_ids.append(tokenizer.encode(i, add_special_tokens=False).ids)
test(net,sentences_ids,labels)

准确率 (Accuracy): 0.7688

分类详情:
              precision    recall  f1-score   support

           0       0.78      0.75      0.77      2500
           1       0.76      0.78      0.77      2500

    accuracy                           0.77      5000
   macro avg       0.77      0.77      0.77      5000
weighted avg       0.77      0.77      0.77      5000

