In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel,AutoTokenizer,pipeline
# Load the tokenizer and the encoder weights from the same pretrained
# checkpoint so that token ids and embeddings stay consistent.
tokenizer = AutoTokenizer.from_pretrained(
    'hfl/chinese-roberta-wwm-ext-large'
)
model = AutoModel.from_pretrained(
    'hfl/chinese-roberta-wwm-ext-large'
)

In [None]:
class sentiment_classify(nn.Module):
    """Sequence classifier: a wrapped encoder followed by one linear layer.

    Args:
        model: Encoder module. It must expose ``config.hidden_size`` and, when
            called as ``model(input_ids, attention_mask=...)``, return an object
            whose element ``[0]`` can be indexed as ``[:, 0, :]``.
        num_labels: Number of output classes (width of the returned logits).
    """

    def __init__(self, model, num_labels):
        # Zero-argument super() works whatever this class is named; the earlier
        # call referenced a class name that is not defined in this notebook.
        super().__init__()
        self.roberta = model
        # Maps one hidden vector of size config.hidden_size to num_labels logits.
        self.classifier = nn.Linear(model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return the classification logits for a batch of token ids.

        Args:
            input_ids: Token-id tensor forwarded unchanged to the encoder.
            attention_mask: Optional mask forwarded unchanged to the encoder.

        Returns:
            The output of the linear classifier (last dimension ``num_labels``).
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Keep only index 0 along the second axis of the first encoder output
        # (presumably the [CLS] position of the last hidden state -- TODO
        # confirm against the encoder's output layout).
        sequence_output = outputs[0][:, 0, :]
        logits = self.classifier(sequence_output)
        return logits

# Number of sentiment classes produced by the classification head.
num_labels = 3
# NOTE: `model` is rebound here -- from this point on it refers to the wrapped
# classifier, not to the bare encoder loaded earlier.
model = sentiment_classify(model=model, num_labels=num_labels)


In [None]:
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
class SentimentDataset(Dataset):
    """Dataset that tokenizes all texts once up front and serves one example per index.

    Args:
        texts: Iterable of texts (list, tuple, pandas Series, ...). Each element
            is converted with ``str()`` before tokenization.
        labels: Iterable of labels; each one must be convertible with ``int()``.
        tokenizer: Callable invoked as ``tokenizer(list_of_str, truncation=True,
            padding=True, max_length=..., return_tensors='pt')`` that returns a
            mapping of tensors indexable along their first axis.
        max_length: Truncation length forwarded to the tokenizer.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        # Hugging Face tokenizers only accept ``str`` or list/tuple input, so a
        # pandas Series (which is what this notebook passes) must be
        # materialized as a plain list of strings first.
        self.texts = [str(text) for text in texts]
        self.max_length = max_length

        # Encode labels as integer class ids; long dtype is what
        # CrossEntropyLoss expects for class-index targets.
        self.labels = torch.tensor([int(label) for label in labels], dtype=torch.long)

        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )

        # Encode the whole corpus in one call; padding=True pads every example
        # to the longest sequence in the corpus.
        self.encodings = self.tokenizer(self.texts, truncation=True, padding=True, max_length=self.max_length, return_tensors='pt')

    def __getitem__(self, idx):
        """Return a dict with every tokenizer output at ``idx`` plus ``'labels'``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

def create_data_loaders(texts, labels, tokenizer, batch_size=32, train_size=0.8):
    """Build a SentimentDataset and split it randomly into train/validation loaders.

    Args:
        texts: Texts handed to ``SentimentDataset``.
        labels: Labels handed to ``SentimentDataset``.
        tokenizer: Tokenizer handed to ``SentimentDataset``.
        batch_size: Batch size used by both loaders.
        train_size: Fraction of the examples assigned to the training split;
            the example count is truncated with ``int()`` and the remainder
            goes to validation.

    Returns:
        ``(train_loader, val_loader)``. Only the training loader shuffles.
    """
    full_dataset = SentimentDataset(texts, labels, tokenizer)

    n_total = len(full_dataset)
    n_train = int(n_total * train_size)
    n_val = n_total - n_train

    # The split is random and unseeded here, so it differs between runs unless
    # the global torch RNG has been seeded beforehand.
    train_subset, val_subset = random_split(full_dataset, [n_train, n_val])

    return (
        DataLoader(train_subset, batch_size=batch_size, shuffle=True),
        DataLoader(val_subset, batch_size=batch_size, shuffle=False),
    )

# Read the labelled corpus; the 'text' and 'label' columns are the ones used below.
data = pd.read_csv(r'E:\Bayesian_INT\sentiment_analysis.csv')

# Tokenize everything and split it into training / validation loaders.
train_loader, val_loader = create_data_loaders(
    texts=data['text'],
    labels=data['label'],
    tokenizer=tokenizer,
    batch_size=32,
)

In [None]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
# AdamW over every parameter of the wrapped model, with a learning rate of 2e-5.
optimizer = AdamW(params=model.parameters(), lr=2e-5)

# Cross-entropy between the classifier logits and the integer class labels.
criterion = CrossEntropyLoss()

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def train_and_evaluate(model, train_loader, val_loader, optimizer, criterion, device, validate_every=100,best_model_path=r"E:\Bayesian_INT\my_fine_tuned_model", num_epochs=3):
    """Train ``model`` and checkpoint it whenever validation accuracy improves.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits accepted by ``criterion``.
        train_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        val_loader: Validation batches, forwarded to ``evaluate``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the model and every batch are moved to.
        validate_every: Run validation after every this-many training steps
            within an epoch.
        best_model_path: File the best ``state_dict`` is written to.
        num_epochs: Number of passes over ``train_loader``. This used to be
            read from an undefined name, which raised ``NameError``.

    Returns:
        The best validation accuracy observed (``0`` if validation never ran).
    """
    model.to(device)
    model.train()
    best_accuracy = 0
    total_steps = len(train_loader)

    for epoch in range(num_epochs):
        for i, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Clear gradients before the forward/backward pass so that nothing
            # left over from earlier use of the optimizer leaks into this step.
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            if (i + 1) % validate_every == 0:
                acc = evaluate(model, val_loader, device)
                # The reported loss is that of the current batch only.
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_steps}], Loss: {loss.item()}, Validation Acc: {acc}')
                if acc > best_accuracy:
                    best_accuracy = acc
                    torch.save(model.state_dict(), best_model_path)
                    print(f'Best model saved with accuracy: {best_accuracy}')

    return best_accuracy
                
def evaluate(model, val_loader, device):
    """Compute classification accuracy (in percent) of ``model`` on ``val_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns per-class scores with classes along dimension 1.
        val_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device every batch is moved to before the forward pass.

    Returns:
        ``100 * correct / total`` as a float, or ``0.0`` when ``val_loader``
        yields no samples (previously this raised ``ZeroDivisionError``).
    """
    # Remember the caller's mode so evaluation does not silently switch a model
    # that was in eval mode back to training mode.
    was_training = model.training
    model.eval()
    total_correct = 0
    total_samples = 0

    try:
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask)
                predicted = torch.argmax(outputs, dim=1)

                total_samples += labels.size(0)
                total_correct += (predicted == labels).sum().item()
    finally:
        # Restore the previous mode even if a batch raised.
        model.train(was_training)

    if total_samples == 0:
        return 0.0
    return 100.0 * total_correct / total_samples

def predict(model, tokenizer, text, model_path, device):
    """Load saved weights into ``model`` and return the predicted class index per text.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            parameters are overwritten by the checkpoint at ``model_path``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            padding=True, truncation=True, max_length=512)`` that returns a
            mapping with ``'input_ids'`` and ``'attention_mask'``.
        text: Input passed unchanged to ``tokenizer``.
        model_path: Path of a ``state_dict`` file written with ``torch.save``.
        device: Device used for the weights, the inputs and the forward pass.

    Returns:
        Tensor holding the argmax over dimension 1 of the model output.
    """
    # NOTE(review): torch.load unpickles the file -- only load checkpoints from
    # a trusted source.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    # Use the `tokenizer` argument; the previous code referenced a misspelled,
    # undefined name and raised NameError.
    pre_data = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        outputs = model(pre_data['input_ids'].to(device), pre_data['attention_mask'].to(device))
        predictions = torch.argmax(outputs, dim=1)

    return predictions
