In [1]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import confusion_matrix
import torch.nn.functional as F
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, roc_curve
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau




# 氨基酸到索引的映射（1-20）
amino_to_index = {amino: i+1 for i, amino in enumerate('ARNDCQEGHILKMFPSTWYV')}

# 创建自定义数据集类
class ProteinDataset(Dataset):
    def __init__(self, data, amino_to_index, max_length=100):
        self.data = data
        self.amino_to_index = amino_to_index
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sequence = self.data.iloc[idx, 0]
        label = self.data.iloc[idx, 1]

        # 将氨基酸序列转换为索引序列，并填充到指定的最大长度
        sequence_indices = [self.amino_to_index.get(amino, 0) for amino in sequence]
        sequence_indices += [0] * (self.max_length - len(sequence_indices))
        sequence_indices = sequence_indices[:self.max_length]

        return {
            'sequence_indices': torch.tensor(sequence_indices),
            'label': torch.tensor(label)
        }

# 从本地CSV文件中读取数据
data = pd.read_csv('At_Rice_1vs1.csv')

device = torch.device('cuda')
# Separating features (sequences) and labels
X = data.iloc[:, 0]  # The first column is the sequence
y = data.iloc[:, 1]  # The second column is the label

# Split the data ensuring consistent class ratios in train and test sets
train_X_1, test_X_1, train_y_1, test_y_1 = train_test_split(X[y == 1], y[y == 1], test_size=0.3, random_state=100)
train_X_0, test_X_0, train_y_0, test_y_0 = train_test_split(X[y == 0], y[y == 0], test_size=0.3, random_state=100)

train_X = pd.concat([train_X_1, train_X_0])
test_X = pd.concat([test_X_1, test_X_0])
train_y = pd.concat([train_y_1, train_y_0])
test_y = pd.concat([test_y_1, test_y_0])

# Combine features and labels for train and test sets and shuffle the data within each class to add randomness
train_data = shuffle(pd.DataFrame({'sequence': train_X, 'label': train_y}), random_state=100)
test_data = shuffle(pd.DataFrame({'sequence': test_X, 'label': test_y}), random_state=100)

# Create dataset objects for train and test sets
train_dataset = ProteinDataset(train_data, amino_to_index=amino_to_index, max_length=100)
test_dataset = ProteinDataset(test_data, amino_to_index=amino_to_index, max_length=100)

batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 定义学习率调度器参数
initial_lr = 0.00001  # 初始学习率，您可以根据需要调整
lr_decay_factor = 0.5
lr_patience = 2
current_lr = initial_lr
no_improvement_count = 0
# 初始化RoBERTa模型和tokenizer
# 指定本地模型路径
local_model_path = "E:/JAQ/01.Sp/02_DL/Try11_1/roberta"
# 加载tokenizer
tokenizer = RobertaTokenizer.from_pretrained(local_model_path)
# 加载模型
model = RobertaForSequenceClassification.from_pretrained(local_model_path, num_labels=2)

# 定义优化器和损失函数
optimizer = torch.optim.AdamW(model.parameters(), lr=initial_lr, weight_decay=1e-5)

class_weight = torch.tensor([0.008, 1.0])  # 根据类别不平衡情况调整权重
criterion = torch.nn.CrossEntropyLoss(weight=class_weight.to(device))

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Define learning rate scheduler
lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=lr_decay_factor, patience=lr_patience, verbose=True)

# 训练和测试循环
num_epochs = 30
best_auc = 0.0

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_dataloader:
        sequence_indices = batch['sequence_indices'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(sequence_indices)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)

    # 验证循环
    model.eval()
    true_labels = []
    predicted_labels = []
    correct = 0
    total = 0
    all_labels = []
    all_scores = []

    with torch.no_grad():
        for batch in test_dataloader:
            sequence_indices = batch['sequence_indices'].to(device)
            labels = batch['label'].to(device)

            outputs = model(sequence_indices)
            _, predicted = torch.max(outputs.logits, 1)

            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)[:, 1]
            all_labels.extend(labels.cpu().numpy())
            all_scores.extend(probabilities.cpu().numpy())

        auc = roc_auc_score(all_labels, all_scores)
        mcc = matthews_corrcoef(true_labels, predicted_labels)
        conf_matrix = confusion_matrix(true_labels, predicted_labels)

        tp = conf_matrix[1, 1]
        tn = conf_matrix[0, 0]
        fp = conf_matrix[0, 1]
        fn = conf_matrix[1, 0]

        sn = tp / (tp + fn) if (tp + fn) != 0 else 0.0
        sp = tn / (tn + fp) if (tn + fp) != 0 else 0.0
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0.0
        f1_score = 2 * (precision * sn) / (precision + sn) if (precision + sn) != 0 else 0.0

        print('Epoch [{}/{}]\tAvg Loss: {:.4f}, True Positives: {:.4f}, True Negatives: {:.4f}, False Positives: {:.4f}, False Negatives: {:.4f},'
            .format(epoch+1, num_epochs, avg_loss, tp, tn, fp, fn))

        print('Epoch [{}/{}]\tTrain Loss: {:.4f}\tTest Accuracy: {:.2f}%, AUC: {:.4f}, Sn: {:.4f}, Sp: {:.4f}, Mcc: {:.4f}, Precision: {:.4f}, f1_score: {:.4f}'
            .format(epoch+1, num_epochs, avg_loss, accuracy * 100, auc, sn, sp, mcc, precision, f1_score))

        # Update learning rate
        lr_scheduler.step(auc)

        if auc > best_auc:
            best_auc = auc
            # 保存模型检查点
            torch.save(model.state_dict(), 'best_Roberta_gpu.pth')
        else:
            no_improvement_count += 1
            if no_improvement_count >= lr_patience:
                current_lr *= lr_decay_factor
                optimizer = torch.optim.AdamW(model.parameters(), lr=current_lr)
                print(f'Learning rate reduced to {current_lr}')


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at E:/JAQ/01.Sp/02_DL/Try11_1/roberta and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch [1/30]	Avg Loss: 0.0586, True Positives: 4036.0000, True Negatives: 1030.0000, False Positives: 4652.0000, False Negatives: 0.0000,
Epoch [1/30]	Train Loss: 0.0586	Test Accuracy: 52.13%, AUC: 0.8965, Sn: 1.0000, Sp: 0.1813, Mcc: 0.2902, Precision: 0.4645, f1_score: 0.6344


KeyboardInterrupt: 