In [None]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import confusion_matrix
import torch.nn.functional as F
from sklearn.metrics import matthews_corrcoef
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, roc_curve
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
# Lookup table from each of the 20 amino-acid letters to an integer ID in 1..20.
# ID 0 is left unassigned so it can serve as the padding / unknown-residue value.
amino_to_index = {
    residue: code
    for code, residue in enumerate('ARNDCQEGHILKMFPSTWYV', start=1)
}

# Custom dataset class
class ProteinDataset(Dataset):
    """Dataset of (sequence, label) rows encoded as fixed-length index tensors.

    Args:
        data: Table accessed positionally through ``.iloc``; column 0 holds
            the amino-acid sequence and column 1 holds the label.
        amino_to_index: Mapping from a residue character to its integer ID.
            Characters missing from the mapping are encoded as 0.
        max_length: Length every encoded sequence is padded (with 0) or
            truncated to.
    """

    def __init__(self, data, amino_to_index, max_length=100):
        self.data = data
        self.amino_to_index = amino_to_index
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded sample at positional index ``idx``.

        Returns:
            dict with ``'sequence_indices'`` (int64 tensor of length
            ``max_length``) and ``'label'`` (int64 scalar tensor).
        """
        sequence = self.data.iloc[idx, 0]
        label = self.data.iloc[idx, 1]

        # Truncate first so over-long sequences are not encoded needlessly,
        # then map residues to IDs (unknown residues become 0).
        encoded = [
            self.amino_to_index.get(amino, 0)
            for amino in sequence[:self.max_length]
        ]

        # Right-pad with 0 up to max_length; a non-positive count adds nothing.
        encoded += [0] * (self.max_length - len(encoded))

        # Explicit int64 dtypes: an empty list would otherwise become float32,
        # and labels parsed as floats would not be valid class indices.
        return {
            'sequence_indices': torch.tensor(encoded, dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long),
        }


# Read the data from a local CSV file
data = pd.read_csv('At_Rice_1vs1.csv')

# Prefer the GPU, but fall back to the CPU so the script still runs on
# machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Separating features (sequences) and labels
X = data.iloc[:, 0]  # The first column is the sequence
y = data.iloc[:, 1]  # The second column is the label

# Split each class separately (70% train / 30% test, fixed seed) so that the
# train and test sets keep the same class ratio.
train_X_1, test_X_1, train_y_1, test_y_1 = train_test_split(X[y == 1], y[y == 1], test_size=0.3, random_state=100)
train_X_0, test_X_0, train_y_0, test_y_0 = train_test_split(X[y == 0], y[y == 0], test_size=0.3, random_state=100)

# Recombine the per-class splits: positives first, then negatives
train_X = pd.concat([train_X_1, train_X_0])
test_X = pd.concat([test_X_1, test_X_0])
train_y = pd.concat([train_y_1, train_y_0])
test_y = pd.concat([test_y_1, test_y_0])

# Combine features and labels for the train and test sets, then shuffle each
# combined set (fixed seed) so the two classes are interleaved.
train_data = shuffle(pd.DataFrame({'sequence': train_X, 'label': train_y}), random_state=100)
test_data = shuffle(pd.DataFrame({'sequence': test_X, 'label': test_y}), random_state=100)

# Create dataset objects for train and test sets
train_dataset = ProteinDataset(train_data, amino_to_index=amino_to_index, max_length=100)
test_dataset = ProteinDataset(test_data, amino_to_index=amino_to_index, max_length=100)


# Only the training loader reshuffles every epoch; the test order stays fixed.
batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize BERT model and tokenizer
model_name = 'bert-base-uncased'
# NOTE(review): the tokenizer is not used by the code visible in this cell
# (the inputs are amino-acid indices, not tokenized text); it is kept in case
# other cells rely on it.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Fine-tuning BERT on the protein sequence data.
# Fall back to the CPU when CUDA is unavailable instead of failing in model.to().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Define the loss function
criterion = nn.CrossEntropyLoss()

# Adam optimizer with a small learning rate and light weight decay
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, weight_decay=1e-5)


# Learning-rate scheduler: ReduceLROnPlateau multiplies the learning rate by
# `factor` once the monitored metric has stopped improving for `patience` epochs.
# The training loop steps it with test accuracy, a higher-is-better metric, so
# the mode must be 'max'; with 'min' the rate would be cut whenever accuracy
# improved.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=2, factor=0.5, verbose=True)

# Training and testing loops: one pass over the training set followed by a
# full evaluation on the test set, repeated for every epoch.
num_epochs = 30
best_accuracy = 0.0
all_labels = []
all_scores = []

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        sequence_indices = batch['sequence_indices'].to(device)
        labels = batch['label'].to(device)
        # Index 0 is the padding value written by the dataset (unknown residues
        # share it), so exclude those positions from self-attention.
        attention_mask = (sequence_indices != 0).long()

        optimizer.zero_grad()
        outputs = model(sequence_indices, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)  # Use logits for BERT models
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean training loss per batch for this epoch
    avg_loss = total_loss / len(train_dataloader)

    # Validation Loop
    model.eval()
    true_labels = []
    predicted_labels = []
    # Reset the ROC inputs every epoch; accumulating them across epochs would
    # blend predictions from earlier, less-trained models into this epoch's AUC.
    all_labels = []
    all_scores = []
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_dataloader:
            sequence_indices = batch['sequence_indices'].to(device)
            labels = batch['label'].to(device)
            attention_mask = (sequence_indices != 0).long()

            outputs = model(sequence_indices, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)  # Use logits for BERT models

            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            # Probability of the positive class (label 1) is the ROC score
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)[:, 1]
            all_labels.extend(labels.cpu().numpy())
            all_scores.extend(probabilities.cpu().numpy())

    auc = roc_auc_score(all_labels, all_scores)
    mcc = matthews_corrcoef(true_labels, predicted_labels)
    # Pin the label order so the matrix is always 2x2, even when only one
    # class appears among the true and predicted labels.
    conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=[0, 1])

    # Rows are true classes, columns are predicted classes
    tp = conf_matrix[1, 1]
    tn = conf_matrix[0, 0]
    fp = conf_matrix[0, 1]
    fn = conf_matrix[1, 0]

    # Sensitivity, specificity, accuracy, precision and F1; each ratio falls
    # back to 0.0 when its denominator is zero.
    sn = tp / (tp + fn) if (tp + fn) != 0 else 0.0
    sp = tn / (tn + fp) if (tn + fp) != 0 else 0.0
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0.0
    f1_score = 2 * (precision * sn) / (precision + sn) if (precision + sn) != 0 else 0.0

    print('Epoch [{}/{}]\tAvg Loss: {:.4f}, True Positives: {:.4f}, True Negatives: {:.4f}, False Positives: {:.4f}, False Negatives: {:.4f},'
        .format(epoch+1, num_epochs, avg_loss, tp, tn, fp, fn))

    print('Epoch [{}/{}]\tTrain Loss: {:.4f}\tTest Accuracy: {:.2f}%, AUC: {:.4f}, Sn: {:.4f}, Sp: {:.4f}, Mcc: {:.4f}, Precision: {:.4f}, f1_score: {:.4f}'
        .format(epoch+1, num_epochs, avg_loss, accuracy * 100, auc, sn, sp, mcc, precision, f1_score))

    # if accuracy > best_accuracy:
    #     best_accuracy = accuracy
    #     # Save the model checkpoint
    #     torch.save(model.state_dict(), 'best_bert_gpu.pth')
    # Update the learning rate scheduler.
    # NOTE(review): accuracy is a higher-is-better metric, so the scheduler
    # should be constructed with mode='max' for this call to behave as intended.
    scheduler.step(accuracy)



  from .autonotebook import tqdm as notebook_tqdm
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000021CDB06F340>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: bd150f8e-4509-4dac-a715-dbd760ff4f9b)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json
'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000021CDB06FB80>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 02260062-1fa9-4c8c-ac8a-69d452ef68d1)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/config.json
'(MaxR