In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib qt

# Select the compute device: CUDA when torch reports one, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type != 'cuda':
    print("No GPU available, using CPU.")
else:
    print(f"GPU available: {torch.cuda.get_device_name(0)}")

# Read the dataset from the Excel workbook
df = pd.read_excel('Aggregated.xlsx')

# Sequences as a Python list, labels as an array
X = df['Sequence'].tolist()
y = df['Label'].values

# Hold out 20% of the rows as a test set (fixed seed for a repeatable split)
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
)

# Insert a space between the characters of every sequence
# (assumes each entry of the 'Sequence' column is a string - TODO confirm).
# NOTE(review): the ProtBert model card also maps rare residues U/Z/O/B to X;
# confirm whether that step is wanted here.
X_train_preprocessed = list(map(' '.join, X_train))
X_test_preprocessed = list(map(' '.join, X_test))

# Tokenizer
# NOTE(review): tokenizer and model come from different checkpoints;
# presumably they share a vocabulary - confirm.
tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert")

# Encode both splits as PyTorch tensors, truncating at max_length tokens
max_length = 60  # based on distribution of seq lengths
train_encodings = tokenizer(
    X_train_preprocessed,
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors="pt",
)
valid_encodings = tokenizer(
    X_test_preprocessed,
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors="pt",
)

# Re-pack each encoding as a plain dict whose tensors live on the chosen device
train_encodings = {name: tensor.to(device) for name, tensor in train_encodings.items()}
valid_encodings = {name: tensor.to(device) for name, tensor in valid_encodings.items()}

# Pre-trained encoder, placed on the same device as the inputs
model = BertModel.from_pretrained('Rostlab/prot_bert_bfd_localization')
model = model.to(device)

def extract_embeddings_in_batches(encodings, model, batch_size=16):
    """Compute one mean-pooled embedding per sequence, running the model in batches.

    The pooling is an attention-mask-weighted mean over the token axis, so
    padding positions do not contribute to the embedding. (A plain
    ``mean(dim=1)`` would average padding vectors in and make the result depend
    on how much padding a sequence received.) Positions whose mask is 1 are
    all included.

    Args:
        encodings: Mapping with ``'input_ids'`` and ``'attention_mask'``
            entries that can be sliced along their first axis (assumed to be
            ``(n_sequences, seq_len)`` tensors already on the model's device).
        model: Callable taking ``input_ids=`` and ``attention_mask=`` keyword
            arguments. It may return either an object exposing
            ``last_hidden_state`` or the hidden-state tensor itself (assumed
            ``(batch, seq_len, hidden)``). The model's train/eval mode is not
            modified here.
        batch_size: Number of sequences per forward pass.

    Returns:
        numpy.ndarray with one row per input sequence, in input order.
    """
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']

    all_embeddings = []

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for start in range(0, len(input_ids), batch_size):
            batch_input_ids = input_ids[start:start + batch_size]
            batch_attention_mask = attention_mask[start:start + batch_size]

            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            # Accept both a model-output object and a raw hidden-state tensor.
            hidden = getattr(outputs, 'last_hidden_state', outputs)

            # Zero out padding positions, then divide by the real token count.
            mask = batch_attention_mask.unsqueeze(-1).to(hidden.dtype)
            # clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1)
            embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()
            all_embeddings.append(embeddings)

    return np.vstack(all_embeddings)

# Extract embeddings for training and validation sets
train_embeddings = extract_embeddings_in_batches(train_encodings, model)
valid_embeddings = extract_embeddings_in_batches(valid_encodings, model)

# Prepare the data for t-SNE.
# Every t-SNE fit produces its own arbitrary coordinate system, so fitting the
# train and test sets separately and drawing them on the same axes makes their
# relative positions meaningless. Fit ONCE on the stacked embeddings and split
# the result back into the two sets.
n_train = len(train_embeddings)
tsne = TSNE(n_components=2, random_state=42)
joint_tsne_embeddings = tsne.fit_transform(np.vstack([train_embeddings, valid_embeddings]))
train_tsne_embeddings = joint_tsne_embeddings[:n_train]
valid_tsne_embeddings = joint_tsne_embeddings[n_train:]

# Create DataFrame for visualization
train_df = pd.DataFrame(train_tsne_embeddings, columns=['x', 'y'])
train_df['label'] = y_train
train_df['set'] = 'train'

valid_df = pd.DataFrame(valid_tsne_embeddings, columns=['x', 'y'])
valid_df['label'] = y_test
valid_df['set'] = 'test'

# ignore_index gives every row a unique label; without it both frames keep
# their own 0..n-1 index and the combined frame has duplicated row labels.
df_tsne = pd.concat([train_df, valid_df], ignore_index=True)

# Map (set, label) to the palette key, e.g. 'train_AMPs' / 'test_non-AMPs'.
# A label equal to 1 is an AMP; anything else is treated as non-AMP.
is_amp = df_tsne['label'].eq(1)
df_tsne['color'] = df_tsne['set'] + '_' + is_amp.map({True: 'AMPs', False: 'non-AMPs'})

# Plotting the t-SNE results
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x='x', y='y',
    hue='color',
    palette={
        'train_AMPs': 'blue',
        'train_non-AMPs': 'red',
        'test_AMPs': 'green',
        'test_non-AMPs': 'orange'
    },
    data=df_tsne,
    legend='full',
    alpha=0.6,
    ax=ax
)
ax.set_title('t-SNE of BERT Sequence Embeddings')
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')
plt.show()

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import torch.nn as nn
%matplotlib qt

# Select the compute device: CUDA when torch reports one, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type != 'cuda':
    print("No GPU available, using CPU.")
else:
    print(f"GPU available: {torch.cuda.get_device_name(0)}")

# Read the dataset from the Excel workbook
df = pd.read_excel('Aggregated.xlsx')

# Sequences as a Python list, labels as an array
X = df['Sequence'].tolist()
y = df['Label'].values

# Hold out 20% of the rows as a test set (fixed seed for a repeatable split)
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
)

# Insert a space between the characters of every sequence
# (assumes each entry of the 'Sequence' column is a string - TODO confirm).
# NOTE(review): the ProtBert model card also maps rare residues U/Z/O/B to X;
# confirm whether the fine-tuning run applied that step before adding it here.
X_train_preprocessed = list(map(' '.join, X_train))
X_test_preprocessed = list(map(' '.join, X_test))

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert")

# Encode both splits as PyTorch tensors, truncating at max_length tokens
max_length = 60  # based on distribution of seq lengths
train_encodings = tokenizer(
    X_train_preprocessed,
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors="pt",
)
valid_encodings = tokenizer(
    X_test_preprocessed,
    truncation=True,
    padding=True,
    max_length=max_length,
    return_tensors="pt",
)

# Re-pack each encoding as a plain dict whose tensors live on the chosen device
train_encodings = {name: tensor.to(device) for name, tensor in train_encodings.items()}
valid_encodings = {name: tensor.to(device) for name, tensor in valid_encodings.items()}

# Define the classifier model with your custom weights
class ProteinClassifier(nn.Module):
    """BERT encoder with an attached classification head, used here as an embedder.

    The module owns two submodules: ``bert`` (the pre-trained encoder) and
    ``classifier`` (Dropout -> Linear -> Tanh). ``forward`` runs only the
    encoder and returns its per-token hidden states; the ``classifier`` head
    is constructed but not applied in ``forward``.
    """

    def __init__(self, n_classes):
        """Build the encoder and a head that maps hidden_size -> n_classes."""
        super().__init__()
        self.bert = BertModel.from_pretrained('Rostlab/prot_bert_bfd_localization')
        encoder_width = self.bert.config.hidden_size
        # Keep the layer order: positions inside the Sequential determine the
        # parameter names (e.g. 'classifier.1.weight') in the state dict.
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.4),
            nn.Linear(encoder_width, n_classes),
            nn.Tanh(),
        )

    def forward(self, input_ids, attention_mask):
        """Return the encoder's ``last_hidden_state`` for the given batch."""
        encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return encoder_output.last_hidden_state

# Build the classifier architecture and place it on the selected device
model = ProteinClassifier(n_classes=1).to(device)

# Load your trained model weights.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source (consider weights_only=True if
# the installed torch version supports it).
model.load_state_dict(torch.load("protein_classifier_model.pth", map_location=device))

# Put every submodule into inference mode so dropout layers are disabled
# while embeddings are extracted.
model.eval()

def extract_embeddings_in_batches(encodings, model, batch_size=16):
    """Compute one mean-pooled embedding per sequence, running the model in batches.

    The pooling is an attention-mask-weighted mean over the token axis, so
    padding positions do not contribute to the embedding. (A plain
    ``mean(dim=1)`` would average padding vectors in and make the result depend
    on how much padding a sequence received.) Positions whose mask is 1 are
    all included.

    Args:
        encodings: Mapping with ``'input_ids'`` and ``'attention_mask'``
            entries that can be sliced along their first axis (assumed to be
            ``(n_sequences, seq_len)`` tensors already on the model's device).
        model: Callable taking ``input_ids=`` and ``attention_mask=`` keyword
            arguments. It may return either the hidden-state tensor itself
            (assumed ``(batch, seq_len, hidden)``) or an object exposing
            ``last_hidden_state``. The model's train/eval mode is not modified
            here.
        batch_size: Number of sequences per forward pass.

    Returns:
        numpy.ndarray with one row per input sequence, in input order.
    """
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']

    all_embeddings = []

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for start in range(0, len(input_ids), batch_size):
            batch_input_ids = input_ids[start:start + batch_size]
            batch_attention_mask = attention_mask[start:start + batch_size]

            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            # Accept both a raw hidden-state tensor and a model-output object.
            hidden = getattr(outputs, 'last_hidden_state', outputs)

            # Zero out padding positions, then divide by the real token count.
            mask = batch_attention_mask.unsqueeze(-1).to(hidden.dtype)
            # clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1)
            embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()
            all_embeddings.append(embeddings)

    return np.vstack(all_embeddings)

# Extract embeddings for training and validation sets
train_embeddings = extract_embeddings_in_batches(train_encodings, model)
valid_embeddings = extract_embeddings_in_batches(valid_encodings, model)

# Prepare the data for t-SNE.
# Every t-SNE fit produces its own arbitrary coordinate system, so fitting the
# train and test sets separately and drawing them on the same axes makes their
# relative positions meaningless. Fit ONCE on the stacked embeddings and split
# the result back into the two sets.
n_train = len(train_embeddings)
tsne = TSNE(n_components=2, random_state=42)
joint_tsne_embeddings = tsne.fit_transform(np.vstack([train_embeddings, valid_embeddings]))
train_tsne_embeddings = joint_tsne_embeddings[:n_train]
valid_tsne_embeddings = joint_tsne_embeddings[n_train:]

# Create DataFrame for visualization
train_df = pd.DataFrame(train_tsne_embeddings, columns=['x', 'y'])
train_df['label'] = y_train
train_df['set'] = 'train'

valid_df = pd.DataFrame(valid_tsne_embeddings, columns=['x', 'y'])
valid_df['label'] = y_test
valid_df['set'] = 'test'

# ignore_index gives every row a unique label; without it both frames keep
# their own 0..n-1 index and the combined frame has duplicated row labels.
df_tsne = pd.concat([train_df, valid_df], ignore_index=True)

# Map (set, label) to the palette key, e.g. 'train_AMPs' / 'test_non-AMPs'.
# A label equal to 1 is an AMP; anything else is treated as non-AMP.
is_amp = df_tsne['label'].eq(1)
df_tsne['color'] = df_tsne['set'] + '_' + is_amp.map({True: 'AMPs', False: 'non-AMPs'})

# Plotting the t-SNE results
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x='x', y='y',
    hue='color',
    palette={
        'train_AMPs': 'blue',
        'train_non-AMPs': 'red',
        'test_AMPs': 'green',
        'test_non-AMPs': 'orange'
    },
    data=df_tsne,
    legend='full',
    alpha=0.6,
    ax=ax
)
ax.set_title('t-SNE of BERT Sequence Embeddings after Fine-Tuning')
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')
plt.show()