In [61]:
import tqdm as notebook_tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split

MODEL_NAME = "Rostlab/prot_bert" 
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu" # using mps instead of cuda for training on mac
#DEVICE = "cpu"  # use GPU if available, otherwise CPU
print(f"Using device: {DEVICE}")
NUM_CLASSES = 6  # num classes for classification
BATCH_SIZE = 16
EPOCHS = 5
LR = 0.001

Using device: mps


In [62]:
import pandas as pd

records = []  # uniprot_ac, kingdom, type_, sequence, label
with open("/Users/jonas/Desktop/Uni/PBL/data/complete_set_unpartitioned.fasta", "r") as f:
    current_record = None
    for line in f:
        if line.startswith(">"):
            if current_record is not None:
                if current_record["sequence"] is not None and current_record["label"] is not None:
                    # Save the previous record before starting a new one
                    records.append(current_record)
                else:
                    # If the previous record is incomplete, skip it
                    print("Skipping incomplete record:", current_record)
            # Start a new record
            uniprot_ac, kingdom, type_ = line[1:].strip().split("|")
            current_record = {"uniprot_ac": uniprot_ac, "kingdom": kingdom, "type": type_, "sequence": None, "label": None}
        else:
            # Check if the line contains a sequence or a label
            if current_record["sequence"] is None:
                current_record["sequence"] = line.strip()
            elif current_record["label"] is None:
                current_record["label"] = line.strip()
            else:
                # If both sequence and label are already set, skip this line
                print("Skipping extra line in record:", current_record)
    # Save the last record if it's complete
    if current_record is not None:
        if current_record["sequence"] is not None and current_record["label"] is not None:
            records.append(current_record)
        else:
            print("Skipping incomplete record:", current_record)

"""
# Save the DataFrame to a CSV file
df_raw.to_csv("/Users/jonas/Desktop/Uni/PBL/data/complete_set_unpartitioned.csv", index=False)
"""
# Print the number of records
print(f"Total records: {len(records)}")
df_raw = pd.DataFrame(records)
df_raw.head()


Total records: 25693


Unnamed: 0,uniprot_ac,kingdom,type,sequence,label
0,Q8TF40,EUKARYA,NO_SP,MAPTLFQKLFSKRTGLGAPGRDARDPDCGFSWPLPEFDPSQIRLIV...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
1,Q1ENB6,EUKARYA,NO_SP,MDFTSLETTTFEEVVIALGSNVGNRMNNFKEALRLMKDYGISVTRH...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
2,P36001,EUKARYA,NO_SP,MDDISGRQTLPRINRLLEHVGNPQDSLSILHIAGTNGKETVSKFLT...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
3,P55317,EUKARYA,NO_SP,MLGTVKMEGHETSDWNSYYADTQEAYSSVPVSNMNSGLGSMNSMNT...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...
4,P35583,EUKARYA,NO_SP,MLGAVKMEGHEPSDWSSYYAEPEGYSSVSNMNAGLGMNGMNTYMSM...,IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII...


In [63]:
df = df_raw[~df_raw["label"].str.contains("P")]

df["type"] = df["type"].replace("NO_SP", 0)
df["type"] = df["type"].replace("SP", 1)
df["type"] = df["type"].replace("LIPO", 1)
df["type"] = df["type"].replace("TAT", 1)
df["type"] = df["type"].replace("TATLIPO", 1)

# print all unique types
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25580 entries, 0 to 25692
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   uniprot_ac  25580 non-null  object
 1   kingdom     25580 non-null  object
 2   type        25580 non-null  int64 
 3   sequence    25580 non-null  object
 4   label       25580 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.2+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["type"] = df["type"].replace("NO_SP", 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["type"] = df["type"].replace("SP", 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["type"] = df["type"].replace("LIPO", 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

In [64]:
label_map = {'S': 0, 'T': 1, 'L': 2, 'I': 3, 'M': 4, 'O': 5}

df_encoded = df.copy()
df_encoded["label"] = df_encoded["label"].apply(lambda x: [label_map[c] for c in x if c in label_map])
df_encoded = df_encoded[df_encoded["label"].map(len) > 0]  # Remove rows with empty label lists

df_encoded.head()


Unnamed: 0,uniprot_ac,kingdom,type,sequence,label
0,Q8TF40,EUKARYA,0,MAPTLFQKLFSKRTGLGAPGRDARDPDCGFSWPLPEFDPSQIRLIV...,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
1,Q1ENB6,EUKARYA,0,MDFTSLETTTFEEVVIALGSNVGNRMNNFKEALRLMKDYGISVTRH...,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
2,P36001,EUKARYA,0,MDDISGRQTLPRINRLLEHVGNPQDSLSILHIAGTNGKETVSKFLT...,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
3,P55317,EUKARYA,0,MLGTVKMEGHETSDWNSYYADTQEAYSSVPVSNMNSGLGSMNSMNT...,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
4,P35583,EUKARYA,0,MLGAVKMEGHEPSDWSSYYAEPEGYSSVSNMNAGLGMNGMNTYMSM...,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."


In [65]:
# perform oversampling of class 1 to balance the dataset
from sklearn.utils import resample
# Separate majority and minority classes
df_majority = df_encoded[df_encoded["type"] == 0]
df_minority = df_encoded[df_encoded["type"] == 1]
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                  replace=True,     # sample with replacement
                                  n_samples=len(df_majority),    # to match majority class
                                  random_state=42) # reproducible results
# Combine majority class with upsampled minority class
df_balanced = pd.concat([df_majority, df_minority_upsampled])
# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
sequences = df_balanced["sequence"].tolist()
label_seqs = df_balanced["label"].tolist()
types = df_balanced["type"].tolist()

# print frequency of each type
print("Type distribution after balancing:")
print(df_balanced["type"].value_counts())


Type distribution after balancing:
type
1    19036
0    19036
Name: count, dtype: int64


In [66]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)
encoder = BertModel.from_pretrained(MODEL_NAME)
encoder.to(DEVICE)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30, 1024, padding_idx=0)
    (position_embeddings): Embedding(40000, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-29): 30 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.0, i

In [67]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from collections import Counter
from sklearn.model_selection import train_test_split

# Function to get BERT embeddings for a given text and return mean-pooled embeddings
# Function to get BERT embeddings
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(DEVICE)
    with torch.no_grad():
        outputs = encoder(**inputs)
    attention_mask = inputs["attention_mask"].unsqueeze(-1)
    token_embeddings = outputs.last_hidden_state
    sum_embeddings = (token_embeddings * attention_mask).sum(1)
    sum_mask = attention_mask.sum(1)
    mean_pooled = sum_embeddings / sum_mask
    return mean_pooled.cpu().numpy().flatten()

print("Generating BERT embeddings...")
# Extract features from all sequences using the encoder
# This can be slow for large datasets; consider batch processing for efficiency
embeddings = np.array([get_bert_embedding(seq) for seq in sequences])
y = np.array(types)
print(f"Shape of X: {embeddings.shape}")
print(f"Shape of y: {y.shape}")

# Split the dataset into training and testing subsets
embeddings_train, embeddings_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=42, stratify=y) 

# Initial model evaluation (your original code snippet)
model_scaled = XGBClassifier(
    eval_metric='logloss',
    random_state=42,
)

print("\n--- Training Model with scale_pos_weight ---")
model_scaled.fit(embeddings_train, y_train)

y_pred_proba = model_scaled.predict_proba(embeddings_test)[:, 1]

# Try different thresholds:
thresholds_to_test = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] # Add more granular steps if needed

for threshold in thresholds_to_test:
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)

    accuracy_adjusted = accuracy_score(y_test, y_pred_adjusted)
    precision_adjusted, recall_adjusted, f1_adjusted, _ = precision_recall_fscore_support(
        y_test, y_pred_adjusted, average='binary', zero_division=0
    )

    print(f"\n--- Model Performance with Adjusted Threshold ({threshold:.2f}) ---")
    print(f"Accuracy: {accuracy_adjusted:.2f}")
    print(f"Precision: {precision_adjusted:.2f}")
    print(f"Recall: {recall_adjusted:.2f}")
    print(f"F1-Score: {f1_adjusted:.2f}")
    print(f"Distribution of adjusted predictions: {Counter(y_pred_adjusted)}")

Generating BERT embeddings...
Shape of X: (38072, 1024)
Shape of y: (38072,)

--- Training Model with scale_pos_weight ---

--- Model Performance with Adjusted Threshold (0.10) ---
Accuracy: 0.50
Precision: 0.50
Recall: 1.00
F1-Score: 0.67
Distribution of adjusted predictions: Counter({1: 7615})

--- Model Performance with Adjusted Threshold (0.20) ---
Accuracy: 0.50
Precision: 0.50
Recall: 1.00
F1-Score: 0.67
Distribution of adjusted predictions: Counter({1: 7615})

--- Model Performance with Adjusted Threshold (0.30) ---
Accuracy: 0.50
Precision: 0.50
Recall: 1.00
F1-Score: 0.67
Distribution of adjusted predictions: Counter({1: 7615})

--- Model Performance with Adjusted Threshold (0.40) ---
Accuracy: 0.50
Precision: 0.50
Recall: 1.00
F1-Score: 0.67
Distribution of adjusted predictions: Counter({1: 7615})

--- Model Performance with Adjusted Threshold (0.50) ---
Accuracy: 0.50
Precision: 0.50
Recall: 1.00
F1-Score: 0.67
Distribution of adjusted predictions: Counter({1: 7615})

--- Mo