# Train ProtBERT-GRU-Attention

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast

2025-04-26 10:32:47.384178: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-26 10:32:47.387791: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-26 10:32:47.455937: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Directory that holds the dataset files addressed through DATASET_LOCATION in this notebook.
DATASET_LOCATION = Path("../datasets/bernett_v4/")

## Load dataset

In [5]:
# The pair files are space-separated and have no header row.
pair_csv_kwargs = {"sep": " ", "header": None}

# Intra1 -> train, Intra0 -> validation, Intra2 -> test (one file per class).
train_pos = pd.read_csv("bernett/TrainPPI/Intra1_pos_rr.txt", **pair_csv_kwargs)
train_neg = pd.read_csv("bernett/TrainPPI/Intra1_neg_rr.txt", **pair_csv_kwargs)
val_pos = pd.read_csv("bernett/TrainPPI/Intra0_pos_rr.txt", **pair_csv_kwargs)
val_neg = pd.read_csv("bernett/TrainPPI/Intra0_neg_rr.txt", **pair_csv_kwargs)
test_pos = pd.read_csv("bernett/TrainPPI/Intra2_pos_rr.txt", **pair_csv_kwargs)
test_neg = pd.read_csv("bernett/TrainPPI/Intra2_neg_rr.txt", **pair_csv_kwargs)

# Report the number of rows per class and per split.
print(f"Train size: {train_pos.shape[0]} positives, {train_neg.shape[0]} negatives, {train_pos.shape[0] + train_neg.shape[0]} total.")
print(f"Validation size: {val_pos.shape[0]} positives, {val_neg.shape[0]} negatives, {val_pos.shape[0] + val_neg.shape[0]} total.")
print(f"Test size: {test_pos.shape[0]} positives, {test_neg.shape[0]} negatives, {test_pos.shape[0] + test_neg.shape[0]} total.")

Train size: 8160 positives, 8160 negatives, 16320 total.
Validation size: 2963 positives, 2963 negatives, 5926 total.
Test size: 2602 positives, 2602 negatives, 5204 total.


In [5]:
# Sanity check on the raw frames: each must have exactly two columns and no NaN.
dfs = [train_pos, train_neg, val_pos, val_neg, test_pos, test_neg]
results = pd.DataFrame(
  {
    "shape_check": [frame.shape[1] == 2 for frame in dfs],
    "nan_check": [frame.isna().sum().sum() == 0 for frame in dfs],
  },
  index=["train_pos", "train_neg", "val_pos", "val_neg", "test_pos", "test_neg"],
)
results

Unnamed: 0,shape_check,nan_check
train_pos,True,True
train_neg,True,True
val_pos,True,True
val_neg,True,True
test_pos,True,True
test_neg,True,True


In [None]:
from Bio import SeqIO

# Build a mapping of record ID -> sequence string from the FASTA file.
seq_dict = {record.id: str(record.seq) for record in SeqIO.parse(DATASET_LOCATION / "human_swissprot_oneliner.fasta", "fasta")}

# Replace the IDs in every frame by their sequences and attach a label.
# `dfs` alternates positive/negative frames, so even positions get label 1
# and odd positions get label 0.
mapped_dfs = []
for i, df in enumerate(dfs):
  # Series.map with a dict yields NaN for IDs missing from the FASTA file.
  # It replaces DataFrame.applymap, which is deprecated since pandas 2.1.
  df = df.apply(lambda ids: ids.map(seq_dict))
  df.columns = ['seq1', 'seq2']
  df['label'] = (i+1) % 2
  mapped_dfs.append(df)

# Every mapped frame must have three columns; nan_check counts unmapped IDs.
results = {
    "shape_check": [df.shape[1] == 3 for df in mapped_dfs],
    "nan_check": [df.isna().sum().sum() for df in mapped_dfs]
}
results = pd.DataFrame(results, index=["train_pos", "train_neg", "val_pos", "val_neg", "test_pos", "test_neg"])
results

NameError: name 'dfs' is not defined

In [7]:
# Positions of the (positive, negative) frames of each split inside mapped_dfs.
split_positions = {"train": (0, 1), "validation": (2, 3), "test": (4, 5)}

# Stack both classes of every split, renumber the rows and wrap the result.
seq_dataset = DatasetDict({
  split: Dataset.from_pandas(
    pd.concat([mapped_dfs[pos_idx], mapped_dfs[neg_idx]]).reset_index(drop=True)
  )
  for split, (pos_idx, neg_idx) in split_positions.items()
})
seq_dataset

DatasetDict({
    train: Dataset({
        features: ['seq1', 'seq2', 'label'],
        num_rows: 16320
    })
    validation: Dataset({
        features: ['seq1', 'seq2', 'label'],
        num_rows: 5926
    })
    test: Dataset({
        features: ['seq1', 'seq2', 'label'],
        num_rows: 5204
    })
})

In [8]:
# Distinct labels of the training split and how often each occurs.
np.unique(
  seq_dataset["train"]["label"],
  return_counts=True,
)

(array([0, 1]), array([8160, 8160]))

In [9]:
# Distinct labels of the validation split and how often each occurs.
np.unique(
  seq_dataset["validation"]["label"],
  return_counts=True,
)

(array([0, 1]), array([2963, 2963]))

In [10]:
# Distinct labels of the test split and how often each occurs.
np.unique(
  seq_dataset["test"]["label"],
  return_counts=True,
)

(array([0, 1]), array([2602, 2602]))

The dataset is balanced.

In [11]:
# Drop the mapped frames, the check table and the ID -> sequence dictionary,
# then the raw pair frames, to free memory.
del mapped_dfs, dfs, results, seq_dict
del train_pos, train_neg, val_pos, val_neg, test_pos, test_neg

## Tokenize

In [21]:
# Load the pretrained ProtBERT tokenizer with lower-casing switched off.
tokenizer = BertTokenizerFast.from_pretrained(
  "Rostlab/prot_bert",
  do_lower_case=False,
  use_fast=True,
)

In [22]:
import math
import re
from functools import partial

def tokenize(batch, tokenizer, N=500):
  """Clean, truncate, pad and tokenize a batch of protein-sequence pairs.

  Args:
    batch: mapping with the keys 'seq1' and 'seq2', each an iterable of
      sequence strings of equal length (one entry per pair).
    tokenizer: callable invoked as ``tokenizer(seqs1, seqs2, padding=False,
      truncation=False)``; must expose a ``pad_token`` string attribute.
    N: maximum number of residues kept per sequence. Longer sequences keep
      their first ceil(N/2) and last floor(N/2) residues; shorter ones are
      filled up to N entries with the tokenizer's pad token.

  Returns:
    Whatever ``tokenizer`` returns for the two lists of prepared strings.
  """
  head_len = math.ceil(N / 2)
  # Parenthesised on purpose: `-N//2` is `(-N)//2`, which rounds away from
  # zero and would keep one residue too many whenever N is odd.
  tail_len = N // 2

  def prepare(seq):
    # Upper-case the sequence and replace the residues U, Z, O and B by X.
    seq = re.sub(r"[UZOB]", "X", seq.upper())
    # Keep only both ends of over-long sequences. The guard on tail_len
    # avoids `seq[-0:]`, which would return the whole sequence.
    if len(seq) > N:
      seq = seq[:head_len] + (seq[-tail_len:] if tail_len else "")
    # Separate residues by spaces and append pad tokens up to N entries.
    return ' '.join(seq) + f" {tokenizer.pad_token}"*(N - len(seq))

  seqs1 = [prepare(seq) for seq in batch['seq1']]
  seqs2 = [prepare(seq) for seq in batch['seq2']]

  # Lengths are already fixed above, so the tokenizer must not pad/truncate.
  return tokenizer(seqs1, seqs2, padding=False, truncation=False)

# Bind the tokenizer and N=500 so the resulting callable only takes the batch.
tokenize_batch = partial(tokenize, tokenizer=tokenizer, N=500)

In [13]:
# Tokenize every split in batches of 32 examples using a single process.
tok_dataset = seq_dataset.map(
  tokenize_batch,
  num_proc=1,
  batched=True,
  batch_size=32,
)

Map:   0%|          | 0/16320 [00:00<?, ? examples/s]

Map:   0%|          | 0/5926 [00:00<?, ? examples/s]

Map:   0%|          | 0/5204 [00:00<?, ? examples/s]

In [14]:
# Shuffle the training split with a fixed seed and store it back.
shuffled_train = tok_dataset['train'].shuffle(seed=42)
tok_dataset['train'] = shuffled_train

In [15]:
# Show the labels at positions 10-19 of the training split.
train_labels = tok_dataset['train']['label']
train_labels[10:20]

[0, 0, 1, 1, 1, 0, 1, 0, 1, 0]

In [16]:
# Write the tokenized splits to the "tokenized_dataset" directory.
tok_dataset.save_to_disk("tokenized_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/16320 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5926 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5204 [00:00<?, ? examples/s]

In [20]:
# Display the splits of the tokenized dataset with their features and row counts.
tok_dataset

DatasetDict({
    train: Dataset({
        features: ['seq1', 'seq2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16320
    })
    validation: Dataset({
        features: ['seq1', 'seq2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5926
    })
    test: Dataset({
        features: ['seq1', 'seq2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5204
    })
})

# Full tokenized dataset

In [None]:
# The pair files are space-separated and have no header row.
pair_csv_kwargs = {"sep": " ", "header": None}

# Intra1 -> train, Intra0 -> validation, Intra2 -> test (one file per class).
train_pos = pd.read_csv(DATASET_LOCATION / "Intra1_pos_rr.txt", **pair_csv_kwargs)
train_neg = pd.read_csv(DATASET_LOCATION / "Intra1_neg_rr.txt", **pair_csv_kwargs)
val_pos = pd.read_csv(DATASET_LOCATION / "Intra0_pos_rr.txt", **pair_csv_kwargs)
val_neg = pd.read_csv(DATASET_LOCATION / "Intra0_neg_rr.txt", **pair_csv_kwargs)
test_pos = pd.read_csv(DATASET_LOCATION / "Intra2_pos_rr.txt", **pair_csv_kwargs)
test_neg = pd.read_csv(DATASET_LOCATION / "Intra2_neg_rr.txt", **pair_csv_kwargs)

# Report the number of rows per class and per split.
print(f"Train size: {train_pos.shape[0]} positives, {train_neg.shape[0]} negatives, {train_pos.shape[0] + train_neg.shape[0]} total.")
print(f"Validation size: {val_pos.shape[0]} positives, {val_neg.shape[0]} negatives, {val_pos.shape[0] + val_neg.shape[0]} total.")
print(f"Test size: {test_pos.shape[0]} positives, {test_neg.shape[0]} negatives, {test_pos.shape[0] + test_neg.shape[0]} total.")

Train size: 81596 positives, 81596 negatives, 163192 total.
Validation size: 29630 positives, 29630 negatives, 59260 total.
Test size: 26024 positives, 26024 negatives, 52048 total.


In [14]:
# Sanity check on the raw frames: each must have exactly two columns and no NaN.
dfs = [train_pos, train_neg, val_pos, val_neg, test_pos, test_neg]
results = pd.DataFrame(
  {
    "shape_check": [frame.shape[1] == 2 for frame in dfs],
    "nan_check": [frame.isna().sum().sum() == 0 for frame in dfs],
  },
  index=["train_pos", "train_neg", "val_pos", "val_neg", "test_pos", "test_neg"],
)
results

Unnamed: 0,shape_check,nan_check
train_pos,True,True
train_neg,True,True
val_pos,True,True
val_neg,True,True
test_pos,True,True
test_neg,True,True


In [15]:
from Bio import SeqIO

# Build a mapping of record ID -> sequence string from the FASTA file.
seq_dict = {record.id: str(record.seq) for record in SeqIO.parse("bernett/human_swissprot_oneliner.fasta", "fasta")}

# Replace the IDs in every frame by their sequences and attach a label.
# `dfs` alternates positive/negative frames, so even positions get label 1
# and odd positions get label 0.
mapped_dfs = []
for i, df in enumerate(dfs):
  # Series.map with a dict yields NaN for IDs missing from the FASTA file.
  # It replaces DataFrame.applymap, which is deprecated since pandas 2.1.
  df = df.apply(lambda ids: ids.map(seq_dict))
  df.columns = ['seq1', 'seq2']
  df['label'] = (i+1) % 2
  mapped_dfs.append(df)

# Every mapped frame must have three columns; nan_check counts unmapped IDs.
results = {
    "shape_check": [df.shape[1] == 3 for df in mapped_dfs],
    "nan_check": [df.isna().sum().sum() for df in mapped_dfs]
}
results = pd.DataFrame(results, index=["train_pos", "train_neg", "val_pos", "val_neg", "test_pos", "test_neg"])
results

Unnamed: 0,shape_check,nan_check
train_pos,True,0
train_neg,True,0
val_pos,True,0
val_neg,True,0
test_pos,True,0
test_neg,True,0


In [16]:
# Positions of the (positive, negative) frames of each split inside mapped_dfs.
split_positions = {"train": (0, 1), "validation": (2, 3), "test": (4, 5)}

# Stack both classes of every split, renumber the rows and wrap the result.
seq_dataset = DatasetDict({
  split: Dataset.from_pandas(
    pd.concat([mapped_dfs[pos_idx], mapped_dfs[neg_idx]]).reset_index(drop=True)
  )
  for split, (pos_idx, neg_idx) in split_positions.items()
})
seq_dataset

DatasetDict({
    train: Dataset({
        features: ['seq1', 'seq2', 'label'],
        num_rows: 163192
    })
    validation: Dataset({
        features: ['seq1', 'seq2', 'label'],
        num_rows: 59260
    })
    test: Dataset({
        features: ['seq1', 'seq2', 'label'],
        num_rows: 52048
    })
})

In [17]:
# Distinct labels of the training split and how often each occurs.
np.unique(
  seq_dataset["train"]["label"],
  return_counts=True,
)

(array([0, 1]), array([81596, 81596]))

In [18]:
# Distinct labels of the validation split and how often each occurs.
np.unique(
  seq_dataset["validation"]["label"],
  return_counts=True,
)

(array([0, 1]), array([29630, 29630]))

In [19]:
# Distinct labels of the test split and how often each occurs.
np.unique(
  seq_dataset["test"]["label"],
  return_counts=True,
)

(array([0, 1]), array([26024, 26024]))

In [23]:
# Tokenize every split in batches of 32 examples using a single process.
tok_dataset = seq_dataset.map(
  tokenize_batch,
  num_proc=1,
  batched=True,
  batch_size=32,
)

Map:   0%|          | 0/163192 [00:00<?, ? examples/s]

Map:   0%|          | 0/59260 [00:00<?, ? examples/s]

Map:   0%|          | 0/52048 [00:00<?, ? examples/s]

In [24]:
# Write the tokenized splits to the "tokenized_dataset_full" directory.
tok_dataset.save_to_disk("tokenized_dataset_full")

Saving the dataset (0/3 shards):   0%|          | 0/163192 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/59260 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/52048 [00:00<?, ? examples/s]