In [9]:
import pandas as pd
import numpy as np
from pathlib import Path

In [3]:
# Create the output directories for the two halves of the split.
# Pure-Python equivalent of `mkdir -p`: parents are created as needed and an
# already-existing directory is not an error, so the cell can be re-run and
# does not depend on a POSIX shell being available.
for out_dir in ("bernett/FT", "bernett/TrainPPI"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Directory (relative to the notebook's working directory) that holds the
# space-separated Intra{0,1,2}_{pos,neg}_rr.txt files read by the cells below.
DATASET_LOCATION = Path("../datasets/bernett_v4/")

In [None]:
def _read_pairs(filename):
    """Load one space-separated, header-less file from DATASET_LOCATION."""
    return pd.read_csv(DATASET_LOCATION / filename, sep=" ", header=None)


# In this notebook Intra1 is used as train, Intra0 as validation, Intra2 as test.
train_pos, train_neg = _read_pairs("Intra1_pos_rr.txt"), _read_pairs("Intra1_neg_rr.txt")
val_pos, val_neg = _read_pairs("Intra0_pos_rr.txt"), _read_pairs("Intra0_neg_rr.txt")
test_pos, test_neg = _read_pairs("Intra2_pos_rr.txt"), _read_pairs("Intra2_neg_rr.txt")

# Report the row counts of each split (positives, negatives, and their sum).
print(f"Train size: {len(train_pos)} positives, {len(train_neg)} negatives, {len(train_pos) + len(train_neg)} total.")
print(f"Validation size: {len(val_pos)} positives, {len(val_neg)} negatives, {len(val_pos) + len(val_neg)} total.")
print(f"Test size: {len(test_pos)} positives, {len(test_neg)} negatives, {len(test_pos) + len(test_neg)} total.")

Train size: 81596 positives, 81596 negatives, 163192 total.
Validation size: 29630 positives, 29630 negatives, 59260 total.
Test size: 26024 positives, 26024 negatives, 52048 total.


In [8]:
# Sanity-check every loaded frame: exactly two columns and no missing values.
dfs = [train_pos, train_neg, val_pos, val_neg, test_pos, test_neg]
frame_names = ["train_pos", "train_neg", "val_pos", "val_neg", "test_pos", "test_neg"]

shape_ok, nan_free = [], []
for frame in dfs:
    shape_ok.append(frame.shape[1] == 2)            # two columns expected per row
    nan_free.append(frame.isna().sum().sum() == 0)  # total NaN count must be zero

# One row per frame, one boolean column per check.
results = pd.DataFrame({"shape_check": shape_ok, "nan_check": nan_free}, index=frame_names)
results

Unnamed: 0,shape_check,nan_check
train_pos,True,True
train_neg,True,True
val_pos,True,True
val_neg,True,True
test_pos,True,True
test_neg,True,True


# Split dataset

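Each file is split at random: 90% of its rows are written to `bernett/FT` for fine-tuning ProtBERT, and the remaining 10% are written to `bernett/TrainPPI` for training.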

In [None]:
# Split every dataset file into two disjoint row subsets: a random sample of
# FT_FRACTION of the rows is written to FT_DIR, and the rows that were not
# sampled are written to TRAIN_PPI_DIR under the same file name.
FT_FRACTION = 0.9   # share of each file's rows sampled into the FT subset
SPLIT_SEED = 42     # fixed seed so re-running the cell reproduces the same split
FT_DIR = Path("bernett/FT")
TRAIN_PPI_DIR = Path("bernett/TrainPPI")

# Make the cell self-sufficient: create the output folders if they are missing.
for out_dir in (FT_DIR, TRAIN_PPI_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

for filename in ("Intra1_pos_rr.txt", "Intra1_neg_rr.txt", "Intra0_pos_rr.txt", "Intra0_neg_rr.txt", "Intra2_pos_rr.txt", "Intra2_neg_rr.txt"):
    df = pd.read_csv(DATASET_LOCATION / filename, sep=" ", header=None)

    ft_df = df.sample(frac=FT_FRACTION, random_state=SPLIT_SEED)
    # Dropping the sampled index labels leaves exactly the rows not chosen for FT.
    train_df = df.drop(ft_df.index)
    # The two subsets must partition the file: no row lost, none counted twice.
    assert len(ft_df) + len(train_df) == len(df), f"row count mismatch in {filename}"

    # header=False is the documented way to omit the header row; header=None
    # only happened to work because None is falsy.
    ft_df.to_csv(FT_DIR / filename, sep=" ", header=False, index=False)
    train_df.to_csv(TRAIN_PPI_DIR / filename, sep=" ", header=False, index=False)

# Quick check that there is no data leakage

In [12]:
# For each file, compare its FT half with its TrainPPI half: print any row
# that appears in both, otherwise print the share of rows that went to FT.
split_files = (
    "Intra1_pos_rr.txt", "Intra1_neg_rr.txt",
    "Intra0_pos_rr.txt", "Intra0_neg_rr.txt",
    "Intra2_pos_rr.txt", "Intra2_neg_rr.txt",
)
for filename in split_files:
    ft_df, train_df = (
        pd.read_csv(Path(folder) / filename, sep=" ", header=None)
        for folder in ("bernett/FT", "bernett/TrainPPI")
    )

    # An inner join on the shared columns keeps only rows present in both halves.
    duplicates = ft_df.merge(train_df, how="inner")
    ratio = len(ft_df) / (len(ft_df) + len(train_df))

    if duplicates.empty:
        print("No duplicated rows in", filename, f"Ratio {ratio:.3f}")
        continue

    print("There are duplicated rows in", filename)
    print(duplicates)
    

No duplicated rows in Intra1_pos_rr.txt Ratio 0.900
No duplicated rows in Intra1_neg_rr.txt Ratio 0.900
No duplicated rows in Intra0_pos_rr.txt Ratio 0.900
No duplicated rows in Intra0_neg_rr.txt Ratio 0.900
No duplicated rows in Intra2_pos_rr.txt Ratio 0.900
No duplicated rows in Intra2_neg_rr.txt Ratio 0.900
