In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Loading Data with Pandas

In [2]:
# Read the SCOP table from disk. The lookup helpers in this notebook read its
# 'index', 'fa', 'sf' and 'fold' columns.
scop_csv = "/scratch/gpfs/jr8867/main/db/scop_data_fold.csv"
scop_df = pd.read_csv(scop_csv)

# Bare last expression: let the notebook render the (truncated) frame.
scop_df

Unnamed: 0,index,uid,fa,sf,fold,seq
0,0,Q03131,4000119,3000038,2000148,MSGPRSRTTSRRTPVRIGAVVVASSTSELLDGLAAVADGRPHASVV...
1,1,P09147,4000088,3000038,2000148,MRVLVTGGSGYIGSHTCVQLLQNGHDVIILDNLCNSKRSVLPVIER...
2,2,P61889,4000045,3000039,2000005,MKVAVLGAAGGIGQALALLLKTQLPSGSELSLYDIAPVTPGVAVDL...
3,3,P00334,4000029,3000038,2000148,MSFTLTNKNVIFVAGLGGIGLDTSKELLKRDLKNLVILDRIENPAA...
4,4,O33830,4000089,3000039,2000005,MPSVKIGIIGAGSAVFSLRLVSDLCKTPGLSGSTVTLMDIDEERLD...
...,...,...,...,...,...,...
35972,35972,P20585,4004015,3000587,2001251,MSRRKPASGGLAASSSAPARQAVLSRFFQSTGSLKSTSSSTGAADQ...
35973,35973,P20585,4004015,3002020,2001251,MSRRKPASGGLAASSSAPARQAVLSRFFQSTGSLKSTSSSTGAADQ...
35974,35974,P52701,4004015,3001688,2001251,MSRQSTLYSFFPKSPALSDANKASARASREGGRAAAAPGASPSPGG...
35975,35975,P52701,4004015,3000587,2001251,MSRQSTLYSFFPKSPALSDANKASARASREGGRAAAAPGASPSPGG...


In [5]:
def _lookup_labels(indicies, column, df=None):
    """Return the `column` value of the row whose 'index' matches each entry of `indicies`.

    Builds one 'index' -> `column` lookup and resolves every requested index in a
    single vectorised step, instead of scanning the whole table once per index.

    Args:
        indicies: Iterable of values to match against the table's 'index' column.
        column: Name of the column to read ('fold', 'sf' or 'fa').
        df: Table to search; defaults to the notebook-level `scop_df`.

    Returns:
        1-D numpy array with one entry per requested index, in the same order.

    Raises:
        IndexError: If any requested index is absent from the 'index' column.
    """
    table = scop_df if df is None else df
    # Keep only the first row per 'index' value, which is what the original
    # per-item `.values[0]` selection returned when an index occurred more than once.
    lookup = table.drop_duplicates(subset='index').set_index('index')[column]

    keys = np.asarray(indicies)
    if keys.size == 0:
        return np.empty(0, dtype=lookup.dtype)

    # Fail loudly (and name the offenders) rather than with an opaque
    # "index 0 is out of bounds" coming from an empty selection.
    absent = ~np.isin(keys, lookup.index.to_numpy())
    if absent.any():
        preview = np.unique(keys[absent])[:10].tolist()
        raise IndexError(f"indices not found in the 'index' column: {preview}")

    return lookup.loc[keys].to_numpy()


def get_folds(indicies, df=None):
    """Return the 'fold' value for each sequence index (looked up in `scop_df` unless `df` is given)."""
    return _lookup_labels(indicies, 'fold', df)


def get_superfamilies(indicies, df=None):
    """Return the 'sf' value for each sequence index (looked up in `scop_df` unless `df` is given)."""
    return _lookup_labels(indicies, 'sf', df)


def get_families(indicies, df=None):
    """Return the 'fa' value for each sequence index (looked up in `scop_df` unless `df` is given)."""
    return _lookup_labels(indicies, 'fa', df)

# Spot-check the three label lookups on the same handful of sequence indices.
sample_indicies = [0, 1, 35976]
for label_lookup in (get_folds, get_superfamilies, get_families):
    print(label_lookup(sample_indicies))

[2000148 2000148 2001251]
[3000038 3000038 3002020]
[4000119 4000088 4004015]


# Loading Embeddings with NumPy

In [6]:
# Load the embedding matrix together with the array of sequence indices that
# accompanies it (each embedding is 1280 dim).
embeddings, indicies = (
    np.load("/scratch/gpfs/jr8867/main/db/full/embeddings.npy"),
    np.load("/scratch/gpfs/jr8867/main/db/full/indicies.npy"),
)

# Report both shapes so the row counts can be compared by eye.
for loaded_array in (embeddings, indicies):
    print(loaded_array.shape)

(35977, 1280)
(35977,)


# Construct Dataset

X: Embeddings

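y: Sequence index (matches the `index` column of the SCOP table and is used to look up each sequence's family, superfamily, and fold)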

In [7]:
# Fold label of every embedded sequence. The split below is stratified on the
# 'fold' column (not on superfamily), so that is what the names and messages say.
print("Extracting fold information for stratification...")
stratify_folds = get_folds(indicies)

# Encode the raw fold ids as consecutive integer class labels.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(stratify_folds)

# A stratified split needs at least 2 samples per class, so drop every sequence
# whose fold occurs only once.
# NOTE: this overwrites `embeddings` / `indicies` with their filtered versions.
print("Filtering indices with at least 2 samples...")
unique, counts = np.unique(labels, return_counts=True)
valid_labels = unique[counts > 1]
mask = np.isin(labels, valid_labels)
embeddings = embeddings[mask]
indicies = indicies[mask]
labels = labels[mask]  # Keep labels aligned with the filtered arrays
print(f"Filtered embeddings shape: {embeddings.shape}")
print(f"Filtered indicies shape: {indicies.shape}")

# Fixed seed: the split is written to disk in a later cell, so it has to be
# reproducible under Restart & Run All.
SPLIT_SEED = 42
train_embeddings, test_embeddings, train_indicies, test_indicies = train_test_split(
    embeddings, indicies, test_size=0.25, stratify=labels, random_state=SPLIT_SEED
)

# Look up every label granularity for both halves of the split.
train_superfamilies = get_superfamilies(train_indicies)
test_superfamilies = get_superfamilies(test_indicies)
train_families = get_families(train_indicies)
test_families = get_families(test_indicies)
train_folds = get_folds(train_indicies)
test_folds = get_folds(test_indicies)


Extracting superfamily information for stratification...
Filtering indices with at least 2 samples...
Filtered embeddings shape: (35571, 1280)
Filtered indicies shape: (35571,)


In [9]:
# Print the shape of every array produced by the split, one per line, so the
# train/test row counts can be checked against each other.
shape_report = [
    ("Train embeddings shape:", train_embeddings),
    ("Train indices shape:", train_indicies),
    ("Train superfamilies shape:", train_superfamilies),
    ("Train families shape:", train_families),
    ("Test embeddings shape:", test_embeddings),
    ("Test indices shape:", test_indicies),
    ("Test superfamilies shape:", test_superfamilies),
    ("Test families shape:", test_families),
    ("Train folds shape:", train_folds),
    ("Test folds shape:", test_folds),
]
for caption, split_array in shape_report:
    print(caption, split_array.shape)

Train embeddings shape: (26678, 1280)
Train indices shape: (26678,)
Train superfamilies shape: (26678,)
Train families shape: (26678,)
Test embeddings shape: (8893, 1280)
Test indices shape: (8893,)
Test superfamilies shape: (8893,)
Test families shape: (8893,)
Train folds shape: (26678,)
Test folds shape: (8893,)


In [10]:
# Persist every array of the split as its own .npy file, written in the order
# listed here.
split_outputs = [
    ("/scratch/gpfs/jr8867/main/db/train-test-fold/train_embeddings.npy", train_embeddings),
    ("/scratch/gpfs/jr8867/main/db/train-test-fold/test_embeddings.npy", test_embeddings),
    ("/scratch/gpfs/jr8867/main/db/train-test-fold/train_indicies.npy", train_indicies),
    ("/scratch/gpfs/jr8867/main/db/train-test-fold/test_indicies.npy", test_indicies),
    ("/scratch/gpfs/jr8867/main/db/train-test-fold/train_superfamilies.npy", train_superfamilies),
    ("/scratch/gpfs/jr8867/main/db/train-test-fold/test_superfamilies.npy", test_superfamilies),
    ("/scratch/gpfs/jr8867/main/db/train-test-fold/train_families.npy", train_families),
    ("/scratch/gpfs/jr8867/main/db/train-test-fold/test_families.npy", test_families),
    ("/scratch/gpfs/jr8867/main/db/train-test-fold/train_folds.npy", train_folds),
    ("/scratch/gpfs/jr8867/main/db/train-test-fold/test_folds.npy", test_folds),
]
for destination, split_array in split_outputs:
    np.save(destination, split_array)


: 