In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the SCOP metadata table; the preview shows columns index, uid, fa, sf, fold and seq.
# NOTE(review): the preview also shows an 'Unnamed: 0' column, which suggests the CSV
# was written together with its pandas index -- confirm whether that column is wanted.
scop_data_fold_path = '/scratch/gpfs/jr8867/strat-main/db/scop_data_fold.csv'
scop_data_fold = pd.read_csv(filepath_or_buffer=scop_data_fold_path)

# Show the first five rows as the cell output.
scop_data_fold.head(n=5)

Unnamed: 0,index,uid,fa,sf,fold,seq
0,0,Q03131,4000119,3000038,2000148,MSGPRSRTTSRRTPVRIGAVVVASSTSELLDGLAAVADGRPHASVV...
1,1,P09147,4000088,3000038,2000148,MRVLVTGGSGYIGSHTCVQLLQNGHDVIILDNLCNSKRSVLPVIER...
2,2,P61889,4000045,3000039,2000005,MKVAVLGAAGGIGQALALLLKTQLPSGSELSLYDIAPVTPGVAVDL...
3,3,P00334,4000029,3000038,2000148,MSFTLTNKNVIFVAGLGGIGLDTSKELLKRDLKNLVILDRIENPAA...
4,4,O33830,4000089,3000039,2000005,MPSVKIGIIGAGSAVFSLRLVSDLCKTPGLSGSTVTLMDIDEERLD...


In [3]:
# Locations of the embedding matrix and of the array holding, for every embedding row,
# the 'index' value of the metadata row it was computed from.
embeddings_path = '/scratch/gpfs/jr8867/strat-main/db/full/embeddings.npy'
indicies_path = '/scratch/gpfs/jr8867/strat-main/db/full/indicies.npy'

# Load both arrays in one pass (embeddings first, then the index array).
embeddings, original_indices_npy = (
    np.load(array_path) for array_path in (embeddings_path, indicies_path)
)

# Report the shapes, one per line, so a row-count mismatch is easy to spot.
print(embeddings.shape, original_indices_npy.shape, sep='\n')

(35977, 1280)
(35977,)


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# --- Stratified Train/Test Split ---
# The split is done at the family level: whole families go to either train or test,
# and the assignment is stratified by the fold each family belongs to.

# 1. One row per (family, fold) pair, turned into a family -> fold lookup Series.
family_fold_pairs = scop_data_fold[['fa', 'fold']].drop_duplicates()
family_to_fold = family_fold_pairs.set_index('fa')['fold']
print(f"\nTotal unique families: {len(family_to_fold)}")

# 2. Count families per fold; a fold with a single family cannot be stratified.
fold_family_counts = family_to_fold.groupby(family_to_fold).count()
has_single_family = fold_family_counts == 1
singleton_folds = fold_family_counts[has_single_family].index
print(f"Found {len(singleton_folds)} folds with only one family. These will be excluded from the split.")

# 3. Keep only families whose fold has at least two families.
in_singleton_fold = family_to_fold.isin(singleton_folds)
families_to_split = family_to_fold[~in_singleton_fold]
print(f"Families available for splitting: {len(families_to_split)}")

# 4. The family ids to split and, in the same order, the fold label used as stratum.
unique_families_to_split = families_to_split.index.unique()
family_strata = families_to_split.loc[unique_families_to_split]

# 5. 80/20 split of the family ids, stratified by fold, with a fixed seed.
train_families, test_families = train_test_split(
    unique_families_to_split,
    stratify=family_strata,
    test_size=0.2,
    random_state=42,
)

print(f"\nFamilies in train set: {len(train_families)}")
print(f"Families in test set: {len(test_families)}")

# 6. Select the rows of the original table by family membership.
# Rows whose family sits in a singleton fold end up in neither train_df nor test_df.
is_train_row = scop_data_fold['fa'].isin(train_families)
is_test_row = scop_data_fold['fa'].isin(test_families)
train_df = scop_data_fold[is_train_row].reset_index(drop=True)
test_df = scop_data_fold[is_test_row].reset_index(drop=True)

print(f"\nInitial Train DataFrame shape (before alignment): {train_df.shape}")
print(f"Initial Test DataFrame shape (before alignment): {test_df.shape}")

# Sanity check: no family may appear on both sides of the split.
assert set(train_df['fa']).isdisjoint(set(test_df['fa'])), "Overlap detected in families between train and test sets!"

# --- Align and Split Embeddings ---
# Goal: produce train/test embedding arrays whose row i belongs to row i of
# train_df / test_df, using the DataFrame 'index' column as the join key.

# Positions taken from original_indices_npy are used to index embeddings, so the
# two arrays must describe the same number of rows.
assert embeddings.shape[0] == original_indices_npy.shape[0], "Embeddings and original indices arrays have different lengths"

# 7. Create a map from original index value to its position in the embeddings array
# NOTE: if a value occurs more than once in original_indices_npy, the last position wins.
index_map = {original_idx: position for position, original_idx in enumerate(original_indices_npy)}

# 8. Get the original index values present in the train/test DataFrames
train_indices_df = train_df['index'].values
test_indices_df = test_df['index'].values

# 9. Find the positions in the embeddings array corresponding to train/test data
# (rows without an embedding are skipped; DataFrame row order is preserved)
train_positions = [index_map[idx] for idx in train_indices_df if idx in index_map]
test_positions = [index_map[idx] for idx in test_indices_df if idx in index_map]

# 10. Filter DataFrames to only include samples whose embeddings were found
original_train_size = len(train_df)
original_test_size = len(test_df)

# Important: Filter based on the indices *actually found* in the map
train_indices_found = {original_indices_npy[pos] for pos in train_positions}
test_indices_found = {original_indices_npy[pos] for pos in test_positions}

train_df = train_df[train_df['index'].isin(train_indices_found)].reset_index(drop=True)
test_df = test_df[test_df['index'].isin(test_indices_found)].reset_index(drop=True)

# Rows can only be lost here because their 'index' value has no embedding; the
# singleton-fold families were already excluded before the sizes above were recorded.
if len(train_df) != original_train_size:
    print(f"\nWarning: {original_train_size - len(train_df)} train samples dropped because their index has no matching embedding.")
if len(test_df) != original_test_size:
    print(f"Warning: {original_test_size - len(test_df)} test samples dropped because their index has no matching embedding.")

print(f"\nFinal Aligned Train DataFrame shape: {train_df.shape}")
print(f"Final Aligned Test DataFrame shape: {test_df.shape}")


# 11. Slice the embeddings and original index arrays using the found positions
# (explicit integer arrays keep the fancy indexing well-defined even when empty)
train_position_array = np.asarray(train_positions, dtype=np.intp)
test_position_array = np.asarray(test_positions, dtype=np.intp)

train_embeddings = embeddings[train_position_array]
test_embeddings = embeddings[test_position_array]

# Keep the corresponding original indices as well
train_original_indices = original_indices_npy[train_position_array]
test_original_indices = original_indices_npy[test_position_array]

# --- Final Checks ---
# Row counts must agree ...
assert train_embeddings.shape[0] == len(train_df), "Train embeddings rows do not match Train DataFrame rows"
assert test_embeddings.shape[0] == len(test_df), "Test embeddings rows do not match Test DataFrame rows"
assert train_original_indices.shape[0] == len(train_df), "Train original indices count does not match Train DataFrame rows"
assert test_original_indices.shape[0] == len(test_df), "Test original indices count does not match Test DataFrame rows"

# ... and, beyond counts, every embedding row must belong to the DataFrame row at the
# same position (this is the row-by-row alignment downstream consumers rely on).
assert np.array_equal(train_original_indices, train_df['index'].to_numpy()), "Train embeddings are not row-aligned with Train DataFrame"
assert np.array_equal(test_original_indices, test_df['index'].to_numpy()), "Test embeddings are not row-aligned with Test DataFrame"


print(f"\nFinal Train embeddings shape: {train_embeddings.shape}")
print(f"Final Test embeddings shape: {test_embeddings.shape}")
print(f"Final Train original indices shape: {train_original_indices.shape}")
print(f"Final Test original indices shape: {test_original_indices.shape}")


# --- Verification (Optional) ---

# Compare the per-fold share of samples (restricted to folds that took part in the
# split) between the full table, the train set and the test set.
split_folds = family_strata.unique()

original_in_split_folds = scop_data_fold['fold'].isin(split_folds)
train_in_split_folds = train_df['fold'].isin(split_folds)
test_in_split_folds = test_df['fold'].isin(split_folds)

original_fold_dist = scop_data_fold.loc[original_in_split_folds, 'fold'].value_counts(normalize=True)
train_fold_dist = train_df.loc[train_in_split_folds, 'fold'].value_counts(normalize=True)
test_fold_dist = test_df.loc[test_in_split_folds, 'fold'].value_counts(normalize=True)

# Fractions are converted to percentages; a fold absent from one set shows up as 0.
fold_dist_comparison = pd.DataFrame({
    'Original % (Split Folds)': original_fold_dist * 100,
    'Train %': train_fold_dist * 100,
    'Test %': test_fold_dist * 100,
})
fold_dist_comparison = fold_dist_comparison.fillna(0)

print("\nFold Distribution Comparison for Split Folds (%):")
print(fold_dist_comparison.head())

# Spot-check family separation: the first train family must appear only in
# train_df, and the first test family only in test_df.
if len(train_families) > 0:
    example_train_family = train_families[0]
    found_in_train = bool((train_df['fa'] == example_train_family).any())
    found_in_test = bool((test_df['fa'] == example_train_family).any())
    print(f"\nChecking train family {example_train_family}:")
    print(f"  In final train_df: {found_in_train}")
    print(f"  In final test_df: {found_in_test}")

if len(test_families) > 0:
    example_test_family = test_families[0]
    found_in_train = bool((train_df['fa'] == example_test_family).any())
    found_in_test = bool((test_df['fa'] == example_test_family).any())
    print(f"\nChecking test family {example_test_family}:")
    print(f"  In final train_df: {found_in_train}")
    print(f"  In final test_df: {found_in_test}")



Total unique families: 5887
Found 927 folds with only one family. These will be excluded from the split.
Families available for splitting: 4960

Families in train set: 3968
Families in test set: 992

Initial Train DataFrame shape (before alignment): (24195, 6)
Initial Test DataFrame shape (before alignment): (7251, 6)

Final Aligned Train DataFrame shape: (24195, 6)
Final Aligned Test DataFrame shape: (7251, 6)

Final Train embeddings shape: (24195, 1280)
Final Test embeddings shape: (7251, 1280)
Final Train original indices shape: (24195,)
Final Test original indices shape: (7251,)

Fold Distribution Comparison for Split Folds (%):
         Original % (Split Folds)   Train %    Test %
fold                                                 
2000000                  1.621828  1.066336  3.475383
2000001                  0.031801  0.041331  0.000000
2000002                  1.017618  0.103327  4.068404
2000003                  0.206704  0.247985  0.068956
2000005                  1.418304 

In [6]:
import os

# Root directory for the saved split, with one sub-directory per set.
# NOTE(review): the inputs above are read from '/scratch/gpfs/jr8867/strat-main/db'
# while this writes under '/scratch/gpfs/jr8867/main/db' -- confirm this is intended.
save_split_path = '/scratch/gpfs/jr8867/main/db/family-split-train-test' # directory to save the split data
train_dir = os.path.join(save_split_path, 'train')
test_dir = os.path.join(save_split_path, 'test')

# Make sure the root and both sub-directories exist (no error if already present).
for output_dir in (save_split_path, train_dir, test_dir):
    os.makedirs(output_dir, exist_ok=True)

# Metadata tables as CSV, written without the pandas row index.
train_metadata_file = os.path.join(train_dir, 'train_metadata.csv')
test_metadata_file = os.path.join(test_dir, 'test_metadata.csv')
train_df.to_csv(train_metadata_file, index=False)
test_df.to_csv(test_metadata_file, index=False)

# Embedding matrices as .npy files.
train_embeddings_file = os.path.join(train_dir, 'train_embeddings.npy')
test_embeddings_file = os.path.join(test_dir, 'test_embeddings.npy')
np.save(train_embeddings_file, train_embeddings)
np.save(test_embeddings_file, test_embeddings)

# Original index values that accompany each embedding row, as .npy files.
train_indices_file = os.path.join(train_dir, 'train_original_indices.npy')
test_indices_file = os.path.join(test_dir, 'test_original_indices.npy')
np.save(train_indices_file, train_original_indices)
np.save(test_indices_file, test_original_indices)

print(f"\nSaved train and test data to {save_split_path}")


Saved train and test data to /scratch/gpfs/jr8867/main/db/family-split-train-test


In [9]:
# --- Create a Training Subset Matching Test Set Size ---
# Draws, without replacement, as many training rows as there are test rows, and
# slices the DataFrame, the embeddings and the original-index array identically.

# Fixed seed so the sampled subset is the same on every run (the train/test split
# itself is already pinned with random_state=42).
SUBSET_SEED = 42

# Get the sizes of the test and train sets
n_test_samples = len(test_df)
n_train_samples = len(train_df)

print(f"\nTest set size: {n_test_samples}")
print(f"Original train set size: {n_train_samples}")

if n_test_samples > n_train_samples:
    print("Warning: Test set is larger than the training set. Cannot create a subset of this size.")
    # No subset can be drawn; downstream cells check for None before saving.
    train_subset_df = None
    train_subset_embeddings = None
    train_subset_original_indices = None
else:
    # Generate reproducible random row positions for the subset (without replacement)
    subset_rng = np.random.default_rng(SUBSET_SEED)
    subset_indices = subset_rng.choice(n_train_samples, size=n_test_samples, replace=False)
    subset_indices.sort()  # keep the subset in the same relative order as train_df

    # Create the subset DataFrame and embeddings from the same row positions
    train_subset_df = train_df.iloc[subset_indices].reset_index(drop=True)
    train_subset_embeddings = train_embeddings[subset_indices]
    train_subset_original_indices = train_original_indices[subset_indices]

    print(f"\nCreated training subset with size: {len(train_subset_df)}")
    print(f"Train subset DataFrame shape: {train_subset_df.shape}")
    print(f"Train subset embeddings shape: {train_subset_embeddings.shape}")
    print(f"Train subset original indices shape: {train_subset_original_indices.shape}")

    # --- Verification ---
    assert len(train_subset_df) == n_test_samples, "Subset size does not match test set size"
    assert train_subset_embeddings.shape[0] == n_test_samples, "Subset embeddings rows do not match test set size"
    assert train_subset_original_indices.shape[0] == n_test_samples, "Subset original indices count does not match test set size"

    # The subset must stay row-aligned: each kept original index matches the
    # 'index' value of the DataFrame row at the same position.
    assert np.array_equal(train_subset_original_indices, train_subset_df['index'].to_numpy()), "Subset original indices are not row-aligned with subset DataFrame"

    # Assert there is no overlap between train subset and test set
    assert len(set(train_subset_df['fa']).intersection(set(test_df['fa']))) == 0, "Overlap detected in families between train subset and test set!"

    print("\nTraining subset created and verified successfully.")



Test set size: 7251
Original train set size: 24195

Created training subset with size: 7251
Train subset DataFrame shape: (7251, 6)
Train subset embeddings shape: (7251, 1280)
Train subset original indices shape: (7251,)

Training subset created and verified successfully.


In [10]:
# --- Save Training Subset (Optional) ---
# Writes the subset's metadata, embeddings and original indices into a
# 'train_subset' directory under save_split_path, unless no subset was created.

if train_subset_df is None:
    print("\nSkipping saving of training subset as it was not created.")
else:
    subset_save_dir = os.path.join(save_split_path, 'train_subset')
    os.makedirs(subset_save_dir, exist_ok=True)

    # Target files for the three artifacts.
    subset_metadata_file = os.path.join(subset_save_dir, 'train_subset_metadata.csv')
    subset_embeddings_file = os.path.join(subset_save_dir, 'train_subset_embeddings.npy')
    subset_indices_file = os.path.join(subset_save_dir, 'train_subset_original_indices.npy')

    # Metadata as CSV (without the pandas row index), arrays as .npy.
    train_subset_df.to_csv(subset_metadata_file, index=False)
    np.save(subset_embeddings_file, train_subset_embeddings)
    np.save(subset_indices_file, train_subset_original_indices)

    print(f"\nSaved training subset data to {subset_save_dir}")



Saved training subset data to /scratch/gpfs/jr8867/main/db/family-split-train-test/train_subset


# Information

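Each saved embeddings `.npy` array is row-aligned with the metadata CSV saved next to it: row *i* of `train_embeddings.npy` is the embedding of row *i* of `train_metadata.csv`, and the same holds for the test and train-subset files. The matching `*_original_indices.npy` arrays follow the same row order.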