In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import faiss
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Fetching Data from Storages

In [2]:
def fetch_batched_arr(arr_type, n=8):
    arrays = [np.load(f'/scratch/gpfs/jr8867/embeddings/scop/batch-{i}-{arr_type}.npy') for i in range(n)]
    return np.concatenate(arrays)

In [3]:
embeddings = fetch_batched_arr('embeddings')
indices = fetch_batched_arr('indices')
print(embeddings.shape)
print(indices.shape)

(35977, 1280)
(35977,)


In [4]:
# Create directory if it doesn't exist
save_dir = '/scratch/gpfs/jr8867/embeddings/scop'

# Save the arrays
np.save(os.path.join(save_dir, 'embeddings.npy'), embeddings)
np.save(os.path.join(save_dir, 'indices.npy'), indices)

print(f"Saved embeddings and indices to {save_dir}")

Saved embeddings and indices to /scratch/gpfs/jr8867/embeddings/scop


In [5]:
embeddings_file_path = os.path.join(save_dir, 'embeddings.npy')
embeddings_file_size = os.path.getsize(embeddings_file_path)
embeddings_file_size_mb = embeddings_file_size / (1024 * 1024)
print(f"File size of embeddings.npy: {embeddings_file_size_mb:.2f} MB")


File size of embeddings.npy: 175.67 MB


In [6]:
metadata_df = pd.read_csv('/scratch/gpfs/jr8867/datasets/scop/scop_data.csv')
metadata_df

Unnamed: 0,index,uid,fa,sf,seq
0,0,Q03131,4000119,3000038,MSGPRSRTTSRRTPVRIGAVVVASSTSELLDGLAAVADGRPHASVV...
1,1,P09147,4000088,3000038,MRVLVTGGSGYIGSHTCVQLLQNGHDVIILDNLCNSKRSVLPVIER...
2,2,P61889,4000045,3000039,MKVAVLGAAGGIGQALALLLKTQLPSGSELSLYDIAPVTPGVAVDL...
3,3,P00334,4000029,3000038,MSFTLTNKNVIFVAGLGGIGLDTSKELLKRDLKNLVILDRIENPAA...
4,4,O33830,4000089,3000039,MPSVKIGIIGAGSAVFSLRLVSDLCKTPGLSGSTVTLMDIDEERLD...
...,...,...,...,...,...
35972,35972,P20585,4004015,3000587,MSRRKPASGGLAASSSAPARQAVLSRFFQSTGSLKSTSSSTGAADQ...
35973,35973,P20585,4004015,3002020,MSRRKPASGGLAASSSAPARQAVLSRFFQSTGSLKSTSSSTGAADQ...
35974,35974,P52701,4004015,3001688,MSRQSTLYSFFPKSPALSDANKASARASREGGRAAAAPGASPSPGG...
35975,35975,P52701,4004015,3000587,MSRQSTLYSFFPKSPALSDANKASARASREGGRAAAAPGASPSPGG...


In [7]:
import sys

# Check memory usage of a variable
variable_memory = sys.getsizeof(embeddings)  # Replace 'embeddings' with the variable you want to check
print(f'Memory usage of the variable: {variable_memory} bytes')
# Convert bytes to megabytes
variable_memory_mb = variable_memory / (1024 * 1024)
print(f'Memory usage of the variable in MB: {variable_memory_mb:.2f} MB')

Memory usage of the variable: 184202368 bytes
Memory usage of the variable in MB: 175.67 MB


In [8]:
if np.all(np.isclose(np.linalg.norm(embeddings, axis=1), 1)):
    print("All arrays in embeddings have a magnitude of 1.")
else:
    print("Some arrays in embeddings do not have a magnitude of 1.")

All arrays in embeddings have a magnitude of 1.


# Grabbing Super-family information from scop dataframe

In [9]:
superfamilies = np.array([metadata_df.loc[metadata_df['index'] == i, 'sf'].values[0] for i in indices])

In [10]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(superfamilies)
print(labels)

[  25   25   26 ... 1645  556 1968]


In [11]:
# Count how many samples each label has
unique, counts = np.unique(labels, return_counts=True)

# Keep only labels that have at least 2 samples
valid_labels = unique[counts > 1]

# Create a boolean mask for those labels
mask = np.isin(labels, valid_labels)

# Filter embeddings and labels
embeddings = embeddings[mask]
labels = labels[mask]

# Now do the stratified split
train_embeddings, test_embeddings, train_labels, test_labels = train_test_split(
    embeddings, labels, test_size=0.2, random_state=42, stratify=labels
)

In [12]:
class ProteinDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

# Triplet Loss Sampling Function for Contrasted Learning

In [13]:
# Triplet Sampling Function
def get_triplets(embeddings, labels, num_triplets=10000):
    triplets = []
    label_dict = {}
    
    for i, label in enumerate(labels):
        if label not in label_dict:
            label_dict[label] = []
        label_dict[label].append(i)
    
    for _ in range(num_triplets):
        anchor_idx = np.random.randint(0, len(labels))
        anchor_label = labels[anchor_idx]
        
        positive_idx = np.random.choice(label_dict[anchor_label])
        
        negative_label = np.random.choice([l for l in label_dict.keys() if l != anchor_label])
        negative_idx = np.random.choice(label_dict[negative_label])
        
        triplets.append((anchor_idx, positive_idx, negative_idx))
    
    return triplets

# Projection Head Model

This is a simple feedforward net. The goal with this projection head is just to provide some non-linear transformation to project the initial embeddings into a new vector space with superfamilies spread apart. We can also lower the dimensionality while we're at it, knocking 2 birds with 1 stone and reducing compute costs this way as well.

In [14]:
# Define MLP Projection Head
class ProjectionHead(nn.Module):
    def __init__(self, input_dim, output_dim=128, normalize_output=True):
        super(ProjectionHead, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim),
            nn.ReLU()
        )
        self.normalize_output = normalize_output
    
    def forward(self, x):
        x = self.model(x)
        # Optionally normalize final embeddings
        if self.normalize_output:
            x = torch.nn.functional.normalize(x, p=2, dim=-1)
        return x


In [20]:
# Training Loop

def train_projection_head(train_embeddings, train_labels, test_embeddings, test_labels, epochs=10, batch_size=256, lr=0.001):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ProjectionHead(input_dim=train_embeddings.shape[1], output_dim=128, normalize_output=True).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.TripletMarginLoss(margin=0.2)

    train_dataset = ProteinDataset(train_embeddings, train_labels)
    test_dataset = ProteinDataset(test_embeddings, test_labels)

    train_triplets = get_triplets(train_embeddings, train_labels, num_triplets=200000)
    test_triplets = get_triplets(test_embeddings, test_labels, num_triplets=10000)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for anchor_idx, pos_idx, neg_idx in tqdm(train_triplets, desc=f"Epoch {epoch+1} [Train]"):
            anchor = train_dataset[anchor_idx][0].to(device)
            positive = train_dataset[pos_idx][0].to(device)
            negative = train_dataset[neg_idx][0].to(device)

            anchor_out = model(anchor)
            positive_out = model(positive)
            negative_out = model(negative)

            loss = criterion(anchor_out, positive_out, negative_out)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_triplets)

        # Evaluation on test set
        model.eval()
        total_test_loss = 0.0
        with torch.no_grad():
            for anchor_idx, pos_idx, neg_idx in tqdm(test_triplets, desc=f"Epoch {epoch+1} [Test]"):
                anchor = test_dataset[anchor_idx][0].to(device)
                positive = test_dataset[pos_idx][0].to(device)
                negative = test_dataset[neg_idx][0].to(device)

                anchor_out = model(anchor)
                positive_out = model(positive)
                negative_out = model(negative)

                t_loss = criterion(anchor_out, positive_out, negative_out)
                total_test_loss += t_loss.item()
        avg_test_loss = total_test_loss / len(test_triplets)

        print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f} | Test Loss: {avg_test_loss:.4f}")

    return model


In [19]:
# Train the model
projection_model = train_projection_head(train_embeddings, train_labels,
                                         test_embeddings, test_labels,
                                         epochs=5, batch_size=256, lr=0.001)


Epoch 1 [Train]:  12%|███████████▉                                                                                       | 3390/28105 [00:06<00:48, 505.44it/s]


KeyboardInterrupt: 

In [19]:
torch.save(projection_model.state_dict(), "/scratch/gpfs/jr8867/models/projection_model.pth")

In [22]:
# Save new embeddings
with torch.no_grad():
    refined_embeddings = projection_model(torch.tensor(embeddings, dtype=torch.float32).to("cuda")).cpu().numpy()

np.save("refined_embeddings.npy", refined_embeddings)

In [None]:
# FAISS Indexing
index = faiss.IndexFlatL2(refined_embeddings.shape[1])
index.add(refined_embeddings)
faiss.write_index(index, "faiss_index_refined.idx")

print("Saved refined embeddings and FAISS index!")