In [2]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.utils import to_dense_adj
import random


In [3]:
import RNA

# Load the data, build the graphs, and define the autoencoder model

In [13]:
import pandas as pd
import torch
from torch_geometric.data import Data, DataLoader
from sklearn.model_selection import train_test_split

# Read the processed ClinVar summary table.
df = pd.read_csv("~/internship/modeling/GNN_autoencoder/data/p04_clinvar_summary_processed.csv")

# Discard every row whose dot-bracket string contains an 'x' character.
df_filtered = df.loc[~df['dot_bracket'].str.contains('x')]

# Pull the three columns used downstream out as plain Python lists.
# Note: the list named RNA_sequences is read from the 'sec_struc' column.
RNA_sequences, RNA_structures, eff_scores = (
    df_filtered[column].tolist() for column in ('sec_struc', 'dot_bracket', 'Eff')
)

# First split: 70 % train, 30 % held out.
(
    RNA_seq_train, RNA_seq_temp,
    RNA_struct_train, RNA_struct_temp,
    eff_train, eff_temp,
) = train_test_split(
    RNA_sequences,
    RNA_structures,
    eff_scores,
    test_size=0.3,
    random_state=42,
)

# Second split: the held-out 30 % is halved into validation and test sets.
(
    RNA_seq_val, RNA_seq_test,
    RNA_struct_val, RNA_struct_test,
    eff_val, eff_test,
) = train_test_split(
    RNA_seq_temp,
    RNA_struct_temp,
    eff_temp,
    test_size=0.5,
    random_state=42,
)

# print(len(RNA_seq_train))
# print(len(RNA_seq_test))
# print(len(RNA_struct_train))
# print(len(RNA_struct_test))

def build_graph(sequence, structure):
    """Turn a nucleotide sequence and its dot-bracket structure into a graph.

    Every position becomes one node whose feature vector is the concatenation
    of a 4-way one-hot base encoding (A, T/U, G, C) and a 3-way one-hot
    structure encoding ('.', '(', ')').  Edges are added in both directions
    between consecutive positions and between each ')' and the most recent
    unmatched '('.

    Args:
        sequence: iterable of base characters ('A', 'T' or 'U', 'G', 'C').
        structure: iterable of dot-bracket characters ('.', '(', ')').
            Positions are paired with ``zip``, so the longer input is
            truncated to the length of the shorter one.

    Returns:
        A ``Data`` object with ``x`` of shape (num_nodes, 7) and
        ``edge_index`` of shape (2, num_edges).  Both keep these ranks even
        when there are no nodes or no edges.

    Raises:
        KeyError: if a base or structure character is not in the tables below.
    """
    # 'U' (RNA) shares the slot of 'T' so RNA-alphabet input is accepted too.
    base_to_idx = {'A': 0, 'T': 1, 'U': 1, 'G': 2, 'C': 3}
    struct_to_idx = {'.': 0, '(': 1, ')': 2}
    n_base, n_struct = 4, 3

    node_features = []
    edges = []
    open_positions = []  # indices of '(' still waiting for their partner
    for i, (base, struct) in enumerate(zip(sequence, structure)):
        one_hot = [0] * (n_base + n_struct)
        one_hot[base_to_idx[base]] = 1
        one_hot[n_base + struct_to_idx[struct]] = 1
        node_features.append(one_hot)

        # Backbone edge to the previous position (both directions).
        if i > 0:
            edges.append((i - 1, i))
            edges.append((i, i - 1))

        # Base-pair edge; a ')' with no open partner is skipped silently.
        if struct == '(':
            open_positions.append(i)
        elif struct == ')' and open_positions:
            j = open_positions.pop()
            edges.append((i, j))
            edges.append((j, i))

    # reshape keeps x two-dimensional even when there are zero nodes.
    x = torch.tensor(node_features, dtype=torch.float).reshape(-1, n_base + n_struct)

    if edges:
        edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    else:
        # torch.tensor([]).t() would be shape (0,); build the (2, 0) form
        # explicitly so edge-less graphs still have a well-formed edge_index.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return Data(x=x, edge_index=edge_index)

print("Creating graphs")
# One graph per (sequence, structure) pair in each split.
graphs_train = list(map(build_graph, RNA_seq_train, RNA_struct_train))
graphs_val = list(map(build_graph, RNA_seq_val, RNA_struct_val))
graphs_test = list(map(build_graph, RNA_seq_test, RNA_struct_test))

# Mini-batch loaders; only the training loader shuffles its graphs.
batch_size = 64
train_loader = DataLoader(
    graphs_train,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    graphs_val,
    batch_size=batch_size,
    shuffle=False,
)
test_loader = DataLoader(
    graphs_test,
    batch_size=batch_size,
    shuffle=False,
)

# print(len(graphs_train))
# print(len(graphs_val))
# print(len(graphs_test))

class RNAGraphAutoencoder(nn.Module):
    """Graph autoencoder made of two stacks of three GCNConv layers.

    The encoder maps node features input_dim -> hidden_dim -> hidden_dim ->
    latent_dim; the decoder mirrors it back to input_dim.  A ReLU follows
    every layer, including the last layer of each stack.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim):
        """Create the encoder and decoder stacks.

        Args:
            input_dim: size of each node's input feature vector.
            hidden_dim: width of the intermediate layers.
            latent_dim: size of each node's latent embedding.
        """
        super().__init__()
        encoder_dims = [input_dim, hidden_dim, hidden_dim, latent_dim]
        decoder_dims = encoder_dims[::-1]

        # Encoder layers are created before decoder layers, in order.
        self.encoder = nn.ModuleList(
            [GCNConv(d_in, d_out) for d_in, d_out in zip(encoder_dims[:-1], encoder_dims[1:])]
        )
        self.decoder = nn.ModuleList(
            [GCNConv(d_in, d_out) for d_in, d_out in zip(decoder_dims[:-1], decoder_dims[1:])]
        )

        self.relu = nn.ReLU()

    def encode(self, x, edge_index):
        """Pass node features through every encoder layer, each followed by ReLU."""
        hidden = x
        for conv in self.encoder:
            hidden = conv(hidden, edge_index)
            hidden = self.relu(hidden)
        return hidden

    def decode(self, x, edge_index):
        """Pass latent features through every decoder layer, each followed by ReLU."""
        hidden = x
        for conv in self.decoder:
            hidden = conv(hidden, edge_index)
            hidden = self.relu(hidden)
        return hidden

    def forward(self, data):
        """Encode then decode a graph (or batch of graphs).

        Args:
            data: object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tuple ``(reconstructed, latent)`` of per-node tensors.
        """
        latent = self.encode(data.x, data.edge_index)
        return self.decode(latent, data.edge_index), latent


# # Load the pretrained model
# print('loading pretrained model')
# pretrained_dict = torch.load('./p04cv_rna_graph_autoencoder_model.pth')
# model = RNAGraphAutoencoder(input_dim=7, hidden_dim=64, latent_dim=32)  
# model.load_state_dict(pretrained_dict)



Creating graphs




In [None]:
def extract_features(model, loader, eff_scores):
    """Run the model over a loader and collect per-node latent features.

    Labels are paired with graphs purely by position: the k-th graph yielded
    by ``loader`` receives ``eff_scores[k]``.  The loader therefore has to
    yield graphs in the same order as ``eff_scores``; a shuffling loader
    produces features and scores that do not correspond to each other.

    Args:
        model: module whose call returns ``(reconstructed, latent)`` for a
            batch; only ``latent`` is kept.
        loader: iterable of batches exposing ``batch`` (per-node graph index
            within the batch) and ``num_graphs``.
        eff_scores: sequence of per-graph scores, in loader order.

    Returns:
        Tuple ``(features, graph_ids, scores)``: the concatenated latent
        tensors, a per-node graph id that is unique across all batches, and a
        tensor holding one score per processed graph.

    Raises:
        ValueError: if ``eff_scores`` has fewer entries than graphs were seen.
    """
    import warnings
    from torch.utils.data import RandomSampler

    # Positional label assignment is only valid for a deterministic order.
    if isinstance(getattr(loader, "sampler", None), RandomSampler):
        warnings.warn(
            "extract_features received a shuffling loader; eff_scores are "
            "assigned by position and will not match the returned features. "
            "Use a loader created with shuffle=False.",
            stacklevel=2,
        )

    model.eval()
    features = []
    global_batch_indices = []
    global_eff_scores = []
    batch_count = 0
    global_graph_count = 0

    with torch.no_grad():
        for batch in loader:
            _, latent = model(batch)
            features.append(latent)

            # Offset the within-batch graph index so ids stay unique overall.
            global_batch_indices.append(batch.batch + global_graph_count)

            num_graphs_in_batch = batch.num_graphs
            global_eff_scores.extend(
                eff_scores[global_graph_count:global_graph_count + num_graphs_in_batch]
            )

            batch_count += 1
            global_graph_count += num_graphs_in_batch

    # Slicing past the end of eff_scores truncates silently; surface it here.
    if len(global_eff_scores) != global_graph_count:
        raise ValueError(
            f"eff_scores supplied {len(global_eff_scores)} labels for "
            f"{global_graph_count} graphs"
        )

    print(f"Total number of batches processed: {batch_count}")
    return (
        torch.cat(features, dim=0),
        torch.cat(global_batch_indices, dim=0),
        torch.tensor(global_eff_scores),
    )

print("extracting features")
# NOTE(review): no active definition of `model` is visible in this section of
# the notebook (the pretrained-model loading code is commented out), so it has
# to be created/loaded before this cell runs.
#
# extract_features pairs graphs with eff scores by position.  train_loader is
# built with shuffle=True, which would scramble that pairing, so the training
# features are extracted through a non-shuffling loader over the same graphs.
extracted_features_train, batch_mapping_train, global_eff_scores_train = extract_features(
    model,
    DataLoader(graphs_train, batch_size=batch_size, shuffle=False),
    eff_train,
)
extracted_features_val, batch_mapping_val, global_eff_scores_val = extract_features(model, val_loader, eff_val)
extracted_features_test, batch_mapping_test, global_eff_scores_test = extract_features(model, test_loader, eff_test)

# print(f"Train - Extracted features shape: {extracted_features_train.shape}")
# print(f"Train - Batch mapping shape: {batch_mapping_train.shape}")
# print(f"Train - Global eff scores shape: {global_eff_scores_train.shape}")

# print(f"Validation - Extracted features shape: {extracted_features_val.shape}")
# print(f"Validation - Batch mapping shape: {batch_mapping_val.shape}")
# print(f"Validation - Global eff scores shape: {global_eff_scores_val.shape}")

# print(f"Test - Extracted features shape: {extracted_features_test.shape}")
# print(f"Test - Batch mapping shape: {batch_mapping_test.shape}")
# print(f"Test - Global eff scores shape: {global_eff_scores_test.shape}")



In [None]:
# def aggregate_features(extracted_features, batch_mapping):
#     unique_graphs = torch.unique(batch_mapping)
#     print(f"Number of unique graphs: {len(unique_graphs)}")  
#     print(f"Unique graphs tensor: {unique_graphs}")  
#     graph_features = []
    
#     for graph in unique_graphs:
        
#         graph_nodes = extracted_features[batch_mapping == graph]
#         graph_feature = torch.mean(graph_nodes, dim=0) 
#         graph_features.append(graph_feature)
    
#     return torch.stack(graph_features, dim=0)

# aggregated_features_train = aggregate_features(extracted_features_train, batch_mapping_train)
# aggregated_features_val = aggregate_features(extracted_features_val, batch_mapping_val)
# aggregated_features_test = aggregate_features(extracted_features_test, batch_mapping_test)

# print(f"Train - Aggregated features shape: {aggregated_features_train.shape}")
# print(f"Validation - Aggregated features shape: {aggregated_features_val.shape}")
# print(f"Test - Aggregated features shape: {aggregated_features_test.shape}")



In [None]:
# torch.save(extracted_features_train, 'extracted_features_train.pt')
# torch.save(batch_mapping_train, 'batch_mapping_train.pt')
# torch.save(global_eff_scores_train, 'global_eff_scores_train.pt')

# torch.save(extracted_features_val, 'extracted_features_val.pt')
# torch.save(batch_mapping_val, 'batch_mapping_val.pt')
# torch.save(global_eff_scores_val, 'global_eff_scores_val.pt')

# torch.save(extracted_features_test, 'extracted_features_test.pt')
# torch.save(batch_mapping_test, 'batch_mapping_test.pt')
# torch.save(global_eff_scores_test, 'global_eff_scores_test.pt')

# # Save the aggregated features to files
# torch.save(aggregated_features_train, 'aggregated_features_train.pt')
# torch.save(aggregated_features_val, 'aggregated_features_val.pt')
# torch.save(aggregated_features_test, 'aggregated_features_test.pt')

In [3]:
# Reload previously saved tensors from the working directory, split by split.

# Training split: per-node features, node-to-graph mapping, per-graph scores.
extracted_features_train, batch_mapping_train, global_eff_scores_train = (
    torch.load('extracted_features_train.pt'),
    torch.load('batch_mapping_train.pt'),
    torch.load('global_eff_scores_train.pt'),
)

# Validation split.
extracted_features_val, batch_mapping_val, global_eff_scores_val = (
    torch.load('extracted_features_val.pt'),
    torch.load('batch_mapping_val.pt'),
    torch.load('global_eff_scores_val.pt'),
)

# Test split.
extracted_features_test, batch_mapping_test, global_eff_scores_test = (
    torch.load('extracted_features_test.pt'),
    torch.load('batch_mapping_test.pt'),
    torch.load('global_eff_scores_test.pt'),
)

# Aggregated features for the three splits.
aggregated_features_train, aggregated_features_val, aggregated_features_test = (
    torch.load('aggregated_features_train.pt'),
    torch.load('aggregated_features_val.pt'),
    torch.load('aggregated_features_test.pt'),
)


In [14]:
# def extract_features(model, loader):
#     model.eval()
#     features = []
    
#     with torch.no_grad():
#         for batch in loader:
#             _, latent = model(batch)
#             features.append(latent)
    
#     return torch.cat(features, dim=0)

# feature_loader = DataLoader(graphs_test, batch_size=batch_size, shuffle=False)
# extracted_features = extract_features(model, feature_loader)

# print(f"Extracted features shape: {extracted_features.shape}")

In [4]:
class SimpleRNN(nn.Module):
    """Single-layer RNN followed by a linear head on the last time step."""

    def __init__(self, input_size, hidden_size, output_size):
        """Build the recurrent layer and the output projection.

        Args:
            input_size: number of features per time step.
            hidden_size: width of the RNN hidden state.
            output_size: number of values produced per sequence.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the linear projection of the RNN output at the final step.

        Args:
            x: batch-first input of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, output_size).
        """
        batch = x.size(0)
        # Zero initial hidden state, created on the same device as the input.
        initial_hidden = torch.zeros(1, batch, self.hidden_size, device=x.device)
        step_outputs, _ = self.rnn(x, initial_hidden)
        last_step = step_outputs[:, -1]
        return self.fc(last_step)

In [35]:
# Display the shape of the aggregated training features loaded above.
aggregated_features_train.shape

torch.Size([549902, 32])

# PADDING SEQUENCES

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

def create_sequences(features, mappings):
    """Split stacked per-node features into one tensor per graph.

    Args:
        features: tensor whose first dimension indexes nodes.
        mappings: 1-D tensor giving, for each row of ``features``, the id of
            the graph that row belongs to.

    Returns:
        List of tensors, one per distinct graph id in ascending id order.
        Rows inside each tensor keep their original relative order.  The
        tensors are views into a single gathered copy of ``features``.
    """
    if mappings.numel() == 0:
        return []

    # A stable sort groups rows by graph id while preserving row order within
    # each graph.  This replaces building one boolean mask per graph, which
    # costs O(num_graphs * num_nodes).
    sorted_ids, order = torch.sort(mappings, stable=True)
    _, counts = torch.unique_consecutive(sorted_ids, return_counts=True)
    return list(torch.split(features[order], counts.tolist()))

# Regroup the per-node features of every split into one tensor per graph.
train_sequences = create_sequences(
    features=extracted_features_train, mappings=batch_mapping_train
)
val_sequences = create_sequences(
    features=extracted_features_val, mappings=batch_mapping_val
)
test_sequences = create_sequences(
    features=extracted_features_test, mappings=batch_mapping_test
)

# The labels are the per-graph eff scores collected for each split.
train_labels, val_labels, test_labels = (
    global_eff_scores_train,
    global_eff_scores_val,
    global_eff_scores_test,
)

# Persist sequences and labels, split by split.
torch.save(train_sequences, 'train_sequences.pth')
torch.save(train_labels, 'train_labels.pth')

torch.save(val_sequences, 'val_sequences.pth')
torch.save(val_labels, 'val_labels.pth')

torch.save(test_sequences, 'test_sequences.pth')
torch.save(test_labels, 'test_labels.pth')
