In [None]:
!pip install torch_geometric

In [None]:
import os
import joblib
print(joblib.__version__)
import pandas as pd
import numpy as np
import itertools
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
import torch
from torch_geometric.data import Data
from torch_geometric.data import DataLoader
from torch_geometric.nn import GATConv, global_mean_pool
import torch.nn.functional as F

1.3.2


In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths passed
# to main() at the bottom of the notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def bin_ages(age):
    """
    Maps a raw age annotation to a coarse developmental stage.

    The unit is detected by substring ('pcw' for post-conception weeks,
    'mos' or 'M' for months, 'yrs' or 'Y' for years) and the value is the
    whole number at the start of the first space-separated token.

    Args:
    - age (str | float | int | None): Age annotation, e.g. '8 pcw', '4 mos',
      '4M', '13 yrs' or '40Y'.

    Returns:
    - str: 'Embryonic' (4-7 pcw), 'Prenatal' (8-38 pcw), 'Infancy'
      (0-19 months), 'Childhood' (1-11 years), 'Adolescence' (12-19 years),
      'Adulthood' (20+ years), or 'Unknown' when the value is missing, has no
      recognised unit, has no leading number, or falls outside these ranges.
    """
    if pd.isnull(age) or not isinstance(age, (str, float, int)):
        return 'Unknown'
    age_str = str(age).strip()

    # Detect the unit first; the precedence (weeks, months, years) matters
    # because the single-letter markers are plain substring checks.
    if 'pcw' in age_str:
        unit = 'weeks'
    elif 'mos' in age_str or 'M' in age_str:
        unit = 'months'
    elif 'yrs' in age_str or 'Y' in age_str:
        unit = 'years'
    else:
        return 'Unknown'

    token = age_str.split(' ')[0]
    try:
        value = int(token)
    except ValueError:
        # Compact or fractional forms such as '4M', '15Y' or '12.5 yrs' are not
        # valid int() input; fall back to the leading run of digits instead of
        # letting one odd annotation abort the whole DataFrame.apply.
        digits = ''.join(itertools.takewhile(str.isdecimal, token))
        if not digits:
            return 'Unknown'
        value = int(digits)

    if unit == 'weeks':
        if 4 <= value <= 7:
            return 'Embryonic'
        if 8 <= value <= 38:
            return 'Prenatal'
    elif unit == 'months':
        if 0 <= value <= 19:
            return 'Infancy'
    else:
        if 1 <= value <= 11:
            return 'Childhood'
        if 12 <= value <= 19:
            return 'Adolescence'
        if value >= 20:
            return 'Adulthood'

    # Recognised unit but a value outside every defined range.
    return 'Unknown'

In [None]:
def scale(X, with_mean=True, with_std=True):
    """
    Scales (standardizes) the input data column by column.

    This definition replaces the `scale` imported from sklearn.preprocessing
    at the top of the notebook, so it accepts the same `with_mean` /
    `with_std` keywords; callers that pass `with_mean=False` keep working
    after this cell has run.

    Args:
    - X (pd.DataFrame or array-like): Input data to be scaled.
    - with_mean (bool): If True, center each column before scaling.
    - with_std (bool): If True, scale each column to unit variance.

    Returns:
    - np.ndarray: Scaled (standardized) data.
    """
    scaler = StandardScaler(with_mean=with_mean, with_std=with_std)
    return scaler.fit_transform(X)

In [None]:
def create_graphs_from_correlations(X, threshold=0.8):
    """
    Creates one graph per column of X, all sharing a correlation-derived edge set.

    Args:
    - X (pd.DataFrame): Transposed DataFrame where each row is a gene and each column is a subject.
    - threshold (float): Absolute-correlation threshold for edge creation.

    Returns:
    - list: A list of graph objects, one per column of X. Each graph has one
      node per row of X (a single feature per node) and the same edge_index
      tensor of shape (2, num_edges).
    """
    # NOTE(review): DataFrame.corr() correlates the COLUMNS of X pairwise, so
    # the (i, j) pairs below are positions of columns (subjects), while the
    # nodes of each graph are the rows (genes). If gene-gene edges are the
    # intent, the correlation should be taken over rows instead -- confirm
    # before relying on the graph structure.
    correlation_matrix = X.corr().abs()  # use absolute value of correlations

    print('Correlation matrix created')

    corr_values = correlation_matrix.to_numpy()
    with np.errstate(invalid='ignore'):
        # NaN correlations (e.g. constant columns) compare False, i.e. no edge.
        above_threshold = corr_values >= threshold
    # Strict upper triangle keeps each unordered pair once (i < j); argwhere
    # lists them in the same row-major order a nested i/j loop would.
    edge_pairs = np.argwhere(np.triu(above_threshold, k=1))
    print('Edge list created')

    # edge_pairs always has shape (num_edges, 2), so the transpose is a valid
    # (2, num_edges) edge_index even when no pair reaches the threshold.
    # NOTE(review): only the i -> j direction is stored; add the reversed
    # pairs as well if the graph is meant to be undirected.
    edge_index_tensor = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    print('Converted to tensor')

    graphs = []
    for col in X.columns:
        node_features = torch.tensor(X[col].values, dtype=torch.float).view(-1, 1)
        graphs.append(Data(x=node_features, edge_index=edge_index_tensor))
    print('Graphs created')
    return graphs

In [None]:
def preprocess_data(data_path, file_type='csv'):
    """
    Loads an omics table, derives age-group labels and builds per-sample graphs.

    The data type is inferred from the path, which must contain 'rnaseq',
    'methylation' or 'microrna' (case-insensitive). Samples whose age cannot
    be mapped to a label for that data type are dropped.

    Args:
    - data_path (str): Path to the data file.
    - file_type (str): File format ('csv', 'excel', 'txt').

    Returns:
    - graphs_train, graphs_test, y_train, y_test: 80/20 split
      (random_state=42) of the graph objects and of the float label tensor.

    Raises:
    - ValueError: If the file type is unsupported or the data type cannot be
      inferred from the path.
    """
    subset_rows = 20
    path_lower = data_path.lower()

    if 'methylation' in path_lower and file_type == 'csv':
        chunk_size = 5
        chunks = pd.read_csv(data_path, chunksize=chunk_size)

        X_list = []
        y_list = []
        for chunk in chunks:
            chunk = chunk.head(subset_rows)
            # NOTE(review): preprocess_chunk is not defined in the visible
            # cells of this notebook; this branch fails with NameError unless
            # it is provided elsewhere.
            X_chunk, y_chunk = preprocess_chunk(chunk, 'Methylation')

            X_list.append(X_chunk)
            y_list.append(y_chunk)

        X_data = pd.concat(X_list, axis=0)
        y_data = pd.concat(y_list, axis=0)

        data = pd.concat([X_data, y_data], axis=1)
        print("Columns after processing all chunks:", data.columns)

    else:
        if file_type == 'csv':
            data = pd.read_csv(data_path, index_col=0)
        elif file_type == 'excel':
            data = pd.read_excel(data_path, index_col=0)
        elif file_type == 'txt':
            data = pd.read_csv(data_path, sep='\t', index_col=0)
        else:
            raise ValueError("Unsupported file type")
        print("Columns after determining data type:", data.columns)

    if 'rnaseq' in path_lower:
        data_type = 'RNA-Seq'
    elif 'methylation' in path_lower:
        data_type = 'Methylation'
    elif 'microrna' in path_lower:
        data_type = 'MicroRNA'
    else:
        raise ValueError("Unknown data type")

    data = data.reset_index(drop=True)
    data['age_group'] = data['age'].apply(bin_ages)

    print(data.columns)

    if data_type == 'RNA-Seq':
        label_map = {'Prenatal': 0, 'Infancy': 0, 'Childhood': 1, 'Adolescence': 2, 'Adulthood': 3}
    elif data_type == 'MicroRNA':
        label_map = {'Infancy': 0, 'Childhood': 1, 'Adolescence': 2, 'Adulthood': 3}
    else:
        label_map = {'Embryonic': 0, 'Prenatal': 1, 'Infancy': 2, 'Childhood': 3, 'Adolescence': 4, 'Adulthood': 5}

    data['age_group'] = data['age_group'].map(label_map)

    # Age groups missing from label_map (including 'Unknown') map to NaN. A NaN
    # label cannot be used as a class index downstream, so drop those samples.
    unlabeled = data['age_group'].isna()
    if unlabeled.any():
        print(f"Dropping {int(unlabeled.sum())} samples without a mapped age group")
        data = data.loc[~unlabeled].reset_index(drop=True)

    if data_type == 'RNA-Seq':
        data_numeric = data.drop(['age', 'age_group'], axis=1)
        # Keeps the rows (samples) in which at least 1% of the feature columns
        # have a value above 1.
        # NOTE(review): if the intent was to keep genes expressed in at least
        # 1% of samples, this should count along axis=0 and select columns.
        min_expressed_features = data_numeric.shape[1] * 0.01
        mask = data_numeric.gt(1).sum(axis=1) >= min_expressed_features
        filtered_data = data[mask]

        y = torch.tensor(filtered_data['age_group'].values, dtype=torch.float)
        X = filtered_data.drop(['age', 'age_group'], axis=1).transpose()

    elif data_type == 'Methylation':
        relevant_columns = [col for col in data.columns if col.startswith(('cg', 'rs', 'ch'))] + ['age_group']
        filtered_data = data[relevant_columns]
        X = filtered_data.drop(['age_group'], axis=1).transpose()
        print(X.head(5))
        # Use a tensor here as well so every data type returns the same label type.
        y = torch.tensor(filtered_data['age_group'].values, dtype=torch.float)

    elif data_type == 'MicroRNA':
        relevant_columns = [col for col in data.columns if col.startswith('hsa')] + ['age', 'age_group']
        filtered_data = data[relevant_columns]

        y = torch.tensor(filtered_data['age_group'].values, dtype=torch.float)
        X = filtered_data.drop(['age', 'age_group'], axis=1).transpose()

    print(np.unique(y))
    print("X shape:", X.shape)
    print("y shape:", y.shape)

    # Scale each column of X to unit variance without centering. StandardScaler
    # is used directly because a later cell redefines `scale` with a signature
    # that has no `with_mean` argument.
    X_scaled = StandardScaler(with_mean=False).fit_transform(X.to_numpy())

    # NOTE(review): the correlation structure is computed from all samples
    # before the train/test split, so test samples influence the shared edges.
    graphs = create_graphs_from_correlations(pd.DataFrame(X_scaled, index=X.index), threshold=0.8)

    graphs_train, graphs_test, y_train, y_test = train_test_split(graphs, y, test_size=0.2, random_state=42)
    print("The data is split")

    return graphs_train, graphs_test, y_train, y_test

In [None]:
def save_data_splits(X_train, X_test, y_train, y_test, output_dir):
    """
    Saves the train-test data splits to the specified directory using joblib.

    Args:
    - X_train: Training inputs (main() passes the list of training graphs).
    - X_test: Testing inputs (main() passes the list of test graphs).
    - y_train: Training labels.
    - y_test: Testing labels.
    - output_dir (str): Directory path where the data splits will be saved.

    Note:
    - If the output directory does not exist, it will be created.
    - Files are named '<split>_classifiers.pkl' and overwrite earlier runs.
    """
    # exist_ok=True creates the directory only if needed, without a separate
    # existence check that another process could race against.
    os.makedirs(output_dir, exist_ok=True)

    splits = {
        'X_train_classifiers.pkl': X_train,
        'X_test_classifiers.pkl': X_test,
        'y_train_classifiers.pkl': y_train,
        'y_test_classifiers.pkl': y_test,
    }
    for filename, split in splits.items():
        joblib.dump(split, os.path.join(output_dir, filename))

In [None]:
def create_data_loader(graphs, ages, batch_size=32, shuffle=True):
    """
    Attaches a label to every graph and wraps the result in a DataLoader.

    Args:
    - graphs (list): Graph objects exposing `x` and `edge_index`.
    - ages (sequence): One numeric label per graph, in the same order as
      `graphs` (tensor, array, list or pandas Series).
    - batch_size (int): Number of graphs per batch.
    - shuffle (bool): Whether the loader reshuffles the graphs every epoch.

    Returns:
    - DataLoader: Loader over new Data objects carrying `y` as a float tensor
      of shape (1,).

    Raises:
    - ValueError: If `graphs` and `ages` differ in length.
    """
    if len(graphs) != len(ages):
        raise ValueError(
            f"Got {len(graphs)} graphs but {len(ages)} labels; they must match one-to-one"
        )

    graph_data_list = []
    # Pair by position instead of ages[i]: on a pandas Series ages[i] is a
    # label-based lookup, which does not follow position after a shuffled split.
    for graph, age in zip(graphs, ages):
        age_label = torch.tensor([float(age)], dtype=torch.float)
        graph_data_list.append(Data(x=graph.x, edge_index=graph.edge_index, y=age_label))

    return DataLoader(graph_data_list, batch_size=batch_size, shuffle=shuffle)

In [None]:
def train_evaluate_gat_model(train_loader, test_loader, num_classes, epochs=100):
    """
    Trains and evaluates a GAT model.

    Args:
    - train_loader (DataLoader): DataLoader for training data.
    - test_loader (DataLoader): DataLoader for test data.
    - num_classes (int): Number of classes for classification; every label
      must be an integer in [0, num_classes).
    - epochs (int): Number of training epochs.

    Returns:
    - model: Trained GAT model.
    - avg_test_loss: Per-graph average loss on the test set (NaN if the test
      loader yields no graphs).
    - test_accuracy: Accuracy on the test set in percent (NaN if the test
      loader yields no graphs).
    """
    class GATNetClassifier(torch.nn.Module):
        """Two GAT layers, mean pooling per graph, then a linear classifier."""

        def __init__(self, num_classes):
            super().__init__()
            self.conv1 = GATConv(1, 8, heads=8, dropout=0.6)  # accepts 1 feature per node
            self.conv2 = GATConv(8 * 8, 16, heads=1, dropout=0.6)  # 8 heads x 8 channels in
            self.fc = torch.nn.Linear(16, num_classes)  # output layer for classification

        def forward(self, data):
            x, edge_index, batch = data.x, data.edge_index, data.batch

            x = F.dropout(x, p=0.6, training=self.training)
            x = F.elu(self.conv1(x, edge_index))
            x = F.dropout(x, p=0.6, training=self.training)
            x = F.elu(self.conv2(x, edge_index))

            x = global_mean_pool(x, batch)  # pooling to get graph-level representation
            x = self.fc(x)
            return F.log_softmax(x, dim=1)  # log-probabilities per class

    model = GATNetClassifier(num_classes)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    # The model already returns log-probabilities, so NLLLoss is the matching
    # criterion. CrossEntropyLoss would apply log_softmax a second time, which
    # is a no-op on normalised log-probabilities, so the loss value is the same.
    criterion = torch.nn.NLLLoss()

    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            output = model(batch)
            loss = criterion(output, batch.y.long())
            loss.backward()
            optimizer.step()

    model.eval()
    total_test_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            targets = batch.y.long()
            output = model(batch)
            num_graphs = targets.size(0)
            # criterion returns the batch mean; weight it by the batch size so
            # a shorter final batch does not skew the overall average.
            total_test_loss += criterion(output, targets).item() * num_graphs
            predicted = output.argmax(dim=1)
            correct += (predicted == targets).sum().item()
            total += num_graphs

    if total == 0:
        # Nothing to evaluate: report NaN instead of raising ZeroDivisionError.
        return model, float('nan'), float('nan')

    avg_test_loss = total_test_loss / total
    test_accuracy = 100 * correct / total

    return model, avg_test_loss, test_accuracy

In [None]:
def main(data_paths):
    """
    Runs preprocessing, split saving, GAT training and evaluation per dataset.

    Args:
    - data_paths (list of str): Paths to the data files. The text before the
      first underscore in each file name is used as the output sub-directory
      under 'baseline_cnn_outputs'.

    Raises:
    - ValueError: If any label is NaN, since labels are used as class indices.
    """
    for data_path in data_paths:

        data_type = os.path.basename(data_path).split('_')[0]

        graphs_train, graphs_test, y_train, y_test = preprocess_data(data_path)
        save_data_splits(graphs_train, graphs_test, y_train, y_test, os.path.join('baseline_cnn_outputs', data_type))

        train_loader = create_data_loader(graphs_train, y_train)
        print('Train loader created')
        test_loader = create_data_loader(graphs_test, y_test)
        print('Test loader created')

        # Labels are used directly as class indices, so the output layer must
        # cover the largest label in EITHER split. Counting distinct training
        # labels under-sizes it whenever a class is absent from the training
        # split. np.asarray accepts tensors, arrays and pandas Series alike.
        labels = np.concatenate([
            np.asarray(y_train, dtype=float).ravel(),
            np.asarray(y_test, dtype=float).ravel(),
        ])
        if np.isnan(labels).any():
            raise ValueError(f"Labels for {data_path} contain NaN; every sample needs a class index")
        num_classes = int(labels.max()) + 1

        model, avg_test_loss, test_accuracy = train_evaluate_gat_model(train_loader, test_loader, num_classes, epochs=10)

        print(f"GAT Model - Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")

In [None]:
# Root folder of the datasets on the mounted Google Drive. Change this one
# constant to run the notebook against a different location.
DATA_DIR = '/content/drive/MyDrive'

# Datasets to process, in order. The methylation entry is disabled.
data_paths = [
    # 'methylation_1.csv',
    f'{DATA_DIR}/rnaseq/rnaseq_1.csv',
    f'{DATA_DIR}/microRNA/microRNA_1.csv',
]
main(data_paths)

Columns after determining data type: Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       ...
       '22319', '22320', '22321', '22322', '22323', '22324', '22325', '22326',
       '22327', 'age'],
      dtype='object', length=22328)
Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       ...
       '22320', '22321', '22322', '22323', '22324', '22325', '22326', '22327',
       'age', 'age_group'],
      dtype='object', length=22329)
[0. 1. 2. 3.]
About to shuffle
X shape: (22327, 578)
y shape: torch.Size([578])
Correlation matrix created
Edge list created
Converted to tensor
Graphs created
The data is split




Train loader created
Test loader created
GAT Model - Test Loss: 1.0557, Test Accuracy: 56.03%
Columns after determining data type: Index(['hsa-miR-26a-5p', 'hsa-miR-181a-5p', 'hsa-miR-143-3p', 'hsa-let-7a-5p',
       'hsa-miR-9-5p', 'hsa-miR-3182', 'hsa-miR-99b-5p', 'hsa-miR-30a-5p',
       'hsa-miR-27b-3p', 'hsa-miR-191-5p',
       ...
       'hsa-miR-4653-5p', 'hsa-miR-4264', 'hsa-miR-3119', 'hsa-miR-4330',
       'hsa-miR-4318', 'hsa-miR-4279', 'hsa-miR-3689f', 'hsa-miR-4291',
       'donor_name', 'age'],
      dtype='object', length=1863)
Index(['hsa-miR-26a-5p', 'hsa-miR-181a-5p', 'hsa-miR-143-3p', 'hsa-let-7a-5p',
       'hsa-miR-9-5p', 'hsa-miR-3182', 'hsa-miR-99b-5p', 'hsa-miR-30a-5p',
       'hsa-miR-27b-3p', 'hsa-miR-191-5p',
       ...
       'hsa-miR-4264', 'hsa-miR-3119', 'hsa-miR-4330', 'hsa-miR-4318',
       'hsa-miR-4279', 'hsa-miR-3689f', 'hsa-miR-4291', 'donor_name', 'age',
       'age_group'],
      dtype='object', length=1864)
[0. 1. 2. 3.]
About to shuffle
X shape:



GAT Model - Test Loss: 1.2476, Test Accuracy: 50.00%
