In [3]:
import os
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt

## Defining Functions

In [5]:
def process_rnaseq_data(rnaseq_df, meta_df, output_folder="data/rnaseq"):
    """
    Process RNA-Seq data and save the result as a CSV.
    
    Args:
    - rnaseq_df (pd.DataFrame): The original RNA-Seq dataset.
    - meta_df (pd.DataFrame): The metadata for the RNA-Seq dataset.
    - output_folder (str): Folder to save the resulting CSV.
    
    Returns:
    - pd.DataFrame: Processed RNA-Seq dataframe.
    """
    
    # Ensure output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Rename columns based on the meta-data
    column_to_donor = dict(zip(meta_df.index, meta_df['donor_name']))
    rnaseq_df.columns = [column_to_donor[col] for col in rnaseq_df.columns]
    
    # Transpose
    transposed_df = rnaseq_df.transpose()
    
    # Map age using the donor name as the index
    donor_to_age = dict(zip(meta_df['donor_name'], meta_df['age']))
    transposed_df['Age'] = transposed_df.index.map(donor_to_age)
    
    # Save to CSV
    existing_files = [f for f in os.listdir(output_folder) if f.startswith("rnaseq_") and f.endswith(".csv")]
    existing_indices = [int(f.split("_")[1].split(".")[0]) for f in existing_files]
    next_index = max(existing_indices, default=0) + 1
    data_file = os.path.join(output_folder, f"rnaseq_{next_index}.csv")
    transposed_df.to_csv(data_file)
    
    print(f"Saved RNA-Seq data to {data_file}")
    return transposed_df

In [6]:
def process_methylation_data(methylation_df, specimen_df, output_folder="data/methylation"):
    """
    Process Methylation data and save the result as a CSV.
    
    Args:
    - methylation_df (pd.DataFrame): The original Methylation dataset.
    - specimen_df (pd.DataFrame): The specimen metadata for the Methylation dataset.
    - output_folder (str): Folder to save the resulting CSV.
    
    Returns:
    - pd.DataFrame: Processed Methylation dataframe.
    """
    
    # Ensure output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Transpose the DataFrame
    transposed_df = methylation_df.transpose()
    header = transposed_df.iloc[0]
    transposed_df = transposed_df[1:]
    transposed_df.columns = header
    
    # Extract specimen code
    transposed_df['Extracted Specimen Code'] = transposed_df['Sample ID'].str.extract('(\d+)').astype(int)
    
    # Mapping
    specimen_code_to_specimen_id = dict(zip(specimen_df['Specimen Code'], specimen_df['Specimen ID']))
    specimen_code_to_age = dict(zip(specimen_df['Specimen Code'], specimen_df['Age']))
    transposed_df['Specimen ID'] = transposed_df['Extracted Specimen Code'].map(specimen_code_to_specimen_id)
    transposed_df['Age'] = transposed_df['Extracted Specimen Code'].map(specimen_code_to_age)
    
    # Drop unnecessary column
    transposed_df.drop('Extracted Specimen Code', axis=1, inplace=True)
    
    # Save to CSV
    existing_files = [f for f in os.listdir(output_folder) if f.startswith("methylation_") and f.endswith(".csv")]
    existing_indices = [int(f.split("_")[1].split(".")[0]) for f in existing_files]
    next_index = max(existing_indices, default=0) + 1
    data_file = os.path.join(output_folder, f"methylation_{next_index}.csv")
    transposed_df.to_csv(data_file)
    
    print(f"Saved Methylation data to {data_file}")
    return transposed_df

In [7]:
def process_microRNA_data(microRNA_df, meta_df, output_folder="data/microRNA"):
    """
    Process MicroRNA data and save the result as a CSV.
    
    Args:
    - microRNA_df (pd.DataFrame): The MicroRNA data to be processed.
    - meta_df (pd.DataFrame): The metadata DataFrame containing the 'donor_name' and 'age'.
    - output_folder (str): Folder to save the resulting CSV.
    
    Returns:
    - pd.DataFrame: Processed MicroRNA dataframe.
    """
    
    # Ensure output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Transpose the DataFrame
    transposed_df = microRNA_df.transpose()

    # Transform the string
    def transform_string(s):
        match = re.search(r'H376_(IX|X|VI|VII|VIII|XI)_(\d{2})', s)
        if match:
            return "H376." + match.group(1) + "." + match.group(2)
        else:
            return s

    transposed_df['donor_name'] = transposed_df.index.to_series().apply(transform_string)
    
    # Mapping
    donor_name_to_age = meta_df.set_index('donor_name')['Age'].to_dict()
    transposed_df['Age'] = transposed_df['donor_name'].map(donor_name_to_age)
    
    # Save to CSV
    existing_files = [f for f in os.listdir(output_folder) if f.startswith("microRNA_") and f.endswith(".csv")]
    existing_indices = [int(f.split("_")[1].split(".")[0]) for f in existing_files]
    next_index = max(existing_indices, default=0) + 1
    data_file = os.path.join(output_folder, f"microRNA_{next_index}.csv")
    transposed_df.to_csv(data_file)
    
    print(f"Saved MicroRNA data to {data_file}")
    return transposed_df

## Reading in Data Files

In [8]:
develop_transcriptome_rnaseq = pd.read_csv("24Developmental Transcriptome/superseded_genes_matrix_csv/expression_matrix.csv", header=None, index_col=0)
develop_transcriptome_rnaseq_meta_columns = pd.read_csv("24Developmental Transcriptome/superseded_genes_matrix_csv/columns_metadata.csv", index_col=0)
output_rnaseq = process_rnaseq_data(develop_transcriptome_rnaseq, develop_transcriptome_rnaseq_meta_columns)

Saved RNA-Seq data to data/rnaseq/rnaseq_1.csv


In [10]:
data = {
    'Specimen Code': [132, 139, 131, 171, 122, 124, 119, 105, 127, 143, 172, 173, 123, 118, 141, 174],
    'Specimen ID': ['H376.VI.50', 'H376.VI.52', 'H376.VII.50', 'H376.VII.51', 'H376.VIII.51', 'H376.X.51', 'H376.X.50', 'H376.X.53', 'H376.X.52', 'H376.VIII.53', 'H376.VIII.54', 'H376.VIII.52', 'H376.XI.54', 'H376.VIII.50', 'H376.IX.51', 'H376.IX.52'],
    'Age': ['4 M', '4 M', '6 M', '10 M', '1 Y', '13 Y', '15 Y', '18 Y', '19 Y', '2 Y', '3 Y', '3 Y', '37 Y', '4 Y', '8 Y', '8 Y']
}

specimen_df = pd.DataFrame(data)

In [11]:
methylation_1 = pd.read_csv("Methylation/1109_methylation_beta_values.txt", sep='\t')
methylation_2 = pd.read_csv("Methylation/1110_methylation_beta_values.txt", sep='\t')
output_methylation_1 = process_methylation_data(methylation_1, specimen_df)
output_methylation_2 = process_methylation_data(methylation_2, specimen_df)

  methylation_1 = pd.read_csv("Methylation/1109_methylation_beta_values.txt", sep='\t')
  methylation_2 = pd.read_csv("Methylation/1110_methylation_beta_values.txt", sep='\t')


Saved Methylation data to data/methylation/methylation_1.csv
Saved Methylation data to data/methylation/methylation_2.csv


In [12]:
microrna = pd.read_excel("microRNA/MicroRNA.xls", index_col=0)
output_microRNA = process_microRNA_data(microrna, develop_transcriptome_rnaseq_meta_columns)

Saved MicroRNA data to data/microRNA/microRNA_1.csv
