# Packages

In [None]:
# Dependencies

import numpy as np
from pandas import DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import sklearn 
import sspa
import sspa.utils
import gseapy.plot as gp
import networkx
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats
import statsmodels.api as sm
import plotly.graph_objects as go
import plotly.express as px
import urllib.request
import statsmodels
import networkx as nx
import math
import itertools 
from scipy.stats import hypergeom as hg
import textwrap
from itertools import chain
import missforest

# First, the standard (filtered and not augmented) multi-omics database has to be created

### Function to download the default KEGG pathways - selecting the `multiomics` omics type combines the compound and gene members of each pathway

In [None]:
import requests
import re
import pandas as pd
import warnings
import json
from tqdm import tqdm

def _kegg_first_index(lines, prefixes):
    '''Return the index of the first line starting with one of `prefixes` (IndexError if none).'''
    for position, line in enumerate(lines):
        if line.startswith(prefixes):
            return position
    raise IndexError("no line starts with " + repr(prefixes))


def _kegg_section_ids(section_lines):
    '''Return the entry IDs of one flat-file section (IndexError if the section is empty or malformed).'''
    # the first line also carries the section keyword, so its ID is the second token
    first_id = section_lines[0].split()[1]
    return [first_id] + [line.split()[0] for line in section_lines[1:]]


def download_KEGG(organism, filepath=None, omics_type='metabolomics'):
    '''
    Function for KEGG pathway download
    Args:
        organism (str): KEGG 3 letter organism code
        filepath (str): directory to save pathway file to, default is None - save to variable
        omics_type (str): 'metabolomics' keeps the COMPOUND entries of each pathway,
            'multiomics' keeps the COMPOUND entries followed by the GENE entries
    Returns:
        GMT-like pd.DataFrame containing KEGG pathways (index = pathway ID,
        first column 'Pathway_name', remaining columns = pathway members)
    Raises:
        ValueError: if omics_type is not 'metabolomics' or 'multiomics'
    '''
    # label used in the output file name for each supported omics type
    file_labels = {'metabolomics': 'compounds', 'multiomics': 'multiomics'}
    if omics_type not in file_labels:
        # fail before any download instead of silently returning None at the end
        raise ValueError(
            "omics_type must be 'metabolomics' or 'multiomics', got " + repr(omics_type))

    print("Beginning KEGG download...")
    # get all pathways
    url = 'http://rest.kegg.jp/list/pathway/'+organism
    data = requests.get(url)
    pathway_dict = dict()

    # each non-empty line is "<pathway id>\t<pathway name>"
    for entry in filter(None, data.text.split("\n")):
        fields = entry.split("\t")
        name = fields[1]
        pathway_dict[fields[0]] = name

    base_url = 'http://rest.kegg.jp/get/'

    # get release details
    # NOTE(review): assumes the release number is the 10th whitespace-separated token of the info page
    release_data = requests.get('http://rest.kegg.jp/info/kegg')
    version_no = release_data.text.split()[9][0:3]

    pathway_mapping = dict()

    for pathway_id in tqdm(list(pathway_dict)):
        # parse the pathway description page
        page = requests.get(base_url + "pathway:" + pathway_id)
        lines = page.text.split("\n")

        try:
            cpds_start = _kegg_first_index(lines, "COMPOUND")
            section_end = _kegg_first_index(lines, ("REFERENCE", "REL_PATHWAY"))
            # sorted() makes the member order reproducible between runs
            members = sorted(set(_kegg_section_ids(lines[cpds_start:section_end])))
            if omics_type == 'multiomics':
                # the gene section is taken as everything from GENE up to COMPOUND
                genes_start = _kegg_first_index(lines, "GENE")
                members = members + sorted(set(_kegg_section_ids(lines[genes_start:cpds_start])))
            pathway_mapping[pathway_id] = members
        except IndexError:
            # a required section is missing or malformed: treat the pathway as empty
            pathway_mapping[pathway_id] = []

    # remove empty pathway entries
    pathway_mapping = {k: v for k, v in pathway_mapping.items() if v}

    # create GMT style file
    df = pd.DataFrame.from_dict(pathway_mapping, orient='index')
    # map pathway names onto first column
    df.insert(0, 'Pathway_name', df.index.map(pathway_dict.get))

    if filepath:
        fpath = (filepath + "/KEGG_" + organism + "_pathways_" + file_labels[omics_type]
                 + "_R" + str(version_no) + ".gmt")
        df.to_csv(fpath, sep="\t", header=False)
        print("KEGG DB file saved to " + fpath)
    print("Complete!")

    return df
        
# Download the pathways for organism code 'hsa', with compound and gene members combined per pathway
kegg_pathways_default= download_KEGG(organism = 'hsa', omics_type = 'multiomics')

### Filtering and saving the default multi-omics database file

In [None]:
# removing 'hsa' from the row names - the pathways in the augmented dataset use the same
# codes without the 'hsa' prefix, and the two sets of IDs need to be comparable.
# Only the index is cast to str: casting the whole frame would turn the missing padding
# cells into the literal strings 'None'/'nan', which then look like real pathway members.
kegg_pathways_default.index = kegg_pathways_default.index.astype(str).str.replace('hsa', '', regex=False)

# dropping columns that are all NA
kegg_pathways_default = kegg_pathways_default.dropna(axis=1, how='all')

# resetting the index and renaming the index column to 'Pathway'
kegg_pathways_default = kegg_pathways_default.reset_index().rename(columns={'index': 'Pathway'})

In [None]:
# Save the filtered default database. The row index is written too (to_csv default), so the
# file starts with an unnamed column that shows up as 'Unnamed: 0' when the file is read back.
kegg_pathways_default.to_csv('/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_G/KEGG_database_multiomics_filtered.csv')

# Creating the augmented pathway databases by inverting and manipulating the JSON file

The JSON file maps each compound key to the list of pathways it is annotated (or predicted) to belong to; below it is inverted into one row of compounds per pathway

In [None]:
# --- input data ---

# Processed metabolomics matrix, indexed by sample ID
metabolomics_data_processed = (
    pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Processing/Processing_Cleaned/cleaned_metabolomics_data_covid.csv')
    .set_index('sample_id')
)

# Keep everything except the last seven columns, then normalise the column names
# (whitespace stripped, lower case)
metabolomics_data_processed_final = metabolomics_data_processed.iloc[:, :-7]
metabolomics_data_processed_final.columns = [
    name.strip().lower() for name in metabolomics_data_processed_final.columns
]
metabolomics_data_processed_final

# Table of KEGG ID mappings (first CSV column used as the index)
manual = pd.read_csv(
    '/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_G/final_kegg_id.csv',
    index_col=0,
)
manual

# Metabolomics matrix with its identifiers mapped to KEGG IDs
processed_data_mapped_manual = sspa.map_identifiers(
    manual,
    output_id_type="KEGG",
    matrix=metabolomics_data_processed_final,
)
processed_data_mapped_manual

### Reading in the default kegg pathway multiomics database

In [None]:
# Pathways

# The CSV starts with an unnamed column holding the saved row index. Reading it back as the
# index (index_col=0) keeps it out of the data columns, so the row numbers are not later
# mistaken for pathway members; 'Pathway' stays a regular string column.
filtered_kegg_pathways = pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_G/KEGG_database_multiomics_filtered.csv', dtype={'Pathway': str}, index_col=0)

### Level 2 (broad) KEGG pathways appear in the file by name only, so they were manually mapped to their IDs. All granular pathways already carry their IDs

### 1) extracting the predictions

In [None]:
# Load the compound -> pathways mapping provided by Huckvale et al
file_path = '/Users/judepops/Documents/PathIntegrate/Code/Pathway_Prediction/Huckvale/kegg_all-pathway-mappings.json'
with open(file_path, 'r') as file:
    kegg_data = json.load(file)

# Lookup table: level 2 KEGG pathway name -> pathway ID
pathway_key = {
    "Metabolism": "09100",
    "Carbohydrate metabolism": "09101",
    "Energy metabolism": "09102",
    "Lipid metabolism": "09103",
    "Nucleotide metabolism": "09104",
    "Amino acid metabolism": "09105",
    "Metabolism of other amino acids": "09106",
    "Glycan biosynthesis and metabolism": "09107",
    "Metabolism of cofactors and vitamins": "09108",
    "Metabolism of terpenoids and polyketides": "09109",
    "Biosynthesis of other secondary metabolites": "09110",
    "Xenobiotics biodegradation and metabolism": "09111",
    "Not included in regular maps": "09112",
    "Chemical structure transformation maps": "09120",
}

# Inverted mapping: pathway ID -> {'Pathway_name': ..., 'Compounds': [...]}
pathway_dict = {}

for cpd, pathways in kegg_data.items():
    # compound IDs are stored without their 'cpd:' prefix
    cpd_id = cpd.replace('cpd:', '')
    for pathway in pathways:
        if '  ' in pathway:
            # well-formed entry: "<id>  <name>"
            pathway_id, pathway_name = pathway.split('  ', 1)
        else:
            # name-only entry: look the ID up, skip the entry when the name is unknown
            pathway_name = pathway.strip()
            pathway_id = pathway_key.get(pathway_name, None)
            if not pathway_id:
                continue
        record = pathway_dict.setdefault(pathway_id, {'Pathway_name': pathway_name, 'Compounds': []})
        record['Compounds'].append(cpd_id)

# Number of unique pathways identified
print(len(pathway_dict))

# One row per pathway: ID, name, then its compounds
pathway_list = [
    [pathway_id, data['Pathway_name'], *data['Compounds']]
    for pathway_id, data in pathway_dict.items()
]

# The widest pathway decides how many compound columns are needed
max_compounds = max(len(data['Compounds']) for data in pathway_dict.values())
columns = ['Pathway', 'Pathway_name'] + [f'Compound {i+1}' for i in range(max_compounds)]

# Final augmented pathway table; shorter rows are padded with missing values
df = pd.DataFrame(pathway_list, columns=columns)
df = df.reindex(columns=columns, fill_value=pd.NA)
# Pathway IDs are kept as strings so they do not change
df['Pathway'] = df['Pathway'].astype(str)
df

### extracting the pathway information (MCC etc.) from a different file

In [None]:
import json
import pandas as pd

# Lookup table: level 2 KEGG pathway name -> pathway ID
# NOTE(review): unlike the table used for the compound mappings, this one has no entries for
# "Metabolism" and "Not included in regular maps" - confirm whether that is intentional.
pathway_key = {
    "Carbohydrate metabolism": "09101",
    "Energy metabolism": "09102",
    "Lipid metabolism": "09103",
    "Nucleotide metabolism": "09104",
    "Amino acid metabolism": "09105",
    "Metabolism of other amino acids": "09106",
    "Glycan biosynthesis and metabolism": "09107",
    "Metabolism of cofactors and vitamins": "09108",
    "Metabolism of terpenoids and polyketides": "09109",
    "Biosynthesis of other secondary metabolites": "09110",
    "Xenobiotics biodegradation and metabolism": "09111",
    "Chemical structure transformation maps": "09120",
}

with open('/Users/judepops/Documents/PathIntegrate/Code/Pathway_Prediction/Huckvale/pathway-info.json', 'r') as file:
    json_dict = json.load(file)

# Re-key the entries so that name-only keys found in the lookup table become "<id>  <name>";
# keys that already start with a numeric ID, and unknown names, are kept unchanged
updated_json_dict = {}
for key, value in json_dict.items():
    stripped_key = key.strip()
    starts_with_id = key.split()[0].isdigit()
    if not starts_with_id and stripped_key in pathway_key:
        updated_json_dict[f"{pathway_key[stripped_key]}  {stripped_key}"] = value
    else:
        updated_json_dict[key] = value

# One record per pathway: first token of the key, mean MCC and size
pathway_data = [
    {'Pathway': key.split()[0], 'mean_mcc': value['mean_mcc'], 'size': value['size']}
    for key, value in updated_json_dict.items()
]

json_df = pd.DataFrame(pathway_data)

json_df

### Merging the augmented pathway dataframe with the MCC information on the pathway column, creating an augmented dataframe

In [None]:
# Left join: every augmented pathway is kept, with its MCC/size values attached where available
merged_df = pd.merge(df, json_df, how='left', on='Pathway')

merged_df

### visualising the distribution of MCC across pathways in the merged df

In [None]:
plt.rcdefaults()
plt.style.use('default')

# Pathways without an MCC value (no match in the left merge) cannot be binned
mcc_values = merged_df['mean_mcc'].dropna()

# 20 equal-width bins over [0, 1]
bins = np.linspace(0, 1, 21)
hist, bin_edges = np.histogram(mcc_values, bins=bins)

# Bars whose left edge is below 0.7 are red, the others green
bar_colors = ['#fb2e2a' if 0 <= value < 0.7 else '#2d8d33' for value in bin_edges[:-1]]

plt.figure(figsize=(10, 6))
bars = plt.bar(bin_edges[:-1], hist, width=np.diff(bin_edges), edgecolor='k', alpha=1, color=bar_colors, align='edge')
# Legend handles: the two entries previously shared one label, so the colours were not explained
red_patch = plt.Line2D([0], [0], color='#fb2e2a', lw=10, label='Predicted KEGG Pathways (mean MCC < 0.7)')
green_patch = plt.Line2D([0], [0], color='#2d8d33', lw=10, label='Predicted KEGG Pathways (mean MCC >= 0.7)')

plt.xlabel('Mean MCC', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.grid(False)  # removing the grid lines

# increasing the size of the axis tick labels
plt.tick_params(axis='both', which='major', labelsize=20)  # Major ticks
plt.tick_params(axis='both', which='minor', labelsize=14)  # Minor ticks (if any)
plt.legend(handles=[red_patch, green_patch], loc='upper left', fontsize=20)

plt.show()


# now incorporating the MCC information and creating pathway cutoffs using MCC

In [None]:
# MCC cut-offs: after 0 the next cut-off is 0.4, as there is not much change before then
thresholds = [0.0, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# One filtered copy of the augmented pathway table per cut-off, keyed 'above_<threshold>';
# a pathway is kept when its mean MCC is strictly greater than the cut-off
filtered_datasets = {
    f'above_{threshold}': merged_df[merged_df['mean_mcc'] > threshold]
    for threshold in thresholds
}


In [None]:
# Display the dictionary of MCC-filtered datasets
filtered_datasets

# Creating an info dataframe with different statistics for analysis

In [None]:
# One summary record per MCC-filtered dataset
info_list = []

for threshold, dataset in filtered_datasets.items():
    num_pathways = len(dataset)
    num_rows, num_columns = dataset.shape
    mcc_distribution = dataset['mean_mcc'].mean()

    # non-missing cells counted after dropping fully duplicated rows
    non_na_entries_unique = dataset.drop_duplicates().notna().sum().sum()

    info_list.append({
        # key spelled 'threshold', matching the summary table built for the combined datasets
        'threshold': threshold,
        'pathways count': num_pathways,
        'rows count': num_rows,
        'column count': num_columns,
        'mean MCC': mcc_distribution,
        'non-na count': non_na_entries_unique
    })

info_df = pd.DataFrame(info_list)

# MAIN step: combining the default dataframe with the MCC-filtered augmented datasets

In [None]:
# function tto combine the datasets - also tracks the row counts
def _pathway_members(frame):
    """Return ({pathway: name}, {pathway: [members]}) for a wide pathway table."""
    # columns that describe a pathway rather than list one of its members
    non_member_columns = ['Pathway', 'Pathway_name', 'mean_mcc', 'size', 'Unnamed: 0']
    member_frame = frame.drop(columns=non_member_columns, errors='ignore')

    names = {}
    members = {}
    rows = zip(frame['Pathway'], frame['Pathway_name'], member_frame.to_numpy(dtype=object))
    for pathway, name, values in rows:
        # the first row of a pathway provides its name; members of repeated rows are pooled
        names.setdefault(pathway, name)
        # missing cells are only padding of the wide table, not members
        members.setdefault(pathway, []).extend(v for v in values if not pd.isna(v))
    return names, members


def combine_datasets(df, filtered_kegg_pathways, dataset):
    """Combine an augmented pathway table with the default KEGG pathway table.

    Args:
        df: unused; kept so existing calls keep working.
        filtered_kegg_pathways: default KEGG table with 'Pathway' and 'Pathway_name'
            columns; every other column holds pathway members.
        dataset: augmented (MCC-filtered) table with the same layout. Its 'mean_mcc'
            and 'size' columns, and an 'Unnamed: 0' column in either table, are
            ignored because they are not pathway members.

    Returns:
        Tuple ``(combined_df, n_matching, n_dataset_only, n_kegg_only)``. ``combined_df``
        has the columns 'Pathway', 'Pathway_name', 'Source' ('Combined', 'df_unique' or
        'kegg_unique') and 'Entry 1..k'. Pathways present in both tables get the
        de-duplicated union of their members. Rows are ordered by source, then by
        pathway ID, so the result is reproducible.
    """
    dataset_names, dataset_members = _pathway_members(dataset)
    kegg_names, kegg_members = _pathway_members(filtered_kegg_pathways)

    dataset_pathways = set(dataset_members)
    kegg_pathways = set(kegg_members)

    # key=str keeps the sort well defined even if an ID is not a string
    matching_pathways = sorted(dataset_pathways & kegg_pathways, key=str)
    unique_dataset_pathways = sorted(dataset_pathways - kegg_pathways, key=str)
    unique_kegg_pathways = sorted(kegg_pathways - dataset_pathways, key=str)

    combined_data = []

    # pathways found in both tables: union of the members, first occurrence kept
    for pathway in matching_pathways:
        merged_members = list(dict.fromkeys(dataset_members[pathway] + kegg_members[pathway]))
        combined_data.append([pathway, dataset_names[pathway], 'Combined'] + merged_members)

    # pathways only present in the augmented dataset
    for pathway in unique_dataset_pathways:
        combined_data.append([pathway, dataset_names[pathway], 'df_unique'] + dataset_members[pathway])

    # pathways only present in the default KEGG table
    for pathway in unique_kegg_pathways:
        combined_data.append([pathway, kegg_names[pathway], 'kegg_unique'] + kegg_members[pathway])

    # the longest row decides the number of entry columns (none when there are no rows)
    max_entries = max((len(row) for row in combined_data), default=3)
    columns = ['Pathway', 'Pathway_name', 'Source'] + [f'Entry {i+1}' for i in range(max_entries - 3)]
    # shorter rows are padded with missing values
    combined_df = pd.DataFrame(combined_data, columns=columns)

    return combined_df, len(matching_pathways), len(unique_dataset_pathways), len(unique_kegg_pathways)

# Combined table per threshold, plus one summary record per threshold
merged_datasets = {}
summary_data = []

# Combine every MCC-filtered dataset with filtered_kegg_pathways (the default dataset)
# NOTE(review): the loop rebinds `merged_df`, so the augmented+MCC table of that name
# is no longer available after this cell has run.
for threshold, dataset in filtered_datasets.items():
    combination = combine_datasets(df, filtered_kegg_pathways, dataset)
    merged_df, matching_count, df_unique_count, kegg_unique_count = combination
    merged_datasets[threshold] = merged_df

    # non-missing cells, before and after dropping fully duplicated rows
    non_na_entries = merged_df.notna().sum().sum()
    non_na_entries_unique = merged_df.drop_duplicates().notna().sum().sum()

    summary_record = {
        'threshold': threshold,
        'row count': merged_df.shape[0],
        'matching pathways': matching_count,
        'Unique to df': df_unique_count,
        'Unique to KEGG': kegg_unique_count,
        'Non-NA Entries': non_na_entries,
        'Non-NA Unique Entries': non_na_entries_unique,
    }
    summary_data.append(summary_record)

# Summary table of the final results
summary_df = pd.DataFrame(summary_data)
summary_df


In [None]:
# Pathway analysis needs the 'Pathway' column as the index of every dataset
for key, frame in merged_datasets.items():
    merged_datasets[key] = frame.set_index('Pathway')

# Pathway analysis with normal kegg and augmented

In [None]:
# Load the default (non-augmented) KEGG pathway table, indexed by pathway ID, without the
# saved row-number column and with every column stored as object dtype
mo_paths = (
    pd.read_csv(
        '/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_G/KEGG_database_multiomics_filtered.csv',
        dtype={'Pathway': str},
        index_col='Pathway',
    )
    .drop(columns='Unnamed: 0')
    .astype('object')
)
# Register it next to the augmented datasets under the key 'original'
merged_datasets['original'] = mo_paths

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pathintegrate


# function to train and cross validate the model - this is the same as before for consistiency
def train_and_evaluate_model(random_seed, prot, metab, mo_paths, shuffle_labels=False):
    """Train a single-view PathIntegrate model on one train/test split and score it.

    Args:
        random_seed: seed for the stratified train/test split and, when
            `shuffle_labels` is set, for the label permutation.
        prot: proteomics DataFrame that contains the 'Condition_Group' label column.
        metab: metabolomics DataFrame indexed like `prot`.
        mo_paths: pathway table passed to PathIntegrate as `pathway_source`.
        shuffle_labels: if True, permute the train and test labels (null model).

    Returns:
        Tuple ``(f1, precision, recall, auc, fpr, tpr, random_seed)`` computed on the
        held-out 33% of the samples.
    """
    X_train_prot, X_test_prot, y_train, y_test = train_test_split(
        prot.drop(columns=['Condition_Group']), prot['Condition_Group'],
        test_size=0.33, random_state=random_seed, stratify=prot['Condition_Group']
    )

    if shuffle_labels:
        # Permuted copies are assigned back rather than shuffling `.values` in place:
        # that array is read-only when pandas copy-on-write is active. Seeding from
        # `random_seed` makes the null model reproducible.
        rng = np.random.default_rng(random_seed)
        y_train = pd.Series(rng.permutation(y_train.to_numpy()), index=y_train.index, name=y_train.name)
        y_test = pd.Series(rng.permutation(y_test.to_numpy()), index=y_test.index, name=y_test.name)

    # metabolomics rows follow the proteomics split
    X_train_met, X_test_met = metab.loc[X_train_prot.index, :], metab.loc[X_test_prot.index, :]

    pi_model = pathintegrate.PathIntegrate(
        omics_data={'Metabolomics_train': X_train_met, 'Proteomics_train': X_train_prot},
        metadata=y_train,
        pathway_source=mo_paths,
        sspa_scoring=sspa.sspa_SVD,
        min_coverage=4
    )

    cv_single_view = pi_model.SingleViewCV(
        LogisticRegression,
        model_params={'random_state': 0, 'max_iter': 500},
        cv_params={'cv': 5, 'scoring': 'f1', 'verbose': 2}
    )

    print('Mean cross-validated F1 score: ', np.mean(cv_single_view))

    sv_tuned = pi_model.SingleView(
        model=LogisticRegression,
        model_params={'C': 21.54434690031882, 'random_state': 0, 'max_iter': 500}
    )

    # NOTE(review): `.iloc[:, :-1]` drops the last proteomics column although
    # 'Condition_Group' was already removed before the split - confirm this is intended.
    concat_data = pd.concat({'Metabolomics_test': X_test_met, 'Proteomics_test': X_test_prot.iloc[:, :-1]}.values(), axis=1)

    # pathway scores for the test samples (scaler and scoring are fitted on the test data)
    pipe_sv = Pipeline([
        ('Scaler', StandardScaler().set_output(transform="pandas")),
        ('sspa', pi_model.sspa_method(pi_model.pathway_source, pi_model.min_coverage)),
    ])

    test_set_scores = pipe_sv.fit_transform(concat_data)

    sv_pred = sv_tuned.predict(test_set_scores)
    sv_pred_prob = sv_tuned.predict_proba(test_set_scores)[:, 1]

    test_set_f1 = f1_score(y_test, sv_pred)
    test_set_precision = precision_score(y_test, sv_pred)
    test_set_recall = recall_score(y_test, sv_pred)
    test_set_auc = roc_auc_score(y_test, sv_pred_prob)

    fpr, tpr, _ = roc_curve(y_test, sv_pred_prob)

    return test_set_f1, test_set_precision, test_set_recall, test_set_auc, fpr, tpr, random_seed

# function to run the model for all the seeds and calcualte an  average ROC - this is also the same as before
def run_model_for_all_seeds(random_seeds, prot, metab, mo_paths, shuffle_labels=False):
    """Run `train_and_evaluate_model` once per seed and average the ROC curves.

    Args:
        random_seeds: sequence of seeds, one model run per element.
        prot, metab, mo_paths, shuffle_labels: forwarded unchanged to
            `train_and_evaluate_model`.

    Returns:
        Tuple ``(all_fpr, mean_tpr, mean_auc, std_tpr, std_auc)`` where ``all_fpr`` is a
        grid of 100 evenly spaced false-positive rates in [0, 1] and the TPR statistics
        are taken over the runs after interpolating every ROC curve onto that grid.
    """
    # imported locally: `interpolate` is not brought into scope by the notebook's import cell
    from scipy import interpolate

    f1_scores = []
    precision_scores = []
    recall_scores = []
    auc_scores = []
    tpr_list = []
    all_fpr = np.linspace(0, 1, 100)

    for run_number, random_seed in enumerate(random_seeds, start=1):
        f1, precision, recall, auc, fpr, tpr, used_seed = train_and_evaluate_model(random_seed, prot, metab, mo_paths, shuffle_labels)
        f1_scores.append(f1)
        precision_scores.append(precision)
        recall_scores.append(recall)
        auc_scores.append(auc)
        # resample this run's ROC curve onto the common FPR grid so runs can be averaged
        tpr_list.append(interpolate.interp1d(fpr, tpr)(all_fpr))
        print(f"Run {run_number}: F1 = {f1}, Precision = {precision}, Recall = {recall}, AUC = {auc}, Seed = {used_seed}")

    mean_tpr = np.mean(tpr_list, axis=0)
    std_tpr = np.std(tpr_list, axis=0)
    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)

    return all_fpr, mean_tpr, mean_auc, std_tpr, std_auc

# 20 seeds drawn (with replacement) from the integers 0-49
random_seeds = np.random.randint(50, size=20)

# Metabolomics and proteomics matrices for KEGG, indexed by sample ID
metab = pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_G/COVID_Met_KEGG_Pred.csv')
prot = pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_G/COVID_Prot_KEGG_Final.csv')
prot = prot.set_index('sample_id')
metab = metab.set_index('sample_id')

# Remove the sample metadata columns from both matrices
prot = prot.drop(columns=['Who', 'Race', 'Age', 'Group', 'Age_Group', 'Race_Group'])
metab = metab.drop(columns=['Who', 'Race', 'Age', 'Group', 'Age_Group', 'Race_Group'])

# Keep only the samples present in both matrices; the last metabolomics column is dropped too
common_indices = prot.index.intersection(metab.index)
prot = prot.loc[common_indices]
metab = metab.loc[common_indices].iloc[:, :-1]

# Binary label: Severe -> 1, Mild -> 0
prot['Condition_Group'] = prot['Condition_Group'].map({'Severe': 1, 'Mild': 0})


# ROC summary per dataset: (fpr, mean_tpr, mean_auc, std_tpr, std_auc)
roc_results = {}

# Run the models for every dataset (each threshold plus 'original') and keep the ROC summary
for threshold, dataset in merged_datasets.items():
    roc_results[threshold] = run_model_for_all_seeds(random_seeds, prot, metab, dataset)
    mean_auc = roc_results[threshold][2]
    print(F'Threshold {threshold} has finished')
    print(F'The AUC is: {mean_auc}')

# One figure with a ROC curve per dataset
plt.figure()

# Null model (shuffled labels on the default pathways) for comparison
null_summary = run_model_for_all_seeds(random_seeds, prot, metab, mo_paths, shuffle_labels=True)
fpr_null, mean_tpr_null, mean_auc_null, std_tpr_null, std_auc_null = null_summary
plt.plot(fpr_null, mean_tpr_null, color='black', linestyle='--', label=f'Null Model ROC curve (area = {mean_auc_null:.2f})')

for threshold, roc_summary in roc_results.items():
    fpr, mean_tpr, mean_auc, std_tpr, std_auc = roc_summary
    plt.plot(fpr, mean_tpr, label=f'Threshold {threshold} ROC curve (area = {mean_auc:.2f}, std = {std_auc:.2f})')

# Chance diagonal
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('roc curves for diff thresholds')
plt.legend(loc="lower right", prop={'size': 8})
plt.show()


In [None]:

# creating a function to ensure that the roc  starts at the original
def ensure_start_at_zero(fpr, tpr):
    """Return the curve with a leading (0, 0) point, adding one if it is not already there."""
    starts_at_origin = fpr[0] == 0 and tpr[0] == 0
    if starts_at_origin:
        return fpr, tpr
    return np.insert(fpr, 0, 0), np.insert(tpr, 0, 0)

# fucntion that is created to make sure values are within ranges
def clip_to_range(arr):
    """Limit every value of `arr` to the interval [0, 1]."""
    lower, upper = 0, 1
    return np.clip(arr, lower, upper)
# Anchor the null-model curve at the origin, then limit both axes to [0, 1]
fpr_null, mean_tpr_null = ensure_start_at_zero(fpr_null, mean_tpr_null)
fpr_null = clip_to_range(fpr_null)
mean_tpr_null = clip_to_range(mean_tpr_null)

# Apply the default style first, then the seaborn white style
for style_name in ('default', 'seaborn-v0_8-white'):
    plt.style.use(style_name)

plt.figure()
# Only the datasets listed here are drawn, each with a fixed colour
colors = {'original': 'blue', 'above_0.7': 'green', 'above_0.0': 'red'}

def equalize_lengths(fpr, mean_tpr, std_tpr):
    """Truncate the three sequences to the length of the shortest one."""
    series = (fpr, mean_tpr, std_tpr)
    shortest = min(map(len, series))
    return tuple(values[:shortest] for values in series)
for threshold, (fpr, mean_tpr, mean_auc, std_tpr, std_auc) in roc_results.items():
    if threshold in colors:
        color = colors[threshold]
        points_before = len(fpr)
        fpr, mean_tpr = ensure_start_at_zero(fpr, mean_tpr)
        if len(fpr) > points_before:
            # A (0, 0) anchor was added in front of the curve: give the spread a matching
            # leading zero so every std value stays aligned with its own point and the
            # truncation below does not cut off the end of the curve.
            std_tpr = np.insert(std_tpr, 0, 0)
        # safety net: all three arrays must have the same length for plotting
        fpr, mean_tpr, std_tpr = equalize_lengths(fpr, mean_tpr, std_tpr)
        fpr, mean_tpr, std_tpr = clip_to_range(fpr), clip_to_range(mean_tpr), clip_to_range(std_tpr)
        plt.step(fpr, mean_tpr, where='post', color=color, label=f'Threshold {threshold} (area = {mean_auc:.3f}, std = {std_auc:.2f})')
        # shaded band: mean TPR +/- one standard deviation, limited to [0, 1]
        plt.fill_between(fpr, clip_to_range(mean_tpr - std_tpr), clip_to_range(mean_tpr + std_tpr), color=color, alpha=0.2, label=f'Threshold {threshold} ±1 std dev')

# null model and chance diagonal for reference
plt.step(fpr_null, mean_tpr_null, where='post', color='black', linestyle='--', label=f'Null Model ROC curve (area = {mean_auc_null:.3f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')

# Set the labels and title
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('roc curves for diff thresholds')
plt.legend(loc="lower right", prop={'size': 8})
plt.tick_params(axis='both', which='major', labelsize=15)  # Major ticks
plt.tick_params(axis='both', which='minor', labelsize=15)  # Minor ticks (if any)
plt.savefig('/Users/judepops/Documents/PathIntegrate/Code/Pathway_Prediction/Huckvale/Results_2/roc_key_thresholds.png', dpi=500)
plt.show()