# Packages

In [None]:
import pathintegrate_v3
import pandas as pd
import numpy as np
from simulation_jp import SimulateData
import matplotlib.pyplot as plt
import seaborn as sns
import sspa

# Making synthetic models

### Data

In [None]:
# Pathway table, indexed by pathway ID. IDs are read as strings; the IDs used
# later in this notebook (e.g. '04211') carry leading zeros.
mo_paths = pd.read_csv(
    '/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_G/KEGG_database_multiomics_filtered.csv',
    dtype={'Pathway': str},
    index_col='Pathway',
)

# Metabolomics and proteomics tables, both re-indexed by sample ID.
metab = pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_G/COVID_Met_KEGG_Pred.csv')
prot = pd.read_csv('/Users/judepops/Documents/PathIntegrate/Code/Final_Scripts/Results/Results_G/COVID_Prot_KEGG_Final.csv')
prot = prot.set_index('sample_id')
metab = metab.set_index('sample_id')

# Drop the excluded samples (two from metabolomics, one from proteomics).
metab = metab.drop(index=['INCOV090', 'INCOV028'])
prot = prot.drop(index='INCOV090')

# Drop the 'Race', 'Age' and 'Group' columns from both tables.
prot = prot.drop(columns=['Race', 'Age', 'Group'])
metab = metab.drop(columns=['Race', 'Age', 'Group'])

# Keep only the samples present in both tables, in the same order.
common_indices = prot.index.intersection(metab.index)
prot, metab = prot.loc[common_indices], metab.loc[common_indices]

# Independent working copies used by the rest of the notebook.
metab_un, prot_un = metab.copy(), prot.copy()

def group_who_corrected(value):
    """Map a WHO score to one of the grouped labels '1-2', '3-4' or '5-7'.

    Args:
        value: The raw score. Either the literal string '1 or 2' or anything
            ``int()`` can convert (e.g. ``3``, ``'5'``).

    Returns:
        '1-2', '3-4' or '5-7' when the score is between 1 and 7. ``np.nan``
        for everything else: values ``int()`` cannot convert (including
        ``None`` and NaN) and integers outside 1-7.
    """
    # Only compare strings against the special label, so that missing-value
    # sentinels never reach an ambiguous truth test.
    if isinstance(value, str) and value == '1 or 2':
        return '1-2'

    try:
        score = int(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / pd.NA; ValueError: non-numeric text or NaN;
        # OverflowError: infinite floats.
        return np.nan

    if score in (1, 2):
        return '1-2'
    if score in (3, 4):
        return '3-4'
    if score in (5, 6, 7):
        return '5-7'
    # Out-of-range scores are reported as missing, like unparseable ones,
    # instead of falling through and returning None.
    return np.nan

# Add the grouped WHO label as a 'Who_Group' column, derived from the 'Who'
# column, on the proteomics table first and then the metabolomics table.
for _omics_table in (prot_un, metab_un):
    _omics_table['Who_Group'] = _omics_table['Who'].apply(group_who_corrected)

In [None]:
# Convert the pathway table to a dictionary with sspa's helper.
# (A bare `mo_paths` expression used to sit here; because it was not the last
# statement of the cell it displayed nothing, so it has been removed.)
mo_paths_dict = sspa.utils.pathwaydf_to_dict(mo_paths)

In [None]:
# How many pathways contain at least four mapped compounds?
# For each pathway, list the metab_un columns that appear among its members,
# and keep only pathways with more than three such columns. The list is built
# once per pathway and reused for both dictionaries.
pathways_present_cvrg = {
    pathway_id: mapped
    for pathway_id, mapped in (
        (pathway_id, [col for col in metab_un.columns if col in members])
        for pathway_id, members in mo_paths_dict.items()
    )
    if len(mapped) > 3
}

# Same pathways, same order, but holding only the number of mapped columns.
pathways_present_cts = {
    pathway_id: len(mapped)
    for pathway_id, mapped in pathways_present_cvrg.items()
}

In [None]:
# Rank the pathways by their number of mapped compounds, highest first; the
# pathways with the most compounds are the best ones to augment.
ranked_pathways = sorted(
    pathways_present_cts.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

# Print the ranking, one pathway per line.
print("number mapped compounds:")
for pathway, count in ranked_pathways:
    print(f"{pathway}: {count} mapped compounds")

### Creating simulation

In [None]:

# Build the simulator from both omics tables and run the pathway enrichment.
# iloc[:, :-4] leaves out the last four columns of each table
# (presumably the non-feature metadata columns — TODO confirm).
mo_sim = SimulateData(
    input_data=[metab_un.iloc[:, :-4], prot_un.iloc[:, :-4]],
    metadata=[metab_un['Condition_Group'], prot_un['Condition_Group']],
    pathways=mo_paths_dict,
    enriched_paths=['04211', '05152', '05133', '04934'],
).enrich_paths_base(
    effect_sizes=[0, 10, 20, 40, 60, 80, 100, 120],
)

# Element 0 is taken as the metabolomics result and element 1 as proteomics
# (presumably the same order as input_data — verify against SimulateData).
metab_sim, prot_sim = mo_sim[0], mo_sim[1]

In [None]:
# Model: PathIntegrate built on the simulated omics tables.
# iloc[:, :-2] leaves out the last two columns of each simulated table.
pi_model = pathintegrate_v3.PathIntegrate(
    # dictionary of multi-omics DataFrames and names for each omics
    omics_data={
        'Metabolomics': metab_sim.iloc[:, :-2],
        'Proteomics': prot_sim.iloc[:, :-2],
    },
    # metadata column
    metadata=prot_sim['Group'],
    # pathways dataframe
    pathway_source=mo_paths,
    # ssPA method, see ssPA package for options
    sspa_scoring=sspa.sspa_SVD,
    # minimum number of molecules mapping per pathway to be included
    min_coverage=4,
)

### Using my PathIntegrate package clustering model

In [None]:
# Run the PathIntegrate single-view clustering with KMeans (two clusters),
# requesting the cluster, comparison and ground-truth plots.
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering

covid_kmeans = pi_model.SingleViewClust(
    model=KMeans,
    model_params={'n_clusters': 2},
    use_pca=True,
    return_comparison_plot=True,
    return_plot=True,
    return_ground_truth_plot=True,
)


### running the simulation many times for different cluster counts with different models

In [None]:
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering, DBSCAN, Birch, MeanShift
from sklearn.mixture import GaussianMixture

# Only the algorithms that work with the metrics are benchmarked.
clustering_algorithms = {
    'Birch': Birch,
    'KMeans': KMeans,
    'AgglomerativeClustering': AgglomerativeClustering,
}

# One metrics record is appended per (algorithm, n_clusters) run.
metrics_list = []

for algo_name, algo in clustering_algorithms.items():
    for n_clusters in range(2, 9):
        # n_clusters effect sizes in steps of ten: 0, 10, ..., 10 * (n_clusters - 1).
        effect_sizes = [10 * step for step in range(n_clusters)]

        # A fresh simulation is generated for every run.
        mo_sim = SimulateData(
            input_data=[metab_un.iloc[:, :-4], prot_un.iloc[:, :-4]],
            metadata=[metab_un['Condition_Group'], prot_un['Condition_Group']],
            pathways=mo_paths_dict,
            enriched_paths=['04211', '05152', '05133', '04934'],
        ).enrich_paths_base(effect_sizes=effect_sizes)
        metab_sim, prot_sim = mo_sim[0], mo_sim[1]

        # NOTE(review): min_coverage is 2 here but 4 for the single-run model
        # earlier in the notebook — confirm the difference is intended.
        pi_model = pathintegrate_v3.PathIntegrate(
            omics_data={
                'Metabolomics': metab_sim.iloc[:, :-2],
                'Proteomics': prot_sim.iloc[:, :-2],
            },
            metadata=prot_sim['Group'],
            pathway_source=mo_paths,
            sspa_scoring=sspa.sspa_SVD,
            min_coverage=2,
        )

        # 'DBSCAN' and 'MeanShift' receive no parameters; every other
        # algorithm receives the requested number of clusters.
        if algo_name in ('DBSCAN', 'MeanShift'):
            model_params = {}
        else:
            model_params = {'n_clusters': n_clusters}

        covid_clustering = pi_model.SingleViewClust(
            model=algo,
            model_params=model_params,
            use_pca=True,
            return_comparison_plot=True,
            return_plot=False,
            return_ground_truth_plot=False,
        )

        # Tag the run's metrics with the algorithm and cluster count.
        metrics = covid_clustering.metrics
        metrics['Algorithm'] = algo_name
        metrics['n_clusters'] = n_clusters
        metrics_list.append(metrics)

# Collect all runs into one table and show it.
metrics_df = pd.DataFrame(metrics_list)

print(metrics_df)
