In [1]:
import pandas as pd
import numpy as np

In [2]:
combo_metadata = pd.read_csv('../data/Metabolome/26112024/Metadata_all.tsv', sep='\t', index_col=0)
combo_metadata.shape

(2117, 15)

In [3]:
corr_3xtg_fecal = pd.read_csv('../data/JointRPCA/3XTG_sac_fecal_metab-v-micro_correlation_table/Correlation.tsv',
                              sep='\t', index_col=0)
corr_3xtg_brain = pd.read_csv('../data/JointRPCA/3XTG_sac_brain-v-micro_correlation_table/Correlation.tsv',
                              sep='\t', index_col=0)

In [4]:
def filter_distance_matrix(distance_matrix, row_filter, col_filter):
    """
    Filters a distance matrix based on row and column prefixes.

    Args:
        distance_matrix: A pandas DataFrame representing the distance matrix.
        row_filter: The prefix to filter rows by.
        col_filter: The prefix to filter columns by.

    Returns:
        A filtered pandas DataFrame.
    """

    row_filter_mask = distance_matrix.index.str.startswith(row_filter)
    col_filter_mask = distance_matrix.columns.str.startswith(col_filter)

    filtered_matrix = distance_matrix.loc[row_filter_mask, col_filter_mask]

    return filtered_matrix

def transform_df(df):
    """
    Transforms a DataFrame to have the index as the first column, 
      column names as the second column, and values in a new column.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with the transformed structure.
    """
        
    idx = []
    cols = []
    val = []
    for index, row in df.iterrows():
        for col, value in row.items():
            idx.append(index)
            cols.append(col)
            val.append(value)

      # Create the new DataFrame
    new_df = pd.DataFrame({'OGU':idx, 'metab_row_id':cols, 'jointRPCA_co-occur_value':val})
    
    return new_df

In [5]:
corr_3xtg_fecal_filt = filter_distance_matrix(corr_3xtg_fecal, 
                                              row_filter="G", 
                                              col_filter="fecal_") 
corr_3xtg_fecal_filt.shape

(1952, 5626)

In [6]:
cyscp_3xtg_fecal = transform_df(corr_3xtg_fecal_filt)
cyscp_3xtg_fecal['metab_row_id'] = cyscp_3xtg_fecal['metab_row_id'].str.replace('fecal_spf_3xtg_', '')
cyscp_3xtg_fecal['metab_row_id'] = cyscp_3xtg_fecal['metab_row_id'].astype(int)
cyscp_3xtg_fecal['node']= cyscp_3xtg_fecal['OGU'].astype(str)+"-"+cyscp_3xtg_fecal['metab_row_id'].astype(str)
cyscp_3xtg_fecal.set_index('node', inplace=True)
cyscp_3xtg_fecal.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G000005825-64,G000005825,64,0.786478
G000005825-319,G000005825,319,0.298362
G000005825-2065,G000005825,2065,0.984805
G000005825-459,G000005825,459,0.678559
G000005825-83,G000005825,83,-0.241517


In [7]:
cyscp_3xtg_fecal.to_csv('../data/JointRPCA/3xtg_sac_fecal_cytoscape_format.txt', sep='\t')

In [8]:
short_fecal = cyscp_3xtg_fecal[np.abs(cyscp_3xtg_fecal['jointRPCA_co-occur_value']) > 0.5].copy()
short_fecal['jointRPCA_co-occur_value'].describe()

count    6.574987e+06
mean     2.717109e-02
std      8.046525e-01
min     -1.000000e+00
25%     -7.959281e-01
50%      5.189001e-01
75%      8.273432e-01
max      9.999999e-01
Name: jointRPCA_co-occur_value, dtype: float64

In [9]:
short_fecal.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G000005825-64,G000005825,64,0.786478
G000005825-2065,G000005825,2065,0.984805
G000005825-459,G000005825,459,0.678559
G000005825-961,G000005825,961,0.788988
G000005825-1693,G000005825,1693,-0.722251


In [10]:
short_fecal.to_csv('../data/JointRPCA/3xtg_sac_fecal_cytoscape_format_cutoff50.txt', sep='\t')

In [11]:
corr_3xtg_brain_filt = filter_distance_matrix(corr_3xtg_brain, 
                                              row_filter="G", 
                                              col_filter="brain_") 
corr_3xtg_brain_filt.shape

(1902, 890)

In [12]:
cyscp_3xtg_brain = transform_df(corr_3xtg_brain_filt)
cyscp_3xtg_brain['metab_row_id'] = cyscp_3xtg_brain['metab_row_id'].str.replace('brain_', '')
cyscp_3xtg_brain['metab_row_id'] = cyscp_3xtg_brain['metab_row_id'].astype(int)
cyscp_3xtg_brain['node']= cyscp_3xtg_brain['OGU'].astype(str)+"-"+cyscp_3xtg_brain['metab_row_id'].astype(str)
cyscp_3xtg_brain.set_index('node', inplace=True)
cyscp_3xtg_brain.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G000005825-532,G000005825,532,0.035114
G000005825-703,G000005825,703,0.36
G000005825-816,G000005825,816,-0.007822
G000005825-881,G000005825,881,0.19713
G000005825-1017,G000005825,1017,0.345076


In [13]:
cyscp_3xtg_brain.to_csv('../data/JointRPCA/3xtg_brain_cytoscape_format.txt', sep='\t')

In [14]:
cyscp_3xtg_brain['jointRPCA_co-occur_value'].describe()

count    1.692780e+06
mean     1.797493e-02
std      5.856040e-01
min     -9.999997e-01
25%     -4.839313e-01
50%      9.210277e-03
75%      5.329324e-01
max      9.999999e-01
Name: jointRPCA_co-occur_value, dtype: float64

In [15]:
cyscp_3xtg_brain.shape

(1692780, 3)

In [16]:
cyscp_3xtg_brain[np.abs(cyscp_3xtg_brain['jointRPCA_co-occur_value']) > 0.5].shape
#reduced by half

(859400, 3)

In [17]:
short_brain = cyscp_3xtg_brain[np.abs(cyscp_3xtg_brain['jointRPCA_co-occur_value']) > 0.5].copy()
short_brain['jointRPCA_co-occur_value'].describe()

count    859400.000000
mean          0.041586
std           0.772046
min          -1.000000
25%          -0.740169
50%           0.524789
75%           0.785735
max           1.000000
Name: jointRPCA_co-occur_value, dtype: float64

In [18]:
short_brain.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G000005825-871,G000005825,871,0.96534
G000005825-440,G000005825,440,0.855243
G000005825-928,G000005825,928,0.600555
G000005825-659,G000005825,659,-0.759782
G000005825-851,G000005825,851,0.742475


In [19]:
short_brain.to_csv('../data/JointRPCA/3xtg_brain_cytoscape_format_cutoff50.txt', sep='\t')

In [110]:
taxonomy = pd.read_csv('../../U19_3XTG/data/Microbiome/pooling_filtered/gg2_taxonomy/taxonomy.tsv',
                      sep='\t', index_col=0)
taxonomy.head()

Unnamed: 0_level_0,Taxon,Confidence
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1
G000262225,d__Bacteria; p__Fusobacteriota; c__Fusobacteri...,1.0
G006716645,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,1.0
G001310255,d__Bacteria; p__Proteobacteria; c__Alphaproteo...,1.0
G900604515,d__Bacteria; p__Actinobacteriota; c__Actinomyc...,1.0
G001813085,d__Bacteria; p__Actinobacteriota; c__Actinomyc...,1.0


In [111]:
txs = ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
taxonomy[txs] = taxonomy['Taxon'].str.split(';', expand=True)
taxonomy.head()

Unnamed: 0_level_0,Taxon,Confidence,Domain,Phylum,Class,Order,Family,Genus,Species
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
G000262225,d__Bacteria; p__Fusobacteriota; c__Fusobacteri...,1.0,d__Bacteria,p__Fusobacteriota,c__Fusobacteriia,o__Fusobacteriales_993521,f__Fusobacteriaceae_993521,g__Fusobacterium_C,s__Fusobacterium_C necrophorum
G006716645,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,1.0,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Sphingobacteriales,f__Sphingobacteriaceae,g__Pararcticibacter,s__Pararcticibacter tournemirensis
G001310255,d__Bacteria; p__Proteobacteria; c__Alphaproteo...,1.0,d__Bacteria,p__Proteobacteria,c__Alphaproteobacteria,o__Caulobacterales,f__Caulobacteraceae,g__Brevundimonas,s__Brevundimonas aurantiaca
G900604515,d__Bacteria; p__Actinobacteriota; c__Actinomyc...,1.0,d__Bacteria,p__Actinobacteriota,c__Actinomycetia,o__Actinomycetales,f__Actinomycetaceae,g__Arcanobacterium_A_386370,s__Arcanobacterium_A_386370 ihumii
G001813085,d__Bacteria; p__Actinobacteriota; c__Actinomyc...,1.0,d__Bacteria,p__Actinobacteriota,c__Actinomycetia,o__Actinomycetales,f__Actinomycetaceae,g__Gleimia,s__Gleimia europaea_A


In [219]:
tog1_fecal = pd.merge(cyscp_3xtg_fecal, taxonomy[['Taxon', 'Family', 'Genus', 'Phylum']], 
                      left_on='OGU', right_index=True)
tog1_brain = pd.merge(cyscp_3xtg_brain, taxonomy[['Taxon', 'Family', 'Genus', 'Phylum']], 
                      left_on='OGU', right_index=True)
tog1_brain.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value,Taxon,Family,Genus,Phylum
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
G000005825-532,G000005825,532,0.035114,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D
G000005825-703,G000005825,703,0.36,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D
G000005825-816,G000005825,816,-0.007822,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D
G000005825-881,G000005825,881,0.19713,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D
G000005825-1017,G000005825,1017,0.345076,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D


In [25]:
metab_id_fecal = pd.read_csv('../data/Metabolome/26112024/annotation/3xtg_fecal_spf_features_info_mn.csv',
                            index_col=0)
metab_id_brain = pd.read_csv('../data/Metabolome/26112024/annotation/3xtg_brain_features_info_mn.csv',
                            index_col=0)
metab_id_brain.head()
#plan to use Compound_Name and NPC#class

Unnamed: 0_level_0,VIP1,SPF_genotype,VIP2,GF_genotype,VIP3,3xTG_colonization,VIP4,WT_colonization,VIP_mean,Exclusivity,...,Microbiome,mz,RT,Corr_ID,Compound_Name,Adduct_GNPS,Adduct_SIRIUS,NPC#pathway,NPC#superclass,NPC#class
Feature_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
383,4.567,Mut,4.439,Mut,,,1.976,GF,3.661,,...,No,284.0986,0.3671,,GUANOSINE - 20.0 eV,M+H,[M + H]+,Carbohydrates,Nucleosides,Purine nucleos(t)ides
600,3.082,Mut,3.375,Mut,,,,,3.229,,...,No,269.0877,0.3721,23.0,INOSINE - 20.0 eV,M+H,[M + H]+,Carbohydrates,Nucleosides,Purine nucleos(t)ides
1417,2.784,WT,,,3.523,GF,,,3.154,,...,No,266.1383,2.0846,,,,[M + H]+,Fatty acids,Fatty esters,Fatty acyl carnitines
403,2.43,WT,4.46,WT,,,1.87,GF,2.92,,...,No,276.1439,0.3963,,,,[M + H]+,Fatty acids,Fatty esters,Fatty acyl carnitines
738,1.863,WT,4.053,WT,2.456,SPF,,,2.791,,...,No,185.1284,0.7642,,,,[M + H]+,Alkaloids,Lysine alkaloids,Piperidine alkaloids


In [26]:
print('metab fecal ', metab_id_fecal.shape)
print('metab brain ', metab_id_brain.shape)

metab fecal  (2055, 17)
metab brain  (268, 21)


In [220]:
tog2_fecal = pd.merge(tog1_fecal, metab_id_fecal[['Compound_Name', 'NPC#class', 'NPC#superclass']], 
                      left_on='metab_row_id', right_index=True)
tog2_brain = pd.merge(tog1_brain, metab_id_brain[['Compound_Name', 'NPC#class', 'NPC#superclass']], 
                      left_on='metab_row_id', right_index=True)
tog2_brain.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value,Taxon,Family,Genus,Phylum,Compound_Name,NPC#class,NPC#superclass
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
G000005825-221,G000005825,221,-0.012077,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D,,Aminoacids,Small peptides
G000006785-221,G000006785,221,-0.206482,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Streptococcaceae,g__Streptococcus,p__Firmicutes_D,,Aminoacids,Small peptides
G000006865-221,G000006865,221,-0.006841,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Streptococcaceae,g__Lactococcus_A_346120,p__Firmicutes_D,,Aminoacids,Small peptides
G000006925-221,G000006925,221,-0.011491,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria,,Aminoacids,Small peptides
G000007265-221,G000007265,221,-0.073954,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Streptococcaceae,g__Streptococcus,p__Firmicutes_D,,Aminoacids,Small peptides


In [221]:
tog2_fecal['metab_row_id'] = 'metab'+tog2_fecal['metab_row_id'].astype(str)
tog2_brain['metab_row_id'] = 'metab'+tog2_brain['metab_row_id'].astype(str)
#no underscores or dashes for formatting purposes

In [222]:
tog2_fecal['jRPCA_values_scaled'] = (tog2_fecal['jointRPCA_co-occur_value'] + 1) * 50
tog2_brain['jRPCA_values_scaled'] = (tog2_brain['jointRPCA_co-occur_value'] + 1) * 50
#does not work with negative values - convert scale from -1:1 to 0:100

In [223]:
print('fecal ', tog2_fecal.shape)
print('brain ', tog2_brain.shape)

fecal  (3480416, 11)
brain  (477402, 11)


In [155]:
tog2_fecal.to_csv('../data/JointRPCA/3xtg_sac_fecal_cytoscape_format3.txt', sep='\t')

In [156]:
tog2_brain.to_csv('../data/JointRPCA/3xtg_brain_cytoscape_format3.txt', sep='\t')

In [224]:
tog2_brain.columns

Index(['OGU', 'metab_row_id', 'jointRPCA_co-occur_value', 'Taxon', 'Family',
       'Genus', 'Phylum', 'Compound_Name', 'NPC#class', 'NPC#superclass',
       'jRPCA_values_scaled'],
      dtype='object')

In [83]:
def select_rows_with_multiple_values(df):
    """
    Selects rows where the unique strings in 'OGU' or 'metab_row_id' have 
    multiple (more than 1) values in 'jRPCA_values_scaled' column 
    within the ranges 0-10 or 90-100.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: The filtered DataFrame.
    """

    filtered_df = df[(df['jRPCA_values_scaled'] <= 10)|(90 <= df['jRPCA_values_scaled'])]
    
    group_counts1 = filtered_df.groupby('metab_row_id')['jRPCA_values_scaled'].count()
    metab_selected_ids = group_counts1[group_counts1 > 100].index.tolist()
    
    group_counts2 = filtered_df.groupby('OGU')['jRPCA_values_scaled'].count()
    micro_selected_ids = group_counts2[group_counts2 > 50].index.tolist()
    
    filtered_df2 = filtered_df[filtered_df['metab_row_id'].isin(metab_selected_ids)]
    filtered_df3 = filtered_df2[filtered_df2['OGU'].isin(micro_selected_ids)]
    
    return filtered_df3

In [158]:
tog2_brainr = tog2_brain[(tog2_brain['jRPCA_values_scaled'] <= 10)|(90 <= tog2_brain['jRPCA_values_scaled'])]
tog2_brainr.shape

(100970, 10)

In [159]:
group_counts1 = tog2_brainr.groupby('metab_row_id')['jRPCA_values_scaled'].count()
metab_selected_ids = group_counts1[group_counts1 > 100].index.tolist()
    
group_counts2 = tog2_brainr.groupby('OGU')['jRPCA_values_scaled'].count()
micro_selected_ids = group_counts2[group_counts2 > 50].index.tolist()

In [160]:
tog2_brainr2 = tog2_brainr[tog2_brainr['metab_row_id'].isin(metab_selected_ids)]
tog2_brainr3 = tog2_brainr2[tog2_brainr2['OGU'].isin(micro_selected_ids)]
tog2_brainr3.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value,Taxon,Family,Genus,Compound_Name,NPC#class,NPC#superclass,jRPCA_values_scaled
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
G000621705-357,G000621705,metab357,-0.895334,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,f__Prolixibacteraceae,g__Prolixibacter,,Simple amide alkaloids,Peptide alkaloids,5.233308
G000974365-357,G000974365,metab357,-0.890002,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,f__Marinilabiliaceae,g__Geofilum_869386,,Simple amide alkaloids,Peptide alkaloids,5.499898
G001483965-357,G001483965,metab357,-0.819508,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Carnobacteriaceae,g__Carnobacterium_A_320743,,Simple amide alkaloids,Peptide alkaloids,9.024613
G002000245-357,G002000245,metab357,-0.944936,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,f__Sphingobacteriaceae,g__Sphingobacterium,,Simple amide alkaloids,Peptide alkaloids,2.75319
G003234935-357,G003234935,metab357,-0.800186,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,f__Crocinitomicaceae,g__Putridiphycobacter,,Simple amide alkaloids,Peptide alkaloids,9.990718


In [161]:
tog2_brainr3.shape

(62497, 10)

In [225]:
tog3_brain = select_rows_with_multiple_values(tog2_brain)
tog3_brain.shape

(62497, 11)

In [120]:
tog3_fecal = select_rows_with_multiple_values(tog2_fecal)
tog3_fecal.shape

(1076752, 9)

In [96]:
import networkx as nx
from sklearn.cluster import KMeans 
from pygraphml import Graph, Node, Edge

In [90]:
def create_graphml_from_df(df, output_file_path, n_clusters, percentile_cutoff):
    """
    Converts a jointRPCA correlation df to a GraphML file.

    Args:
        co-occurence table: A 2D NumPy array representing the distance matrix.
        output_file: The name of the output GraphML file. Defaults to "graph.graphml".
    """

    if df.shape[1] != 5:
        raise ValueError("Table must have exactly 5 columns: microbe_id, microbe_name, metabolite_id, metabolite_name, correlation.")

    df.columns = ['microbe_id', 'metabolite_id', 'microbe_name', 'metabolite_name', 'correlation']

    # Calculate correlation thresholds - no longer needed, filtered above
    #lower_threshold = df['correlation'].quantile((100 - percentile_cutoff) / 100)
    #upper_threshold = df['correlation'].quantile(percentile_cutoff / 100)

    # Filter DataFrame based on correlation thresholds  - no longer needed, filtered above
    #filtered_df = df[(df['correlation'] <= lower_threshold) | (df['correlation'] >= upper_threshold)]
    filtered_df = df #just so I don't have to alter the rest of the code
    
    #Create a list for edge colors
    edge_color = []
    
    # Create a graph
    G = nx.Graph()

    # Create lists of unique microbe and metabolite IDs
    microbe_ids = filtered_df['microbe_id'].unique()
    metabolite_ids = filtered_df['metabolite_id'].unique()

    # Create feature matrices for clustering
    microbe_feature_matrix = []
    for cid in microbe_ids:
        row = []
        for tid in metabolite_ids:
            # Get correlation value (or other relevant feature)
            correlation = filtered_df[(filtered_df['microbe_id'] == cid) & (filtered_df['metabolite_id'] == tid)]['correlation'].values
            row.append(correlation[0] if len(correlation) > 0 else 0)  # Handle missing correlations
        microbe_feature_matrix.append(row)

    metabolite_feature_matrix = []
    for tid in metabolite_ids:
        row = []
        for cid in microbe_ids:
            # Get correlation value (or other relevant feature)
            correlation = filtered_df[(filtered_df['microbe_id'] == cid) & (filtered_df['metabolite_id'] == tid)]['correlation'].values
            row.append(correlation[0] if len(correlation) > 0 else 0)  # Handle missing correlations
        metabolite_feature_matrix.append(row)

    # Perform k-means clustering
    kmeans_microbe = KMeans(n_clusters=n_clusters, random_state=42)
    microbe_clusters = kmeans_microbe.fit_predict(microbe_feature_matrix)

    kmeans_metabolite = KMeans(n_clusters=n_clusters, random_state=42)
    metabolite_clusters = kmeans_metabolite.fit_predict(metabolite_feature_matrix)
    
    
    #Graph modifications
    # Add nodes to the graph with cluster information
    for index, row in filtered_df.iterrows():
        microbe_id = row['microbe_id']
        microbe_name = row['microbe_name']
        metabolite_id = row['metabolite_id']
        metabolite_name = row['metabolite_name']
        correlation = row['correlation']
        
        # Get cluster assignments
        microbe_cluster = microbe_clusters[list(microbe_ids).index(microbe_id)]
        metabolite_cluster = metabolite_clusters[list(metabolite_ids).index(metabolite_id)]
        
        G.add_node(microbe_id, name=microbe_name, type="microbe", cluster=microbe_cluster)
        G.add_node(metabolite_id, name=metabolite_name, type="metabolite", cluster=metabolite_cluster)
        
        # Create interaction string 
        interaction = f"{microbe_id}_{metabolite_id}"
        
        # Count the number of connections for each node
        microbe_degrees = dict(G.degree(G.nodes()))
        metabolite_degrees = dict(G.degree(G.nodes()))
    
        # Determine edge color   
        if microbe_degrees[microbe_id] > 50:
            edge_color = "red" # Change color for edges with multiple connections
        elif metabolite_degrees[metabolite_id] > 100 :
            edge_color = "blue"  # Change color for edges with multiple connections
        else:
            edge_color = "black"  # Default color
        
        G.add_edge(microbe_id, metabolite_id, weight=row['correlation'], 
                   interaction=interaction, color=edge_color)

    nx.write_graphml(G, output_file_path)
    print(f"GraphML file created successfully: {output_file_path}")

In [133]:
tog4_brain = tog3_brain[['OGU', 'metab_row_id', 'Genus', 'NPC#class', 'jRPCA_values_scaled']].copy()

In [37]:
import time

In [92]:
t0 = time.time()
create_graphml_from_df(tog4_brain, n_clusters=10, percentile_cutoff=75,
                         output_file_path='../data/JointRPCA/3xtg_brain_microbe-metabolite_network.graphml')
t1 = time.time()
total = t1-t0
print("total hours: ", total/3600)
#generally has fewer than fecal
#currently using NPC#class, but consider Compound_Name
#took a little over 4 hours orginally, cutoff 75
#with new modifications and extra features, now about 5 hours
#with new filtering from above, no percentile cutoff, now about 3 hours

GraphML file created successfully: ../data/JointRPCA/3xtg_brain_microbe-metabolite_network.graphml
total hours:  2.8404248519738515


In [254]:
tog5_brain = tog3_brain[['OGU', 'metab_row_id', 'Genus', 'Family', 'Phylum', 'NPC#class', 'NPC#superclass', 'jRPCA_values_scaled']].copy()

In [255]:
tog5_brain.Phylum.unique()
#make these all levels

array([' p__Bacteroidota', ' p__Firmicutes_D',
       ' p__Desulfobacterota_G_459546', ' p__Firmicutes_A',
       ' p__Proteobacteria', ' p__Desulfobacterota_I',
       ' p__Fusobacteriota', ' p__Planctomycetota', ' p__Firmicutes_C',
       ' p__Firmicutes_B_370529', ' p__Actinobacteriota',
       ' p__Campylobacterota', ' p__Verrucomicrobiota',
       ' p__Deferribacterota', ' p__Riflebacteria',
       ' p__Bdellovibrionota_E', ' p__Patescibacteria',
       ' p__Desulfobacterota_C', ' p__Cyanobacteria', ' p__Firmicutes_G'],
      dtype=object)

In [256]:
tog5_brain['NPC#superclass'].unique()
#make these all levels

array(['Peptide alkaloids', nan, 'Tetramate alkaloids', 'Pseudoalkaloids',
       'Small peptides', 'Nicotinic acid alkaloids', 'Fatty acyls',
       'Ornithine alkaloids', 'Fatty esters', 'Saccharides',
       'Aminosugars and aminoglycosides', 'Fatty Acids and Conjugates',
       'Glycerophospholipids', 'Lysine alkaloids', 'Tyrosine alkaloids',
       'Nucleosides', 'Eicosanoids', 'Fatty amides', 'Steroids',
       'Tryptophan alkaloids', 'Linear polyketides', 'Oligopeptides',
       'Histidine alkaloids', 'Phenolic acids (C6-C1)',
       'Pseudoalkaloids (transamidation)'], dtype=object)

In [257]:
tog5_brain['collapsed_pair'] = tog5_brain['Genus'].astype(str)+"-"+tog5_brain['NPC#class']

In [258]:
print("Number of all microbe-metabolite pairs:", len(tog4_brain.collapsed_pair.values))
print("Number of unique pairs:", len(tog4_brain.collapsed_pair.unique()))

Number of all microbe-metabolite pairs: 62497
Number of unique pairs: 11202


In [259]:
tog5_brain.collapsed_pair.describe()

count                                   44478
unique                                  11201
top        g__Limosilactobacillus-Tripeptides
freq                                      276
Name: collapsed_pair, dtype: object

In [262]:
t5b = tog5_brain.collapsed_pair.value_counts().reset_index()
t5b2 = t5b.set_index('index')
t5b3 = t5b2[t5b2['collapsed_pair']>50]
t5b3.shape

(66, 1)

In [261]:
t5b2['collapsed_pair'].describe()

count    11201.000000
mean         3.970895
std          9.400724
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max        276.000000
Name: collapsed_pair, dtype: float64

In [268]:
def create_graphml_from_dataframe(df, output_filename="network.graphml", count_threshold=5):
    """
    Creates a GraphML file from a Pandas DataFrame
    Dataframe should be from jointRPCA co-occurence matrix
    
    Function allows visualizing microbe-metabolite relationships in Cytoscape.

    Args:
        df: Pandas DataFrame
        output_filename: Name of the output GraphML file. defaults to network.graphml
    """
    if df.shape[1] != 5:
        raise ValueError("Table must have exactly 5 columns: microbe_id, microbe_name, metabolite_id, metabolite_name, correlation.")

    df.columns = ['microbe_name', 'metabolite_name', 'microbe_group', 'metabolite_group', 'correlation']    
    
    # Group and aggregate (optimized)
    grouped = df.groupby(['microbe_name', 'metabolite_name', 'microbe_group', 'metabolite_group'])['correlation'].agg(['count', 'mean']).reset_index()

    # Create NetworkX graph
    graph = nx.Graph()

    # Add the central node
    graph.add_node("jointRPCA", type="central")  # Add type attribute

    # Add nodes and edges for hierarchy
    for _, row in df.iterrows():
        microbe_phylum = row['microbe_group']
        microbe_genus = row['microbe_name']
        metabolite_superclass = row['metabolite_group']
        metabolite_class = row['metabolite_name']

        if not graph.has_node(microbe_phylum):
            graph.add_node(microbe_phylum, type="phylum")
        if not graph.has_node(metabolite_superclass):
            graph.add_node(metabolite_superclass, type="superclass")
        if not graph.has_node(microbe_genus):
            graph.add_node(microbe_genus, type="genus", count=0)
        if not graph.has_node(metabolite_class):
            graph.add_node(metabolite_class, type="class", count=0)

        graph.add_edge("jointRPCA", microbe_phylum, type='jointRPCA-Phylum')
        graph.add_edge("jointRPCA", metabolite_superclass, type='jointRPCA-SuperClass')
        graph.add_edge(microbe_phylum, microbe_genus, type='Phylum-Genus')
        graph.add_edge(metabolite_superclass, metabolite_class, type='SuperClass-Class')


    microbe_counts = grouped.groupby('microbe_name')['count'].sum().to_dict()
    metabolite_counts = grouped.groupby('metabolite_name')['count'].sum().to_dict()

    for _, row in grouped.iterrows():
        microbe = row['microbe_name']
        metabolite = row['metabolite_name']
        correlation = row['mean']

        count = microbe_counts.get(microbe, 0) + metabolite_counts.get(metabolite, 0)
        if count > count_threshold:  # Check the threshold here!
            total_count = microbe_counts.get(microbe, 0) + metabolite_counts.get(metabolite, 0)
            if graph.has_node(microbe):
                graph.nodes[microbe]['count'] = total_count
            if graph.has_node(metabolite):
                graph.nodes[metabolite]['count'] = total_count
#       else: #If count is below the threshold, don't include the count attribute
#         if graph.has_node(microbe):
#             graph.nodes[microbe]['count'] = count
#         if graph.has_node(metabolite):
#             graph.nodes[metabolite]['count'] = count

        graph.add_edge(microbe, metabolite, correlation=correlation, type="interaction")

        if correlation > 98:
            graph[microbe][metabolite]['color'] = 'red'
        elif correlation < 2:
            graph[microbe][metabolite]['color'] = 'blue'
        else:
            graph[microbe][metabolite]['color'] = 'gray'

    nx.write_graphml(graph, output_filename)  # Use networkx's write_graphml
    print(f"GraphML file created: {output_filename}")

In [234]:
tog6_brain = tog5_brain[['Genus', 'NPC#class', 'Phylum', 'NPC#superclass', 'jRPCA_values_scaled']].copy()

In [269]:
t0 = time.time()
create_graphml_from_dataframe(tog6_brain, count_threshold=50,
                              output_filename='../data/JointRPCA/3xtg_brain_microbe-metabolite_network2.graphml')
t1 = time.time()
total = t1-t0
print("total hours: ", total/3600)
#This took a second

GraphML file created: ../data/JointRPCA/3xtg_brain_microbe-metabolite_network2.graphml
total hours:  0.0005200694004694621


In [None]:
# Need the below file added on

In [270]:
tog6_brain.head()

Unnamed: 0_level_0,microbe_name,metabolite_name,microbe_group,metabolite_group,correlation
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
G000621705-357,g__Prolixibacter,Simple amide alkaloids,p__Bacteroidota,Peptide alkaloids,5.233308
G000974365-357,g__Geofilum_869386,Simple amide alkaloids,p__Bacteroidota,Peptide alkaloids,5.499898
G001483965-357,g__Carnobacterium_A_320743,Simple amide alkaloids,p__Firmicutes_D,Peptide alkaloids,9.024613
G002000245-357,g__Sphingobacterium,Simple amide alkaloids,p__Bacteroidota,Peptide alkaloids,2.75319
G003234935-357,g__Putridiphycobacter,Simple amide alkaloids,p__Bacteroidota,Peptide alkaloids,9.990718


In [354]:
brain_countz = tog6_brain.reset_index()
brain_countz[['microbe_id', 'metabolite_id']] = brain_countz['node'].str.split("-", expand=True)
brain_countz.head()

Unnamed: 0,node,microbe_name,metabolite_name,microbe_group,metabolite_group,correlation,microbe_id,metabolite_id
0,G000621705-357,g__Prolixibacter,Simple amide alkaloids,p__Bacteroidota,Peptide alkaloids,5.233308,G000621705,357
1,G000974365-357,g__Geofilum_869386,Simple amide alkaloids,p__Bacteroidota,Peptide alkaloids,5.499898,G000974365,357
2,G001483965-357,g__Carnobacterium_A_320743,Simple amide alkaloids,p__Firmicutes_D,Peptide alkaloids,9.024613,G001483965,357
3,G002000245-357,g__Sphingobacterium,Simple amide alkaloids,p__Bacteroidota,Peptide alkaloids,2.75319,G002000245,357
4,G003234935-357,g__Putridiphycobacter,Simple amide alkaloids,p__Bacteroidota,Peptide alkaloids,9.990718,G003234935,357


In [349]:
brain_counts = tog6_brain.groupby(['microbe_name', 'metabolite_name', 
                                   'microbe_group', 'metabolite_group'
                                  ]).size().reset_index(name='counts')
brain_counts.head()

Unnamed: 0,microbe_name,metabolite_name,microbe_group,metabolite_group,counts
0,g__,Aminoacids,p__Bacteroidota,Small peptides,8
1,g__,Aminoacids,p__Desulfobacterota_G_459546,Small peptides,3
2,g__,Aminoacids,p__Proteobacteria,Small peptides,6
3,g__,Aminoacids,p__Verrucomicrobiota,Small peptides,5
4,g__,Branched fatty acids,p__Bacteroidota,Fatty Acids and Conjugates,1


In [288]:
brain_counts.counts.describe()

count    13771.000000
mean         3.229831
std          7.962164
min          1.000000
25%          1.000000
50%          1.000000
75%          3.000000
max        274.000000
Name: counts, dtype: float64

In [289]:
brain_counts.shape

(13771, 5)

In [302]:
brain_counts20 = brain_counts[brain_counts.counts>=20].copy()
brain_counts20.shape

(279, 5)

In [309]:
brain_counts20.metabolite_name.unique()

array(['Tripeptides', 'Aminoacids', 'Dipeptides',
       'Glycerophosphocholines', 'Linear peptides', 'Thia fatty acids',
       'Lipopeptides', 'Fatty acyl carnitines', 'Purine nucleos(t)ides',
       'Cyclic peptides', 'Gallotannins', 'Glycerophosphoethanolamines',
       'Oxidized glycerophospholipids', 'Thiodiketopiperazine alkaloids',
       'Pyridine alkaloids', 'Branched fatty acids', 'Cephalosporins',
       'Imidazole alkaloids', 'N-acyl amines',
       'N-acyl ethanolamines (endocannabinoids)', 'Polyamines',
       'Pyrrolidine alkaloids', 'Simple amide alkaloids',
       'Purine alkaloids', 'Wax monoesters'], dtype=object)

In [314]:
brain_counts20.metabolite_group.unique()

array(['Small peptides', 'Glycerophospholipids', 'Oligopeptides',
       'Fatty Acids and Conjugates', 'Fatty esters', 'Nucleosides',
       'Phenolic acids (C6-C1)', 'Nicotinic acid alkaloids',
       'Tryptophan alkaloids', 'Linear polyketides', 'Lysine alkaloids',
       'Histidine alkaloids', 'Fatty amides', 'Ornithine alkaloids',
       'Aminosugars and aminoglycosides', 'Pseudoalkaloids'], dtype=object)

In [311]:
brain_counts20[brain_counts20.metabolite_name=='Fatty acyl carnitines']

Unnamed: 0,microbe_name,metabolite_name,microbe_group,metabolite_group,counts
3610,g__Clostridium_T,Fatty acyl carnitines,p__Firmicutes_A,Fatty esters,29
4023,g__Cryptobacteroides,Fatty acyl carnitines,p__Bacteroidota,Fatty esters,31
5450,g__Escherichia_710834,Fatty acyl carnitines,p__Proteobacteria,Fatty esters,31
8228,g__Ligilactobacillus,Fatty acyl carnitines,p__Firmicutes_D,Fatty esters,34
8327,g__Limosilactobacillus,Fatty acyl carnitines,p__Firmicutes_D,Fatty esters,37
8376,g__Liquorilactobacillus,Fatty acyl carnitines,p__Firmicutes_D,Fatty esters,30
10184,g__Pediococcus,Fatty acyl carnitines,p__Firmicutes_D,Fatty esters,22
12018,g__Streptococcus,Fatty acyl carnitines,p__Firmicutes_D,Fatty esters,20


In [308]:
brain_counts20[brain_counts20.counts>100]

Unnamed: 0,microbe_name,metabolite_name,microbe_group,metabolite_group,counts
3643,g__Clostridium_T,Tripeptides,p__Firmicutes_A,Small peptides,102
4055,g__Cryptobacteroides,Tripeptides,p__Bacteroidota,Small peptides,123
5449,g__Escherichia_710834,Dipeptides,p__Proteobacteria,Small peptides,107
5483,g__Escherichia_710834,Tripeptides,p__Proteobacteria,Small peptides,223
7750,g__Lacticaseibacillus,Tripeptides,p__Firmicutes_D,Small peptides,112
7794,g__Lactobacillus,Dipeptides,p__Firmicutes_D,Small peptides,117
7797,g__Lactobacillus,Glycerophosphocholines,p__Firmicutes_D,Glycerophospholipids,160
7817,g__Lactobacillus,Tripeptides,p__Firmicutes_D,Small peptides,201
8259,g__Ligilactobacillus,Tripeptides,p__Firmicutes_D,Small peptides,179
8316,g__Limosilactobacillus,Aminoacids,p__Firmicutes_D,Small peptides,113


In [303]:
df = brain_counts20
for (mic_id, met_id, mic_group, met_group), group in df.groupby(['microbe_name', 'metabolite_name', 'microbe_group', 'metabolite_group']):
        mic_group_counts = df[df['microbe_group'] == mic_group]['counts'].sum()
        met_group_counts = df[df['metabolite_group'] == met_group]['counts'].sum()
        mic_id_counts = df[(df['microbe_group'] == mic_group) & (df['microbe_name'] == mic_id)]['counts'].sum()
        met_id_counts = df[(df['metabolite_group'] == met_group) & (df['metabolite_name'] == met_id)]['counts'].sum()
        g_counts = group['counts'].sum()

In [304]:
print(mic_group_counts)
print(mic_id_counts)
print(met_group_counts)
print(met_id_counts)
print(g_counts)
#counts the number of jointRPCA co-occurences

8160
215
7101
4243
83


In [342]:
#export information to import into Cytoscape and connect to graphml

def transform_data(df, df_orig, organ):
    # Creating an empty list to store dfs
    transformed_dfs = [] 

    # Aggregating at each level and calculating node counts
    for (mic_id, met_id, mic_group, met_group), group in df.groupby(['microbe_name', 'metabolite_name', 'microbe_group', 'metabolite_group']):
        mic_group_counts = df[df['microbe_group'] == mic_group]['counts'].sum()
        met_group_counts = df[df['metabolite_group'] == met_group]['counts'].sum()
        mic_id_counts = df[(df['microbe_group'] == mic_group) & (df['microbe_name'] == mic_id)]['counts'].sum()
        met_id_counts = df[(df['metabolite_group'] == met_group) & (df['metabolite_name'] == met_id)]['counts'].sum()
        pair_counts = group['counts'].sum()

        # Define levels and their respective counts
        levels = [
            (mic_group, 'level_1', mic_group_counts),
            (met_group, 'level_1', met_group_counts),
            (f'{mic_group}_{mic_id}', 'level_2', mic_id_counts),
            (f'{met_group}_{met_id}', 'level_2', met_id_counts),
            (f'{mic_id}_{met_id}', 'level_3', pair_counts),
        ]

        # Adding entries for each level
        for node, level, counts in levels:
            node_count = counts
            new_df = pd.DataFrame([{  # Create a small DataFrame for each row
                'node': node,
                'level': level,
                'counts': counts,
                'node_counts': f'{node} ({node_count})'
            }])
            transformed_dfs.append(new_df)  # Append to the list of DataFrames
    
    # Adding the organism row
    total_rows = len(df_orig)  # Gets the number of rows in the input dataframe
    organ_row = {'node': organ,
                 'level': 'level_0',
                 'counts': total_rows,
                 'node_counts': f'{organ} ({total_rows})'}
    
    #create the df
    organ_df = pd.DataFrame([organ_row])
    
    #add to df list
    transformed_dfs.append(organ_df)
    
    #combine all dfs
    transformed_df = pd.concat(transformed_dfs, ignore_index=True)

    # Removing duplicate entries and resetting index
    transformed_df = transformed_df.drop_duplicates().reset_index(drop=True)

    # add another column label_counts
    transformed_df['label_counts'] = transformed_df['node'].str.split('_').str[-1] + ' (' + transformed_df['counts'].astype(str) + ')'

    return transformed_df

In [345]:
df_nodes = transform_data(brain_counts20, tog6_brain, organ='brain')
df_nodes.to_csv('../data/JointRPCA/3xtg_brain_additional_info_Cytoscape.tsv', sep='\t')

In [346]:
df_nodes

Unnamed: 0,node,level,counts,node_counts,label_counts
0,p__Firmicutes_A,level_1,627,p__Firmicutes_A (627),A (627)
1,Small peptides,level_1,7101,Small peptides (7101),Small peptides (7101)
2,p__Firmicutes_A_ g__Acetivibrio,level_2,27,p__Firmicutes_A_ g__Acetivibrio (27),Acetivibrio (27)
3,Small peptides_Tripeptides,level_2,4243,Small peptides_Tripeptides (4243),Tripeptides (4243)
4,g__Acetivibrio_Tripeptides,level_3,27,g__Acetivibrio_Tripeptides (27),Tripeptides (27)
...,...,...,...,...,...
415,g__Weissella_A_338544_Dipeptides,level_3,42,g__Weissella_A_338544_Dipeptides (42),Dipeptides (42)
416,g__Weissella_A_338544_Glycerophosphocholines,level_3,36,g__Weissella_A_338544_Glycerophosphocholines ...,Glycerophosphocholines (36)
417,g__Weissella_A_338544_Lipopeptides,level_3,20,g__Weissella_A_338544_Lipopeptides (20),Lipopeptides (20)
418,g__Weissella_A_338544_Tripeptides,level_3,83,g__Weissella_A_338544_Tripeptides (83),Tripeptides (83)


In [386]:
def transform_data_sheet(df, focus, organ):
    df_all = df.reset_index()
    df_all[['microbe_id', 'metabolite_id']] = df_all['node'].str.split("-", expand=True)
    
    # Creating an empty list to store dfs
    transformed_dfs = [] 

    # Aggregating at each level and calculating node counts
    if focus == "metabolites":
        metab_df = df_all[['metabolite_group', 'metabolite_name', 'metabolite_id', 'correlation']].copy()
        metab_df2 = metab_df.groupby(['metabolite_group', 'metabolite_name', 
                           'metabolite_id']).size().reset_index(name='counts')
        for (met_group, met_class, met_id), group in metab_df2.groupby(['metabolite_group', 'metabolite_name', 'metabolite_id']):
            met_group_counts = metab_df2[metab_df2['metabolite_group'] == met_group]['counts'].sum()
            met_name_counts = metab_df2[(metab_df2['metabolite_group'] == met_group) & (metab_df2['metabolite_name'] == met_class)]['counts'].sum()
            met_id_counts = group['counts'].sum()

            # Define levels and their respective counts
            levels = [
                (met_group, 'level_1', met_group_counts),
                (f'{met_group}_{met_class}', 'level_2', met_name_counts),
                (f'{met_group}_{met_class}_{met_id}', 'level_3', met_id_counts),
            ]

            # Adding entries for each level
            for node, level, counts in levels:
                node_count = counts
                new_df = pd.DataFrame([{  # Create a small DataFrame for each row
                    'node': node,
                    'level': level,
                    'counts': counts,
                    'node_counts': f'{node} ({node_count})'
                }])
                transformed_dfs.append(new_df)  # Append to the list of DataFrames
    
    elif focus == "microbes":
        micro_df = df_all[['microbe_group', 'microbe_name', 'microbe_id', 'correlation']].copy()
        micro_df2 = micro_df.groupby(['microbe_group', 'microbe_name', 
                           'microbe_id']).size().reset_index(name='counts')
        
        for (mic_group, mic_genus, mic_id), group in micro_df2.groupby(['microbe_group', 'microbe_name', 'microbe_id']):
            mic_group_counts = micro_df2[micro_df2['microbe_group'] == mic_group]['counts'].sum()
            mic_name_counts = micro_df2[(micro_df2['microbe_group'] == mic_group) & (micro_df2['microbe_name'] == mic_genus)]['counts'].sum()
            mic_id_counts = group['counts'].sum()

            # Define levels and their respective counts
            levels = [
                (mic_group, 'level_1', mic_group_counts),
                (f'{mic_group}_{mic_genus}', 'level_2', mic_name_counts),
                (f'{mic_group}_{mic_genus}_{mic_id}', 'level_3', mic_id_counts),
            ]

            # Adding entries for each level
            for node, level, counts in levels:
                node_count = counts
                new_df = pd.DataFrame([{  # Create a small DataFrame for each row
                    'node': node,
                    'level': level,
                    'counts': counts,
                    'node_counts': f'{node} ({node_count})'
                }])
                transformed_dfs.append(new_df)  # Append to the list of DataFrames
    
    
    else:
        return "focus can only be microbes or metabolites, check spelling and try again"
    

    
    # Adding the organism row
    total_rows = len(df)  # Gets the number of rows in the input dataframe
    organ_row = {'node': organ,
                 'level': 'level_0',
                 'counts': total_rows,
                 'node_counts': f'{organ} ({total_rows})'}
    
    #create the df
    organ_df = pd.DataFrame([organ_row])
    
    #add to df list
    transformed_dfs.append(organ_df)
    
    #combine all dfs
    transformed_df = pd.concat(transformed_dfs, ignore_index=True)

    # Removing duplicate entries and resetting index
    transformed_df = transformed_df.drop_duplicates().reset_index(drop=True)

    # add another column label_counts
    transformed_df['label_counts'] = transformed_df['node'].str.split('_').str[-1] + ' (' + transformed_df['counts'].astype(str) + ')'

    return transformed_df

In [374]:
brain_metab_nodes = transform_data_sheet(tog6_brain, focus='metabolites', organ='brain')
brain_metab_nodes.to_csv('../data/JointRPCA/3xtg_brain_additional_metab-info_Cytoscape.tsv', sep='\t')

In [389]:
brain_micro_nodes = transform_data_sheet(tog6_brain, focus='microbes', organ='brain')
brain_micro_nodes.to_csv('../data/JointRPCA/3xtg_brain_additional_micro-info_Cytoscape.tsv', sep='\t')

In [93]:
tog4_fecal = tog3_fecal[['OGU', 'metab_row_id', 'lowest_taxa_id', 'NPC#class', 'jointRPCA_co-occur_value']].copy()

In [94]:
t0 = time.time()
create_graphml_from_df(tog4_fecal, n_clusters=10, percentile_cutoff=75,
                         output_file_path='../data/JointRPCA/3xtg_fecal_microbe-metabolite_network.graphml')
t1 = time.time()
total = t1-t0
print("total hours: ", total/3600)
#generally has more than brain
#currently using NPC#class, but consider Compound_Name 
#takes about X hours now

KeyboardInterrupt: 

# Cytoscape


1.  File > Import > Network from File.. (select .graphml file)
2.  Then, more stuff...


In [None]:
#need to find a way to make points farther apart - change "gravity"?
#maybe ask Helena or Simone?