In [1]:
import pandas as pd
import numpy as np

In [2]:
combo_metadata = pd.read_csv('../data/Metabolome/26112024/Metadata_all.tsv', sep='\t', index_col=0)
combo_metadata.shape

(2117, 15)

In [3]:
taxonomy = pd.read_csv('../../U19_3XTG/data/Microbiome/pooling_filtered/gg2_taxonomy/taxonomy.tsv',
                      sep='\t', index_col=0)
txs = ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
taxonomy[txs] = taxonomy['Taxon'].str.split(';', expand=True)
taxonomy.head()

Unnamed: 0_level_0,Taxon,Confidence,Domain,Phylum,Class,Order,Family,Genus,Species
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
G000262225,d__Bacteria; p__Fusobacteriota; c__Fusobacteri...,1.0,d__Bacteria,p__Fusobacteriota,c__Fusobacteriia,o__Fusobacteriales_993521,f__Fusobacteriaceae_993521,g__Fusobacterium_C,s__Fusobacterium_C necrophorum
G006716645,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,1.0,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Sphingobacteriales,f__Sphingobacteriaceae,g__Pararcticibacter,s__Pararcticibacter tournemirensis
G001310255,d__Bacteria; p__Proteobacteria; c__Alphaproteo...,1.0,d__Bacteria,p__Proteobacteria,c__Alphaproteobacteria,o__Caulobacterales,f__Caulobacteraceae,g__Brevundimonas,s__Brevundimonas aurantiaca
G900604515,d__Bacteria; p__Actinobacteriota; c__Actinomyc...,1.0,d__Bacteria,p__Actinobacteriota,c__Actinomycetia,o__Actinomycetales,f__Actinomycetaceae,g__Arcanobacterium_A_386370,s__Arcanobacterium_A_386370 ihumii
G001813085,d__Bacteria; p__Actinobacteriota; c__Actinomyc...,1.0,d__Bacteria,p__Actinobacteriota,c__Actinomycetia,o__Actinomycetales,f__Actinomycetaceae,g__Gleimia,s__Gleimia europaea_A


# Functions

In [5]:
def filter_distance_matrix(distance_matrix, row_filter, col_filter):
    """
    Filters a distance matrix based on row and column prefixes.

    Args:
        distance_matrix: A pandas DataFrame representing the distance matrix.
        row_filter: The prefix to filter rows by.
        col_filter: The prefix to filter columns by.

    Returns:
        A filtered pandas DataFrame.
    """

    row_filter_mask = distance_matrix.index.str.startswith(row_filter)
    col_filter_mask = distance_matrix.columns.str.startswith(col_filter)

    filtered_matrix = distance_matrix.loc[row_filter_mask, col_filter_mask]

    return filtered_matrix

def transform_df(df):
    """
    Transforms a DataFrame to have the index as the first column, 
      column names as the second column, and values in a new column.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with the transformed structure.
    """
        
    idx = []
    cols = []
    val = []
    for index, row in df.iterrows():
        for col, value in row.items():
            idx.append(index)
            cols.append(col)
            val.append(value)

      # Create the new DataFrame
    new_df = pd.DataFrame({'OGU':idx, 'metab_row_id':cols, 'jointRPCA_co-occur_value':val})
    
    return new_df

# Sac Fecal Micro and Brain Metab

In [4]:
corr_3xtg_brain = pd.read_csv('../data/JointRPCA/3XTG_sac_brain-v-micro_correlation_table/Correlation.tsv',
                              sep='\t', index_col=0)

In [6]:
corr_3xtg_brain_filt = filter_distance_matrix(corr_3xtg_brain, 
                                              row_filter="G", 
                                              col_filter="brain_") 
corr_3xtg_brain_filt.shape

(1902, 890)

In [7]:
cyscp_3xtg_brain = transform_df(corr_3xtg_brain_filt)
cyscp_3xtg_brain['metab_row_id'] = cyscp_3xtg_brain['metab_row_id'].str.replace('brain_', '')
cyscp_3xtg_brain['metab_row_id'] = cyscp_3xtg_brain['metab_row_id'].astype(int)
cyscp_3xtg_brain['node']= cyscp_3xtg_brain['OGU'].astype(str)+"-"+cyscp_3xtg_brain['metab_row_id'].astype(str)
cyscp_3xtg_brain.set_index('node', inplace=True)
cyscp_3xtg_brain.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G000005825-532,G000005825,532,0.035114
G000005825-703,G000005825,703,0.36
G000005825-816,G000005825,816,-0.007822
G000005825-881,G000005825,881,0.19713
G000005825-1017,G000005825,1017,0.345076


In [8]:
cyscp_3xtg_brain.to_csv('../data/JointRPCA/3xtg_brain_cytoscape_format.txt', sep='\t')

In [9]:
cyscp_3xtg_brain['jointRPCA_co-occur_value'].describe()

count    1.692780e+06
mean     1.797493e-02
std      5.856040e-01
min     -9.999997e-01
25%     -4.839313e-01
50%      9.210277e-03
75%      5.329324e-01
max      9.999999e-01
Name: jointRPCA_co-occur_value, dtype: float64

In [10]:
cyscp_3xtg_brain.shape

(1692780, 3)

In [196]:
cyscp_3xtg_brain[np.abs(cyscp_3xtg_brain['jointRPCA_co-occur_value']) > 0.75].shape
#reduced to 11% of original size

(452287, 3)

In [197]:
short_brain = cyscp_3xtg_brain[np.abs(cyscp_3xtg_brain['jointRPCA_co-occur_value']) > 0.75].copy()
short_brain['jointRPCA_co-occur_value'].describe()

count    452287.000000
mean          0.078395
std           0.878808
min          -1.000000
25%          -0.864605
50%           0.772648
75%           0.895279
max           1.000000
Name: jointRPCA_co-occur_value, dtype: float64

In [198]:
short_brain.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G000005825-871,G000005825,871,0.96534
G000005825-440,G000005825,440,0.855243
G000005825-659,G000005825,659,-0.759782
G000005825-382,G000005825,382,0.968833
G000005825-769,G000005825,769,-0.966495


In [199]:
short_brain.to_csv('../data/JointRPCA/3xtg_brain_cytoscape_format_cutoff0.75.txt', sep='\t')

In [200]:
tog1_brain = pd.merge(short_brain, taxonomy[['Taxon', 'Family', 'Genus', 'Phylum']], 
                      left_on='OGU', right_index=True)
tog1_brain.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value,Taxon,Family,Genus,Phylum
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
G000005825-871,G000005825,871,0.96534,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D
G000005825-440,G000005825,440,0.855243,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D
G000005825-659,G000005825,659,-0.759782,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D
G000005825-382,G000005825,382,0.968833,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D
G000005825-769,G000005825,769,-0.966495,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D


In [201]:
metab_id_brain = pd.read_csv('../data/Metabolome/26112024/annotation/3xtg_brain_features_info_mn.csv',
                            index_col=0)
metab_id_brain.head()
#plan to use Compound_Name and NPC#class

Unnamed: 0_level_0,VIP1,SPF_genotype,VIP2,GF_genotype,VIP3,3xTG_colonization,VIP4,WT_colonization,VIP_mean,Exclusivity,...,Microbiome,mz,RT,Corr_ID,Compound_Name,Adduct_GNPS,Adduct_SIRIUS,NPC#pathway,NPC#superclass,NPC#class
Feature_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
383,4.567,Mut,4.439,Mut,,,1.976,GF,3.661,,...,No,284.0986,0.3671,,GUANOSINE - 20.0 eV,M+H,[M + H]+,Carbohydrates,Nucleosides,Purine nucleos(t)ides
600,3.082,Mut,3.375,Mut,,,,,3.229,,...,No,269.0877,0.3721,23.0,INOSINE - 20.0 eV,M+H,[M + H]+,Carbohydrates,Nucleosides,Purine nucleos(t)ides
1417,2.784,WT,,,3.523,GF,,,3.154,,...,No,266.1383,2.0846,,,,[M + H]+,Fatty acids,Fatty esters,Fatty acyl carnitines
403,2.43,WT,4.46,WT,,,1.87,GF,2.92,,...,No,276.1439,0.3963,,,,[M + H]+,Fatty acids,Fatty esters,Fatty acyl carnitines
738,1.863,WT,4.053,WT,2.456,SPF,,,2.791,,...,No,185.1284,0.7642,,,,[M + H]+,Alkaloids,Lysine alkaloids,Piperidine alkaloids


In [202]:
tog2_brain = pd.merge(tog1_brain, metab_id_brain[['Compound_Name', 'NPC#class', 'NPC#superclass']], 
                      left_on='metab_row_id', right_index=True)
tog2_brain.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value,Taxon,Family,Genus,Phylum,Compound_Name,NPC#class,NPC#superclass
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
G000005825-871,G000005825,871,0.96534,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D,,,
G000006865-871,G000006865,871,0.968249,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Streptococcaceae,g__Lactococcus_A_346120,p__Firmicutes_D,,,
G000006925-871,G000006925,871,-0.963834,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria,,,
G000007265-871,G000007265,871,0.770888,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Streptococcaceae,g__Streptococcus,p__Firmicutes_D,,,
G000007705-871,G000007705,871,0.806282,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Chromobacteriaceae,g__Chromobacterium,p__Proteobacteria,,,


In [203]:
tog2_brain['sentiment'] = np.sign(tog2_brain['jointRPCA_co-occur_value']).replace({-1: 'negative', 1: 'positive', 0: 'neutral'})
tog2_brain.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value,Taxon,Family,Genus,Phylum,Compound_Name,NPC#class,NPC#superclass,sentiment
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
G000005825-871,G000005825,871,0.96534,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D,,,,positive
G000006865-871,G000006865,871,0.968249,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Streptococcaceae,g__Lactococcus_A_346120,p__Firmicutes_D,,,,positive
G000006925-871,G000006925,871,-0.963834,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria,,,,negative
G000007265-871,G000007265,871,0.770888,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Streptococcaceae,g__Streptococcus,p__Firmicutes_D,,,,positive
G000007705-871,G000007705,871,0.806282,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Chromobacteriaceae,g__Chromobacterium,p__Proteobacteria,,,,positive


In [204]:
tog2_brain.shape

(123244, 11)

In [205]:
tog2_brain.columns

Index(['OGU', 'metab_row_id', 'jointRPCA_co-occur_value', 'Taxon', 'Family',
       'Genus', 'Phylum', 'Compound_Name', 'NPC#class', 'NPC#superclass',
       'sentiment'],
      dtype='object')

In [206]:
tog2_brain['NPC#class'].unique()

array([nan, 'Purine alkaloids', 'Wax monoesters', 'Aminoacids',
       'Pyrrolidine alkaloids', 'Branched fatty acids',
       'Glycerophosphocholines', 'Fatty acyl carnitines',
       'Simple amide alkaloids', 'Polyamines', 'Dipeptides',
       'Pyridine alkaloids', 'Purine nucleos(t)ides', 'Thia fatty acids',
       'Open-chain polyketides',
       'N-acyl ethanolamines (endocannabinoids)', 'Tripeptides',
       'Glycerophosphoethanolamines', 'N-acyl amines', 'Cyclic peptides',
       'Glycerophosphoserines', 'Imidazole alkaloids',
       'Thiodiketopiperazine alkaloids', 'Cephalosporins', 'Gallotannins',
       'Linear peptides', 'Lipopeptides', 'Steroidal alkaloids',
       'Oxidized glycerophospholipids', 'Marine-bacterial DPEs',
       'Isoprostanes', 'pteridine alkaloids', 'Cholane steroids',
       'Isoquinoline alkaloids', 'Glycerophosphoglycerols',
       'Piperidine alkaloids', 'Pyrazine and Piperazine alkaloids',
       'Simple indole alkaloids', 'Aminosugars',
       'Pyri

In [207]:
group_brain = tog2_brain[['NPC#class', 'Genus', 'sentiment', 'NPC#superclass', 'Phylum', 'Family']].copy()
group_brain.head()

Unnamed: 0_level_0,NPC#class,Genus,sentiment,NPC#superclass,Phylum,Family
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
G000005825-871,,g__Bacillus_S,positive,,p__Firmicutes_D,f__Bacillaceae_D_361077
G000006865-871,,g__Lactococcus_A_346120,positive,,p__Firmicutes_D,f__Streptococcaceae
G000006925-871,,g__Escherichia_710834,negative,,p__Proteobacteria,f__Enterobacteriaceae_A
G000007265-871,,g__Streptococcus,positive,,p__Firmicutes_D,f__Streptococcaceae
G000007705-871,,g__Chromobacterium,positive,,p__Proteobacteria,f__Chromobacteriaceae


In [208]:
group_brain.shape

(123244, 6)

In [209]:
group_brain.dropna(subset=['NPC#class', 'NPC#superclass'], inplace=True)
group_brain.shape

(90280, 6)

In [218]:
group_counts1 = group_brain.groupby(['NPC#class', 'NPC#superclass', 'Genus', 'Family', 'Phylum', 'sentiment']).size().reset_index(name='count')
group_counts2 = group_counts1[group_counts1['count'] > 50]
group_counts2.head()

Unnamed: 0,NPC#class,NPC#superclass,Genus,Family,Phylum,sentiment,count
51,Aminoacids,Small peptides,g__Alistipes_A_871400,f__Rikenellaceae,p__Bacteroidota,positive,64
396,Aminoacids,Small peptides,g__Escherichia_710834,f__Enterobacteriaceae_A,p__Proteobacteria,negative,86
561,Aminoacids,Small peptides,g__Lacticaseibacillus,f__Lactobacillaceae,p__Firmicutes_D,positive,57
564,Aminoacids,Small peptides,g__Lactobacillus,f__Lactobacillaceae,p__Firmicutes_D,negative,108
586,Aminoacids,Small peptides,g__Ligilactobacillus,f__Lactobacillaceae,p__Firmicutes_D,negative,73


In [219]:
group_counts2.shape

(84, 7)

In [220]:
group_counts2.to_csv('../data/JointRPCA/3xtg_brain_cytoscape_format_summary.txt', sep='\t')

In [221]:
group_counts2.groupby(['NPC#class']).size()

NPC#class
Aminoacids                 9
Cyclic peptides            1
Dipeptides                12
Fatty acyl carnitines      1
Glycerophosphocholines     9
Linear peptides            4
Lipopeptides               3
Purine nucleos(t)ides      5
Pyridine alkaloids         2
Thia fatty acids           3
Tripeptides               35
dtype: int64

In [222]:
group_counts2.groupby(['Genus']).size()

Genus
 g__Akkermansia                 1
 g__Alistipes_A_871400          1
 g__Bacillus_A                  1
 g__Bacteroides_H               3
 g__Bifidobacterium_387352      1
 g__Bifidobacterium_388775      5
 g__Burkholderia                1
 g__Butyrivibrio_A_168226       1
 g__Carnobacterium_A_320743     1
 g__Clostridium_T               1
 g__Companilactobacillus        1
 g__Cryptobacteroides           2
 g__Desulfovibrio_R_446353      1
 g__Escherichia_710834          4
 g__Lacticaseibacillus          5
 g__Lactobacillus               9
 g__Lentilactobacillus          1
 g__Ligilactobacillus           4
 g__Limosilactobacillus        10
 g__Liquorilactobacillus        2
 g__Paucilactobacillus          1
 g__Pediococcus                 1
 g__Phocaeicola_A_858004        1
 g__Prevotella                 10
 g__Ruminococcus_D              1
 g__Secundilactobacillus        1
 g__Staphylococcus              1
 g__Streptococcus              11
 g__Weissella_A_338544          2
dtype: i

![Cytoscape](../figures/3xtg_brain_cytoscape_summary_plot1.png)

# Sac Fecal Micro and Serum Metab - Carnitines Only

In [224]:
corr_3xtg_serum = pd.read_csv('../data/JointRPCA/3XTG_sac_serum-v-micro_correlation_table/Correlation.tsv',
                              sep='\t', index_col=0)

In [225]:
corr_3xtg_serum_filt = filter_distance_matrix(corr_3xtg_serum, 
                                              row_filter="G", 
                                              col_filter="serum_") 
corr_3xtg_serum_filt.shape

(1953, 3811)

In [226]:
cyscp_3xtg_serum = transform_df(corr_3xtg_serum_filt)
cyscp_3xtg_serum['metab_row_id'] = cyscp_3xtg_serum['metab_row_id'].str.replace('serum_', '')
cyscp_3xtg_serum['metab_row_id'] = cyscp_3xtg_serum['metab_row_id'].astype(int)
cyscp_3xtg_serum['node']= cyscp_3xtg_serum['OGU'].astype(str)+"-"+cyscp_3xtg_serum['metab_row_id'].astype(str)
cyscp_3xtg_serum.set_index('node', inplace=True)
cyscp_3xtg_serum.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G000005825-1099,G000005825,1099,0.852175
G000005825-131,G000005825,131,0.005951
G000005825-645,G000005825,645,-0.508716
G000005825-336,G000005825,336,-0.874918
G000005825-278,G000005825,278,0.767587


In [227]:
cyscp_3xtg_serum.to_csv('../data/JointRPCA/3xtg_serum_cytoscape_format.txt', sep='\t')

In [228]:
cyscp_3xtg_serum['jointRPCA_co-occur_value'].describe()

count    7.442883e+06
mean     1.106671e-01
std      8.163846e-01
min     -1.000000e+00
25%     -8.528193e-01
50%      4.413907e-01
75%      9.161077e-01
max      1.000000e+00
Name: jointRPCA_co-occur_value, dtype: float64

In [229]:
cyscp_3xtg_serum.shape

(7442883, 3)

In [230]:
cyscp_3xtg_serum[np.abs(cyscp_3xtg_serum['jointRPCA_co-occur_value']) > 0.75].shape
#reduced to 49% of original size

(5236197, 3)

In [231]:
short_serum = cyscp_3xtg_serum[np.abs(cyscp_3xtg_serum['jointRPCA_co-occur_value']) > 0.75].copy()
short_serum['jointRPCA_co-occur_value'].describe()

count    5.236197e+06
mean     1.327332e-01
std      9.171473e-01
min     -1.000000e+00
25%     -9.327906e-01
50%      8.276602e-01
75%      9.533671e-01
max      1.000000e+00
Name: jointRPCA_co-occur_value, dtype: float64

In [232]:
short_serum.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G000005825-1099,G000005825,1099,0.852175
G000005825-336,G000005825,336,-0.874918
G000005825-278,G000005825,278,0.767587
G000005825-655,G000005825,655,-0.833337
G000005825-3015,G000005825,3015,-0.786062


In [233]:
short_serum.to_csv('../data/JointRPCA/3xtg_serum_cytoscape_format_cutoff0.75.txt', sep='\t')

In [234]:
tog1_serum = pd.merge(cyscp_3xtg_serum, taxonomy[['Taxon', 'Family', 'Genus', 'Phylum']], 
                      left_on='OGU', right_index=True)
tog1_serum.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value,Taxon,Family,Genus,Phylum
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
G000005825-1099,G000005825,1099,0.852175,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D
G000005825-131,G000005825,131,0.005951,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D
G000005825-645,G000005825,645,-0.508716,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D
G000005825-336,G000005825,336,-0.874918,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D
G000005825-278,G000005825,278,0.767587,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D


In [235]:
metab_id_serum = pd.read_csv('../data/Metabolome/26112024/annotation/3xtg_serum_features_info_mn.csv',
                            index_col=0)
metab_id_serum.head()
#plan to use Compound_Name and NPC#class

Unnamed: 0_level_0,VIP1,SPF_genotype,VIP2,GF_genotype,VIP3,3xTG_colonization,VIP4,WT_colonization,VIP_mean,Exclusivity,...,Microbiome,mz,RT,Corr_ID,Compound_Name,Adduct,adduct,NPC#pathway,NPC#superclass,NPC#class
Feature_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3078,4.772,Mut,1.687,WT,6.124,SPF,1.069,SPF,3.413,,...,Yes,280.153,2.1957,,,,[M + H]+,Fatty acids,Fatty esters,Fatty acyl carnitines
2482,4.458,Mut,,,5.035,SPF,,,4.747,3xtg_both,...,No,180.0647,1.8297,,HIPPURATE - 30.0 eV,M+H,[M + H]+,Amino acids and Peptides,Small peptides,Aminoacids
4173,4.451,Mut,,,5.046,SPF,,,4.749,3xtg_both,...,No,294.1688,2.5495,,,,[M + H]+,Shikimates and Phenylpropanoids,Fatty esters,Fatty acyl carnitines
1041,4.354,WT,1.226,WT,,,2.688,SPF,2.756,,...,No,309.1429,0.493,,,,[M + Na]+,Alkaloids,Small peptides,Simple amide alkaloids
2900,4.102,WT,,,3.97,GF,1.768,SPF,3.28,,...,No,264.0524,2.1003,,,,[M + H]+,Alkaloids,Tyrosine alkaloids,Simple phenolic acids


In [236]:
metab_serum_carnitines = metab_id_serum[metab_id_serum['NPC#class'].str.contains("carnitines", case=False, na=False)]
metab_serum_carnitines.shape

(20, 21)

In [237]:
metab_new_carnitines = metab_id_serum[metab_id_serum.index.isin([3078, 4173, 2821])]
metab_new_carnitines.head()
#Key Ones

Unnamed: 0_level_0,VIP1,SPF_genotype,VIP2,GF_genotype,VIP3,3xTG_colonization,VIP4,WT_colonization,VIP_mean,Exclusivity,...,Microbiome,mz,RT,Corr_ID,Compound_Name,Adduct,adduct,NPC#pathway,NPC#superclass,NPC#class
Feature_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3078,4.772,Mut,1.687,WT,6.124,SPF,1.069,SPF,3.413,,...,Yes,280.153,2.1957,,,,[M + H]+,Fatty acids,Fatty esters,Fatty acyl carnitines
4173,4.451,Mut,,,5.046,SPF,,,4.749,3xtg_both,...,No,294.1688,2.5495,,,,[M + H]+,Shikimates and Phenylpropanoids,Fatty esters,Fatty acyl carnitines
2821,1.996,WT,,,2.084,GF,,,2.04,,...,No,266.1373,2.0532,,,,[M + Na]+,Fatty acids,Fatty esters,Fatty acyl carnitines


In [238]:
tog2_serum = pd.merge(tog1_serum, metab_serum_carnitines[['Compound_Name', 'NPC#class', 'NPC#superclass']], 
                      left_on='metab_row_id', right_index=True)
tog2_serum.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value,Taxon,Family,Genus,Phylum,Compound_Name,NPC#class,NPC#superclass
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
G000005825-361,G000005825,361,-0.847311,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates
G000006785-361,G000006785,361,-0.999867,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Streptococcaceae,g__Streptococcus,p__Firmicutes_D,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates
G000006865-361,G000006865,361,-0.839459,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Streptococcaceae,g__Lactococcus_A_346120,p__Firmicutes_D,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates
G000006925-361,G000006925,361,0.755439,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates
G000007265-361,G000007265,361,-0.972913,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Streptococcaceae,g__Streptococcus,p__Firmicutes_D,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates


In [239]:
tog2_serum['sentiment'] = np.sign(tog2_serum['jointRPCA_co-occur_value']).replace({-1: 'negative', 1: 'positive', 0: 'neutral'})
tog2_serum.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value,Taxon,Family,Genus,Phylum,Compound_Name,NPC#class,NPC#superclass,sentiment
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
G000005825-361,G000005825,361,-0.847311,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__B...,f__Bacillaceae_D_361077,g__Bacillus_S,p__Firmicutes_D,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates,negative
G000006785-361,G000006785,361,-0.999867,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Streptococcaceae,g__Streptococcus,p__Firmicutes_D,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates,negative
G000006865-361,G000006865,361,-0.839459,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Streptococcaceae,g__Lactococcus_A_346120,p__Firmicutes_D,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates,negative
G000006925-361,G000006925,361,0.755439,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates,positive
G000007265-361,G000007265,361,-0.972913,d__Bacteria; p__Firmicutes_D; c__Bacilli; o__L...,f__Streptococcaceae,g__Streptococcus,p__Firmicutes_D,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates,negative


In [240]:
tog2_serum.metab_row_id.unique()

array([  361,   452,  2516,  3124,   440,  2127,  1889,  1335,  2408,
         810,  2821,  3078,  8443,  4173,  7462,  9257, 14140,  9256,
       10826, 11656])

In [241]:
tog2_serum.shape

(39060, 11)

In [242]:
tog2_serum.columns

Index(['OGU', 'metab_row_id', 'jointRPCA_co-occur_value', 'Taxon', 'Family',
       'Genus', 'Phylum', 'Compound_Name', 'NPC#class', 'NPC#superclass',
       'sentiment'],
      dtype='object')

In [243]:
tog2_serum['NPC#class'].value_counts()

Fatty acyl carnitines    39060
Name: NPC#class, dtype: int64

In [244]:
tog2_serum.Compound_Name.value_counts()

Butyrylcarnitine - 30.00 eV                         5859
Acetylcarnitine - 20.00 eV                          3906
CARNITINE                                           1953
Propionylcarnitine - 20.00 eV                       1953
Isovalerylcarnitine - 30.00 eV                      1953
LAUROYLCARNITINE - 20.0 eV                          1953
Spectral Match to Palmitoylcarnitine from NIST14    1953
Name: Compound_Name, dtype: int64

In [245]:
tog2_serum.metab_row_id.value_counts()
#Key ones in paper: 2821 = benzoyl-carnitine, 3078 = phenylacetyl-carnitine, and 4173 = phenylpropionyl-carnitine 

361      1953
452      1953
10826    1953
9256     1953
14140    1953
9257     1953
7462     1953
4173     1953
8443     1953
3078     1953
2821     1953
810      1953
2408     1953
1335     1953
1889     1953
2127     1953
440      1953
3124     1953
2516     1953
11656    1953
Name: metab_row_id, dtype: int64

In [246]:
group_serum = tog2_serum[['NPC#class', 'Genus', 'sentiment', 'NPC#superclass', 'Phylum', 'Family', 'Compound_Name', 'metab_row_id']].copy()
group_serum.head()

Unnamed: 0_level_0,NPC#class,Genus,sentiment,NPC#superclass,Phylum,Family,Compound_Name,metab_row_id
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
G000005825-361,Fatty acyl carnitines,g__Bacillus_S,negative,Fatty Acids and Conjugates,p__Firmicutes_D,f__Bacillaceae_D_361077,CARNITINE,361
G000006785-361,Fatty acyl carnitines,g__Streptococcus,negative,Fatty Acids and Conjugates,p__Firmicutes_D,f__Streptococcaceae,CARNITINE,361
G000006865-361,Fatty acyl carnitines,g__Lactococcus_A_346120,negative,Fatty Acids and Conjugates,p__Firmicutes_D,f__Streptococcaceae,CARNITINE,361
G000006925-361,Fatty acyl carnitines,g__Escherichia_710834,positive,Fatty Acids and Conjugates,p__Proteobacteria,f__Enterobacteriaceae_A,CARNITINE,361
G000007265-361,Fatty acyl carnitines,g__Streptococcus,negative,Fatty Acids and Conjugates,p__Firmicutes_D,f__Streptococcaceae,CARNITINE,361


In [247]:
group_serum.metab_row_id.unique()

array([  361,   452,  2516,  3124,   440,  2127,  1889,  1335,  2408,
         810,  2821,  3078,  8443,  4173,  7462,  9257, 14140,  9256,
       10826, 11656])

In [248]:
group_serum[group_serum.metab_row_id.isin([3078, 4173, 2821])].shape

(5859, 8)

In [249]:
group_serum['Compound_Name'] = group_serum['Compound_Name'].fillna(group_serum['metab_row_id'])
#fills na with row id, to keep each carnitine distinct, look at new carnitines

In [308]:
group_counts1 = group_serum.groupby(['Compound_Name', 'NPC#class', 'NPC#superclass', 'Genus', 'Family', 'Phylum', 'sentiment']).size().reset_index(name='count')
group_counts2 = group_counts1[group_counts1['count'] > 25]
group_counts2.head()

Unnamed: 0,Compound_Name,NPC#class,NPC#superclass,Genus,Family,Phylum,sentiment,count
572,810,Fatty acyl carnitines,Fatty esters,g__Lactobacillus,f__Lactobacillaceae,p__Firmicutes_D,positive,29
771,810,Fatty acyl carnitines,Fatty esters,g__Prevotella,f__Bacteroidaceae,p__Bacteroidota,negative,34
902,810,Fatty acyl carnitines,Fatty esters,g__Streptococcus,f__Streptococcaceae,p__Firmicutes_D,negative,49
1610,2821,Fatty acyl carnitines,Fatty esters,g__Lactobacillus,f__Lactobacillaceae,p__Firmicutes_D,negative,29
1811,2821,Fatty acyl carnitines,Fatty esters,g__Prevotella,f__Bacteroidaceae,p__Bacteroidota,positive,34


In [309]:
group_counts2.shape
#ideally, less than 100

(77, 8)

In [310]:
group_counts2.Compound_Name.unique()

array([810, 2821, 3078, 3124, 4173, 7462, 8443, 9256, 10826, 14140,
       'Acetylcarnitine - 20.00 eV', 'Butyrylcarnitine - 30.00 eV',
       'CARNITINE', 'Isovalerylcarnitine - 30.00 eV',
       'LAUROYLCARNITINE - 20.0 eV', 'Propionylcarnitine - 20.00 eV',
       'Spectral Match to Palmitoylcarnitine from NIST14'], dtype=object)

In [311]:
group_counts2.Genus.unique()

array([' g__Lactobacillus', ' g__Prevotella', ' g__Streptococcus',
       ' g__Bifidobacterium_388775', ' g__Limosilactobacillus',
       ' g__Bacteroides_H', ' g__Butyrivibrio_A_168226',
       ' g__Clostridium_T', ' g__Companilactobacillus',
       ' g__Cryptobacteroides', ' g__Escherichia_710834',
       ' g__Lacticaseibacillus', ' g__Ligilactobacillus',
       ' g__Staphylococcus', ' g__Weissella_A_338544'], dtype=object)

In [312]:
group_counts2['count'].describe()

count     77.000000
mean      40.402597
std       13.504638
min       26.000000
25%       30.000000
50%       34.000000
75%       49.000000
max      101.000000
Name: count, dtype: float64

In [313]:
group_counts2.shape

(77, 8)

In [272]:
group_counts2

Unnamed: 0,Compound_Name,NPC#class,NPC#superclass,Genus,Family,Phylum,sentiment,count
572,810,Fatty acyl carnitines,Fatty esters,g__Lactobacillus,f__Lactobacillaceae,p__Firmicutes_D,positive,29
771,810,Fatty acyl carnitines,Fatty esters,g__Prevotella,f__Bacteroidaceae,p__Bacteroidota,negative,34
902,810,Fatty acyl carnitines,Fatty esters,g__Streptococcus,f__Streptococcaceae,p__Firmicutes_D,negative,49
1610,2821,Fatty acyl carnitines,Fatty esters,g__Lactobacillus,f__Lactobacillaceae,p__Firmicutes_D,negative,29
1811,2821,Fatty acyl carnitines,Fatty esters,g__Prevotella,f__Bacteroidaceae,p__Bacteroidota,positive,34
...,...,...,...,...,...,...,...,...
18197,Propionylcarnitine - 20.00 eV,Fatty acyl carnitines,Fatty esters,g__Prevotella,f__Bacteroidaceae,p__Bacteroidota,negative,34
18327,Propionylcarnitine - 20.00 eV,Fatty acyl carnitines,Fatty esters,g__Streptococcus,f__Streptococcaceae,p__Firmicutes_D,negative,49
19033,Spectral Match to Palmitoylcarnitine from NIST14,Fatty acyl carnitines,Fatty esters,g__Lactobacillus,f__Lactobacillaceae,p__Firmicutes_D,negative,29
19234,Spectral Match to Palmitoylcarnitine from NIST14,Fatty acyl carnitines,Fatty esters,g__Prevotella,f__Bacteroidaceae,p__Bacteroidota,positive,35


In [273]:
group_counts2.to_csv('../data/JointRPCA/3xtg_serum_cytoscape_carnitine_summary.txt', sep='\t')

In [274]:
group_counts2.groupby(['Compound_Name']).size()

Compound_Name
810                                                  3
2821                                                 3
3078                                                 4
3124                                                 3
4173                                                 3
7462                                                 3
8443                                                 3
9256                                                 3
10826                                                3
14140                                                3
Acetylcarnitine - 20.00 eV                          10
Butyrylcarnitine - 30.00 eV                         21
CARNITINE                                            3
Isovalerylcarnitine - 30.00 eV                       3
LAUROYLCARNITINE - 20.0 eV                           3
Propionylcarnitine - 20.00 eV                        3
Spectral Match to Palmitoylcarnitine from NIST14     3
dtype: int64

In [275]:
group_counts2.groupby(['Compound_Name']).size().reset_index().to_csv('../data/JointRPCA/3xtg_serum_metab_node_list.txt', index=False)

In [295]:
group_counts2.groupby(['Genus']).size()

Genus
 g__14-2                   4
 g__1XD42-69               1
 g__Abiotrophia            1
 g__Acetatifactor         18
 g__Acetivibrio            1
                          ..
 g__Ventrimonas            1
 g__Vogesella              1
 g__Weimeria               1
 g__Weissella_A_338544    19
 g__Weizmannia             1
Length: 269, dtype: int64

In [277]:
group_counts2.groupby(['Genus']).size().reset_index().to_csv('../data/JointRPCA/3xtg_serum_micro_node_list.txt', index=False)

![Cytoscape](../figures/3xtg_serum_carnitines_summary_plot1.png)

### Subset for only items of interest

In [314]:
mic_interest = tog1_serum[tog1_serum['Genus'].str.contains("Akkermansia|Helicobacter|Mucispirillum|Desulfovibriofovibrio|Escherichia",
                                  case=False)]
mic_interest

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value,Taxon,Family,Genus,Phylum
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
G000006925-1099,G000006925,1099,-0.762919,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria
G000006925-131,G000006925,131,-0.133630,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria
G000006925-645,G000006925,645,0.408399,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria
G000006925-336,G000006925,336,0.790147,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria
G000006925-278,G000006925,278,-0.750108,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria
...,...,...,...,...,...,...,...
G900227605-8068,G900227605,8068,-0.999613,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria
G900227605-8046,G900227605,8046,-0.987308,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria
G900227605-8043,G900227605,8043,-0.988421,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria
G900227605-8024,G900227605,8024,-0.996518,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria


In [315]:
subset_serum = pd.merge(mic_interest, metab_serum_carnitines[['Compound_Name', 'NPC#class', 'NPC#superclass']], 
                      left_on='metab_row_id', right_index=True)
subset_serum.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value,Taxon,Family,Genus,Phylum,Compound_Name,NPC#class,NPC#superclass
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
G000006925-361,G000006925,361,0.755439,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates
G000007905-361,G000007905,361,0.062231,d__Bacteria; p__Campylobacterota; c__Campyloba...,f__Helicobacteraceae,g__Helicobacter_C_479931,p__Campylobacterota,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates
G000008865-361,G000008865,361,0.750832,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates
G000012005-361,G000012005,361,0.740016,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates
G000020225-361,G000020225,361,0.802364,d__Bacteria; p__Verrucomicrobiota; c__Verrucom...,f__Akkermansiaceae,g__Akkermansia,p__Verrucomicrobiota,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates


In [316]:
subset_serum.shape

(820, 10)

In [317]:
subset_serum['sentiment'] = np.sign(subset_serum['jointRPCA_co-occur_value']).replace({-1: 'negative', 1: 'positive', 0: 'neutral'})
subset_serum.head()

Unnamed: 0_level_0,OGU,metab_row_id,jointRPCA_co-occur_value,Taxon,Family,Genus,Phylum,Compound_Name,NPC#class,NPC#superclass,sentiment
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
G000006925-361,G000006925,361,0.755439,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates,positive
G000007905-361,G000007905,361,0.062231,d__Bacteria; p__Campylobacterota; c__Campyloba...,f__Helicobacteraceae,g__Helicobacter_C_479931,p__Campylobacterota,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates,positive
G000008865-361,G000008865,361,0.750832,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates,positive
G000012005-361,G000012005,361,0.740016,d__Bacteria; p__Proteobacteria; c__Gammaproteo...,f__Enterobacteriaceae_A,g__Escherichia_710834,p__Proteobacteria,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates,positive
G000020225-361,G000020225,361,0.802364,d__Bacteria; p__Verrucomicrobiota; c__Verrucom...,f__Akkermansiaceae,g__Akkermansia,p__Verrucomicrobiota,CARNITINE,Fatty acyl carnitines,Fatty Acids and Conjugates,positive


In [318]:
subset_serum.metab_row_id.unique()

array([  361,   452,  2516,  3124,   440,  2127,  1889,  1335,  2408,
         810,  2821,  3078,  8443,  4173,  7462,  9257, 14140,  9256,
       10826, 11656])

In [319]:
subset_serum.shape

(820, 11)

In [320]:
subset_serum.columns

Index(['OGU', 'metab_row_id', 'jointRPCA_co-occur_value', 'Taxon', 'Family',
       'Genus', 'Phylum', 'Compound_Name', 'NPC#class', 'NPC#superclass',
       'sentiment'],
      dtype='object')

In [321]:
subset_serum['NPC#class'].value_counts()

Fatty acyl carnitines    820
Name: NPC#class, dtype: int64

In [322]:
subset_serum.Compound_Name.value_counts()

Butyrylcarnitine - 30.00 eV                         123
Acetylcarnitine - 20.00 eV                           82
CARNITINE                                            41
Propionylcarnitine - 20.00 eV                        41
Isovalerylcarnitine - 30.00 eV                       41
LAUROYLCARNITINE - 20.0 eV                           41
Spectral Match to Palmitoylcarnitine from NIST14     41
Name: Compound_Name, dtype: int64

In [323]:
subset_serum.metab_row_id.value_counts()
#Key ones in paper: 2821 = benzoyl-carnitine, 3078 = phenylacetyl-carnitine, and 4173 = phenylpropionyl-carnitine 

361      41
452      41
10826    41
9256     41
14140    41
9257     41
7462     41
4173     41
8443     41
3078     41
2821     41
810      41
2408     41
1335     41
1889     41
2127     41
440      41
3124     41
2516     41
11656    41
Name: metab_row_id, dtype: int64

In [324]:
subset_group_serum = subset_serum[['NPC#class', 'Genus', 'sentiment', 'NPC#superclass', 'Phylum', 'Family', 'Compound_Name', 'metab_row_id']].copy()
subset_group_serum.head()

Unnamed: 0_level_0,NPC#class,Genus,sentiment,NPC#superclass,Phylum,Family,Compound_Name,metab_row_id
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
G000006925-361,Fatty acyl carnitines,g__Escherichia_710834,positive,Fatty Acids and Conjugates,p__Proteobacteria,f__Enterobacteriaceae_A,CARNITINE,361
G000007905-361,Fatty acyl carnitines,g__Helicobacter_C_479931,positive,Fatty Acids and Conjugates,p__Campylobacterota,f__Helicobacteraceae,CARNITINE,361
G000008865-361,Fatty acyl carnitines,g__Escherichia_710834,positive,Fatty Acids and Conjugates,p__Proteobacteria,f__Enterobacteriaceae_A,CARNITINE,361
G000012005-361,Fatty acyl carnitines,g__Escherichia_710834,positive,Fatty Acids and Conjugates,p__Proteobacteria,f__Enterobacteriaceae_A,CARNITINE,361
G000020225-361,Fatty acyl carnitines,g__Akkermansia,positive,Fatty Acids and Conjugates,p__Verrucomicrobiota,f__Akkermansiaceae,CARNITINE,361


In [325]:
subset_group_serum.metab_row_id.unique()

array([  361,   452,  2516,  3124,   440,  2127,  1889,  1335,  2408,
         810,  2821,  3078,  8443,  4173,  7462,  9257, 14140,  9256,
       10826, 11656])

In [326]:
subset_group_serum[subset_group_serum.metab_row_id.isin([3078, 4173, 2821])].shape

(123, 8)

In [327]:
subset_group_serum['Compound_Name'] = subset_group_serum['Compound_Name'].fillna(group_serum['metab_row_id'])
#fills na with row id, to keep each carnitine distinct, look at new carnitines

In [344]:
group_countz1 = subset_group_serum.groupby(['Compound_Name', 'NPC#class', 'NPC#superclass', 'Genus', 'Family', 'Phylum', 'sentiment']).size().reset_index(name='count')
group_countz2 = group_countz1[group_countz1['count'] > 0]
group_countz2.head()

Unnamed: 0,Compound_Name,NPC#class,NPC#superclass,Genus,Family,Phylum,sentiment,count
0,810,Fatty acyl carnitines,Fatty esters,g__Akkermansia,f__Akkermansiaceae,p__Verrucomicrobiota,positive,5
1,810,Fatty acyl carnitines,Fatty esters,g__Escherichia_710834,f__Enterobacteriaceae_A,p__Proteobacteria,positive,18
2,810,Fatty acyl carnitines,Fatty esters,g__Helicobacter_A_479734,f__Helicobacteraceae,p__Campylobacterota,negative,2
3,810,Fatty acyl carnitines,Fatty esters,g__Helicobacter_A_479734,f__Helicobacteraceae,p__Campylobacterota,positive,1
4,810,Fatty acyl carnitines,Fatty esters,g__Helicobacter_B,f__Helicobacteraceae,p__Campylobacterota,negative,2


In [345]:
group_countz2.shape
#ideally, less than 100

(203, 8)

In [346]:
group_countz2.Compound_Name.unique()

array([810, 2821, 3078, 3124, 4173, 7462, 8443, 9256, 10826, 14140,
       'Acetylcarnitine - 20.00 eV', 'Butyrylcarnitine - 30.00 eV',
       'CARNITINE', 'Isovalerylcarnitine - 30.00 eV',
       'LAUROYLCARNITINE - 20.0 eV', 'Propionylcarnitine - 20.00 eV',
       'Spectral Match to Palmitoylcarnitine from NIST14'], dtype=object)

In [347]:
group_countz2.Genus.unique()

array([' g__Akkermansia', ' g__Escherichia_710834',
       ' g__Helicobacter_A_479734', ' g__Helicobacter_B',
       ' g__Helicobacter_C_479931', ' g__Helicobacter_D',
       ' g__Helicobacter_G_479964', ' g__Mucispirillum'], dtype=object)

In [348]:
group_countz2['count'].describe()

count    203.000000
mean       4.039409
std        5.211224
min        1.000000
25%        1.000000
50%        2.000000
75%        4.000000
max       36.000000
Name: count, dtype: float64

In [349]:
group_countz2.shape

(203, 8)

In [350]:
group_countz2

Unnamed: 0,Compound_Name,NPC#class,NPC#superclass,Genus,Family,Phylum,sentiment,count
0,810,Fatty acyl carnitines,Fatty esters,g__Akkermansia,f__Akkermansiaceae,p__Verrucomicrobiota,positive,5
1,810,Fatty acyl carnitines,Fatty esters,g__Escherichia_710834,f__Enterobacteriaceae_A,p__Proteobacteria,positive,18
2,810,Fatty acyl carnitines,Fatty esters,g__Helicobacter_A_479734,f__Helicobacteraceae,p__Campylobacterota,negative,2
3,810,Fatty acyl carnitines,Fatty esters,g__Helicobacter_A_479734,f__Helicobacteraceae,p__Campylobacterota,positive,1
4,810,Fatty acyl carnitines,Fatty esters,g__Helicobacter_B,f__Helicobacteraceae,p__Campylobacterota,negative,2
...,...,...,...,...,...,...,...,...
198,Spectral Match to Palmitoylcarnitine from NIST14,Fatty acyl carnitines,Fatty esters,g__Helicobacter_C_479931,f__Helicobacteraceae,p__Campylobacterota,negative,1
199,Spectral Match to Palmitoylcarnitine from NIST14,Fatty acyl carnitines,Fatty esters,g__Helicobacter_C_479931,f__Helicobacteraceae,p__Campylobacterota,positive,5
200,Spectral Match to Palmitoylcarnitine from NIST14,Fatty acyl carnitines,Fatty esters,g__Helicobacter_D,f__Helicobacteraceae,p__Campylobacterota,positive,3
201,Spectral Match to Palmitoylcarnitine from NIST14,Fatty acyl carnitines,Fatty esters,g__Helicobacter_G_479964,f__Helicobacteraceae,p__Campylobacterota,positive,2


In [351]:
group_countz2.to_csv('../data/JointRPCA/3xtg_serum_cytoscape_carnitine-micro_summary.txt', sep='\t')

In [352]:
group_counts2.groupby(['Compound_Name']).size()

Compound_Name
810                                                  3
2821                                                 3
3078                                                 4
3124                                                 3
4173                                                 3
7462                                                 3
8443                                                 3
9256                                                 3
10826                                                3
14140                                                3
Acetylcarnitine - 20.00 eV                          10
Butyrylcarnitine - 30.00 eV                         21
CARNITINE                                            3
Isovalerylcarnitine - 30.00 eV                       3
LAUROYLCARNITINE - 20.0 eV                           3
Propionylcarnitine - 20.00 eV                        3
Spectral Match to Palmitoylcarnitine from NIST14     3
dtype: int64

In [353]:
group_countz2.groupby(['Genus']).size()

Genus
 g__Akkermansia              21
 g__Escherichia_710834       19
 g__Helicobacter_A_479734    34
 g__Helicobacter_B           34
 g__Helicobacter_C_479931    34
 g__Helicobacter_D           23
 g__Helicobacter_G_479964    19
 g__Mucispirillum            19
dtype: int64

![Cytoscape](../figures/3xtg_serum_cytoscape_carnitine-micro_circplot.png)