In [24]:
import pandas as pd
import numpy as np
import os

# Read shotgun data

## COG database

In [142]:
df_cog = None
for filename in os.listdir('../shotgun/'):
    if filename.endswith('_cog.tsv'):  
        df = pd.read_csv('../shotgun/'+filename, sep='\t')
        df.columns = ['GeneFamily','RPKs']
        df = df[~df['GeneFamily'].str.contains('\|')]
        df['RPKs'] = df['RPKs']/sum(df['RPKs'].values)
        df = df.set_index('GeneFamily')
        df.columns = [filename.replace('_cog.tsv','').replace('FMT','FMT.').replace('._','.')]
        if df_cog is None:
            df_cog = df
        else:
            df_cog = pd.merge(df_cog, df, left_index=True, right_index=True, how='outer').fillna(0)

In [144]:
df_cog.to_csv('shotgun_cog.csv')

## KO database

In [143]:
df_ko = None
for filename in os.listdir('../shotgun/'):
    if filename.endswith('_ko.tsv'):  
        df = pd.read_csv('../shotgun/'+filename, sep='\t')
        df.columns = ['GeneFamily','RPKs']
        df = df[~df['GeneFamily'].str.contains('\|')]
        df['RPKs'] = df['RPKs']/sum(df['RPKs'].values)
        df = df.set_index('GeneFamily')
        df.columns = [filename.replace('_ko.tsv','').replace('FMT','FMT.').replace('._','.')]
        if df_ko is None:
            df_ko = df
        else:
            df_ko = pd.merge(df_ko, df, left_index=True, right_index=True, how='outer').fillna(0)

In [145]:
df_ko.to_csv('shotgun_ko.csv')

# Read ASV counts and convert to relative abundance (limit to samples that we have shotgun data)

In [146]:
df_cog = pd.read_csv('shotgun_cog.csv', index_col=0)
df_cog.head()

Unnamed: 0_level_0,FMT.92T,FMT.0094E,668Z,1577A,FMT.0069I,FMT.0119M,FMT.0048G,105G,1742B,1179B,...,FMT.92D,FMT.0064G,1508B,FMT.0064AA,1252Y,1179R,FMT.0216A,894C,FMT.0187C,247G
GeneFamily,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
COG0001,5e-06,1.853168e-05,7.062464e-05,2e-06,5.6e-05,2.531635e-05,2.32218e-05,8.901756e-06,2.836361e-05,5.8e-05,...,1.559711e-05,5.541115e-05,1.652714e-05,8.8e-05,9e-06,2.051944e-06,6e-06,8.548738e-07,2.006811e-05,0.0
COG0002,0.0,3.930642e-05,4.515416e-05,6e-05,3e-06,5.02283e-05,3.314082e-05,1.105143e-05,4.918598e-05,3e-06,...,6.627429e-05,6.378156e-05,6.779914e-05,1.5e-05,1.1e-05,1.182357e-06,0.000116,3.525918e-05,5.124046e-05,2.260015e-07
COG0003,0.0,8.394877e-07,7.976337e-07,2.5e-05,0.0,1.074124e-07,1.789505e-07,1.273859e-07,4.09837e-07,0.0,...,5.842447e-08,3.930204e-07,2.150878e-08,0.0,0.0,9.098873e-08,6.8e-05,1.784855e-05,9.636133e-07,0.0
COG0004,0.0,1.532537e-05,0.0,2e-06,5.7e-05,1.878467e-05,2.259545e-05,8.55065e-06,3.032356e-05,6.9e-05,...,3.80209e-06,2.596159e-06,1.421928e-05,2e-06,0.0,0.0,3e-06,7.831186e-07,5.336957e-05,0.0
COG0005,0.0,8.605006e-06,2.486846e-06,2.2e-05,5e-05,8.514939e-06,2.318668e-05,1.814793e-05,8.467377e-06,6.8e-05,...,1.243506e-05,1.201591e-05,2.9955e-05,1.7e-05,0.0,5.029108e-07,7e-06,1.511243e-07,5.1884e-05,0.0


In [149]:
df_counts = pd.read_csv('../../../deidentified_data_tables/counts/tblASVcounts_human_filter.csv', index_col=0)
df_samples_by_species = pd.pivot_table(df_counts, values = 'Count', index=['SampleID'], columns = 'ASV',fill_value=0).reset_index()
df_samples_by_species.columns.name = ''
df_samples_by_species.set_index('SampleID',drop=True,inplace=True)
df_samples_by_species = df_samples_by_species.div(df_samples_by_species.sum(axis=1), axis=0)

In [150]:
# keep only samples in metagenomics database
shared_samples = list(set(df_samples_by_species.index).intersection(df_cog.columns))
df_samples_by_species = df_samples_by_species.loc[shared_samples]
df_cog = df_cog[shared_samples].transpose()

# set relative abundance cutoff
cutoff = 0.01
df_samples_by_species[df_samples_by_species<cutoff] = 0.0
df_samples_by_species = df_samples_by_species.loc[:, df_samples_by_species.sum() != 0]

df_samples_by_species.head()

Unnamed: 0_level_0,ASV_1,ASV_10,ASV_100,ASV_10000,ASV_10007,ASV_10013,ASV_10023,ASV_10030,ASV_10047,ASV_10087,...,ASV_96,ASV_961,ASV_962,ASV_97,ASV_9722,ASV_9723,ASV_9732,ASV_975,ASV_9765,ASV_98
SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FMT.0049H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.016764,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
956H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FMT.0160U,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1042V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1105G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [151]:
len(df_samples_by_species.columns)

909

# Read ASV traits data

In [160]:
df_asv_traits = pd.read_csv('../traits/merge_trait_tables/MSK_ASV_traits_04232020.csv',index_col=0)
df_asv_traits = df_asv_traits[df_asv_traits.asv.isin(df_samples_by_species.columns)]
df_asv_traits.trait = [df_asv_traits.loc[index,'trait']+'__'+df_asv_traits.loc[index,'source'] for index in df_asv_traits.index]
df_asv_traits = df_asv_traits[['asv','trait','mean']]
df_asv_traits = pd.pivot_table(df_asv_traits, values='mean', index='asv', columns='trait')
df_asv_traits

  mask |= (ar1 == a)


trait,"1,2-Ethanediol_(Ethylene_glycol)__Consumption__NJS16","1,2-Ethanediol_(Ethylene_glycol)__Production__NJS16","1,2-propanediol_(Propene_diol,_Propylene_glycol,_[R]-1,2-propanediol,_[R]-propane-1,2-diol,_[S]-1,2-propanediol,_[S]-propane-1,2-diol)__Consumption__NJS16","1,2-propanediol_(Propene_diol,_Propylene_glycol,_[R]-1,2-propanediol,_[R]-propane-1,2-diol,_[S]-1,2-propanediol,_[S]-propane-1,2-diol)__Production__NJS16","1,3-Propanediol__Consumption__NJS16","1,3-Propanediol__Production__NJS16",2-Aminobutyric_acid_(2-Aminobutyrate)__Production__NJS16,"2-Oxobutyrate_(Alpha-ketobutyrate,_2-Oxobutanoate)__Consumption__NJS16","2-Oxobutyrate_(Alpha-ketobutyrate,_2-Oxobutanoate)__Production__NJS16",2-methylbutyrate_(2-methylbutanoic_acid)__Consumption__NJS16,...,triclosan__CARD,tryptophan__Bewick2019,urea__Bewick2019,urease__Bewick2019,uridine__Bewick2019,urocanate__Bewick2019,valerate__Bewick2019,volume__Bewick2019,xylitol__Bewick2019,xylose__Bewick2019
asv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ASV_1,,,,1.0,,1.0,,,,,...,0.000000,,0.000000,0.000000,,,,5.285764,0.071857,0.311372
ASV_10,,,1.0,,,,,1.0,,1.0,...,0.000000,0.000000,1.000000,0.051264,,,1.00000,6.313901,0.000000,0.703736
ASV_100,,1.0,,1.0,,,,1.0,,,...,0.000000,0.547332,0.000000,0.000000,,,0.00000,3.142676,0.000000,0.764057
ASV_10000,,,1.0,,,,,1.0,,1.0,...,0.000000,0.000000,1.000000,0.052868,,,1.00000,6.162326,0.000000,0.632017
ASV_10007,,,1.0,,,,,,,1.0,...,0.000000,0.399591,1.000000,0.141083,,,1.00000,5.250869,0.000000,0.526003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ASV_9723,1.0,,1.0,,1.0,1.0,,,,,...,0.000000,0.000000,0.000000,0.018588,,,0.40358,1.772006,0.000000,0.551489
ASV_9732,,,1.0,,,1.0,,,,,...,0.869340,0.035073,0.321112,0.221860,1.0,0.0,,1.704628,0.390986,0.771322
ASV_975,,1.0,,1.0,,,,1.0,,,...,0.000000,0.525238,0.000000,0.000000,,,0.00000,2.840400,0.000000,0.758665
ASV_9765,,,,1.0,,1.0,,,,,...,0.022226,0.000000,0.000000,0.131493,0.0,,,4.952795,0.034929,0.369348


# Calculate sample level trait value

In [133]:
df_sample_trait = pd.DataFrame(index=df_samples_by_species.index, columns=df_asv_traits.columns)
for sample_id in df_samples_by_species.index:
    df_samples_by_species.loc[sample_id]

trait,"1,2-Ethanediol_(Ethylene_glycol)__Consumption__NJS16","1,2-Ethanediol_(Ethylene_glycol)__Production__NJS16","1,2-propanediol_(Propene_diol,_Propylene_glycol,_[R]-1,2-propanediol,_[R]-propane-1,2-diol,_[S]-1,2-propanediol,_[S]-propane-1,2-diol)__Consumption__NJS16","1,2-propanediol_(Propene_diol,_Propylene_glycol,_[R]-1,2-propanediol,_[R]-propane-1,2-diol,_[S]-1,2-propanediol,_[S]-propane-1,2-diol)__Production__NJS16","1,3-Propanediol__Consumption__NJS16","1,3-Propanediol__Production__NJS16",2-Aminobutyric_acid_(2-Aminobutyrate)__Production__NJS16,"2-Oxobutyrate_(Alpha-ketobutyrate,_2-Oxobutanoate)__Consumption__NJS16","2-Oxobutyrate_(Alpha-ketobutyrate,_2-Oxobutanoate)__Production__NJS16",2-methylbutyrate_(2-methylbutanoic_acid)__Consumption__NJS16,...,triclosan__CARD,tryptophan__Bewick2019,urea__Bewick2019,urease__Bewick2019,uridine__Bewick2019,urocanate__Bewick2019,valerate__Bewick2019,volume__Bewick2019,xylitol__Bewick2019,xylose__Bewick2019
asv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ASV_1,,,,1.0,,1.0,,,,,...,0.000000,,0.000000,0.000000,,,,5.285764,0.071857,0.311372
ASV_10,,,1.0,,,,,1.0,,1.0,...,0.000000,0.000000,1.000000,0.051264,,,1.0,6.313901,0.000000,0.703736
ASV_100,,1.0,,1.0,,,,1.0,,,...,0.000000,0.547332,0.000000,0.000000,,,0.0,3.142676,0.000000,0.764057
ASV_10000,,,1.0,,,,,1.0,,1.0,...,0.000000,0.000000,1.000000,0.052868,,,1.0,6.162326,0.000000,0.632017
ASV_10007,,,1.0,,,,,,,1.0,...,0.000000,0.399591,1.000000,0.141083,,,1.0,5.250869,0.000000,0.526003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ASV_9947,,,1.0,,,,,,,1.0,...,0.000000,0.000000,1.000000,0.304918,,,1.0,12.720987,0.000000,0.499289
ASV_9956,,,,,,,,,,,...,0.022759,,0.000000,0.044070,,,,1.311073,0.227054,0.561578
ASV_9959,,,,,,,,,,,...,0.000000,0.000000,0.112496,0.615605,1.0,,,9.392968,1.000000,0.528435
ASV_997,,,,,,,,,,,...,0.000000,0.000000,0.000000,0.391404,,,,1.271468,0.592055,0.242428
