In [1]:
import pandas as pd
import numpy as np

# Read taxonomy table

In [2]:
# Taxonomic ranks expected in the ';'-delimited 'Taxon' string, in order.
TAX_RANKS = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
# A rank label containing any of these substrings is treated as not classified.
UNRESOLVED_MARKERS = ('unclassified', 'uncultured', 'unidentified')


def split_lineage(lineage):
    """Split a ';'-delimited lineage string into one label per taxonomic rank.

    Parameters
    ----------
    lineage : str
        Lineage string with ranks separated by ';'.

    Returns
    -------
    list of str
        ``len(TAX_RANKS) + 1`` entries: one label per rank (ranks missing from
        the string are filled with 'unclassified'; ranks beyond the last one in
        TAX_RANKS are ignored), followed by the deepest label that contains
        none of the UNRESOLVED_MARKERS ('unclassified' if there is none).
    """
    # Strip whitespace so that a '; ' separator does not leak into the labels.
    labels = [label.strip() for label in lineage.split(';')][:len(TAX_RANKS)]
    labels += ['unclassified'] * (len(TAX_RANKS) - len(labels))

    lowest_classified_taxon = 'unclassified'
    for label in labels:
        if not any(marker in label for marker in UNRESOLVED_MARKERS):
            lowest_classified_taxon = label
    return labels + [lowest_classified_taxon]


df_tax = pd.read_csv('taxonomy.tsv', sep='\t')

# Parse every lineage once and keep the result aligned with df_tax row by row.
# Building the frame on df_tax's own index avoids a self-merge on 'Feature ID',
# which would multiply rows if an ID ever appeared more than once.
lineage_table = pd.DataFrame(
    [split_lineage(lineage) for lineage in df_tax['Taxon']],
    columns=TAX_RANKS + ['LowestClassifiedTaxon'],
    index=df_tax.index,
)
df_tax = (
    pd.concat([df_tax[['Feature ID', 'Confidence']], lineage_table], axis=1)
    .rename(columns={'Feature ID': 'ASV'})
    .set_index('ASV')
)

# Remove the '<Genus>-' tag from the species label.
# LowestClassifiedTaxon is left unchanged, so species-level entries there
# still carry the genus tag.
df_tax['Species'] = [
    species.replace(genus + '-', '')
    for genus, species in zip(df_tax['Genus'], df_tax['Species'])
]

# Optional: keep only bacteria
# df_tax = df_tax[df_tax.Kingdom=='Bacteria']

df_tax.head()

Unnamed: 0_level_0,Confidence,Kingdom,Phylum,Class,Order,Family,Genus,Species,LowestClassifiedTaxon
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
859f0168170ec9c159ad152f7211fdb7,0.999977,Bacteria,Firmicutes,Clostridia,Lachnospirales,Lachnospiraceae,Agathobacter,unclassified,Agathobacter
668e61d416e2c60c21e75cd217d81c61,0.999976,Bacteria,Firmicutes,Clostridia,Lachnospirales,Lachnospiraceae,Agathobacter,unclassified,Agathobacter
22f2ec8255674bc762b7d9f896c584fa,0.99885,Bacteria,Firmicutes,Clostridia,Lachnospirales,Lachnospiraceae,Fusicatenibacter,unclassified,Fusicatenibacter
e6c70754ab852a46b4ba77d0e35d4935,1.0,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,unclassified,Bacteroides
4d87e6fd770bdc8cfae308448dfc77b3,1.0,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,unclassified,Bacteroides


# Read metadata

In [13]:
# Load the per-sample metadata, keyed by the 'sample-id' column.
df_meta = pd.read_csv('metadata.tsv', sep='\t').set_index('sample-id')

# Drop the '#q2:types' row (presumably the QIIME 2 column-type directive row,
# not a real sample). errors='ignore' keeps the cell working for metadata files
# that do not contain that row.
# NOTE(review): because this row is parsed as data, numeric columns are likely
# read as strings -- confirm and convert with pd.to_numeric before doing
# arithmetic on them.
df_meta = df_meta.drop('#q2:types', errors='ignore')

# Trim stray whitespace from the subject IDs. The .str accessor leaves missing
# values as NaN instead of raising AttributeError the way s.strip() does.
df_meta['Subject'] = df_meta['Subject'].str.strip()

df_meta.head()

Unnamed: 0_level_0,Sample_type,NCBI_Sample_Name,Diet,Treatment,Subject,Timepoint,Dose,week,acetate,propionate,butyrate,Valeric_Acid,iso_butyrate,iso_Valeric_Acid
sample-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
SRR10022259,feces,F01W0,Corn Starch,Control,F01,BL,baseline,0,28.804,4.916,5.032,1.268,0.872,1.316
SRR10022258,feces,F01W1,Corn Starch,Control,F01,W1,10g,1,47.576,9.76,8.968,1.184,0.752,1.044
SRR10022120,feces,F01W2,Corn Starch,Control,F01,W2,20g,2,27.468,6.136,5.784,1.052,1.392,2.06
SRR10022238,feces,F01W3,Corn Starch,Control,F01,W3,35g,3,22.48,3.724,3.152,0.704,1.004,1.596
SRR10022162,feces,F01W4,Corn Starch,Control,F01,W4,50g,4,34.692,6.624,6.992,1.164,1.024,1.6


# Prepare relative abundance table

In [8]:
# Load the ASV table, index it by ASV and keep only the samples listed in the
# metadata (in metadata order).
df_count = (
    pd.read_csv('otu.txt', sep='\t')
    .rename(columns={'#OTU ID': 'ASV'})
    .set_index('ASV')
    .loc[:, list(df_meta.index)]
)

# Convert to relative abundance: divide every sample (column) by its column total.
df_count = df_count / df_count.sum(axis=0)

# Discard ASVs whose value is zero in every retained sample.
df_count = df_count.loc[(df_count != 0).any(axis=1)]

# Attach each ASV's lowest classified taxon to its relative abundances
# (inner join on ASV) and use that taxon as the row label.
df_count = (
    df_tax[['LowestClassifiedTaxon']]
    .merge(df_count, left_index=True, right_index=True, how='inner')
    .set_index('LowestClassifiedTaxon', drop=True)
)

# Collapse rows sharing the same taxon label by summing them per sample.
df_count = df_count.groupby(level=0).sum()

df_count.head()

Unnamed: 0_level_0,SRR10022259,SRR10022258,SRR10022120,SRR10022238,SRR10022162,SRR10022167,SRR10022172,SRR10022177,SRR10022236,SRR10022188,...,SRR10022110,SRR10022111,SRR10022112,SRR10022113,SRR10022114,SRR10022115,SRR10022116,SRR10022117,SRR10022118,SRR10022119
LowestClassifiedTaxon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abiotrophia-defectiva,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Absiella-argi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Acholeplasmatales-bacterium-canine-oral-taxon-316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Acidaminococcus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Acidaminococcus-sp.-Marseille-P2764,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Transpose to a samples-by-taxa layout and name the row axis 'SampleID'.
# rename_axis returns a new object; assigning to `.index.name` instead mutates
# the Index in place, which can also rename df_count's column axis when the
# transposed frame shares that Index object with df_count.
df_count_T = df_count.T.rename_axis(index='SampleID')

In [10]:
# Write the samples-by-taxa relative abundance table to an Excel workbook
# (the row index is written as the first column).
df_count_T.to_excel(excel_writer='16S_relative_abundance.xlsx')