In [16]:
import pandas as pd
import numpy as np

# Read taxonomy table

In [17]:
# Parse the semicolon-delimited 'Taxon' lineage of every ASV into one column per
# rank, plus the deepest rank that carries a real (classified) name.
TAX_RANKS = ['Kingdom','Phylum','Class','Order','Family','Genus','Species']
UNRESOLVED_TAGS = ('unclassified', 'uncultured', 'unidentified')


def _split_lineage(lineage):
    """Expand one lineage string into rank names plus the lowest classified taxon.

    Parameters
    ----------
    lineage : str or missing value
        Rank names joined by ';', ordered from Kingdom downwards. Anything that
        is not a string (e.g. the NaN pandas yields for an empty cell) is
        treated as having no ranks at all.

    Returns
    -------
    list of str
        ``len(TAX_RANKS) + 1`` entries: one name per rank ('unclassified' for
        ranks missing from the lineage; levels beyond Species are ignored),
        followed by the deepest level containing none of UNRESOLVED_TAGS, or
        'unclassified' when there is no such level.
    """
    levels = lineage.split(';')[:len(TAX_RANKS)] if isinstance(lineage, str) else []
    lowest_classified = 'unclassified'
    for level in levels:
        # Later (deeper) levels overwrite earlier ones, so the last hit wins.
        if not any(tag in level for tag in UNRESOLVED_TAGS):
            lowest_classified = level
    padding = ['unclassified'] * (len(TAX_RANKS) - len(levels))
    return levels + padding + [lowest_classified]


df_tax = pd.read_csv('taxonomy.tsv', sep='\t')

# Build the rank columns on the same row index as df_tax so they can be attached
# positionally; a key-based merge on 'Feature ID' would multiply rows if an ID
# were ever duplicated.
rank_table = pd.DataFrame(
    [_split_lineage(lineage) for lineage in df_tax['Taxon']],
    columns=TAX_RANKS + ['LowestClassifiedTaxon'],
    index=df_tax.index,
)
df_tax = (
    pd.concat([df_tax[['Feature ID','Confidence']], rank_table], axis=1)
    .rename({'Feature ID':'ASV'}, axis=1)
    .set_index('ASV')
)

# Remove the genus tag from the species name, but only when it is a leading
# "<Genus>-" prefix (str.replace would also strip it from the middle of a name).
df_tax['Species'] = [
    species[len(genus) + 1:] if species.startswith(genus + '-') else species
    for genus, species in zip(df_tax['Genus'], df_tax['Species'])
]

# Optional: keep bacterial ASVs only.
# df_tax = df_tax[df_tax.Kingdom=='Bacteria']

df_tax.head()

Unnamed: 0_level_0,Confidence,Kingdom,Phylum,Class,Order,Family,Genus,Species,LowestClassifiedTaxon
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
d272bf25781448dde9031a24679a9012,0.964328,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,acidifaciens,Bacteroides-acidifaciens
d33d84f90d2924040cf2eb66c7abe2fc,0.929414,Bacteria,Firmicutes,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,murinus,Lactobacillus-murinus
ec098ad12ef2923b449a01762462578b,1.0,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Lactococcus,unclassified,Lactococcus
cb7b937fdccf682f018b57aa6b9d92b2,1.0,Bacteria,Actinobacteriota,Actinobacteria,Bifidobacteriales,Bifidobacteriaceae,Bifidobacterium,unclassified,Bifidobacterium
b6c1a8ba78f65c4a4e96627ccc53ac86,1.0,Bacteria,Firmicutes,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,unclassified,Lactobacillus


# Read metadata

In [18]:
# Load the sample sheet keyed by '#SampleID', discard the '#q2:types' row
# (presumably the QIIME 2 column-type directive -- confirm against metadata.txt),
# and keep only the samples flagged as longitudinal.
df_meta = (
    pd.read_csv('metadata.txt', sep='\t')
    .set_index('#SampleID')
    .drop('#q2:types')
    .loc[lambda meta: meta['If_longitudinal'] == 'Yes']
)
df_meta.head()

Unnamed: 0_level_0,forward-absolute-filepath,reverse-absolute-filepath,Figure,Mice_ID,Day,If_longitudinal,Group,Time,Sample_description
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
sample31,/mnt/d/16S_sequencing_data/Chijiiwa-2020-micro...,/mnt/d/16S_sequencing_data/Chijiiwa-2020-micro...,1d,1,0,Yes,Inulin,Morning,D0_01_CageNo.1
sample32,/mnt/d/16S_sequencing_data/Chijiiwa-2020-micro...,/mnt/d/16S_sequencing_data/Chijiiwa-2020-micro...,1d,2,0,Yes,Inulin,Morning,D0_02_CageNo.2
sample33,/mnt/d/16S_sequencing_data/Chijiiwa-2020-micro...,/mnt/d/16S_sequencing_data/Chijiiwa-2020-micro...,1d,3,0,Yes,Inulin,Morning,D0_03_CageNo.3
sample34,/mnt/d/16S_sequencing_data/Chijiiwa-2020-micro...,/mnt/d/16S_sequencing_data/Chijiiwa-2020-micro...,1d,4,0,Yes,Inulin,Morning,D0_04_CageNo.3
sample35,/mnt/d/16S_sequencing_data/Chijiiwa-2020-micro...,/mnt/d/16S_sequencing_data/Chijiiwa-2020-micro...,1d,5,0,Yes,Inulin,Morning,D0_05_CageNo.3


# Prepare relative abundance table for longitudinal samples

In [19]:
# Build a per-taxon relative abundance table for the samples kept in df_meta.
df_count = pd.read_csv('otu.txt', sep='\t')
df_count = df_count.rename({'#OTU ID':'ASV'}, axis=1).set_index('ASV')
df_count = df_count[list(df_meta.index)]

# Drop ASVs that are zero in every selected sample. This is done on the raw
# counts, before normalising: a sample with zero total reads becomes NaN after
# the division, and NaN != 0 is True, which would make every row look non-empty.
df_count = df_count[(df_count != 0).any(axis=1)]

# Convert counts to relative abundance (each sample column divided by its total).
df_count = df_count.div(df_count.sum(axis=0), axis=1)

# Label each ASV with its lowest classified taxon; ASVs missing from df_tax are
# dropped by the inner join.
df_count = pd.merge(df_tax[['LowestClassifiedTaxon']], df_count, left_index=True, right_index=True, how='inner').set_index('LowestClassifiedTaxon', drop=True)

# Collapse ASVs that share a taxon label into one row by summing their abundances.
df_count = df_count.groupby(level=0).sum()

df_count.head()

Unnamed: 0_level_0,sample31,sample32,sample33,sample34,sample35,sample36,sample37,sample38,sample39,sample40,...,sample102,sample103,sample104,sample105,sample106,sample107,sample108,sample109,sample110,sample111
LowestClassifiedTaxon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2,0.000266,0.000104,4.9e-05,0.0,0.000876,0.000295,0.003023,0.005378,0.001936,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ASF356,0.000349,0.001241,0.00061,0.000479,0.000906,0.000403,0.000292,0.000489,0.001131,0.0,...,0.001437,8.8e-05,0.0,0.001227,0.0,0.000401,0.000432,0.0,0.001971,0.00139
Acetatifactor,0.001081,0.000451,0.009088,0.000137,0.004308,0.000678,0.007831,0.001264,0.004343,0.000848,...,0.0,0.0,0.001822,0.000614,0.0,0.0,0.00019,0.000189,0.000459,0.000206
Acinetobacter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Akkermansia-muciniphila,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.342675,9.7e-05,0.0,0.0,0.0,0.470374,0.0,0.0,0.280448


In [20]:
# Transpose so that samples are rows and taxa are columns. rename_axis labels
# the row index on a fresh Index object; assigning df_count_T.index.name directly
# can also rename df_count.columns, because the transpose may reuse that Index.
df_count_T = df_count.T.rename_axis('SampleID')

In [21]:
# Save the sample-by-taxon relative abundance table as an Excel workbook.
df_count_T.to_excel(excel_writer='16S_relative_abundance.xlsx')