In [2]:
import pandas as pd
import numpy as np
from Bio import SeqIO
import re
from copy import deepcopy

# Compute relative abundance and prevalence of oral ASVs

In [8]:
# HMP: keep oral-cavity samples only, then transpose so rows are taxa and columns are samples.
df_hmp = pd.read_csv("HMP_2012/relative_abundance.txt", sep="\t")
is_oral_hmp = df_hmp['body_site'] == 'oralcavity'
df_hmp = df_hmp.loc[is_oral_hmp].set_index('sample_id')
# Positional slice: only columns from offset 21 onward are kept
# (presumably the leading columns are sample metadata; verify against the input file).
df_hmp = df_hmp.iloc[:, 21:].T
print("# oral samples of HMP = %d" % (df_hmp.shape[1]))

# Brito: same selection and reshaping.
df_brito = pd.read_csv("BritoIL_2016/relative_abundance.txt", sep="\t")
is_oral_brito = df_brito['body_site'] == 'oralcavity'
df_brito = df_brito.loc[is_oral_brito].set_index('sample_id')
df_brito = df_brito.iloc[:, 21:].T
print("# oral samples of Brito = %d" % (df_brito.shape[1]))

# Union of taxa across both cohorts; a taxon absent from one cohort counts as 0 there.
df_oral = pd.merge(df_hmp, df_brito, left_index=True, right_index=True, how='outer').fillna(0)

# Keep bacterial taxa, excluding chloroplast and mitochondria entries.
excluded_tags = ('Chloroplast', 'Mitochondria')
bacterial_taxa = [
    taxon for taxon in df_oral.index
    if 'k__Bacteria' in taxon and not any(tag in taxon for tag in excluded_tags)
]
df_oral = df_oral.loc[bacterial_taxa]

# Drop taxa that are zero in every sample.
all_zero = (df_oral == 0).all(axis=1)
df_oral = df_oral.loc[~all_zero]

# At this point every column is a sample, so both summaries are taken over samples only.
mean_relabun_oral = df_oral.mean(axis=1)
prevalence_oral = (df_oral > 1e-3).astype(int).mean(axis=1)  # fraction of samples above 1e-3
df_oral = df_oral.assign(
    MeanRelabunOral=mean_relabun_oral,
    PrevalenceOral=prevalence_oral,
)[['MeanRelabunOral', 'PrevalenceOral']]
df_oral.head()

# oral samples of HMP = 414
# oral samples of Brito = 140


sample_id,MeanRelabunOral,PrevalenceOral
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinobaculum|s__Actinobaculum_sp_oral_taxon_183,0.550773,0.458484
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_cardiffensis,0.007559,0.083032
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_denticolens,2.5e-05,0.00361
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_georgiae,0.009867,0.216606
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_graevenitzii,1.191794,0.646209


# Compute relative abundance and prevalence of gut ASVs

In [10]:
# HMP: keep stool samples only, then transpose so rows are taxa and columns are samples.
df_hmp = pd.read_csv("HMP_2012/relative_abundance.txt", sep="\t")
stool_rows_hmp = df_hmp['body_site'] == 'stool'
df_hmp = df_hmp.loc[stool_rows_hmp].set_index('sample_id')
# Positional slice: only columns from offset 21 onward are kept
# (presumably the leading columns are sample metadata; verify against the input file).
df_hmp = df_hmp.iloc[:, 21:].T
print("# fecal samples of HMP = %d" % (df_hmp.shape[1]))

# Brito: same selection and reshaping.
df_brito = pd.read_csv("BritoIL_2016/relative_abundance.txt", sep="\t")
stool_rows_brito = df_brito['body_site'] == 'stool'
df_brito = df_brito.loc[stool_rows_brito].set_index('sample_id')
df_brito = df_brito.iloc[:, 21:].T
print("# fecal samples of Brito = %d" % (df_brito.shape[1]))

# Union of taxa across both cohorts; a taxon absent from one cohort counts as 0 there.
df_gut = pd.merge(df_hmp, df_brito, left_index=True, right_index=True, how='outer').fillna(0)

# Keep bacterial taxa, excluding chloroplast and mitochondria entries.
gut_taxa_kept = []
for taxon in df_gut.index:
    if 'k__Bacteria' not in taxon:
        continue
    if 'Chloroplast' in taxon or 'Mitochondria' in taxon:
        continue
    gut_taxa_kept.append(taxon)
df_gut = df_gut.loc[gut_taxa_kept]

# Drop taxa that are zero in every sample.
zero_everywhere = (df_gut == 0).all(axis=1)
df_gut = df_gut.loc[~zero_everywhere]

# At this point every column is a sample, so both summaries are taken over samples only.
mean_relabun_gut = df_gut.mean(axis=1)
prevalence_gut = (df_gut > 1e-3).astype(int).mean(axis=1)  # fraction of samples above 1e-3
df_gut = df_gut.assign(
    MeanRelabunGut=mean_relabun_gut,
    PrevalenceGut=prevalence_gut,
)[['MeanRelabunGut', 'PrevalenceGut']]
df_gut.head()

# fecal samples of HMP = 147
# fecal samples of Brito = 172


sample_id,MeanRelabunGut,PrevalenceGut
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinobaculum|s__Actinobaculum_sp_oral_taxon_183,3.9e-05,0.009404
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_cardiffensis,1e-06,0.0
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_graevenitzii,0.00031,0.050157
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_johnsonii,2.1e-05,0.009404
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_naeslundii,7.4e-05,0.018809


# Combine and filter

In [8]:
# Align the oral and gut summaries on taxon; a taxon missing from one site gets 0 for that site.
df_combined = pd.merge(
    df_oral, df_gut, left_index=True, right_index=True, how='outer'
).fillna(0)

# "Typical oral" species: abundant and prevalent in oral samples,
# while at or below the same thresholds in gut samples.
abundant_in_oral = (df_combined['MeanRelabunOral'] > 0.1) & (df_combined['PrevalenceOral'] > 0.05)
rare_in_gut = (df_combined['MeanRelabunGut'] <= 0.1) & (df_combined['PrevalenceGut'] <= 0.05)
df_combined = df_combined[abundant_in_oral & rare_in_gut]

print('number of oral species after filtering = %d' % (df_combined.shape[0]))
df_combined.head()

number of oral species after filtering = 92


sample_id,MeanRelabunOral,PrevalenceOral,MeanRelabunGut,PrevalenceGut
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinobaculum|s__Actinobaculum_sp_oral_taxon_183,0.550773,0.458484,3.9e-05,0.009404
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_johnsonii,0.143748,0.395307,2.1e-05,0.009404
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_massiliensis,0.354643,0.402527,0.0,0.0
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_naeslundii,0.505413,0.635379,7.4e-05,0.018809
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_sp_S6_Spd3,0.127484,0.5,0.00011,0.034483


# Write filtered sequences to fasta

In [9]:
# Move the taxon lineage out of the index into a regular 'taxon' column.
# reset_index() already returns a new frame, so df_combined is left untouched.
df2write = df_combined.reset_index().rename(columns={'index': 'taxon'})

# From each lineage string take the text after 's__' as the species label, and
# the first two underscore-separated tokens of that label as the binomial name.
species_labels = []
binomial_labels = []
for lineage in df2write['taxon']:
    label = lineage.split('s__')[1]
    species_labels.append(label)
    binomial_labels.append(' '.join(label.split('_')[:2]))

df2write['species'] = species_labels
df2write['species_binomial'] = binomial_labels
df2write.head()

sample_id,taxon,MeanRelabunOral,PrevalenceOral,MeanRelabunGut,PrevalenceGut,species,species_binomial
0,k__Bacteria|p__Actinobacteria|c__Actinobacteri...,0.550773,0.458484,3.9e-05,0.009404,Actinobaculum_sp_oral_taxon_183,Actinobaculum sp
1,k__Bacteria|p__Actinobacteria|c__Actinobacteri...,0.143748,0.395307,2.1e-05,0.009404,Actinomyces_johnsonii,Actinomyces johnsonii
2,k__Bacteria|p__Actinobacteria|c__Actinobacteri...,0.354643,0.402527,0.0,0.0,Actinomyces_massiliensis,Actinomyces massiliensis
3,k__Bacteria|p__Actinobacteria|c__Actinobacteri...,0.505413,0.635379,7.4e-05,0.018809,Actinomyces_naeslundii,Actinomyces naeslundii
4,k__Bacteria|p__Actinobacteria|c__Actinobacteri...,0.127484,0.5,0.00011,0.034483,Actinomyces_sp_S6_Spd3,Actinomyces sp


In [11]:
# Save the filtered species table to CSV; index=False leaves the row index out of the file.
df2write.to_csv("oral_typical_species.csv", index=False)

In [39]:
# Display the number of rows (one per filtered taxon) in the species table.
len(df2write)

92

In [40]:
# Pull from the HOMD reference every sequence whose binomial species name appears
# in the filtered oral species table, writing the matches to a FASTA file and a CSV.
# Record descriptions are pipe-delimited with the species name in the second field.

# Build the lookup once: set membership is O(1), and it avoids re-creating
# a list from the DataFrame column for every FASTA record.
target_binomials = set(df2write.species_binomial)

res = []
# Both files are opened in one context manager so the input handle is closed too
# (previously the handle passed to SeqIO.parse was never closed).
with open("../HOMD/HOMD.fasta") as in_file, open("fHOMD.fasta", "w") as out_file:
    fasta_sequences = SeqIO.parse(in_file, 'fasta')
    for fasta in fasta_sequences:
        name, description, sequence = fasta.id, fasta.description, str(fasta.seq)
        species = description.split('|')[1].strip()

        # remove parenthesised / bracketed annotations together with their content
        species = re.sub(r"\((.*?)\)", "", species)
        species = re.sub(r"\[(.*?)\]", "", species)

        # binomial name = first two whitespace-separated tokens, without a trailing '.'
        # (raw string: '\s' in a plain string literal is an invalid escape sequence)
        species_binomial = ' '.join(re.split(r'\s+', species)[0:2]).rstrip('.')
        if species_binomial in target_binomials:
            out_file.write(">%s\n" % description)
            out_file.write("%s\n" % sequence)

            res.append([name, description, sequence])

df_homd = pd.DataFrame(res, columns=['Subject_SeqID','Subject_SeqDescription','Subject_Sequence'])
df_homd.to_csv("fHOMD_text.csv", index=False)
df_homd.head()

Unnamed: 0,Subject_SeqID,Subject_SeqDescription,Subject_Sequence
0,623_4320,623_4320 | Campylobacter gracilis | HMT-623 | ...,AGTGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAACGGAATT...
1,575_6977,575_6977 | Campylobacter concisus | HMT-575 | ...,AGTGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAACGGACAA...
2,763_6974,763_6974 | Campylobacter showae | HMT-763 | St...,AGTGAACGCTGGCGGCGTGCCTAATACATGCAAGTCGAACGGAGAT...
3,325_1347,325_1347 | Capnocytophaga granulosa | HMT-325 ...,GATGAACGCTAGCGGCAGGCCTAACACATGCAAGTCGAGGGAGAAG...
4,325AH015,325AH015 | Capnocytophaga granulosa | HMT-325 ...,GATGAACGCTAGCGGCAGGCCTAACACATGCAAGTCGAGGGAGAAG...


In [41]:
# Display the number of HOMD sequences that passed the species filter.
len(df_homd)

210