# Microbiota analysis
## Author: Tijs van Lieshout

### Import statements:

In [30]:
import pandas as pd
import glob
from IPython.display import display
from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from bokeh.models import Range1d
from bokeh.io import output_notebook
# Configure Bokeh so that plots passed to show() are rendered inline in the
# notebook's output cells (see the bokeh.io.output_notebook documentation).
output_notebook()

### Loading the data:

In [31]:
def load_data_microbiota(PATH):
    """Load microbiota data (Gut Feeling Knowledge Base and MetaPhlAn output) into pandas dataframes

    Keyword arguments:
    PATH -- The path which contains the Gut Feeling Knowledge Base CSV and the
            metaphlan_output dir

    Returns:
    gfkb -- A pandas dataframe containing the Gut Feeling Knowledge Base. The two
            "Present in ... v3 (Y/N)" columns are dropped, every remaining column
            is cast to str, except "Genome Size (Mb)" which keeps its parsed
            dtype and is moved to the last position
    barcode2tax_profile -- dictionary containing barcode IDs as keys and tax_profile
            dataframes as values. The barcode ID is the file name of the profile
            up to (not including) the first "_all"
    """
    # Local import keeps this cell self-contained; os is only needed to take
    # the file name of each profile below.
    import os

    gfkb = pd.read_csv(f"{PATH}/GutFeelingKnowledgeBase-v4-Master_List.csv")
    gfkb = gfkb.drop(columns=["Present in GFKB v3 (Y/N)",
                              "Present in GFKB_epilepsy v3 (Y/N)"])

    # Everything except the genome size is treated as text; the genome size
    # column is re-attached afterwards so it stays as read by read_csv.
    genome_size = gfkb["Genome Size (Mb)"]
    text_columns = gfkb.drop(columns=["Genome Size (Mb)"]).astype(str)
    gfkb = pd.concat([text_columns, genome_size], axis=1)

    barcode2tax_profile = {}

    # glob returns files in arbitrary order; sort for a reproducible dict order.
    for file in sorted(glob.glob(f"{PATH}/metaphlan_output/*.txt")):
        # '#' marks comments, so header lines beginning with it are skipped and
        # the column names are supplied explicitly instead.
        tax_profile = pd.read_csv(file,
                                  comment="#",
                                  sep="\t",
                                  names=["clade_name",
                                         "NCBI_tax_id",
                                         "relative_abundance",
                                         "additional_species"])
        # Use the file name only: splitting the full path on
        # 'metaphlan_output/' breaks when PATH itself contains that text or
        # when the platform uses a different path separator.
        barcode = os.path.basename(file).split("_all")[0]
        barcode2tax_profile[barcode] = tax_profile

    return gfkb, barcode2tax_profile

### Recreating the plot from Zimmer et al. 2012
<img src="../microbiota_tax_data/zimmer_species_abundance_plot.png" alt="Zimmer et al. 2012" width="400"/>

In [32]:
def recreate_zimmer(barcode2tax_profile):
    """Select, per barcode, the taxa that were routinely analysed by Zimmer et al. 2012

    Keyword arguments:
    barcode2tax_profile -- A dictionary containing barcode IDs as keys and tax_profile dataframes as values

    Returns:
    barcode2zimmer_subset -- dictionary containing barcode IDs as keys and, as values, the rows of
    the tax_profile whose clade_name ends with one of the Zimmer et al. 2012 taxa. Rows are grouped
    per taxon in the order Bacteroides, Bifidobacterium, Escherichia coli, Enterobacteriaceae

    Side effects:
    Prints the clade_name of every Enterobacteriaceae row found, per barcode
    """
    # Clade-name endings of the taxa of interest; the order fixes the row
    # order of each resulting subset.
    zimmer_suffixes = ("g__Bacteroides",
                       "g__Bifidobacterium",
                       "s__Escherichia_coli",
                       "f__Enterobacteriaceae")

    barcode2zimmer_subset = {}

    for barcode, profile in barcode2tax_profile.items():
        clade_names = profile.clade_name
        per_taxon = [profile[clade_names.str.endswith(suffix)]
                     for suffix in zimmer_suffixes]

        barcode2zimmer_subset[barcode] = pd.concat(per_taxon)

        # The last entry is the family-level Enterobacteriaceae selection.
        for clade in per_taxon[-1].clade_name:
            print(clade)

    return barcode2zimmer_subset

In [33]:
def main():
    """Load the microbiota data and build the Zimmer et al. 2012 subsets.

    The knowledge base dataframe and the resulting subsets are not used any
    further here; the visible effect is whatever recreate_zimmer prints.
    """
    data_dir = "../microbiota_tax_data"
    _gfkb, profiles = load_data_microbiota(data_dir)
    recreate_zimmer(profiles)


if __name__ == "__main__":
    main()

k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae
