# Microbiota analysis
## Author: Tijs van Lieshout

### Import statements:

In [60]:
import pandas as pd
import glob
from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from bokeh.models import Range1d
from bokeh.io import output_notebook
output_notebook()

### Loading the data:

In [86]:
def load_data_microbiota(PATH):
    """Load microbiota data (Gut Feeling Knowledge Base and MetaPhlAn output) into pandas
    dataframes

    The knowledge base is read from
    ``{PATH}/GutFeelingKnowledgeBase-v4-Master_List.csv``. The two
    "Present in ... v3 (Y/N)" columns are dropped, every remaining column except
    "Genome Size (Mb)" is cast to ``str``, and "Genome Size (Mb)" is kept as parsed
    and placed as the last column.

    The taxonomic profiles are read from every ``*.txt`` file in
    ``{PATH}/metaphlan_output``. Files are processed in sorted path order so the
    order of the returned list is reproducible (``glob`` itself gives no ordering
    guarantee).

    Keyword arguments:
    PATH -- The path which contains the Gut Feeling Knowledge Base and MetaPhlAn output dir

    Returns:
    gfkb -- A pandas dataframe containing the Gut Feeling Knowledge Base
    tax_profiles -- A list of pandas dataframes containing the taxonomic profile per
                    input file, ordered by file path (empty if no files match)

    Raises:
    FileNotFoundError -- If the knowledge base CSV does not exist
    KeyError -- If one of the expected knowledge base columns is missing
    """
    genome_size_column = "Genome Size (Mb)"

    gfkb = pd.read_csv(f"{PATH}/GutFeelingKnowledgeBase-v4-Master_List.csv")
    gfkb = gfkb.drop(columns=["Present in GFKB v3 (Y/N)",
                              "Present in GFKB_epilepsy v3 (Y/N)"])
    # Cast everything to str except the genome size, which keeps its parsed dtype
    # and ends up as the right-most column.
    gfkb = pd.concat([gfkb.drop(columns=[genome_size_column]).astype(str),
                      gfkb[genome_size_column]], axis=1)

    tax_profiles = []
    # sorted(): glob returns paths in arbitrary, filesystem-dependent order.
    for file in sorted(glob.glob(f"{PATH}/metaphlan_output/*.txt")):
        # comment="#": everything from a '#' to the end of a line is ignored, so
        # '#'-prefixed header lines are skipped and the names below are used instead.
        tax_profile = pd.read_csv(file,
                                  comment="#",
                                  sep="\t",
                                  names=["clade_name",
                                         "NCBI_tax_id",
                                         "relative_abundance",
                                         "additional_species"])
        tax_profiles.append(tax_profile)

    return gfkb, tax_profiles

### Plotting data

In [100]:
def recreate_zimmer(tax_profiles):
    """Print the rows of each taxonomic profile whose clade name contains "Bacteroides"

    Rows with a missing clade name are treated as non-matching instead of
    producing a NaN mask, which boolean indexing may reject.

    Keyword arguments:
    tax_profiles -- An iterable of pandas dataframes, each with a string column
                    "clade_name"

    Returns:
    None -- The matching rows are only printed, one dataframe per profile
    """
    # NOTE(review): the name suggests this is meant to reproduce a published
    # analysis; at present it only prints the matching rows -- confirm intent.
    for tax_profile in tax_profiles:
        # regex=False: match the genus name literally; na=False: missing names
        # count as "no match" so the mask is strictly boolean.
        is_bacteroides = tax_profile.clade_name.str.contains("Bacteroides",
                                                             regex=False,
                                                             na=False)
        tax_of_interest = tax_profile[is_bacteroides]
        print(tax_of_interest)

In [101]:
def main():
    """Load the microbiota data from the relative data directory and print the
    Bacteroides rows of every taxonomic profile."""
    data_dir = "../microbiota_tax_data"
    # The knowledge base is loaded alongside the profiles but not used here yet.
    knowledge_base, profiles = load_data_microbiota(data_dir)
    recreate_zimmer(profiles)

# Run the analysis only when this code is executed directly, not when imported.
if __name__ == '__main__':
    main()

                                           clade_name  \
34  k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...   
49  k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...   
54  k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...   
55  k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...   

                           NCBI_tax_id  relative_abundance  \
34         2|976|200643|171549|815|816           11.142482   
49     2|976|200643|171549|815|816|820            8.483781   
54  2|976|200643|171549|815|816|357276            1.393951   
55   2|976|200643|171549|815|816|46506            1.264750   

                                   additional_species  
34                                                NaN  
49  k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...  
54  k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...  
55                                                NaN  
Empty DataFrame
Columns: [clade_name, NCBI_tax_id, relative_abundance, additional_species]
Index: []
Empty DataFrame
Column