In [44]:
# imports
import os
import sys
import re
import glob
from math import pi
from datetime import datetime, timedelta
from dotmap import DotMap
import yaml

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

import panel as pn
from panel.template import DarkTheme
from bokeh.io import output_file, show
from bokeh.models import (ColumnDataSource, HoverTool, Label, Legend, 
                          Div, LabelSet, FuncTickFormatter, CustomJS)
from bokeh.models.widgets import Panel, Tabs
from bokeh.models.tickers import FixedTicker
from bokeh.plotting import figure
from bokeh.transform import dodge, cumsum
from bokeh.layouts import row

In [45]:
def get_config():
    '''
    Function that loads the personalized variable definitions

    Reads the file "config.yaml" (resolved relative to the current working
    directory) and parses it with the safe YAML loader.

    Returns
    --------------
    object
        The parsed content of config.yaml
    '''
    with open("config.yaml", 'r') as config_file:
        return yaml.safe_load(config_file)

In [46]:
def get_styling():
    '''
    Function that returns the custom CSS rules of the dashboard

    The stylesheet sets the primary theme colour and font family and styles
    the select boxes, checkboxes and plot description paragraphs.

    Returns
    --------------
    str
        The raw CSS text
    '''
    return '''
    :root {
        --mdc-theme-primary: rgba(0, 168, 65, 0.82) !important;
    }
    .bk-root .bk, .bk-root .bk:before, .bk-root .bk:after {
        font-family: Avenir Next, Helvetica !important;
    }
    
    .selectDiv select {
      border-color: rgba(0, 168, 65, 0.82) !important;
      border-radius: 3px !important;
      background-image: url() !important;
      background-size: 10px 10px !important;
    }
    
    .selectDiv label {
        font-weight: bold;
    }
    .checkboxLabel {
        left: 10px !important;
    }
    
    .inputElement input[type="checkbox"] {
        width: 15px;
        height: 15px;
        border: 1px solid rgba(0, 168, 65, 0.82);
        border-radius: 2px;
        -webkit-appearance: none;
        -webkit-transition: box-shadow 200ms;
    }
    .inputElement input[type="checkbox"]:checked {
        background-color: rgba(0, 168, 65, 0.82);
    }
    
    .plotDescription p {
        text-align: justify;
    }
    '''

In [47]:
def preprocess_intake_files(self, subject):
    '''
    Function that preprocesses the intake files from the given subject to the correct formatting

    Every CSV file located directly in the configured food intake directory
    whose file name contains 'subject_<subject>' is read. The first nine lines
    of each file are skipped and only the lines whose first field is a day of
    the week are kept. Lines from files named after week1 or week2 are written
    to the baseline file, all other lines to the vegan file. Both output files
    are (re)created in the 'processed' sub directory on every call.

    Parameters
    --------------
    self : object
        Object with a ``config`` mapping that holds 'food_intake_files_path'
    subject : chr
        The given subject's initial

    Returns
    --------------
    list
        baseline_filename    the filename of the created baseline intake file
        vegan_filename       the filename of the created vegan intake file
    '''

    days = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
    baseline_weeks = ["week1", "week2"]
    header = "Day,Date,Year,Calories,Fat,Saturated fat,Carbs,Fiber,Sugar,Protein,Sodium,Cholesterol,Potassium\n"
    # the food log itself starts at line 10 of every intake file
    skipped_lines = 9

    food_intake_files_path = self.config['food_intake_files_path']
    processed_files_path = food_intake_files_path + '/processed'
    os.makedirs(processed_files_path, exist_ok=True)

    baseline_filename = processed_files_path + '/nutrition_baseline_subject_' + subject + '.csv'
    vegan_filename = processed_files_path + '/nutrition_vegan_subject_' + subject + '.csv'

    # open both outputs inside one with-statement so neither handle can leak
    with open(baseline_filename, 'w') as n_baseline, open(vegan_filename, 'w') as n_vegan:
        n_baseline.write(header)
        n_vegan.write(header)

        # sort the entries so the row order of the output does not depend on the file system
        with os.scandir(food_intake_files_path) as entries:
            intake_files = sorted((entry for entry in entries if entry.is_file()),
                                  key=lambda entry: entry.name)

        for entry in intake_files:
            # match on the file name only: a parent directory that happens to contain
            # 'subject_X' or 'week1' must not influence the selection
            if ('subject_' + subject) not in entry.name or not entry.name.lower().endswith('.csv'):
                continue

            is_baseline = any(week in entry.name.lower() for week in baseline_weeks)
            target = n_baseline if is_baseline else n_vegan

            with open(entry.path, 'r') as food_intake:
                for line_number, line in enumerate(food_intake):
                    # a file shorter than the preamble simply yields no rows
                    if line_number < skipped_lines:
                        continue
                    # replace double quotes with single quotes
                    index = 1 if '""' in line else 0
                    line = line.strip().replace('""', '"')[index:].rstrip('"')
                    # get the nutrition details per day
                    if line.split(',')[0].strip('"').lower() in days:
                        target.write(line + '\n')

    return [baseline_filename, vegan_filename]

In [48]:
def create_nutrition_tables(self, subject):
    '''
    Function that creates the food intake data table for the given subject

    The preprocessed baseline and vegan intake files are cleaned separately,
    labelled with their diet ('Normal' or 'Vegan') and combined into a single
    table that is indexed and sorted by date.

    Parameters
    --------------
    self : object
        Object that is handed on to preprocess_intake_files
    subject : chr
        The given subject's initial

    Returns
    --------------
    DataFrame
        nutrition_data    food intake data table with one row per logged day
    '''

    baseline_filename, vegan_filename = preprocess_intake_files(self, subject)
    base_data = _load_diet_table(baseline_filename, 'Normal')
    vegan_data = _load_diet_table(vegan_filename, 'Vegan')

    # concat diets and set date as index column
    nutrition_data = pd.concat([base_data, vegan_data])
    nutrition_data['Date'] = nutrition_data['Date'].astype(str) + ' ' + nutrition_data['Year'].astype(str)
    # the leading space in the format matches the space that follows the comma in the raw date field
    nutrition_data['Date'] = pd.to_datetime(nutrition_data['Date'], format=' %B %d %Y')
    nutrition_data = nutrition_data.drop(columns=['Year']).set_index('Date').sort_index()

    return nutrition_data


def _load_diet_table(filename, diet):
    '''
    Helper that reads one preprocessed intake file and cleans its nutrient columns
    '''
    convert_columns = ['Calories', 'Fat', 'Saturated fat', 'Carbs',
                       'Fiber', 'Sugar', 'Protein',
                       'Sodium', 'Cholesterol', 'Potassium']
    milligram_columns = ['Sodium', 'Cholesterol', 'Potassium']
    milligrams_per_gram = 1000

    table = pd.read_csv(filename, sep=',')

    # convert commas to points as delimiter and convert to numeric values
    table[convert_columns] = table[convert_columns].replace(',', '.', regex=True)
    table[convert_columns] = table[convert_columns].apply(pd.to_numeric, errors='coerce')
    table['Calories'] = table['Calories'].astype(float)

    # convert sodium, cholesterol and potassium to grams (they are initially in mg)
    table[milligram_columns] = table[milligram_columns] / milligrams_per_gram

    # saturated fat is part of the total fat, so a larger value is treated as
    # a logging error and reset to 0 for that day
    table.loc[table['Saturated fat'] > table['Fat'], 'Saturated fat'] = 0

    # add the diet definition column
    table['Diet'] = diet
    return table

In [49]:
def create_nutrition_boxplot(subject, nutrition_data):
    '''
    Function that creates the nutrition boxplots for the given subject
    
    One boxplot tab is built per nutrient group (calories, macronutrients,
    carbohydrates and minerals). Every tab shows, per nutrient, a box for the
    'Normal' diet rows next to a box for the 'Vegan' diet rows.
    
    Parameters
    --------------
    subject : chr
        The given subject's initial (only used in the plot titles)
    nutrition_data : DataFrame
        The food intake data table of the given subject; it must hold a 'Diet'
        column and one column for every plotted nutrient
    
    Returns
    --------------
    panel Column
        A description text stacked on top of the tabbed boxplot figures
    '''
    tabs = []
    # graph labels and positions
    plot_definitions = ['Calories', 'Macronutrients', 'Carbohydrates', 'Minerals']
    boxplot_data = get_labels_and_positions(plot_definitions)
    
    for group, data in zip(plot_definitions, boxplot_data):
        # unpack the layout that get_labels_and_positions produced for this group
        bar_labels = data[0]; base_positions = data[1]; vegan_positions = data[2]; label_positions = data[3]
        graph_labels = data[4]; outlier_plot_positions = data[5]; outlier_plot_positions_v = data[6]
    
        # every "_v" name below is the vegan counterpart of the baseline variable
        vegan = nutrition_data.loc[nutrition_data['Diet'] == 'Vegan'][bar_labels]
        baseline = nutrition_data.loc[nutrition_data['Diet'] == 'Normal'][bar_labels]

        # transform both dataframes to 2 column dataframe
        baseline_df = pd.DataFrame(columns=['group', 'value'])
        vegan_df = pd.DataFrame(columns=['group', 'value'])
        for label in bar_labels:
            tmp = pd.DataFrame(columns=['group', 'value'])
            tmp_v = pd.DataFrame(columns=['group', 'value'])
            tmp['value'] = baseline[label]
            tmp_v['value'] = vegan[label]
            tmp['group'] = label
            tmp_v['group'] = label
            baseline_df = pd.concat([baseline_df, tmp])
            vegan_df = pd.concat([vegan_df, tmp_v])
        
        # find the quartiles and IQR for each category
        # (sort=False keeps the groups in bar_labels order, matching the x positions)
        groups = baseline_df.groupby('group', sort=False)
        groups_v = vegan_df.groupby('group', sort=False)
        q1 = groups.quantile(q=0.25)
        q1_v = groups_v.quantile(q=0.25)
        q2 = groups.quantile(q=0.5)
        q2_v = groups_v.quantile(q=0.5)
        q3 = groups.quantile(q=0.75)
        q3_v = groups_v.quantile(q=0.75)
        iqr = q3 - q1
        iqr_v = q3_v - q1_v
        # whisker limits: 1.5 times the IQR beyond the box
        upper = q3 + 1.5*iqr
        upper_v = q3_v + 1.5*iqr_v
        lower = q1 - 1.5*iqr
        lower_v = q1_v - 1.5*iqr_v

        # get the outliers
        out = groups.apply(get_outliers, upper=upper, lower=lower).dropna()
        out_v = groups_v.apply(get_outliers, upper=upper_v, lower=lower_v).dropna()
        
        # prepare outlier data for plotting
        # NOTE(review): the result of apply is handled both as a DataFrame (index is the
        # group label) and as a Series with (group, row) index tuples - presumably the
        # shape depends on the number of groups and the pandas version; confirm
        if not out.empty:
            outx = []
            outy = []
            for keys in out.index:
                if isinstance(out, pd.DataFrame):
                    outx.append(keys)
                    outy.append(out.loc[keys].iloc[0])
                else:
                    outx.append(keys[0])
                    outy.append(out.loc[keys[0]].loc[keys[1]])
        if not out_v.empty:
            outx_v = []
            outy_v = []
            for keys in out_v.index:
                if isinstance(out_v, pd.DataFrame):
                    outx_v.append(keys)
                    outy_v.append(out_v.loc[keys].iloc[0])
                else:
                    outx_v.append(keys[0])
                    outy_v.append(out_v.loc[keys[0]].loc[keys[1]])

        # fixed axis range, title and axis label per nutrient group
        if group == 'Calories':
            y_range = (0, 6000)
            title = "Boxplot representing caloric information of subject {}".format(subject)
            y_axis_label = "Calories (kcal)"
        else:
            y_range = (0, 500) if group == 'Macronutrients' else (0, 300)
            title = "Boxplot representing {} of subject {} ".format(group.lower(), subject)
            y_axis_label = "Grams (g)"
        
        p = figure(tools="", toolbar_location=None, 
                   plot_height=500, plot_width=500, y_range=y_range, title=title,
                   x_axis_label="Nutrient", y_axis_label=y_axis_label)

        # if no outliers, shrink lengths of stems to be no longer than the minimums or maximums
        # (this overwrites upper/lower, which is safe because the outliers were selected above)
        qmin = groups.quantile(q=0.00)
        qmin_v = groups_v.quantile(q=0.00)
        qmax = groups.quantile(q=1.00)
        qmax_v = groups_v.quantile(q=1.00)
        upper.value = [min([x,y]) for (x,y) in zip(list(qmax.loc[:,'value']),upper.value)]    
        upper_v.value = [min([x,y]) for (x,y) in zip(list(qmax_v.loc[:,'value']),upper_v.value)]
        lower.value = [max([x,y]) for (x,y) in zip(list(qmin.loc[:,'value']),lower.value)]
        lower_v.value = [max([x,y]) for (x,y) in zip(list(qmin_v.loc[:,'value']),lower_v.value)]

        # stems
        p.segment(base_positions, upper.value, base_positions, q3.value, line_color="black")
        p.segment(base_positions, lower.value, base_positions, q1.value, line_color="black")
        p.segment(vegan_positions, upper_v.value, vegan_positions, q3_v.value, line_color="black")
        p.segment(vegan_positions, lower_v.value, vegan_positions, q1_v.value, line_color="black")

        # boxes (drawn as two bars per box: median to Q3 and Q1 to median)
        p.vbar(base_positions, 0.4, q2.value, q3.value, fill_color="#abdfff", line_color="black", legend_label="Normal diet")
        p.vbar(base_positions, 0.4, q1.value, q2.value, fill_color="#abdfff", line_color="black")
        p.vbar(vegan_positions, 0.4, q2_v.value, q3_v.value, fill_color="#ceffc4", line_color="black", legend_label="Vegan diet")
        p.vbar(vegan_positions, 0.4, q1_v.value, q2_v.value, fill_color="#ceffc4", line_color="black")

        # whiskers
        p.rect(base_positions, lower.value, 0.2, 0.01, line_color="black")
        p.rect(base_positions, upper.value, 0.2, 0.01, line_color="black")
        p.rect(vegan_positions, lower_v.value, 0.2, 0.01, line_color="black")
        p.rect(vegan_positions, upper_v.value, 0.2, 0.01, line_color="black")

        # outliers (the nutrient name of every outlier is mapped to the x position of its box)
        if not out.empty:
            p.circle([outlier_plot_positions[x] for x in outx], outy, size=3, color="#F38630", fill_alpha=0.6)
        if not out_v.empty:
            p.circle([outlier_plot_positions_v[x] for x in outx_v], outy_v, size=3, color="#F38630", fill_alpha=0.6)

        p.xgrid.grid_line_color = None
        p.xaxis.major_label_text_font_size = "10px"
        p.xaxis.major_label_orientation = 0.5
        p.xaxis.ticker = label_positions

        # set labels at the tick positions
        # (graph_labels is the text of a Python dict that is pasted into the JavaScript as the mapping)
        p.xaxis.formatter = FuncTickFormatter(code="""
                var mapping = {};
                return mapping[tick];
            """.format(graph_labels))
        
        tab = Panel(child=p, title=group)
        tabs.append(tab)
    
    div = Div(text="""
        <p><b>Food intake boxplots</b><br>
        Boxplots are a standardized way of displaying the distribution of data based on a five number summary 
        (“minimum”, first quartile (Q1), median, third quartile (Q3), and “maximum”)</p>
    """, width=500, height=90, css_classes=['plotDescription'])
    
    tabs = pn.Column(div, Tabs(tabs=tabs))
    return tabs

def get_labels_and_positions(columns):
    '''
    Function that creates the labels and positions of the different nutrition boxplots

    Parameters
    --------------
    columns : list
        The nutrient groups to create a layout for; every entry has to be one of
        'Calories', 'Macronutrients', 'Carbohydrates' or 'Minerals'

    Returns
    --------------
    list
        One list per requested group, in the same order, holding:
        bar_labels, base_positions, vegan_positions, label_positions,
        graph_labels (the tick position to label mapping as text),
        outlier_plot_positions and outlier_plot_positions_v
        (nutrient to x position of the baseline / vegan box)

    Raises
    --------------
    ValueError
        If an entry of columns is not a known nutrient group
    '''
    group_layouts = {
        'Calories': (['Calories'], [0.5]),
        'Macronutrients': (['Carbs', 'Protein', 'Fat', 'Saturated fat'], [0.5, 2.5, 4.5, 6.5]),
        'Carbohydrates': (['Fiber', 'Sugar'], [0.5, 2.5]),
        'Minerals': (['Sodium', 'Cholesterol', 'Potassium'], [0.5, 2.5, 4.5]),
    }

    boxplot_data = []
    for label in columns:
        if label not in group_layouts:
            # an unknown group used to reuse the layout of the previous group silently
            raise ValueError("Unknown nutrient group: {!r}".format(label))

        # copy the lists so every returned entry owns its own data
        bar_labels = list(group_layouts[label][0])
        base_positions = list(group_layouts[label][1])

        # the vegan box sits right of the baseline box, the tick label in between both
        vegan_positions = [pos+0.6 for pos in base_positions]
        label_positions = [pos+0.3 for pos in base_positions]
        graph_labels = str(dict(zip(label_positions, bar_labels)))
        outlier_plot_positions = dict(zip(bar_labels, base_positions))
        outlier_plot_positions_v = dict(zip(bar_labels, vegan_positions))
        boxplot_data.append([bar_labels, base_positions, vegan_positions, label_positions,
                             graph_labels, outlier_plot_positions, outlier_plot_positions_v])
    return boxplot_data

def get_outliers(group, upper, lower):
    '''
    Function that selects the outlier values of one category

    The category is read from ``group.name``; its limits are looked up in the
    'value' column of the upper and lower tables. Returns the 'value' entries
    of the group that lie above the upper or below the lower limit.
    '''
    category = group.name
    upper_limit = upper.loc[category]['value']
    lower_limit = lower.loc[category]['value']

    too_high = group.value > upper_limit
    too_low = group.value < lower_limit
    return group[too_high | too_low]['value']

In [50]:
def create_average_nutrition_figure(subject, nutrition_data):
    '''
    Function that creates the average nutrition graph for the given subject
    
    Parameters
    --------------
    subject : chr
        The given subject's initial
    nutrition_data : DataFrame
        The food intake data table of the given subject; it must hold rows
        for both the 'Normal' and the 'Vegan' diet
    
    Returns
    --------------
    bokeh row layout
        The grouped bar chart of the average nutrient values per diet next to
        a text block with the average calories per diet
    '''
    
    # calculate the average nutritional values
    # (numeric_only skips text columns such as 'Day', which pandas >= 2.0 refuses to average)
    averages = nutrition_data.groupby(['Diet']).mean(numeric_only=True).round(2)
    
    bar_labels = ['Carbs', 'Protein', 'Fat', 
                 'Saturated fat', 'Fiber', 'Sugar', 
                 'Sodium', 'Cholesterol', 'Potassium']
    data_obj = {
            'labels': bar_labels,
            'Normal': averages[bar_labels].loc['Normal'],
            'Vegan': averages[bar_labels].loc['Vegan'],
            'None' : [0 for _ in bar_labels]
        }
    source = ColumnDataSource(data=data_obj)
    
    p = figure(x_range=bar_labels, y_range=(0, 400), plot_height=500, plot_width=600,
               title="Average nutritional information subject " + subject + " in grams",
               x_axis_label="Nutrient", y_axis_label="Grams (g)", 
               toolbar_location=None, tools="")
    
    # the two diet bars are shifted left and right of the nutrient tick
    p.vbar(x=dodge('labels', -0.12, range=p.x_range), 
           top='Normal', name='Normal', width=0.2, 
           source=source, color="#abdfff", legend_label="Normal diet", line_color="#75cbff")

    # middle label (a zero height bar placed exactly on the tick)
    p.vbar(x=dodge('labels',  0,  range=p.x_range), top='None', width=0.1, source=source)
    
    p.vbar(x=dodge('labels',  0.12,  range=p.x_range), 
           top='Vegan', name='Vegan', width=0.2, 
           source=source, color="#ceffc4", legend_label="Vegan diet", line_color="#8dff75")
    
    p.x_range.range_padding = 0.05
    p.xgrid.grid_line_color = None
    p.legend.location = "top_right"
    p.legend.orientation = "vertical"
    p.xaxis.major_label_orientation = 0.5
    p.xaxis.major_label_text_font_size = "10px"
    
    # the hover tool is limited to the two diet bars; $name resolves to the name of
    # the hovered bar, so '@$name' reads the column of that diet
    p.add_tools(HoverTool(
        names = ['Normal', 'Vegan'],
        tooltips = [
            ('', '$name diet'),
            ('', '@$name{1.11} grams')
        ],
        mode = 'mouse',
        show_arrow = False,
        point_policy = 'follow_mouse'
    ))
    
    style = {
        'font-size': 'smaller',
        'margin-top': '20px'
    }
    div = Div(text="""
        <p><b>Average calories:</b><br>
        Normal diet: {} kcal <br>
        Vegan diet:  {} kcal</p>
        """.format(averages['Calories'].loc['Normal'], averages['Calories'].loc['Vegan']),
              width=200, height=100, style=style)
    return row(p, div)

In [51]:
def create_nutrition_piecharts(subject, nutrition_data):
    '''
    Function that creates the food intake pie charts per week for the given subject
    
    For every week in the weekly averages a tab with three pie charts is made:
    the macronutrients, sugar versus fiber and sodium versus potassium.
    
    Parameters
    --------------
    subject : chr
        The given subject's initial (not used by the function itself)
    nutrition_data : DataFrame
        The food intake data table
    
    Returns
    --------------
    panel Column
        A description text stacked on top of the tab-separated intake pie charts
    '''
    
    weekly_averages = get_weekly_averages(nutrition_data)
    
    # per chart: the plotted nutrient columns, the slice colors and the title
    chart_definitions = [
        (['Carbs', 'Protein', 'Fat'], ['#266298', '#89c609', '#fc2a35'],
         'Average calorie composition'),
        (['Sugar', 'Fiber'], ['#f9dda9', '#ac545c'],
         'Average sugar and fiber devision'),
        (['Sodium', 'Potassium'], ['#a21d22', '#f6871e'],
         'Average sodium and potassium devision'),
    ]
    
    week_tabs = []
    for week_index in range(len(weekly_averages.index)):
        week_charts = [
            get_piechart_data(weekly_averages[nutrients].iloc[week_index], colors, chart_title)
            for nutrients, colors, chart_title in chart_definitions
        ]
        week_tabs.append(Panel(child=row(*week_charts), title="Week " + str(week_index + 1)))
    
    description = Div(text="""
        <p><b>Food intake pie charts</b><br>
        The pieces of a pie chart are proportional to the fraction of the whole in each category. 
        In other words, each slice of the pie is relative to the size of that category in the group as a whole. 
        The entire “pie” represents 100% of a whole, while the pie “slices” represent portions of the whole.</p>
    """, width=500, height=100, css_classes=['plotDescription'])
    
    return pn.Column(description, Tabs(tabs=week_tabs))

In [52]:
def get_piechart_data(nutrients, nutrient_colors, chart_description):
    '''
    Function that creates a food intake pie chart for the given nutrient values
    
    Parameters
    --------------
    nutrients : Series
        The nutrient values, labelled by nutrient name
    nutrient_colors : list
        The slice colors; the first len(nutrients) entries are used
    chart_description : string
        The title of the pie chart graph
    
    Returns
    --------------
    figure
        The generated pie chart
    '''
    
    radius = 0.35
    axis_range = (-radius * 1.1, radius * 1.5)
    figure_size = 350
    
    # one row per nutrient with the columns 'nutrient' and 'value'
    data = (pd.Series(nutrients.to_dict())
            .reset_index(name='value')
            .rename(columns={'index': 'nutrient'}))
    share = data['value'] / data['value'].sum()
    
    # every slice ends at the cumulative share of the circle and starts where the previous one ended
    data['end_angle'] = np.cumsum(share * 2 * pi)
    data['start_angle'] = np.pad(data['end_angle'], (1, 0), mode='constant')[:-1]
    # the percentage labels are anchored on the start edge of their slice, just inside the circle
    data['label_x'] = radius * 0.9 * np.cos(data['start_angle'])
    data['label_y'] = radius * 0.95 * np.sin(data['start_angle'])
    data['percentage'] = (share * 100).round(2).astype(str) + '%'
    data['color'] = nutrient_colors[:len(nutrients)]

    p = figure(title=chart_description, toolbar_location=None,
               plot_height=figure_size, plot_width=figure_size,
               x_range=axis_range, y_range=axis_range,
               tools="")
    p.wedge(x=0, y=0, radius=radius,
            start_angle='start_angle', end_angle='end_angle',
            line_color="white", fill_color='color', legend_field='nutrient', source=data)

    label_source = ColumnDataSource(data)
    percentage_labels = LabelSet(x='label_x', y='label_y', text='percentage',
                                 angle='start_angle', source=label_source, render_mode='canvas',
                                 text_align='right', text_font_size="9pt", text_color='white')
    p.add_layout(percentage_labels)

    # a pie chart needs neither axes nor a grid
    p.axis.axis_label = None
    p.axis.visible = False
    p.grid.grid_line_color = None
    p.legend.label_text_font_size = '9pt'
    
    return p

In [53]:
def get_weekly_averages(nutrition_data):
    '''
    Function that calculates the weekly average nutrient values

    A new week starts at every row whose 'Day' is a Monday; rows before the
    first Monday end up in week 0.

    Note that the 'week_number' column is added to the given table itself,
    create_nutrition_dataframe expects to find it there.

    Parameters
    --------------
    nutrition_data : DataFrame
        The food intake data table in chronological order, holding a 'Day' column

    Returns
    --------------
    DataFrame
        The average of every numeric column per week number, rounded to 2 decimals
    '''

    nutrition_data['week_number'] = (nutrition_data.Day.str.lower() == 'monday').cumsum()
    # numeric_only skips the text columns ('Day', 'Diet'), which pandas >= 2.0 refuses to average
    return nutrition_data.groupby('week_number').mean(numeric_only=True).round(2)

In [54]:
def create_nutrition_dataframe(nutrition_data):
    '''
    Function that creates the food intake table widget

    The table lists the logged days followed by a 'Mean' and a 'Median' row.
    The first two columns of the data table are shown as they are, the
    nutrient columns are ordered and get the suffix ' (g)'.

    Parameters
    --------------
    nutrition_data : DataFrame
        The food intake data table; the 'Diet' and 'week_number' helper
        columns are left out when present

    Returns
    --------------
    panel DataFrame widget
        The table widget with the two summary rows frozen at the bottom
    '''
    column_order = ['Carbs', 'Protein', 'Fat', 
                 'Saturated fat', 'Fiber', 'Sugar', 
                 'Sodium', 'Cholesterol', 'Potassium']
    
    # 'week_number' only exists once get_weekly_averages has run on the table
    weekly_df = nutrition_data.drop(columns=['Diet', 'week_number'], errors='ignore')
    weekly_df.index = weekly_df.index.astype(str)
    
    # add the mean and the median
    # (numeric_only skips text columns such as 'Day', which pandas >= 2.0 refuses to aggregate)
    mean_row = weekly_df.mean(numeric_only=True).round(2).to_frame('Mean').T
    median_row = weekly_df.median(numeric_only=True).round(2).to_frame('Median').T
    weekly_df = pd.concat([weekly_df.round(2), mean_row, median_row])
    # the summary rows have no value in the text columns
    weekly_df = weekly_df.fillna('')
    weekly_df = pd.concat([weekly_df.iloc[:, 0:2], weekly_df[column_order].add_suffix(' (g)')], axis=1)
    
    return pn.widgets.DataFrame(weekly_df, autosize_mode='none', frozen_rows=-2,
                                width=1050, widths=86, height=350)

# Microbiota

In [55]:
def load_data_microbiota(PATH):
    """Load the microbiota data (Gut Feeling Knowledge Base and MetaPhlAn output) into pandas 
    dataframes

    Keyword arguments:
    PATH -- The path which contains the Gut Feeling Knowledge Base and the MetaPhlAn output dir
    
    Returns:
    gfkb -- A pandas dataframe containing the Gut Feeling Knowledge Base
    tax_profiles -- A pandas dataframe containing the taxonomic profile per barcode for all barcodes"""
    
    genome_size_column = "Genome Size (Mb)"

    # Gut Feeling Knowledge Base: every column is turned into text, except for the
    # genome size, which keeps its type and moves to the last position
    gfkb = pd.read_csv(f"{PATH}/GutFeelingKnowledgeBase-v4-Master_List.csv")
    gfkb = gfkb.drop(columns=["Present in GFKB v3 (Y/N)",
                              "Present in GFKB_epilepsy v3 (Y/N)"])
    text_columns = gfkb.drop(columns=[genome_size_column]).apply(lambda column: column.astype(str))
    gfkb = pd.concat([text_columns, gfkb[genome_size_column]], axis=1)

    # Dictionary containing barcode ID as key and a list with subject ID and bool if vegan as values:
    # barcode_01 to barcode_05 are the vegan samples of subject A to E,
    # barcode_06 to barcode_10 the non-vegan samples of the same subjects
    subjects = ["A", "B", "C", "D", "E"]
    barcode2subject_sample = {}
    for first_barcode, is_vegan in ((1, True), (len(subjects) + 1, False)):
        for barcode_number, subject in enumerate(subjects, start=first_barcode):
            barcode2subject_sample[f"barcode_{barcode_number:02d}"] = [subject, is_vegan]
    
    # Concatenate all taxonomic profiles of all barcodes to one dataframe
    tax_profiles = concat_tax_profiles(PATH, pd.DataFrame(), barcode2subject_sample)
    
    return gfkb, tax_profiles

In [56]:
def concat_tax_profiles(PATH, tax_profiles, barcode2subject_sample):
    """Concatenate all taxonomic profiles of all given barcodes in the MetaPhlAn output dir
    to one dataframe that is indexed on the multi-index (subject, is_vegan).

    Keyword arguments:
    PATH -- The path which contains the Gut Feeling Knowledge Base and the MetaPhlAn output dir
    tax_profiles -- A pandas dataframe to which all taxonomic profiles are appended; an empty
    dataframe starts a new collection
    barcode2subject_sample -- Dictionary containing barcode ID as key and a list with subject ID 
    and bool if sample is vegan as values
    
    Returns:
    tax_profiles -- A pandas dataframe containing the taxonomic profile per barcode for all barcodes

    Raises:
    FileNotFoundError -- If there is no profile to concatenate at all
    KeyError -- If the barcode of a profile file is missing in barcode2subject_sample"""
    
    # Taxonomic levels in the order in which they appear in a clade name, with their prefix
    level_prefixes = {"kingdom": "k__",
                      "phylum": "p__",
                      "class": "c__",
                      "order": "o__",
                      "family": "f__",
                      "genus": "g__",
                      "species": "s__"}
    position2level = dict(enumerate(level_prefixes))
    
    # Collect all profiles first and concatenate once, instead of growing the dataframe per file
    profiles = [] if tax_profiles.empty else [tax_profiles]
    
    pattern = f"{PATH}/adj_align_output/*.txt"
    for file in sorted(glob.glob(pattern)):
        tax_profile = pd.read_csv(file, 
                                  comment="#", 
                                  sep="\t", 
                                  names=["clade_name", 
                                         "NCBI_tax_id", 
                                         "relative_abundance",
                                         "additional_species"])
        # Splitting clade_name into taxonomic levels
        levels = tax_profile["clade_name"].str.split('|', expand=True).rename(columns=position2level)
        tax_profile = tax_profile.join(levels, how='left')
        
        # Indexing; the barcode is the part of the file name in front of "_all".
        # basename is used so the path separator of the platform does not matter
        barcode = os.path.basename(file).split("_all")[0]
        tax_profile["subject"] = barcode2subject_sample[barcode][0]
        tax_profile["is_vegan"] = barcode2subject_sample[barcode][1]
        tax_profile = tax_profile.set_index([tax_profile.subject, tax_profile.is_vegan]).sort_index()
        
        profiles.append(tax_profile)
    
    if not profiles:
        raise FileNotFoundError(f"No taxonomic profiles found for pattern {pattern}")
    tax_profiles = pd.concat(profiles)
    
    # Clean up of dataframe: remove the level prefix from the front of every name.
    # str.strip is not suited for this, it removes the single characters of the prefix
    # from both ends and would turn "s__Bacteroides_vulgatus" into "Bacteroides_vulgatu"
    for level, prefix in level_prefixes.items():
        tax_profiles[level] = tax_profiles[level].str.replace(f"^{prefix}", "", regex=True)
    
    tax_profiles = tax_profiles.drop(columns=["clade_name"])
    tax_profiles = tax_profiles[list(level_prefixes) + ["relative_abundance", 
                                                        "NCBI_tax_id", 
                                                        "additional_species"]]
    
    return tax_profiles.sort_index()

In [57]:
def create_column_data_source(taxa_abundance, taxa, add_diff = False):
    """Create a Bokeh ColumnDataSource for the plotting of taxa_abundance data for the taxa in the taxa list.
    
    Keyword arguments:
    taxa_abundance -- A pandas dataframe containing the abundance of taxa for the vegan and regular diet samples 
    for a subject; 'is_vegan' has to be available as grouping key
    taxa -- A list of taxa found in the taxa_abundance columns
    add_diff -- If True, a 'diff' column holding the pair [control, vegan] per taxon is added, in which 
    missing abundances are replaced by 0 (default False)
    
    Returns:
    A bokeh ColumnDataSource containing the taxa, abundance of taxa in control, abundance of taxa in vegan, 
    zeros under the key 'None' for the labels and, on request, the 'diff' column
    """
    # Taking the mean of a single data point just for bokeh to accept the format of the ColumnDataSource
    grouped = taxa_abundance.groupby(['is_vegan']).mean()

    def _abundance(is_vegan):
        # Handling if there are either no datapoints before or after the vegan intervention
        if is_vegan in grouped.index:
            return grouped[taxa].loc[is_vegan]
        return [0 for _ in taxa]

    control = _abundance(False)
    vegan = _abundance(True)

    data = {'taxa': taxa,
            'control': control,
            'vegan': vegan,
            'None': [0 for _ in taxa]}

    if add_diff:
        # Align both diets on the taxa list, so the column gets exactly one entry per taxon,
        # also when one of the diets has no datapoints and is a plain list of zeros
        control_filled = pd.Series(control, index=taxa).fillna(0)
        vegan_filled = pd.Series(vegan, index=taxa).fillna(0)
        data['diff'] = [[control_value, vegan_value]
                        for control_value, vegan_value in zip(control_filled, vegan_filled)]

    return ColumnDataSource(data=data)

In [58]:
def recreate_zimmer(tax_profiles, subject):
    """Recreate a comparison of taxa that have been routinely analysed by Zimmer et al. 2012
    
    Keyword arguments:
    tax_profiles -- A pandas dataframe containing the taxonomic profile per barcode for all barcodes
    subject -- The subject ID to select the profiles of, or "s Pooled" to use the profiles of all subjects
    
    Returns:
    The result of plot_zimmer for the subset of the taxonomic profiles that only contains the taxa 
    analyzed by Zimmer et al. 2012
    """
    if subject != "s Pooled":
        is_subject = tax_profiles.index.get_level_values('subject') == subject
        profiles = tax_profiles[is_subject]
        # the leading space separates the subject ID from the text in front of it
        subject = f" {subject}"
    else:
        profiles = tax_profiles
    
    # A taxon is selected at its own level only: the next, more specific level has to be empty
    selections = [
        # Subset for the Zimmer et al. 2012 bar plot
        (profiles['genus'] == "Bacteroides") & (profiles['species'].isnull()),
        (profiles['genus'] == "Bifidobacterium") & (profiles['species'].isnull()),
        (profiles['species'] == "Escherichia_coli"),
        (profiles['family'] == "Enterobacteriaceae") & (profiles['genus'].isnull()),
        # Other taxa Zimmer et al. 2012 deemed of interest
        (profiles['class'] == "Clostridia") & (profiles['order'].isnull()),
    ]
    zimmer_subset = pd.concat([profiles[selection] for selection in selections]).sort_index()
    
    return plot_zimmer(zimmer_subset, subject)

In [59]:
def plot_zimmer(zimmer_subset, subject):
    """Plot the zimmer subset per subject as a grouped barplot. Based on code by Kylie Keijzer

    Keyword arguments:
    zimmer_subset -- A pandas dataframe containing a subset of taxa of interest of the taxonomic profile per barcode
    subject -- A string that is appended directly after the word "Subject" in the plot title

    Returns:
    A bokeh Tabs object holding a single "Taxa of interest" tab, or None when zimmer_subset
    is empty or none of the taxa of interest are present in it
    """
    if zimmer_subset.empty:
        return None

    # (column name, rank, taxon name, finer rank that has to be unassigned or None)
    selections = [
        # Subset for the Zimmer et al. 2012 bar plot
        ('Bacteroides', 'genus', "Bacteroides", 'species'),
        ('Bifidobacterium', 'genus', "Bifidobacterium", 'species'),
        ('Escherichia_coli', 'species', "Escherichia_coli", None),
        ('Enterobacteriaceae', 'family', "Enterobacteriaceae", 'genus'),
        # Other taxa Zimmer et al. 2012 deemed of interest
        ('Clostridia', 'class', "Clostridia", 'order'),
    ]

    # The columns are assigned one at a time on purpose: the first assignment
    # fixes the index and every later column is aligned to it
    taxa_abundance = pd.DataFrame()
    for column, rank, taxon, finer_rank in selections:
        keep = zimmer_subset[rank] == taxon
        if finer_rank is not None:
            keep = keep & zimmer_subset[finer_rank].isnull()
        taxa_abundance[column] = zimmer_subset[keep]['relative_abundance']

    taxa = [column for column, _rank, _taxon, _finer_rank in selections]

    if taxa_abundance.empty:
        return None

    source = create_column_data_source(taxa_abundance, taxa)

    # Creating the figure
    p = figure(x_range=taxa, y_range=(0, 100), plot_height=500, plot_width=800,
               title=f"Subject{subject}: Abundance of specific taxa of interest",
               x_axis_label="Taxon", y_axis_label="Relative abundance (%)",
               toolbar_location=None, tools="")

    # bars of the regular diet, shifted to the left of the taxon position
    p.vbar(x=dodge('taxa', -0.12, range=p.x_range),
           top='control', name='control', width=0.2,
           source=source, color="#abdfff", legend_label="Regular diet", line_color="#75cbff")

    # middle label
    p.vbar(x=dodge('taxa', 0, range=p.x_range), top='None', width=0.1, source=source)

    # bars of the vegan diet, shifted to the right of the taxon position
    p.vbar(x=dodge('taxa', 0.12, range=p.x_range),
           top='vegan', name='vegan', width=0.2,
           source=source, color="#ceffc4", legend_label="Vegan diet", line_color="#8dff75")

    p.x_range.range_padding = 0.1
    p.xgrid.grid_line_color = None
    p.legend.location = "top_right"
    p.legend.orientation = "horizontal"

    taxa_tab = Panel(child=p, title="Taxa of interest")
    return Tabs(tabs=[taxa_tab])

In [60]:
def select_taxa_on_level(taxa_abundance, tax_profiles_subject, tax_level):
    """Create a subset of a pandas dataframe containing the abundance of taxa for the vegan and regular diet samples for a subject
    based on the taxonomic level selected

    A row counts as belonging to tax_level when it has a value on that level and no value
    on the next finer level. For "species" there is no finer level, so every row with a
    species value is used.

    Keyword arguments:
    taxa_abundance -- A pandas dataframe that receives one column of relative abundances per taxon (modified in place)
    tax_profiles_subject -- A pandas dataframe containing the taxonomic profile for the vegan and regular diet samples for a subject
    tax_level -- A string containing the taxonomic level to select on

    Returns:
    taxa_abundance -- The same dataframe, now holding one 'relative_abundance' column per taxon on the selected level
    taxa -- A list containing the unique taxa found, in order of first appearance in tax_profiles_subject

    Raises:
    ValueError -- When tax_level is not one of the known taxonomic levels
    """
    # Every level maps to the next finer level; None marks the finest level
    finer_level_of = {
        "kingdom": "phylum",
        "phylum": "class",
        "class": "order",
        "order": "family",
        "family": "genus",
        "genus": "species",
        "species": None,
    }
    if tax_level not in finer_level_of:
        raise ValueError(f"incorrect taxonomic level selected: {tax_level!r}")

    level_values = tax_profiles_subject[tax_level]
    on_level = level_values.notnull()
    finer_level = finer_level_of[tax_level]
    if finer_level is not None:
        on_level = on_level & tax_profiles_subject[finer_level].isnull()

    # dict.fromkeys removes duplicates but keeps the order of first appearance, so the
    # returned list (used as a plot axis by the caller) is the same on every run;
    # list(set(...)) gave a hash dependent order
    taxa = list(dict.fromkeys(level_values[on_level]))

    for taxon in taxa:
        taxon_rows = tax_profiles_subject[on_level & (level_values == taxon)]
        # The first assigned column fixes the index of taxa_abundance when it
        # starts out empty; later columns are aligned to that index
        taxa_abundance[taxon] = taxon_rows['relative_abundance']
    return taxa_abundance, taxa

In [61]:
def plot_dumbbell(tax_profiles, subject, tax_level):
    """Plot the relative abundance of all taxa on one taxonomic level, with one circle for the
    regular diet and one circle for the vegan diet per taxon, on a logarithmic x axis

    Keyword arguments:
    tax_profiles -- A pandas dataframe containing the taxonomic profile per barcode for all barcodes
    subject -- A string consisting of the subject ID, or "s Pooled" to use the profiles of all subjects
    tax_level -- A string containing the taxonomic level of which the taxa are plotted

    Returns:
    A bokeh Tabs object holding a single "Difference of all taxa" tab, or None when
    tax_profiles is empty or no taxa were selected on the given level
    """
    if tax_profiles.empty:
        return None

    if subject == "s Pooled":
        profiles = tax_profiles
    else:
        subject_ids = tax_profiles.index.get_level_values('subject')
        profiles = tax_profiles[subject_ids == subject]
        # the title below appends this directly after the word "Subject"
        subject = f" {subject}"

    taxa_abundance, taxa = select_taxa_on_level(pd.DataFrame(), profiles, tax_level)
    if taxa_abundance.empty:
        return None

    source = create_column_data_source(taxa_abundance, taxa, False)

    # Creating the figure
    p = figure(y_range=taxa, x_axis_type="log", plot_height=500, plot_width=800,
               title=f"Subject{subject}: Abundance of all taxa on {tax_level} level",
               y_axis_label="Taxon", x_axis_label="Relative abundance (%)",
               toolbar_location=None, tools="")

    # the more taxa there are to show, the smaller the circles get
    size = 200 / len(taxa)

    markers = [
        ('control', "darkblue", "Regular diet"),
        ('vegan', "darkgreen", "Vegan diet"),
    ]
    for column, colour, label in markers:
        p.circle(y=dodge('taxa', 0, range=p.y_range), x=column, name=column,
                 source=source, color=colour, legend_label=label, size=size)

    p.y_range.range_padding = 0.1
    p.ygrid.grid_line_color = None
    p.legend.location = "top_right"
    p.legend.orientation = "horizontal"

    difference_tab = Panel(child=p, title="Difference of all taxa")
    return Tabs(tabs=[difference_tab])

In [62]:
def create_nutrition_graphs(self):
    '''
    Function that creates the nutrition data graphs and tables for every subject.

    Returns a tuple of two dictionaries keyed by subject: the first holds the list
    [boxplots, averages figure, pie charts], the second holds the result of
    create_nutrition_dataframe for that subject.
    '''
    figures, intake_dfs = {}, {}
    for subject in self.subjects:
        nutrition_data = create_nutrition_tables(self, subject)

        average_plot = create_average_nutrition_figure(subject, nutrition_data)
        box_plots = create_nutrition_boxplot(subject, nutrition_data)
        pie_plots = create_nutrition_piecharts(subject, nutrition_data)

        # the position in this list is what the plot selection indexes into
        figures[subject] = [box_plots, average_plot, pie_plots]
        intake_dfs[subject] = create_nutrition_dataframe(nutrition_data)
    return figures, intake_dfs

In [63]:
'''
NOTE
Please add mental state functionality in this block!
'''
def create_mental_state_graphs(self):
    '''
    Placeholder for the mental state graphs.

    Only looks up the configured 'mental_state_files_path' (a missing key raises
    a KeyError) and returns None.
    '''
    mental_state_path = self.config['mental_state_files_path']
    return None

In [64]:
'''
NOTE
Please add body composition functionality in this block!
'''
def create_body_comp_graphs(self):
    '''
    Placeholder for the body composition graphs.

    Only looks up the configured 'body_comp_files_path' (a missing key raises
    a KeyError) and returns None.
    '''
    body_comp_path = self.config['body_comp_files_path']
    return None

In [65]:
'''
NOTE
Please add microbiota functionality in this block!
'''
def create_microbiota_graphs(self):
    '''
    Function that creates and returns the microbiota graphs.

    Returns a dictionary that maps every subject in self.subjects to the list
    [dumbbell plot, zimmer plot]. The plots over all subjects together are
    stored in the same way under the key "s Pooled".
    '''
    filepath = self.config['microbiota_files_path']
    # only the taxonomic profiles are used here; the first returned value is ignored
    _, tax_profiles = load_data_microbiota(filepath)

    tax_level = "order"

    # The pooled plots used to be computed and then overwritten by the first
    # subject without ever being stored. They are kept under their own key now.
    # NOTE(review): assumes the pooled figures were meant to be available to
    # the dashboard - confirm
    figures = {}
    for subject in ["s Pooled"] + list(self.subjects):
        dumbbell_plot = plot_dumbbell(tax_profiles, subject, tax_level)
        zimmer_plot = recreate_zimmer(tax_profiles, subject)
        figures[subject] = [dumbbell_plot, zimmer_plot]
    return figures

In [71]:
def create_microbiota_options(self):
    '''
    Function that creates the dropdown with the microbiota plot types on the
    dashboard and lets the dashboard's select_plot react to a changed selection
    '''
    plot_names = ['Dumbbell plot', 'Zimmer plot']
    self.micr_options = plot_names

    dropdown = pn.widgets.Select(
        name='Microbiota data:',
        options=plot_names,
        value='Dumbbell plot',
        css_classes=['selectDiv'],
    )
    self.micr_plot_options = dropdown
    dropdown.param.watch(self.select_plot, 'value')

In [125]:
class Dashboard:
    '''
    The dashboard: a sidebar with the option widgets and a main column with figures.

    The figures in self.figure_column are kept in a fixed order:
        1. the food intake figure  (while 'Food intake' is checked)
        2. the nutrition table     (while 'View nutrition details' is checked)
        3. the microbiota figure   (while 'Microbiota' is checked)
    So the food intake figure is always the first item and the microbiota figure is
    always the last item. The callbacks below rely on this order.

    self.panel_options holds what is currently displayed:
        'subject', 'intake_plot_index', 'food_intake_displayed',
        'micr_plot_index', 'microbiota_displayed'
    '''

    def __init__(self):
        self.config = get_config()
        self.subjects = ['A', 'B', 'C', 'D', 'E']
        self.measurements = ['Food intake', 'Body composition', 'Mental state', 'Microbiota']
        self.panel_options = {}
        self.create_panel()

    def create_panel(self):
        '''
        Function that creates all figures and widgets, puts them in the template
        and serves the dashboard (pn.serve is called at the end of this function)
        '''
        # retrieve the plots

        # NOTE
        # Plots are retrieved here. Interactivity is also controlled here. Please keep the specific plot
        # functionality as much in your own functions as possible.
        nutrition_data = create_nutrition_graphs(self)
        self.nutrition_figures = nutrition_data[0]
        self.nutrition_tables = nutrition_data[1]
        self.mental_state_data = create_mental_state_graphs(self)
        self.body_comp_data = create_body_comp_graphs(self)
        self.microbiota_data = create_microbiota_graphs(self)

        self.subjects_menu = pn.widgets.Select(name='Subject:', options=self.subjects, value='A',
                                               css_classes=['selectDiv'])
        self.subjects_menu.param.watch(self.select_subject, 'value')

        measurement_label = Div(text="<b>Measurements:</b>", css_classes=['checkboxLabel'])
        self.checkbox_group = pn.widgets.CheckBoxGroup(name='Measurement', options=self.measurements,
                                                       value=['Food intake'], css_classes=['inputElement'])
        self.checkbox_group.param.watch(self.toggle_plots, 'value')

        self.food_intake_options = ['Boxplots', 'Average', 'Weekly piecharts']
        self.intake_plot_options = pn.widgets.Select(name='Food intake data:', options=self.food_intake_options,
                                                     value='Boxplots', css_classes=['selectDiv'])
        self.intake_plot_options.param.watch(self.select_plot, 'value')
        self.table_checkbox = pn.widgets.Checkbox(name='View nutrition details', css_classes=['inputElement'])
        self.table_checkbox.param.watch(self.handle_intake_tables, 'value')
        create_microbiota_options(self)

        # create the options menu
        self.options_column = pn.Column(self.subjects_menu, measurement_label, self.checkbox_group,
                                        self.intake_plot_options, self.table_checkbox,
                                        margin=0, sizing_mode='stretch_both')

        # display subject A and food intake boxplot as default
        self.panel_options['subject'] = 'A'
        self.panel_options['intake_plot_index'] = 0
        self.panel_options['food_intake_displayed'] = True
        # the microbiota plots are hidden until the 'Microbiota' checkbox is checked;
        # these keys are read by toggle_plots and select_plot
        self.panel_options['micr_plot_index'] = 0
        self.panel_options['microbiota_displayed'] = False

        # NOTE
        # Please add your plots to this column and see how it looks
        self.figure_column = pn.Column(self.nutrition_figures['A'][0])

        self.template = pn.template.MaterialTemplate(title='Project Ve-gang')
        self.template.sidebar.append(self.options_column)
        self.template.main.append(self.figure_column)
        pn.serve(self.template)

    def select_subject(self, event):
        '''
        Function that handles the selection of a different subject

        The dashboard is put back in its default view for the new subject:
        only 'Food intake' checked, showing the boxplots, without the nutrition table.
        '''
        if not event.new:
            return

        subject = event.new
        prev_subject = event.old
        self.panel_options['subject'] = subject
        self.panel_options['intake_plot_index'] = 0
        self.panel_options['food_intake_displayed'] = True
        self.subjects_menu.value = subject
        # changing these widget values runs toggle_plots / select_plot when the value differs
        self.checkbox_group.value = ['Food intake']
        self.intake_plot_options.value = 'Boxplots'

        if self.nutrition_tables[prev_subject] in self.figure_column:
            self.figure_column.remove(self.nutrition_tables[prev_subject])
            # this runs handle_intake_tables, which finds nothing left to remove
            self.table_checkbox.value = False
        self.figure_column[0] = self.nutrition_figures[subject][0]

    def select_plot(self, event):
        '''
        Function that handles the selection of a different plot type

        event.new is the name of the selected plot. Nothing happens when the
        measurement the plot belongs to is not displayed.
        '''
        if not event.new:
            return

        self.panel_options['event'] = 'select_plot'
        subject = self.panel_options['subject']
        selected = event.new

        if (self.panel_options.get('food_intake_displayed', False) and
                selected in self.food_intake_options):
            # the figures of a subject are stored in the order of the options
            index = self.food_intake_options.index(selected)
            self.panel_options['intake_plot_index'] = index
            # the food intake figure is the first item of the column
            if len(self.figure_column) > 0:
                self.figure_column.pop(0)
            self.figure_column.insert(0, self.nutrition_figures[subject][index])

        elif (self.panel_options.get('microbiota_displayed', False) and
              selected in self.micr_options):
            index = self.micr_options.index(selected)
            self.panel_options['micr_plot_index'] = index
            # the microbiota figure is the last item of the column, replacing a fixed
            # position would throw away the nutrition table when that is shown
            last = len(self.figure_column) - 1
            if last >= 0:
                self.figure_column.pop(last)
            self.figure_column.append(self.microbiota_data[subject][index])

    def handle_intake_tables(self, event):
        '''
        Toggle show the intake datatables
        '''
        subject = self.panel_options['subject']
        table = self.nutrition_tables[subject]
        # checked
        if event.new:
            if table not in self.figure_column:
                # directly below the food intake figure
                self.figure_column.insert(1, table)
        # unchecked - select_subject removes the table itself before it unchecks
        # the checkbox, so the table may already be gone
        elif table in self.figure_column:
            self.figure_column.remove(table)

    def toggle_plots(self, event):
        '''
        Function that shows or hides the plots and options of a measurement when
        its checkbox is (un)checked. event.new is the list of checked measurements.
        '''
        subject = self.panel_options['subject']
        checked = event.new

        if 'Food intake' not in checked:
            # only take away what is displayed; this function also runs when
            # one of the other checkboxes changes
            if self.panel_options.get('food_intake_displayed', False):
                self.panel_options['food_intake_displayed'] = False
                if len(self.figure_column) > 0:
                    self.figure_column.pop(0)
                for widget in (self.intake_plot_options, self.table_checkbox):
                    if widget in self.options_column:
                        self.options_column.remove(widget)
                # NOTE(review): a displayed nutrition table stays in the column
                # here, as it did before - confirm whether it should be hidden too
        elif self.intake_plot_options not in self.options_column:
            self.panel_options['food_intake_displayed'] = True
            self.options_column.append(self.intake_plot_options)
            self.options_column.append(self.table_checkbox)
            # show the plot that the dropdown still has selected, as the first item
            index = self.panel_options.get('intake_plot_index', 0)
            self.figure_column.insert(0, self.nutrition_figures[subject][index])

        if 'Body composition' not in checked:
            # todo - remove body composition options
            pass
        if 'Mental state' not in checked:
            # todo - remove mental state options
            pass

        if 'Microbiota' not in checked:
            if self.panel_options.get('microbiota_displayed', False):
                self.panel_options['microbiota_displayed'] = False
                last = len(self.figure_column) - 1
                if last >= 0:
                    self.figure_column.pop(last)
                if self.micr_plot_options in self.options_column:
                    self.options_column.remove(self.micr_plot_options)
        elif self.micr_plot_options not in self.options_column:
            index = self.panel_options.get('micr_plot_index', 0)
            self.panel_options['micr_plot_index'] = index
            self.panel_options['microbiota_displayed'] = True
            self.options_column.append(self.micr_plot_options)
            # the microbiota figure always goes at the end of the column
            self.figure_column.append(self.microbiota_data[subject][index])

In [126]:
def main():
    '''
    Function that loads the panel extension with the custom styling and starts the dashboard
    '''
    custom_css = get_styling()
    pn.extension(raw_css=[custom_css])
    Dashboard()


if __name__ == '__main__':
    main()

Launching server at http://localhost:58572
