In [54]:
import pandas as pd
import networkx as nx
import plotly.express as px
import plotly.graph_objects as go
import dash_bio as dashbio
from jupyter_dash import JupyterDash
from dash import Dash, dash_table, html, dcc, Input, Output
import dash_daq as daq
import random

In [55]:
df = pd.read_csv('data/summary_tree.csv')

In [56]:
df['test'] = df['behavior']

In [57]:
behavior_list = list(set(df['behavior']))
genus = pd.read_csv('data/genus.csv')

In [58]:
def simplify_behavior_label(label):
    return label.replace('_batch_ranknorm','').replace('_',' ')

In [59]:
def format_labels(label_list):
    simplified = [simplify_behavior_label(label) for label in label_list]
    return ('<br>').join(simplified)

In [60]:
def behavior_split(behavior):
    subset = df[df['behavior']==behavior]
    #subset['depth'] = subset['depth'].astype(str)
    fig = px.bar(subset,
           x='split_value',
           y='nobs',
           #color='target',
           color='depth',
           pattern_shape="split",
           pattern_shape_map={
             "<": "/", ">": "\\", ">=": "\\", "<=":"/"
           },
           template='plotly_white',
           title=f'Splits for {simplify_behavior_label(behavior)}',
           hover_data=list(subset.columns),
           text="target"
          )
    fig.update_xaxes(title='Split Value')
    fig.update_yaxes(title='Number of Observations')
    fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
    return fig

In [61]:
def improve_text_position(x):
    positions = ['top center','top center', 'bottom center', 'bottom center']  # you can add more: left center ...
    return [positions[i % len(positions)] for i in range(len(x))]

In [62]:
def behavior_split_scatter(behavior):
    subset = df[df['behavior']==behavior].sort_values(by='split_value')
    #subset['depth'] = subset['depth'].astype(str)
    fig = px.scatter(subset,
           x='split_value',
           y='depth',
           size='nobs',
           template='plotly_white',
           color='split',
           title=f'Splits for {simplify_behavior_label(behavior)}',
           hover_data=['source','target','split_value','split','nobs'],
           #text="target"
          )
    fig.update_xaxes(title='Split Value')
    fig.update_yaxes(title='Depth', autorange="reversed")
    fig.update_traces(textfont_size=12, cliponaxis=False, textposition=improve_text_position(df['split_value']))
    return fig

In [63]:
behavior_split_scatter(behavior_list[10])

In [64]:
def get_behaviors(source, target):
    behaviors = list(set(df[(df['source']==source) & (df['target']==target)]['behavior']))
    return behaviors

In [65]:
df['behavior_label'] = [simplify_behavior_label(label) for label in df['behavior']]

In [66]:
meanDF = df.groupby(['source','target']).mean()
meanDF.columns=[c+'_computed_mean' for c in meanDF.columns]
sumDF = df.groupby(['source','target']).sum()
sumDF.columns=[c+'_computed_sum' for c in sumDF.columns]
summary = pd.merge(meanDF, sumDF, on=['source','target']).reset_index()


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [67]:
summary['behaviors'] = [get_behaviors(source, target) for source, target in zip(summary['source'],summary['target'])]
summary['behavior_label'] = [format_labels(behavior_list) for behavior_list in summary['behaviors']]
summary['num_trees_present'] = [len(b) for b in summary['behaviors']]

In [68]:
def dendrogram(df,col,ascending=False):
    # e.g. dendrogram(summary,'depth_mean', maxNA=True)
    filtered = df.filter(items=['source','target',col]).sort_values(by=col, ascending=ascending)
    matrix = filtered.pivot('source', 'target', col)
    #matrix.columns = matrix.columns.get_level_values(level=1)
    matrix.shape
    filled = matrix.fillna((max(df[col])+1) if ascending else (min(df[col])-1))
    columns = list(filled.columns.values)
    rows = list(filled.index)

    fig = dashbio.Clustergram( # https://dash.plotly.com/dash-bio/clustergram
        data=abs(filled.loc[rows].values),
        row_labels=rows,
        column_labels=columns,
        color_map='blues_r' if ascending else 'blues',
        optimal_leaf_order=True,
    )
    fig.update_layout(
        width=800,
        height=800,
        title=f"<b>Comparison of Microbe Pairs<br>by {col.replace('_',' ').title()}</b>"
    )
    return fig

In [69]:
px.strip(df, y='nobs', color='test')

In [70]:
def targets(df,col,bio,ascending=False):
    filtered = df[df['source']==bio].sort_values(by=col, ascending=True)
    fig = px.scatter(filtered,
                     x='target',
                     y=col,
                     title=f"<b>Top targets for {bio}<br>by {col.replace('_',' ').title()}</b>",
                     color=col,
                     color_continuous_scale='viridis' if ascending else 'viridis_r',
                     size='nobs_computed_mean',
                     hover_data=['behavior_label']
                    )
    fig.update_layout(
        template='plotly_white',
        title_font_size=20,
        title_font_color="#666"
    )
    fig.update_yaxes(title=col.replace('_',' ').title())
    fig.update_xaxes(title='Target Bacteria')
    if ascending==True:
        fig.update_yaxes(autorange="reversed")
        fig.update_xaxes(autorange="reversed")
    return fig

In [71]:
targets(summary,'depth_computed_mean',random.choice(genus.columns), ascending=True).show()

In [73]:
def behaviors(col,bio,ascending=False):
    if col == 'num_trees_present':
        col='nobs_computed_mean'
    stat_simple = col.replace('_computed_mean','').replace('_computed_sum','')
    filtered = df[df['source']==bio].sort_values(by=stat_simple, ascending=ascending)
    fig = px.scatter(filtered,
                     x=stat_simple,
                     y='behavior_label',
                     title=f"<b>Top behaviors for {bio}<br>by {col.replace('_',' ').title()}</b>",
                     color=stat_simple,
                     size=stat_simple if min(df[stat_simple])>0 else 'nobs',
                     hover_data=list(filtered.columns),
                     color_continuous_scale='viridis' if ascending else 'viridis_r',
                     symbol='test'
                    )
    fig.update_layout(
        template='plotly_white',
        title_font_size=20,
        title_font_color="#666"
    )
    fig.update_yaxes(title='Behavior')
    fig.update_xaxes(title=stat_simple.title())
    if ascending==True:
        fig.update_yaxes(autorange="reversed", title='Behavior')
        fig.update_xaxes(autorange="reversed", title=stat_simple.title())
    return fig

In [75]:
behaviors('depth_computed_mean',random.choice(genus.columns),ascending=True).show()

In [76]:
behaviors('depth_computed_mean',random.choice(genus.columns),ascending=True).write_html('sample.html')

In [77]:
stats = [
    'num_trees_present',
    'depth_computed_sum',
    'nobs_computed_sum',
    'pvalue_computed_mean',
    'split_value_computed_mean',
    'y_mean_computed_mean',
    'error_computed_mean',
    'nobs_computed_mean',
    'depth_computed_mean',
    'split_value_computed_sum',
    #'y_mean_computed_sum',
    'error_computed_sum',
    'pvalue_computed_sum',
    #'behaviors',
    #'behavior_label',
]

In [78]:
# https://medium.com/plotly/introducing-jupyterdash-811f1f57c02e

#stats = list(summary.columns[2:])
bios = list(set(summary['source']))

app = JupyterDash(__name__)

app.layout = html.Div([
    html.Div([
        html.H1('Compare microbes'),
        html.P('Select a statistic from the dropdown below to view information summarized \
        across all regression trees. Particularly interesting statistics are: {depth} the average \
        depth of tree splits involving this microbe (lower values mean closer to the top); {nobs} which \
        stands for "number of observations"; {num trees present} the number of regression trees in which \
        this microbe appears.'),
        dcc.Dropdown(stats, 'num_trees_present', id='stat-dropdown'),
        html.P('This dendrogram shows a comparison of one bacteria versus another\
               along whatever variable you have selected from the dropdown above, across *all* regression trees. \
               Hover over a cell to view the source and target. If you want to see \
               a sorted list of all the edges from highest to lowest, scroll to the table below.'),
        dcc.Graph(id='dendrogram'),
        #daq.BooleanSwitch(id='ascending-toggle', on=False,
              #label="Ascending?",labelPosition="top"),
        html.P('This table shows the same information as the dendrogram above (highlighted in yellow); \
                you can scroll right to see more detail about each edge. These values are summarized \
                from *all* of the regression trees.'),
        html.Div(id='table-container'),   
    ], style={'width': '45%', 'display': 'inline-block', 'float':'left'}),
    html.Div([
        html.H1('Explore a microbe'),
        html.P('Once you have found a bacteria of interest from the dendrogram and table on the left, \
                you can select it from the dropdown below to view more information about the trees \
                that include your chosen bacteria. The chart below will display summary information \
                about the other microbes identified as relevant using the regression trees. Nodes are sized \
                by the total number of observations between each bacteria pair. Hover over \
                a point in the chart to view more information about the relevant tree.'),
        dcc.Dropdown(bios, random.choice(genus.columns), id='bio-dropdown'),
        dcc.Graph(id='targets'),
        html.P('Similar to the chart above, the chart below displays relevant information about the specific \
            bacteria you have selected above; in this case, we see the particular behaviors for which this \
            bacteria existed in the regression tree. Nodes are sized by the selected statistic, if possible \
            or the number of implicated observations, if the selected statistic has a range that includes \
            negative values. Nodes are colored by the selected statistic also. You can change the selected \
            statistic using the uppermost dropdown.'),
        html.H1('Explore a behavior'),
        dcc.Graph(id='behaviors'),
        html.P('From the chart above, there may be a behavior that you are interested in viewing tree splits for. \
        From the dropdown below, you can select a behavior of interest, and see a bar chart below of all the \
        tree splits for that behavior of interest. Bar x-location corresponds to the split value, while the bar \
        y-height corresponds to the number of observations. Color corresponds to tree depth (with darker values) \
        meaning a higher (earlier) split in the tree, and lighter values being further down the tree. A pattern \
        to the left ("/") indicates a "less than" split, while a pattern to the right ("\\") indicates a \
        "greater than" split.'),
        dcc.Dropdown(behavior_list, behavior_list[0], id='behavior-dropdown'),
        dcc.Graph(id='split_chart'),
        dcc.Graph(id='split_scatter')
    ], style={'width': '45%', 'display': 'inline-block', 'float':'right'})
             
], style={'marginBottom': 50, 'marginTop': 25, 'marginLeft': 50, 'marginRight':50})

@app.callback(
    [Output(component_id='table-container', component_property='children'),
    Output(component_id='dendrogram', component_property='figure'),
    Output(component_id='targets', component_property='figure'),
     Output(component_id='behaviors', component_property='figure'),
     Output(component_id='split_chart', component_property='figure'),
     Output(component_id='split_scatter', component_property='figure')
    ],
    [Input(component_id='stat-dropdown', component_property='value'),
     #Input(component_id='ascending-toggle', component_property='on'),
     Input(component_id='bio-dropdown', component_property='value'),
     Input(component_id='behavior-dropdown', component_property='value')
    ]
)
def update_outputs(stat, bio, behavior):
    if ('depth' in stat) or ('pvalue' in stat) or ('y_mean' in stat):
        ascending=True
    else:
        ascending=False
    sortedDF = summary.sort_values(by=stat, ascending=ascending)
    newcols = list(sortedDF.columns)
    newcols.remove(stat)
    newcols.insert(0,stat)
    sortedDF = sortedDF[newcols]
    table = dash_table.DataTable(
        data=sortedDF.astype(str).to_dict('records'), # data
        sort_action='native',
        columns = [{"name": i, "id": i} for i in list(sortedDF.columns)], # columns
        id='table',
        style_data={
            'whiteSpace': 'normal',
            'height': 'auto',
        },
        style_table={'overflowX': 'auto'},
        style_data_conditional=[
        {
            'if': {
                'column_id': stat,
            },
            'backgroundColor': 'yellow',
            'color': 'black'
        }]
    )
    t_chart = targets(sortedDF,stat,bio,ascending)
    b_chart = behaviors(stat,bio,ascending)
    
    #behavior = list(summary[summary['source']==bio]['behaviors'])[0][0]
    
    split_chart = behavior_split(behavior)
    split_scatter = behavior_split_scatter(behavior)
    dend = dendrogram(sortedDF,stat,ascending)
    return table, dend, t_chart, b_chart, split_chart, split_scatter

app.run_server()#debug=False)

Dash app running on http://127.0.0.1:8050/



In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.

