In [1]:
import icicle_plot
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Write MLflow Autolog Results into "mlflow_workflows.csv"

In [2]:
import os

# Build one record per run directory under mlruns/0 (accuracy, logged
# hyperparameters, inferred model family) and save the table as CSV.
dirs = os.listdir("mlruns/0")
dictionary = []
for i in dirs:
    # Entries whose name contains "meta" or "DS_Store" are not run directories.
    if "meta" in i or "DS_Store" in i:
        continue

    run_dir = os.path.join("mlruns/0", i)

    # The metric value is taken from the second space-separated field of the file.
    # A context manager closes the handle instead of leaking it.
    with open(os.path.join(run_dir, "metrics", "accuracy")) as metric_file:
        accuracy = float(metric_file.read().split(" ")[1])

    params_dir = os.path.join(run_dir, "params")
    params_files = os.listdir(params_dir)
    model_params = {}
    for j in params_files:
        # The kernel parameter is deliberately left out of the stored hyperparameters.
        if 'kernel' in j:
            continue
        with open(os.path.join(params_dir, j)) as param_file:
            model_params[j] = param_file.read()

    row = {
        'accuracy': accuracy,
        'model_params': model_params,
        # Runs with exactly three parameter files are labelled SVM, all others xgboost.
        'model': 'SVM' if len(params_files) == 3 else 'xgboost',
    }
    dictionary.append(row)

df = pd.DataFrame(dictionary)
# The index is written as an unnamed first column of the CSV.
df.to_csv('mlflow_workflows.csv')

# Read MLflow Logs and Generate Icicle-PC Plot

In [3]:
import ast
# Reload the saved run table; the CSV's unnamed index column becomes the run id.
ut_pair = (
    pd.read_csv('mlflow_workflows.csv')
    .rename(columns={'Unnamed: 0': 'rid'})
)

# The parameter dicts were stored as their string repr; parse them back into dicts.
ut_pair['model_params'] = ut_pair['model_params'].apply(ast.literal_eval)
ut_pair

Unnamed: 0,rid,accuracy,model_params,model
0,0,0.841410,"{'num_class': '2', 'max_depth': '10', 'num_boo...",xgboost
1,1,0.781954,"{'num_class': '2', 'max_depth': '1', 'num_boos...",xgboost
2,2,0.813771,"{'num_class': '2', 'max_depth': '1', 'num_boos...",xgboost
3,3,0.852835,"{'num_class': '2', 'max_depth': '5', 'num_boos...",xgboost
4,4,0.713900,"{'gamma': '0.0001', 'C': '100'}",SVM
...,...,...,...,...
75,75,0.855353,"{'num_class': '2', 'max_depth': '5', 'num_boos...",xgboost
76,76,0.852466,"{'num_class': '2', 'max_depth': '5', 'num_boos...",xgboost
77,77,0.849886,"{'num_class': '2', 'max_depth': '10', 'num_boo...",xgboost
78,78,0.754745,"{'gamma': '0.001', 'C': '1'}",SVM


In [4]:
# Placeholder only: rebound to a DataFrame of hyperparameters before
# sorting_criteria is ever called.
hyperparams_df = 0


def sorting_criteria(s):
    """Sort key: the number of distinct values in column ``s``.

    Reads the module-level ``hyperparams_df`` at call time, so the result
    depends on whichever frame is currently bound to that name.
    """
    distinct_values = hyperparams_df[s].unique()
    return len(distinct_values)

# For each model, rank its hyperparameters by how many distinct values they
# take (fewest first) and drop the ones that never vary.
hp_key = {}
max_len_candidates = 0
for model_iter in ut_pair.model.unique():
    model_param_dicts = ut_pair[ut_pair.model == model_iter].model_params.to_list()

    # sorting_criteria reads this module-level frame, so it must be rebound
    # to the current model's parameters before sorting.
    hyperparams_df = pd.DataFrame(model_param_dicts)

    # Candidate names come from the first run of the model.
    ranked_names = sorted(model_param_dicts[0].keys(), key=sorting_criteria)
    varying_names = [name for name in ranked_names if len(hyperparams_df[name].unique()) > 1]

    hp_key[model_iter] = varying_names
    max_len_candidates = max(max_len_candidates, len(varying_names))

# Finally, one frame with every run's parameters, addressable by run id.
hyperparams_df = pd.DataFrame(ut_pair.model_params.to_list())
hyperparams_df['rid'] = ut_pair['rid'].values

# print(hyperparams_df.rid)

# Hierarchy level currently being labelled; read by hp_viz_creator.
current_index = 0


def hp_viz_creator(row):
    """Return the ``"name=value"`` label of ``row`` at level ``current_index``.

    ``row`` must expose ``model`` and ``rid``. The hyperparameter name is the
    ``current_index``-th entry of ``hp_key[row.model]`` and its value is looked
    up in the module-level ``hyperparams_df`` by run id. Returns ``None`` when
    the model has fewer ranked hyperparameters than ``current_index + 1``.
    """
    ranked_names = hp_key[row.model]
    if current_index >= len(ranked_names):
        return None

    hp_name = ranked_names[current_index]
    matches = hyperparams_df.loc[hyperparams_df.rid == row.rid, hp_name]
    return hp_name + "=" + str(matches.tolist()[0])


# Add one label column per hierarchy level. hp_viz_creator picks the level from
# the module-level current_index, so the loop drives that name directly.
for current_index in range(max_len_candidates):
    level_column = str(current_index) + "_order_hyp"
    ut_pair[level_column] = ut_pair[['model', 'rid']].apply(hp_viz_creator, axis=1)

ut_pair

Unnamed: 0,rid,accuracy,model_params,model,0_order_hyp,1_order_hyp,2_order_hyp,3_order_hyp
0,0,0.841410,"{'num_class': '2', 'max_depth': '10', 'num_boo...",xgboost,subsample=0.05,max_depth=10,colsample_bytree=1.0,eta=0.9
1,1,0.781954,"{'num_class': '2', 'max_depth': '1', 'num_boos...",xgboost,subsample=1.0,max_depth=1,colsample_bytree=0.5,eta=0.005
2,2,0.813771,"{'num_class': '2', 'max_depth': '1', 'num_boos...",xgboost,subsample=0.05,max_depth=1,colsample_bytree=0.2,eta=0.3
3,3,0.852835,"{'num_class': '2', 'max_depth': '5', 'num_boos...",xgboost,subsample=1.0,max_depth=5,colsample_bytree=0.2,eta=0.3
4,4,0.713900,"{'gamma': '0.0001', 'C': '100'}",SVM,gamma=0.0001,C=100,,
...,...,...,...,...,...,...,...,...
75,75,0.855353,"{'num_class': '2', 'max_depth': '5', 'num_boos...",xgboost,subsample=0.05,max_depth=5,colsample_bytree=1.0,eta=0.3
76,76,0.852466,"{'num_class': '2', 'max_depth': '5', 'num_boos...",xgboost,subsample=1.0,max_depth=5,colsample_bytree=0.5,eta=0.005
77,77,0.849886,"{'num_class': '2', 'max_depth': '10', 'num_boo...",xgboost,subsample=0.05,max_depth=10,colsample_bytree=0.2,eta=0.9
78,78,0.754745,"{'gamma': '0.001', 'C': '1'}",SVM,gamma=0.001,C=1,,


In [5]:
import numpy as np
# Hierarchy levels, outermost first: the model, then each ranked hyperparameter label column.
hierarchy_path = ['model']
hierarchy_path.extend(str(level) + '_order_hyp' for level in range(max_len_candidates))

# Keep only the hierarchy columns plus the accuracy metric.
ut_p = ut_pair[[*hierarchy_path, 'accuracy']]
def recur_dictify(frame):
    """Convert a flat frame into nested dicts keyed by successive column values.

    Each column except the last is one nesting level; the last column holds the
    leaf value.

    Args:
        frame: DataFrame whose leading columns are hierarchy labels and whose
            final column is the value to store at the leaves.

    Returns:
        A nested ``dict``. When only the value column is left, returns the
        single value, or a squeezed array if several rows share the same path.
        When the first remaining label of the first row is missing, returns
        that row's value directly.
    """
    if len(frame.columns) == 1:
        if frame.values.size == 1:
            return frame.values[0][0]
        return frame.values.squeeze()

    # Rows with fewer hyperparameters than others have missing labels at the
    # deeper levels. groupby drops missing keys, so stop here and return the
    # value of the first row. pd.isna covers both None and NaN; a plain
    # `== None` test lets NaN through and produces an empty dict.
    first_label = frame[frame.columns[0]].iloc[0]
    if pd.isna(first_label):
        return frame.values[0][-1]

    grouped = frame.groupby(frame.columns[0])
    return {key: recur_dictify(group.iloc[:, 1:]) for key, group in grouped}
# Rebind ut_p from the flat hierarchy frame to its nested-dict form.
ut_p = recur_dictify(ut_p)
# Running minimum / maximum of the leaf colors seen by recur_hierarch.
# The starting values act as sentinels (presumably colors are accuracies in [0, 1]).
low_color = 2.0
high_color = -1.0


def recur_hierarch(frame):
    """Convert a nested dict of leaf values into a list of node dicts.

    Args:
        frame: A nested ``dict`` (name -> sub-dict or leaf value), or a leaf
            value itself: a number, or a NumPy array of numbers when several
            rows share the same path.

    Returns:
        For a dict: ``(children, color)``, where ``children`` is a list of
        ``{'name', 'color', 'children'}`` nodes and ``{'name', 'color', 'size'}``
        leaves, and ``color`` is the mean of the direct children's colors.
        For a leaf: ``(value, value)``, with arrays collapsed to their mean.

    Side effects:
        Updates the module-level ``low_color`` / ``high_color`` with the
        smallest / largest leaf color encountered.

    Raises:
        ZeroDivisionError: if a dict at any level is empty.
    """
    global low_color
    global high_color

    if not isinstance(frame, dict):
        # Several rows on the same path arrive as an array; use their mean so
        # the leaf color stays a scalar that can be compared and averaged.
        if isinstance(frame, np.ndarray):
            frame = frame.mean()
        return frame, frame

    children = []
    colors = []
    for key, subtree in frame.items():
        children_c, color = recur_hierarch(subtree)
        colors.append(color)

        # Decide node vs. leaf from the input type. Comparing the returned
        # value with [] evaluates the truth value of an empty array for NumPy
        # scalars, which NumPy warns about and announces as a future error.
        if isinstance(subtree, dict):
            children.append({'name': key, 'color': color, 'children': children_c})
        else:
            if color < low_color:
                low_color = color
            if color > high_color:
                high_color = color
            children.append({'name': key, 'color': color, 'size': 1})
    return children, sum(colors) / len(colors)

# Wrap the top-level nodes in a single root named 'main', colored by their mean.
children_ut_p, color = recur_hierarch(ut_p)
ut_p = dict(name='main', color=color, children=children_ut_p)

{'SVM': {'gamma=0.0001': {'C=1': 0.7678275290215588, 'C=10': 0.7478656102205025, 'C=100': 0.7138996376143971, 'C=1000': 0.6942448252564338}, 'gamma=0.001': {'C=1': 0.7547447945457896, 'C=10': 0.7159879614274307, 'C=100': 0.7078189300411523, 'C=1000': 0.7067133468460168}}, 'xgboost': {'subsample=0.05': {'max_depth=1': {'colsample_bytree=0.2': {'eta=0.005': 0.7637737239727289, 'eta=0.01': 0.7637737239727289, 'eta=0.3': 0.813770652908298, 'eta=0.9': 0.8265462809409742}, 'colsample_bytree=0.5': {'eta=0.005': 0.7985381733308765, 'eta=0.01': 0.7985381733308765, 'eta=0.3': 0.8185615134205516, 'eta=0.9': 0.8479823106688779}, 'colsample_bytree=1.0': {'eta=0.005': 0.7943615257048093, 'eta=0.01': 0.7943615257048093, 'eta=0.3': 0.8111295374976967, 'eta=0.9': 0.8519132731404705}}, 'max_depth=10': {'colsample_bytree=0.2': {'eta=0.005': 0.7966955346723175, 'eta=0.01': 0.7978625391560715, 'eta=0.3': 0.8423315521159634, 'eta=0.9': 0.8498863706160555}, 'colsample_bytree=0.5': {'eta=0.005': 0.85750261040


The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.



In [6]:
import json

from jupyter_dash import JupyterDash
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

# Keep a single row per distinct hierarchy path (model + hyperparameter labels).
ut_pair = ut_pair.drop_duplicates(subset=hierarchy_path)

# Dash application embedded in the notebook.
app = JupyterDash(__name__)
# app = dash.Dash()
# Serve CSS and script assets locally.
app.css.config.serve_locally = True
app.scripts.config.serve_locally = True

def df_to_dict(ut):
    """Map every distinct label in the hierarchy columns of ``ut`` to an empty dict.

    Args:
        ut: DataFrame containing every column listed in the module-level
            ``hierarchy_path``.

    Returns:
        ``dict`` with one ``{}`` entry per distinct non-missing value found in
        those columns (``groupby`` skips missing keys).
    """
    data = {}
    for col_name in hierarchy_path:
        for data_key, _ in ut.groupby(col_name):
            data[data_key] = {}
    # Return the mapping; without this the function always yields None.
    return data

# ut_p is the {'name', 'color', 'children'} tree rooted at 'main'.
data = ut_p

# Icicle component from the local icicle_plot package. Its 'value' property is
# the input of the callback registered on this app; low/high are the smallest
# and largest leaf colors (presumably the bounds of its color scale; confirm
# against icicle_plot).
fig = icicle_plot.Icicle(
    id='icicle',
    value='my-value',
    label='my-label',
    low=low_color,
    high=high_color,
    data=data
)

def make_ints(row):
    """Replace ``"name=value"`` labels in the hierarchy columns with ``float(value)``.

    Despite the name, values are converted to ``float``. Entries that are
    ``None`` or that are not of the form ``"name=<number>"`` (for example a
    bare model name) are left unchanged.

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series``) indexed by the columns
            of the module-level ``hierarchy_path``. It is modified in place.

    Returns:
        The same ``row`` object.
    """
    for col in hierarchy_path:
        label = row[col]
        if label is None:
            continue
        try:
            row[col] = float(label.split("=")[1])
        except (AttributeError, IndexError, ValueError):
            # Not a string, no "=", or a non-numeric value: keep the original
            # entry. Only these expected failures are swallowed.
            continue
    return row

# Numeric copy of the run table: "name=value" labels become floats where possible.
ut_pair_numeric = ut_pair.apply(make_ints, axis=1)

# Parallel-coordinates figure over every hierarchy level, colored by accuracy.
pc = px.parallel_coordinates(ut_pair_numeric, color="accuracy", dimensions=hierarchy_path,
                             color_continuous_scale='RdBu')
# Second reference to the unfiltered figure, used to reset the view.
pc_o = pc

# Page layout: the icicle component, the parallel-coordinates graph
# (updated by the 'pc' callback), and a Div with id 'output'.
app.layout = html.Div([
    fig,
    dcc.Graph(
        id='pc',
        figure=pc
    ),
    html.Div(id='output')
])

@app.callback(
    Output('pc', 'figure'),
    [Input('icicle', 'value')])
def display_click_data(clickData):
    """Redraw the parallel-coordinates figure for the clicked icicle node.

    Args:
        clickData: The icicle component's ``value``. The code treats it as a
            "/"-separated path whose first segment is the root and whose last
            segment is discarded; each remaining segment is either a model name
            or a ``"name=value"`` hyperparameter label.

    Returns:
        A parallel-coordinates figure restricted to the runs under the clicked
        path, showing only the hierarchy levels below it. The unfiltered
        figure ``pc_o`` is returned when nothing is selected, when the root is
        selected, or when no run matches the path.

    The callback does not rebind the module-level ``pc``. Doing so would let
    one interaction change what a later "reset" returns.
    """
    # Guard before calling .split: the value may be empty or None.
    if not clickData:
        return pc_o

    # Drop the trailing segment and the leading root segment.
    click_path = clickData.split("/")[:-1][1:]
    if not click_path:
        return pc_o
    subset_counter = len(click_path)

    selected_df = ut_pair
    for segment in click_path:
        if "=" in segment:
            hyp_name, hyp_val = segment.split("=", 1)
            keep = [params.get(hyp_name) == hyp_val for params in selected_df['model_params']]
        else:
            keep = (selected_df['model'] == segment).tolist()
        # An explicit boolean Series keeps this a row filter even when empty.
        selected_df = selected_df[pd.Series(keep, index=selected_df.index, dtype=bool)]

    if selected_df.empty:
        return pc_o

    # Axis labels: the hyperparameter name of each remaining level, read from
    # the first matching run. Levels without a label are skipped.
    remaining_levels = hierarchy_path[subset_counter:]
    sample_vals = selected_df.iloc[0]
    labels_pc = {}
    for level in remaining_levels:
        level_label = sample_vals[level]
        if isinstance(level_label, str) and level_label:
            labels_pc[level] = level_label.split("=")[0]

    selected_df = selected_df.apply(make_ints, axis=1)

    return px.parallel_coordinates(selected_df, color="accuracy", dimensions=remaining_levels,
                                   labels=labels_pc, color_continuous_scale='RdBu')

# Start the app in 'inline' mode on port 8100.
app.run_server(mode='inline', port=8100)
# app.run_server(port=8090)