In [2]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext sql

In [3]:
# Connect the %sql / %%sql magics to the SQLite database file results.db
# (the URL uses a relative path); the query cells below run against it.
%sql sqlite:///results.db

# Flow diagram for pipe `a | b | c`

In [47]:
%%sql node_data <<
-- Node list for the Sankey diagram of three-stage pipes (a | b | c).
-- Produces (id, name) rows for the commands kept at each of the three stages
-- and the sql magic stores the result in the Python variable node_data.
-- NOTE(review) the pipe3 and X definitions below are repeated verbatim in the
-- link_data cell. The ids are only usable if both copies stay identical,
-- so change them together.
with pipe3 as (
    -- One row per alias whose commands at positions 0, 1 and 2 are chained
    -- with the pipe operator. The command at position 0 carries no operator.
    select c1.name as c1_name, c2.name as c2_name, c3.name as c3_name,
        c1.command_id as c1_command_id, c2.command_id as c2_command_id, c3.command_id as c3_command_id
    from command as c1 join command as c2 using (alias_id) join command as c3 using (alias_id)
    where c1.position = 0 and c1.operator is null 
    and c2.position = 1 and c2.operator = '|'
    and c3.position = 2 and c3.operator = '|'
)
-- X keeps one row per (c1, c2, c3) name triple, annotated with per-stage
-- counts, per-stage share of all pipes in percent (rounded to two decimals)
-- and the pairwise flow counts.
,X as (
select 
    c1_id, c1_name, c1_num, round(c1_num*100.0/pipe_num, 2) as c1_per,
    c2_id, c2_name, c2_num, round(c2_num*100.0/pipe_num, 2) as c2_per,
    c3_id, c3_name, c3_num, round(c3_num*100.0/pipe_num, 2) as c3_per,
    flow_1_2, flow_2_3
from pipe3
-- Number of pipes using each name at stage 1, 2 and 3. The cN_command_id
-- column is neither aggregated nor part of the GROUP BY, so SQLite takes it
-- from a row of the group that it chooses. That value serves as the node id
-- for the name at that stage.
join (select c1_name, count(*) as c1_num, c1_command_id as c1_id from pipe3 group by c1_name) using (c1_name)
join (select c2_name, count(*) as c2_num, c2_command_id as c2_id from pipe3 group by c2_name) using (c2_name)
join (select c3_name, count(*) as c3_num, c3_command_id as c3_id from pipe3 group by c3_name) using (c3_name)
-- Number of pipes per adjacent name pair, counted over all of pipe3
-- (not only over the triples that survive the filter below).
join (select c1_name, c2_name, count(*) as flow_1_2 from pipe3 group by c1_name, c2_name) using (c1_name, c2_name)
join (select c2_name, c3_name, count(*) as flow_2_3 from pipe3 group by c2_name, c3_name) using (c2_name, c3_name)
-- Total number of three-stage pipes. There is no join condition, so the
-- single-row total is attached to every row.
join (select count(*) as pipe_num from pipe3)
-- Keep a triple only if every stage has a share above 1 percent and at least
-- one stage has a share above 10 percent. The filter refers to the cN_per
-- aliases of the select list, which SQLite accepts in WHERE.
where c1_per > 1 and c2_per > 1 and c3_per > 1
and (c1_per > 10 or c2_per > 10 or c3_per > 10)
-- Collapse the per-alias rows into one row per name triple.
group by c1_name, c2_name, c3_name
-- Order by stage 2 count, then stage 1, then stage 3, and cap at 250 triples.
order by c2_num desc, c1_num desc, c3_num desc
limit 250
)
-- One row per id for each stage. UNION concatenates the three stage lists
-- and removes rows that are exact duplicates.
select c1_id, c1_name from X group by c1_id
union
select c2_id, c2_name from X group by c2_id
union
select c3_id, c3_name from X group by c3_id

* sqlite:///results.db
Done.
Returning data to local variable node_data


In [48]:
%%sql link_data <<
-- Link list for the Sankey diagram of three-stage pipes (a | b | c).
-- Produces (source id, target id, count) rows and the sql magic stores the
-- result in the Python variable link_data.
-- NOTE(review) the pipe3 and X definitions below are repeated verbatim from
-- the node_data cell. The ids only match the node list if both copies stay
-- identical, so change them together.
with pipe3 as (
    -- One row per alias whose commands at positions 0, 1 and 2 are chained
    -- with the pipe operator. The command at position 0 carries no operator.
    select c1.name as c1_name, c2.name as c2_name, c3.name as c3_name,
        c1.command_id as c1_command_id, c2.command_id as c2_command_id, c3.command_id as c3_command_id
    from command as c1 join command as c2 using (alias_id) join command as c3 using (alias_id)
    where c1.position = 0 and c1.operator is null 
    and c2.position = 1 and c2.operator = '|'
    and c3.position = 2 and c3.operator = '|'
)
-- X keeps one row per (c1, c2, c3) name triple, annotated with per-stage
-- counts, per-stage share of all pipes in percent (rounded to two decimals)
-- and the pairwise flow counts.
,X as (
select 
    c1_id, c1_name, c1_num, round(c1_num*100.0/pipe_num, 2) as c1_per,
    c2_id, c2_name, c2_num, round(c2_num*100.0/pipe_num, 2) as c2_per,
    c3_id, c3_name, c3_num, round(c3_num*100.0/pipe_num, 2) as c3_per,
    flow_1_2, flow_2_3
from pipe3
-- Number of pipes using each name at stage 1, 2 and 3. The cN_command_id
-- column is neither aggregated nor part of the GROUP BY, so SQLite takes it
-- from a row of the group that it chooses. That value serves as the node id
-- for the name at that stage.
join (select c1_name, count(*) as c1_num, c1_command_id as c1_id from pipe3 group by c1_name) using (c1_name)
join (select c2_name, count(*) as c2_num, c2_command_id as c2_id from pipe3 group by c2_name) using (c2_name)
join (select c3_name, count(*) as c3_num, c3_command_id as c3_id from pipe3 group by c3_name) using (c3_name)
-- Number of pipes per adjacent name pair, counted over all of pipe3
-- (not only over the triples that survive the filter below). These counts
-- are the link weights returned by this cell.
join (select c1_name, c2_name, count(*) as flow_1_2 from pipe3 group by c1_name, c2_name) using (c1_name, c2_name)
join (select c2_name, c3_name, count(*) as flow_2_3 from pipe3 group by c2_name, c3_name) using (c2_name, c3_name)
-- Total number of three-stage pipes. There is no join condition, so the
-- single-row total is attached to every row.
join (select count(*) as pipe_num from pipe3)
-- Keep a triple only if every stage has a share above 1 percent and at least
-- one stage has a share above 10 percent. The filter refers to the cN_per
-- aliases of the select list, which SQLite accepts in WHERE.
where c1_per > 1 and c2_per > 1 and c3_per > 1
and (c1_per > 10 or c2_per > 10 or c3_per > 10)
-- Collapse the per-alias rows into one row per name triple.
group by c1_name, c2_name, c3_name
-- Order by stage 2 count, then stage 1, then stage 3, and cap at 250 triples.
order by c2_num desc, c1_num desc, c3_num desc
limit 250
)
-- Stage 1 to stage 2 links followed by stage 2 to stage 3 links. A pair shows
-- up once per triple it belongs to in X, and UNION removes those repeats.
select c1_id, c2_id, flow_1_2 from X
union
select c2_id, c3_id, flow_2_3 from X

* sqlite:///results.db
Done.
Returning data to local variable link_data


In [49]:
# Render the three-stage pipe flow (`a | b | c`) as a Sankey diagram.
#
# Inputs (produced by the two %%sql cells above):
#   node_data -- rows of (command id, command name)
#   link_data -- rows of (source command id, target command id, count)
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Column names for the two frames.
nodes_headers = ['ID', 'Label']
links_headers = ['Source', 'Target', 'Value']

# Convert the SQL result rows to plain lists and build the dataframes.
nodes = [list(row) for row in node_data]
links = [list(row) for row in link_data]
df_nodes = pd.DataFrame(nodes, columns=nodes_headers)
df_links = pd.DataFrame(links, columns=links_headers)

# Drop incomplete links as whole rows. Dropping nulls column by column would
# leave source/target/value with different lengths and silently misalign them.
df_links = df_links.dropna(subset=links_headers).reset_index(drop=True)

# A Sankey trace addresses nodes by their position in the label list, not by
# database id, so translate every id into the row position of its node.
# One dictionary lookup per id replaces a full scan of df_nodes per row.
node_position = {}
for position, node_id in enumerate(df_nodes['ID']):
    node_position.setdefault(node_id, position)  # first occurrence wins

for endpoint in ('Source', 'Target'):
    positions = df_links[endpoint].map(node_position)
    unresolved = positions.isna()
    if unresolved.any():
        # Fail with the offending ids instead of an opaque IndexError.
        raise ValueError(
            "link_data {} ids missing from node_data: {}".format(
                endpoint, df_links.loc[unresolved, endpoint].unique().tolist()
            )
        )
    df_links[endpoint] = positions.astype(int)
df_nodes['ID'] = df_nodes['ID'].map(node_position)

# Sankey plot setup
data_trace = dict(
    type='sankey',
    domain=dict(
        x=[0, 1],
        y=[0, 1],
    ),
    orientation="h",
    valueformat=".0f",
    node=dict(
        pad=10,
        line=dict(
            color="black",
            width=0,
        ),
        # Keep one label per node: removing a null label would shift the
        # positions that the links refer to, so blank it instead.
        label=df_nodes['Label'].fillna('').tolist(),
    ),
    link=dict(
        source=df_links['Source'].tolist(),
        target=df_links['Target'].tolist(),
        value=df_links['Value'].tolist(),
    ),
)

layout = dict(
    title="",
    height=772,
    width=1000,
    font=dict(size=10),
)

# validate=False: the figure is a plain dict, passed through without
# plotly's schema validation (as in the original cell).
fig = dict(data=[data_trace], layout=layout)
iplot(fig, validate=False)