In [409]:
from bokeh.plotting import output_notebook
output_notebook()

import networkx as nx
import holoviews as hv
import hvplot.pandas

In [3]:
from neuclease.clio.api import *

In [4]:
from neuclease import configure_default_logging
configure_default_logging()

In [5]:
# NOTE(review): vnc_master is not assigned or explicitly imported in any visible cell —
# presumably it comes from the star import or from earlier kernel state; confirm,
# otherwise this cell fails on a fresh kernel.
vnc_master

('emdata5.janelia.org:8400', '557b3d9119b445c49025acb383eb5090')

In [6]:
# Download the clio annotations and keep the untouched result in clio_ann_orig
# (later cells use it as a DataFrame with a 'bodyid' column).
# The arguments are presumably (dataset, annotation type, output format) — confirm
# against neuclease.clio.api. %time reports how long the fetch takes.
%time clio_ann_orig = fetch_json_annotations_all('VNC', 'neurons', 'pandas')

CPU times: user 236 ms, sys: 44.6 ms, total: 281 ms
Wall time: 43.2 s


In [7]:
# Download the DVID body annotations and keep the untouched result in dvid_ann_orig.
# vnc_master is unpacked into the leading positional arguments; it looks like a
# (server, uuid) pair — TODO confirm.
%time dvid_ann_orig = fetch_body_annotations(*vnc_master)

CPU times: user 189 ms, sys: 15.6 ms, total: 204 ms
Wall time: 4.91 s


In [12]:
def _instance_parts(instance):
    """Split an instance name such as '18135_L' on underscores.

    Shinya named a few like this: (18135_L) -- so parentheses are stripped first.
    """
    return instance.replace('(', '').replace(')', '').split('_')


def _instance_group(instance):
    """Return the integer group id, i.e. the text before the first underscore."""
    return int(_instance_parts(instance)[0])


def _instance_soma_side(instance):
    """Return the soma side, i.e. the second underscore-separated field.

    'L' and 'R' are spelled out as 'LHS' and 'RHS', any other value is kept as is,
    and '' is returned when the instance has no second field.
    """
    parts = _instance_parts(instance)
    side = parts[1] if len(parts) > 1 else ''
    return {'L': 'LHS', 'R': 'RHS'}.get(side, side)


# Work on a copy so that dvid_ann_orig stays available for later inspection.
dvid_ann = fix_df_names(dvid_ann_orig.copy())

# Only bodies that actually carry an instance name can be parsed.
dvid_ann = dvid_ann.query('not instance.isnull() and instance != ""').copy()

dvid_ann = dvid_ann.assign(
    group=dvid_ann['instance'].map(_instance_group),
    soma_side=dvid_ann['instance'].map(_instance_soma_side),
)

# Keep just the columns that are compared against clio.
dvid_ann = dvid_ann[['group', 'soma_side', 'naming_user']].rename(columns={'naming_user': 'user'})

In [83]:
# Index the clio table by body (the index is named 'body') and keep only the rows
# that carry a non-empty group or a non-empty soma side.
clio_ann = (
    clio_ann_orig
    .copy()
    .set_index('bodyid')
    .rename_axis('body')
    .query('(not group.isnull() and group != "") or (not soma_side.isnull() and soma_side != "")')
)

In [63]:
# Outer-join the two tables on their index so that a body annotated in only one
# of them is still present; overlapping columns get a _dvid / _clio suffix.
clio_columns = clio_ann[['group', 'soma_side', 'user']]
ann = dvid_ann.merge(clio_columns, 'outer', left_index=True, right_index=True, suffixes=['_dvid', '_clio'])

# Unify terminology
dvid_side_fixes = {'UNP': 'M'}
clio_side_fixes = {'RHs': 'RHS', 'None': "", "TBD": ""}
ann['soma_side_dvid'] = ann['soma_side_dvid'].map(lambda side: dvid_side_fixes.get(side, side))
ann['soma_side_clio'] = ann['soma_side_clio'].map(lambda side: clio_side_fixes.get(side, side))

# Discard the rows whose clio group is an empty string.
ann = ann.query('group_clio != ""').copy()

# Convert groups to strings for graph analysis: '<source>_<group id>', or ""
# when the body has no group in that source.
ann['group_dvid_name'] = ""
ann['group_clio_name'] = ""

has_dvid_group = ann['group_dvid'].notnull()
has_clio_group = ann['group_clio'].notnull()

ann.loc[has_dvid_group, 'group_dvid_name'] = 'dvid_' + ann.loc[has_dvid_group, 'group_dvid'].astype(int).astype(str)
ann.loc[has_clio_group, 'group_clio_name'] = 'clio_' + ann.loc[has_clio_group, 'group_clio'].astype(int).astype(str)

In [15]:
# Display the merged DVID/clio annotation table.
ann

Unnamed: 0_level_0,group_dvid,soma_side_dvid,user_dvid,group_clio,soma_side_clio,user_clio
body,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000,10000.0,RHS,takemuras,10000,,jefferis@gmail.com
10002,10000.0,LHS,takemuras,10000,,lisa.marin@gmail.com
10004,,,,,LHS,lisa.marin@gmail.com
10010,10010.0,LHS,takemuras,,LHS,lisa.marin@gmail.com
10011,,,,,RHS,mbona2p3p@gmail.com
...,...,...,...,...,...,...
502745128677,,,,,RHS,lisa.marin@gmail.com
512015221257,,,,,RHS,lisa.marin@gmail.com
608986072590,,,,,LHS,lisa.marin@gmail.com
618805335261,,,,,RHS,lisa.marin@gmail.com


### Soma side disagreements

In [422]:
# A disagreement needs a non-null, non-empty soma side on both the DVID and the
# clio side, and the two values must differ.
disagreement_clauses = [
    'not soma_side_dvid.isnull()',
    'not soma_side_clio.isnull()',
    'soma_side_dvid != ""',
    'soma_side_clio != ""',
    'soma_side_dvid != soma_side_clio',
]
q = ' and '.join(disagreement_clauses)
soma_disagreements = ann.query(q)
len(soma_disagreements)

150

In [94]:
# List the bodies with a disagreement whose DVID soma side is "RHS".
# print() keeps the list on one line so it can be copied as plain text.
dvid_rhs_disagreements = soma_disagreements.query('soma_side_dvid == "RHS"')
print(dvid_rhs_disagreements.index.tolist())

[10016, 10098, 10127, 10136, 10209, 10322, 10374, 10383, 10452, 10528, 10529, 10567, 10659, 10674, 10862, 10959, 11096, 11178, 11225, 11227, 11292, 11293, 11342, 11451, 11665, 11942, 11962, 11967, 12016, 12754, 13035, 13058, 13371, 13375, 13509, 13670, 13807, 13840, 14481, 14706, 14791, 14945, 16242, 16679, 16761, 16958, 18299, 19984, 20681, 20760, 21315, 21574, 21818, 22334, 22536, 23092, 23104, 24698, 24737, 24878, 24911, 25756, 27420, 27436, 27869, 28050, 29114, 30031, 30965, 100167, 100199, 102590, 152542, 158757, 160145, 163891]


### Construct group graph

In [98]:
# Graph with one node per body and one node per named group: every body is linked
# to its DVID group node and to its clio group node, wherever that name is non-empty.
# DVID edges are added first, then clio edges.
g = nx.Graph()
for group_name_col in ('group_dvid_name', 'group_clio_name'):
    named_rows = ann.query(f'{group_name_col} != ""')
    g.add_edges_from(named_rows.reset_index()[['body', group_name_col]].values)

### Extract connected components

In [None]:
# One tuple per connected component of g:
#   (num bodies, num dvid groups, num clio groups, bodies, dvid groups, clio groups, node set)
union_groups = []
for cc in nx.connected_components(g):
    bodies, dvid_groups, clio_groups = [], [], []

    # Group nodes are strings named '<prefix><group id>'; both prefixes are 5 characters long.
    groups_by_prefix = {'dvid_': dvid_groups, 'clio_': clio_groups}

    for node in cc:
        # Body nodes are plain integers.
        if isinstance(node, int):
            bodies.append(node)
            continue

        prefix, group_id = node[:5], node[5:]
        assert prefix in groups_by_prefix

        # Parse through float so that an id written like '10000.0' is accepted too.
        groups_by_prefix[prefix].append(int(float(group_id)))

    counts = (len(bodies), len(dvid_groups), len(clio_groups))
    union_groups.append((*counts, bodies, dvid_groups, clio_groups, cc))

In [238]:
# One row per connected component ("set"). Rows are ordered by descending number
# of DVID groups, then descending number of clio groups, and renumbered from 0.
union_columns = ['num_bodies', 'num_dvid_groups', 'num_clio_groups', 'bodies', 'dvid_groups', 'clio_groups', 'cc']
df = (
    pd.DataFrame(union_groups, columns=union_columns)
    .sort_values(['num_dvid_groups', 'num_clio_groups'], ascending=False)
    .reset_index(drop=True)
)
#df.head(20)

In [237]:
# Negate the DVID counts on a copy (df itself is unchanged), presumably so that
# the DVID and clio bars of the stacked chart grow in opposite directions from zero.
_df = df.assign(num_dvid_groups=df['num_dvid_groups'] * -1)

# Plot the first 67 sets, in reversed row order.
plotted_counts = _df[['num_dvid_groups', 'num_clio_groups']].head(67).iloc[::-1]
plotted_counts.hvplot.barh(
    stacked=True,
    title='unioned group counts',
    legend='bottom_right',
    height=700,
).opts(xlabel='union id', ylabel='number')

### Plot connected components

In [415]:
from networkx.classes.filters import show_nodes
import hvplot.networkx as hvnx

def plot_groups(row):
    """
    Draw the connected component stored in row ``row`` of the global ``df``.

    The nodes listed in ``df.loc[row, 'cc']`` are shown as a subgraph view of the
    global graph ``g``. Integer nodes are drawn white, nodes whose name starts
    with 'dvid' skyblue, and nodes whose name starts with 'clio' springgreen;
    any other node raises an AssertionError.

    Args:
        row: Index label of the set in ``df`` to draw.

    Returns:
        The node drawing overlaid with the node labels, with height 800 and width 1000.
    """
    def _node_color(node):
        if isinstance(node, int):
            return 'white'
        if node.startswith('dvid'):
            return 'skyblue'
        if node.startswith('clio'):
            return 'springgreen'
        assert False

    members = list(df.loc[row, 'cc'])
    sg = nx.subgraph_view(g, show_nodes(members))

    # Colors are listed in the subgraph's own node order.
    colors = [_node_color(node) for node in sg.nodes()]

    # nx.spring_layout(sg) was tried as an alternative layout.
    pos = nx.kamada_kawai_layout(sg)

    # Labels are placed at the node position shifted by OFFSET in +y.
    OFFSET = 0.02
    label_pos = {node: (x, y + OFFSET) for node, (x, y) in pos.items()}

    node_plot = hvnx.draw(sg, pos=pos, node_color=colors)
    label_plot = hvnx.draw_networkx_labels(sg, label_pos)

    return (node_plot * label_plot).opts(height=800, width=1000)

In [417]:
import os
from bokeh.plotting import output_file, save as bokeh_save

# Directory that receives one standalone HTML file per plotted set.
PLOT_DIR = '/tmp/body-grouping-plots'

# Only the first 65 sets (rows 0..64 of df) are rendered.
NUM_PLOTTED_SETS = 65

# Create the output directory up front: on a fresh machine (or after /tmp has been
# cleaned) it does not exist, and writing the first HTML file would fail.
os.makedirs(PLOT_DIR, exist_ok=True)

for i in tqdm_proxy(range(NUM_PLOTTED_SETS)):
    p = plot_groups(i)

    # output_file() selects the destination and page title used by the next save().
    output_file(filename=f'{PLOT_DIR}/{i:02d}.html', title=f'body-group-set-{i}')

    # Convert the HoloViews object with hv.render() and write it to the selected file.
    bokeh_save(hv.render(p))

  0%|          | 0/65 [00:00<?, ?it/s]

In [419]:
# Write the sets table as tab-separated text. The index is labelled 'set', and the
# 'cc' column is left out.
export_columns = ['num_bodies', 'num_dvid_groups', 'num_clio_groups', 'dvid_groups', 'clio_groups', 'bodies']
sets_table = df.rename_axis('set')[export_columns]
sets_table.to_csv('/tmp/body-grouping-sets.tsv', sep='\t')

In [407]:
# Number of bodies contained in the first 65 sets (rows 0..64) — the same number
# of sets that the HTML export loop plots.
df.iloc[:65]['num_bodies'].sum()

429

In [405]:
# Number of bodies contained in all remaining sets (row 65 onward).
df.iloc[65:]['num_bodies'].sum()

4493

In [420]:
# Shape of the cleaned DVID table: rows are the entries with a non-empty instance,
# columns are group / soma_side / user.
dvid_ann.shape

(4915, 3)

In [408]:
# Total number of bodies over all sets.
df.num_bodies.sum()

4922

In [385]:
# Spot-check a few sets by label; .loc slicing includes both 499 and 501.
df.loc[499:501]

Unnamed: 0,num_bodies,num_dvid_groups,num_clio_groups,bodies,dvid_groups,clio_groups,cc
499,2,1,1,"[13021, 13030]",[13021],[13021],"{dvid_13021, clio_13021, 13021, 13030}"
500,2,1,1,"[15629, 13022]",[13022],[13022],"{clio_13022, dvid_13022, 15629, 13022}"
501,2,1,1,"[13025, 14122]",[13025],[13025],"{13025, 14122, dvid_13025, clio_13025}"


In [421]:
# Draw set 15 inline.
plot_groups(15)

In [392]:
# Unprocessed DVID annotation record for body 27228.
dvid_ann_orig.loc[27228]

body ID                                                         27228
class                                                             NaN
status                                          Prelim Roughly traced
user                                                        takemuras
naming user                                                 takemuras
instance                                                      25262_R
status user                                                    lohffa
last_modified_by                                                  NaN
instance_user                                                     NaN
comment                                                           NaN
json                {'body ID': 27228, 'status': 'Prelim Roughly t...
Name: 27228, dtype: object

In [395]:
# Clio 'group' values of bodies 25262 and 22986, read from the unprocessed clio table.
clio_ann_orig.set_index('bodyid').loc[[25262, 22986], 'group']

bodyid
25262    22986
22986    22986
Name: group, dtype: object

In [387]:
# Unprocessed DVID annotation records for every body listed in set 1067.
dvid_ann_orig.loc[df.loc[1067, 'bodies']]

Unnamed: 0_level_0,body ID,class,status,user,naming user,instance,status user,last_modified_by,instance_user,comment,json
body,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
43616,43616,,Prelim Roughly traced,smithc,costam,26089_L,smithc,,,,"{'body ID': 43616, 'status': 'Prelim Roughly t..."
42273,42273,,Prelim Roughly traced,ribeiroc,costam,26089_L,ribeiroc,,,,"{'body ID': 42273, 'status': 'Prelim Roughly t..."
31170,31170,,Prelim Roughly traced,cookm,costam,26089_L,cookm,,,,"{'body ID': 31170, 'status': 'Prelim Roughly t..."
26089,26089,,Prelim Roughly traced,smithc,costam,26089_L,,,,,"{'body ID': 26089, 'status': 'Prelim Roughly t..."
46922,46922,,Prelim Roughly traced,phillipse2,costam,26089_L,phillipse2,,,,"{'body ID': 46922, 'status': 'Prelim Roughly t..."
42673,42673,,Prelim Roughly traced,smithc,costam,26089_L,smithc,,,,"{'body ID': 42673, 'status': 'Prelim Roughly t..."
33074,33074,,Prelim Roughly traced,baileyd2,costam,26089_L,baileyd2,,,,"{'body ID': 33074, 'status': 'Prelim Roughly t..."
42130,42130,,Prelim Roughly traced,baileyd2,costam,26089_L,baileyd2,,,,"{'body ID': 42130, 'status': 'Prelim Roughly t..."
52357208220,52357208220,,Prelim Roughly traced,smithc,costam,26089_L,smithc,,,,"{'body ID': 52357208220, 'status': 'Prelim Rou..."
47230,47230,,Prelim Roughly traced,phillipse2,costam,26089_L,phillipse2,,,,"{'body ID': 47230, 'status': 'Prelim Roughly t..."


In [363]:
# Sets that pair exactly one DVID group with exactly one clio group and hold exactly 10 bodies.
df.query('num_dvid_groups == 1 and num_clio_groups == 1 and num_bodies == 10')

Unnamed: 0,num_bodies,num_dvid_groups,num_clio_groups,bodies,dvid_groups,clio_groups,cc
1067,10,1,1,"[43616, 42273, 31170, 26089, 46922, 42673, 330...",[26089],[26089],"{43616, 42273, 31170, 26089, 46922, clio_26089..."
1145,10,1,1,"[34144, 45282, 43011, 45311, 34409, 40234, 381...",[34144],[34144],"{34144, dvid_34144, 45282, 43011, 45311, 34409..."


In [324]:
# Number of bodies that belong to sets with exactly one DVID group and exactly one clio group.
df.query('num_dvid_groups == 1 and num_clio_groups == 1')['num_bodies'].sum()

2562

In [183]:
# Total number of bodies over all sets, for comparison with the one-to-one subtotal.
df['num_bodies'].sum()

4922

In [179]:
# Unprocessed DVID annotation record for body 26027.
dvid_ann_orig.loc[26027]

body ID                                                         26027
class                                                             NaN
status                                          Prelim Roughly traced
user                                                        takemuras
naming user                                                       NaN
instance                                                      25146_L
status user                                                       NaN
last_modified_by                                            takemuras
instance_user                                                     NaN
comment                                                           NaN
json                {'user': 'takemuras', 'instance': '25146_L', '...
Name: 26027, dtype: object