# proofread_soma
For our 2nd round of quality checks, we used a different approach. We looked at `'soma_xyz'` instead of `'nuc_xyz'`, and also noted whether each `'soma_xyz'` was located inside or outside a neuron. This information became helpful when we merged nuclei and cytosols later.

In [None]:
import numpy as np
import pyperclip
import pandas as pd
import pyperclip
from cloudvolume import CloudVolume, view, Bbox
from nglui import statebuilder,annotation,easyviewer,parser
from nglui.statebuilder import *
from nglui.nglite import *
import json
import sys
import os
from datetime import datetime
from caveclient import CAVEclient
from ..lib import get_cv_path
from fanc import rootID_lookup as IDlook

In [None]:
# Open a CAVE client on the 'fanc_production_mar2021' datastack.
datastack_name = 'fanc_production_mar2021'
client = CAVEclient(datastack_name=datastack_name)

In [None]:
# Load the running proofreading table; row 0 of the CSV is the header.
progress_csv = '../Output/proofread_soma_temp.csv'
df_progress = pd.read_csv(progress_csv, header=0)

In [None]:
# Keep only the rows whose `is_neuron` column is marked 'y'.
is_neuron_row = df_progress['is_neuron'] == 'y'
df = df_progress[is_neuron_row]

## 1. Create Neuroglancer links
We did not use lines for this QC. Instead, we decided to use the `linked_segmentation_layer` functionality. See [this example](https://github.com/seung-lab/NeuroglancerAnnotationUI/blob/master/examples/statebuilder_examples.ipynb) as well. Basically, when you jump to a soma point on the annotation layer, you will find not only your soma point in the center of your screen, but also your soma shown in some random color based on `'nuc_xyz'`. This is very helpful for guessing the spatial relationship between `'nuc_xyz'` and `'soma_xyz'` (e.g., if a soma point is inside a neuron and that neuron is colored, `'nuc_xyz'` is also likely to be inside that neuron). The previous method of using lines was good, but the line itself was too thin to find, and you cannot change that.

In [None]:
# Split the coordinate text of each point into one string column per component
# (presumably stored as "(x, y, z)" — the parentheses are stripped before splitting on ',').
nuc_xyz_df = df['nuc_xyz'].str.strip('()').str.split(',', expand=True)
soma_xyz_df = df['soma_xyz'].str.strip('()').str.split(',', expand=True)

# Working table: both coordinates plus the nucleus id, exposed under the column name 'id'.
xyz_df = df.reindex(columns=['nuc_xyz', 'soma_xyz', 'nucID']).rename(columns={'nucID': 'id'})

# Replace the text coordinates with lists of integers.
for column, parts in (('nuc_xyz', nuc_xyz_df), ('soma_xyz', soma_xyz_df)):
    xyz_df[column] = parts.astype(int).values.tolist()

In [None]:
# Order the nuclei by y, breaking ties by x and then by z, and reorder xyz_df to match.
# One multi-key sort is used on purpose: chaining single-column sort_values() calls does
# not keep the earlier ordering, because the default sort kind is not a stable sort.
nuc_xyz_df = nuc_xyz_df.set_axis(['Col_x', 'Col_y', 'Col_z'], axis=1)
sorted_indices = (
    nuc_xyz_df.astype(int)
    .sort_values(by=['Col_y', 'Col_x', 'Col_z'], ascending=True)
    .index
)
# Apply the new order, then renumber the rows 0..n-1.
xyz_df = xyz_df.reindex(sorted_indices).reset_index(drop=True)

In [None]:
# (Re)open the CAVE client on the 'fanc_production_mar2021' datastack.
datastack_name = 'fanc_production_mar2021'
client = CAVEclient(datastack_name=datastack_name)

In [None]:
# Pin the materialization client to the last entry of the available-versions list.
# NOTE(review): assumes get_versions() is ordered oldest-to-newest — confirm.
available_versions = client.materialize.get_versions()
client.materialize.version = available_versions[-1]

# Download the 'nuclei_aug2021ver2' table at that version.
nuc_from_cave = client.materialize.query_table('nuclei_aug2021ver2')

In [None]:
# Attach the `pt_root_id` from the queried nuclei table to every point, matching on 'id'.
# A left join keeps every row of xyz_df, including ids absent from the queried table.
latest_nuc_segid = nuc_from_cave.reindex(columns=['id', 'pt_root_id'])
merged = pd.merge(xyz_df, latest_nuc_segid, how='left', on='id')

In [None]:
# Map each row to a point annotation at 'soma_xyz', linked to the segment id in 'pt_root_id'.
points = PointMapper(
    'soma_xyz',
    linked_segmentation_column='pt_root_id',
)

In [None]:
# Layer definitions for the Neuroglancer link: two image layers, one segmentation
# layer, and the annotation layer that carries the soma points.
imgTokyo = ImageLayerConfig(name='FANCv4-jp',
                            source=get_cv_path('Image_Tokyo')['url'])
img = ImageLayerConfig(name='FANCv4',
                       source=get_cv_path('Image')['url'])

# The segmentation layer name is kept in one variable because the annotation
# layer below has to refer to exactly this name.
seg_layer_name = 'seg_Mar2021_proofreading'
seg = SegmentationLayerConfig(name=seg_layer_name,
                              source=get_cv_path('FANC_production_segmentation')['url'])

# `linked_segmentation_layer` takes the *name* of the segmentation layer to link to.
# The previous value 'seg' did not match any layer name defined here, so the
# annotations were not linked to 'seg_Mar2021_proofreading'.
ann = AnnotationLayerConfig(name='soma_Aug2021',
                            mapping_rules=points,
                            linked_segmentation_layer=seg_layer_name,
                            tags=['inside', 'outside', 'need_check'],
                            active=True)

In [None]:
# Viewer layout passed to the state builder.
view_options = dict(layout="xy")

# Extra top-level entries merged into every rendered state before upload.
memory_options = dict(
    gpuMemoryLimit=4000000000,
    systemMemoryLimit=9000000000,
    concurrentDownloads=64,
    jsonStateServer="https://global.daf-apis.com/nglstate/api/v1/post",
)

# Layers are passed in this order: Tokyo image, segmentation, image, annotations.
layer_stack = [imgTokyo, seg, img, ann]
sb = StateBuilder(
    layers=layer_stack,
    resolution=[4.3, 4.3, 45],
    view_kws=view_options,
)

In [None]:
# Build one Neuroglancer link per chunk of at most 500 points.
LINK = []
chunk_size = 500
for start in range(0, len(merged), chunk_size):
    # .loc slicing is label-based and inclusive, hence the "- 1" on the end label.
    dftmp = merged.loc[start:start + chunk_size - 1, :]

    # Render the chunk to a JSON state and add the memory/state-server settings.
    state = json.loads(sb.render_state(dftmp, return_as='json'))
    state.update(memory_options)

    # Upload the state and keep the resulting short link.
    jsn_id = client.state.upload_state_json(state)
    base_url = get_cv_path('neuroglancer_base')['url']
    LINK.append(client.state.build_neuroglancer_url(jsn_id, base_url))

In [None]:
# Write the links to CSV, one per row, with neither index nor header.
LINK2 = pd.DataFrame(data=LINK)
LINK2.to_csv(
    '../Output/links_20211223soma.csv',
    index=False,
    header=False,
)
# From here on, proceed exactly as in the 1st round.

## 2. Extract tags
We extracted annotation tags from the results, as in our 1st QC.

In [None]:
# only for the 1st time

# df['is_inside']=""
# df['is_outside']=""
# df['is_false_positive']=""
# df['is_duplicated']=""
# df.to_csv('../Output/proofread_soma_temp.csv', index=False)

In [None]:
# Reload the progress table.
df_progress = pd.read_csv("../Output/proofread_soma_temp.csv", header=0)

# From the tab-separated proofreading sheet, read only the 'new link' column.
prfrd = pd.read_csv(
    "../Output/soma_proofread_ver3_mistake_fixedfromver1.tsv",
    sep="\t",
    usecols=['new link'],
)

In [None]:
# The state id is whatever follows the last '/' of each non-empty link.
# `n` is passed by keyword: recent pandas versions no longer accept it positionally.
rsplitted = prfrd['new link'].dropna().str.rsplit('/', n=1)
new_id = rsplitted.str[-1].tolist()

print(len(new_id))

In [None]:
# For every saved state, collect the annotated points together with their tag.
mylist = []

for raw_id in new_id:
    state = client.state.get_state_json(int(raw_id))

    # Only the first annotation layer of the state is read.
    anno_layer = parser.annotation_layers(state)[0]
    nuc_tags = parser.tag_dictionary(state, anno_layer)
    anno_lists = parser.point_annotations(state, anno_layer, tags=True)
    anno_points = anno_lists[0]
    anno_tags = anno_lists[1]

    # Exactly one tag -> keep its id; no tag or several tags -> 0 for now.
    # Building the id list up front avoids assigning a list into a single cell
    # and also works when a state contains no annotations at all.
    tag_ids = [int(tags[0]) if len(tags) == 1 else 0 for tags in anno_tags]

    temp = pd.DataFrame({'anno_points': anno_points,
                         'anno_tags': tag_ids})
    # Translate values through the state's tag dictionary; 0 is left unchanged
    # unless the dictionary has an entry for it.
    temp['anno_tags'] = temp['anno_tags'].replace(nuc_tags)

    mylist.append(temp)


In [None]:
# Stack the per-link tables; reset_index() moves each table's own row number
# into an 'index' column and renumbers the rows 0..N-1.
stacked = pd.concat(mylist)
df_new = stacked.reset_index()

In [None]:
# Tally the extracted tags; the integer 0 marks points left without a single usable tag.
tag_values = df_new['anno_tags']
n_inside = (tag_values == 'inside').sum()
n_outside = (tag_values == 'outside').sum()
n_need_check = (tag_values == 'need_check').sum()
n_other = (tag_values == 0).sum()

print(f'inside neuron are {n_inside}')
print(f'outside neuron are {n_outside}')
print(f'need_check are {n_need_check}')
print(f'anything else? are {n_other}')
print(f'in total {len(df_new)}')

In [None]:
# Save the extracted tags without the helper 'index' column.
extracted = df_new.drop(columns="index")
extracted.to_csv('../Output/soma_proofread_ver3_extracted.csv', index=False)

In [None]:
# Points tagged neither 'inside' nor 'outside' still need another look.
is_inside = df_new['anno_tags'] == 'inside'
is_outside = df_new['anno_tags'] == 'outside'
df_new2 = df_new[~(is_inside | is_outside)]
print(len(df_new2))

In [None]:
# Narrow xyz_df to the rows whose index labels appear in df_new2.
unresolved_labels = df_new2.index
xyz_df = xyz_df.loc[unresolved_labels, :]
# Then go back to the link-building cells above and create new links.

## X. Save final results
After repeating the QC process and proofreading all the putative nuclei, we saved them into a single csv file.

In [None]:
# Load the extracted tags of the three proofreading rounds...
df_new, df_new2, df_new3 = (
    pd.read_csv(path, header=0)
    for path in (
        "../Output/soma_proofread_extracted.csv",
        "../Output/soma_proofread_ver2_extracted.csv",
        '../Output/soma_proofread_ver3_extracted.csv',
    )
)
# ...and the progress table they will be written into.
df_progress = pd.read_csv("../Output/proofread_soma_temp.csv", header=0)

In [None]:
# Number of rows in the progress table.
n_registered = len(df_progress)
print(f'total num of registered nuclei {n_registered}')

In [None]:
# Tag counts for the 1st round. After the CSV round trip the placeholder is the text '0'.
summary_df = df_new
print('df_new')
for label, tag in (('inside neuron', 'inside'),
                   ('outside neuron', 'outside'),
                   ('need_check', 'need_check'),
                   ('anything else?', '0')):
    print(f"{label} are {(summary_df['anno_tags'] == tag).sum()}")
print(f'in total {len(summary_df)}')

In [None]:
# Tag counts for the 2nd round. After the CSV round trip the placeholder is the text '0'.
summary_df = df_new2
print('df_new2')
for label, tag in (('inside neuron', 'inside'),
                   ('outside neuron', 'outside'),
                   ('need_check/glia', 'need_check'),
                   ('anything else?', '0')):
    print(f"{label} are {(summary_df['anno_tags'] == tag).sum()}")
print(f'in total {len(summary_df)}')

In [None]:
# Tag counts for the 3rd round. After the CSV round trip the placeholder is the text '0'.
summary_df = df_new3
print('df_new3')
for label, tag in (('inside neuron', 'inside'),
                   ('outside neuron', 'outside'),
                   ('need_check/glia', 'need_check'),
                   ('anything else?', '0')):
    print(f"{label} are {(summary_df['anno_tags'] == tag).sum()}")
print(f'in total {len(summary_df)}')

In [None]:
# Copy the 1st-round tags into df_progress, matching each annotated point to `soma_xyz`.
# test1 collects the positions of annotations that match no row of df_progress.
annotations = df_new.reindex()
test1 = []

for j, (anno_point, nuc_tag) in enumerate(zip(annotations['anno_points'].values,
                                              annotations['anno_tags'].values)):
    # Annotations are stored as "[x, y, z]" while soma_xyz uses "(x, y, z)".
    nuc_loc = '(' + anno_point.strip('[]') + ')'

    matches = df_progress['soma_xyz'] == nuc_loc
    if not matches.any():
        test1.append(j)
        continue

    # .loc with a boolean mask is used because .at only accepts a single scalar
    # label, whereas a point may match any number of rows.
    if nuc_tag == 'inside':
        df_progress.loc[matches, 'is_inside'] = 'y'
    elif nuc_tag == 'outside':
        df_progress.loc[matches, 'is_outside'] = 'y'

In [None]:
# Copy the 2nd-round tags into df_progress, matching each annotated point to `soma_xyz`.
# test2 collects the positions of annotations that match no row of df_progress.
annotations = df_new2.reindex()
test2 = []

for j, (anno_point, nuc_tag) in enumerate(zip(annotations['anno_points'].values,
                                              annotations['anno_tags'].values)):
    # Annotations are stored as "[x, y, z]" while soma_xyz uses "(x, y, z)".
    nuc_loc = '(' + anno_point.strip('[]') + ')'

    matches = df_progress['soma_xyz'] == nuc_loc
    if not matches.any():
        test2.append(j)
        continue

    # .loc with a boolean mask is used because .at only accepts a single scalar
    # label, whereas a point may match any number of rows.
    if nuc_tag == 'inside':
        df_progress.loc[matches, 'is_inside'] = 'y'
    elif nuc_tag == 'outside':
        df_progress.loc[matches, 'is_outside'] = 'y'
    elif nuc_tag == 'need_check':
        # 'need_check' points are recorded as glia; is_neuron receives the text 'NaN'.
        df_progress.loc[matches, 'is_neuron'] = 'NaN'
        df_progress.loc[matches, 'is_glia'] = 'y'

In [None]:
# Copy the 3rd-round tags into df_progress, matching each annotated point to `soma_xyz`.
# test3 collects the positions of annotations that match no row of df_progress.
annotations = df_new3.reindex()
test3 = []

for j, (anno_point, nuc_tag) in enumerate(zip(annotations['anno_points'].values,
                                              annotations['anno_tags'].values)):
    # Annotations are stored as "[x, y, z]" while soma_xyz uses "(x, y, z)".
    nuc_loc = '(' + anno_point.strip('[]') + ')'

    matches = df_progress['soma_xyz'] == nuc_loc
    if not matches.any():
        test3.append(j)
        continue

    # .loc with a boolean mask is used because .at only accepts a single scalar
    # label, whereas a point may match any number of rows.
    if nuc_tag == 'inside':
        df_progress.loc[matches, 'is_inside'] = 'y'
    elif nuc_tag == 'outside':
        df_progress.loc[matches, 'is_outside'] = 'y'
    elif nuc_tag == 'need_check':
        # 'need_check' points are recorded as glia; is_neuron receives the text 'NaN'.
        df_progress.loc[matches, 'is_neuron'] = 'NaN'
        df_progress.loc[matches, 'is_glia'] = 'y'

In [None]:
# Count the rows marked 'y' in each label column of the progress table.
for label, column in (('neuron', 'is_neuron'),
                      ('glia', 'is_glia'),
                      ('inside', 'is_inside'),
                      ('outside', 'is_outside')):
    print(f"{label} are {(df_progress[column] == 'y').sum()}")
print(f'in total {len(df_progress)}')

Then, we noticed that some of them were labeled incorrectly and that we still needed to fix them.

In [None]:
# Rows marked both as glia and as inside a neuron — a contradictory combination.
marked_glia = df_progress['is_glia'] == 'y'
marked_inside = df_progress['is_inside'] == 'y'
glia_but_inside = df_progress[marked_glia & marked_inside]
glia_but_inside

In [None]:
# First of all, the last one (nucID 72622194198315045) is actually a neuron:
# look up its row label(s) and display the row.
has_target_id = df_progress['nucID'] == 72622194198315045
this_is_neuron_idx = df_progress.index[has_target_id]
df_progress.loc[this_is_neuron_idx]

In [None]:
# Relabel that nucleus as a neuron and clear its glia mark (stored as the text 'NaN').
# this_is_neuron_idx is an Index, not a scalar label, so .loc is the right accessor;
# .at is meant for single-cell access only.
df_progress.loc[this_is_neuron_idx, 'is_neuron'] = 'y'
df_progress.loc[this_is_neuron_idx, 'is_glia'] = 'NaN'

In [None]:
# Now look again at the rows marked both as glia and as inside.
marked_glia = df_progress['is_glia'] == 'y'
marked_inside = df_progress['is_inside'] == 'y'
glia_but_inside2 = df_progress[marked_glia & marked_inside]
glia_but_inside2

In [None]:
# These rows must not carry an `is_inside` mark: clear it (stored as the text 'NaN')
# for every row whose nucID appears in glia_but_inside2.
# A boolean mask with .loc replaces the per-row .at calls, which were given an
# Index rather than the single scalar label that .at requires.
should_not_have_inside = df_progress['nucID'].isin(glia_but_inside2['nucID'])
df_progress.loc[should_not_have_inside, 'is_inside'] = 'NaN'

In [None]:
# Fixed results: count the rows marked 'y' in each label column again.
for label, column in (('neuron', 'is_neuron'),
                      ('glia', 'is_glia'),
                      ('inside', 'is_inside'),
                      ('outside', 'is_outside')):
    print(f"{label} are {(df_progress[column] == 'y').sum()}")
print(f'in total {len(df_progress)}')

In [None]:
# Write the corrected table back to the progress CSV, without the index column.
df_progress.to_csv(
    '../Output/proofread_soma_temp.csv',
    index=False,
)