### Doccano Annotation

Commands to run doccano locally:

```
docker pull chakkiworks/doccano
docker run -d --name --rm doccano -p 8000:8000 chakkiworks/doccano
docker exec doccano tools/create-admin.sh "admin" "admin@example.com" "password"
```

In [1]:
import os
import os.path as osp
import pandas as pd
import numpy as np
import snorkel
from snorkel.models import Candidate
from snorkel import SnorkelSession
from tcre.env import *
from tcre import supervision

In [2]:
session = SnorkelSession()
classes = supervision.get_candidate_classes()

In [3]:
# Pull list of all non-annotated candidate ids and types
dfc = pd.DataFrame(
    session.query(Candidate.id, Candidate.type).filter(Candidate.split == supervision.SPLIT_INFER).all(),
    columns=['id', 'type']
)
dfc.groupby('type').size()

type
inducing_cytokine                56404
inducing_transcription_factor    15155
secreted_cytokine                56404
dtype: int64

In [4]:
dfc.head()

Unnamed: 0,id,type
0,1,inducing_cytokine
1,2,inducing_cytokine
2,3,inducing_cytokine
3,4,inducing_cytokine
4,5,inducing_cytokine


In [26]:
# Generate samples for each type
dfs = dfc.groupby('type', group_keys=False).apply(lambda g: g.sample(n=500, random_state=TCRE_SEED))
dfs.groupby('type').size()

type
inducing_cytokine                500
inducing_transcription_factor    500
secreted_cytokine                500
dtype: int64

In [27]:
def get_filter_ids():
    path = osp.join(REPO_DATA_DIR, 'annotation', 'brat_export.csv')
    df = pd.read_csv(path)
    ids1 = df['id'].unique()
    
    path = osp.join(REPO_DATA_DIR, 'annotation', 'ngramviewer_export.csv')
    df = pd.read_csv(path)
    ids2 = df['doc_id'].unique()
    return list(set(list(ids1) + list(ids2)))
filter_ids = get_filter_ids()
dfs['cand'] = dfs.apply(lambda r: session.query(Candidate).filter(Candidate.id == r['id']).one(), axis=1)
dfs['doc_id'] = dfs.apply(lambda r: r['cand'].get_parent().document.name, axis=1)
dfs['has_cd4'] = dfs.apply(lambda r: any([c.get_span() == 'CD4' for c in r['cand'].get_contexts()]), axis=1)
dfs = dfs[~dfs['doc_id'].isin(filter_ids)]
dfs = dfs[~dfs['has_cd4']]

dfs = dfs.groupby('type', group_keys=False).apply(lambda g: g.head(300))
dfs.groupby('type').size()

type
inducing_cytokine                300
inducing_transcription_factor    300
secreted_cytokine                300
dtype: int64

In [28]:
# Convert to JSONL files for docanno; Example:
# {"text": "EU rejects German call to boycott British lamb.", "labels": [ [0, 2, "ORG"], [11, 17, "MISC"], ... ]}
# {"text": "Peter Blackburn", "labels": [ [0, 15, "PERSON"] ]}
# {"text": "President Obama", "labels": [ [10, 15, "PERSON"] ]}

markers = {
    supervision.ENT_TYP_CK_L: '||',
    supervision.ENT_TYP_CT_L: '$$',
    supervision.ENT_TYP_TF_L: '@@',
}

def process(r):
    cand = session.query(Candidate).filter(Candidate.id == r['id']).one()
    doc = cand.get_parent().get_parent()
    words = list(cand.get_parent().words)
    ctxs = cand.get_contexts()
    
    for ctx in ctxs:
        rng = ctx.get_word_range()
        typ = cand.get_parent().entity_types[rng[0]]
        # Add visual markers in the text to make the entities in question clear
        words[rng[0]] = markers[typ] + words[rng[0]]
        words[rng[1]] = words[rng[1]] + markers[typ] 
    meta = dict(
        type=typ, doc_id=doc.name, 
        e1_text=ctxs[0].get_span(), e1_start_chr=ctxs[0].char_start, e1_end_chr=ctxs[0].char_end,
        e2_text=ctxs[1].get_span(), e2_start_chr=ctxs[1].char_start, e2_end_chr=ctxs[1].char_end
    )
    return dict(text=' '.join(words), labels=[], meta=meta)

dfexp = pd.DataFrame(list(dfs.apply(process, axis=1).values))
dfexp.head()

Unnamed: 0,labels,meta,text
0,[],"{'type': 'immune_cell_type', 'doc_id': 'PMC489...",We show that neither the ||IFN-γ|| – producing...
1,[],"{'type': 'immune_cell_type', 'doc_id': 'PMC333...",The high amount of CD4+CD25 + T cells expressi...
2,[],"{'type': 'immune_cell_type', 'doc_id': 'PMC219...",In vitro analysis of acquired functional poten...
3,[],"{'type': 'immune_cell_type', 'doc_id': 'PMC334...",Under conditions in which DCs produce ||IL-23|...
4,[],"{'type': 'immune_cell_type', 'doc_id': 'PMC405...",To validate our observation of different time ...


In [29]:
path = osp.join(REPO_DATA_DIR, 'doccano', 'annotations_import.jsonl')
dfexp.to_json(path, orient='records', lines=True)
path

'/lab/repos/t-cell-relation-extraction/data/doccano/annotations_import.jsonl'

### Reload and Export

In [14]:
# These labels need to be defined on a fresh run of the doccano container
# and they need to be defined in an order that corresponds with the dict below
# (for some reason, doccano only exports the internal id associated with the label,
# which will increment indefinitely as you add projects and label categories)
labels = {
    1: (classes.inducing_cytokine.field, 1),
    2: (classes.inducing_cytokine.field, -1),
    3: (classes.secreted_cytokine.field, 1),
    4: (classes.secreted_cytokine.field, -1),
    5: (classes.inducing_transcription_factor.field, 1),
    6: (classes.inducing_transcription_factor.field, -1)
}
labels

{1: ('inducing_cytokine', 1),
 2: ('inducing_cytokine', -1),
 3: ('secreted_cytokine', 1),
 4: ('secreted_cytokine', -1),
 5: ('inducing_transcription_factor', 1),
 6: ('inducing_transcription_factor', -1)}

In [17]:
# Read doccano export:
# {"id": 1, "text": "EU rejec...", "annotations": [{"label": 2, "user": 1}, "label": 1, "user": 1}], "meta": {"s2": "yes", "s1": 1}, "annotation_approver": null}
# {"id": 2, "text": "Peter Bl...", "annotations": [{"label": 2, "user": 1}], "meta": {"s2": "yes", "s1": 1}, "annotation_approver": null}
# {"id": 3, "text": "Presiden...", "annotations": [{"label": 1, "user": 1}], "meta": {"s2": "yes", "s1": 1}, "annotation_approver": null}

path = osp.join(REPO_DATA_DIR, 'doccano', 'annotations_export.jsonl')
df = pd.read_json(path, lines=True)
df = pd.DataFrame([
    {**r['meta'], **dict(type=labels[a['label']][0], value=labels[a['label']][1])}
    for i, r in df.iterrows()
    for a in r['annotations']
])
df.head()

Unnamed: 0,s1,s2,type,value
0,1,yes,inducing_cytokine,-1
1,1,yes,inducing_cytokine,-1
2,1,yes,inducing_cytokine,1


In [None]:
path = osp.join(REPO_DATA_DIR, 'annotation', 'doccano_export.csv')
df.to_csv(path, index=False)
path