## BRAT Label Export

Convert BRAT annotations to CSV. In this case, the BRAT annotations cover only the dev partition.

In [6]:
import os
import os.path as osp
import pandas as pd
import numpy as np
from bratreader.repomodel import RepoModel
from tcre.env import *
from tcre.supervision import ENT_TYP_TF, ENT_TYP_CK, ENT_TYP_CT

In [2]:
# Directory of the BRAT collection to export, located under the repo data directory
collection_dir = osp.join(REPO_DATA_DIR, 'brat', 'collection_02')
collection_dir

'/lab/repos/t-cell-relation-extraction/data/brat/collection_02'

In [3]:
# Load the BRAT collection; presumably this parses every annotated document found
# in the directory (TODO confirm against bratreader's RepoModel)
repo_model = RepoModel(collection_dir)

In [4]:
# Number of documents in the loaded collection
len(repo_model.documents)

89

In [7]:
def get_relations(doc):
    """Flatten the relations annotated in one BRAT document into records.

    Args:
        doc: Document exposing ``key`` and ``annotations``. Each annotation is
            expected to expose ``labels`` (exactly one entry), ``links``
            (relation type -> list of source annotations), ``realspan``
            (start, end) and ``repr`` (annotated text).

    Returns:
        List of dicts, one per relation, with keys ``id``, ``rel_typ`` and
        ``{e1,e2}_{typ,start_chr,end_chr,text}``.

    Raises:
        AssertionError: If either end of a relation does not carry exactly one label.
        KeyError: If an annotation label has no mapping to a snorkel entity type.
    """
    # Entity labels as written in the annotation tool -> labels used within snorkel
    label_to_type = {
        'TF': ENT_TYP_TF,
        'CELL_TYPE': ENT_TYP_CT,
        'CYTOKINE': ENT_TYP_CK
    }

    def entity_type(annotation):
        # Callers assert beforehand that the annotation has exactly one label
        return label_to_type[next(iter(annotation.labels))]

    records = []
    for target in doc.annotations:
        # Relations are stored only as INCOMING links: `target` is what a relation
        # points at, and every annotation listed under a link is a source
        if not target.links:
            continue
        for rel_name, sources in target.links.items():
            for source in sources:
                assert len(source.labels) == len(target.labels) == 1

                # Snorkel convention: the cell type goes second. When the target is
                # a cell type the pair stays (source, target); in every other case
                # the pair is flipped to (target, source).
                if entity_type(target) == ENT_TYP_CT:
                    first, second = source, target
                else:
                    first, second = target, source

                records.append({
                    'id': doc.key,
                    'rel_typ': rel_name,
                    'e1_typ': entity_type(first),
                    'e1_start_chr': first.realspan[0],
                    'e1_end_chr': first.realspan[1],
                    'e1_text': first.repr,
                    'e2_typ': entity_type(second),
                    'e2_start_chr': second.realspan[0],
                    'e2_end_chr': second.realspan[1],
                    'e2_text': second.repr,
                })
    return records
# One row per relation across every document in the collection; identical rows
# are dropped so each relation is kept only once
df = pd.DataFrame([
    record
    for doc in repo_model.documents.values()
    for record in get_relations(doc)
]).drop_duplicates()
df.head()

Unnamed: 0,e1_end_chr,e1_start_chr,e1_text,e1_typ,e2_end_chr,e2_start_chr,e2_text,e2_typ,id,rel_typ
0,1343,1338,IL-17,CYTOKINE,1296,1291,Vγ4 T,IMMUNE_CELL_TYPE,PMC4451961,Secretion
1,1465,1460,IL-17,CYTOKINE,1296,1291,Vγ4 T,IMMUNE_CELL_TYPE,PMC4451961,Secretion
2,1908,1903,IL-17,CYTOKINE,1855,1850,Vγ4 T,IMMUNE_CELL_TYPE,PMC4451961,Secretion
3,3737,3730,(IFN)-γ,CYTOKINE,3829,3825,γδ T,IMMUNE_CELL_TYPE,PMC4451961,Secretion
4,3761,3754,(IL)-17,CYTOKINE,3829,3825,γδ T,IMMUNE_CELL_TYPE,PMC4451961,Secretion


In [8]:
# Relation counts per (first entity type, second entity type, relation type)
df.groupby(['e1_typ', 'e2_typ', 'rel_typ']).size()

e1_typ                e2_typ                rel_typ               
CYTOKINE              CYTOKINE              CKCKEnhancement            31
                                            CKCKEnhancementNeg         11
                      IMMUNE_CELL_TYPE      CKProliferation             5
                                            Induction                 145
                                            InductionNeg                3
                                            InductionNeutral           17
                                            Secretion                 152
                                            SecretionNeg                3
                      TRANSCRIPTION_FACTOR  TFCKEnhancement            15
                                            TFCKEnhancementNeg         13
TRANSCRIPTION_FACTOR  CYTOKINE              CKTFEnhancement            20
                                            CKTFEnhancementNeg          2
                      IMMUNE_CELL_TYPE      D

In [9]:
# Keep only the three relation types that get exported, then recount rows per
# (entity type pair, relation type) to check what survived the filter
is_exported_rel = df['rel_typ'].isin(['Induction', 'Secretion', 'Differentiation'])
dff = df.loc[is_exported_rel]
dff.groupby(['e1_typ', 'e2_typ', 'rel_typ']).size()

e1_typ                e2_typ            rel_typ        
CYTOKINE              IMMUNE_CELL_TYPE  Induction          145
                                        Secretion          152
TRANSCRIPTION_FACTOR  IMMUNE_CELL_TYPE  Differentiation     95
dtype: int64

In [12]:
# Write the filtered relations to CSV.
# Note: exported character ranges are end-exclusive, i.e. end_chr - start_chr
# equals the length of the entity text
path = osp.join(REPO_DATA_DIR, 'annotation',  'brat_export.csv')
# Make sure the target directory exists (e.g. on a fresh checkout); otherwise
# to_csv fails because it cannot open the output file
os.makedirs(osp.dirname(path), exist_ok=True)
dff.to_csv(path, index=False)
path

'/lab/repos/t-cell-relation-extraction/data/annotation/brat_export.csv'