# Create all the dictionaries needed to build the (entity) annotations and to map the gold labels to the Snorkel candidates

**Files needed:**

- **Corpus (with entity annotations) should be in BRAT format**

- **Gold (relationship) labels should be in TSV format**

In [1]:
import glob
import codecs
import pandas as pd

In [2]:
import bz2
import cPickle as pickle
import time

In [3]:
from preprocessing import standoff_to_entitydf, entitydf_to_meshdict, entitydf_to_tagdict

# Standoff format to entity DF

In [4]:
from preprocessing import standoff_to_entitydf

In [8]:
# Collect every BRAT standoff (.ann) file, gold-set matches first, then the
# "similar50" matches, in the order glob returns them.
ann_patterns = (
    "/home/antonis/data/biocreative6/goldset/*/*.ann",
    "/home/antonis/data/biocreative6/NCBI_parsed/similar50/*.ann",
)
ann_files = [ann_path for ann_pattern in ann_patterns for ann_path in glob.glob(ann_pattern)]

In [9]:
ann_files[0:3]+ann_files[-5:]

['/home/antonis/data/biocreative6/goldset/test_gs/15866500.ann',
 '/home/antonis/data/biocreative6/goldset/test_gs/20407761.ann',
 '/home/antonis/data/biocreative6/goldset/test_gs/23177789.ann',
 '/home/antonis/data/biocreative6/NCBI_parsed/similar50/22513289.ann',
 '/home/antonis/data/biocreative6/NCBI_parsed/similar50/3177812.ann',
 '/home/antonis/data/biocreative6/NCBI_parsed/similar50/26784681.ann',
 '/home/antonis/data/biocreative6/NCBI_parsed/similar50/9138706.ann',
 '/home/antonis/data/biocreative6/NCBI_parsed/similar50/15121849.ann']

In [10]:
# Start the wall-clock timer for the whole parse-and-save step.
start_time = time.time()

# Parse the standoff annotations again (time consuming!).
entities = standoff_to_entitydf(ann_files)

# Store the parsed entities as a bz2-compressed pickle.
entities_pickle_path = '/home/antonis/data/biocreative6/entities/entitiesdf_GS+similar50.pkl.bz2'
with bz2.BZ2File(entities_pickle_path, 'wb') as pickle_out:
    pickle.dump(entities, pickle_out)

# Elapsed time in seconds.
elapsed_seconds = time.time() - start_time
print(elapsed_seconds)

Found 12979 standoff files
Processing file 0 of 12979 (0.0 min)
Processing file 500 of 12979 (0.0 min)
Processing file 1000 of 12979 (0.0 min)
Processing file 1500 of 12979 (0.1 min)
Processing file 2000 of 12979 (0.1 min)
Processing file 2500 of 12979 (0.2 min)
Processing file 3000 of 12979 (0.3 min)
Processing file 3500 of 12979 (0.4 min)
Processing file 4000 of 12979 (0.5 min)
Processing file 4500 of 12979 (0.7 min)
Processing file 5000 of 12979 (0.8 min)
Processing file 5500 of 12979 (1.0 min)
Processing file 6000 of 12979 (1.2 min)
Processing file 6500 of 12979 (1.4 min)
Processing file 7000 of 12979 (1.7 min)
Processing file 7500 of 12979 (2.0 min)
Processing file 8000 of 12979 (2.3 min)
Processing file 8500 of 12979 (2.6 min)
Processing file 9000 of 12979 (2.9 min)
Processing file 9500 of 12979 (3.3 min)
Processing file 10000 of 12979 (3.7 min)
Processing file 10500 of 12979 (4.1 min)
Processing file 11000 of 12979 (4.5 min)
Processing file 11500 of 12979 (5.0 min)
Processing fi

# ~~~~

# Load entities df

In [11]:
# Read the pickled entities back from the bz2-compressed file.
entities_pickle_path = '/home/antonis/data/biocreative6/entities/entitiesdf_GS+similar50.pkl.bz2'
with bz2.BZ2File(entities_pickle_path, 'rb') as pickle_in:
    entities = pickle.load(pickle_in)

In [12]:
entities.head()

Unnamed: 0,T,EntityType,start,stop,EntityText,_,identifier,doc_id
0,T10,Chemical,1899,1904,FdUMP,,MESH:D005468,15866500
1,T11,Chemical,383,386,UdR,,Chemical:1,15866500
2,T12,Chemical,1928,1931,UdR,,Chemical:1,15866500
3,T13,Chemical,2008,2020,capecitabine,,MESH:C110904,15866500
4,T14,Chemical,2093,2108,2'-deoxyuridine,,Chemical:4,15866500


### Check mapped vs un-mapped entities in our dataset

In [14]:
154228+ 26964+136649+13478-41200


290119

In [13]:
def check_unmapped(ent_type, entities):
    """Count mapped vs. un-mapped identifiers for one entity type.

    An identifier counts as "Un-mapped" when it starts with ``<ent_type>:``
    (e.g. ``Chemical:1``). Any other prefix, including ``unknown:``, counts
    as "Mapped".

    Parameters
    ----------
    ent_type : str
        Value of the ``EntityType`` column to inspect.
    entities : pandas.DataFrame
        Frame with at least the ``EntityType`` and ``identifier`` columns.

    Returns
    -------
    pandas.Series
        Counts indexed by "Mapped" / "Un-mapped"; a label that never occurs
        is absent from the result.
    """
    placeholder_prefix = ent_type + ":"
    identifiers = entities[entities.EntityType == ent_type]['identifier']
    is_unmapped = pd.Series([ident.startswith(placeholder_prefix) for ident in identifiers])
    counts = is_unmapped.value_counts()
    return counts.rename({False: "Mapped", True: "Un-mapped"})

# Print the mapped / un-mapped counts for every entity type found in the data.
for ent_type in entities.EntityType.unique():
    print(ent_type)
    # The appended ' \n' leaves a trailing space and a blank line after each summary.
    print(str(check_unmapped(ent_type, entities)) + ' \n')
    
# Previous results (GS + chemdner_silver)
# Chemical
# Mapped       67113
# Un-mapped    30656
# dtype: int64 

# Gene
# Mapped       51594
# Un-mapped    17974
# dtype: int64 

# Disease
# Mapped       83511
# Un-mapped        3
# dtype: int64 

# Species
# Mapped    55273
# dtype: int64 

# Mutation
# Mapped    1690
# dtype: int64 

Chemical
Mapped       121003
Un-mapped     27075
dtype: int64 

Gene
Mapped       95413
Un-mapped    13588
dtype: int64 

Species
Mapped    34827
dtype: int64 

Disease
Mapped       45379
Un-mapped        1
dtype: int64 

Mutation
Mapped    2336
dtype: int64 



# Create entities import files for snorkel (candidate extraction)

### Create tag dict

In [14]:
unary_tags = entitydf_to_tagdict(entities)
# unary_tags

In [15]:
len(unary_tags)

12935

In [16]:
unary_tags[unary_tags.keys()[1]]

{(u'Chemical|Chemical:95131', 871, 887),
 (u'Chemical|Chemical:95132', 889, 896),
 (u'Chemical|MESH:C045645', 1756, 1766),
 (u'Chemical|MESH:C055162', 82, 93),
 (u'Chemical|MESH:C055162', 117, 128),
 (u'Chemical|MESH:C055162', 565, 576),
 (u'Chemical|MESH:D000244', 784, 787),
 (u'Chemical|MESH:D000244', 791, 794),
 (u'Chemical|MESH:D000244', 815, 818),
 (u'Chemical|MESH:D000244', 1274, 1277),
 (u'Chemical|MESH:D000244', 1289, 1292),
 (u'Chemical|MESH:D000244', 1343, 1346),
 (u'Chemical|MESH:D000527', 797, 813),
 (u'Chemical|MESH:D000527', 821, 825),
 (u'Chemical|MESH:D000527', 1295, 1299),
 (u'Chemical|MESH:D000527', 1349, 1353),
 (u'Chemical|MESH:D002784', 693, 704),
 (u'Chemical|MESH:D002784', 1612, 1623),
 (u'Chemical|MESH:D014280', 706, 719),
 (u'Chemical|MESH:D014280', 1628, 1641),
 (u'Disease|unknown:D001791', 752, 772),
 (u'Disease|unknown:D003324', 1149, 1172),
 (u'Disease|unknown:D006470', 374, 382),
 (u'Disease|unknown:D014652', 146, 162),
 (u'Gene|NCBIGENE:64805', 26, 31),
 

#### Write to bz2 pickle

In [17]:
# Save the tag dictionary as a bz2-compressed pickle.
unary_tags_path = '/home/antonis/data/biocreative6/entities/unary_tags.pkl.bz2'
with bz2.BZ2File(unary_tags_path, 'w') as pickle_out:
    pickle.dump(unary_tags, pickle_out)

### Create mesh dict

In [18]:
mesh_dict = entitydf_to_meshdict(entities)

In [19]:
# Save only the 'Chemical' and 'Gene' entries of mesh_dict, as a
# (chemical, gene) tuple, in a bz2-compressed pickle.
mesh_dict_path = '/home/antonis/data/biocreative6/entities/mesh_dict.pkl.bz2'
with bz2.BZ2File(mesh_dict_path, 'w') as pickle_out:
    chemical_and_gene = (mesh_dict['Chemical'], mesh_dict['Gene'])
    pickle.dump(chemical_and_gene, pickle_out)

# Create relationship import files for snorkel (candidate extraction)

### Create goldset relationships dict
**Skipped – assumes document-level annotations**

In [18]:
# from preprocessing import gold_relations_to_dict
# Build {doc_id: {T: identifier}}: rows are grouped by 'doc_id' and, within
# each group, every ('T', 'identifier') pair becomes one dict entry (if a 'T'
# value repeats inside a document, the last row wins).
T_mapping = entities.groupby('doc_id')[['T','identifier']].apply(lambda g: dict(g.values.tolist())).to_dict()

**To TSV**

In [19]:
# from preprocessing import gold_relations_to_tsv

In [20]:
entities.head()

Unnamed: 0,T,EntityType,start,stop,EntityText,_,identifier,doc_id,tup
0,T10,Chemical,1899,1904,FdUMP,,MESH:D005468,15866500,"(Chemical|MESH:D005468, 1899, 1904)"
1,T11,Chemical,383,386,UdR,,Chemical:1,15866500,"(Chemical|Chemical:1, 383, 386)"
2,T12,Chemical,1928,1931,UdR,,Chemical:1,15866500,"(Chemical|Chemical:1, 1928, 1931)"
3,T13,Chemical,2008,2020,capecitabine,,MESH:C110904,15866500,"(Chemical|MESH:C110904, 2008, 2020)"
4,T14,Chemical,2093,2108,2'-deoxyuridine,,Chemical:4,15866500,"(Chemical|Chemical:4, 2093, 2108)"


In [21]:
entities.tail()

Unnamed: 0,T,EntityType,start,stop,EntityText,_,identifier,doc_id,tup
454549,T23,Species,1569,1577,patients,N23\tReference T23 NCBITaxon:9606\tpatients,NCBITaxon:9606,22323410,"(Species|NCBITaxon:9606, 1569, 1577)"
454550,T24,Disease,1674,1685,weight loss,N24\tReference T24 unknown:D015431\tweight loss,unknown:D015431,22323410,"(Disease|unknown:D015431, 1674, 1685)"
454551,T25,Disease,1705,1713,diabetes,N25\tReference T25 unknown:D003920\tdiabetes,unknown:D003920,22323410,"(Disease|unknown:D003920, 1705, 1713)"
454552,T26,Chemical,1804,1813,vitamin D,N26\tReference T26 MESH:D014807\tvitamin D,MESH:D014807,22323410,"(Chemical|MESH:D014807, 1804, 1813)"
454553,T27,Disease,1818,1826,diabetes,N27\tReference T27 unknown:D003920\tdiabetes,unknown:D003920,22323410,"(Disease|unknown:D003920, 1818, 1826)"


In [22]:
from collections import defaultdict

In [23]:
# prepare for extracting to tsv

# Offsets of every entity as an [int(start), int(stop)] pair. A list
# comprehension (rather than map) yields a real list on any Python version.
entities['startstop'] = [
    [int(span_start), int(span_stop)]
    for span_start, span_stop in zip(entities.start, entities.stop)
]
# keep only Chemicals & Genes (combinations get too big)
entities = entities[entities.EntityType.isin(['Chemical', 'Gene'])]
# dict of offsets per T: {doc_id: {T: [start, stop]}}
T_offsets = entities.groupby('doc_id')[['T','startstop']].apply(lambda g: dict(g.values.tolist())).to_dict()
# dict of T types (to avoid pointless combinations): {doc_id: {EntityType: [T, ...]}}
T_types = defaultdict(lambda: defaultdict(list))
# Walk the three needed columns in lockstep instead of iterrows(), which
# builds a Series for every row. Use entities['T'] explicitly: the attribute
# entities.T is the DataFrame transpose, not the column.
for row_doc_id, row_entity_type, row_t in zip(entities['doc_id'], entities['EntityType'], entities['T']):
    T_types[row_doc_id][row_entity_type].append(row_t)

In [24]:
from preprocessing import gold_relations_to_tsv

In [26]:
# Analytics relationships

# Gold-standard relation files of every split found under the tsv directory.
input_path = glob.glob("/home/antonis/data/biocreative6/tsv/*/*gold_standard.tsv")
#, output_path, T_offsets, T_types,
# CPR classes counted as positive (label 1); every other class gets label -1.
true_rels=['CPR:3', 'CPR:4']
# correct_last_offset = True

relationship_columns = ['doc_id', 'CPR', 'arg1', 'arg2']
if isinstance(input_path,str):
    input_path = [input_path]

# Read every file first and concatenate once: appending to a DataFrame inside
# the loop re-copies all accumulated rows on each iteration.
relationship_frames = [pd.read_table(tsv_path, names=relationship_columns) for tsv_path in input_path]
if relationship_frames:
    relationships = pd.concat(relationship_frames, ignore_index=True)
else:
    # No input files: keep an empty frame that still has the expected columns
    # so the statements below do not fail on a missing 'doc_id'.
    relationships = pd.DataFrame(columns=relationship_columns)

# convert doc_id to string
relationships['doc_id'] = [str(doc_id_value) for doc_id_value in relationships['doc_id']]
relationships['label'] = [1 if cpr in true_rels else -1 for cpr in relationships.CPR]

print('Relationship labels in total (train,dev,test set)')
print(relationships.label.value_counts())

# Total (+) relationships in gold set:
# Train: 3022
# Dev: 1644
# Test 2326
# Total: 6992


Relationship labels in total (train,dev,test set)
 1    6992
-1    3039
Name: label, dtype: int64


In [27]:
# neg_rels1 = pd.DataFrame(map(lambda (T1,T2): (doc_id,-1,T1,T2) , product(T_types[doc_id]['Chemical'],T_types[doc_id]['Gene'])) , columns = [u'doc_id', u'CPR', u'arg1', u'arg2'])

# neg_rels1['arg1'] = map( lambda arg: "Arg1:"+arg , neg_rels1['arg1'])
# neg_rels1['arg2'] = map( lambda arg: "Arg2:"+arg , neg_rels1['arg2'])

# neg_rels

In [28]:
# Extract rels and save to disk

In [29]:
# Gold-standard relation files of every split found under the tsv directory.
path = glob.glob("/home/antonis/data/biocreative6/tsv/*/*gold_standard.tsv")
source_listing = ",\n".join(path)
print('Converting labels from:\n' + source_listing)

# NOTE(review): the recorded output of this cell shows 6987 positive labels,
# while the raw gold files counted above contain 6992 -- presumably a few
# relations reference entities missing from T_offsets / T_types; verify.
gold_rels_output = '/home/antonis/data/biocreative6/entities/gold_rels_complete.tsv'
print('Labels statistics:')
gold_relations_to_tsv(path, gold_rels_output, T_offsets, T_types)

Converting labels from:
/home/antonis/data/biocreative6/tsv/test_gs/chemprot_test_gold_standard.tsv,
/home/antonis/data/biocreative6/tsv/training/chemprot_training_gold_standard.tsv,
/home/antonis/data/biocreative6/tsv/development/chemprot_development_gold_standard.tsv
Labels statistics:
-1    314556
 1      6987
Name: label, dtype: int64
