In [1]:
import os
import os.path as osp

import numpy as np
import pandas as pd
import spacy
import tqdm
%run src/logging.py
%run src/integration.py
%run src/brat.py
%run env.py
# CSV of article records; it is read into `df` further down
article_data_file = osp.join(DATA_DIR, 'articles', 'data.csv')

# Creating a new collection:
#   1. Export all results, optionally including results from previous sets.
#   2. Copy the txt and ann files (and the .conf files) from every previous
#      set into the current collection.
# Earlier collection, kept for reference:
#   osp.join(REPO_DATA_DIR, 'brat', 'collection_01')
output_collection_dir = osp.join(REPO_DATA_DIR, 'brat', 'collection_02')

# The set that was annotated first: 10 articles
IDS1 = [
    2193209,
    2634967,
    2646571,
    2938478,
    3046151,
    3095633,
    3173465,
    3189223,
    3235500,
    3304099,
]
# Ids belonging to any earlier set (currently the very same list object as IDS1)
IDS_ALL = IDS1

In [2]:
# Build the spaCy pipeline used for entity tagging; get_scispacy_pipeline is
# not defined in this notebook (presumably loaded by one of the %run scripts).
nlp = get_scispacy_pipeline()
# Show the pipeline components as (name, component) pairs
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x119a13198>),
 ('parser', <spacy.pipeline.DependencyParser at 0x119a4abf8>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x1248ed570>)]

In [3]:
# Load the article table from the CSV at article_data_file
df = pd.read_csv(article_data_file)
# Summarise columns, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 6 columns):
abstract    3481 non-null object
date        2155 non-null object
has_text    3500 non-null bool
id          3500 non-null int64
title       3500 non-null object
xml         3500 non-null object
dtypes: bool(1), int64(1), object(4)
memory usage: 140.2+ KB


In [4]:
# Candidate pool: articles that have an abstract, plus every id from earlier sets
has_abstract = df['abstract'].notnull()
in_earlier_set = df['id'].isin(IDS_ALL)
df_exp = df[has_abstract | in_earlier_set]

# Draw 100 ids (reproducibly, via SEED) from those not in previous sets
unseen_ids = np.setdiff1d(df_exp['id'].unique(), IDS_ALL)
ids = pd.Series(unseen_ids).sample(n=100, random_state=SEED).values

# Restrict to the earlier sets' ids together with the fresh sample
keep_ids = list(IDS_ALL) + list(ids)
df_exp = df_exp[df_exp['id'].isin(keep_ids)]

df_exp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110 entries, 22 to 3496
Data columns (total 6 columns):
abstract    110 non-null object
date        69 non-null object
has_text    110 non-null bool
id          110 non-null int64
title       110 non-null object
xml         110 non-null object
dtypes: bool(1), int64(1), object(4)
memory usage: 5.3+ KB


In [5]:
# Render the selected article ids as one comma-separated string
', '.join(df_exp['id'].astype(str))

'4426480, 5876181, 4959015, 5112176, 4056277, 4474185, 4224975, 4552951, 5052263, 4159719, 3067507, 2193209, 6282816, 3855395, 6130380, 5611819, 5611846, 4592272, 4649113, 3317433, 5429091, 4241840, 3650071, 200936, 5191835, 4007342, 3787487, 5591438, 4214202, 3092345, 2783637, 5833121, 4418961, 6141714, 3854702, 4084624, 3246047, 3304099, 5020626, 3750006, 3791721, 2989239, 3321800, 3204990, 4851424, 5118948, 2996551, 4100769, 3926063, 5293011, 3064981, 5290235, 3095633, 6373736, 4023883, 2196041, 5983667, 3639604, 5520220, 2587175, 6197911, 4720349, 3046151, 3235500, 4423225, 4451961, 2646571, 5648021, 2634967, 3850168, 2938478, 6157333, 4233385, 3711858, 5519767, 5464295, 4628936, 5749247, 5923349, 3189223, 6372559, 4856445, 3323935, 6290922, 3228524, 2772737, 4710466, 3173465, 5578684, 3249647, 5206501, 5342705, 6274670, 4418002, 4385920, 5343661, 5257256, 4291544, 4905708, 3842119, 4151505, 2805085, 2983473, 4168117, 6092975, 5417820, 5727967, 6122729, 3927957, 4337382'

In [6]:
# df_exp was produced by boolean-filtering `df`, so assigning columns to it in
# place raises pandas' SettingWithCopyWarning and mutates a frame an earlier
# cell already displayed. `assign` returns a new frame instead.

# body: text pulled out of the raw XML by extract_text (null when nothing is returned)
df_exp = df_exp.assign(body=lambda d: d['xml'].apply(extract_text))

# text: title, abstract and body merged by combine_text, one value per row
df_exp = df_exp.assign(
    text=lambda d: d.apply(
        lambda r: combine_text(r['title'], r['abstract'], r['body']), axis=1
    )
)
df_exp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110 entries, 22 to 3496
Data columns (total 8 columns):
abstract    110 non-null object
date        69 non-null object
has_text    110 non-null bool
id          110 non-null int64
title       110 non-null object
xml         110 non-null object
body        56 non-null object
text        110 non-null object
dtypes: bool(1), int64(1), object(6)
memory usage: 7.0+ KB


In [7]:
# Character count of each combined text; show the 8 longest, in ascending order
text_lengths = df_exp['text'].str.len()
text_lengths.sort_values().tail(8)

1085     55029
3496     56990
898      58356
3354     59462
2679     59862
1033     84306
1083    102343
1125    162841
Name: text, dtype: int64

In [12]:
def export_annotations(df, output_dir=None, max_len_annotated=10000, max_len_default=5000):
    """Run the NLP pipeline over each article and export it to a brat collection.

    For every row the text is clipped, passed through `nlp`, stripped of
    trivially short entities, converted with `spacy_doc_to_brat_doc` and
    exported under the document id ``PMC<id>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``id`` and ``text``.
    output_dir : str, optional
        Directory handed to ``brat_doc.export``. Defaults to the
        notebook-level ``output_collection_dir``.
    max_len_annotated : int, optional
        Character limit for articles whose id is in ``IDS1``.
    max_len_default : int, optional
        Character limit for all other articles.
    """
    target_dir = output_collection_dir if output_dir is None else output_dir
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(target_dir, exist_ok=True)

    # Pass `total` so tqdm can show progress without materialising every row first
    for _, r in tqdm.tqdm(df.iterrows(), total=len(df)):
        max_len = max_len_annotated if r['id'] in IDS1 else max_len_default

        # Clip texts to manageable size for annotation (10k chars is ~50-100 sentences);
        # slicing is a no-op when the text is already shorter than the limit
        text = r['text'][:max_len]
        doc = nlp(text)
        doc_id = 'PMC{}'.format(r['id'])
        # Ignore whitespace-only or other single character named entities
        doc.ents = [ent for ent in doc.ents if len(ent.text.strip()) >= 2]
        brat_doc = spacy_doc_to_brat_doc(doc, doc_id)
        brat_doc.export(target_dir)

# Export every selected article; the commented-out call below restricts the
# export to three specific ids instead.
export_annotations(df_exp)
#export_annotations(df_exp[df_exp['id'].isin([2193209, 2634967, 2646571])])

100%|██████████| 110/110 [00:11<00:00,  9.74it/s]


In [13]:
# Copy the non-hidden files of collection_01 into collection_02. This cell comes
# after the export cell, so same-named files exported there are overwritten.
# NOTE(review): paths are hard-coded relative to the working directory rather
# than derived from output_collection_dir / REPO_DATA_DIR - confirm they match.
! cp data/brat/collection_01/* data/brat/collection_02/

In [None]:
# with open('/tmp/test_doc.ann', 'w') as fd:
#     fd.write(to_ann(doc))
# with open('/tmp/test_doc.txt', 'w') as fd:
#     fd.write(doc.text)

In [7]:
# doc = nlp(df[df['text'].notnull()]['text'].iloc[0])

In [31]:
# colors = {k: 'grey' for k in ner_types}
# colors.update({'PROTEIN': 'green', 'CELL_TYPE': 'red'})
# options = {'ents': ner_types, 'colors': colors}
# displacy.render(doc, style='ent', jupyter=True, options=options)

In [10]:
# T1	Organization 0 4	Sony
# T2	MERGE-ORG 14 27	joint venture
# T3	Organization 33 41	Ericsson
# E1	MERGE-ORG:T2 Org1:T1 Org2:T3
# T4	Country 75 81	Sweden
# R1	Origin Arg1:T3 Arg2:T4