In [2]:
import os
import os.path as osp

import pandas as pd
import spacy
import tqdm
%run src/logging.py
%run src/integration.py
%run env.py
# NOTE(review): DATA_DIR and REPO_DATA_DIR are not defined in this notebook;
# presumably they are provided by one of the %run scripts above — confirm.
# Input: CSV of article records (loaded below with pd.read_csv).
article_data_file = osp.join(DATA_DIR, 'articles', 'data.csv')
# Output: directory that receives the exported .txt/.ann file pairs.
output_collection_dir = osp.join(REPO_DATA_DIR, 'brat', 'collection_01')

In [None]:
# Build the NLP pipeline whose entity predictions are exported further down.
# NOTE(review): get_scispacy_pipeline is not defined in this notebook;
# presumably it comes from src/integration.py — confirm.
nlp = get_scispacy_pipeline()
# Show the pipeline components as the cell output.
nlp.pipeline

In [3]:
# Load the article table and summarize its columns / non-null counts.
df = pd.read_csv(article_data_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2990 entries, 0 to 2989
Data columns (total 5 columns):
abstract    2972 non-null object
date        1833 non-null object
id          2990 non-null int64
text        1490 non-null object
title       2990 non-null object
dtypes: int64(1), object(4)
memory usage: 116.9+ KB


In [4]:
# Restrict to articles that have both a full text and an abstract, then draw a
# reproducible (fixed random_state) sample of 100 documents to export.
df_exp = df.dropna(subset=['text', 'abstract']).sample(n=100, random_state=1)
df_exp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 68 to 137
Data columns (total 5 columns):
abstract    100 non-null object
date        88 non-null object
id          100 non-null int64
text        100 non-null object
title       100 non-null object
dtypes: int64(1), object(4)
memory usage: 4.7+ KB


In [5]:
# Comma-separated list of the sampled article ids, for reference.
', '.join(df_exp['id'].astype(str))

'4441007, 6080923, 6159523, 3402525, 6069866, 5392269, 2634967, 3798598, 4758703, 4890327, 3888704, 5357273, 4810024, 5536310, 6385830, 6127336, 2646571, 5318439, 4029221, 6303068, 4784746, 4422701, 6384271, 4919350, 3942504, 4451961, 5352345, 5464295, 4363672, 5771098, 3235500, 5550994, 5472765, 5597704, 4372528, 5785573, 4423225, 4409658, 3724802, 3616072, 4674549, 3304099, 5209687, 3828240, 5215294, 5173246, 3189223, 5339304, 4363301, 5264496, 4449014, 4978836, 5150244, 6249024, 4136865, 3637864, 4028272, 3804955, 3832935, 3707302, 5827529, 3531463, 4314580, 3922392, 5549607, 5406529, 6371971, 4390375, 5597262, 4893440, 4100769, 4991385, 4809658, 6070887, 4586477, 5532287, 4545657, 5148907, 3173465, 6077677, 5075284, 5704186, 4232071, 4203955, 3674606, 2193209, 3095633, 2938478, 4896250, 3672957, 6053392, 4529306, 5085117, 5179326, 4792960, 4016499, 3046151, 5716032, 6318333, 3418275'

In [6]:
# Character lengths of the 8 longest full texts in the sample, ascending.
# The saved output shows one document far longer than the rest, which is why
# texts are clipped before export.
df_exp['text'].str.len().sort_values().tail(8)

204       66549
381       66980
215       68161
98        70894
2072      76938
68        84116
2347      88164
90      1532124
Name: text, dtype: int64

In [9]:
# Maximum number of characters of combined text kept per exported document;
# longer texts are truncated to this length in export_annotations.
MAX_TEXT_LEN = 10000

def to_ann(doc):
    """Render the entities of ``doc`` as text-bound annotation lines.

    Each kept entity becomes one line of the form
    ``T<n>\\t<label> <start_char> <end_char>\\t<text>``, where ``<n>`` is the
    1-based position of the entity in ``doc.ents``. Entities whose text is
    blank or contains a line break are left out, so the numbering may have
    gaps.

    Args:
        doc: Object exposing ``ents``; each entity must provide ``text``,
            ``label_``, ``start_char`` and ``end_char``.

    Returns:
        The annotation lines joined with ``'\\n'`` (empty string if nothing
        is kept).
    """
    def _exportable(mention):
        # Blank mentions and mentions spanning a line break are dropped.
        return bool(mention.strip()) and '\n' not in mention

    rows = [
        'T{}\t{} {} {}\t{}'.format(
            number, ent.label_, ent.start_char, ent.end_char, ent.text
        )
        for number, ent in enumerate(doc.ents, start=1)
        if _exportable(ent.text)
    ]
    return '\n'.join(rows)

def export_annotations(df):
    """Export one ``.txt`` / ``.ann`` file pair per row of ``df``.

    For every row the title, abstract and full text are merged with
    ``combine_text``, truncated to ``MAX_TEXT_LEN`` characters, run through
    the module-level ``nlp`` pipeline, and written to
    ``output_collection_dir`` as ``PMC<id>.txt`` (the clipped text) and
    ``PMC<id>.ann`` (the output of ``to_ann``). Existing files with the same
    name are overwritten.

    Args:
        df: DataFrame with ``title``, ``abstract``, ``text`` and ``id``
            columns.

    Returns:
        None. The function is called for its file-system side effects.
    """
    # exist_ok avoids the check-then-create race and keeps re-runs idempotent.
    os.makedirs(output_collection_dir, exist_ok=True)

    # Iterate lazily; `total` lets tqdm show progress without first
    # materializing every row in a list.
    for _, r in tqdm.tqdm(df.iterrows(), total=len(df)):
        text = combine_text(r['title'], r['abstract'], r['text'])

        # Clip texts to manageable size for annotation (10k chars is ~50-100 sentences)
        text = text[:MAX_TEXT_LEN]
        doc = nlp(text)
        base_path = osp.join(output_collection_dir, 'PMC{}'.format(r['id']))

        # Write explicit UTF-8 and disable newline translation: the .ann file
        # stores character offsets into the .txt file, so the bytes on disk
        # must not depend on the platform's default encoding or line endings.
        with open(base_path + '.ann', 'w', encoding='utf-8', newline='') as fd:
            fd.write(to_ann(doc))
        with open(base_path + '.txt', 'w', encoding='utf-8', newline='') as fd:
            fd.write(text)

# Export the full 100-document sample.
# NOTE(review): the saved progress bar below reports 3 items, which matches the
# 3-id debug subset in the commented-out call rather than this full run —
# re-run the cell to refresh the output.
export_annotations(df_exp)
#export_annotations(df_exp[df_exp['id'].isin([2193209, 2634967, 2646571])])

100%|██████████| 3/3 [00:00<00:00,  3.39it/s]


In [None]:
# with open('/tmp/test_doc.ann', 'w') as fd:
#     fd.write(to_ann(doc))
# with open('/tmp/test_doc.txt', 'w') as fd:
#     fd.write(doc.text)

In [7]:
# doc = nlp(df[df['text'].notnull()]['text'].iloc[0])

In [31]:
# colors = {k: 'grey' for k in ner_types}
# colors.update({'PROTEIN': 'green', 'CELL_TYPE': 'red'})
# options = {'ents': ner_types, 'colors': colors}
# displacy.render(doc, style='ent', jupyter=True, options=options)

In [10]:
# T1	Organization 0 4	Sony
# T2	MERGE-ORG 14 27	joint venture
# T3	Organization 33 41	Ericsson
# E1	MERGE-ORG:T2 Org1:T1 Org2:T3
# T4	Country 75 81	Sweden
# R1	Origin Arg1:T3 Arg2:T4