In [1]:
import pandas as pd
import nltk
from nltk import sent_tokenize
import os
import spacy
from tqdm import tqdm
from spacy.tokenizer import Tokenizer
import re
nlp = spacy.load("en_core_web_sm")

import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 

In [2]:
class DataPrepMedMentionDoc():
    def __init__(self, filepath):
        self.ann_df, self.text = self.load_annotations(filepath)
        self.kept = 0

    def change_offset(self, val, title_len):
        if val > title_len:
            val += 1
        return val
        
    def load_annotations(self, filepath):
        file = open(filepath, 'r')
        lines = file.readlines()
        file.close()
        title = lines[0].split('|')[-1].strip()
        abstract = lines[1].split('|')[-1].strip()
        lines = lines[2:]
        
        title_len = len(title)
        text = title + '. ' + abstract
        lines = [line.split('\t')[1:4] for line in lines]
        ann_df = pd.DataFrame(columns=['beg_idx', 'end_idx', 'entity'], data=lines)
        ann_df['beg_idx'] = pd.to_numeric(ann_df['beg_idx'])
        ann_df['end_idx'] = pd.to_numeric(ann_df['end_idx'])
        
        ann_df['beg_idx'] = ann_df.apply(lambda row: self.change_offset(row['beg_idx'], title_len), axis=1)
        ann_df['end_idx'] = ann_df.apply(lambda row: self.change_offset(row['end_idx'], title_len), axis=1)
        for index, row in ann_df.iterrows():
            if row['entity'] != text[row['beg_idx']:row['end_idx']]:
                print('problem')
                
        return ann_df, text
                
    def convert_bio(self):
        df, text = self.ann_df, self.text
        
        # BIO tagging entire text
        doc = nlp(text)
        tokens = []
        prev_term_id = ''
        tags = []

        # go through annotations and mark entities in doc with BIO tags
        tags = ['O']*len(doc)

        for index, row in df.iterrows():
            self.kept += 1
            span = doc.char_span(row['beg_idx'], row['end_idx'], alignment_mode='expand')
            for token_idx in range(span.start, span.end):
                if token_idx == span.start:
                    tags[token_idx] = 'B'
                else:
                    tags[token_idx] = 'I'

        # removing tokens that are whitespaces
        tokens = []
        old_tags = tags
        tags = []
        for token, tag in zip(doc, old_tags):
            if token.text.strip() != '':
                tokens.append(token.text)
                tags.append(tag)
        text = ' '.join(tokens)

        # splitting tags and text into sentences
        sentences = sent_tokenize(text)
        df = pd.DataFrame(columns=['sentence', 'word_labels'])
        for sent in sentences:
            sent_len = len(sent.split())
            cur_tags = ','.join(tags[0:sent_len])
            # remove current sentence's tags
            tags = tags[sent_len:]
            df = df.append({'sentence':sent, 'word_labels':cur_tags}, ignore_index=True)
        return df, self.kept

In [3]:
class DataPrepMedMentionDir():
    def __init__(self):
        pass
    
    def prep_to_bio(self, dir_path):
        kept = 0
        filenames = [f for f in os.listdir(dir_path)]
        df = pd.DataFrame(columns=['sentence', 'word_labels'])
        for filename in tqdm(filenames):
            filepath = dir_path + '/' + filename
            try:
                prepper = DataPrepMedMentionDoc(filepath)
                cur_df, cur_kept = prepper.convert_bio()
                df = pd.concat([df, cur_df])
                kept += cur_kept
            except:
                print(filename)
        df.to_csv('../corpus/medmentions/medmentions_small_bio.csv', header=True)
        print("total annotations: ", kept)
        return df

In [4]:
filepath = '../corpus/medmentions/docs/27358636.txt'
dir_path = '../corpus/medmentions/docs_small'

In [61]:
prepper = DataPrepMedMentionDoc(filepath)
df = prepper.convert_bio()

In [5]:
prepper = DataPrepMedMentionDir()
prepper.prep_to_bio(dir_path)

  8%|████████▉                                                                                                             | 332/4393 [00:14<03:02, 22.21it/s]

.txt


 33%|██████████████████████████████████████▌                                                                              | 1447/4393 [01:03<02:20, 21.03it/s]

problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
27460729.txt


 85%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 3749/4393 [02:44<00:26, 24.68it/s]

problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
27059693.txt


 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 3934/4393 [02:52<00:20, 22.85it/s]

problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
27801889.txt


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4393/4393 [03:12<00:00, 22.86it/s]


total annotations:  203124


Unnamed: 0,sentence,word_labels
0,Haemophilus influenzae type b meningitis in a ...,"B,I,I,I,I,O,O,B,O,B,O,O"
1,Invasive Haemophilus influenzae type b ( Hib )...,"B,I,I,I,I,I,I,I,I,O,O,O,O,O,O,B,I,O,O,B,I,O"
2,We report a case of a fifteen - months - old g...,"O,B,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B,O,B,O,O,B,I,..."
3,She was irritable and the Brudzinski 's sign w...,"O,O,B,O,O,B,I,I,O,B,O"
4,The cerebrospinal fluid ( CSF ) analysis showe...,"O,B,I,O,B,O,O,O,B,O,B,I,I,O"
...,...,...
7,hPDSCs exerted inhibitory actions on inflammat...,"B,O,O,O,O,O,B,O,O,O,O,O,O,B,B,O,O,O,O,O,B,I,O,..."
8,The preservation of cells expressing lysozyme ...,"O,B,O,B,O,B,O,O,B,O,O,O,O,O,B,B,O,O"
9,Both preventive and therapeutic efficacies of ...,"O,O,O,B,O,O,B,O,O,O,B,I,I,I,O"
10,Label - free quantification was used to identi...,"O,O,O,O,O,O,O,O,B,O,O,B,I,O,B,B,O,O,O,B,I,I,I,..."


In [69]:
index = 15
for a,b in zip(df.iloc[index]['sentence'].split(' '), df.iloc[index]['word_labels'].split(',')):
    print(a,b)

DWI B
and O
quantitative B
measurement I
of O
ADC B
values B
can O
be O
used O
in O
differential B
diagnosis I
of O
benign B
and O
malignant B
liver B
lesions I
and O
also O
in O
the O
diagnosis B
and O
differentiation B
of O
hemangiomas B
. O


In [76]:
file = open('../corpus/medmentions/corpus_pubtator_small.txt','r')
text = file.read()
file.close()

docs = text.split('\n\n')
for doc in docs: 
    filename = doc.split('|')[0]
    file = open('../corpus/medmentions/docs_small/' + filename + '.txt', 'w')
    file.write(doc)
    file.close()