In [1]:
import pandas as pd
import nltk
from nltk import sent_tokenize
import os
import spacy
from tqdm import tqdm
from spacy.tokenizer import Tokenizer
import re
import random
nlp = spacy.load("en_core_web_sm")

import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 

In [1]:
class DataPrepMedMentionDoc():
    def __init__(self, filepath):
        self.ann_df, self.text = self.load_annotations(filepath)

    def change_offset(self, val, title_len):
        if val > title_len:
            val += 1
        return val
        
    def load_annotations(self, filepath):
        file = open(filepath, 'r')
        lines = file.readlines()
        file.close()
        title = lines[0].split('|')[-1].strip()
        abstract = lines[1].split('|')[-1].strip()
        lines = lines[2:]
        
        title_len = len(title)
        text = title + '. ' + abstract
        lines = [line.split('\t')[1:4] for line in lines]
        ann_df = pd.DataFrame(columns=['beg_idx', 'end_idx', 'entity'], data=lines)
        ann_df = ann_df[ann_df['beg_idx'].apply(lambda x: str(x).isdigit())]
        ann_df['beg_idx'] = pd.to_numeric(ann_df['beg_idx'])
        ann_df['end_idx'] = pd.to_numeric(ann_df['end_idx'])
        
        ann_df['beg_idx'] = ann_df.apply(lambda row: self.change_offset(row['beg_idx'], title_len), axis=1)
        ann_df['end_idx'] = ann_df.apply(lambda row: self.change_offset(row['end_idx'], title_len), axis=1)
        for index, row in ann_df.iterrows():
            if row['entity'] != text[row['beg_idx']:row['end_idx']]:
                print('problem')
                
        return ann_df, text
                
    def convert_bio(self):
        df, text = self.ann_df, self.text
        
        # BIO tagging entire text
        doc = nlp(text)
        tokens = []
        prev_term_id = ''
        tags = []

        # go through annotations and mark entities in doc with BIO tags
        tags = ['O']*len(doc)

        for index, row in df.iterrows():
            span = doc.char_span(row['beg_idx'], row['end_idx'], alignment_mode='expand')
            for token_idx in range(span.start, span.end):
                if token_idx == span.start:
                    tags[token_idx] = 'B'
                else:
                    tags[token_idx] = 'I'

        # removing tokens that are whitespaces
        tokens = []
        old_tags = tags
        tags = []
        for token, tag in zip(doc, old_tags):
            if token.text.strip() != '':
                tokens.append(token.text)
                tags.append(tag)
        text = ' '.join(tokens)

        # splitting tags and text into sentences
        sentences = sent_tokenize(text)
        df = pd.DataFrame(columns=['sentence', 'word_labels'])
        for sent in sentences:
            sent_len = len(sent.split())
            cur_tags = ','.join(tags[0:sent_len])
            # remove current sentence's tags
            tags = tags[sent_len:]
            df = df.append({'sentence':sent, 'word_labels':cur_tags}, ignore_index=True)
        return df

In [3]:
class DataPrepMedMentionDir():
    def __init__(self):
        pass
    
    def prep_to_bio(self, dir_path):
        filenames = [f for f in os.listdir(dir_path)]
        df = pd.DataFrame(columns=['sentence', 'word_labels'])
        for filename in tqdm(filenames):
            filepath = dir_path + '/' + filename
            try:
                prepper = DataPrepMedMentionDoc(filepath)
                cur_df = prepper.convert_bio()
                df = pd.concat([df, cur_df])
            except:
                print(filename)
        df.to_csv('../corpus/biored/biored_train_full.csv', header=True)
        return df

In [4]:
class DataPrepBioRedPartial():
    def __init__(self, filepath):
        self.ann_df, self.text, self.removed = self.load_annotations(filepath)
        self.kept = 0


    def change_offset(self, val, title_len):
        if val > title_len:
            val += 1
        return val
        
    def load_annotations(self, filepath):
        file = open(filepath, 'r')
        lines = file.readlines()
        file.close()
        title = lines[0].split('|')[-1].strip()
        abstract = lines[1].split('|')[-1].strip()
        lines = lines[2:]
        
        title_len = len(title)
        text = title + '. ' + abstract
        
        removed = 0
        partial_lines = []
        for line in lines: 
            line = line.split('\t')
            # remove 60% of all annotations
            if random.random() > 0.4:
                removed += 1
                continue
            partial_lines.append(line[1:4])
        lines = partial_lines
        
        ann_df = pd.DataFrame(columns=['beg_idx', 'end_idx', 'entity'], data=lines)
        ann_df = ann_df[ann_df['beg_idx'].apply(lambda x: str(x).isdigit())]
        ann_df['beg_idx'] = pd.to_numeric(ann_df['beg_idx'])
        ann_df['end_idx'] = pd.to_numeric(ann_df['end_idx'])
        
        ann_df['beg_idx'] = ann_df.apply(lambda row: self.change_offset(row['beg_idx'], title_len), axis=1)
        ann_df['end_idx'] = ann_df.apply(lambda row: self.change_offset(row['end_idx'], title_len), axis=1)
        for index, row in ann_df.iterrows():
            if row['entity'] != text[row['beg_idx']:row['end_idx']]:
                print('problem')
                
        return ann_df, text, removed
                
    def convert_bio(self):
        df, text = self.ann_df, self.text
        
        # BIO tagging entire text
        doc = nlp(text)
        tokens = []
        prev_term_id = ''
        tags = []

        # go through annotations and mark entities in doc with BIO tags
        tags = ['O']*len(doc)

        for index, row in df.iterrows():
            span = doc.char_span(row['beg_idx'], row['end_idx'], alignment_mode='expand')
            for token_idx in range(span.start, span.end):
                if token_idx == span.start:
                    tags[token_idx] = 'B'
                    self.kept += 1
                else:
                    tags[token_idx] = 'I'

        # removing tokens that are whitespaces
        tokens = []
        old_tags = tags
        tags = []
        for token, tag in zip(doc, old_tags):
            if token.text.strip() != '':
                tokens.append(token.text)
                tags.append(tag)
        text = ' '.join(tokens)

        # splitting tags and text into sentences
        sentences = sent_tokenize(text)
        df = pd.DataFrame(columns=['sentence', 'word_labels'])
        for sent in sentences:
            sent_len = len(sent.split())
            cur_tags = ','.join(tags[0:sent_len])
            # remove current sentence's tags
            tags = tags[sent_len:]
            df = df.append({'sentence':sent, 'word_labels':cur_tags}, ignore_index=True)
        return df, self.removed, self.kept

In [5]:
class DataPrepBioRedDirPartial():
    def __init__(self):
        pass
    
    def prep_to_bio(self, dir_path):
        removed = 0
        kept = 0
        filenames = [f for f in os.listdir(dir_path)]
        df = pd.DataFrame(columns=['sentence', 'word_labels'])
        for filename in tqdm(filenames):
            filepath = dir_path + '/' + filename
            try:
                prepper = DataPrepBioRedPartial(filepath)
                cur_df, cur_removed, cur_kept = prepper.convert_bio()
                df = pd.concat([df, cur_df])
                removed += cur_removed
                kept += cur_kept
            except:
                print(filename)
        #df.to_csv('../corpus/biored/biored_train_partial_all_40.csv', header=True)
        print("removed entities: ", removed)
        print("kept entities: ", kept)
        return df

In [14]:
filepath = '../corpus/biored/test_docs/19531695.txt'

In [19]:
dir_path = '../corpus/biored/train_docs'
prepper = DataPrepBioRedDirPartial()
prepper.prep_to_bio(dir_path)

 10%|████████████▍                                                                                                           | 52/501 [00:02<00:20, 21.79it/s]

.txt


 28%|█████████████████████████████████▋                                                                                     | 142/501 [00:06<00:14, 24.29it/s]

8944024.txt


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 501/501 [00:21<00:00, 23.48it/s]

removed entities:  13322
kept entities:  6783





Unnamed: 0,sentence,word_labels
0,Catechol - O - methyltransferase ( COMT ) gene...,"O,O,O,O,O,O,B,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,A common single nucleotide polymorphism ( SNP ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,We sequenced exon IV of COMT gene in search fo...,"O,O,O,O,O,B,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,Genotype frequencies of the G472A SNP varied s...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,"Using a genotype test , we found a trend to po...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
...,...,...
2,METHODS : A case - control study was carried o...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
3,"Genomic DNA was extracted from blood samples ,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
4,"Data were adjusted for sex , age , migraine hi...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
5,RESULTS : There was no association between ind...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"


In [6]:
dir_path = '../corpus/biored/train_docs'
prepper = DataPrepMedMentionDir()
prepper.prep_to_bio(dir_path)

 10%|████████████▍                                                                                                           | 52/501 [00:02<00:21, 21.06it/s]

.txt


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 501/501 [00:22<00:00, 21.92it/s]


Unnamed: 0,sentence,word_labels
0,Catechol - O - methyltransferase ( COMT ) gene...,"B,I,I,I,I,O,B,O,O,O,O,O,O,O,O,B,O,O,B,I,O,O,B,..."
1,A common single nucleotide polymorphism ( SNP ...,"O,O,O,O,O,O,O,O,O,B,O,O,O,O,B,O,O,O,O,O,O,O,O,..."
2,We sequenced exon IV of COMT gene in search fo...,"O,O,O,O,O,B,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,Genotype frequencies of the G472A SNP varied s...,"O,O,O,O,B,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,"Using a genotype test , we found a trend to po...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B,O,..."
...,...,...
2,METHODS : A case - control study was carried o...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B,O,O,O,O"
3,"Genomic DNA was extracted from blood samples ,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
4,"Data were adjusted for sex , age , migraine hi...","O,O,O,O,O,O,O,O,B,O,O,O,O,O,O,O,O,O,O,O,O,O"
5,RESULTS : There was no association between ind...,"O,O,O,O,O,O,O,O,O,O,B,O,O,O,O,O,O,O,O,O,O"


In [40]:
index = 0
for a,b in zip(df.iloc[index]['sentence'].split(' '), df.iloc[index]['word_labels'].split(',')):
    print(a,b)

Delirium B
in O
a O
patient B
with O
toxic O
flecainide B
plasma O
concentrations O
: O
the O
role O
of O
a O
pharmacokinetic O
drug O
interaction O
with O
paroxetine B
.. O


In [7]:
file = open('../corpus/biored/Dev.Pubtator','r')
text = file.read()
file.close()

docs = text.split('\n\n')
for doc in docs: 
    filename = doc.split('|')[0]
    file = open('../corpus/biored/dev_docs/' + filename + '.txt', 'w')
    file.write(doc)
    file.close()