In [101]:
import pandas as pd
import nltk
from nltk import sent_tokenize
import os
import spacy
from tqdm import tqdm
from spacy.tokenizer import Tokenizer
import re
import math
import numpy as np
nlp = spacy.load("en_core_web_sm")

import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 

In [116]:
class AnnotationDataPrep():
    """Split annotated documents into smaller text/annotation file pairs.

    Each input document is read from ``<dir_path>/<name>/<name>.txt`` together
    with a tab-separated annotation file ``<name>.ann`` in the same folder.
    The text is cut into pieces of at most ``chunk_size`` blocks (blocks are
    separated by a blank line, i.e. ``'\\n\\n'``) and every piece is written as
    ``<name>-<k>.txt`` / ``<name>-<k>.ann`` with the annotation offsets
    re-based to the start of that piece.
    """

    # Column order of the annotation table returned by load_annotations_file.
    ANN_COLUMNS = ['term_id', 'source', 'beg_idx', 'end_idx', 'entity', 'line']
    # Every loaded annotation is relabelled with this source name.
    SOURCE_NAME = 'OmniScienceMedicine'

    def __init__(self):
        pass

    def load_annotations_file(self, filename, dir_path):
        """Read one document and its ``T`` annotations.

        Args:
            filename: Document name; also the name of its sub-folder.
            dir_path: Directory that contains the document sub-folders.

        Returns:
            ``(df, text)`` where ``text`` is the full document text and ``df``
            has the columns in ``ANN_COLUMNS``, one row per distinct
            ``(beg_idx, end_idx)`` span (the first occurrence is kept).

        Raises:
            OSError: if either file cannot be opened.
            IndexError, ValueError: if a ``T`` line does not have the
                ``id<TAB>type beg end<TAB>text`` layout.
        """
        doc_dir = os.path.join(dir_path, filename)
        txt_path = os.path.join(doc_dir, filename + '.txt')
        ann_path = os.path.join(doc_dir, filename + '.ann')

        # Context managers close the handles even if reading fails.
        with open(txt_path, 'r', encoding='utf-8') as file:
            text = file.read()
        with open(ann_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()

        # Collect plain dicts and build the frame once: DataFrame.append was
        # quadratic and no longer exists in pandas 2.x.
        records = []
        for line in lines:
            # Parsing stops at the first line that is not a ``T`` annotation;
            # any ``T`` lines after such a line are not loaded.
            if not line.startswith('T'):
                break
            line = line.strip()
            parts = line.split('\t')
            # Middle field is "<type> <beg> <end>"; the type is replaced by
            # SOURCE_NAME. NOTE(review): a span written as several fragments
            # (e.g. "0 5;10 15") would fail in int() here - confirm none exist.
            span = parts[1].split()
            records.append({
                'term_id': parts[0],
                'source': self.SOURCE_NAME,
                'beg_idx': int(span[1]),
                'end_idx': int(span[2]),
                'entity': parts[-1],
                'line': line,
            })

        df = pd.DataFrame(records, columns=self.ANN_COLUMNS)
        # Keep the offsets numeric even when there are no annotations at all.
        df = df.astype({'beg_idx': 'int64', 'end_idx': 'int64'})
        df = df.drop_duplicates(subset=['beg_idx', 'end_idx'], keep='first')

        return df, text

    def get_annotation_in_format(self, term_id, source, beg_idx, end_idx, entity):
        """Return one annotation line: ``id<TAB>source beg end<TAB>entity``."""
        return term_id + '\t' + source + ' ' + str(beg_idx) + ' ' + str(end_idx) + '\t' + entity

    def output_files(self, file_text, file_ann_df, filename, num, dir_path):
        """Write one piece as ``<filename>-<num+1>.txt`` and ``.ann``.

        Args:
            file_text: Text of the piece.
            file_ann_df: Annotation rows of the piece; only the ``line``
                column is written, one value per line.
            filename: Name of the source document.
            num: Zero-based index of the piece (file names are one-based).
            dir_path: Output directory; created if it does not exist.
        """
        filename = filename + '-' + str(num+1)
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        text_path = os.path.join(dir_path, filename + '.txt')
        ann_path = os.path.join(dir_path, filename + '.ann')

        with open(text_path, 'w', encoding='utf-8') as file:
            file.write(file_text)

        with open(ann_path, 'w', encoding='utf-8') as file:
            file.write('\n'.join(list(file_ann_df['line'])))

    def break_annotations_file(self, filename, dir_path, output_path, chunk_size=50):
        """Split one document into pieces of at most ``chunk_size`` blocks.

        Args:
            filename: Document name (sub-folder of ``dir_path``).
            dir_path: Directory that contains the document sub-folders.
            output_path: Directory the pieces are written to.
            chunk_size: Maximum number of blank-line-separated blocks per
                piece. Defaults to 50.

        Raises:
            ValueError: if ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError('chunk_size must be at least 1')

        ann_df, text = self.load_annotations_file(filename, dir_path)
        sents = text.split('\n\n')
        num_files = math.ceil(len(sents) / chunk_size)
        for num in range(num_files):
            file_sents = sents[num * chunk_size:(num + 1) * chunk_size]
            # Re-adding the separator after the last block makes len(file_text)
            # equal to the number of characters this piece consumed from the
            # remaining text.
            file_text = '\n\n'.join(file_sents) + '\n\n'
            offset = len(file_text)

            # An annotation belongs to this piece iff it starts before
            # `offset`. Everything else - including one that starts exactly at
            # `offset`, the first character of the next piece - is carried
            # over; the previous `> offset` test silently dropped that one.
            in_piece = ann_df['beg_idx'] < offset
            file_ann_df = ann_df[in_piece].copy()
            # .copy() so the column assignments below act on real frames
            # rather than on views of the filtered parent.
            ann_df = ann_df[~in_piece].copy()
            ann_df['end_idx'] = ann_df['end_idx'] - offset
            ann_df['beg_idx'] = ann_df['beg_idx'] - offset

            # A list comprehension also works for a piece without annotations,
            # where apply(axis=1) would hand back an empty DataFrame.
            file_ann_df['line'] = [
                self.get_annotation_in_format(term_id, source, beg_idx, end_idx, entity)
                for term_id, source, beg_idx, end_idx, entity in zip(
                    file_ann_df['term_id'],
                    file_ann_df['source'],
                    file_ann_df['beg_idx'],
                    file_ann_df['end_idx'],
                    file_ann_df['entity'],
                )
            ]
            self.output_files(file_text, file_ann_df, filename, num, output_path)

    def break_annotations_dir(self, dir_path, output_path, chunk_size=50):
        """Split every document folder in ``dir_path`` whose name starts with 'B'.

        Args:
            dir_path: Directory that contains the document sub-folders.
            output_path: Directory the pieces are written to.
            chunk_size: Forwarded to ``break_annotations_file``.
        """
        # Only sub-folders are documents; stray files that happen to start
        # with 'B' are skipped. Sorting keeps the processing order stable.
        filenames = sorted(
            f for f in os.listdir(dir_path)
            if f.startswith('B') and os.path.isdir(os.path.join(dir_path, f))
        )
        for filename in tqdm(filenames):
            self.break_annotations_file(filename, dir_path, output_path, chunk_size)

In [117]:
# Split every document of the small test set and write the pieces to the
# small goldset folder.
dir_path = '../../s3_data/2023-04-03_1339/medicine-test-small/'
output_path = '../../s3_data/2023-04-03_1339/medicine-for-goldset-small/'

prepper = AnnotationDataPrep()
prepper.break_annotations_dir(dir_path=dir_path, output_path=output_path)

100%|██████████| 10/10 [00:11<00:00,  1.14s/it]


In [119]:
# Split every document of the big test set and write the pieces to the
# big goldset folder.
dir_path = '../../s3_data/2023-04-03_1339/medicine-test-big/'
output_path = '../../s3_data/2023-04-03_1339/medicine-for-goldset-big/'

prepper = AnnotationDataPrep()
prepper.break_annotations_dir(dir_path=dir_path, output_path=output_path)

100%|██████████| 40/40 [00:45<00:00,  1.14s/it]


In [89]:
# NOTE(review): scratch inspection cell. `file_df` is not assigned in any of
# the visible cells, so this fails on a fresh kernel - TODO define it or drop the cell.
file_df

Unnamed: 0,term_id,source,beg_idx,end_idx,entity,line
513,T514,OmniScienceMedicine,37,65,skeletal and cardiac muscles,T514\tOmniScienceMedicine 37 65\tskeletal and ...
514,T515,OmniScienceMedicine,50,65,cardiac muscles,T515\tOmniScienceMedicine 50 65\tcardiac muscles
515,T516,OmniScienceMedicine,91,99,myopathy,T516\tOmniScienceMedicine 91 99\tmyopathy
516,T517,OmniScienceMedicine,147,162,muscle necrosis,T517\tOmniScienceMedicine 147 162\tmuscle necr...
517,T518,OmniScienceMedicine,164,175,hypokalemia,T518\tOmniScienceMedicine 164 175\thypokalemia
...,...,...,...,...,...,...
621,T622,OmniScienceMedicine,4730,4744,Drug Addiction,T622\tOmniScienceMedicine 4730 4744\tDrug Addi...
622,T623,OmniScienceMedicine,4746,4760,Drug Addiction,T623\tOmniScienceMedicine 4746 4760\tDrug Addi...
623,T624,OmniScienceMedicine,4784,4804,Korsakoff's Syndrome,T624\tOmniScienceMedicine 4784 4804\tKorsakoff...
636,T637,OmniScienceMedicine,4373,4376,REM,T637\tOmniScienceMedicine 4373 4376\tREM


In [90]:
# NOTE(review): scratch inspection cell. No visible cell assigns a top-level
# `df` (the only `df` is a local inside load_annotations_file), so this relies
# on leftover kernel state - TODO define it or drop the cell.
df

Unnamed: 0,term_id,source,beg_idx,end_idx,entity,line
0,T1,OmniScienceMedicine,0,25,Alcohol-related Disorders,T1\tOmniScienceMedicine 0 25\tAlcohol-related ...
1,T2,OmniScienceMedicine,129,142,cell function,T2\tOmniScienceMedicine 129 142\tcell function
2,T3,OmniScienceMedicine,203,223,alcohol intoxication,T3\tOmniScienceMedicine 203 223\talcohol intox...
3,T4,OmniScienceMedicine,307,317,dependence,T4\tOmniScienceMedicine 307 317\tdependence
4,T5,OmniScienceMedicine,349,363,brain function,T5\tOmniScienceMedicine 349 363\tbrain function
...,...,...,...,...,...,...
633,T634,OmniScienceMedicine,11796,11821,high-density lipoproteins,T634\tLong 11796 11821\thigh-density lipoproteins
634,T635,OmniScienceMedicine,12461,12464,LDL,T635\tShort 12461 12464\tLDL
635,T636,OmniScienceMedicine,12436,12459,low density lipoprotein,T636\tLong 12436 12459\tlow density lipoprotein
636,T637,OmniScienceMedicine,30526,30529,REM,T637\tShort 30526 30529\tREM


In [73]:
# NOTE(review): scratch check of one character span. At top level `file_text`
# is only assigned by the debug loop cell further down, so this depends on
# out-of-order execution.
file_text[4378:4396]

'rapid eye movement'

In [39]:
# Debug loop: walks the text in groups of 50 blocks, like
# AnnotationDataPrep.break_annotations_file, and prints each group's length
# and its first 25 characters.
# NOTE(review): `num_files` and `sents` are not assigned in any visible
# top-level cell, and the loop rebinds `sents`, so re-running this cell
# without rebuilding `sents` gives different output.
for num in range(num_files):
    file_sents = sents[0:50]
    # if there are more files after this, remove the first 50 lines
    if num < num_files-1:
        sents = sents[50:]
    # the trailing separator makes len(file_text) include the blank line
    # that follows the last block of the group
    file_text = '\n\n'.join(file_sents) + '\n\n'
    offset = len(file_text)
    print(offset)
    print(file_text[0:25])
    

12633
Alcohol-related Disorders
13520
Digestive system: increas
4820
Alcohol has myotoxic effe
