In [5]:
import pandas as pd
import nltk
from nltk import sent_tokenize
import os
import spacy
from tqdm import tqdm
from spacy.tokenizer import Tokenizer
import re
nlp = spacy.load("en_core_web_sm")

import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 

In [6]:
infix_re = re.compile('[\(.,\)?]')

def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)
nlp.tokenizer = custom_tokenizer(nlp)

In [8]:
class DataPrepBIO():
    """Load annotations and convert them to BIO Tags"""
    
    def __init__(self):
        pass

    # load annotations file and convert to BIO tags
    # return DataFrame with the columns: [file, sentence, BIO tags]
    # filename: filename without any extensions
    # dir_path: path of directory containing all files 
    def load_annotations_file_bio(self, filename, dir_path):
        txt_path = dir_path + filename + '.txt'
        ann_path = dir_path + filename + '.ann'

        # load text
        file = open(txt_path, 'r')
        text = file.read()
        file.close()

        df = pd.read_csv(ann_path, sep='\t', names=['term_id', 'source', 'beg_idx', 'end_idx', 'entity'], header=None)
        # drop annotations that are not terms
        df = df[df['term_id'].str.contains('T')]
        # drop beg_idx, end_idx columns
        df['entity'] = df['beg_idx']
        df = df.drop(columns=['beg_idx', 'end_idx'])
        # separating source and spans
        df[['source', 'beg_idx', 'end_idx']] = df['source'].str.split(' ', 2, expand=True)
        df = df[df['source'] != 'Number']
        df = df[df['source'] != 'Condition-Unit']
        df['beg_idx'] = pd.to_numeric(df['beg_idx'])
        df['end_idx'] = pd.to_numeric(df['end_idx'])

        # BIO tagging entire text
        doc = nlp(text)
        tokens = []
        prev_term_id = ''
        tags = []

        # go through annotations and mark entities in doc with BIO tags
        tags = ['O']*len(doc)

        kept = 0
        for index, row in df.iterrows():
            kept += 1
            span = doc.char_span(row['beg_idx'], row['end_idx'], alignment_mode='expand')
            for token_idx in range(span.start, span.end):
                if token_idx == span.start:
                    tags[token_idx] = 'B'
                else:
                    tags[token_idx] = 'I'

        # removing tokens that are whitespaces
        tokens = []
        old_tags = tags
        tags = []
        for token, tag in zip(doc, old_tags):
            if token.text.strip() != '':
                tokens.append(token.text)
                tags.append(tag)
        text = ' '.join(tokens)

        # splitting tags and text into sentences
        sentences = sent_tokenize(text)
        df = pd.DataFrame(columns=['file', 'sentence', 'word_labels'])
        for sent in sentences:
            sent_len = len(sent.split())
            cur_tags = ','.join(tags[0:sent_len])
            # remove current sentence's tags
            tags = tags[sent_len:]
            df = df.append({'file': filename, 'sentence':sent, 'word_labels':cur_tags}, ignore_index=True)
        return df, kept
    
    # load annotations of all files in directory and convert to BIO tags
    # return DataFrame with the columns: [file, sentence, BIO tags]
    # dir_path: path of directory containing all files 
    def load_annotations_dir_bio(self, dir_path):
        filenames = [f.split('.')[0] for f in os.listdir(dir_path) if f.split('.')[1] == 'txt']
        df = pd.DataFrame(columns=['file', 'sentence', 'word_labels'])
        kept = 0
        for filename in tqdm(filenames):
            #print(filename)
            try:
                cur_df, cur_kept = self.load_annotations_file_bio(filename, dir_path)
                kept += cur_kept
            except:
                print(filename)
            df = pd.concat([df, cur_df])
        print("total annotations: ", kept)
        df.to_csv('../corpus/matpro/matpro_bio_tags.csv', header=True)
        return df

In [9]:
dir_path = '../corpus/matpro/data/'
prepper = DataPrepBIO()
filename = '101002adma200903953'
df = prepper.load_annotations_file_bio(filename, dir_path)

In [9]:
dir_path = '../corpus/matpro/data/'
prepper = DataPrepBIO()
df = prepper.load_annotations_dir_bio(dir_path)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 230/230 [00:10<00:00, 21.93it/s]

total annotations:  15625





In [17]:
df

Unnamed: 0,file,sentence,word_labels
0,101016jjallcom201511182,10 .,"O,O"
1,101016jjallcom201511182,1016/j .,"O,O"
2,101016jjallcom201511182,jallcom .,"O,O"
3,101016jjallcom201511182,2015 .,"O,O"
4,101016jjallcom201511182,11 .,"O,O"
...,...,...,...
8,101016jjpowsour201601014,"After the adsorption , the resin was filtered ...","O,O,O,O,O,B,O,B,O,B,O,O,O,B,B,O,O,B,O,B,O,O,O,..."
9,101016jjpowsour201601014,The resulting product was carbonized at 750 de...,"O,O,B,O,B,O,O,O,O,O,O,O,O,B,O,O,O,B,I,O,O,O,O,O"
10,101016jjpowsour201601014,"Finally , the black-colored product was furthe...","O,O,O,B,B,O,O,B,O,O,B,O,B,I,O,B,O,O,O,O"
11,101016jjpowsour201601014,"For comparison , pure carbon was prepared usin...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
