In [1]:
import pandas as pd
import nltk
from nltk import sent_tokenize
import os
import spacy
from tqdm import tqdm
from spacy.tokenizer import Tokenizer
import re
import random
import shutil
nlp = spacy.load("en_core_web_sm")

import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 

In [2]:
class DataPrepSciERC():
    """Turn one brat-style document (``<name>.txt`` + ``<name>.ann``) into BIO-tagged sentences.

    Attributes:
        ann_df: DataFrame of the text-bound ("T") annotations, one row per
            distinct (beg_idx, end_idx) character span.
        text: Raw contents of the ``.txt`` file.
    """

    # Column order of the annotation frame returned by load_annotations.
    ANN_COLUMNS = ['term_id', 'type', 'beg_idx', 'end_idx', 'entity', 'line']

    def __init__(self, dir_path, filename):
        """Load ``<dir_path>/<filename>.txt`` and ``<dir_path>/<filename>.ann``."""
        self.ann_df, self.text = self.load_annotations(dir_path, filename)

    def load_annotations(self, dir_path, filename):
        """Read the document text and its text-bound annotations.

        Args:
            dir_path: Directory containing both files.
            filename: File name without extension.

        Returns:
            ``(df, text)`` where ``df`` has the columns in ``ANN_COLUMNS`` and
            ``text`` is the full contents of the ``.txt`` file.

        Raises:
            OSError: If either file cannot be opened.
            IndexError, ValueError: If a "T" line does not have the
                ``id<TAB>type start end<TAB>text`` layout.
        """
        txt_path = os.path.join(dir_path, filename + '.txt')
        ann_path = os.path.join(dir_path, filename + '.ann')

        # Explicit encoding so character offsets do not depend on the platform
        # locale. Assumes the corpus is UTF-8 (the brat convention) — TODO confirm.
        with open(txt_path, 'r', encoding='utf-8') as file:
            text = file.read()
        with open(ann_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()

        # Collect plain dicts and build the frame once: DataFrame.append was
        # removed in pandas 2.0 and was quadratic when called in a loop.
        rows = []
        for line in lines:
            # Only text-bound annotations (ids starting with "T") are used.
            if not line.startswith('T'):
                continue
            line = line.strip()
            parts = line.split('\t')
            span_fields = parts[1].split()  # "<type> <start> <end>"
            rows.append({
                'term_id': parts[0],
                'type': span_fields[0],
                'beg_idx': int(span_fields[1]),
                'end_idx': int(span_fields[2]),
                'entity': parts[-1],
                'line': line,
            })

        df = pd.DataFrame(rows, columns=self.ANN_COLUMNS)
        # The same character span may be annotated more than once; keep the first.
        df = df.drop_duplicates(subset=['beg_idx', 'end_idx'], keep='first')

        return df, text

    def convert_bio(self):
        """Tag every token of the document with B/I/O and split into sentences.

        Returns:
            DataFrame with columns ``sentence`` (space-joined tokens) and
            ``word_labels`` (comma-joined tags, one per token of the sentence).
        """
        df, text = self.ann_df, self.text

        doc = nlp(text)
        tags = ['O'] * len(doc)

        # Mark annotated spans; later annotations overwrite earlier ones on overlap.
        for beg_idx, end_idx in zip(df['beg_idx'], df['end_idx']):
            # 'expand' snaps character offsets outward to token boundaries.
            span = doc.char_span(int(beg_idx), int(end_idx), alignment_mode='expand')
            if span is None:
                # char_span returns None when the offsets cannot be mapped to
                # tokens; skip that annotation instead of failing the document.
                warnings.warn(
                    f"annotation span ({beg_idx}, {end_idx}) could not be "
                    f"aligned to tokens and was skipped"
                )
                continue
            for token_idx in range(span.start, span.end):
                tags[token_idx] = 'B' if token_idx == span.start else 'I'

        # Drop whitespace-only tokens together with their tags.
        kept = [(token.text, tag) for token, tag in zip(doc, tags)
                if token.text.strip() != '']
        tokens = [token_text for token_text, _ in kept]
        tags = [tag for _, tag in kept]
        text = ' '.join(tokens)

        # Sentences are cut from the space-joined token string, so the number
        # of whitespace-separated words in a sentence is the number of tags it
        # consumes.
        records = []
        offset = 0
        for sent in sent_tokenize(text):
            sent_len = len(sent.split())
            records.append({
                'sentence': sent,
                'word_labels': ','.join(tags[offset:offset + sent_len]),
            })
            offset += sent_len
        return pd.DataFrame(records, columns=['sentence', 'word_labels'])

In [6]:
class DataPrepSciERCDir():
    """Convert every annotated document in a directory to one BIO-tagged table."""

    def __init__(self):
        pass

    def prep_to_bio(self, dir_path, out_path='../corpus/scierc/scierc_train_full.csv'):
        """BIO-tag all ``.ann``/``.txt`` pairs in ``dir_path`` and save the result.

        Args:
            dir_path: Directory holding the ``.ann`` and ``.txt`` files.
            out_path: CSV file to write; pass ``None`` to skip writing.
                Defaults to the path the notebook has always used.

        Returns:
            DataFrame with columns ``sentence`` and ``word_labels``. Each
            document keeps its own 0..n index, so index values repeat.
        """
        # splitext keeps dots that belong to the document name; sorting makes
        # the row order independent of the order os.listdir happens to return.
        filenames = sorted(
            os.path.splitext(f)[0]
            for f in os.listdir(dir_path)
            if os.path.splitext(f)[1] == '.ann'
        )

        frames = []
        failed = 0
        for filename in tqdm(filenames):
            try:
                prepper = DataPrepSciERC(dir_path, filename)
                cur_df = prepper.convert_bio()
                if not cur_df.empty:
                    frames.append(cur_df)
            except Exception as e:
                # Best effort: one bad document must not abort the whole run,
                # but report which document was dropped.
                failed += 1
                print(f"{filename}: {e}")
        if failed:
            print(f"skipped {failed} of {len(filenames)} documents")

        # Concatenate once instead of growing the frame inside the loop.
        if frames:
            df = pd.concat(frames)
        else:
            df = pd.DataFrame(columns=['sentence', 'word_labels'])

        if out_path is not None:
            df.to_csv(out_path, header=True)
        return df

In [4]:
class DataPrepSciERCPartial():
    """Like a full BIO conversion of one document, but with annotations randomly dropped.

    Each text-bound annotation is kept with probability ``to_keep``; dropped
    annotations leave their tokens tagged ``O``.

    Attributes:
        ann_df: DataFrame of the kept annotations, one row per distinct
            (beg_idx, end_idx) character span.
        text: Raw contents of the ``.txt`` file.
        kept: Number of "T" lines that survived sampling (counted before
            duplicate spans are removed).
        removed: Number of "T" lines dropped by sampling.
    """

    # Column order of the annotation frame returned by load_annotations.
    ANN_COLUMNS = ['term_id', 'type', 'beg_idx', 'end_idx', 'entity', 'line']

    def __init__(self, dir_path, filename, to_keep, seed=None):
        """Load the document and subsample its annotations.

        Args:
            dir_path: Directory containing ``<filename>.txt`` and ``<filename>.ann``.
            filename: File name without extension.
            to_keep: Probability of keeping each annotation.
            seed: Optional seed for a private random generator. When ``None``
                (default) the module-level ``random`` state is used, as before.
        """
        self.removed = 0
        self.kept = 0
        rng = random.Random(seed) if seed is not None else None
        self.ann_df, self.text = self.load_annotations(dir_path, filename, to_keep, rng)

    def load_annotations(self, dir_path, filename, to_keep, rng=None):
        """Read the document text and a random subset of its annotations.

        Updates ``self.kept`` and ``self.removed`` as lines are sampled.

        Args:
            dir_path: Directory containing both files.
            filename: File name without extension.
            to_keep: Probability of keeping each annotation.
            rng: Optional ``random.Random`` instance; the module-level
                generator is used when omitted.

        Returns:
            ``(df, text)`` where ``df`` has the columns in ``ANN_COLUMNS`` and
            ``text`` is the full contents of the ``.txt`` file.

        Raises:
            OSError: If either file cannot be opened.
            IndexError, ValueError: If a kept "T" line does not have the
                ``id<TAB>type start end<TAB>text`` layout.
        """
        draw = rng.random if rng is not None else random.random

        txt_path = os.path.join(dir_path, filename + '.txt')
        ann_path = os.path.join(dir_path, filename + '.ann')

        # Explicit encoding so character offsets do not depend on the platform
        # locale. Assumes the corpus is UTF-8 (the brat convention) — TODO confirm.
        with open(txt_path, 'r', encoding='utf-8') as file:
            text = file.read()
        with open(ann_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()

        # Collect plain dicts and build the frame once: DataFrame.append was
        # removed in pandas 2.0 and was quadratic when called in a loop.
        rows = []
        for line in lines:
            # Only text-bound annotations (ids starting with "T") are used.
            if not line.startswith('T'):
                continue
            # Drop this annotation with probability 1 - to_keep.
            if draw() > to_keep:
                self.removed += 1
                continue
            self.kept += 1
            line = line.strip()
            parts = line.split('\t')
            span_fields = parts[1].split()  # "<type> <start> <end>"
            rows.append({
                'term_id': parts[0],
                'type': span_fields[0],
                'beg_idx': int(span_fields[1]),
                'end_idx': int(span_fields[2]),
                'entity': parts[-1],
                'line': line,
            })

        df = pd.DataFrame(rows, columns=self.ANN_COLUMNS)
        # The same character span may be annotated more than once; keep the first.
        df = df.drop_duplicates(subset=['beg_idx', 'end_idx'], keep='first')

        return df, text

    def convert_bio(self):
        """Tag every token with B/I/O from the kept annotations and split into sentences.

        Returns:
            ``(df, removed, kept)`` where ``df`` has columns ``sentence``
            (space-joined tokens) and ``word_labels`` (comma-joined tags), and
            the two counters are ``self.removed`` and ``self.kept``.
        """
        df, text = self.ann_df, self.text

        doc = nlp(text)
        tags = ['O'] * len(doc)

        # Mark annotated spans; later annotations overwrite earlier ones on overlap.
        for beg_idx, end_idx in zip(df['beg_idx'], df['end_idx']):
            # 'expand' snaps character offsets outward to token boundaries.
            span = doc.char_span(int(beg_idx), int(end_idx), alignment_mode='expand')
            if span is None:
                # char_span returns None when the offsets cannot be mapped to
                # tokens; skip that annotation instead of failing the document.
                warnings.warn(
                    f"annotation span ({beg_idx}, {end_idx}) could not be "
                    f"aligned to tokens and was skipped"
                )
                continue
            for token_idx in range(span.start, span.end):
                tags[token_idx] = 'B' if token_idx == span.start else 'I'

        # Drop whitespace-only tokens together with their tags.
        kept_pairs = [(token.text, tag) for token, tag in zip(doc, tags)
                      if token.text.strip() != '']
        tokens = [token_text for token_text, _ in kept_pairs]
        tags = [tag for _, tag in kept_pairs]
        text = ' '.join(tokens)

        # Sentences are cut from the space-joined token string, so the number
        # of whitespace-separated words in a sentence is the number of tags it
        # consumes.
        records = []
        offset = 0
        for sent in sent_tokenize(text):
            sent_len = len(sent.split())
            records.append({
                'sentence': sent,
                'word_labels': ','.join(tags[offset:offset + sent_len]),
            })
            offset += sent_len
        df = pd.DataFrame(records, columns=['sentence', 'word_labels'])
        return df, self.removed, self.kept

In [5]:
class DataPrepSciERCDirPartial():
    """Convert every document in a directory to BIO tags using a random subset of annotations."""

    def __init__(self):
        pass

    def prep_to_bio(self, dir_path, to_keep, out_path='../corpus/scierc/scierc_train_partial.csv'):
        """BIO-tag all ``.ann``/``.txt`` pairs in ``dir_path`` with subsampled annotations.

        Prints the total number of kept and removed annotations.

        Args:
            dir_path: Directory holding the ``.ann`` and ``.txt`` files.
            to_keep: Probability of keeping each annotation.
            out_path: CSV file to write; pass ``None`` to skip writing.
                Defaults to the path the notebook has always used.

        Returns:
            DataFrame with columns ``sentence`` and ``word_labels``. Each
            document keeps its own 0..n index, so index values repeat.
        """
        # splitext keeps dots that belong to the document name; sorting fixes
        # the processing order, which a seeded random run needs in order to
        # drop the same annotations every time.
        filenames = sorted(
            os.path.splitext(f)[0]
            for f in os.listdir(dir_path)
            if os.path.splitext(f)[1] == '.ann'
        )

        frames = []
        removed, kept = 0, 0
        failed = 0
        for filename in tqdm(filenames):
            try:
                prepper = DataPrepSciERCPartial(dir_path, filename, to_keep)
                cur_df, cur_removed, cur_kept = prepper.convert_bio()
                if not cur_df.empty:
                    frames.append(cur_df)
                removed += cur_removed
                kept += cur_kept
            except Exception as e:
                # Best effort: one bad document must not abort the whole run,
                # but report which document was dropped.
                failed += 1
                print(f"{filename}: {e}")
        if failed:
            print(f"skipped {failed} of {len(filenames)} documents")

        # Concatenate once instead of growing the frame inside the loop.
        if frames:
            df = pd.concat(frames)
        else:
            df = pd.DataFrame(columns=['sentence', 'word_labels'])

        if out_path is not None:
            df.to_csv(out_path, header=True)
        print("kept: ", kept)
        print("removed: ", removed)
        return df

In [7]:
# Build the fully annotated BIO training table from the documents in train_docs.
# prep_to_bio also writes the table to '../corpus/scierc/scierc_train_full.csv'.
dir_path = '../corpus/scierc/train_docs'
prepper = DataPrepSciERCDir()
df = prepper.prep_to_bio(dir_path)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:12<00:00, 30.81it/s]


In [48]:
# Build the partially annotated training table: each annotation is kept with
# probability KEEP_FRACTION. prep_to_bio also writes the table to
# '../corpus/scierc/scierc_train_partial.csv'.
SEED = 42            # fixes which annotations are dropped between runs
KEEP_FRACTION = 0.4  # probability of keeping each annotation

# The sampler draws from the module-level `random` generator, so seed it here.
# NOTE(review): the documents are visited in os.listdir order, so the exact
# subset can still differ if that order changes between runs.
random.seed(SEED)

dir_path = '../corpus/scierc/train_docs'
prepper = DataPrepSciERCDirPartial()
df = prepper.prep_to_bio(dir_path, KEEP_FRACTION)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:11<00:00, 35.25it/s]

kept:  2525
removed:  3960





In [17]:
#files = [f.split('.')[0] for f in os.listdir('../corpus/scierc/raw_data') if f.split('.')[-1] == 'ann']

In [18]:
#len(files)

400

In [19]:
# for i, file in enumerate(files):
#     if i == 400:
#         break
#     for ex in ['.ann', '.txt', '.txt.xml']:
#         shutil.move('../corpus/scierc/raw_data/' + file + ex, '../corpus/scierc/train_docs/' + file + ex)