## (1) Reading in the dataset

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk import tokenize
STOPWORDS = set(stopwords.words('english'))
import string
PUNCTUATION = set(char for char in string.punctuation)
import csv
import spacy
import re
import numpy as np
import pandas as pd
from classify_abs import get_abstract
import string
PUNCTUATION = set(char for char in string.punctuation)
# Approximation/comparison words: when one of these sits one or two tokens before a
# colon-style ratio (e.g. 1:100,000), modify_labels starts the STAT span on it.
INCLUSIVE_WORDS = {'between','around','approximately','about','<','>','roughly','relatively','over','under','than'}#less than, greater than
# Descriptors that modify_labels folds into the EPI span when they directly precede a B-EPI token.
EPI_MODIFIERS = {'annual','overall','estimated','weighted','nationwide','pooled','average','cumulative'}
# Lower-cased month names: modify_labels resets the labels on a month token and the tokens right after it to 'O'.
DATES = {'january','february','march','april','may','june','july','august','september','october','november','december'}

### Grab the Abstract

In [2]:
# get_abstract is imported from the project-local classify_abs module.
# NOTE(review): 20490927 is presumably a PubMed ID — confirm against classify_abs.
abstract = get_abstract(20490927)
abstract

'The aim of this retrospective study was to determine the prevalence of lysosomal storage disorders (LSDs) in the Czech Republic. The data on cases diagnosed between 1975 and 2008 were collected and analyzed. The overall prevalence of LSDs in the Czech population (12.25 per 100,000) is comparable to that reported for the countries with well-established and advanced diagnostics of LSDs such as the Netherlands (14 per 100,000), Australia (12.9 per 100,000) and Italy (12.1 per 100,000). Relatively higher prevalence of LSDs was reported in the north of Portugal (25 per 100,000). Thirty-four different LSDs were diagnosed in a total of 478 individuals. Gaucher disease was the most frequent LSD with a birth prevalence of 1.13 per 100,000 births. The most frequent LSD groups were lipidoses, mucopolysaccharidoses, and neuronal ceroid lipofuscinoses, with combined prevalences of 5.0, 3.72, and 2.29 per 100,000 live births, respectively. Glycoproteinoses (0.57 per 100,000 live births), glycogenos

### HTML Removal

In [3]:
def remove_html(string):
    """Replace short HTML-style tags with a space and normalise spacing.

    Anything matching ``<`` + 1-4 characters + ``>`` (e.g. ``<i>``, ``</sup>``)
    is replaced by a space; runs of spaces are then collapsed to one and
    leading/trailing whitespace is stripped.

    The original applied two extra edge-trimming substitutions and a second
    double-space substitution; after the first collapse those can never change
    the final result (``strip()`` already removes all edge whitespace), so they
    are dropped here. Output is unchanged.

    Parameters
    ----------
    string : str
        Raw text. The parameter name shadows the ``string`` module inside this
        function only; it is kept so keyword callers keep working.

    Returns
    -------
    str
        The cleaned text.
    """
    without_tags = re.sub('<.{1,4}>', ' ', string)
    single_spaced = re.sub(' +', ' ', without_tags)
    return single_spaced.strip()

## (3) Map label onto each word (done rule-based at the sentence level)

### Tagging Functions

In [4]:
#SpaCy named entities are here: https://spacy.io/models/en 
# The pipeline is loaded once here; tag_NERs calls nlp(sentence) for every sentence it tags.
nlp = spacy.load('en_core_web_lg')

In [5]:
def combine_stats1(tokens,labels):
    """Merge neighbouring STAT fragments of one sentence into single spans.

    Works on one flat sentence (a list of token strings and a parallel list of
    BIO label strings). ``labels`` is modified in place. Only interior
    positions ``1 .. len(labels)-2`` are visited, so sentences shorter than
    three tokens are returned untouched.

    Rules applied at each interior position ``i``:

    1. A STAT token preceded by ``<`` or ``>`` pulls the sign into the span.
    2. A STAT token preceded by ``than`` pulls in ``than`` and the word before
       it (as in "less than"). If there is no word before ``than`` the span
       starts at ``than`` itself; the original indexed ``labels[i-2]`` with
       ``i == 1``, which wrapped around and tagged the last token instead.
    3. ``to`` or ``-`` sitting between two STAT tokens joins them.
    4. If the next token is ``B-STAT`` and the current token is ``to``/``-``,
       or the previous token is all digits, the previous, current and next
       tokens become one span.

    Parameters
    ----------
    tokens : list of str
    labels : list of str

    Returns
    -------
    tuple
        ``(tokens, labels)`` - the same list objects that were passed in.
    """
    for i in range(1, len(labels)-1):
        if 'STAT' in labels[i]:
            #Includes <, > number in the statistic
            if tokens[i-1] in ('<', '>'):
                labels[i-1]='B-STAT'
                labels[i]='I-STAT'
            #Includes greater than, less than, more than, etc.
            if tokens[i-1]=='than':
                if i >= 2:
                    labels[i-2]='B-STAT'
                    labels[i-1]='I-STAT'
                else:
                    # No token before "than": start the span on "than" rather than wrapping to labels[-1]
                    labels[i-1]='B-STAT'
                labels[i]='I-STAT'

        #Combines "This disease affects 1 in 7500 to 1 in 10,000 people" into a single statistic phrase instead of 2
        if 'STAT' in labels[i-1] and 'STAT' in labels[i+1] and 'STAT' not in labels[i]:
            if tokens[i] in ('to', '-'):
                labels[i]='I-STAT'
                labels[i+1]='I-STAT'

        #This gets of the type "prevalence of 2 to 18 per 100,000"
        if labels[i+1]=='B-STAT':
            if tokens[i]=='to' or tokens[i]=='-' or tokens[i-1].isdigit():
                labels[i-1]='B-STAT'
                labels[i]='I-STAT'
                labels[i+1]='I-STAT'
    return tokens,labels

In [6]:
def combine_stats2(tokens,labels): #this is with lists of tokens, not lists of sentences which are lists of tokens
    """Join adjacent STAT pieces of a single flat sentence; applies the same rules as combine_stats1.

    ``tokens`` and ``labels`` are parallel flat lists of strings for ONE
    sentence. Passing a batch (list of sentences) does nothing useful: every
    rule compares a label against a string, which never matches a list.
    ``labels`` is edited in place and both lists are returned.

    Only positions ``1 .. len(labels)-2`` are inspected.

    Fix over the previous version: when ``than`` is the very first token, the
    span now starts on ``than``. Previously ``labels[i-2]`` with ``i == 1``
    addressed ``labels[-1]`` and tagged the final token of the sentence.

    Parameters
    ----------
    tokens : list of str
    labels : list of str

    Returns
    -------
    tuple
        ``(tokens, labels)`` - the objects passed in.
    """
    joiners = ('to', '-')
    last_interior = len(labels) - 1
    for i in range(1, last_interior):
        before, here = tokens[i-1], tokens[i]

        if 'STAT' in labels[i]:
            #Includes <, > number in the statistic
            if before == '<' or before == '>':
                labels[i-1] = 'B-STAT'
                labels[i] = 'I-STAT'
            #Includes greater than, less than, more than, etc.
            if before == 'than':
                # Span opens on the word before "than" when there is one, otherwise on "than"
                span_start = i - 2 if i >= 2 else i - 1
                labels[span_start] = 'B-STAT'
                for k in range(span_start + 1, i + 1):
                    labels[k] = 'I-STAT'

        #Combines "This disease affects 1 in 7500 to 1 in 10,000 people" into a single statistic phrase instead of 2
        bridges_two_stats = 'STAT' in labels[i-1] and 'STAT' in labels[i+1] and 'STAT' not in labels[i]
        if bridges_two_stats and here in joiners:
            labels[i] = 'I-STAT'
            labels[i+1] = 'I-STAT'

        #This gets of the type "prevalence of 2 to 18 per 100,000"
        if labels[i+1] == 'B-STAT' and (here in joiners or before.isdigit()):
            labels[i-1] = 'B-STAT'
            labels[i] = 'I-STAT'
            labels[i+1] = 'I-STAT'

    return tokens,labels

In [7]:
def modify_labels(tokens,labels):
    """Refine sentence-level BIO labels with hand-written rules, editing ``labels`` in place.

    Parameters
    ----------
    tokens : list of list of str
        One inner list of token strings per sentence.
    labels : list of list of str
        Labels parallel to ``tokens`` (values such as 'O', 'B-STAT', 'I-STAT',
        'B-EPI', 'I-EPI', 'B-LOC', 'I-LOC').

    Returns
    -------
    tuple
        ``(tokens, labels)`` - the same outer lists that were passed in.

    Raises
    ------
    IndexError
        If the number of sentences, or the length of any sentence, differs
        between ``tokens`` and ``labels``. Sentences before the mismatching one
        have already been modified when this is raised.

    Per-sentence passes, in order:

    1. 'prevalent' / 'occurs' become B-EPI.
    2. Labels on month names and the tokens right after them are cleared.
    3. Percentages are cleared unless a nearby token starts with "0." or an EPI
       label is within two tokens; colon ratios such as 1:100,000 are tagged STAT.
    4. Assorted fix-ups (unknown/global before EPI, years, stopwords, birth
       units, EPI modifiers, isolated stats in sentences without B-EPI).
    5. ``combine_stats2`` is run on the sentence.

    Side effect: prints the sentence index every 250 sentences.

    Fixes relative to the previous version:

    * The "isolated stat" rule used ``==`` (a comparison) where an assignment
      was meant, so it never cleared anything; it now sets the label to 'O'.
    * ``labels[i][j+3]`` could be one past the end of the sentence; it is now
      only written when it exists.
    * ``combine_stats2`` was handed the whole batch instead of the current
      sentence, which made the call a no-op; it now receives the sentence.
    * First-character checks no longer raise on an empty token string.
    """
    if len(tokens)!=len(labels):
        raise IndexError("Num Sentences {} and Num Sentence Labels {} Mismatch".format(len(tokens),len(labels)))

    # Labels these rules are allowed to overwrite when extending a STAT span
    stat_or_empty = {'O','B-STAT','I-STAT'}
    birth_units = {'births','LBs','LB','birth'}

    for i in range(len(tokens)):
        sent_tokens, sent_labels = tokens[i], labels[i]
        if len(sent_tokens)!=len(sent_labels):
            raise IndexError("Sentence Length {} and Label Length {} Mismatch".format(len(sent_tokens),len(sent_labels)))
        n = len(sent_tokens)

        for j in range(n):
            if sent_tokens[j]=='prevalent' or sent_tokens[j]=='occurs':
                sent_labels[j] = 'B-EPI'

        for j in range(2,n-2):
            if sent_tokens[j].lower() in DATES:
                sent_labels[j]='O'
                sent_labels[j+1]='O'
                sent_labels[j+2]='O'

        for j in range(2,n-2):
            if sent_tokens[j-1].lower() in DATES:
                sent_labels[j-1]='O'
                sent_labels[j]='O'
                sent_labels[j+1]='O'
                sent_labels[j+2]='O'

            #Ensures that there is not already a label
            if sent_labels[j] in stat_or_empty:
                tok = sent_tokens[j]
                nxt = sent_tokens[j+1]

                #only include small percentages of the form '0.*'
                is_percent = ('%' in tok
                              or ('per' in tok.lower() and 'cent' in nxt.lower())
                              or 'percent' in tok.lower())
                small_value_nearby = (re.match(r"^0\.", tok)
                                      or re.match(r"^0\.", sent_tokens[j-1])
                                      or re.match(r"^0\.", sent_tokens[j-2]))
                window = set(sent_labels[j-2:j+3])
                epi_nearby = 'B-EPI' in window or 'I-EPI' in window
                if is_percent and not small_value_nearby and not epi_nearby:
                    for k in range(j-2, j+3):
                        sent_labels[k] = 'O'

                #gets of the form 1:100,000
                #tag word before except if it is 'of' or label before is there
                if tok[:1].isdigit() and ':' in tok:
                    # Skipped shapes: a 5-character token not followed by a number, or more than 3 characters before the colon
                    excluded_by_shape = ((len(tok)==5 and not nxt[:1].isdigit())
                                         or len(tok.split(':')[0])>3)
                    # The slice [j-2:j-1] holds only the token two positions back
                    if not (excluded_by_shape or 'ratio' in sent_tokens[j-2:j-1]):
                        if sent_tokens[j-1].lower() in INCLUSIVE_WORDS:
                            sent_labels[j-1]='B-STAT'
                            sent_labels[j]='I-STAT'
                        elif sent_tokens[j-2].lower() in INCLUSIVE_WORDS:
                            sent_labels[j-2]='B-STAT'
                            sent_labels[j-1]='I-STAT'
                            sent_labels[j]='I-STAT'
                        else:
                            sent_labels[j]='B-STAT'

                        if nxt[:1].isdigit():
                            sent_labels[j+1]='I-STAT'
                            after = sent_tokens[j+2]
                            if (after.lower() not in STOPWORDS and after not in PUNCTUATION) and sent_labels[j+2] in stat_or_empty:
                                sent_labels[j+2]='I-STAT'
                                # j+3 equals n when j is the last index of this loop
                                if j+3 < n:
                                    sent_labels[j+3]='I-STAT'
                        if (nxt.lower() not in STOPWORDS and nxt not in PUNCTUATION) and sent_labels[j+1] in stat_or_empty:
                            sent_labels[j+1]='I-STAT'
                            #Checks to make sure it is not already tagged
                            if sent_labels[j+2] in {'O','B-STAT'}:
                                sent_labels[j+2]='I-STAT'

        for j in range(1,n-1):
            next_is_epi = sent_labels[j+1]=='B-EPI' or sent_labels[j+1]=='I-EPI'
            if sent_tokens[j].lower() == 'unknown' and next_is_epi:
                sent_labels[j]='B-STAT'
            if sent_tokens[j].lower() == 'global' and next_is_epi:
                sent_labels[j]='B-LOC'
            #Gets the ones who have cut off numbers
            if sent_labels[j]=='I-STAT' and sent_labels[j+1]=='O':
                if sent_tokens[j+1][:1].isdigit():
                    sent_labels[j+1]='I-STAT'

            if sent_labels[j]=='B-STAT':
                #Drops four-digit tokens starting with 1 or 2 (year-like), looking at the part before any '/'
                if re.match(r"^[12][0-9]{3}$",sent_tokens[j].split('/')[0]):
                    sent_labels[j]='O'
                #gets rid of incessant 'was' being tagged
                if sent_tokens[j].lower() in STOPWORDS:
                    sent_labels[j]='O'
                    sent_labels[j+1]='B-STAT'
            #Lengthens tags to include descriptors, there could be more to include, but did not pop out during testing
            if (sent_labels[j-1] =='I-STAT' or sent_labels[j-1] =='B-STAT') and sent_labels[j] =='O':
                if sent_tokens[j+1] in birth_units:
                    sent_labels[j]='I-STAT'
                    sent_labels[j+1]='I-STAT'
                elif sent_tokens[j] in birth_units:
                    sent_labels[j]='I-STAT'
            #This should also lengthen epi tags a little bit to include descriptors
            if sent_labels[j]=='B-EPI' and sent_tokens[j-1].lower() in EPI_MODIFIERS:
                sent_labels[j-1]='B-EPI'
                sent_labels[j]='I-EPI'

            ## This was not in V3.1, this is the final change that created the V3.2 dataset. Everything else is the same
            #This should remove isolated stats that do not have epi in the rest of the sentence
            if sent_labels[j-1]=='B-STAT' and sent_labels[j]!='I-STAT' and 'B-EPI' not in sent_labels:
                prev = sent_tokens[j-1]
                if '/' in prev:
                    #leave out the ones like ['1.21/10,000', 'individuals', ')'] ['B-STAT', 'O', 'O']
                    if len(prev)<9 or '+' in prev or '-' in prev or '±' in prev:
                        sent_labels[j-1]='O'
                else:
                    sent_labels[j-1]='O'

        # combine_stats2 expects one flat sentence, not the whole batch
        tokens[i], labels[i] = combine_stats2(sent_tokens, sent_labels)

        if i %250==0:
            print(i)

    return tokens, labels

In [8]:
# This function should take in a sentence and output each word in it with a tentative label

# spaCy entity types treated as "a number" when they sit next to the word "per"
_NUMERIC_ENT_TYPES = {'CARDINAL','ORDINAL','QUANTITY','MONEY'}
# Token texts that mark a confidence interval; a percentage right before one is not tagged
_CI_MARKERS = {'CI','confidence','interval','confidence interval','(CI)','(CI','CI)'}
# Lower-cased token texts tagged as the epidemiologic indicator
_EPI_TERMS = {'incidence','prevalence','prevalences','prevalence ','incidences','occurrence','occurrences'}


def _is_numeric(token):
    """Return True if the spaCy token is all digits or has a number-like entity type."""
    return token.is_digit or token.ent_type_ in _NUMERIC_ENT_TYPES


def _tag_per_rate(doc_tokens, tokens, labels, i):
    """Tag a rate built around the word 'per' at index ``i`` (e.g. "14.1 deaths per 1,000 LBs").

    Nothing is tagged unless 'per' has a token on both sides and at least one
    token within two positions of it is numeric. The span starts two tokens
    before 'per' when that token is neither a stopword nor punctuation,
    otherwise one token before. It extends up to two tokens after 'per' but
    never onto the final token of the sentence.
    """
    n = len(doc_tokens)
    if i < 1 or i > n - 2:
        return

    # Only look at neighbours that exist; the previous code could read index i+2 past the end
    neighbours = [doc_tokens[i-1], doc_tokens[i+1]]
    if i > 1:
        neighbours.append(doc_tokens[i-2])
    if i + 2 < n:
        neighbours.append(doc_tokens[i+2])
    if not any(_is_numeric(tok) for tok in neighbours):
        return

    if i > 1 and tokens[i-2] not in STOPWORDS and tokens[i-2] not in PUNCTUATION:
        start = i - 2
    else:
        start = i - 1
    end = min(i + 2, n - 2)

    labels[start] = 'B-STAT'
    for j in range(start + 1, end + 1):
        labels[j] = 'I-STAT'


def tag_NERs(sentence):
    """Tokenise one sentence with spaCy and assign rule-based BIO labels.

    Labels used: 'B-EPI' (incidence/prevalence/occurrence words), 'B-LOC' /
    'I-LOC' (spaCy GPE/LOC entities and "worldwide"), 'B-STAT' / 'I-STAT'
    (digit/digit tokens, percentages not followed by a confidence-interval
    marker, "one in N" phrases and "X per Y" rates), and 'O' otherwise.
    Sentences longer than two tokens are further processed by
    ``combine_stats1`` and ``modify_labels``.

    Parameters
    ----------
    sentence : str
        A single sentence.

    Returns
    -------
    tuple
        ``(tokens, labels)``. Each is a one-element list wrapping the
        per-token list of strings, i.e. ``tokens[0][k]`` is the k-th token and
        ``labels[0][k]`` its label.

    Raises
    ------
    ValueError
        If the token and label lists end up with different lengths.

    Fixes relative to the previous version:

    * Whitespace-only tokens are removed before tagging. Previously they were
      popped during the loop while the same index was still used to look
      ahead in the spaCy ``doc``, so look-ahead rules read the wrong tokens
      (and could index past the shortened label list) after any pop.
    * The "per" rule no longer reads two tokens ahead when only one exists.
    * Sentences of two tokens or fewer are returned in the same nested shape
      as longer ones; previously they came back as flat lists.
    """
    doc = nlp(sentence)
    # Filter first so one index addresses the spaCy tokens, `tokens` and `labels` alike
    doc_tokens = [token for token in doc if len(str(token.text).strip()) != 0]
    tokens = [token.text for token in doc_tokens]
    labels = ['O' for _ in doc_tokens]
    n = len(doc_tokens)

    for i, token in enumerate(doc_tokens):
        text_lower = token.text.lower()

        ## Epidemiologic identifier
        if text_lower in _EPI_TERMS:
            labels[i] = 'B-EPI'

        ## Location
        if token.ent_type_ in {'GPE','LOC'}:
            labels[i] = str(token.ent_iob_+'-LOC')
        if token.text in {"worldwide"}:
            labels[i] = 'B-LOC'

        ## Epidemiologic Rates
        #This gets stuff of the form 3.5/100
        if token.text[0].isdigit() and '/' in token.text:
            labels[i] = 'B-STAT'

        #label all percents except those preceding "confidence interval (CI)"
        if token.ent_type_ in {'PERCENT'}:
            if i < n-2 and doc_tokens[i+2].text in _CI_MARKERS:
                labels[i] = 'O'
                labels[i+1] = 'O'
                labels[i+2] = 'O'
            elif i < n-1 and doc_tokens[i+1].text in _CI_MARKERS:
                labels[i] = 'O'
                labels[i+1] = 'O'
            else:
                labels[i] = str(token.ent_iob_+'-STAT')

        #These 3 get stuff of the form "one in 35000" or "one in every 23043"
        if text_lower in {'one','1'}:
            if i < n-3 and doc_tokens[i+3].is_digit:
                labels[i] = 'B-STAT'
                for j in range(i+1,i+4):
                    labels[j] = 'I-STAT'
            if i < n-2 and doc_tokens[i+2].is_digit:
                labels[i] = 'B-STAT'
                labels[i+1] = 'I-STAT'
                labels[i+2] = 'I-STAT'
            if i < n-1 and doc_tokens[i+1].is_digit:
                labels[i] = 'B-STAT'
                labels[i+1] = 'I-STAT'

        #These should get the ones of the form: 14.1 deaths per 1,000 LBs
        if text_lower == 'per':
            _tag_per_rate(doc_tokens, tokens, labels, i)

    if len(tokens) != len(labels):
        raise ValueError('Token/Label Length Mismatch')

    if len(tokens)>2:
        tokens, labels = combine_stats1(tokens,labels)
        tokens, labels = modify_labels([tokens],[labels])
    else:
        # Too short for the span rules; wrap so the return shape matches the branch above
        tokens, labels = [tokens], [labels]

    return tokens, labels

### tag_NERs Function Testing Cells

In [9]:
# Clean the abstract, split it into sentences with NLTK, and print every token next to its label.
sentences = tokenize.sent_tokenize(remove_html(abstract))

for sentence in sentences:
    # t and l are indexed below as [sentence][token]
    t,l = tag_NERs(sentence)
    for i in range(len(t)):
        for j in range(len(t[i])):
            print(t[i][j], l[i][j])
        # Blank line between sentences
        print('')

0
The O
aim O
of O
this O
retrospective O
study O
was O
to O
determine O
the O
prevalence B-EPI
of O
lysosomal O
storage O
disorders O
( O
LSDs O
) O
in O
the B-LOC
Czech I-LOC
Republic I-LOC
. O

0
The O
data O
on O
cases O
diagnosed O
between O
1975 O
and O
2008 O
were O
collected O
and O
analyzed O
. O

0
The O
overall B-EPI
prevalence I-EPI
of O
LSDs O
in O
the O
Czech O
population O
( O
12.25 B-STAT
per I-STAT
100,000 I-STAT
) I-STAT
is O
comparable O
to O
that O
reported O
for O
the O
countries O
with O
well O
- O
established O
and O
advanced O
diagnostics O
of O
LSDs O
such O
as O
the O
Netherlands B-LOC
( O
14 B-STAT
per I-STAT
100,000 I-STAT
) I-STAT
, O
Australia B-LOC
( O
12.9 B-STAT
per I-STAT
100,000 I-STAT
) I-STAT
and O
Italy B-LOC
( O
12.1 B-STAT
per I-STAT
100,000 I-STAT
) I-STAT
. O

0
Relatively O
higher O
prevalence B-EPI
of O
LSDs O
was O
reported O
in O
the O
north O
of O
Portugal B-LOC
( O
25 B-STAT
per I-STAT
100,000 I-STAT
) I-STAT
. O

0
Thirty O
- O
four O
di