## Goals
Write a program, easy to modify at any time, that rewrites the labels of the training, validation, and test datasets
 - Remove Percentages (unless there is EPI in the rest of the sentence?)
 - Tag ones of the form '1:100,000'

In [1]:
import csv
import nltk
#nltk.download('punkt')
from nltk import tokenize
import re

In [2]:
#Important sets of words
from nltk.corpus import stopwords
#nltk.download('stopwords')
#English stop words from NLTK's stopwords corpus, held in a set for fast membership tests.
#If the corpus is missing locally, run the commented-out nltk.download call above once.
STOPWORDS = set(stopwords.words('english'))
import string
# Each character of string.punctuation as its own one-character string.
PUNCTUATION = set(string.punctuation)

# Qualifiers that may precede a number ("less than", "greater than", ...).
INCLUSIVE_WORDS = {
    'between', 'around', 'approximately', 'about',
    '<', '>',
    'roughly', 'relatively', 'over', 'under', 'than',
}

# Adjectives that may directly precede an epidemiology term.
EPI_MODIFIERS = {
    'annual', 'overall', 'estimated', 'weighted',
    'nationwide', 'pooled', 'average', 'cumulative',
}

# Lower-cased month names, compared against lower-cased tokens.
DATES = {
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
}

In [3]:
# Load the training split. The file holds one "token<TAB>label" row per token,
# and an empty row marks the end of a sentence. The result is two parallel
# lists of sentences: epi_train_tokens[k][m] is labelled epi_train_labels[k][m].
epi_train_tokens, epi_train_labels = [], []
with open('epi_train_setV2.tsv', 'r', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    sentence_tokens, sentences_tags = [], []
    for row in reader:
        # Rows with an odd number of fields are skipped, as before.
        if len(row) % 2 != 0:
            continue
        if row:
            # Token and tag are always appended together, so the two
            # per-sentence lists can never differ in length.
            sentence_tokens.append(row[0])
            sentences_tags.append(row[1])
        else:
            # Empty row: close the current sentence and start a fresh one.
            epi_train_tokens.append(sentence_tokens)
            epi_train_labels.append(sentences_tags)
            sentence_tokens, sentences_tags = [], []
    # Keep a final sentence that is not followed by a blank row; previously
    # it was silently dropped.
    if sentence_tokens:
        epi_train_tokens.append(sentence_tokens)
        epi_train_labels.append(sentences_tags)
# The with-block has already closed the file; no explicit close() is needed.
print(len(epi_train_tokens),len(epi_train_labels))

4580 4580


In [4]:
# Load the validation split. The file holds one "token<TAB>label" row per
# token, and an empty row marks the end of a sentence. The result is two
# parallel lists of sentences: epi_val_tokens[k][m] is labelled
# epi_val_labels[k][m].
epi_val_tokens, epi_val_labels = [], []
with open('epi_val_setV2.tsv', 'r', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    sentence_tokens, sentences_tags = [], []
    for row in reader:
        # Rows with an odd number of fields are skipped, as before.
        if len(row) % 2 != 0:
            continue
        if row:
            sentence_tokens.append(row[0])
            sentences_tags.append(row[1])
        else:
            # Empty row: close the current sentence and start a fresh one.
            epi_val_tokens.append(sentence_tokens)
            epi_val_labels.append(sentences_tags)
            sentence_tokens, sentences_tags = [], []
    # Keep a final sentence that is not followed by a blank row; previously
    # it was silently dropped.
    if sentence_tokens:
        epi_val_tokens.append(sentence_tokens)
        epi_val_labels.append(sentences_tags)
# The with-block has already closed the file; no explicit close() is needed.
print(len(epi_val_tokens),len(epi_val_labels))

1227 1227


In [5]:
# Load the test split. Original note: "Need to modify t to calculate" --
# presumably the test labels must follow the same scheme for evaluation;
# TODO confirm.
# The file holds one "token<TAB>label" row per token, and an empty row marks
# the end of a sentence. The result is two parallel lists of sentences.
epi_test_tokens, epi_test_labels = [], []
with open('epi_test_setV2.tsv', 'r', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    sentence_tokens, sentences_tags = [], []
    for row in reader:
        # Rows with an odd number of fields are skipped, as before.
        if len(row) % 2 != 0:
            continue
        if row:
            sentence_tokens.append(row[0])
            sentences_tags.append(row[1])
        else:
            # Empty row: close the current sentence and start a fresh one.
            epi_test_tokens.append(sentence_tokens)
            epi_test_labels.append(sentences_tags)
            sentence_tokens, sentences_tags = [], []
    # Keep a final sentence that is not followed by a blank row; previously
    # it was silently dropped.
    if sentence_tokens:
        epi_test_tokens.append(sentence_tokens)
        epi_test_labels.append(sentences_tags)
# The with-block has already closed the file; no explicit close() is needed.
print(len(epi_test_tokens),len(epi_test_labels))

534 534


In [6]:
def combine_stats(tokens,labels):
    """Merge neighbouring STAT fragments of ONE sentence into single spans.

    Both arguments are flat, equal-length lists for a single sentence (a list
    of token strings and a list of label strings), not lists of sentences.
    ``labels`` is modified in place.

    Rules, applied left to right at every interior position ``i``:
      1. A STAT token preceded by '<' or '>' pulls that symbol into the span.
      2. A STAT token preceded by 'than' pulls in 'than' and the word before
         it ("less than 5", "greater than 5").
      3. 'to' or '-' between two STAT tokens joins them into one span
         ("1 in 7500 to 1 in 10,000").
      4. When the next token is B-STAT and the current token is 'to'/'-', or
         the previous token is all digits, the span is moved to start one
         token earlier ("2 to 18 per 100,000").

    Returns:
        The same ``(tokens, labels)`` objects that were passed in.
    """
    for i in range(1,len(labels)-1):
        if 'STAT' in labels[i]:
            #Includes <, > number in the statistic
            if tokens[i-1]=='<' or tokens[i-1]=='>':
                labels[i-1]='B-STAT'
                labels[i]='I-STAT'
            #Includes greater than, less than, more than, etc.
            if tokens[i-1]=='than':
                if i>=2:
                    labels[i-2]='B-STAT'
                    labels[i-1]='I-STAT'
                else:
                    # 'than' is the first token, so there is no word before it.
                    # Start the span at 'than' rather than writing labels[-1],
                    # which would relabel the LAST token of the sentence.
                    labels[i-1]='B-STAT'
                labels[i]='I-STAT'

        #Combines "This disease affects 1 in 7500 to 1 in 10,000 people" into a single statistic phrase instead of 2
        if 'STAT' in labels[i-1] and 'STAT' in labels[i+1] and 'STAT' not in labels[i]:
            if tokens[i]=='to' or tokens[i]=='-':
                labels[i]='I-STAT'
                labels[i+1]='I-STAT'

        #This gets of the type "prevalence of 2 to 18 per 100,000"
        if labels[i+1]=='B-STAT':
            if tokens[i]=='to' or tokens[i]=='-' or tokens[i-1].isdigit():
                labels[i-1]='B-STAT'
                labels[i]='I-STAT'
                labels[i+1]='I-STAT'
    return tokens,labels

In [7]:
def modify_labels(tokens,labels):
    """Apply the rule-based label corrections to every sentence, in place.

    Args:
        tokens: list of sentences, each a list of token strings.
        labels: list of sentences, each a list of label strings, parallel
            to ``tokens``.

    Returns:
        The same ``(tokens, labels)`` objects; ``labels`` has been edited in
        place, so the caller's original lists are changed as well.

    Raises:
        IndexError: if the number of sentences differs, or a sentence and its
            label list differ in length.

    Per sentence, in order:
      1. 'prevalent' / 'occurs' are tagged B-EPI.
      2. Labels on and just after month names are cleared.
      3. Percentages are cleared unless the value starts with "0." or an EPI
         label is within two tokens.
      4. Tokens of the form "1:100,000" are tagged as STAT, with a leading
         qualifier and trailing descriptors pulled into the span.
      5. Assorted clean-ups (unknown/global before EPI, cut-off numbers,
         years, stop words tagged as stats, 'births' descriptors, EPI
         modifiers, isolated stats in sentences without B-EPI).
      6. combine_stats merges neighbouring STAT fragments of the sentence.

    The index of every 250th sentence is printed as a progress indicator.
    """
    if len(tokens)!=len(labels):
        raise IndexError("Num Sentences {} and Num Sentence Labels {} Mismatch".format(len(tokens),len(labels)))

    # Labels the rules below are allowed to overwrite.
    relabelable = {'O','B-STAT','I-STAT'}
    birth_words = {'births','LBs','LB','birth'}

    for i in range(len(tokens)):
        # Aliases of the caller's lists: every assignment below edits them in place.
        sent_tokens, sent_labels = tokens[i], labels[i]
        n = len(sent_tokens)
        if n!=len(sent_labels):
            raise IndexError("Sentence Length {} and Label Length {} Mismatch".format(n,len(sent_labels)))

        for j in range(n):
            if sent_tokens[j]=='prevalent' or sent_tokens[j]=='occurs':
                sent_labels[j] = 'B-EPI'

        # Clear labels on a month name and the two tokens after it.
        # NOTE(review): DATES contains 'may' and the match is case-insensitive,
        # so the modal verb "may" is treated as a month too -- confirm intended.
        for j in range(2,n-2):
            if sent_tokens[j].lower() in DATES:
                sent_labels[j]='O'
                sent_labels[j+1]='O'
                sent_labels[j+2]='O'

        for j in range(2,n-2):
            if sent_tokens[j-1].lower() in DATES:
                sent_labels[j-1]='O'
                sent_labels[j]='O'
                sent_labels[j+1]='O'
                sent_labels[j+2]='O'

            #Ensures that there is not already a label
            if sent_labels[j] in relabelable:
                tok = sent_tokens[j]
                next_tok = sent_tokens[j+1]

                #relabel all of the percentages
                #only include small percentages of the form '0.*'
                is_percent = ('%' in tok
                              or ('per' in tok.lower() and 'cent' in next_tok.lower())
                              or 'percent' in tok.lower())
                is_small = (re.match(r"^0\.", tok)
                            or re.match(r"^0\.", sent_tokens[j-1])
                            or re.match(r"^0\.", sent_tokens[j-2]))
                window = sent_labels[j-2:j+3]
                near_epi = 'B-EPI' in window or 'I-EPI' in window
                if is_percent and not is_small and not near_epi:
                    for k in range(j-2,j+3):
                        sent_labels[k] = 'O'

                #gets of the form 1:100,000
                #tag word before except if it is 'of' or label before is there
                # [:1] rather than [0] so an empty token is skipped instead of raising.
                if tok[:1].isdigit() and ':' in tok:
                    # Excluded shapes: a 5-character token not followed by a
                    # number, or more than 3 characters before the colon
                    # (presumably clock times and similar -- TODO confirm).
                    excluded_shape = ((len(tok)==5 and not next_tok[:1].isdigit())
                                      or len(tok.split(':')[0])>3)
                    # Only the token two positions back is checked ("ratio of 1:2").
                    after_ratio = 'ratio' in sent_tokens[j-2:j-1]
                    if not (excluded_shape or after_ratio):
                        if sent_tokens[j-1].lower() in INCLUSIVE_WORDS:
                            sent_labels[j-1]='B-STAT'
                            sent_labels[j]='I-STAT'
                        elif sent_tokens[j-2].lower() in INCLUSIVE_WORDS:
                            sent_labels[j-2]='B-STAT'
                            sent_labels[j-1]='I-STAT'
                            sent_labels[j]='I-STAT'
                        else:
                            sent_labels[j]='B-STAT'

                        if next_tok[:1].isdigit():
                            sent_labels[j+1]='I-STAT'
                            if (sent_tokens[j+2].lower() not in STOPWORDS and sent_tokens[j+2] not in PUNCTUATION) and sent_labels[j+2] in relabelable:
                                sent_labels[j+2]='I-STAT'
                                # j can be n-3, so j+3 may be one past the end;
                                # extend only when that token exists.
                                if j+3<n:
                                    sent_labels[j+3]='I-STAT'
                        if (next_tok.lower() not in STOPWORDS and next_tok not in PUNCTUATION) and sent_labels[j+1] in relabelable:
                            sent_labels[j+1]='I-STAT'
                            #Checks to make sure it is not already tagged
                            if sent_labels[j+2] in {'O','B-STAT'}:
                                sent_labels[j+2]='I-STAT'

        for j in range(1,n-1):
            if sent_tokens[j].lower() == 'unknown' and (sent_labels[j+1]=='B-EPI' or sent_labels[j+1]=='I-EPI'):
                sent_labels[j]='B-STAT'
            if sent_tokens[j].lower() == 'global' and (sent_labels[j+1]=='B-EPI' or sent_labels[j+1]=='I-EPI'):
                sent_labels[j]='B-LOC'
            #Gets the ones who have cut off numbers
            if sent_labels[j]=='I-STAT' and sent_labels[j+1]=='O':
                if sent_tokens[j+1][:1].isdigit():
                    sent_labels[j+1]='I-STAT'

            if sent_labels[j]=='B-STAT':
                # Drop four-digit tokens starting with 1 or 2 (years); for
                # "a/b" only the part before the slash is tested.
                if re.match(r"^[12][0-9]{3}$",sent_tokens[j].split('/')[0]):
                    sent_labels[j]='O'
                #gets rid of incessant 'was' being tagged
                if sent_tokens[j].lower() in STOPWORDS:
                    sent_labels[j]='O'
                    sent_labels[j+1]='B-STAT'
            #Lengthens tags to include descriptors, there could be more to include, but did not pop out during testing
            if (sent_labels[j-1] =='I-STAT' or sent_labels[j-1] =='B-STAT') and sent_labels[j] =='O':
                if sent_tokens[j+1] in birth_words:
                    sent_labels[j]='I-STAT'
                    sent_labels[j+1]='I-STAT'
                elif sent_tokens[j] in birth_words:
                    sent_labels[j]='I-STAT'
            #This should also lengthen epi tags a little bit to include descriptors
            if sent_labels[j]=='B-EPI' and sent_tokens[j-1].lower() in EPI_MODIFIERS:
                sent_labels[j-1]='B-EPI'
                sent_labels[j]='I-EPI'

            ## This was not in V3.1, this is the final change that created the V3.2 dataset. Everything else is the same
            #This should remove isolated stats that do not have epi in the rest of the sentence
            # The original used '==' (a comparison whose result was discarded)
            # where an assignment was intended, so this rule never fired.
            if sent_labels[j-1]=='B-STAT' and sent_labels[j]!='I-STAT' and 'B-EPI' not in sent_labels:
                prev_tok = sent_tokens[j-1]
                if '/' in prev_tok:
                    if len(prev_tok)<9 or '+' in prev_tok or '-' in prev_tok or '±' in prev_tok:
                        sent_labels[j-1]='O'
                    #leave out the ones like ['1.21/10,000', 'individuals', ')'] ['B-STAT', 'O', 'O']
                else:
                    sent_labels[j-1]='O'

        # combine_stats works on one flat sentence. Passing the whole dataset,
        # as before, made every comparison inside it false, so it did nothing.
        combine_stats(sent_tokens,sent_labels)

        if i %250==0:
            print(i)

    return tokens, labels

In [8]:
# Relabel the training split. modify_labels edits the nested lists in place and
# returns the same objects, so mod_train_* alias epi_train_* rather than copy them.
mod_train_tokens, mod_train_labels = modify_labels(epi_train_tokens,epi_train_labels)

0
250
500
750
1000
1250
1500
1750
2000
2250
2500
2750
3000
3250
3500
3750
4000
4250
4500


In [9]:
# Relabel the validation split. modify_labels edits the nested lists in place and
# returns the same objects, so mod_val_* alias epi_val_* rather than copy them.
mod_val_tokens, mod_val_labels = modify_labels(epi_val_tokens,epi_val_labels)

0
250
500
750
1000


In [10]:
# Relabel the test split. modify_labels edits the nested lists in place and
# returns the same objects, so mod_test_* alias epi_test_* rather than copy them.
mod_test_tokens, mod_test_labels = modify_labels(epi_test_tokens,epi_test_labels)

0
250
500


In [11]:
# Write the relabelled training split in the input layout: one
# "token<TAB>label" line per token and a blank line after every sentence.
# The encoding is set explicitly to match the UTF-8 input; the platform default
# may be unable to encode non-ASCII tokens.
with open('epi_train_setV3.tsv', "w", encoding="utf-8") as f:
    for sent_toks, sent_labs in zip(mod_train_tokens, mod_train_labels):
        for token, label in zip(sent_toks, sent_labs):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')
# The with-block has already closed the file; no explicit close() is needed.

In [12]:
# Write the relabelled validation split in the input layout: one
# "token<TAB>label" line per token and a blank line after every sentence.
# The encoding is set explicitly to match the UTF-8 input; the platform default
# may be unable to encode non-ASCII tokens.
with open('epi_val_setV3.tsv', "w", encoding="utf-8") as f:
    for sent_toks, sent_labs in zip(mod_val_tokens, mod_val_labels):
        for token, label in zip(sent_toks, sent_labs):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')
# The with-block has already closed the file; no explicit close() is needed.

In [13]:
# Write the relabelled test split in the input layout: one
# "token<TAB>label" line per token and a blank line after every sentence.
# The encoding is set explicitly to match the UTF-8 input; the platform default
# may be unable to encode non-ASCII tokens.
with open('epi_test_setV3.tsv', "w", encoding="utf-8") as f:
    for sent_toks, sent_labs in zip(mod_test_tokens, mod_test_labels):
        for token, label in zip(sent_toks, sent_labs):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')
# The with-block has already closed the file; no explicit close() is needed.