# COVID-19 Lab Results
Written by: Branson Chen <br>
Last modified: 20200710

## Table of Contents

<a href='#Overview'>Overview</a><br>
<a href='#Input-variables'>Input variables</a><br>
<a href='#Importing-data'>Importing data</a><br>
<a href='#Text-analysis'>Text analysis</a><br>

- <a href='#Algorithm-description'>Algorithm description</a><br>
- <a href='#Initial-processing'>Initial processing</a><br>
- <a href='#Assign-results'>Assign results</a><br>

<a href='#Final-output'>Final output</a><br>
<a href='#Testing-and-validation'>Testing and validation</a><br>

## Overview

- This script first imports a SAS dataset based on the input variables provided, and then fields are decoded/renamed.
- Next, the text is cleaned (clean function) and then tokenized (tokenize function).
- Relevant labels are then assigned to the tokens (assign_labels function).
- The labelled tokens are then interpreted using an in-house algorithm (interpret function).
- All of the information from the previous step is then collapsed to give one result per virus per test (process_result function), and unidentified virus/test types are filled in based on observation codes and testrequest codes.
- Lastly, the results are converted to a single character per virus type (char_output function) and then output in a csv.

## Input variables

In [None]:
#input path and filename (should be .sas7bdat file)
#the two strings are concatenated as-is when the file is read, so input_path
#must end with a path separator
input_path = ''
input_filename = ''

#name of patientid variable in input dataset, will be renamed as 'patientid'
input_patientid_var = 'ikn'

#output additional columns
#0 = just virus flags, 1 = with key columns, 2 = with ALL columns
output_flag = 1

#output filename (should be .csv file)
output_filename = 'output.csv'

## Importing data

In [None]:
import pandas as pd
import numpy as np

In [None]:
%%time
#import sas file (COMPRESS=BINARY MAY NOT WORK WITH READ_SAS; COMPRESS=YES|CHAR DOES NOT WORK WITH READ_SAS)
df_raw = pd.read_sas(input_path + input_filename)

#decode strings: read_sas is called without an encoding, so character columns
#arrive as object-dtype columns of bytes and are decoded to str here.
#The builtin `object` is used for the dtype comparison because the `np.object`
#alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
df_raw.loc[:, df_raw.dtypes == object] = df_raw.loc[:, df_raw.dtypes == object].apply(lambda x: x.str.decode('UTF-8'))

#missing values become empty strings so later string operations do not hit NaN
df_raw.fillna('', inplace=True)
print('# of records:', len(df_raw))

In [None]:
#work on a deep copy so df_raw keeps the data exactly as imported
df = df_raw.copy(deep = True)

#rename variables
#(the patient id column named by input_patientid_var becomes 'patientid')
df = df.rename(columns={input_patientid_var:'patientid','fillerordernumber':'fillerordernumberid',
                       'observationvalue':'value','observationsubid':'subid'})
#keep key cols
#(raises KeyError if any of these columns is missing after the rename)
key_cols = ['patientid', 'ordersid', 'interpretationvalue', 'fillerordernumberid', 
            'reportinglaborgname', 'performinglaborgname', 'observationdatetime', 
            'testrequestcode', 'observationcode', 'observationreleasets', 
            'observationresultstatus', 'subid', 'value']
df = df[key_cols]

#set exclude_flag based on observationresultstatus = W
#df_W holds the (ordersid, observationcode, value) triples of the 'W' rows;
#the inner merge (on the three shared columns) finds every row of df with the
#same triple, and reset_index/set_index carry the original row labels through
#the merge so those rows can be flagged in df
#NOTE(review): 'W' presumably marks a withdrawn/wrong result - confirm with data dictionary
df_W = df.loc[df['observationresultstatus'] == 'W', ['ordersid', 'observationcode', 'value']]
df_excl = df[['ordersid', 'observationcode', 'value']].reset_index().merge(df_W, how='inner').set_index('index')
df['exclude_flag'] = 'N'
df.loc[df.index.isin(df_excl.index),['exclude_flag']] = 'Y'
print(df['exclude_flag'].value_counts())

#set exclude_flag based on DO NOT TRANSMIT code
#(currently disabled; kept for reference)
# DNT_text = '<p1:MicroOrganism xmlns:p1="http://www.ssha.ca"><p1:Code>99999999999</p1:Code><p1:Text>Do Not Transmit</p1:Text><p1:CodingSystem>HL79905</p1:CodingSystem></p1:MicroOrganism>'
# df_DNT = df.loc[df['value'] == DNT_text, ['ordersid', 'observationcode','observationreleasets']]
# df_excl2 = df[['ordersid', 'observationcode', 'observationreleasets']].reset_index().merge(df_DNT, how='inner').set_index('index')
# df.loc[df.index.isin(df_excl2.index),['exclude_flag']] = 'Y'
# print(df['exclude_flag'].value_counts())

In [None]:
#columns that together identify one reported test result
group_cols = ['ordersid', 'fillerordernumberid', 'reportinglaborgname',
              'testrequestcode', 'observationcode', 'observationreleasets', 'observationresultstatus']

#per group, gather the original row labels and the subids as tuples
df_gp_subid = (
    df.reset_index()
      .groupby(group_cols)
      .agg({'index': tuple, 'subid': tuple})
      .reset_index()
      .rename(columns={'index': 'original_indexes'})
)

#a group is concatenated only when every subid is numeric, there are more than
#two of them, and one of them is '1'
df_to_concat = df_gp_subid[
    df_gp_subid['subid'].apply(
        lambda subids: all([s.isdigit() for s in subids]) and len(subids) > 2 and '1' in subids
    )
]
concat_indexes = [row_label for labels in df_to_concat['original_indexes'] for row_label in labels]

#join the values of those rows in numeric subid order, one row per group
df_gp_concat = df.loc[df.index.isin(concat_indexes)].reset_index()
df_gp_concat['subid'] = df_gp_concat['subid'].apply(int)
df_gp_concat = (
    df_gp_concat.sort_values(by=group_cols + ['subid'])
                .groupby(group_cols)
                .agg({'index': tuple,
                      'value': lambda vals: ' '.join(str(v) for v in vals)})
                .reset_index()
)

#every other row is kept as is, with its own label wrapped in a 1-tuple
df_gp = df.loc[~df.index.isin(concat_indexes), group_cols + ['value']].reset_index()
df_gp['index'] = df_gp['index'].apply(lambda row_label: (row_label,))
df_gp = pd.concat([df_gp_concat, df_gp], sort=False).rename(columns={'index': 'original_indexes'})

print('# of TEST RESULTS:', len(df_gp))

#cleanup of intermediate frames
del df_W, df_excl, df_gp_subid, df_to_concat, df_gp_concat

## Text analysis

In [None]:
import nltk
import re

In [None]:
#lookup tables for clean(): punctuation to blank out, terms to pad with spaces,
#terms whose trailing numbers are dropped, typo/word-order fixes, date/id regexes
puncs = [
    ';', ':', ',', '.', '-', '_', '/', '(', ')', '[', ']', '{', '}', '<', '>', '*', '#', '?', '.', '+',
    'br\\', '\\br', '\\e\\', '\\f\\', '\\t\\', '\\', "'", '"', '=',
]
terms_to_space = ['detected', 'by', 'positive', 'parainfluenza', 'accession']
nums_following = [
    'date', 'telephone', 'tel', 'phone', 'received', 'collected',
    'result', 'on', 'at', '@', 'approved', 'final', 'time', 'number',
]
strings_to_replace = {
    'non detected': 'not detected',
    'npot detected': 'not detected',
    'nor detected': 'not detected',
    'mot detected': 'not detected',
    'n0t detected': 'not detected',
    'covid 19 virus not interpretation detected': 'covid 19 virus interpretation not detected',
    'presumptive interpretation': 'interpretation presumptive',
    'preliminary interpretation': 'interpretation preliminary',
    'covid 19 not detected and covid 19 detected': 'covid 19 detected and covid 19 not detected',
    'virusnot': 'virus not',
    'prevuous': 'previous',
}
date_id_patterns = [
    r'\d{2,4} \d{2} \d{2,4} ',
    r'\d{4} \d{2} ',
    r'\d{4}h ',
    r' \d{0,2}[a-z]{0,2}\d{5,}[a-z]{0,1}',
    r' [a-z]{0,2}\d{1,3}[a-z]{1,3}\d{4,}[a-z]{0,1}',
]


def clean(value):
    """Normalise one raw result string for tokenizing.

    The text is lower-cased, xml microorganism elements are reduced to their
    <p1:text> part, listed terms are padded with spaces, punctuation is
    replaced by spaces, whitespace is collapsed, numbers/dates/ids are
    removed, a trailing "no" is dropped and known typos are corrected.

    value: str to clean.
    Returns the cleaned str (possibly empty).
    """
    text = value.lower()

    #xml field: keep only the <p1:text> element, repeat until nothing is replaced
    xml_pattern = r'(<p1:microorganism xmlns)(.+)(<p1:text>.+</p1:text>)(.+)(</p1:microorganism>)'
    n_subs = 1
    while n_subs:
        text, n_subs = re.subn(xml_pattern, r'\g<3>', text)

    #pad terms that are sometimes glued to their neighbours
    for glued in terms_to_space:
        text = text.replace(glued, ' ' + glued + ' ')

    #punctuation becomes a space
    for mark in puncs:
        text = text.replace(mark, ' ')

    #collapse runs of spaces, then trim both ends
    text = re.sub(r' {2,}', ' ', text).strip()

    #drop 1-4 digit numbers directly following certain terms
    for lead in nums_following:
        n_subs = 1
        while n_subs:
            text, n_subs = re.subn(lead + r' \d{1,4}', lead, text)

    #drop remaining date- and id-like fragments
    for fragment in date_id_patterns:
        n_subs = 1
        while n_subs:
            text, n_subs = re.subn(fragment, '', text)

    #trim digits and spaces from the end
    end = len(text)
    while end > 0 and (text[end - 1].isdigit() or text[end - 1] == ' '):
        end -= 1
    text = text[:end]

    #trim a dangling "no" from the end
    while text == 'no' or text.endswith(' no'):
        text = text[:-3]

    #fix known typos and word orders
    for wrong, right in strings_to_replace.items():
        text = text.replace(wrong, right)

    return text

In [None]:
#split a cleaned value into tokens with nltk
def tokenize(value):
    """Return the token list that nltk.word_tokenize produces for value."""
    return nltk.word_tokenize(value)

In [None]:
#assign labels for useful tokens based on some dictionaries and exclusions
easy_virus_dict = {'v_adenovirus':['aden'], 'v_bocavirus':['boca', 'bocca'], 'v_coronavirus':['coro', 'cora'],
                   'v_entero_rhino':['enterol', 'enterov', 'entervir', 'rhino', 'rhini'], 'v_hmv':['metap']}
hard_virus_dict = {'v_rsv':['rsv'], 'v_flu':['nflu', 'flue'], 'v_para':['parai', 'pata', 'parta'],
                   'v_covid':['cov', 'sars', 'orf1', 'orfl', 'or1lab']}
indirect_matches_dict = {'r_pos': ['posi'], 
                         'r_neg': ['neg', 'naeg', 'neag'],  
                         'r_ind': ['indeter', 'eterminate', 'inconclu',
                                   'equivocal', 'uninterpret', 'unresolved'],
                         'r_can': ['cancel', 'forward', 'incorrect', 'duplicate', 'mislabel', 'recollect'],
                         'r_rej': ['reject', 'inval', 'leak', 'unable', 'insuffic', 
                                   'spill', 'inapprop', 'nsq', 'poor'],
                         'presumptive': ['presump', 'prelim', 'possi'], 
                         'retest': ['retest']}
direct_matches_dict = {'r_pos': ['detected', 'pos', 'deteced', 'postive', 'organism'],
                       'r_neg': ['no', 'not'],
                       'r_ind': ['ind'],
                       'r_pen': ['pending', 'progress', 'follow', 'ordered', 'reordered'],
                       'r_can': ['sent', 'send', 'redirected'],
                       'presumptive': ['single', 'possible', 'probable'],
                       'xml': ['p1'], 
                       'reset': ['deleted','anesthesiologist'],
                       'stop': ['specific', 'required', 'error', 'copy', 'see', 
                                'note', 'stability', 'changed', 'recollect', 'moh', 'if'],
                       'final': ['interpretation', 'interpetation', 'interp', 'pretation',
                                 'final', 'overall', 'updated', 'corrected', 'proved'],
                       'skip': ['reason', 'identify'],
                       'connecting': ['targets', 'tagets', 'target', 'screen', 'presence', 'as', 'real',
                                      'is', 'of', 'in', '1', '2', '3', '4', 'a', 'b', 'c', 
                                      '229e', 'nl63', 'hku1', 'oc43', '19', '2019', 'low',
                                      'biosafety', 'hazard', 'has', 'been', 'for', 'changed', 'identified', 
                                      'result', 'other', 'testing', 'using', 'to', 'from', 'tested',
                                      'phl', 'phol', 'phlo', 'new', 'request', 'lab', 'will']}
test_type_dict = {'t_oth': ['eia', 'rapid', 'immunoassay', 'ict', 'immunochromatographic', 'antigen'], 
                  't_pcr': ['multiplex', 'naat', 'nat', 'pcr', 'rrt', 'gene', 'rna', 'gen', 
                            'reverse', 'polymerase', 'chain', 'simplexa']}

def assign_labels(tokenized):
    """Label each token of a tokenized result string.

    Two passes are made over the tokens. The first pass assigns virus labels
    ('v_*'), looking ahead a few tokens for subtypes and marking the tokens it
    consumes as 'connecting'. The second pass handles every token that is
    still unlabelled: special phrases first, then the indirect (substring),
    direct (exact) and test-type (exact) dictionaries, in that order, so a
    later dictionary overrides an earlier one.

    tokenized: list of str tokens.
    Returns a list with exactly len(tokenized) entries; each entry is a label
    string ('v_*', 'r_*', 't_*', 'presumptive', 'retest', 'xml', 'reset',
    'stop', 'final', 'skip', 'connecting', 'end') or None for an unlabelled
    token.

    Look-behind checks never wrap around to the end of the list: a token with
    no predecessor is compared against '' (which matches no rule), and slice
    assignments are clamped to the list bounds so the result always stays
    aligned with the tokens.
    """
    tokenized_length = len(tokenized)
    useful = [None]*tokenized_length #store same list length of tokens and update each accordingly
    
    for counter, token in enumerate(tokenized):
        
        #skip if already assigned
        if useful[counter]:
            continue
        
        #previous token; '' when there is none, so that tokenized[-1] is never
        #mistaken for the predecessor of the first token
        prev_token = tokenized[counter-1] if counter > 0 else ''
        
        ###easy viruses dictionary (non-exact matching)
        for virus, patterns in easy_virus_dict.items():
            if any([pattern in token for pattern in patterns]):
                useful[counter] = virus
                break

        #extra rhino/entero rule (exact matching)
        if token in ('rhino', 'entero'):
            useful[counter] = 'v_entero_rhino'

        ###hard viruses dictionary (non-exact matching)
        
        #COVID19 (not if 'mers' is one of the up to three preceding tokens;
        #the slice start is clamped so it cannot turn negative near the start)
        elif any([pattern in token for pattern in hard_virus_dict['v_covid']])\
        and not any([pattern in token for pattern in ('ecov', 'cove')])\
        and 'mers' not in tokenized[max(counter-3, 0):counter]:
            useful[counter] = 'v_covid'
            
        #e/envelope/n/nucleocapsid gene
        elif token in ('e', 'envelope', 'n', 'nucleocapsid') and (tokenized_length > counter+1)\
        and tokenized[counter+1] == 'gene':
            useful[counter:counter+2] = ['v_covid', 'connecting']
        
        #rdrp gene
        elif token == 'rdrp' and (tokenized_length > counter+1)\
        and tokenized[counter+1] == 'gene' and 'v_coronavirus' not in useful[:counter]:
            useful[counter:counter+2] = ['v_covid', 'connecting']
        
        #extra rule for seasonal coronavirus, if preceded by novel or followed by 19/disease/cov/sars/2
        elif any([pattern in token for pattern in easy_virus_dict['v_coronavirus']]):
            #prev_token is '' for the first token, so this is only true when counter > 0
            if 'nove' in prev_token or prev_token == 'nivel':
                useful[counter-1:counter+1] = ['v_covid', 'connecting']
                
            look_forward = 3 #how many terms to look forward for
            max_forward = min(counter+look_forward, tokenized_length-1) #limit if record is too short
            covid_extra = [(tokenized[covid_pos], covid_pos) for covid_pos in range(counter+1, max_forward+1)\
                       if any([pattern in tokenized[covid_pos] for pattern in ('19', 'disea', 'cov', 'sars')]\
                              +[tokenized[covid_pos] == '2'])]
            
            #assign range of relevant tokens as virus
            if len(covid_extra) > 0:
                last_pos = max([x[1] for x in covid_extra])
                useful[counter:last_pos+1] = ['v_covid']+['connecting']*(last_pos-counter)
        
        #PARA
        elif any([pattern in token for pattern in hard_virus_dict['v_para']]+[token == 'para'])\
        and prev_token != 'haemophilus':
            look_forward = 5
            max_forward = min(counter+look_forward, tokenized_length-1)
            para_extra = [(tokenized[para_pos], para_pos) for para_pos in range(counter+1, max_forward+1)\
                              if tokenized[para_pos] in ('1','2','3','4')]
            
            if len(para_extra) > 0:
                last_pos = max([x[1] for x in para_extra])
                para_nums = [x[0] for x in para_extra]
                useful[counter:last_pos+1] = ['v_para_' + '_'.join(para_nums)]+['connecting']*(last_pos-counter)
            else:
                useful[counter] = 'v_para'

        #FLU
        elif any([pattern in token for pattern in hard_virus_dict['v_flu']]+[token in ('flu', 'inf')])\
        and prev_token != 'haemophilus':
            flu_extra = []
            look_forward = 4
            max_forward = min(counter+look_forward, tokenized_length-1)
            
            for flu_pos in range(counter+1, max_forward+1):
                if tokenized[flu_pos] in ('a','b') or 'h1' in tokenized[flu_pos] or 'h3' in tokenized[flu_pos]:
                    flu_extra.append((tokenized[flu_pos], flu_pos))
                elif 'flu' in tokenized[flu_pos]: #to deal with influenza a influenza b
                    break
                
            if len(flu_extra) > 0:
                last_pos = max([x[1] for x in flu_extra])
                flu_types = [x[0] for x in flu_extra]
                if 'a' in flu_types and 'b' in flu_types:
                    useful[counter:last_pos+1] = ['v_flu_a_b']+['connecting']*(last_pos-counter)
                elif 'b' in flu_types:
                    useful[counter:last_pos+1] = ['v_flu_b']+['connecting']*(last_pos-counter)
                elif any(['h1' in f for f in flu_types]) and any(['h3' in f for f in flu_types]):
                    useful[counter:last_pos+1] = ['v_flu_a_h1_h3']+['connecting']*(last_pos-counter)
                elif any(['h1' in f for f in flu_types]):
                    useful[counter:last_pos+1] = ['v_flu_a_h1']+['connecting']*(last_pos-counter)
                elif any(['h3' in f for f in flu_types]):
                    useful[counter:last_pos+1] = ['v_flu_a_h3']+['connecting']*(last_pos-counter)
                elif 'a' in flu_types:
                    useful[counter:last_pos+1] = ['v_flu_a']+['connecting']*(last_pos-counter)
            elif token.endswith('aa'):
                useful[counter] = 'v_flu_a'
            elif token.endswith('ab'):
                useful[counter] = 'v_flu_b'
            else:
                useful[counter] = 'v_flu'

        #RSV
        elif any([pattern in token for pattern in hard_virus_dict['v_rsv']]):
            look_forward = 2
            max_forward = min(counter+look_forward, tokenized_length-1) 
            rsv_extra = [(tokenized[rsv_pos], rsv_pos) for rsv_pos in range(counter+1, max_forward+1)\
                       if tokenized[rsv_pos] in ('a','b')]
                
            if len(rsv_extra) > 0:
                last_pos = max([x[1] for x in rsv_extra])
                rsv_types = [x[0] for x in rsv_extra]
                if 'a' in rsv_types and 'b' in rsv_types:
                    useful[counter:last_pos+1] = ['v_rsv_a_b']+['connecting']*(last_pos-counter)
                elif 'a' in rsv_types:
                    useful[counter:last_pos+1] = ['v_rsv_a']+['connecting']*(last_pos-counter)
                elif 'b' in rsv_types:
                    useful[counter:last_pos+1] = ['v_rsv_b']+['connecting']*(last_pos-counter)
            else:
                useful[counter] = 'v_rsv'

        #RSV spelled out over three tokens (resp* syn* vi* or r s v)
        elif (tokenized_length > counter+2) and ((token.startswith('resp')\
        and tokenized[counter+1].startswith('syn') and tokenized[counter+2].startswith('vi'))\
        or (token == 'r' and tokenized[counter+1] == 's' and tokenized[counter+2] == 'v')):
            look_forward = 4
            max_forward = min(counter+look_forward, tokenized_length-1) 
            rsv_extra = [(tokenized[rsv_pos], rsv_pos) for rsv_pos in range(counter+3, max_forward+1)\
                       if tokenized[rsv_pos] in ('a','b')]

            if len(rsv_extra) > 0:
                last_pos = max([x[1] for x in rsv_extra])
                rsv_types = [x[0] for x in rsv_extra]
                if 'a' in rsv_types and 'b' in rsv_types:
                    useful[counter:last_pos+1] = ['v_rsv_a_b']+['connecting']*(last_pos-counter)
                elif 'a' in rsv_types:
                    useful[counter:last_pos+1] = ['v_rsv_a']+['connecting']*(last_pos-counter)
                elif 'b' in rsv_types:
                    useful[counter:last_pos+1] = ['v_rsv_b']+['connecting']*(last_pos-counter)
            else:
                useful[counter:counter+3] = ['v_rsv', 'connecting', 'connecting']
        
        #UNKNOWN VIRUS
        elif (token.startswith('vir') or token.startswith('viu')):
            #extra rule for virus culture
            if (tokenized_length > counter+2) and tokenized[counter+1].startswith('cult')\
            and 'request' in tokenized[counter+2]:
                useful[counter:counter+3] = ['connecting']*3
            elif (tokenized_length > counter+1) and tokenized[counter+1].startswith('cult'):
                useful[counter:counter+2] = ['t_oth']*2
            else:
                useful[counter] = 'v_unk'
        
        #extra terms to treat as an "unknown virus" for purpose of algorithm
        elif token in ('by','further','specimen','specimens','test','sample','considered'):
            useful[counter] = 'v_unk'
        
    # loop over the record again
    for counter, token in enumerate(tokenized):
        
        #skip if already assigned
        if useful[counter]:
            continue

        #one and two tokens back; '' when there is no such token, so negative
        #indexes never wrap around to the end of the record
        prev_token = tokenized[counter-1] if counter > 0 else ''
        prev_token2 = tokenized[counter-2] if counter > 1 else ''

        #culture tests  
        if token.startswith('cult') and not ((tokenized_length > counter+1) and 'request' in tokenized[counter+1]):
            useful[counter] = 't_oth'

        #additional "direct" tests
        elif token == 'direct' and (tokenized_length > counter+1):
            if tokenized[counter+1] in ('kit', 'enzyme', 'test', 'testing', 'eia', 'antigen', 'ict'):
                useful[counter:counter+2] = ['t_oth']*2
            elif tokenized[counter+1] in ('influenza',):
                useful[counter] = 't_oth'
        
        #condition for mention of pos/neg
        elif token in ('negative','neg','positive','pos','detected','organism') and (tokenized_length > counter+1)\
        and ((prev_token in ('a','original','or','level','of','the')
              and tokenized[counter+1] in ('test','result','covid'))
             or tokenized[counter+1] in ('or','swab','results','to','p1','contact','workers','retest')):
            #clear previous/current/next label; the start is clamped so that at
            #counter == 0 nothing is inserted into the list
            first_pos = max(counter-1, 0)
            useful[first_pos:counter+2] = [None]*(counter+2-first_pos)
        elif token in ('negative','neg','positive','pos','detected','organism') and (tokenized_length > 1)\
        and (prev_token2 in ('previous','previously','contact','worker','depot','targets')
             or prev_token in ('previous','previously','known','unit','first','second',
                               'needs','need','requires','considered','swab','if',
                               'depot','employee','gram')):
            #only reachable with counter >= 1 (a real previous token matched)
            useful[counter-1:counter+1] = [None]*2
            
        #condition for word before no
        elif token == 'no' and (prev_token in ('by','lab','specimen','accession','sample')
                                or any([pattern in prev_token for pattern in ('out','break')]))\
        and tokenized[counter+1:counter+2] != ['virus']:
            useful[counter-1:counter+1] = [None]*2
            
        #condition for word after no
        elif token == 'no' and (tokenized_length > counter+1)\
        and tokenized[counter+1] in ('specimen','reportable','done','gene','fever','answer','result','longer',
                                     'media','liquid','sample','swab','nasopharyngeal','record','fluid',
                                     'patient','second','results','testing','grh'):
            useful[counter] = 'r_can'

        #condition for due to
        elif tokenized[counter:counter+2] == ['due','to'] and 'new' not in tokenized[counter+2:counter+4]:
            useful[counter:counter+2] = ['stop']*2
        
        #condition for not test/been
        elif token == 'not' and (tokenized_length > counter+1)\
        and tokenized[counter+1] in ('test','been','suspicious','validated','the'):
            useful[counter:counter+2] = ['skip']*2
        
        #condition for word following not
        elif token == 'not' and (tokenized_length > counter+1) and \
        tokenized[counter+1] in ('tested','tessted','perform','performed','process','processed', 
                                 'transmit','suitable','done','doen','be','reported','received', 
                                 'match','needed','labelled','available','symptomatic','forwared',
                                 'met','specified','indicated','returned','sufficient','given',
                                 'valid','required','able','needed','contain','ordered','recieved',
                                 'labeled','a','provided','appropriate'):
            useful[counter:counter+2] = ['r_can']*2
        
        #condition for does not
        elif token == 'not' and prev_token in ('does','did','please','done'):
            useful[counter-1:counter+1] = ['skip']*2
        
        #condition for ordered in error
        elif 'ordered' in token and tokenized[counter+1:counter+3] == ['in', 'error']:
            useful[counter:counter+3] = ['r_can']*3
      
        #condition for target rna
        elif tokenized[counter:counter+2] in (['target','rna'],['patient','disregard']):
            useful[counter:counter+2] = ['end']*2
        
        #condition for previous
        elif 'previous' in token and ('reported' in tokenized[counter+1:counter+3] or
                                      tokenized[counter+1:counter+3] in (['report','of'],
                                                                         ['reports','of'],
                                                                         ['result','of'],
                                                                         ['covid','19'])):
            #up to three tokens are marked; clamped so that a match on the last
            #two tokens of the record does not append an extra label
            last_pos = min(counter+3, tokenized_length)
            useful[counter:last_pos] = ['end']*(last_pos-counter)
        
        #unable and indeterminate
        elif tokenized[counter:counter+5] == ['unable','to','be','completed','indeterminate']:
            useful[counter:counter+4] = ['connecting']*4
        
        else:
            #indirect_matches dictionary
            for term, patterns in indirect_matches_dict.items():
                if any([pattern in token for pattern in patterns]):
                    useful[counter] = term
                    break
                    
            #direct_matches dictionary
            for term, patterns in direct_matches_dict.items():
                if any([pattern == token for pattern in patterns]):
                    useful[counter] = term
                    break
                    
            #test_type dictionary
            for test, patterns in test_type_dict.items():
                if any([pattern == token for pattern in patterns]):
                    useful[counter] = test
                    break
        
    return useful

### Algorithm description

Using the useful_tokens field, this interpret function sequentially "reads" the terms. It picks up virus/result/test terms and they are held in a "bundle" (virus, result, test). There are also multiple modifiers that affect the way that the algorithm processes the terms. These modifiers are: final (flag to take highest priority later on), presumptive (change pos to pre), end (end reading early or skip the next save), and skip (skip the 'save when virus switches' rule once). Any time a bundle is saved, the bundle (except for test type) and the final/presumptive modifiers are cleared. If a save occurs with incomplete information, the virus defaults to an unknown virus, result defaults to negative, and test defaults to unknown test. Whenever a save happens, all of the previous tokens+labels that were read are considered to be a "segment".
<br>
- First, the xml field is processed if there is one. If a relevant virus is found, it is treated as a positive and the bundle is saved.
- Next, the algorithm will go through the labelled tokens one by one. There are different conditions for storing terms and saving the bundle when encountering a virus, a result, a special term, or an irrelevant (unlabelled) term.
    - Viruses: A relevant virus is always kept. If the virus switches, save the bundle (note: can be affected by skip modifier). If the same virus is read, save the bundle only if there is a result as well. An unknown virus is only kept if there is no current virus.
    - Results: A clear result (ind, neg, pos) is kept with hierarchy ind > neg > pos such that a neg/ind can overwrite a positive if it's close together (e.g., "not detected" becomes a neg). An unclear result (rej, can, pen) is only kept if there is no current result with hierarchy rej > can > pen. If there is already a previous result and a neg/ind is encountered, save the bundle.
    - Special terms:
        - Final: Modifier to add flag when saving to specify whether it is a final result, which takes higher priority over all others in the process_result function. Save if there is a current virus and current result. Clear the current result.
        - Presumptive: Modifier to change positive (r_pos) into presumptive-positive (r_pre).
        - End: Modifier to skip the next save. Save if there is a current virus and current result. Stop the reading if there are any results.
        - Skip: Modifier to skip the 'save when virus switches' once. This is only reset when a virus switches and the current virus is skipped. Save if there is a current virus and current result. Clear the bundle.
        - Reset: Clear bundle without saving.
        - Stop: Save if there is a current virus and current result. Clear the bundle.        
    - Irrelevant terms: If two irrelevant terms (Nones) are read in a row, save the bundle if there is both a current result and virus. Also save the bundle if there is a virus and the past segment had another virus (virus_counter > 1; normally viruses tested are listed in a mpx or pcr assay). Otherwise, clear the bundle without saving and reset all the counter variables (i.e., start a new segment).
    - If the sentence ends before hitting two Nones, save any result and save the bundle if there is a virus and the past segment had another virus.

In [None]:
#interpret text to get initial results
def interpret(useful):
    """Interpret a sequence of labelled tokens into a list of initial results.

    The tokens are read left to right while a "bundle" (current virus, result
    and test) and a list of modifiers are maintained. Whenever the rules in
    the body decide a bundle is complete, it is appended to the output.

    Parameters
    ----------
    useful : list
        Labelled tokens for one record. Falsy entries (e.g. None) are treated
        as irrelevant words. The string entries handled here are 'v_*'
        (virus), 'r_*' (result), 't_*' (test), 'xml', 'connecting', 'retest',
        'final', 'presumptive', 'end', 'skip', 'stop' and 'reset'; any other
        string only resets the irrelevant-word counter. The list is copied,
        not modified.
        NOTE(review): presumably the output of assign_labels - confirm.

    Returns
    -------
    list
        One [virus, result, test, final_flag] list per saved bundle, in the
        order saved. Parts missing at save time default to 'v_unk', 'r_neg'
        and 't_unk'.
    """
    
    def save(b):
        #append bundle b to output (unless the end modifier is set), then
        #clear its virus and result and reset the final/presumptive/end modifiers
        #the test (b[2]) and the skip modifier (modifier[3]) are left untouched

        #presumptive modifier
        if b[1] == 'r_pos' and modifier[1]:
            b[1] = 'r_pre'

        #end modifier (skips a save)
        if not modifier[2]:
            output.append([b[0] if b[0] else 'v_unk', 
                           b[1] if b[1] else 'r_neg', 
                           b[2] if b[2] else 't_unk', 
                           modifier[0]]) #final modifier
        
        b[0] = None
        b[1] = None
        modifier[0:3] = [False, False, False]
        return
    
    sentence = useful[:] #shallow copy of the input tokens
    output = []
    
    #bundle for current virus/result/test
    #0 = virus, 1 = result, 2 = test
    bundle = [None, None, None]
    
    #modifiers
    #0 = final, 1 = presumptive, 2 = end, 3 = skip
    modifier = [False, False, False, False]
    
    none_counter = 0 #counter for hitting consecutive irrelevant words
    virus_counter = 0 #counter for different viruses in same segment
    
    #xml field processing
    #'xml' tokens are paired up in order (1st with 2nd, 3rd with 4th, ...); an unpaired trailing one is ignored
    #every specific virus found between a pair is saved immediately as a positive
    xml_pos = [i for i, x in enumerate(sentence) if x == 'xml']
    num = len(xml_pos)//2
    for i in range(num):
        xml_start_pos = xml_pos[i*2]
        xml_end_pos = xml_pos[i*2+1]
        for j in range(xml_start_pos, xml_end_pos + 1):
            if sentence[j] and sentence[j].startswith('v_') and sentence[j] != 'v_unk':
                bundle[0] = sentence[j]
                bundle[1] = 'r_pos'
                save(bundle)
    
    #add result to output if result in first 3 words
    #only for sentences longer than 3 tokens with no virus label in the first 3;
    #the result must come first or be preceded only by 'connecting' tokens
    if len(sentence) > 3 and not any(['v_' in s for s in sentence[0:3] if s]):
        for s in sentence[0:3]:
            if s and 'r_' in s:
                output.append(['v_unk', s, 't_unk', False])
                break
            elif s and s == 'connecting':
                pass
            else:
                break
    
    #if there is mention of retest but no final result, take earliest one as final
    if 'retest' in sentence and 'final' not in sentence:
        modifier[0] = True
    
    #loop on words in sentence
    for word in sentence:
        
        if word: #relevant term
            none_counter = 0 #restart counter
            
            #set current virus 
            if word.startswith('v_'):
                #different virus
                if word != 'v_unk' and word != bundle[0]:
                    #save current result if hitting a different virus
                    if bundle[0] and bundle[0] != 'v_unk':
                        #skip modifier
                        if modifier[3]:
                            modifier[3] = None #reset skip modifier
                            bundle[1] = None
                        else:
                            save(bundle)
                            virus_counter += 1 #increase counter if different virus in segment     
                    bundle[0] = word
                #same virus
                elif word != 'v_unk' and word == bundle[0]:
                    #save current result if there is one
                    if bundle[1]:
                        save(bundle)
                    bundle[0] = word #save() cleared the virus, so set it again
                #only set to general virus if there's no current virus
                elif word == 'v_unk' and not bundle[0]:
                    bundle[0] = word
                
            #set current result
            #clear results: ind overrides everything, neg overrides anything except ind,
            #pos only overrides unclear results; an existing result is saved before ind/neg replaces it
            elif word.startswith('r_'):
                if word == 'r_ind':
                    if bundle[1]: 
                        save(bundle)
                    bundle[1] = word
                elif word == 'r_neg' and bundle[1] not in ('r_ind',):
                    if bundle[1]: 
                        save(bundle)
                    bundle[1] = word
                elif word == 'r_pos' and bundle[1] not in ('r_ind', 'r_neg'):
                    bundle[1] = word

                #unclear results never replace a clear one; among themselves rej > can > pen
                elif word in ('r_rej', 'r_can', 'r_pen') and bundle[1] not in ('r_ind', 'r_neg', 'r_pos'):
                    if word == 'r_rej':
                        bundle[1] = word
                    elif word == 'r_can' and bundle[1] not in ('r_rej',):
                        bundle[1] = word
                    elif word == 'r_pen' and bundle[1] not in ('r_rej', 'r_can'):
                        bundle[1] = word
                
            #set current test
            elif word.startswith('t_'):
                bundle[2] = word
            
            #final modifier
            #flush any complete bundle first so the flag only applies to what follows
            elif word == 'final':
                if bundle[0] and bundle[1]:
                    save(bundle)
                modifier[0] = True
                bundle[1] = None #reset result
            
            #presumptive modifier
            elif word == 'presumptive':
                modifier[1] = True
            
            #end modifier/word
            elif word == 'end':
                if bundle[0] and bundle[1]:
                    save(bundle)
                modifier[0:3] = [False, False, True] #end modifier skips next save
                #end early only if there is already result
                if len(output) > 0:
                    return output
            
            #skip modifier
            elif word == 'skip':
                if bundle[0] and bundle[1]:
                    save(bundle)
                modifier[3] = True
                bundle[0] = None
                bundle[1] = None
            
            #stop word
            elif word == 'stop':
                if bundle[0] and bundle[1]:
                    save(bundle)
                modifier[0:3] = [False, False, False]
                bundle[0] = None
                bundle[1] = None           
                
            #reset word
            #same as stop but without saving first
            elif word == 'reset':
                modifier[0:3] = [False, False, False]
                bundle[0] = None
                bundle[1] = None
            
        else: #word is None
            none_counter += 1
            
            if none_counter == 2: #can change threshold
                #save if there is current virus and result
                if bundle[0] and bundle[1]:
                    save(bundle)
                #save the last virus if multiple were listed
                #(no result is set here, so save() records it with the default 'r_neg')
                elif bundle[0] and bundle[0] != 'v_unk' and virus_counter > 1:
                    if modifier[3]:
                        modifier[3] = None #reset skip modifier
                    else:
                        save(bundle)
                #reset
                none_counter = 0 
                virus_counter = 0
                bundle[0] = None
                bundle[1] = None
                modifier[0:3] = [False, False, False]
                
    #if there is still a remaining result
    if bundle[1]: 
        save(bundle)
    
    #if there is an extra virus listed at the end
    elif bundle[0] and bundle[0] != 'v_unk' and virus_counter > 1 and not modifier[3]:
        save(bundle)
            
    return output

In [None]:
#using reference excel to assign LOINCs and TR codes to virus and test type
#(89 + 11 respiratory LOINCs, 19 TR codes, plus the COVID19 LOINCs and TR codes)
xlsx_filename = 'COVID19_Resp_codes_20200413.xlsx'
mappings = {'--':'unk', 'culture':'cult', 'other':'oth', 'entero_rhino_D68':'entero_rhino'}


def _clean_code_table(df_codes):
    """Return a copy of a reference sheet with categories matching the labels used in the text analysis.

    Applies `mappings` to every cell, collapses any virus name containing
    'corona' to 'coronavirus', and prefixes Virus_to_assign with 'v_' and
    Test_to_assign with 't_'.
    """
    df_codes = df_codes.replace(mappings)
    df_codes['Virus_to_assign'] = df_codes['Virus_to_assign'].apply(
        lambda x: 'v_' + ('coronavirus' if 'corona' in x else x))
    df_codes['Test_to_assign'] = df_codes['Test_to_assign'].apply(lambda x: 't_' + x)
    return df_codes


def _codes_by_category(df_codes, category_col, code_col, unknown_label):
    """Return {category: [codes, ...]} for one reference table, without the unknown category."""
    grouped = {}
    for category, code in zip(df_codes[category_col], df_codes[code_col]):
        grouped.setdefault(category, []).append(code)
    #remove the unk ones (tolerate sheets that have no unknown rows)
    grouped.pop(unknown_label, None)
    return grouped


#LOINCs (respiratory + COVID19)
df_loincs = pd.read_excel(xlsx_filename, sheet_name='Resp_LOINCs')
df_loincs_covid = pd.read_excel(xlsx_filename, sheet_name='COVID19_LOINCs')
#DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
df_loincs = _clean_code_table(pd.concat([df_loincs, df_loincs_covid]))

#assign LOINCs to virus and test type
loincs_by_v = _codes_by_category(df_loincs, 'Virus_to_assign', 'LOINCs', 'v_unk')
loincs_by_t = _codes_by_category(df_loincs, 'Test_to_assign', 'LOINCs', 't_unk')

#TR codes (respiratory + COVID19)
df_tr_codes = pd.read_excel(xlsx_filename, sheet_name='Resp_TRs')
df_tr_covid = pd.read_excel(xlsx_filename, sheet_name='COVID19_TRs')
df_tr_codes = _clean_code_table(pd.concat([df_tr_codes, df_tr_covid]))

#assign TR codes to virus and test type
tr_codes_by_v = _codes_by_category(df_tr_codes, 'Virus_to_assign', 'TRs', 'v_unk')
tr_codes_by_t = _codes_by_category(df_tr_codes, 'Test_to_assign', 'TRs', 't_unk')

In [None]:
# assign more details to v_unk or t_unk based on LOINC and TR code
# group by test type and then type of virus, remove duplicates
loinc_exclusions = ['21026-0','22634-0','22635-7','22636-5','22637-3','22638-1','22639-9',
                    '31208-2','33882-2','35265-8','47526-9','49049-0','55752-0','59465-5',
                    '664-3','XON10007-3','XON10011-5','XON10337-4','XON11913-1','XON12721-7',
                    'XON13543-4']

#result hierarchy (lower rank wins): S (presumptive positive) > P (positive) > I (indeterminate)
#                                    > N (negative) > D (pending) > R (invalid) > C (cancelled)
_result_rank = {'pre': 0, 'pos': 1, 'ind': 2, 'neg': 3, 'pen': 4, 'rej': 5, 'can': 6}

def process_result(tokens, testrequestcode, observationcode, results):
    """Collapse the initial results of one record to one result per virus.

    Parameters
    ----------
    tokens : list
        Cleaned, tokenized text of the record (used for phrase checks).
    testrequestcode, observationcode
        TR code and LOINC of the record, looked up in tr_codes_by_v/_t and
        loincs_by_v/_t to fill in unknown viruses/tests.
    results : list
        [virus, result, test, final_flag] lists as produced by interpret.
        NOTE: the result entries may be modified in place by the extra
        conditions below.

    Returns
    -------
    dict
        {test: [(virus, result), ...]} with the 'v_'/'r_'/'t_' prefixes
        stripped; empty if the record is excluded or has no identified virus.
        The test key is currently always 'pcr'.
    """
    dd = {}
    
    ###extra conditions
    
    #LOINC exclusions
    if observationcode in loinc_exclusions:
        return dd 
    
    for i in range(len(tokens)):
        #delete all results for irrelevant phrases
        if tokens[i:i+5] in (['swab', 'is', 'required', 'for', 'both'],
                             ['is', 'unable', 'to', 'go', 'until']):
            return dd
        #make presumptive-positive if test is investigational
        if tokens[i:i+3] in (['not', 'been', 'established'], 
                             ['is', 'considered', 'investigational'], 
                             ['a', 'retrospective', 'review']):
            for res in results:
                if res[0] in ('v_covid', 'v_unk') and res[1] == 'r_pos':
                    res[1] = 'r_pre' 
        #change negative to pending if there are results to follow
        if tokens[i:i+3] == ['to', 'follow', 'tested']:
            for res in results:
                if res[1] in ('r_neg','r_can','r_rej') and not res[3]:
                    res[1] = 'r_pen'      
    
    #change negative to indeterminate for indeterminate multiplex
    distinct_viruses = {v for (v,r,t,f) in results}
    if 'indeterminate' in tokens[0:3] and len(distinct_viruses) > 5 and 'all' not in tokens[3:8]:
        for res in results:
            if res[1] == 'r_neg' and not res[3]:
                res[1] = 'r_ind'
      
    ###determine virus or test based on LOINC or TR
    v_from_loinc = [vir for vir, loincs in loincs_by_v.items() if observationcode in loincs]
    v_from_tr = [vir for vir, tr_codes in tr_codes_by_v.items() if testrequestcode in tr_codes]
    t_from_loinc = [test for test, loincs in loincs_by_t.items() if observationcode in loincs]
    t_from_tr = [test for test, tr_codes in tr_codes_by_t.items() if testrequestcode in tr_codes]
    
    #determine if there are any final/interpretation results
    viruses_with_final = [v for (v,r,t,f) in results if r in ('r_pos', 'r_pre', 'r_ind', 'r_neg') and f]
    
    #remove the non-final/interpretation results for EVERY virus with a final/interpretation
    #(and for v_unk); filtering once against the full set fixes the previous loop, which
    #restarted from `results` on each pass and so only honoured the last virus
    if viruses_with_final:
        superseded = set(viruses_with_final) | {'v_unk'}
        results_final = [(v,r,t,f) for (v,r,t,f) in results if f or v not in superseded]
    else:
        results_final = results
        
    for v, r, t, f in results_final:
        #fill in unknown virus (LOINC takes priority over TR code)
        if v == 'v_unk':
            if len(v_from_loinc) > 0:
                v = v_from_loinc[0]
            elif len(v_from_tr) > 0:
                v = v_from_tr[0]
        
        #fill in unknown test (LOINC takes priority over TR code)
        if t == 't_unk':
            if len(t_from_loinc) > 0:
                t = t_from_loinc[0]
            elif len(t_from_tr) > 0:
                t = t_from_tr[0]
            
        #fill in pcr if there is a pcr term in text
        if t == 't_unk' and 'pcr' in tokens: 
            t = 't_pcr'
        
        #remove unknown virus results
        if v == 'v_unk':
            continue
        
        #strip the 'v_'/'r_'/'t_' prefixes
        v, r, t = v[2:], r[2:], t[2:]
        #all tests that aren't pcr are oth
        #t = t if t == 'pcr' else 'oth'
        
        #ASSUME EVERYTHING PCR FOR COVID DATASET
        t = 'pcr'
    
        dd.setdefault(t, [])
        
        #compile results with the hierarchy in _result_rank: a new result replaces the
        #stored one when it ranks at least as high; 'can' never replaces anything
        same_vir = False
        for idx, (seen_v, seen_r) in enumerate(dd[t]):
            if seen_v != v:
                continue
            same_vir = True
            if r in _result_rank and r != 'can' and _result_rank[r] <= _result_rank.get(seen_r, len(_result_rank)):
                dd[t][idx] = (v,r)
        if not same_vir:
            dd[t].append((v,r))
        
    return dd

In [None]:
#create output as character value for each virus and test type
result_char = {'pre':'S', 'pos': 'P', 'ind':'I', 'neg':'N', 'pen':'D', 'can':'C', 'rej':'R'}

def _result_columns(v):
    """Return the df_results columns that a virus name (without the 'v_' prefix) writes to."""
    if v in ('adenovirus', 'bocavirus', 'coronavirus', 'entero_rhino', 'hmv', 'covid'):
        return [v]
    if v.startswith('para'):
        return ['para']
    if v.startswith('flu'):
        #a subtype also flags its parent columns, e.g. flu_a_h1 -> flu, flu_a, flu_a_h1
        subtypes = (('_a', 'flu_a'), ('_h1', 'flu_a_h1'), ('_h3', 'flu_a_h3'), ('_b', 'flu_b'))
        return ['flu'] + [col for marker, col in subtypes if marker in v]
    if v.startswith('rsv'):
        subtypes = (('_a', 'rsv_a'), ('_b', 'rsv_b'))
        return ['rsv'] + [col for marker, col in subtypes if marker in v]
    return []

def char_output(results, ind):
    """Write the one-character results of one record into row `ind` of df_results.

    Parameters
    ----------
    results : dict
        {test: [(virus, result), ...]} as returned by process_result.
    ind
        Row label of the record in the module-level df_results.

    Returns
    -------
    None
        df_results is modified in place. Viruses that map to no column are ignored.
    """
    #loop through each test type and virus
    for pairs in results.values(): #need to update if there are multiple test types
        for v, r in pairs:
            for col in _result_columns(v):
                #single-step .at assignment: the previous chained form df_results[col][ind] = ...
                #is not guaranteed to write through and is a no-op under pandas Copy-on-Write
                df_results.at[ind, col] = result_char[r]
                   
    return

### Initial processing

In [None]:
#work on an independent copy of df_gp, with the cleaned text added
df_unique = df_gp.copy(deep=True)
df_unique["cleaned_value"] = df_unique["value"].apply(clean)

#collapse to unique records (org, TR code, Obs code, cleaned text):
#count the rows in each group and merge their original index tuples into one tuple
groupby_vars = ['reportinglaborgname', 'testrequestcode', 'observationcode', 'cleaned_value']
aggregations = {
    'value': 'count',
    'original_indexes': lambda tuples: tuple(i for tup in tuples for i in tup),
}
df_unique = (
    df_unique.reset_index()
    .groupby(groupby_vars)
    .agg(aggregations)
    .reset_index()
    .rename(columns={'value': 'count'})
)

#most frequent records first, renumbered from 0
df_unique = df_unique.sort_values(by=['count'], ascending=False).reset_index(drop=True)
print('unique records after cleaning:', len(df_unique))

#tokenize the cleaned text
df_unique["cleaned_tokenized_value"] = df_unique["cleaned_value"].apply(tokenize)

### Assign results

In [None]:
#label the tokens using the dictionary, then interpret the labelled tokens
df_unique["useful_tokens"] = df_unique["cleaned_tokenized_value"].apply(assign_labels)
df_unique["initial_results"] = df_unique["useful_tokens"].apply(interpret)

#fill in unknown viruses based on LOINC or TR code, roll up results to one test type
#(one process_result call per unique record, in row order)
final_results = [
    process_result(tokens, tr_code, obs_code, initial)
    for tokens, tr_code, obs_code, initial in zip(
        df_unique["cleaned_tokenized_value"],
        df_unique["testrequestcode"],
        df_unique["observationcode"],
        df_unique["initial_results"],
    )
]

In [None]:
#translate results to 1-character format
col_virus = [
    'covid', 'adenovirus', 'bocavirus', 'coronavirus',
    'flu', 'flu_a', 'flu_a_h1', 'flu_a_h3', 'flu_b',
    'entero_rhino', 'hmv', 'para',
    'rsv', 'rsv_a', 'rsv_b',
]
cols = list(col_virus)

#empty frame: one row per unique record, one column per virus flag
n_unique = len(df_unique)
df_results = pd.DataFrame(index=np.arange(n_unique), columns=cols)

#fill in results row by row
for i in range(n_unique):
    char_output(final_results[i], i)

In [None]:
#tracker for unique records (some records may be marked as new if clean function changes)
tracker_key_cols = ['reportinglaborgname', 'testrequestcode', 'observationcode', 'cleaned_value']

#initialize tracker: create an empty tracker file if none exists yet
try:
    with open('record_tracker.pkl') as f:
        pass
except FileNotFoundError:
    df_tracker = pd.DataFrame(columns=['filename'] + tracker_key_cols)
    df_tracker.to_pickle("./record_tracker.pkl")
    print('CREATING RECORD TRACKER FILE')

#read tracker
df_tracker = pd.read_pickle('./record_tracker.pkl')

#RESET TRACKER
#df_tracker = df_tracker.iloc[0:0]

df_tracker_orig = df_tracker[tracker_key_cols].copy(deep=True)
df_tracker_delta = df_unique[tracker_key_cols].copy(deep=True)

#set difference: the tracker rows are stacked twice so that every row already in the
#tracker is a duplicate and gets dropped; the current records come first, so the
#surviving rows keep their df_unique row positions as index
stacked = pd.concat([df_tracker_delta] + [df_tracker_orig] * 2, ignore_index=True)
df_tracker_delta = stacked.drop_duplicates(keep=False)
print('Original tracker length:', len(df_tracker_orig))
print('Delta tracker length:', len(df_tracker_delta))

In [None]:
#intermediate output for checking results
int_output_cols = ['count', 'reportinglaborgname', 'testrequestcode', 'observationcode', 'cleaned_value']
df_review = df_unique[int_output_cols]

#all unique records with their virus flags
df_review.join(df_results).to_csv('intermediate_output.csv')

#only the records whose index appears in the tracker delta
in_delta = df_unique.index.isin(df_tracker_delta.index)
df_review[in_delta].join(df_results).to_csv('intermediate_output_delta.csv')

## Final output

In [None]:
#pair each unique record's original_indexes with its virus flags
df_create_output = df_unique[['original_indexes']].join(df_results)

#one slot per row of df; slots not listed in any original_indexes stay None
output = [None] * len(df)

#order results based on original_indexes
for row in df_create_output.itertuples():
    original_indexes, virus_flags = row[1], row[2:]
    for i in original_indexes:
        output[i] = virus_flags

df_output = pd.DataFrame(output, columns=cols)

In [None]:
#FINAL DATASET TO OUTPUT
def _write_final(frame):
    #write the chosen frame to output_filename, with its index turned into a column
    frame.reset_index().to_csv(output_filename, index=False)

if output_flag == 0:
    #just the virus flags (plus exclude_flag)
    _write_final(df[['exclude_flag']].join(df_output))
elif output_flag == 1:
    #all columns of df plus the virus flags
    _write_final(df.join(df_output))
elif output_flag == 2:
    #all columns of the raw import plus exclude_flag and the virus flags
    _write_final(df_raw.join(df[['exclude_flag']].join(df_output)))
else:
    print('PLEASE ENTER ONE OF THE FOLLOWING OPTIONS FOR OUTPUT_FLAG IN THE FIRST CELL: 0, 1, 2')

In [None]:
#quick check: frequency of each result character in the final covid column
#(value_counts leaves out missing values by default)
df_output['covid'].value_counts()
#df_output.describe()

In [None]:
#STOP RUNNING HERE IF MANUAL REVIEW IS NOT DONE

In [None]:
#FINALIZE THE RECORD TRACKER (only run when you are satisfied with the review process)
#tag the new records with the file they came from
df_tracker_delta = df_tracker_delta.assign(filename=input_filename)

#append the new records to the tracker and write it back to disk
df_tracker = pd.concat([df_tracker, df_tracker_delta], sort=False, ignore_index=True)
df_tracker.to_pickle("./record_tracker.pkl")
print('Records in tracker:', len(df_tracker))

#cleanup: drop the large intermediate frames
del df_raw, df, df_gp, df_unique, df_output
del df_tracker, df_tracker_orig, df_tracker_delta

## Testing and validation

In [None]:
#test a string: paste the text between the triple quotes, then run the cell
#each stage of the pipeline is printed as soon as it is computed
test_string = r'''

'''

#stage 1: cleaned text
test_clean = clean(test_string)
print('--', test_clean)

#stage 2: labelled tokens
test_tokens = tokenize(test_clean)
test_useful = assign_labels(test_tokens)
print('--', test_useful)

#stage 3: initial results
test_interpret = interpret(test_useful)
print('--', test_interpret)

In [None]:
#check specific record based on intermediate_output index
#fill in x below (row index from intermediate_output.csv) before running; the cell does not run while it is blank

x = 
#ordersid of the first original row that was collapsed into this unique record
print('ordersid:', df_raw["ordersid"][df_unique["original_indexes"][x][0]])
#each processing stage for the record, in pipeline order
print('cleaned tokenized value:', list(df_unique["cleaned_tokenized_value"][x]))
print('useful tokens:', list(df_unique["useful_tokens"][x]))
print('initial results:', list(df_unique["initial_results"][x]))
print('final results:', final_results[x])

In [None]:
#test word in all words
#vocabulary: every distinct token over all unique records
all_words = list({token for record in df_unique["cleaned_tokenized_value"] for token in record})

test_word = ''

#tokens that contain test_word as a substring
containing = [candidate for candidate in all_words if test_word in candidate]
print('All unique words that contain "' + test_word + '":',
      containing)

#number of records that have test_word as one of their tokens
exact_count = sum(1 for record in df_unique["cleaned_tokenized_value"] if test_word in record)
print('All instances of exactly "' + test_word + '":', 
      exact_count)

In [None]:
#check preceding words and frequencies
#d_preceding maps each token found directly before `word` to the number of times it occurs there
d_preceding = {}
word = ''
for tokens in df_unique["cleaned_tokenized_value"]:
    #start at position 1: the first token has nothing before it (index j-1 would
    #otherwise wrap around to the last token and be counted as a preceding word)
    for j in range(1, len(tokens)):
        if tokens[j] == word:
            prev_term = tokens[j-1]
            d_preceding[prev_term] = d_preceding.get(prev_term, 0) + 1
print(d_preceding)

In [None]:
#check following words and frequencies
#d_following maps each token found directly after `word` to the number of times it occurs there
d_following = {}
word = ''
for i in range(len(df_unique)):
    tokens = df_unique["cleaned_tokenized_value"][i]
    #the last token has no follower, so stop one position early
    for j in range(len(tokens) - 1):
        if tokens[j] != word:
            continue
        next_term = tokens[j+1]
        d_following[next_term] = d_following.get(next_term, 0) + 1
print(d_following)