# Python script for COVID19
Created by: Branson Chen <br>
Last updated: 20200413

## Table of Contents

<a href='#Importing-data'>Importing data</a><br>
<a href='#Text-analysis'>Text analysis</a><br>

- <a href='#Algorithm-description'>Algorithm description</a><br>
- <a href='#Initial-processing'>Initial processing</a><br>
- <a href='#Assign-results'>Assign results</a><br>

<a href='#Final-output'>Final output</a><br>
<a href='#Testing-and-validation'>Testing and validation</a><br>

## Importing data

In [None]:
import pandas as pd

In [None]:
#load the SAS extract (read_sas needs binary, not char, compression if the file is compressed)
input_path = ''
input_filename = '.sas7bdat'
df_raw = pd.read_sas(f'{input_path}{input_filename}')

#keep df_raw untouched and do all further work on an independent copy
df = df_raw.copy(deep=True)
print(f'# of records: {len(df)}')

In [None]:
#decode the byte-string columns read from SAS into new text columns with the final names
decoded_names = {
    'testrequestcode': 'TestRequestCode',
    'reportinglaborgname': 'ReportingLabOrgName',
    'performinglaborgname': 'PerformingLabOrgName',
    'observationcode': 'ObservationCode',
    'observationresultstatus': 'ObservationResultStatus',
    'observationvalue': 'value',
}
for raw_name, decoded_name in decoded_names.items():
    df[decoded_name] = df[raw_name].str.decode('UTF-8')

#missing observation values become empty strings so the text cleaning can run on every row
df['value'] = df['value'].fillna('')

#currently not filtering on ObservationResultStatus, further processing/aggregation will happen after text analysis
# df = df[df['ObservationResultStatus'].isin(['F', 'C'])]

#only these four columns are used downstream
keep_columns = ['ReportingLabOrgName', 'TestRequestCode', 'ObservationCode', 'value']
df = df[keep_columns]
df.head()

## Text analysis

In [None]:
import numpy as np
import nltk
import re

In [None]:
#clean punctuation and xml field
puncs = [';', ':', ',', '.', '-', '_', '/', '(', ')', '[', ']', '{', '}', '<', '>', '*', '#', '?', '.', '+', 
        'br\\', '\\br', '\\e', 'e\\', '\\f\\', '\\t\\', '\\', "'", '"', '=']

#per-keyword count of how many number-stripping passes clean() has made (testing aid);
#defined here so that clean() can run before any later testing cell has been executed
test_d = {}


def _strip_xml(text):
    """Replace each <p1:microorganism ...> element with just its <p1:text> part.

    Every search is anchored at the element currently being processed, so
    several elements in one value each keep their own text. An element with
    no closing tag is left as it is (the loop stops) instead of looping forever.
    """
    open_tag = '<p1:microorganism xmlns'
    close_tag = '</p1:microorganism>'
    text_open = '<p1:text>'
    text_close = '</p1:text>'

    start = text.find(open_tag)
    while start >= 0:
        end = text.find(close_tag, start)
        if end < 0:
            break #malformed element: nothing reliable to cut out
        end += len(close_tag)

        #keep only the <p1:text>...</p1:text> part that lies inside this element
        kept = ''
        text_start = text.find(text_open, start, end)
        if text_start >= 0:
            text_end = text.find(text_close, text_start, end)
            if text_end >= 0:
                kept = text[text_start:text_end + len(text_close)]

        text = text[:start] + kept + text[end:]
        #continue after the part just kept so the search always moves forward
        start = text.find(open_tag, start + len(kept))

    return text


def clean(value):
    """Normalize one raw observation value into a lowercase, space-separated string.

    Steps, in order: lowercase; reduce xml microorganism elements to their text;
    put spaces around a few terms that are often stuck to neighbouring words;
    replace punctuation with spaces and collapse repeated spaces; drop numbers
    that follow date/phone-like keywords; drop id-like numbers; drop a trailing "no".

    Side effect: updates the module-level test_d counter for each keyword.

    value: raw text (str).
    Returns the cleaned text (str).
    """
    cleaned = _strip_xml(value.lower())
    
    #surround some terms with spaces (some terms I found stuck together)
    terms_to_space = ['detected', 'by', 'positive', 'parainfluenza']
    for t in terms_to_space:
        cleaned = cleaned.replace(t, ' ' + t + ' ')
   
    #replace punctuation with space
    for punc in puncs:
        cleaned = cleaned.replace(punc, ' ')

    #remove consecutive spaces
    cleaned = re.sub(r' {2,}', ' ', cleaned)
    cleaned = cleaned.strip()
    
    #remove dates after certain words
    terms = ['date', 'telephone', 'tel', 'phone', 'received', 'collected',  
             'result', 'on', 'at', '@', 'approved', 'final']
    for term in terms:
        pattern = re.escape(term) + r' \d{1,4}'
        test_d.setdefault(term, 0)
        
        #repeat: removing one number can expose the next (e.g. "on 2020 04 13")
        while re.search(pattern, cleaned):
            cleaned = re.sub(pattern, term, cleaned)
            test_d[term] += 1
            
    #remove ids
    pattern = r' \d{0,2}[a-z]{0,2}\d{5,}'
    while re.search(pattern, cleaned):
        cleaned = re.sub(pattern, '', cleaned)
    
    #remove "no" at the end
    while cleaned.endswith(' no'):
        cleaned = cleaned[:-3]

    return cleaned

In [None]:
#tokenize values using nltk
def tokenize(value):
    """Split a cleaned value into word tokens with nltk.

    Every token is also appended to the module-level all_words list,
    which is kept for testing purposes.

    value: cleaned text (str).
    Returns the list of tokens.
    """
    words = nltk.word_tokenize(value)
    all_words.extend(words)
    return words

In [None]:
#assign labels for useful tokens based on some dictionaries and exclusions
easy_virus_dict = {'v_adenovirus':['aden'], 'v_bocavirus':['boca', 'bocca'], 'v_coronavirus':['coro', 'cora'],
                   'v_entero_rhino':['enterol', 'enterov', 'rhino', 'rhini'], 'v_hmv':['meta']}
hard_virus_dict = {'v_rsv':['rsv'], 'v_flu':['nflu', 'flue'], 'v_para':['parai', 'pata', 'parta'],
                   'v_covid':['cov', 'sars']}
test_type_dict = {'t_oth': ['eia', 'rapid', 'immunoassay', 'ict', 'immunochromatographic', 'antigen'], 
                  't_pcr': ['multiplex', 'naat', 'nat', 'pcr', 'rrt', 'gene', 'rna', 'gen', 
                            'reverse', 'polymerase', 'chain', 'simplexa']}
direct_matches_dict = {'r_neg': ['no', 'not', 'un'],
                        'r_pos': ['evidence', 'detected', 'isolated', 'pos'], 
                        'xml':['p1'], 
                        'stop': ['cancelled', 'canceled', 'specific', 'required'],
                        'interpretation': ['interpretation'],
                        'connecting': ['targets', 'tagets', 'by', 'screen', 'presence', 'or',
                                        'is', 'for', 'of', 'in', 'test', 'real', 'note', 'completed',
                                        '1', '2', '3', '4', 'a', 'b', 'c', '229e', 'nl63', 'hku1', 'oc43', 
                                        '19', '2019']} 
indirect_matches_dict = {'r_neg': ['neg', 'naeg', 'neag'], 'r_pos': ['posi'], 'r_ind': ['indeter', 'inconclu'], 'r_inv': ['inval']}

def assign_labels(tokenized):
    """Label each token of one record as a virus, result, test type or helper term.

    Two passes are made over the tokens. The first pass assigns virus labels
    (v_*), looking a few tokens ahead to pick up subtypes (flu a/b/h1/h3,
    para 1-4, rsv a/b) or covid qualifiers; the tokens spanned by such a
    match all receive the same label. The second pass labels the remaining
    tokens using the result / test-type / helper dictionaries.

    tokenized: list of tokens for one record.
    Returns a list of the same length as tokenized holding a label or None
    for each token.
    """
    tokenized_length = len(tokenized)
    useful = [None]*tokenized_length #store same list length of tokens and update each accordingly
    
    # loop over each token for record
    for counter, token in enumerate(tokenized):
        
        #skip if already assigned
        if useful[counter] is not None:
            continue
        
        #token before the current one; '' at the start of the record so that the
        #"preceded by" rules never wrap around to the last token (index -1)
        prev_token = tokenized[counter-1] if counter > 0 else ''
        
        ###easy viruses dictionary (non-exact matching)
        for virus, patterns in easy_virus_dict.items():
            if any(pattern in token for pattern in patterns):
                useful[counter] = virus
                break

        #extra rhino/entero rule (exact matching)
        if token in ('rhino', 'entero'):
            useful[counter] = 'v_entero_rhino'

        ###hard viruses dictionary (non-exact matching)
        
        #COVID19
        elif (any(pattern in token for pattern in hard_virus_dict['v_covid'])
              and 'ecov' not in token and 'cove' not in token): #may need more exclusions in future
            useful[counter] = 'v_covid'
        
        #extra rule for seasonal coronavirus, if preceded by novel or followed by 19/disease/cov/sars/2
        elif any(pattern in token for pattern in easy_virus_dict['v_coronavirus']):
            if 'nove' in prev_token or prev_token == 'nivel':
                useful[counter-1:counter+1] = ['v_covid']*2
                
            look_forward = 3 #how many terms to look forward for
            max_forward = min(counter+look_forward, tokenized_length-1) #limit if record is too short
            covid_extra = [(tokenized[covid_pos], covid_pos) for covid_pos in range(counter+1, max_forward+1)
                           if tokenized[covid_pos] == '2'
                           or any(pattern in tokenized[covid_pos] for pattern in ('19', 'disea', 'cov', 'sars'))]

            if len(covid_extra) > 0:
                last_pos = max(x[1] for x in covid_extra)
                useful[counter:last_pos+1] = ['v_covid']*(last_pos+1-counter) #assign the range of relevant tokens as virus
        
        #PARA
        elif ((token == 'para' or any(pattern in token for pattern in hard_virus_dict['v_para']))
              and prev_token != 'haemophilus'):
            look_forward = 5
            max_forward = min(counter+look_forward, tokenized_length-1)
            para_extra = [(tokenized[para_pos], para_pos) for para_pos in range(counter+1, max_forward+1)
                          if tokenized[para_pos] in ('1','2','3','4')]
            
            if len(para_extra) > 0:
                last_pos = max(x[1] for x in para_extra)
                para_nums = [x[0] for x in para_extra]
                useful[counter:last_pos+1] = ['v_para_' + '_'.join(para_nums)]*(last_pos+1-counter)
            else:
                useful[counter] = 'v_para'

        #FLU
        elif ((token in ('flu', 'inf') or any(pattern in token for pattern in hard_virus_dict['v_flu']))
              and prev_token != 'haemophilus'):
            flu_extra = []
            look_forward = 4
            max_forward = min(counter+look_forward, tokenized_length-1)
            
            for flu_pos in range(counter+1, max_forward+1):
                if tokenized[flu_pos] in ('a','b') or 'h1' in tokenized[flu_pos] or 'h3' in tokenized[flu_pos]:
                    flu_extra.append((tokenized[flu_pos], flu_pos))
                elif 'flu' in tokenized[flu_pos]: #to deal with influenza a influenza b
                    break
                
            if len(flu_extra) > 0:
                last_pos = max(x[1] for x in flu_extra)
                flu_types = [x[0] for x in flu_extra]
                if 'a' in flu_types and 'b' in flu_types:
                    useful[counter:last_pos+1] = ['v_flu_a_b']*(last_pos+1-counter)
                elif 'b' in flu_types:
                    useful[counter:last_pos+1] = ['v_flu_b']*(last_pos+1-counter)
                elif any('h1' in f for f in flu_types) and any('h3' in f for f in flu_types):
                    useful[counter:last_pos+1] = ['v_flu_a_h1_h3']*(last_pos+1-counter)
                elif any('h1' in f for f in flu_types):
                    useful[counter:last_pos+1] = ['v_flu_a_h1']*(last_pos+1-counter)
                elif any('h3' in f for f in flu_types):
                    useful[counter:last_pos+1] = ['v_flu_a_h3']*(last_pos+1-counter)
                elif 'a' in flu_types:
                    useful[counter:last_pos+1] = ['v_flu_a']*(last_pos+1-counter)
            elif token.endswith('aa'):
                useful[counter] = 'v_flu_a'
            elif token.endswith('ab'):
                useful[counter] = 'v_flu_b'
            else:
                useful[counter] = 'v_flu'

        #RSV
        elif any(pattern in token for pattern in hard_virus_dict['v_rsv']):
            look_forward = 2
            max_forward = min(counter+look_forward, tokenized_length-1) 
            rsv_extra = [(tokenized[rsv_pos], rsv_pos) for rsv_pos in range(counter+1, max_forward+1)
                         if tokenized[rsv_pos] in ('a','b')]
                
            if len(rsv_extra) > 0:
                last_pos = max(x[1] for x in rsv_extra)
                rsv_types = [x[0] for x in rsv_extra]
                if 'a' in rsv_types and 'b' in rsv_types:
                    useful[counter:last_pos+1] = ['v_rsv_a_b']*(last_pos+1-counter)
                elif 'a' in rsv_types:
                    useful[counter:last_pos+1] = ['v_rsv_a']*(last_pos+1-counter)
                elif 'b' in rsv_types:
                    useful[counter:last_pos+1] = ['v_rsv_b']*(last_pos+1-counter)
            else:
                useful[counter] = 'v_rsv'

        #RSV spelled out over three tokens ("respiratory syncytial virus" or "r s v")
        elif ((tokenized_length > counter+2)
              and ((token.startswith('resp') and tokenized[counter+1].startswith('syn')
                    and tokenized[counter+2].startswith('vi'))
                   or (token == 'r' and tokenized[counter+1] == 's' and tokenized[counter+2] == 'v'))):
            look_forward = 4
            max_forward = min(counter+look_forward, tokenized_length-1) 
            rsv_extra = [(tokenized[rsv_pos], rsv_pos) for rsv_pos in range(counter+3, max_forward+1)
                         if tokenized[rsv_pos] in ('a','b')]

            if len(rsv_extra) > 0:
                last_pos = max(x[1] for x in rsv_extra)
                rsv_types = [x[0] for x in rsv_extra]
                if 'a' in rsv_types and 'b' in rsv_types:
                    useful[counter:last_pos+1] = ['v_rsv_a_b']*(last_pos+1-counter)
                elif 'a' in rsv_types:
                    useful[counter:last_pos+1] = ['v_rsv_a']*(last_pos+1-counter)
                elif 'b' in rsv_types:
                    useful[counter:last_pos+1] = ['v_rsv_b']*(last_pos+1-counter)
            else:
                useful[counter:counter+3] = ['v_rsv']*3
        
        #UNKNOWN VIRUS
        elif token.startswith('vir') or token.startswith('viu'):
            #extra rule for virus culture
            if ((tokenized_length > counter+2) and tokenized[counter+1].startswith('cult')
                    and 'request' in tokenized[counter+2]):
                useful[counter:counter+3] = ['request']*3
            elif (tokenized_length > counter+1) and tokenized[counter+1].startswith('cult'):
                useful[counter:counter+2] = ['t_oth']*2
            else:
                useful[counter] = 'v_unk'
        
    # loop over the record again
    for counter, token in enumerate(tokenized):
        
        #make sure token doesn't have term already
        if useful[counter] is not None:
            continue

        has_next = tokenized_length > counter+1

        #culture tests  
        if token.startswith('cult') and not (has_next and 'request' in tokenized[counter+1]):
            useful[counter] = 't_oth'

        #additional "direct" tests
        elif token == 'direct' and has_next:
            if tokenized[counter+1] in ('kit', 'enzyme', 'test', 'testing'):
                useful[counter:counter+2] = ['t_oth']*2
            elif tokenized[counter+1] in ('influenza', 'eia', 'antigen', 'ict'):
                useful[counter] = 't_oth'
        
        else:
            #the three dictionaries are checked in turn and a later match overwrites an earlier one

            #indirect_matches dictionary
            for term, patterns in indirect_matches_dict.items():
                if any(pattern in token for pattern in patterns):
                    useful[counter] = term
                    break
                    
            #direct_matches dictionary
            for term, patterns in direct_matches_dict.items():
                #extra condition for no/not test/perform
                if term == 'r_neg' and (has_next
                                        and ('test' in tokenized[counter+1] 
                                             or 'perform' in tokenized[counter+1]
                                             or 'transmit' in tokenized[counter+1])):
                    continue
                
                if token in patterns:
                    useful[counter] = term
                    break
                    
            #test_type dictionary
            for test, patterns in test_type_dict.items():
                if token in patterns:
                    useful[counter] = test
                    break
            
            #additional case for 'previously reported'
            if 'previous' in token and has_next and 'report' in tokenized[counter+1]:
                useful[counter:counter+2] = ['previous']*2
        
    return useful

### Algorithm description

Using the useful_tokens field, this interpret function sequentially "reads" the terms. It picks up virus/result/test terms and they are held in a "bundle" (virus, result, test). Any time a bundle is saved, the bundle (except for test type) is cleared. If a save occurs with incomplete information, the virus defaults to an unknown virus, result defaults to negative, and test defaults to unknown test.
<br>
- First, the xml field is processed if there is one. If a relevant virus is found, it is treated as a positive and the bundle is saved.
- Next, the algorithm will go through the labelled tokens one by one. There are different conditions for storing terms and saving the bundle when encountering a virus, a result, or an irrelevant term.
    - Viruses: A relevant virus is always kept. If the virus switches, save the bundle. If the same virus is read, save the bundle only if there is a result as well. An unknown virus is only kept if there is no current virus.
    - Results: If the term "interpretation" is in the current segment and a result is encountered and there is a current virus, delete all previous bundles pertaining to the current virus. A result is kept if there is no current result or if the result term is higher than the current result (with hierarchy inv > ind > neg > pos; a neg/ind/inv can overwrite a positive if it's close together, such that "not detected" becomes a neg for example).
    - Irrelevant terms: If two irrelevant terms (Nones) are read in a row, save the bundle if there is both a current result and virus. Also save the bundle if there is a virus and the past segment had another virus (virus_counter > 1; normally viruses tested are listed in a mpx or pcr assay). Otherwise, clear the bundle without saving and reset all the counter variables (i.e., start a new segment).
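    - At the end of the sentence (in case it ends before hitting two Nones), the bundle is saved if there is a current result, even without a current virus (the virus then defaults to unknown). Otherwise it is saved if there is a virus and the segment listed more than one virus (virus_counter > 1).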
- Additional logic: If a stop term is read, clear the bundle without saving. If 'previous report' appears, save the bundle if it has both a virus and a result, then stop processing.

In [None]:
#interpret text to get initial results
def interpret(useful):
    """Read the labels of one record in order and collect [virus, result, test] triples.

    A running bundle holds the current virus, result and test type. Whenever
    the bundle is saved, a missing virus defaults to 'v_unk', a missing result
    to 'r_neg' and a missing test to 't_unk'; the virus and result are then
    cleared while the test type is kept.

    useful: list of labels (or None) as produced by assign_labels.
    Returns a list of [virus, result, test] lists.
    """
    labels = list(useful)
    saved = []

    #current bundle: [virus, result, test]
    current = [None, None, None]

    def flush(target):
        virus, result, test = current
        target.append([virus or 'v_unk', result or 'r_neg', test or 't_unk'])
        current[0] = None
        current[1] = None

    #xml field processing: 'xml' labels are paired up as (start, end);
    #every known virus between a pair is saved as a positive
    xml_marks = [pos for pos, label in enumerate(labels) if label == 'xml']
    for first, last in zip(xml_marks[0::2], xml_marks[1::2]):
        for label in labels[first:last + 1]:
            if label and label.startswith('v_') and label != 'v_unk':
                current[0] = label
                current[1] = 'r_pos'
                flush(saved)

    blanks = 0 #consecutive irrelevant (None) labels seen
    distinct_viruses = 0 #different viruses seen in the current segment
    in_interpretation = False #segment contains the word "interpretation"

    for label in labels:
        #irrelevant term
        if not label:
            blanks += 1
            if blanks != 2: #can change threshold
                continue

            #second irrelevant term in a row: close the segment
            if current[0] and current[1]:
                flush(saved)
            elif current[0] and current[0] != 'v_unk' and distinct_viruses > 1: #can change threshold
                #last virus of a list of several viruses
                flush(saved)

            blanks = 0
            distinct_viruses = 0
            current[0] = None
            current[1] = None
            in_interpretation = False
            continue

        #relevant term
        blanks = 0

        if label.startswith('v_'):
            if label == 'v_unk':
                #a general virus is only kept when no virus is set yet
                if not current[0]:
                    current[0] = label
            elif label != current[0]:
                #switching virus: save what was collected for the previous one
                if current[0] and current[0] != 'v_unk':
                    flush(saved)
                current[0] = label
                distinct_viruses += 1
            else:
                #same virus again: save only if it already has a result
                if current[1]:
                    flush(saved)
                current[0] = label

        elif label.startswith('r_'):
            #an interpretation result replaces everything saved so far for this virus
            if in_interpretation and current[0]:
                saved = [entry for entry in saved if entry[0] != current[0]]

            #inv > ind > neg > pos: a result only overwrites a lower-ranked one
            if label == 'r_inv':
                current[1] = label
            elif label == 'r_ind':
                if current[1] != 'r_inv':
                    current[1] = label
            elif label == 'r_neg':
                if current[1] not in ('r_ind', 'r_inv'):
                    current[1] = label
            elif label == 'r_pos':
                if not current[1]:
                    current[1] = label

        elif label.startswith('t_'):
            current[2] = label

        elif label == 'stop':
            #discard the current virus/result without saving
            current[0] = None
            current[1] = None

        elif label == 'interpretation':
            in_interpretation = True

        elif label == 'previous':
            #"previously reported": keep a complete bundle, ignore the rest of the record
            if current[0] and current[1]:
                flush(saved)
            return saved

    #leftovers at the end: a pending result, or the last virus of a multi-virus list
    if current[1] or (current[0] and current[0] != 'v_unk' and distinct_viruses > 1):
        flush(saved)

    return saved

In [None]:
#using reference excel to assign LOINCs (89 + 11, plus COVID19 LOINCs) and TR codes (19, plus COVID19 TR codes)
#to virus and test type
codes_workbook = 'COVID19_Resp_codes_20200413.xlsx'

#(substring, replacement) rules applied in order to match the categories defined previously;
#each rule sees the result of the rule before it
virus_category_rules = [('--', 'unk'), ('entero', 'entero_rhino'), ('corona', 'coronavirus')]
test_category_rules = [('--', 'unk'), ('culture', 'cult'), ('other', 'oth')]


def _normalize_category(value, rules, prefix):
    """Apply the substring rules in order to one category value, then add the prefix."""
    for needle, replacement in rules:
        if needle in value:
            value = replacement
    return prefix + value


def _clean_code_table(table):
    """Rewrite Virus_to_assign / Test_to_assign in place as 'v_*' / 't_*' categories."""
    table['Virus_to_assign'] = table['Virus_to_assign'].apply(
        lambda x: _normalize_category(x, virus_category_rules, 'v_'))
    table['Test_to_assign'] = table['Test_to_assign'].apply(
        lambda x: _normalize_category(x, test_category_rules, 't_'))
    return table


def _group_codes(table, code_column):
    """Return (codes by virus, codes by test type) for one table, without the unknown groups."""
    by_virus = {}
    by_test = {}
    for virus, test, code in zip(table['Virus_to_assign'], table['Test_to_assign'], table[code_column]):
        by_virus.setdefault(virus, []).append(code)
        by_test.setdefault(test, []).append(code)

    #remove the unk ones (pop: no error when the sheet has no unknown rows)
    by_virus.pop('v_unk', None)
    by_test.pop('t_unk', None)
    return by_virus, by_test


#LOINCs: respiratory sheet plus COVID19 sheet
#pd.concat replaces DataFrame.append, which was removed in pandas 2.0
df_loincs = pd.read_excel(codes_workbook, sheet_name='Resp_LOINCs')
df_loincs_covid = pd.read_excel(codes_workbook, sheet_name='COVID19_LOINCs')
df_loincs = _clean_code_table(pd.concat([df_loincs, df_loincs_covid]))

#assign LOINCs to virus and test type
loincs_by_v, loincs_by_t = _group_codes(df_loincs, 'LOINCs')

#TR codes: respiratory sheet plus COVID19 sheet
df_tr_codes = pd.read_excel(codes_workbook, sheet_name='Resp_TRs')
df_tr_covid = pd.read_excel(codes_workbook, sheet_name='COVID19_TRs')
df_tr_codes = _clean_code_table(pd.concat([df_tr_codes, df_tr_covid]))

#assign TR codes to virus and test type
tr_codes_by_v, tr_codes_by_t = _group_codes(df_tr_codes, 'TRs')

In [None]:
# assign more details to v_unk or t_unk based on LOINC and TR code
# group by test type and then type of virus, remove duplicates
def process_result(tokens, testrequestcode, observationcode, results):
    """Finalize the interpreted results of one record.

    Unknown viruses/tests are filled in from the LOINC (observation code)
    first and the TR (test request) code second. Results whose virus is still
    unknown are dropped, and the rest are merged so that each virus appears
    once per test type with hierarchy pos > neg > ind > inv.

    tokens: cleaned tokens of the record.
    testrequestcode, observationcode: codes looked up in the reference dictionaries.
    results: list of [virus, result, test] as returned by interpret.
    Returns a dict mapping test type to a list of (virus, result) tuples;
    empty when the record says the test was not performed.
    """
    grouped = {}

    def has_phrase(phrase):
        width = len(phrase)
        return any(tokens[start:start + width] == phrase
                   for start in range(len(tokens) - width + 1))

    #delete all results if there is 'not performed' or 'swab is required for both'
    if has_phrase(['not', 'performed']):
        return grouped
    if has_phrase(['swab', 'is', 'required', 'for', 'both']):
        return grouped

    def first_category(code_lists, code):
        return next((category for category, codes in code_lists.items() if code in codes), None)

    #determine virus or test based on LOINC or TR
    virus_from_loinc = first_category(loincs_by_v, observationcode)
    virus_from_tr = first_category(tr_codes_by_v, testrequestcode)
    test_from_loinc = first_category(loincs_by_t, observationcode)
    test_from_tr = first_category(tr_codes_by_t, testrequestcode)

    #for each incoming result, the stored results that it must NOT overwrite
    protected_from = {'pos': (), 'neg': ('pos',), 'ind': ('pos', 'neg')}

    for v, r, t in results:
        #fill in unknown virus, LOINC taking priority over TR code
        if v == 'v_unk':
            if virus_from_loinc is not None:
                v = virus_from_loinc
            elif virus_from_tr is not None:
                v = virus_from_tr

        #fill in unknown test, LOINC taking priority over TR code
        if t == 't_unk':
            if test_from_loinc is not None:
                t = test_from_loinc
            elif test_from_tr is not None:
                t = test_from_tr

        #fill in pcr if there is a pcr term in text
        if t == 't_unk' and 'pcr' in tokens:
            t = 't_pcr'

        #remove unknown virus results
        if v == 'v_unk':
            continue

        #drop the 'v_' / 'r_' / 't_' prefixes
        v, r, t = v[2:], r[2:], t[2:]
        #all tests that aren't pcr are oth
        #t = t if t == 'pcr' else 'oth'

        #ASSUME EVERYTHING PCR FOR COVID DATASET
        t = 'pcr'

        bucket = grouped.setdefault(t, [])

        #compiling results: update the entry of the same virus, or add a new one
        already_listed = False
        for pos, (listed_virus, listed_result) in enumerate(bucket):
            if listed_virus != v:
                continue
            already_listed = True
            #'inv' (and anything unrecognized) never overwrites a stored result
            if r in protected_from and listed_result not in protected_from[r]:
                bucket[pos] = (v, r)
        if not already_listed:
            bucket.append((v, r))

    return grouped

In [None]:
#create output as character value for each virus,
#assigning results with hierarchy: P (positive) > N (negative) > I (indeterminate) > R (invalid)

#viruses that are reported in a single column named after the virus
_single_column_viruses = ('adenovirus', 'bocavirus', 'coronavirus', 'entero_rhino', 'hmv', 'covid')

#virus families: main column -> (marker looked for in the virus name, subtype column)
_family_subtype_columns = {
    'para': (),
    'flu': (('_a', 'flu_a'), ('_h1', 'flu_a_h1'), ('_h3', 'flu_a_h3'), ('_b', 'flu_b')),
    'rsv': (('_a', 'rsv_a'), ('_b', 'rsv_b')),
}

#output characters from highest to lowest priority, and the rank of each result
_result_chars = ('P', 'N', 'I', 'R')
_result_rank = {'pos': 0, 'neg': 1, 'ind': 2, 'inv': 3}


def _set_result_char(column, ind, r):
    """Write the character for result r into df_results unless a higher-priority one is there."""
    rank = _result_rank.get(r)
    if rank is None:
        return
    #.at writes to the frame itself; chained df_results[col][ind] = ... may write to a copy
    if df_results.at[ind, column] not in _result_chars[:rank]:
        df_results.at[ind, column] = _result_chars[rank]


def char_output(results, ind):
    """Fill row ind of the module-level df_results from one record's results.

    results: dict mapping test type to a list of (virus, result) tuples, as
             returned by process_result.
    ind: index label of the row of df_results to update.
    Returns None; df_results is modified in place.
    """
    #loop through each test type and virus
    for t, pairs in results.items():
        for v, r in pairs:
            if v in _single_column_viruses:
                _set_result_char(v, ind, r)
                continue

            for family, subtype_columns in _family_subtype_columns.items():
                if v.startswith(family):
                    #family column first, then every subtype named in the virus label
                    _set_result_char(family, ind, r)
                    for marker, column in subtype_columns:
                        if marker in v:
                            _set_result_char(column, ind, r)
                    break

    return

### Initial processing

In [None]:
#work on a copy so the imported df stays untouched
df_unique = df.copy()

#clean text
df_unique["cleaned_value"] = df_unique["value"].apply(clean)

#collapse to one row per unique (lab, TR code, observation code, cleaned string):
#count how many records share it and keep their original index labels as a tuple,
#most frequent strings first
groupby_vars = ['ReportingLabOrgName', 'TestRequestCode', 'ObservationCode', 'cleaned_value']
df_unique = (
    df_unique.reset_index()
    .groupby(groupby_vars)
    .agg({'value': 'count', 'index': lambda x: tuple(x)})
    .reset_index()
    .rename(columns={'value': 'count', 'index': 'original_indexes'})
    .sort_values(by=['count'], ascending=False)
    .reset_index(drop=True)
)
print('unique strings after cleaning:', len(df_unique))

#tokenize
#all_words keeps the unique tokens for testing purposes
#NOTE(review): presumably tokenize appends to this module-level list - confirm against tokenize
all_words = []
df_unique["cleaned_tokenized_value"] = df_unique["cleaned_value"].apply(tokenize)
all_words = list(set(all_words))

#assign labels using dictionary
df_unique["useful_tokens"] = df_unique["cleaned_tokenized_value"].apply(assign_labels)

#interpret the labelled tokens
df_unique["initial_results"] = df_unique["useful_tokens"].apply(interpret)

#fill in unknown viruses based on LOINC or TR code, roll up results to one test type
#(one entry per row of df_unique, in row order)
final_results = [
    process_result(tokens, tr_code, obs_code, initial)
    for tokens, tr_code, obs_code, initial in zip(
        df_unique["cleaned_tokenized_value"],
        df_unique["TestRequestCode"],
        df_unique["ObservationCode"],
        df_unique["initial_results"],
    )
]

### Assign results

In [None]:
#translate results to 1-character format
#one result column per virus / subtype
col_virus = ['covid', 'adenovirus', 'bocavirus', 'coronavirus', 'flu', 'flu_a', 'flu_a_h1', 'flu_a_h3', 'flu_b',
             'entero_rhino', 'hmv', 'para', 'rsv', 'rsv_a', 'rsv_b']
cols = list(col_virus)

#empty results table: one row per unique string, labelled 0..n-1
n_unique = len(df_unique)
df_results = pd.DataFrame(index=np.arange(n_unique), columns=cols)

#char_output writes the 1-character codes for row ind into df_results
for ind in range(n_unique):
    char_output(final_results[ind], ind)

df_results['covid'].value_counts()

In [None]:
#tracker for unique records
#the tracker is keyed on cleaned_value, so try not to change the clean function:
#a different cleaning would make already-tracked strings look new

#initialize tracker for first time
# df_tracker = pd.DataFrame(columns=['filename', 'processed_date', 'ReportingLabOrgName', 'TestRequestCode', 'ObservationCode', 'cleaned_value'])
# df_tracker.to_pickle("./string_tracker.pkl")

#NOTE(review): unpickling can run code stored in the file - only load a tracker written by this notebook
df_tracker = pd.read_pickle('./string_tracker.pkl')

#reset tracker ONLY USE TO FULLY RESET
# df_tracker = df_tracker.iloc[0:0]

#columns that identify a unique string
tracker_key_cols = ['ReportingLabOrgName', 'TestRequestCode', 'ObservationCode', 'cleaned_value']
df_tracker_orig = df_tracker[tracker_key_cols].copy()
df_tracker_delta = df_unique[tracker_key_cols].copy()

#set difference (current strings minus tracked strings):
#the tracker rows are stacked twice so each of them is always a duplicate, and
#keep=False then drops every duplicated row - only strings not yet tracked survive;
#the current strings are stacked first and the rows renumbered from 0, so the
#surviving row labels are their row positions in df_unique
stacked = pd.concat([df_tracker_delta, df_tracker_orig, df_tracker_orig], ignore_index=True)
df_tracker_delta = stacked.drop_duplicates(keep=False)
print('Original tracker length:', len(df_tracker_orig))
print('Delta tracker length:', len(df_tracker_delta))

In [None]:
#intermediate output for checking results
int_output_cols = ['count', 'ReportingLabOrgName', 'TestRequestCode', 'ObservationCode', 'cleaned_value']

#every unique string next to its assigned results
review_table = df_unique[int_output_cols].join(df_results)
review_table.to_csv('intermediate_output.csv')

#same table restricted to rows whose label appears in the tracker delta
in_delta = df_unique.index.isin(df_tracker_delta.index)
review_table[in_delta].to_csv('intermediate_output_delta.csv')

## Final output

In [None]:
#one row of results per unique string, next to the original record labels it covers
df_create_output = df_unique[['original_indexes']].join(df_results)

#original_indexes holds labels of df.index, which equal list positions only while
#df keeps its default 0..n-1 index; translate label -> position so that a
#row-filtered df neither misplaces results nor raises IndexError
position_of = {label: position for position, label in enumerate(df.index)}

#start every record with an all-missing row of the right width, so a record that
#no unique string points back to still yields a well-formed row
missing_row = (float('nan'),) * len(cols)
output = [missing_row] * len(df)

#order results based on original_indexes
for row in df_create_output.itertuples():
    for i in row[1]: #original_indexes
        output[position_of[i]] = row[2:] #row[2:] = the result columns, in cols order

#one row per original record, in the same order as df
df_output = pd.DataFrame(output, columns=cols)

In [None]:
#frequency of each covid result value over all original records (missing values are not counted by default)
df_output['covid'].value_counts()

In [None]:
#summary of the result columns as a quick sanity check before writing the output
df_output.describe()

In [None]:
#FINAL DATASET TO OUTPUT
#one row per original input record, one column per entry of cols;
#to_csv also writes the row index as the first column by default
df_output.to_csv('output.csv')

In [None]:
#FINALIZE THE STRING TRACKER
today = '20200413' #set today's date (stored in processed_date)

#stamp the newly seen strings with the source file and the processing date
df_tracker_delta = df_tracker_delta.assign(filename=input_filename, processed_date=today)

#append the new strings to the tracker loaded earlier
df_tracker = pd.concat([df_tracker, df_tracker_delta], sort=False, ignore_index=True)

#overwrite the tracker file and report its new size
df_tracker.to_pickle("./string_tracker.pkl")
print('Records in tracker:', len(df_tracker))

In [None]:
#display the tracker
df_tracker

## Testing and validation

In [None]:
#check specific record
#prints each processing stage for one row of df_unique

x = 0 #row label in df_unique to inspect; change as needed (a blank value is a SyntaxError)
print('cleaned tokenized value:', list(df_unique["cleaned_tokenized_value"][x]))
print('useful tokens:', list(df_unique["useful_tokens"][x]))
print('initial results:', list(df_unique["initial_results"][x]))
print(final_results[x])

In [None]:
#test word in all words
test_word = ''

#unique tokens that contain test_word as a substring
words_containing = [candidate for candidate in all_words if test_word in candidate]

#number of unique records whose tokenized value contains test_word
n_records = sum(1 for tokens in df_unique["cleaned_tokenized_value"] if test_word in tokens)

print('All unique words that contain "' + test_word + '":', words_containing)
print('All instances of exactly "' + test_word + '":', n_records)

In [None]:
#check preceding words and frequencies
#d_preceding maps each token found directly before `word` to how often that happens
d_preceding = {}
word = ''
for i in range(len(df_unique)):
    tokens = df_unique["cleaned_tokenized_value"][i]
    #start at position 1: the first token has no predecessor, and index -1 would
    #silently wrap around to the last token of the record
    for j in range(1, len(tokens)):
        if tokens[j] == word:
            previous_term = tokens[j-1]
            d_preceding.setdefault(previous_term, 0)
            d_preceding[previous_term] += 1
print(d_preceding)

In [None]:
#check following words and frequencies
#d_following maps each token found directly after `word` to how often that happens
d_following = {}
word = ''
for tokens in df_unique["cleaned_tokenized_value"]:
    #walk adjacent (token, successor) pairs; the last token has no successor
    for current, following in zip(tokens, tokens[1:]):
        if current == word:
            d_following[following] = d_following.get(following, 0) + 1
print(d_following)