# Step 3: Quality Check

## Setup

In [1]:
import re
import pandas as pd
import numpy as np
from spellchecker import SpellChecker
from master_functions import *

In [2]:
# Suppress false positive warnings
# NOTE(review): filterwarnings("ignore") is called without a category or
# module filter, so it silences every warning raised after this cell, not
# only the false positives -- consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

#### Load Datasets

In [3]:
# Step 2 output of the crosswalk, read straight from GitHub
crosswalk_url = 'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/code/Step2Output_Clinical_Abbreviation_Acronym_Crosswalk.csv'

# Pipe-delimited file with a header row. dtype=object and na_filter=False
# load every field as-is, so empty fields stay '' instead of becoming NaN.
read_options = dict(sep='|',
                    header=0,
                    index_col=False,
                    na_filter=False,
                    dtype=object)
df = pd.read_csv(crosswalk_url, **read_options)

In [4]:
# Preview three sampled rows; random_state=0 fixes the seed so the same rows are shown on every run
df.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
368728,,E368729,T-ALL,S068102,t_all,N067915,,T cell acute lymphoblastic leukemias,L032523,T cell acute lymphoblastic leukemia,,ADAM,,,,0.7357,193,,
320521,,E320522,Emax,S023535,emax,N022914,,maximal responses,L107849,,,ADAM,,,,0.2113,24,,
311671,,E311672,CTG,S015579,ctg,N017580,,connective tissue graft,L060880,connective tissue graft,,ADAM,,,,0.7103,10,,


In [5]:
# (rows, columns) of the crosswalk as loaded
df.shape

(409668, 19)

## Identify Errors

#### Heuristic 1: Duplicates

Identify entries that exactly duplicate another entry from the same source, i.e. entries that share the same short form (SF), long form (LF), and source with an earlier entry.

In [6]:
# Flag every row whose (SF, LF, Source) triple already appeared in an earlier
# row; the first occurrence of each triple is not flagged.
is_repeat = df.duplicated(subset=['SF', 'LF', 'Source'], keep='first')
Extract1 = df.loc[is_repeat]
Extract1.shape

(3933, 19)

#### Heuristic 2: Punctuation

Identify excess punctuation in the long form (e.g. "nitric oxide;").

In [7]:
# Punctuation after LF (excludes .+%()[])
# The pattern is a raw string: it is full of regex escapes (\/, \$, \^, ...)
# that are invalid *string* escapes and trigger warnings in a normal literal.
# na=False replaces the former `== True` comparison: a missing value counts
# as "no match".
Extract2_1 = df[df['LF'].str.contains(r'.*[,\/#!\$\^&@\?<>\*:;{}=\-_\'~\"]$', na=False)]
Extract2_1.shape

(55, 19)

In [8]:
# Punctuation before LF (excludes .+%()[])
# The pattern is a raw string: it is full of regex escapes (\/, \$, \^, ...)
# that are invalid *string* escapes and trigger warnings in a normal literal.
# na=False replaces the former `== True` comparison: a missing value counts
# as "no match".
Extract2_2 = df[df['LF'].str.contains(r'^[,\/#!\$\^&@\?<>\*:;{}=\-_\'~\"].*', na=False)]
Extract2_2.shape

(7, 19)

Identify excess punctuation in the short form (e.g. "..IVF").

In [9]:
# Excess periods before SF
# Raw string so that "\." reaches the regex engine untouched instead of being
# an invalid string escape; na=False replaces the former `== True`
# comparison (a missing value counts as "no match").
Extract2_3 = df[df['SF'].str.contains(r'^[\.]+.*', na=False)]
Extract2_3.shape

(76, 19)

#### Heuristic 3: Spelling

Identify long forms that contain spelling errors (e.g. "cncer").

In [10]:
# Spell checker configured with distance=1
spell = SpellChecker(distance=1)

# Extend its vocabulary with the medical word corpus (UMLS Metathesaurus)
spell.word_frequency.load_text_file('Data/ClinSpell.txt')

# Check every source except UMLS and ADAM
subset = df[~df['Source'].isin(['UMLS', 'ADAM'])]

# Outputs filled by the spell-check loop: the flagged EntryIDs and,
# in the same order, the unknown words found for each of them
misspelled_rows, misspelled_data = [], []

In [11]:
# Start from empty lists so that re-running this cell cannot append a second
# copy of every result.
misspelled_rows = []
misspelled_data = []

# Iterate over subset (only EntryID and LF are needed, so the two columns are
# walked directly instead of building a Series per row with iterrows)
for entry_id, long_form in zip(subset['EntryID'], subset['LF']):

    # Format LF for spellchecker: keep only letters, whitespace and hyphens,
    # lowercase, then split on whitespace/hyphens and drop empty pieces.
    # Raw strings avoid the invalid "\s" and "\-" string escapes.
    pre_token = re.sub(r'[^A-Za-z\s\-]+', '', long_form).lower()
    token = [word for word in re.split(r'[\s\-]+', pre_token) if word]

    # Identify misspelled LFs: words the spell checker does not know
    misspelled = spell.unknown(token)
    if len(misspelled) > 0:
        misspelled_rows.append(entry_id)
        misspelled_data.append(misspelled)

In [12]:
# Pull the flagged rows out of the full crosswalk by EntryID
has_spelling_issue = df['EntryID'].isin(misspelled_rows)
Extract3 = df.loc[has_spelling_issue]
Extract3.shape

(350, 19)

#### Heuristic 4: Content

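Flag entries whose short form contains alphabetic characters that don't occur anywhere in the long form. A short form whose only missing letter is "x" is tolerated, and only the two Vanderbilt sources are checked.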

In [13]:
# Keep only the two Vanderbilt sources
vanderbilt_sources = ['Vanderbilt Clinic Notes', 'Vanderbilt Discharge Sums']
subset = df[df['Source'].isin(vanderbilt_sources)]

# Outputs filled by the content loop: the flagged EntryIDs and,
# in the same order, the letters missing from each long form
missing_character, missing_char_data = [], []

In [14]:
# Start from empty lists so that re-running this cell cannot append a second
# copy of every result.
missing_character = []
missing_char_data = []

# Iterate over dataframe (only the three columns that are actually used)
for entry_id, short_form, long_form in zip(subset['EntryID'], subset['SF'], subset['LF']):

    # Extract alphabetic characters (case-insensitive, as sets of letters)
    alph_SF = set(re.sub('[^A-Za-z]+', '', short_form).lower())
    alph_LF = set(re.sub('[^A-Za-z]+', '', long_form).lower())

    # Letters of the short form that never occur in the long form.
    # A difference of exactly {'x'} is deliberately not reported.
    absent = alph_SF - alph_LF
    if absent and absent != {'x'}:
        missing_character.append(entry_id)
        missing_char_data.append(absent)

In [15]:
# Pull the rows flagged by the content check out of the full crosswalk
lacks_sf_letters = df['EntryID'].isin(missing_character)
Extract4 = df.loc[lacks_sf_letters]
Extract4.shape

(217, 19)

#### Heuristic 5: User-Identified

In [16]:
# Long forms containing any of these marker texts are flagged
# (presumably notes left by the original contributors -- TODO confirm).
user_markers = ["#000066", "typo", "not an abbreviation", "not an acronym"]
Extract5 = df[df['LF'].str.contains("|".join(user_markers))]
Extract5.shape

(49, 19)

## Format

#### Add Columns

In [17]:
# Error type, decision, modification
# DataFrame.assign returns a new frame, so the two columns are no longer
# written onto filtered selections of df (the chained-assignment pattern
# behind SettingWithCopyWarning).
Extract1 = Extract1.assign(error="duplicate", action="retire")
Extract2_1 = Extract2_1.assign(error="punctuation after LF", action="modify")
Extract2_2 = Extract2_2.assign(error="punctuation before LF", action="modify")
Extract2_3 = Extract2_3.assign(error="punctuation before SF", action="modify")

# The per-row details are looked up by EntryID rather than attached by list
# position, so they cannot end up on the wrong row (or fail on a length
# mismatch) if the result lists and the extracts ever get out of step.
misspelled_by_entry = dict(zip(misspelled_rows, misspelled_data))
missing_by_entry = dict(zip(missing_character, missing_char_data))
Extract3 = Extract3.assign(error=Extract3['EntryID'].map(misspelled_by_entry.get),
                           action="modify")
Extract4 = Extract4.assign(error=Extract4['EntryID'].map(missing_by_entry.get),
                           action="modify")

Extract5 = Extract5.assign(error="user identified", action="retire")

#### Merge

In [18]:
# Stack all extracts into one table; an entry caught by several heuristics
# appears once per extract at this point.
all_extracts = [Extract1, Extract2_1, Extract2_2, Extract2_3, Extract3, Extract4, Extract5]
errors = pd.concat(all_extracts)
errors.shape

(4687, 21)

In [19]:
# Keep one row per EntryID: the first occurrence in concatenation order wins
errors = errors.drop_duplicates(subset=["EntryID"], keep="first")
errors.shape

(4672, 21)

#### Export

In [20]:
# Write the automatically flagged entries to a pipe-delimited file,
# with a header row and without the DataFrame index.
errors.to_csv('Data/Errors_Automated.csv',
              index=False,
              header=True,
              sep='|')

## Import Errors

#### Import Annotated

In [21]:
# NOTE(review): this reads Data/Errors_Annotated.csv, not the
# Data/Errors_Automated.csv file written above -- presumably the manually
# annotated version of that export; confirm it exists before running.
annotated_path = 'Data/Errors_Annotated.csv'

# Same reader settings as for the crosswalk: pipe-delimited, header row,
# every field loaded as-is with empty fields kept as ''.
errors = pd.read_csv(annotated_path,
                     sep='|', header=0, index_col=False,
                     na_filter=False, dtype=object)

In [22]:
# Preview three sampled rows of the annotated errors (fixed seed for a stable preview)
errors.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,...,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,error,action
1364,,E157554,RTE,S061215,rte,N061480,,trans-epithelial resistance,L159051,,...,UMLS,E0692002,E0691991,acronym,,,,,duplicate,retire
2263,,E187545,NOK,S049019,nok,N048752,,next-of-kin,L117292,,...,UMLS,E0700030,E0700026,acronym,,,,,duplicate,retire
2962,,E219691,Vp1,S075865,vp1,N074314,,Viviparous-1,L035785,,...,UMLS,E0720795,E0720793,acronym,,,,,duplicate,retire


In [23]:
# (rows, columns) of the annotated errors table
errors.shape

(4474, 21)

In [24]:
# Number of annotated entries per value of the action column
errors['action'].value_counts()

retire    4018
modify     292
none       164
Name: action, dtype: int64

#### Remove None

In [25]:
# Drop the entries whose action is 'none'
needs_action = errors['action'].ne('none')
errors = errors.loc[needs_action]
errors.shape

(4310, 21)

#### Subset Crosswalk

In [26]:
# Keep a handle on the complete crosswalk before filtering.
# NOTE(review): not re-runnable -- a second run would bind df_all to the
# already filtered frame.
df_all = df

# Remove every entry that is still listed in the errors table
is_flagged = df['EntryID'].isin(errors['EntryID'])
df = df[~is_flagged]
df.shape

(405358, 19)

#### Subset Errors

In [27]:
# EntryIDs annotated with action 'retire' ...
retire_ids = errors.loc[errors['action'] == 'retire', 'EntryID']
# ... and their rows as they appear in the full crosswalk
retire = df_all[df_all['EntryID'].isin(retire_ids)]
retire.shape

(4018, 19)

In [28]:
# Entries annotated with action 'modify', cut down by position to the
# first 19 columns of the errors table.
is_modify = errors['action'] == 'modify'
modify = errors[is_modify].iloc[:, :19]
modify.shape

(292, 19)

## Modify

#### Retire Duplicates

In [29]:
# Identify duplicates
# A row is flagged when its (SF, LF, Source) already occurs earlier in the
# combined frame. df is placed first, so when a modified entry clashes with a
# crosswalk entry it is the modified entry that gets flagged.
dups = pd.concat([df, modify])
dups = dups[dups.duplicated(['SF', 'LF', 'Source'])]

# Only modified entries may be reported here: a crosswalk row that merely
# repeats another crosswalk row stays in df, so it must not also be added
# to the retired set by the following cells.
dups = dups[dups['EntryID'].isin(modify['EntryID'])]
dups.shape

(107, 19)

In [30]:
# Drop the duplicated entries from modify and renumber the rows from 0
is_dup = modify['EntryID'].isin(dups['EntryID'])
modify = modify.loc[~is_dup].reset_index(drop=True)
modify.shape

(185, 19)

In [31]:
# Append the duplicated entries, as they appear in the full crosswalk, to retire.
# NOTE(review): not re-runnable -- every extra run appends the same rows again.
dup_originals = df_all[df_all['EntryID'].isin(dups['EntryID'])]
retire = pd.concat([retire, dup_originals]).reset_index(drop=True)
retire.shape

(4125, 19)

#### Strip Source Data

The identifier and metadata columns derived from the original source are cleared, because they may no longer be valid once an entry has been modified.

In [32]:
# Blank out every column except GroupID, EntryID, SF, LF and Source
stale_columns = ['SFUI', 'NormSF', 'NSFUI', 'PrefSF',
                 'LFUI', 'NormLF', 'PrefLF', 'SFEUI',
                 'LFEUI', 'Type', 'Score', 'Count',
                 'Frequency', 'UMLS.CUI']
for column in stale_columns:
    modify[column] = ''

In [33]:
# Preview three sampled rows after the columns were blanked (fixed seed for a stable preview)
modify.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
33,,E390742,ectr,,,,,endoscopic carpal tunnel release,,,,Berman,,,,,,,
125,,E402784,po,,,,,per os,,,,Vanderbilt Clinic Notes,,,,,,,
173,,E409135,e,,,,,exempli gratia,,,,Stetson,,,,,,,


#### Reassign NormSF

In [34]:
# Recompute NormSF for every modified entry from its SF.
# NOTE(review): normalized_short_form is not defined in this notebook --
# presumably it comes from the master_functions star import.
modify['NormSF'] = modify['SF'].apply(normalized_short_form)

In [35]:
# Preview three sampled rows with the recomputed NormSF (fixed seed for a stable preview)
modify.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
33,,E390742,ectr,,ectr,,,endoscopic carpal tunnel release,,,,Berman,,,,,,,
125,,E402784,po,,po,,,per os,,,,Vanderbilt Clinic Notes,,,,,,,
173,,E409135,e,,e,,,exempli gratia,,,,Stetson,,,,,,,


#### Reassign SFUI

In [36]:
# Search existing SFUIs
# Build a lookup from each distinct SF in the full crosswalk to the SFUI of
# its first row, then resolve all modified entries in one pass; entries
# without a match get ''.
# This replaces a row-by-row loop that wrote through modify['SFUI'].iat[...]
# (chained assignment, which no longer updates modify under pandas
# copy-on-write), used the index label as a position, and rescanned df_all
# for every entry.
first_sfui_by_sf = df_all.drop_duplicates(subset='SF').set_index('SF')['SFUI']
modify['SFUI'] = modify['SF'].map(first_sfui_by_sf).fillna('')

In [37]:
# If none, add SFUI
# NOTE(review): add_new_SFUI is not defined in this notebook -- presumably it
# comes from the master_functions star import; its exact behaviour
# is not visible here.
modify = add_new_SFUI(modify)
modify.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
33,,E390765,ees,S085737,ees,,,erythromycin ethylsuccinate,,,,Berman,,,,,,,
125,,E402785,P.O,S052990,po,,,per os,,,,Vanderbilt Clinic Notes,,,,,,,
173,,E409172,lee,S090443,lee,,,lower extremity edema,,,,Stetson,,,,,,,


#### Reassign LFUI

In [38]:
# Search existing LFUIs
# Build a lookup from each distinct LF in the full crosswalk to the LFUI of
# its first row, then resolve all modified entries in one pass; entries
# without a match get ''.
# This replaces a row-by-row loop that wrote through modify['LFUI'].iat[...]
# (chained assignment, which no longer updates modify under pandas
# copy-on-write), used the index label as a position, and rescanned df_all
# for every entry.
first_lfui_by_lf = df_all.drop_duplicates(subset='LF').set_index('LF')['LFUI']
modify['LFUI'] = modify['LF'].map(first_lfui_by_lf).fillna('')

In [39]:
# If none, add LFUI
# NOTE(review): add_new_LFUI is not defined in this notebook -- presumably it
# comes from the master_functions star import; its exact behaviour
# is not visible here.
modify = add_new_LFUI(modify)
modify.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
33,,E395988,mect,S091844,mect,,,multiple-monitored electroconvulsive therapy,L114103,,,Berman,,,,,,,
125,,E408683,ODYSSEY LONG TERM,S051007,odyssey long term,,,Long-term Safety and Tolerability of Alirocuma...,L170578,,,Wikipedia,,,,,,,
173,,E401143,pc,S095391,pc,,,pectus carinatum,L170623,,,Berman,,,,,,,


#### Reassign NSFUI

In [40]:
# Search existing NSFUIs
# Build a lookup from each distinct NormSF in the full crosswalk to the NSFUI
# of its first row, then resolve all modified entries in one pass; entries
# without a match get ''.
# This replaces a row-by-row loop that wrote through modify['NSFUI'].iat[...]
# (chained assignment, which no longer updates modify under pandas
# copy-on-write), used the index label as a position, and rescanned df_all
# for every entry.
first_nsfui_by_normsf = df_all.drop_duplicates(subset='NormSF').set_index('NormSF')['NSFUI']
modify['NSFUI'] = modify['NormSF'].map(first_nsfui_by_normsf).fillna('')

In [41]:
# If none, add NSFUI
# NOTE(review): add_new_NSFUI is not defined in this notebook -- presumably it
# comes from the master_functions star import; its exact behaviour
# is not visible here.
modify = add_new_NSFUI(modify)
modify.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
33,,E395988,mect,S091844,mect,N043368,,multiple-monitored electroconvulsive therapy,L114103,,,Berman,,,,,,,
125,,E408683,ODYSSEY LONG TERM,S051007,odyssey long term,N050128,,Long-term Safety and Tolerability of Alirocuma...,L170578,,,Wikipedia,,,,,,,
173,,E401143,pc,S095391,pc,N053077,,pectus carinatum,L170623,,,Berman,,,,,,,


#### Add "Modified" Column

In [42]:
# Mark which rows were edited: "Modified" for the modified entries, '' for
# the untouched crosswalk rows.
# assign() returns a new frame, so the column is not written onto df, which
# at this point is a filtered selection of the original crosswalk (the
# pattern that triggers SettingWithCopyWarning).
modify = modify.assign(Modified="Modified")
df = df.assign(Modified="")

#### Append to Crosswalk

In [43]:
# Put the modified entries back into the crosswalk, order by EntryID and
# renumber the rows from 0.
# NOTE(review): not re-runnable -- every extra run appends modify again.
df = (
    pd.concat([df, modify])
    .sort_values(by=['EntryID'])
    .reset_index(drop=True)
)
df.shape

(405543, 20)

## Export

#### Export Modify

In [44]:
# Rebind modify to the rows of the same entries as they appear in the full
# crosswalk (df_all), i.e. before modification
modified_ids = modify['EntryID']
modify = df_all[df_all['EntryID'].isin(modified_ids)]
modify.shape

(185, 19)

In [45]:
# Write the modify table to a pipe-delimited file, with a header row and
# without the DataFrame index.
modify.to_csv('Final/Modified.csv',
              index=False,
              header=True,
              sep='|')

#### Export Retire

In [46]:
# Write the retired entries to a pipe-delimited file, with a header row and
# without the DataFrame index.
retire.to_csv('Final/Retired.csv',
              index=False,
              header=True,
              sep='|')

#### Export Crosswalk

In [47]:
# Write the updated crosswalk to a pipe-delimited file in the current
# working directory, with a header row and without the DataFrame index.
df.to_csv('Step3Output_Clinical_Abbreviation_Acronym_Crosswalk.csv',
          index=False,
          header=True,
          sep='|')