# Step 4: Quality Check

## Setup

In [20]:
import pandas as pd
import numpy as np
import re
from spellchecker import SpellChecker

In [22]:
# Suppress false positive warnings
# NOTE(review): filterwarnings("ignore") with no category or module argument
# silences every warning for the rest of the session, not only the ones judged
# to be false positives — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

#### Load Datasets

In [2]:
# Parsing options for the Step 2 crosswalk: pipe-delimited, first row is the
# header, no column promoted to the index, no NaN detection (na_filter=False),
# and every column read with dtype=object.
read_options = dict(sep='|', header=0, index_col=False, na_filter=False, dtype=object)

# Source of the crosswalk produced by Step 2
crosswalk_url = 'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/modules/Step2Output_Clinical_Abbreviation_Acronym_Crosswalk.csv'

df = pd.read_csv(crosswalk_url, **read_options)

In [3]:
# Preview three randomly drawn rows; the fixed random_state makes the same
# rows come back on every run.
df.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
33582,,E033583,α-GPDH,S103760,α_gpdh,N076113,,alpha-glycerophosphate dehydrogenase,L041419,alpha glycerophosphate dehydrogenase,,UMLS,E0412935,E0769306,acronym,,,,
70982,,E070983,PRPPs,S056625,prpps,N057119,,phosphoribosyl pyrophosphate synthetase,L128319,phosphoribosylpyrophosphate synthetase,,UMLS,E0571504,E0047486,acronym,,,,
192802,,E192803,IPH,S036667,iph,N036838,,intra-peritoneal haemorrhage,L097757,,,UMLS,E0703883,E0703882,acronym,,,,


In [4]:
# Size of the loaded table as (rows, columns)
df.shape

(413964, 19)

#### Instantiate Output

In [5]:
# Accumulators for the extracted frames: the heuristics below append their
# extracts to one of these two lists.
modify = []
retire = []

## Identify Errors

#### Heuristic 1: Duplicates

Identify which entries exactly duplicate another entry from the same source. 

In [6]:
# Flag every row whose (SF, LF, Source) triple already appeared earlier in the
# table. The first occurrence of each triple is not flagged and stays in df.
is_duplicate = df.duplicated(['SF', 'LF', 'Source'])

# .copy() makes the extract an independent frame, so adding the 'error' column
# is a plain assignment rather than a write to a slice of df.
Extract1 = df[is_duplicate].copy()
df = df[~is_duplicate]
Extract1['error'] = "duplicate"
Extract1.shape

(3942, 19)

In [7]:
# Queue the duplicate rows on the retire list
retire.append(Extract1)

#### Heuristic 2: Punctuation

Identify excess punctuation in the long form (e.g. "nitric oxide;").

In [25]:
# Punctuation after LF (excludes .+%)
# Raw string: the pattern is full of regex escapes (\/ \$ \^ ...) that are not
# valid Python string escapes.
trailing_punct = r'.*[,\/#!\$\^&@\?<>\*:;{}=\-_\'~\(\)\[\]\"]$'

# Evaluate the pattern once; na=False treats a missing LF as "no match" so
# such a row stays in df instead of dropping out of both frames.
has_trailing_punct = df['LF'].str.contains(trailing_punct, na=False)

# .copy() so that adding the 'error' column does not write to a slice of df
Extract2_1 = df[has_trailing_punct].copy()
df = df[~has_trailing_punct]
Extract2_1['error'] = "punctuation after LF"
Extract2_1.shape

(0, 20)

In [9]:
# Punctuation before LF (excludes .+%()[])
# Raw string: the pattern is full of regex escapes (\/ \$ \^ ...) that are not
# valid Python string escapes.
leading_punct = r'^[,\/#!\$\^&@\?<>\*:;{}=\-_\'~\"].*'

# Evaluate the pattern once; na=False treats a missing LF as "no match" so
# such a row stays in df instead of dropping out of both frames.
has_leading_punct = df['LF'].str.contains(leading_punct, na=False)

# .copy() so that adding the 'error' column does not write to a slice of df
Extract2_2 = df[has_leading_punct].copy()
df = df[~has_leading_punct]
Extract2_2['error'] = "punctuation before LF"
Extract2_2.shape

(5, 19)

Identify excess punctuation in the short form (e.g. "..IVF").

In [10]:
# Excess periods before SF
# Raw string: \. is a regex escape, not a valid Python string escape.
leading_periods = r'^[\.]+.*'

# Evaluate the pattern once; na=False treats a missing SF as "no match" so
# such a row stays in df instead of dropping out of both frames.
has_leading_periods = df['SF'].str.contains(leading_periods, na=False)

# .copy() so that adding the 'error' column does not write to a slice of df
Extract2_3 = df[has_leading_periods].copy()
df = df[~has_leading_periods]
Extract2_3['error'] = "punctuation before SF"
Extract2_3.shape

(131, 19)

Append output

In [12]:
# Stack the three punctuation extracts row-wise into a single frame
punctuation_extracts = [Extract2_1, Extract2_2, Extract2_3]
Extract2 = pd.concat(punctuation_extracts)
Extract2.shape

(2078, 19)

In [None]:
# Queue the punctuation rows on the modify list
modify.append(Extract2)

#### Heuristic 3: Spelling

The long form contains spelling errors (e.g. "cncer").

In [13]:
# Set spell checker parameters
# NOTE(review): distance=1 presumably caps the edit distance the checker
# considers — confirm against the pyspellchecker documentation.
spell = SpellChecker(distance=1)

# Add medical word corpus (UMLS Metathesaurus)
# The path is relative, so ClinSpell.txt must be present in the kernel's
# working directory.
spell.word_frequency.load_text_file('ClinSpell.txt')

In [23]:
# Exclude UMLS and ADAM
subset = df[~df['Source'].isin(['UMLS', 'ADAM'])]

# Instantiate output
misspelled_rows = []
misspelled_data = []

# Raw strings: \s and \- are regex escapes, not valid Python string escapes
non_word_chars = re.compile(r'[^A-Za-z\s\-]+')
word_breaks = re.compile(r'[\s\-]+')

# Iterate over subset (only the two columns the check reads)
for entry_id, long_form in zip(subset['EntryID'], subset['LF']):

    # Format LF for spellchecker: keep letters, whitespace and hyphens,
    # lowercase, then split into words on whitespace/hyphens
    pre_token = non_word_chars.sub('', long_form).lower()
    token = list(filter(None, word_breaks.split(pre_token)))

    # Identify misspelled LFs
    misspelled = spell.unknown(token)
    if len(misspelled) > 0:
        misspelled_rows.append(entry_id)
        misspelled_data.append(misspelled)

# Extract misspelled LFs
# The unknown words are attached by EntryID rather than by list position, so
# the 'error' column stays correct even if row order or EntryID uniqueness
# differs from what the loop saw. .copy() avoids writing to a slice of df.
errors_by_entry = dict(zip(misspelled_rows, misspelled_data))
Extract3 = df[df['EntryID'].isin(misspelled_rows)].copy()
Extract3['error'] = Extract3['EntryID'].map(errors_by_entry.get)
Extract3.shape

(409, 20)

In [37]:
# Queue the rows with unknown words on the modify list
modify.append(Extract3)

#### Heuristic 4: Content

At least one alphabetic character in the short form does not occur anywhere in the long form (the comparison ignores case).

In [47]:
# Limit the check to sources other than UMLS and ADAM
subset = df[~df['Source'].isin(['UMLS', 'ADAM'])]

# Anything that is not an ASCII letter is stripped before comparing
non_letters = re.compile('[^A-Za-z]+')


def letter_set(text):
    """Return the set of lowercased ASCII letters that occur in text."""
    return set(non_letters.sub('', text).lower())


# EntryIDs of rows whose SF uses at least one letter that the LF lacks
missing_characters = [
    entry_id
    for entry_id, short_form, long_form
    in zip(subset['EntryID'], subset['SF'], subset['LF'])
    if not letter_set(short_form) <= letter_set(long_form)
]

# Pull the flagged rows out of df
# NOTE(review): this rebinds Extract3, the name the spelling heuristic also
# uses for its extract — a distinct name would keep the two apart.
Extract3 = df[df['EntryID'].isin(missing_characters)]
Extract3.shape

(3993, 19)

In [48]:
# Show up to the first 20 rows of the current Extract3 frame
Extract3.head(20)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
401230,,E401231,A0,S003024,a0,N002603,,dispense,L068584,dispense,,Vanderbilt Clinic Notes,,,,,5,0.612,c1880359
401231,,E401232,disp,S085013,disp,N019983,,0 abortions,L000101,,,Vanderbilt Discharge Sums,,,,,9,1.0,c0233107
401232,,E401233,Disp,S020440,disp,N019983,,0 abortions,L000101,,,Vanderbilt Discharge Sums,,,,,5,1.0,c0233107
401236,,E401237,disp,S085013,disp,N019983,,1 abortion,L000103,,,Vanderbilt Discharge Sums,,,,,2,0.8675,c0233108
401240,,E401241,A1,S003025,a1,N002604,,disposition,L068603,disposition,,Vanderbilt Clinic Notes,,,,,4,0.369,c0743223
401241,,E401242,Disp,S020440,disp,N019983,,a1 segment of anterior cerebral artery,L037324,,,Vanderbilt Discharge Sums,,,,,7,0.1325,c0923398
401242,,E401243,DISP,S019133,disp,N019983,,a1 segment of anterior cerebral artery,L037324,,,Vanderbilt Discharge Sums,,,,,1,0.1325,c0923398
401244,,E401245,QOW,S058924,qow,N058304,,every other week,L074786,,,Vanderbilt Clinic Notes,,,,,4,1.0,c0585332
401245,,E401246,qow,S097392,qow,N058304,,every other week,L074786,,,Vanderbilt Clinic Notes,,,,,3,1.0,c0585332
401246,,E401247,A/P,S003018,a_p,N002776,,every other week,L074786,,,Vanderbilt Clinic Notes,,,,,2,1.0,c0585332


#### Export for Annotation

In [None]:
# Extract misspelled LFs
# The original assignment to 'error' was left unfinished (a syntax error); it
# is completed here with the unknown words collected by the spelling
# heuristic, attached by EntryID. .copy() avoids writing to a slice of df.
# TODO: the export for annotation itself is not written yet.
errors_by_entry = dict(zip(misspelled_rows, misspelled_data))
Extract3 = df[df['EntryID'].isin(misspelled_rows)].copy()
Extract3['error'] = Extract3['EntryID'].map(errors_by_entry.get)
Extract3.shape

## Modify or Retire

#### Import Annotated

In [39]:
# Show the first five rows of df as it stands at this point
df.head(5)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
0,,E000001,AA,S003081,aa,N002817,,achievement age,L037913,achievement age,,UMLS,E0000048,E0006859,acronym,,,,
1,,E000002,AA,S003081,aa,N002817,,Alcoholics Anonymous,L004250,,,UMLS,E0000048,E0000204,acronym,,,,
2,,E000003,AA,S003081,aa,N002817,,alcohol abuse,L040702,alcohol abuse,,UMLS,E0000048,E0356324,acronym,,,,
3,,E000004,AA,S003081,aa,N002817,,alcohol-abuse,L040752,alcohol abuse,,UMLS,E0000048,E0356324,acronym,,,,
4,,E000005,AA,S003081,aa,N002817,,aortic aneurysm,L045559,aortic aneurysm,,UMLS,E0000048,E0009858,acronym,,,,
