# Step 1: Preprocessing

## Setup

In [0]:
import pandas as pd
from google.colab import files

#### Import Master Functions

In [0]:
!wget https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/modules/master_functions.py
from master_functions import *

#### Instantiate Output Frame

This is the common data model for formatting all sources.

In [0]:
out_db = pd.DataFrame(columns=['GroupID', 'SF', 'SFUI', 'NormSF', 'PrefSF',
                               'LF', 'LFUI', 'NormLF', 'PrefLF', 'Source', 
                               'SFEUI', 'LFEUI', 'Type', 'Score', 'Count', 
                               'Frequency', 'UMLS CUI', 'MetaMap CUI'])

## Source #1: UMLS

#### Load UMLS

In [0]:
umls_db = pd.read_csv('https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/1-umls/LRABR',
                      sep='|',
                      header=None,
                      names=['SFEUI', 'SF', 'Type', 'LFEUI', 'LF'],
                      na_filter=False,
                      index_col=False)

In [5]:
clean(umls_db)
umls_db.sample(3, random_state=0)

Unnamed: 0,SFEUI,SF,Type,LFEUI,LF
233514,E0740167,αSMA-expressing,acronym,E0740166,α-smooth muscle actin-expressing
96448,E0637109,β site APP cleaving enzyme-2,acronym,E0637108,beta site amyloid precursor protein cleaving e...
43480,E0456071,A. sinensis,abbreviation,E0352266,Anopheles sinensis


#### Populate Output Frame

In [0]:
umls_out = out_db.copy()
umls_out['SF'] = umls_db['SF']
umls_out['LF'] = umls_db['LF']
umls_out['Source'] = 'UMLS'
umls_out['SFEUI'] = umls_db['SFEUI']
umls_out['LFEUI'] = umls_db['LFEUI']
umls_out['Type'] = umls_db['Type']

In [7]:
umls_out.sample(3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS CUI,MetaMap CUI
233514,,αSMA-expressing,,,,α-smooth muscle actin-expressing,,,,UMLS,E0740167,E0740166,acronym,,,,,
96448,,β site APP cleaving enzyme-2,,,,beta site amyloid precursor protein cleaving e...,,,,UMLS,E0637109,E0637108,acronym,,,,,
43480,,A. sinensis,,,,Anopheles sinensis,,,,UMLS,E0456071,E0352266,abbreviation,,,,,


#### Append Output

In [0]:
out_list = []
out_list.append(umls_out)

## Source #2: ADAM

#### Load ADAM

In [0]:
adam_db = pd.read_csv('https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/2-adam/adam_database',
                      sep='\t',
                      skiprows=38,  # skips readme portion
                      header=None,
                      names=['Pref_SF', 'Alt_SF', 'All_LF', 'Score', 'Count'],
                      na_filter=False,
                      index_col=False)

In [10]:
clean(adam_db)
adam_db.sample(3, random_state=0)

Unnamed: 0,Pref_SF,Alt_SF,All_LF,Score,Count
13054,DMN,DMN:15,dysplastic melanocytic nevi:15:0.8045,0.8045,15
5739,BM,BM:8|Bm:1,bicuculline methiodide:9:0.6794,0.6794,9
19192,GISSI-2,GISSI-2:10,Gruppo Italiano per lo Studio della Sopravvive...,0.547,10


#### Populate Output Frame

In [0]:
adam_out = out_db.copy()
adam_out['SF'] = adam_db['Alt_SF']
adam_out['LF'] = adam_db['All_LF']
adam_out['Source'] = 'ADAM'
adam_out['PSF'] = adam_db['Pref_SF']

In [12]:
adam_out.sample(3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS CUI,MetaMap CUI,PSF
13054,,DMN:15,,,,dysplastic melanocytic nevi:15:0.8045,,,,ADAM,,,,,,,,,DMN
5739,,BM:8|Bm:1,,,,bicuculline methiodide:9:0.6794,,,,ADAM,,,,,,,,,BM
19192,,GISSI-2:10,,,,Gruppo Italiano per lo Studio della Sopravvive...,,,,ADAM,,,,,,,,,GISSI-2


#### Unnest SF & LF Columns

In [0]:
adam_out = expand_col(adam_out, 'SF')
adam_out = expand_col(adam_out, 'LF')
adam_out.drop_duplicates(inplace=True)

In [14]:
adam_out.sample(3, random_state=0)

Unnamed: 0,GroupID,SFUI,NormSF,PrefSF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS CUI,MetaMap CUI,PSF,SF,LF
2789,,,,,,,,ADAM,,,,,,,,,ANF,ANF:19,atrial natriuretic peptide:19:0.7658
12925,,,,,,,,ADAM,,,,,,,,,DLS,DLS:159,dynamic light scattering:150:0.9220
23483,,,,,,,,ADAM,,,,,,,,,Ids,ids:1,idiotypes:14:0.0378


#### Assign Count Information

In [0]:
temp = adam_out["SF"].str.split(":", expand=True)
adam_out["SF"] = temp[0]
adam_out["Count"] = temp[1] 

#### Assign Score Information

In [0]:
temp = adam_out["LF"].str.split(":", expand=True)
adam_out["LF"] = temp[0]
adam_out["Score"] = temp[2]

#### Reorder Columns

In [17]:
adam_out = adam_out[out_db.columns]
adam_out.sample(3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS CUI,MetaMap CUI
2789,,ANF,,,,atrial natriuretic peptide,,,,ADAM,,,,0.7658,19,,,
12925,,DLS,,,,dynamic light scattering,,,,ADAM,,,,0.922,159,,,
23483,,ids,,,,idiotypes,,,,ADAM,,,,0.0378,1,,,


#### Append Output

In [0]:
out_list.append(adam_out)

## Source #3: Berman

#### Load Berman

In [0]:
berm_db = pd.read_csv('https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/3-berman/12000_pathology_abbreviations.txt',
                      sep='=',
                      header=None,
                      names=['SF', 'LF'],
                      na_filter=False,
                      index_col=False)

In [20]:
clean(berm_db)
berm_db.sample(3, random_state=0)

Unnamed: 0,SF,LF
10083,pf,push fluids
3706,npo,nothing by mouth
3234,mdm,mid diastolic murmur


#### Populate Output Frame

In [0]:
berm_out = out_db.copy()
berm_out['SF'] = berm_db['SF']
berm_out['LF'] = berm_db['LF']
berm_out['Source'] = 'Berman'

In [22]:
berm_out.sample(3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS CUI,MetaMap CUI
10083,,pf,,,,push fluids,,,,Berman,,,,,,,,
3706,,npo,,,,nothing by mouth,,,,Berman,,,,,,,,
3234,,mdm,,,,mid diastolic murmur,,,,Berman,,,,,,,,


#### Append Output

In [0]:
out_list.append(berm_out)

## Source #4: Vanderbilt

#### Load Vanderbilt

In [0]:
vcln_db = pd.read_csv('https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/4-vanderbilt/vanderbilt_clinic_notes.txt',
                      sep='\t',
                      na_filter=False,
                      index_col=False)

In [25]:
clean(vcln_db)
vcln_db.sample(3, random_state=0)

Unnamed: 0,abbreviation,sense,variation,CUI,frequency
563,cmt,charcot-marie-tooth,CMT_6,c0007959,0.014
824,xray,energetic high-frequency electromagnetic radia...,Xray_5|xray_13|XRay_2,c0337030,1.0
436,gtt,glucose tolerance test,GTT_2,c0017741,0.005


In [0]:
vdis_db = pd.read_csv('https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/4-vanderbilt/vanderbilt_discharge_sums.txt',
                      sep='\t',
                      na_filter=False,
                      index_col=False)

In [27]:
clean(vdis_db)
vdis_db.sample(3, random_state=0)

Unnamed: 0,abbreviation,sense,variation,CUI,frequency
979,q,22q (chromosome),q_1,c1521100,0.003
984,q2,every two hours,q2_4,c0585322,1.0
746,nabs,normal active bowel sounds,nabs_2|NAbs_1|NABS_16|NABS._1,c0278005,1.0


#### Populate Output Frames

Please note that the 'abbreviation' column contains a normalized version of the abbreviation. 

We have not carried forward this column because we will conduct our own normalization.

In [0]:
vcln_out = out_db.copy()
vcln_out['SF'] = vcln_db['variation']
vcln_out['LF'] = vcln_db['sense']
vcln_out['Source'] = 'Vanderbilt Clinic Notes'
vcln_out['Frequency'] = vcln_db['frequency']
vcln_out['UMLS CUI'] = vcln_db['CUI']

In [29]:
vcln_out.sample(3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS CUI,MetaMap CUI
563,,CMT_6,,,,charcot-marie-tooth,,,,Vanderbilt Clinic Notes,,,,,,0.014,c0007959,
824,,Xray_5|xray_13|XRay_2,,,,energetic high-frequency electromagnetic radia...,,,,Vanderbilt Clinic Notes,,,,,,1.0,c0337030,
436,,GTT_2,,,,glucose tolerance test,,,,Vanderbilt Clinic Notes,,,,,,0.005,c0017741,


In [0]:
vdis_out = out_db.copy()
vdis_out['SF'] = vdis_db['variation']
vdis_out['LF'] = vdis_db['sense']
vdis_out['Source'] = 'Vanderbilt Discharge Sums'
vdis_out['Frequency'] = vdis_db['frequency']
vdis_out['UMLS CUI'] = vdis_db['CUI']

In [31]:
vdis_out.sample(3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS CUI,MetaMap CUI
979,,q_1,,,,22q (chromosome),,,,Vanderbilt Discharge Sums,,,,,,0.003,c1521100,
984,,q2_4,,,,every two hours,,,,Vanderbilt Discharge Sums,,,,,,1.0,c0585322,
746,,nabs_2|NAbs_1|NABS_16|NABS._1,,,,normal active bowel sounds,,,,Vanderbilt Discharge Sums,,,,,,1.0,c0278005,


#### Merge Frames

In [32]:
vand_out = vcln_out.append(vdis_out)
vand_out.shape

(2827, 18)

#### Unnest SF Column

In [33]:
vand_out = expand_col(vand_out, 'SF')
vand_out.sample(3, random_state=0)

Unnamed: 0,GroupID,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS CUI,MetaMap CUI,SF
1016,,,,,high-density lipoprotein,,,,Vanderbilt Clinic Notes,,,,,,1.0,c0023821,,HDL_13
687,,,,,medical doctor,,,,Vanderbilt Discharge Sums,,,,,,0.962,c0031831,,EMLA_8
261,,,,,cerebral palsy,,,,Vanderbilt Discharge Sums,,,,,,0.7986,c0007789,,CP_8


#### Assign Count Information

In [0]:
temp = vand_out["SF"].str.split("_", expand=True)
vand_out["SF"] = temp[0]
vand_out["Count"] = temp[1] 

#### Reorder Columns

In [35]:
vand_out = vand_out[out_db.columns]
vand_out.sample(3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS CUI,MetaMap CUI
1016,,HDL,,,,high-density lipoprotein,,,,Vanderbilt Clinic Notes,,,,,13,1.0,c0023821,
687,,EMLA,,,,medical doctor,,,,Vanderbilt Discharge Sums,,,,,8,0.962,c0031831,
261,,CP,,,,cerebral palsy,,,,Vanderbilt Discharge Sums,,,,,8,0.7986,c0007789,


#### Append Output

In [0]:
out_list.append(vand_out)

## Source #5: Wikipedia

#### Load Wikipedia

In [0]:
wabr_db = pd.read_csv('https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/5-wikipedia/wikipedia_abbreviation_database.csv',
                      sep=',',
                      na_filter=False,
                      index_col=False)

In [38]:
clean(wabr_db)
wabr_db.sample(3, random_state=0)

Unnamed: 0,abr,long_form
346,BEP,"bleomycin, etoposide, and cisplatin"
1355,I&O,inputs and outputs
1261,HSM,hepatosplenomegaly


In [0]:
wtrl_db = pd.read_csv('https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/5-wikipedia/wikipedia_clinical_trials.txt',
                      sep=':',
                      header=None,
                      names=['abr', 'long_form'],
                      na_filter=False,
                      index_col=False,
                      skipinitialspace=True)

In [40]:
clean(wtrl_db)
wtrl_db.sample(3, random_state=0)

Unnamed: 0,abr,long_form
252,SURTAVI,Safety and Efficacy Study of the Medtronic Cor...
111,EVEREST,Efficacy of Vasopressin Antagonism in Heart Fa...
226,CYTO-PV,Cytoreductive Therapy in Polycythemia Vera


#### Merge Frames

In [41]:
wiki_db = wabr_db.append(wtrl_db)
wiki_db.shape

(2952, 2)

#### Populate Output Frame

In [0]:
wiki_out = out_db.copy()
wiki_out['SF'] = wiki_db['abr']
wiki_out['LF'] = wiki_db['long_form']
wiki_out['Source'] = 'Wikipedia'

In [43]:
wiki_out.sample(3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS CUI,MetaMap CUI
1301,,ICCU,,,,intensive cardiac care unit,,,,Wikipedia,,,,,,,,
95,,TRICC,,,,Transfusion Requirements in Critical Care,,,,Wikipedia,,,,,,,,
2286,,SGB,,,,stellate ganglion block,,,,Wikipedia,,,,,,,,


#### Append Output

In [0]:
out_list.append(wiki_out)

## Source #6: Stetson

#### Load Stetson

In [0]:
stet_db = pd.read_csv('https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/6-stetson/sense_distribution_448.txt',
                      sep='\t',
                      header=None,
                      names=['SF', 'LF', 'Frequency'],
                      na_filter=False,
                      index_col=False)

In [46]:
clean(stet_db)
stet_db.sample(3, random_state=0)

Unnamed: 0,SF,LF,Frequency
733,med,medicine,0.386
122,d/c,discharge,0.884
113,na,normal axis,0.02002


#### Populate Output Frame

In [0]:
stet_out = out_db.copy()
stet_out['SF'] = stet_db['SF']
stet_out['LF'] = stet_db['LF']
stet_out['Source'] = 'Stetson'
stet_out['Frequency'] = stet_db['Frequency']

In [48]:
stet_out.sample(3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS CUI,MetaMap CUI
733,,med,,,,medicine,,,,Stetson,,,,,,0.386,,
122,,d/c,,,,discharge,,,,Stetson,,,,,,0.884,,
113,,na,,,,normal axis,,,,Stetson,,,,,,0.02002,,


#### Append Output

In [0]:
out_list.append(stet_out)

## Source #7: Columbia

#### Load Columbia

In [0]:
colm_db = pd.read_csv('https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/7-columbia/obgyn.txt',
                      header=None,
                      names=['SF_LF'],
                      na_filter=False,
                      index_col=False)

In [51]:
clean(colm_db)
colm_db.sample(3, random_state=0)

Unnamed: 0,SF_LF
152,P/N/F Pooling/nitrazine/ferning
74,FHT Fetal heart tones
71,FAVD Forceps assisted vaginal delivery


#### Split Column

In [0]:
temp = colm_db["SF_LF"].str.split(" ", n=1, expand=True)
colm_db["SF"] = temp[0]
colm_db["LF"] = temp[1]

In [53]:
colm_db.sample(3, random_state=0)

Unnamed: 0,SF_LF,SF,LF
152,P/N/F Pooling/nitrazine/ferning,P/N/F,Pooling/nitrazine/ferning
74,FHT Fetal heart tones,FHT,Fetal heart tones
71,FAVD Forceps assisted vaginal delivery,FAVD,Forceps assisted vaginal delivery


#### Populate Output Frame

In [0]:
colm_out = out_db.copy()
colm_out['SF'] = colm_db['SF']
colm_out['LF'] = colm_db['LF']
colm_out['Source'] = 'Columbia'

In [55]:
colm_out.sample(3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS CUI,MetaMap CUI
152,,P/N/F,,,,Pooling/nitrazine/ferning,,,,Columbia,,,,,,,,
74,,FHT,,,,Fetal heart tones,,,,Columbia,,,,,,,,
71,,FAVD,,,,Forceps assisted vaginal delivery,,,,Columbia,,,,,,,,


#### Append Output

In [0]:
out_list.append(colm_out)

## Merge Sources

#### Double-Check List

In [57]:
for item in out_list:
  name = [x for x in globals() if globals()[x] is item][0]
  print(name, item.shape)

umls_out (261389, 18)
adam_out (94657, 18)
berm_out (12088, 18)
vand_out (8800, 18)
wiki_out (2952, 18)
stet_out (765, 18)
colm_out (219, 18)


#### Conduct Merge

In [58]:
db = pd.concat(out_list)
db.shape

(380870, 18)

## Export

In [0]:
db.to_csv('Clinical_Abbreviation_Acronym_Crosswalk_Preliminary1.csv',
          index=False,
          header=True,
          sep='|')

files.download('Clinical_Abbreviation_Acronym_Crosswalk_Preliminary1.csv')