# Step 1: Preprocessing

## Setup

In [None]:
import pandas as pd
from master_functions import *

#### Instantiate Output Frame

This is the common data model for formatting all sources.

In [None]:
# Empty template frame: it fixes the column set and column order that every
# per-source frame below starts from (via a copy) and is re-ordered to.
out_db = pd.DataFrame(
    columns=[
        'GroupID', 'RecordID',
        'SF', 'SFUI', 'NormSF',
        'LF', 'LFUI', 'NormLF',
        'Source',
        # Auxiliary data fields
        'SFEUI', 'LFEUI', 'Type', 'PrefSF', 'Score',
        'Count', 'Frequency', 'UMLS.CUI',
    ],
)

## Source #1: UMLS

#### Load UMLS

In [None]:
# The file is read as pipe-delimited with no header row, so the column
# names are supplied explicitly.
umls_db = pd.read_csv(
    'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/1-umls/LRABR',
    names=['SFEUI', 'SF', 'Type', 'LFEUI', 'LF'],
    header=None,
    sep='|',
    index_col=False,  # never promote the first column to the index
    na_filter=False,  # keep strings such as "NA" literal rather than missing
)

In [None]:
# clean() comes from master_functions; its return value is not captured, so
# it presumably tidies the frame in place (confirm against its definition).
clean(umls_db)
umls_db.sample(n=3, random_state=0)  # fixed seed keeps the preview stable

#### Populate Output Frame

In [None]:
# assign() works on a copy of the empty template and sets the columns in
# keyword order: 'SF' goes first so the frame has rows by the time the
# constant 'Source' value is broadcast.
umls_out = out_db.assign(
    SF=umls_db['SF'],
    LF=umls_db['LF'],
    Source='UMLS',
    SFEUI=umls_db['SFEUI'],
    LFEUI=umls_db['LFEUI'],
    Type=umls_db['Type'],
)

In [None]:
# Reproducible three-row preview of the populated UMLS frame.
umls_out.sample(n=3, random_state=0)

#### Append Output

In [None]:
# Collection of per-source frames; UMLS is the first entry.
out_list = [umls_out]

## Source #2: ADAM

#### Load ADAM

In [None]:
# Tab-delimited file with no header row; column names are supplied here.
adam_db = pd.read_csv(
    'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/2-adam/adam_database',
    names=['Pref_SF', 'Alt_SF', 'All_LF', 'Score', 'Count'],
    header=None,
    skiprows=38,  # skips readme portion
    sep='\t',
    index_col=False,  # never promote the first column to the index
    na_filter=False,  # keep strings such as "NA" literal rather than missing
)

In [None]:
# clean() is provided by master_functions; the result is not captured, so
# it presumably modifies the frame in place (confirm against its definition).
clean(adam_db)
adam_db.sample(n=3, random_state=0)  # fixed seed keeps the preview stable

#### Populate Output Frame

In [None]:
# assign() copies the empty template and fills columns in keyword order;
# 'SF' is first so rows exist before the constant 'Source' is broadcast.
# 'SF' and 'LF' still hold the raw 'Alt_SF' / 'All_LF' strings at this point.
adam_out = out_db.assign(
    SF=adam_db['Alt_SF'],
    LF=adam_db['All_LF'],
    Source='ADAM',
    PrefSF=adam_db['Pref_SF'],
)

In [None]:
# Reproducible three-row preview of the populated ADAM frame.
adam_out.sample(n=3, random_state=0)

#### Unnest SF & LF Columns

In [None]:
# expand_col() is defined in master_functions; per the section heading it
# unnests the given column (confirm exact semantics there). 'SF' is expanded
# first, then 'LF', after which exact duplicate rows are discarded.
adam_out = expand_col(expand_col(adam_out, 'SF'), 'LF')
adam_out = adam_out.drop_duplicates()

In [None]:
# Reproducible three-row preview after unnesting and de-duplication.
adam_out.sample(n=3, random_state=0)

#### Assign Count Information

In [None]:
# Split each 'SF' entry on ':'; the text before the first colon stays in
# 'SF' and the piece after it becomes 'Count'.
temp = adam_out['SF'].str.split(pat=':', expand=True)
adam_out['SF'], adam_out['Count'] = temp[0], temp[1]

#### Assign Score Information

In [None]:
# Split each 'LF' entry on ':'; the text before the first colon stays in
# 'LF' and the piece immediately after it becomes 'Score'.
temp = adam_out['LF'].str.split(':', expand=True)
adam_out['LF'] = temp[0]
# Column 1 is the value paired with the long form. Column 2 only exists when
# a long form happens to contain an extra colon, so reading it left 'Score'
# empty for ordinary rows (or raised KeyError when no such row existed).
# NOTE(review): assumes entries look like '<long form>:<value>' - confirm
# against the ADAM file.
adam_out['Score'] = temp[1]

#### Reorder Columns

In [None]:
# Select exactly the template's columns, in the template's order.
adam_out = adam_out.loc[:, out_db.columns]
adam_out.sample(n=3, random_state=0)  # fixed seed keeps the preview stable

#### Append Output

In [None]:
# Queue the finished ADAM frame for the final merge.
# NOTE: re-running this cell appends the frame again.
out_list.append(adam_out)

## Source #3: Berman

#### Load Berman

In [None]:
# '='-delimited file with no header row; column names are supplied here.
berm_db = pd.read_csv(
    'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/3-berman/12000_pathology_abbreviations.txt',
    names=['SF', 'LF'],
    header=None,
    sep='=',
    index_col=False,  # never promote the first column to the index
    na_filter=False,  # keep strings such as "NA" literal rather than missing
)

In [None]:
# clean() is provided by master_functions; the result is not captured, so
# it presumably modifies the frame in place (confirm against its definition).
clean(berm_db)
berm_db.sample(n=3, random_state=0)  # fixed seed keeps the preview stable

#### Populate Output Frame

In [None]:
# assign() copies the empty template and fills columns in keyword order;
# 'SF' is first so rows exist before the constant 'Source' is broadcast.
berm_out = out_db.assign(
    SF=berm_db['SF'],
    LF=berm_db['LF'],
    Source='Berman',
)

In [None]:
# Reproducible three-row preview of the populated Berman frame.
berm_out.sample(n=3, random_state=0)

#### Append Output

In [None]:
# Queue the finished Berman frame for the final merge.
# NOTE: re-running this cell appends the frame again.
out_list.append(berm_out)

## Source #4: Vanderbilt

#### Load Vanderbilt

In [None]:
# Tab-delimited file; no header/names are passed, so the column names come
# from the file's first row.
vcln_db = pd.read_csv(
    'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/4-vanderbilt/vanderbilt_clinic_notes.txt',
    sep='\t',
    index_col=False,  # never promote the first column to the index
    na_filter=False,  # keep strings such as "NA" literal rather than missing
)

In [None]:
# clean() is provided by master_functions; the result is not captured, so
# it presumably modifies the frame in place (confirm against its definition).
clean(vcln_db)
vcln_db.sample(n=3, random_state=0)  # fixed seed keeps the preview stable

In [None]:
# Tab-delimited file; no header/names are passed, so the column names come
# from the file's first row.
vdis_db = pd.read_csv(
    'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/4-vanderbilt/vanderbilt_discharge_sums.txt',
    sep='\t',
    index_col=False,  # never promote the first column to the index
    na_filter=False,  # keep strings such as "NA" literal rather than missing
)

In [None]:
# clean() is provided by master_functions; the result is not captured, so
# it presumably modifies the frame in place (confirm against its definition).
clean(vdis_db)
vdis_db.sample(n=3, random_state=0)  # fixed seed keeps the preview stable

#### Populate Output Frames

Please note that the 'abbreviation' column contains a normalized version of the abbreviation. 

We have not carried forward this column because we will conduct the same normalization later on.

In [None]:
# assign() copies the empty template and fills columns in keyword order;
# 'SF' is first so rows exist before the constant 'Source' is broadcast.
# 'UMLS.CUI' is not a valid keyword name, hence the dict unpacking.
vcln_out = out_db.assign(
    SF=vcln_db['variation'],
    LF=vcln_db['sense'],
    Source='Vanderbilt Clinic Notes',
    Frequency=vcln_db['frequency'],
    **{'UMLS.CUI': vcln_db['CUI']},
)

In [None]:
# Reproducible three-row preview of the clinic-notes frame.
vcln_out.sample(n=3, random_state=0)

In [None]:
# assign() copies the empty template and fills columns in keyword order;
# 'SF' is first so rows exist before the constant 'Source' is broadcast.
# 'UMLS.CUI' is not a valid keyword name, hence the dict unpacking.
vdis_out = out_db.assign(
    SF=vdis_db['variation'],
    LF=vdis_db['sense'],
    Source='Vanderbilt Discharge Sums',
    Frequency=vdis_db['frequency'],
    **{'UMLS.CUI': vdis_db['CUI']},
)

In [None]:
# Reproducible three-row preview of the discharge-summaries frame.
vdis_out.sample(n=3, random_state=0)

#### Merge Frames

In [None]:
# Stack the two Vanderbilt frames row-wise into one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so
# pd.concat is used instead. ignore_index=True renumbers the rows 0..n-1,
# which is what the former append + reset_index(drop=True) pair produced.
vand_out = pd.concat([vcln_out, vdis_out], ignore_index=True)
vand_out.shape

#### Unnest SF Column

In [None]:
# expand_col() is defined in master_functions; per the section heading it
# unnests the 'SF' column (confirm exact semantics there).
vand_out = expand_col(vand_out, 'SF')
vand_out.sample(n=3, random_state=0)  # fixed seed keeps the preview stable

#### Assign Count Information

In [None]:
# Split each 'SF' entry on '_'; the text before the first underscore stays
# in 'SF' and the piece after it becomes 'Count'.
temp = vand_out['SF'].str.split(pat='_', expand=True)
vand_out['SF'], vand_out['Count'] = temp[0], temp[1]

#### Reorder Columns

In [None]:
# Select exactly the template's columns, in the template's order.
vand_out = vand_out.loc[:, out_db.columns]
vand_out.sample(n=3, random_state=0)  # fixed seed keeps the preview stable

#### Append Output

In [None]:
# Queue the finished Vanderbilt frame for the final merge.
# NOTE: re-running this cell appends the frame again.
out_list.append(vand_out)

## Source #5: Wikipedia

#### Load Wikipedia

In [None]:
# Comma-delimited file; no header/names are passed, so the column names
# come from the file's first row.
wabr_db = pd.read_csv(
    'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/5-wikipedia/wikipedia_abbreviation_database.csv',
    sep=',',
    index_col=False,  # never promote the first column to the index
    na_filter=False,  # keep strings such as "NA" literal rather than missing
)

In [None]:
# clean() is provided by master_functions; the result is not captured, so
# it presumably modifies the frame in place (confirm against its definition).
clean(wabr_db)
wabr_db.sample(n=3, random_state=0)  # fixed seed keeps the preview stable

In [None]:
# ':'-delimited file with no header row. The names used here ('abr',
# 'long_form') are the same ones the later merge and populate steps read.
wtrl_db = pd.read_csv(
    'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/5-wikipedia/wikipedia_clinical_trials.txt',
    names=['abr', 'long_form'],
    header=None,
    sep=':',
    skipinitialspace=True,  # drop spaces that follow the delimiter
    index_col=False,  # never promote the first column to the index
    na_filter=False,  # keep strings such as "NA" literal rather than missing
)

In [None]:
# clean() is provided by master_functions; the result is not captured, so
# it presumably modifies the frame in place (confirm against its definition).
clean(wtrl_db)
wtrl_db.sample(n=3, random_state=0)  # fixed seed keeps the preview stable

#### Merge Frames

In [None]:
# Stack the two Wikipedia frames row-wise into one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so
# pd.concat is used instead. Row labels are kept exactly as the old append
# kept them, i.e. each input retains its own labels and they are not
# renumbered.
wiki_db = pd.concat([wabr_db, wtrl_db])
wiki_db.shape

#### Populate Output Frame

In [None]:
# assign() copies the empty template and fills columns in keyword order;
# 'SF' is first so rows exist before the constant 'Source' is broadcast.
wiki_out = out_db.assign(
    SF=wiki_db['abr'],
    LF=wiki_db['long_form'],
    Source='Wikipedia',
)

In [None]:
# Reproducible three-row preview of the populated Wikipedia frame.
wiki_out.sample(n=3, random_state=0)

#### Append Output

In [None]:
# Queue the finished Wikipedia frame for the final merge.
# NOTE: re-running this cell appends the frame again.
out_list.append(wiki_out)

## Source #6: Stetson

#### Load Stetson

In [None]:
# Tab-delimited file with no header row; column names are supplied here.
stet_db = pd.read_csv(
    'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/6-stetson/sense_distribution_448.txt',
    names=['SF', 'LF', 'Frequency'],
    header=None,
    sep='\t',
    index_col=False,  # never promote the first column to the index
    na_filter=False,  # keep strings such as "NA" literal rather than missing
)

In [None]:
# clean() is provided by master_functions; the result is not captured, so
# it presumably modifies the frame in place (confirm against its definition).
clean(stet_db)
stet_db.sample(n=3, random_state=0)  # fixed seed keeps the preview stable

#### Populate Output Frame

In [None]:
# assign() copies the empty template and fills columns in keyword order;
# 'SF' is first so rows exist before the constant 'Source' is broadcast.
stet_out = out_db.assign(
    SF=stet_db['SF'],
    LF=stet_db['LF'],
    Source='Stetson',
    Frequency=stet_db['Frequency'],
)

In [None]:
# Reproducible three-row preview of the populated Stetson frame.
stet_out.sample(n=3, random_state=0)

#### Append Output

In [None]:
# Queue the finished Stetson frame for the final merge.
# NOTE: re-running this cell appends the frame again.
out_list.append(stet_out)

## Source #7: Columbia

#### Load Columbia

In [None]:
# Each line is loaded into a single 'SF_LF' column and split into short and
# long form in a later step.
# NOTE(review): no `sep` is given, so the default comma delimiter applies.
# With one column name and index_col=False, text after a comma on a line
# may be discarded - confirm against the source file.
colm_db = pd.read_csv(
    'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/sources/7-columbia/obgyn.txt',
    names=['SF_LF'],
    header=None,
    index_col=False,  # never promote the first column to the index
    na_filter=False,  # keep strings such as "NA" literal rather than missing
)

In [None]:
# clean() is provided by master_functions; the result is not captured, so
# it presumably modifies the frame in place (confirm against its definition).
clean(colm_db)
colm_db.sample(n=3, random_state=0)  # fixed seed keeps the preview stable

#### Split Column

In [None]:
# Split on the first space only (n=1): the leading token becomes 'SF' and
# everything after that space becomes 'LF'.
temp = colm_db['SF_LF'].str.split(pat=' ', n=1, expand=True)
colm_db['SF'], colm_db['LF'] = temp[0], temp[1]

In [None]:
# Reproducible three-row preview after splitting into 'SF' and 'LF'.
colm_db.sample(n=3, random_state=0)

#### Populate Output Frame

In [None]:
# assign() copies the empty template and fills columns in keyword order;
# 'SF' is first so rows exist before the constant 'Source' is broadcast.
colm_out = out_db.assign(
    SF=colm_db['SF'],
    LF=colm_db['LF'],
    Source='Columbia',
)

In [None]:
# Reproducible three-row preview of the populated Columbia frame.
colm_out.sample(n=3, random_state=0)

#### Append Output

In [None]:
# Queue the finished Columbia frame for the final merge.
# NOTE: re-running this cell appends the frame again.
out_list.append(colm_out)

## Merge Sources

#### Double-Check List

In [None]:
# Sanity check: print each queued frame's variable name and shape.
for item in out_list:
    # Reverse lookup by identity: take the first global name bound to this
    # exact frame object.
    name = next(key for key in globals() if globals()[key] is item)
    print(name, item.shape)

#### Conduct Merge

In [None]:
# Stack every per-source frame row-wise into the combined frame; each
# frame's own row labels are kept.
db = pd.concat(objs=out_list)
db.shape

## Export

In [None]:
# Write the combined frame as a pipe-delimited file with a header row and
# without the row labels.
db.to_csv(
    'Step1Output.csv',
    sep='|',
    header=True,
    index=False,
)