# Step 2: Add Information

## Setup

In [1]:
import pandas as pd
from master_functions import *
import os

#### Load Dataset

In [2]:
# Load the pipe-delimited Step 1 output. Every column is read as text and
# empty fields are kept as '' instead of being converted to NaN.
# NOTE(review): the saved output of this cell is "HTTP Error 404: Not Found",
# so this URL may no longer be valid - confirm the file's current location.
df = pd.read_csv(
    'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/code/Step1Output_Clinical_Abbreviation_Acronym_Crosswalk.csv',
    sep='|',
    header=0,
    index_col=False,
    dtype=object,
    na_filter=False,
)

HTTPError: HTTP Error 404: Not Found

In [None]:
# Preview three rows; the fixed seed makes the preview repeatable.
df.sample(n=3, random_state=0)

## Add Entry Identifier

#### Assign Entry Identifier

In [None]:
# Number the records 1..N in their current row order.
# The whole column is assigned at once: the previous row-by-row
# `df['EntryID'].iat[index] = ...` was a chained assignment, which is not
# guaranteed to write through to `df` (and never does under pandas
# Copy-on-Write), and it also relied on the index labels being 0..N-1.
df['EntryID'] = range(1, len(df) + 1)

#### Format Entry Identifier

In [None]:
# Turn the numeric identifier into a zero-padded, 'E'-prefixed label.
entry_numbers = df['EntryID']
df['EntryID'] = 'E' + entry_numbers.map('{:06}'.format)
df.head(n=3)

## Add Normalized Short Forms

The normalized short form is created by:
1. converting all text to lowercase; 
2. stripping leading and trailing whitespace;
3. standardizing punctuation to an underscore.

In [None]:
# Derive the normalized short form for every row from its SF value.
# `normalized_short_form` comes from the star import of master_functions.
short_forms = df['SF']
df['NormSF'] = short_forms.apply(normalized_short_form)

In [None]:
# Preview three rows; the fixed seed makes the preview repeatable.
df.sample(n=3, random_state=0)

## Add Normalized Long Forms

Requires a local installation of the UMLS Lexical Variant Generation (LVG) program, downloadable [here](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html).

Additional documentation available [here](https://www.ncbi.nlm.nih.gov/books/NBK9680/). Installation instructions available [here](https://lexsrv2.nlm.nih.gov/LexSysGroup/Projects/lvg/current/docs/userDoc/install/install.html).

Here, we used version lvg2019 AB.

In [None]:
# Location of the LVG launcher script passed to `lvg(...)` below.
# Set the LVG_PATH environment variable to point at a different installation;
# when it is not set, the original hard-coded path is used unchanged.
lvg_path = os.environ.get(
    'LVG_PATH',
    'C:/Users/lvg2104/Documents/clinical-abbreviations/modules/lvg2019/bin/lvg.bat',
)

#### Identify Non-ASCII Characters

In [None]:
# Flag each row: 'N' when its LF contains any character outside the
# \x00-\x7F range, 'Y' otherwise. Comparing with True also maps a missing
# match result to False, so such rows keep 'Y'.
non_ascii_mask = df['LF'].str.contains('[^\x00-\x7F]').eq(True)
df['ASCII'] = 'Y'
df.loc[non_ascii_mask, 'ASCII'] = 'N'

#### Extract Unique LFs

In [None]:
# Distinct long forms among the rows flagged ASCII == 'Y', in order of first
# appearance, as a Series with a fresh 0..N-1 index.
ascii_rows = df['ASCII'] == 'Y'
uniq_LFs = pd.Series(df.loc[ascii_rows, 'LF'].unique())

In [None]:
# Write the unique long forms, one per line with no header or index column,
# to the temporary file that is passed to LVG as its input.
uniq_LFs.to_csv('uniq_LFs.temp', encoding='ascii', header=False, index=False)

#### Call Lexical Variant Generation

In [None]:
# Run LVG over the unique long forms with the given flow string; the result is
# expected in 'norm_LFs.temp', which the next step reads.
# `lvg` comes from the star import of master_functions.
lvg_output = lvg(
    lvg_path=lvg_path,
    input_file='uniq_LFs.temp',
    output_file='norm_LFs.temp',
    flow='q0:g:rs:o:t:l:B:Ct:q7:q8',
)

#### Extract Normalized LFs

In [None]:
# Read the second pipe-delimited field of each LVG output line.
# `na_filter=False` and `dtype=object` mirror how the main table is loaded:
# without them pandas converts outputs such as "null", "nan" or "n/a" to NaN,
# silently dropping a legitimate normalized long form.
norm_LFs = pd.read_csv('norm_LFs.temp',
                       sep='|',
                       header=None,
                       usecols=[1],
                       na_filter=False,
                       dtype=object)

#### Match Unique & Normalized LFs

In [None]:
# Pair each unique long form with the LVG output that shares its index value.
# NOTE(review): this assumes the LVG output has exactly one line per input
# line, in the same order - confirm for the flow used.
normed_df = pd.DataFrame({'LF': uniq_LFs})
normed_df['NormLF_temp'] = norm_LFs

#### Populate Database

In [None]:
# Attach the normalized long form to every row by matching on LF; rows whose
# LF has no entry in normed_df keep a missing value. Then copy the merged
# values into the NormLF column.
df = df.merge(normed_df, on='LF', how='left')
df['NormLF'] = df['NormLF_temp']

In [None]:
# Remove the helper columns that were only needed to build NormLF.
df = df.drop(['NormLF_temp', 'ASCII'], axis=1)

In [None]:
# Preview three rows; the fixed seed makes the preview repeatable.
df.sample(n=3, random_state=0)

#### Clean Environment

In [None]:
# Blank out every cell whose entire value is the '-No Output-' placeholder,
# then preview three rows (fixed seed for a repeatable preview).
df = df.replace(to_replace='-No Output-', value='')
df.sample(n=3, random_state=0)

In [None]:
# Delete the temporary LVG input and output files from the working directory.
for temp_file in ('uniq_LFs.temp', 'norm_LFs.temp'):
    os.remove(temp_file)

## Add Short Form Unique Identifier

#### Sort by SF

In [None]:
# Order the rows by SF and renumber the index 0..N-1 so that equal short forms
# sit on consecutive rows.
df = df.sort_values(by='SF').reset_index(drop=True)

#### Assign SFUI

In [None]:
# Give every distinct SF a 1-based group number, in sorted order.
# A row starts a new group when its SF differs from the row above it; the first
# row always does, because shift() leaves a missing value there. The running
# count of group starts is the identifier shared by all rows of that group.
# This replaces a row-by-row chained write (`df['SFUI'].iat[index] = ...`),
# which is not guaranteed to reach `df` and never does under pandas
# Copy-on-Write.
starts_new_sf = df['SF'].ne(df['SF'].shift())
df['SFUI'] = starts_new_sf.cumsum()

#### Format SFUI

In [None]:
# Turn the numeric identifier into a zero-padded, 'S'-prefixed label.
sf_numbers = df['SFUI']
df['SFUI'] = 'S' + sf_numbers.map('{:06}'.format)
df.head(n=5)

## Add Long Form Unique Identifier

#### Sort by LF

In [None]:
# Order the rows by LF and renumber the index 0..N-1 so that equal long forms
# sit on consecutive rows.
df = df.sort_values(by='LF').reset_index(drop=True)

#### Assign LFUI

In [None]:
# Give every distinct LF a 1-based group number, in sorted order.
# A row starts a new group when its LF differs from the row above it; the first
# row always does, because shift() leaves a missing value there. The running
# count of group starts is the identifier shared by all rows of that group.
# This replaces a row-by-row chained write (`df['LFUI'].iat[index] = ...`),
# which is not guaranteed to reach `df` and never does under pandas
# Copy-on-Write.
starts_new_lf = df['LF'].ne(df['LF'].shift())
df['LFUI'] = starts_new_lf.cumsum()

#### Format LFUI

In [None]:
# Turn the numeric identifier into a zero-padded, 'L'-prefixed label.
lf_numbers = df['LFUI']
df['LFUI'] = 'L' + lf_numbers.map('{:06}'.format)
df.head(n=5)

## Add Normalized Short Form Unique Identifier

#### Sort by NormSF

In [None]:
# Order the rows by NormSF and renumber the index 0..N-1 so that equal
# normalized short forms sit on consecutive rows.
df = df.sort_values(by='NormSF').reset_index(drop=True)

#### Assign NSFUI

In [None]:
# Give every distinct NormSF a 1-based group number, in sorted order.
# A row starts a new group when its NormSF differs from the row above it; the
# first row always does, because shift() leaves a missing value there. The
# running count of group starts is the identifier shared by all rows of that
# group.
# This replaces a row-by-row chained write (`df['NSFUI'].iat[index] = ...`),
# which is not guaranteed to reach `df` and never does under pandas
# Copy-on-Write.
starts_new_normsf = df['NormSF'].ne(df['NormSF'].shift())
df['NSFUI'] = starts_new_normsf.cumsum()

#### Format NSFUI

In [None]:
# Turn the numeric identifier into a zero-padded, 'N'-prefixed label.
nsf_numbers = df['NSFUI']
df['NSFUI'] = 'N' + nsf_numbers.map('{:06}'.format)
df.head(n=5)

## Export

In [None]:
# Put the rows back in EntryID order and renumber the index 0..N-1 before export.
df = df.sort_values(by='EntryID').reset_index(drop=True)

In [None]:
# Write the finished table as a pipe-delimited file with a header row and no
# index column.
df.to_csv(
    'Step2Output_Clinical_Abbreviation_Acronym_Crosswalk.csv',
    sep='|',
    header=True,
    index=False,
)