# Step 2: Add Information

## Setup

In [1]:
import pandas as pd
from master_functions import *
import os

#### Load Preprocessed Dataset

In [2]:
# Step 1 output: a pipe-delimited crosswalk hosted in the project repository.
step1_output_url = 'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/modules/Step1Output_Clinical_Abbreviation_Acronym_Crosswalk.csv'

# The first line is the header. Every column is kept as `object` and NA
# detection is switched off, so empty fields stay as empty strings.
step1_read_options = dict(sep='|',
                          header=0,
                          index_col=False,
                          na_filter=False,
                          dtype=object)

df = pd.read_csv(step1_output_url, **step1_read_options)

In [3]:
# Preview three rows; the fixed random_state makes the same rows come back on every run.
df.sample(3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,MetaMap.CUI
69748,,ER-α,,,,oestrogen receptor alpha,,,,UMLS,E0579571,E0579570,acronym,,,,,
287684,,F9,,,,mouse embryonal carcinoma,,,,ADAM,,,,0.4232,11.0,,,
349332,,NPr-A,,,,natriuretic peptide receptor,,,,ADAM,,,,0.8165,1.0,,,


## Add Normalized Forms

#### Normalized Short Form

The normalized short form is created by:
1. converting all text to lowercase; 
2. stripping leading and trailing whitespace;
3. standardizing punctuation to an underscore.

In [4]:
# Fill NormSF by running the project's `normalized_short_form` helper over
# every raw short form (SF), one value at a time.
raw_short_forms = df['SF']
df['NormSF'] = raw_short_forms.apply(normalized_short_form)

In [5]:
# Preview three rows with the same fixed seed as before, so the newly filled
# NormSF column can be compared against the SF column for identical rows.
df.sample(3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,MetaMap.CUI
69748,,ER-α,,er_α,,oestrogen receptor alpha,,,,UMLS,E0579571,E0579570,acronym,,,,,
287684,,F9,,f9,,mouse embryonal carcinoma,,,,ADAM,,,,0.4232,11.0,,,
349332,,NPr-A,,npr_a,,natriuretic peptide receptor,,,,ADAM,,,,0.8165,1.0,,,


#### Normalized Long Form

**NOTE:** Requires a local installation of the UMLS Lexical Variant Generation (LVG) program, downloadable [here](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html).

Additional documentation available [here](https://www.ncbi.nlm.nih.gov/books/NBK9680/). Installation instructions available [here](https://lexsrv2.nlm.nih.gov/LexSysGroup/Projects/lvg/current/docs/userDoc/install/install.html).

In [None]:
# LVG installation directory, given relative to the current user's home
# directory ("~") and expanded to a full path here.
lvg_dir_in_home = '~/lvg2019'
lvg_path = os.path.expanduser(lvg_dir_in_home)

Remove non-ASCII characters

In [None]:
# Drop every character outside the 7-bit ASCII range from the long forms, so
# they can later be written to disk with encoding='ascii'.
#
# regex=True must be passed explicitly: since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, which would silently leave
# the text unchanged. The raw string lets the regex engine interpret the
# \x00-\x7F range itself.
df['LF'] = df['LF'].str.replace(r'[^\x00-\x7F]', '', regex=True)

Isolate unique long forms

In [None]:
# Reduce the long-form column to its distinct values and wrap them in a fresh
# Series (default integer index), so each long form is processed only once.
lf_column = df['LF']
uniq_long_forms = pd.Series(lf_column.unique())

In [None]:
# Dump the unique long forms to a temporary ASCII text file, without the index
# column and without a header row.
lvg_input_file = 'uniq_long_forms.temp'
uniq_long_forms.to_csv(lvg_input_file, index=False, header=False, encoding='ascii')

Call lexical variant generation

In [None]:
# Run the project's `lvg` helper on the temporary file of unique long forms.
# The same path is given as both input and output, so the results replace the
# input file on disk.
# NOTE(review): confirm that `lvg` supports reading and writing the same path.
lvg_io_file = 'uniq_long_forms.temp'

# LVG flow string, passed through unchanged; see the LVG documentation for the
# meaning of each colon-separated component.
lvg_flow = 'q0:g:rs:o:t:l:B:Ct:q7:q8'

lvg_output = lvg(lvg_io_file,
                 flow=lvg_flow,
                 output_file=lvg_io_file,
                 restrict=True,
                 print_no_output=True,
                 lvg_path=lvg_path)

Extract normalized long forms

In [None]:
# Read the pipe-delimited file written by LVG (no header row) and keep only
# the second field (column 1).
#
# na_filter=False and dtype=object mirror the options used when loading the
# Step 1 data. Without them pandas would turn values such as "na", "null" or
# "nan" into NaN and parse digit-only values as numbers, so the result would
# no longer be the exact text LVG produced.
norm_long_forms = pd.read_csv('uniq_long_forms.temp',
                              sep='|',
                              header=None,
                              usecols=[1],
                              na_filter=False,
                              dtype=object)

Match with corresponding long forms (**TODO:** not yet implemented)

Populate Database (**TODO:** not yet implemented)

In [None]:
# Show the first five unique long forms as a quick sanity check.
uniq_long_forms.head(5)

## Add Identifiers

#### Short Form Unique Identifier

Sort by SF

In [None]:
# Order the rows by short form so identical short forms sit next to each
# other, then renumber the rows 0..n-1.
# drop=True discards the old index instead of inserting it as a stray 'index'
# column, which would otherwise end up in the exported file.
df = df.sort_values(by=['SF']).reset_index(drop=True)

Assign SFUI

In [None]:
# Give every distinct short form a sequential integer id, starting at 1.
# Relies on df already being sorted by SF, so equal values are adjacent.
#
# A row opens a new group when its SF differs from the row above it. The first
# row always does, because shift() leaves a missing value in that position.
# The running count of group starts is the id.
#
# This replaces a row-by-row loop that wrote through chained indexing
# (df['SFUI'].iat[i] = ...). That pattern does not update df when pandas
# Copy-on-Write is active, and it is slow on large frames.
sf_starts_new_group = df['SF'].ne(df['SF'].shift())
df['SFUI'] = sf_starts_new_group.cumsum()

Format SFUI

In [None]:
# Turn each numeric id into a zero-padded, six-digit string and prefix it with
# "S" (for example 1 becomes "S000001"), then show the first rows.
sfui_digits = df["SFUI"].map("{:06}".format)
df["SFUI"] = "S" + sfui_digits
df.head(5)

#### Long Form Unique Identifier

Sort by LF

In [None]:
# Order the rows by long form so identical long forms sit next to each other,
# then renumber the rows 0..n-1.
# drop=True discards the old index instead of inserting it as an extra column,
# which would otherwise end up in the exported file.
df = df.sort_values(by=['LF']).reset_index(drop=True)

Assign LFUI

In [None]:
assignment = 1
for index, row in df.iterrows():
    if index == 0:
        df['LFUI'].iat[index] = assignment
    elif df['LF'].at[index] == df['LF'].at[index-1]:
        df['LFUI'].iat[index] = assignment
    else:
        assignment += 1
        df['LFUI'].iat[index] = assignment

Format LFUI

In [None]:
# Turn each numeric id into a zero-padded, six-digit string and prefix it with
# "L" (for example 1 becomes "L000001"), then show the first rows.
lfui_digits = df["LFUI"].map("{:06}".format)
df["LFUI"] = "L" + lfui_digits
df.head(5)

## MetaMap CUIs

## Standardize CUIs

## Export

In [None]:
# Write the enriched crosswalk as a pipe-delimited file with a header row and
# without the DataFrame index.
step2_output_file = 'Step2Output_Clinical_Abbreviation_Acronym_Crosswalk.csv'
df.to_csv(step2_output_file, sep='|', header=True, index=False)