# Step 2: Add Information

## Setup

In [1]:
import pandas as pd
from master_functions import *
import os

#### Load Preprocessed Dataset

In [2]:
# Load the pipe-delimited Step 1 crosswalk. Every column is read as a plain
# Python object with NA detection switched off, so empty fields stay as ''.
df = pd.read_csv(
    'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/modules/Step1Output_Clinical_Abbreviation_Acronym_Crosswalk.csv',
    sep='|',
    header=0,
    index_col=False,
    dtype=object,
    na_filter=False,
)

In [3]:
# Show three rows; the fixed seed makes the same rows appear on every run.
df.sample(n=3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,MetaMap.CUI
69748,,ER-α,,,,oestrogen receptor alpha,,,,UMLS,E0579571,E0579570,acronym,,,,,
287684,,F9,,,,mouse embryonal carcinoma,,,,ADAM,,,,0.4232,11.0,,,
349332,,NPr-A,,,,natriuretic peptide receptor,,,,ADAM,,,,0.8165,1.0,,,


## Add Entry Identifier

#### Assign Entry Identifier

In [None]:
# Number every row 1..N in its current order.
# The column is assigned as a whole: this creates 'EntryID' when the loaded
# frame does not have it yet (a per-row `df['EntryID'].iat[i] = ...` write
# raises KeyError in that case) and avoids chained assignment, which does not
# reliably write back into `df`.
df['EntryID'] = range(1, len(df) + 1)

#### Format Entry Identifier

In [None]:
# Turn each identifier into "E" followed by the value formatted with the
# "06" format spec (zero-padded to six characters for integers).
entry_id_padded = df["EntryID"].map(lambda value: f"{value:06}")
df["EntryID"] = "E" + entry_id_padded
df.sample(n=3, random_state=0)

## Add Normalized Short Forms

The normalized short form is created by:
1. converting all text to lowercase; 
2. stripping leading and trailing whitespace;
3. standardizing punctuation to an underscore.

In [4]:
# Apply normalized_short_form (from master_functions) to every short form.
df['NormSF'] = df['SF'].map(normalized_short_form)

In [5]:
# Same seeded three-row sample as before, now with NormSF filled in.
df.sample(n=3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,MetaMap.CUI
69748,,ER-α,,er_α,,oestrogen receptor alpha,,,,UMLS,E0579571,E0579570,acronym,,,,,
287684,,F9,,f9,,mouse embryonal carcinoma,,,,ADAM,,,,0.4232,11.0,,,
349332,,NPr-A,,npr_a,,natriuretic peptide receptor,,,,ADAM,,,,0.8165,1.0,,,


## Add Normalized Long Forms

Requires a local installation of the UMLS Lexical Variant Generation (LVG) program, downloadable [here](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html).

Additional documentation available [here](https://www.ncbi.nlm.nih.gov/books/NBK9680/). Installation instructions available [here](https://lexsrv2.nlm.nih.gov/LexSysGroup/Projects/lvg/current/docs/userDoc/install/install.html).

Here, we used version lvg2019 AB.

In [6]:
# Location of the LVG launcher script passed to lvg() below.
# Set the LVG_PATH environment variable to point at your own installation;
# when it is unset or empty, the original author's local path is used.
lvg_path = (os.environ.get('LVG_PATH')
            or 'C:/Users/lvg2104/Documents/clinical-abbreviations/modules/lvg2019/bin/lvg.bat')

#### Remove Non-ASCII Characters

In [7]:
# Delete every character outside the 7-bit ASCII range from the long forms.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave non-ASCII characters in place
# and make the later encoding='ascii' export fail.
df['LF'] = df['LF'].str.replace('[^\x00-\x7F]', '', regex=True)

#### Extract Unique LFs

In [8]:
# Distinct long forms in order of first appearance, re-wrapped as a Series
# with a fresh 0..n-1 index.
uniq_LFs = pd.Series(pd.unique(df['LF']))

In [9]:
# Write one long form per line (no header, no index) as the LVG input file.
uniq_LFs.to_csv(
    'uniq_LFs.temp',
    encoding='ascii',
    header=False,
    index=False,
)

#### Call Lexical Variant Generation

In [10]:
# Run the lvg() helper from master_functions on the unique long forms;
# the result file 'norm_LFs.temp' is read back in the next cell.
lvg_output = lvg(
    input_file='uniq_LFs.temp',
    output_file='norm_LFs.temp',
    flow='q0:g:rs:o:t:l:B:Ct:q7:q8',
    lvg_path=lvg_path,
)

#### Extract Normalized LFs

In [11]:
# Read only the second pipe-delimited field (position 1) of the LVG output.
# dtype=object and na_filter=False mirror the initial dataset load: without
# them pandas would turn terms such as 'null', 'nan' or 'n/a' into NaN and
# could convert purely numeric terms to numbers.
norm_LFs = pd.read_csv('norm_LFs.temp',
                       sep='|',
                       header=None,
                       usecols=[1],
                       dtype=object,
                       na_filter=False)

#### Match Unique & Normalized LFs

In [12]:
# Pair each unique long form with the normalized form on the same line number.
# The pairing is purely positional, so it is only valid when LVG produced
# exactly one output row per input row; fail loudly otherwise instead of
# silently attaching normalized forms to the wrong long forms.
if len(norm_LFs) != len(uniq_LFs):
    raise ValueError(
        'LVG output has {} rows but {} unique long forms were submitted; '
        'cannot match them by position.'.format(len(norm_LFs), len(uniq_LFs)))

normed_df = pd.DataFrame({
    'LF': uniq_LFs.to_numpy(),
    'NormLF_temp': norm_LFs.iloc[:, 0].to_numpy(),
})

#### Populate Database

In [13]:
# Attach the normalized form to every row via its long form, then move it
# into the existing NormLF column and discard the temporary merge column.
df = df.merge(normed_df, on='LF', how='left')
df['NormLF'] = df.pop('NormLF_temp')
df.sample(n=3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,MetaMap.CUI
69748,,ER-α,,er_α,,oestrogen receptor alpha,,estrogen receptor alpha,,UMLS,E0579571,E0579570,acronym,,,,,
287684,,F9,,f9,,mouse embryonal carcinoma,,-No Output-,,ADAM,,,,0.4232,11.0,,,
349332,,NPr-A,,npr_a,,natriuretic peptide receptor,,natriuretic peptide receptor,,ADAM,,,,0.8165,1.0,,,


#### Clean Up Placeholders and Temporary Files

In [14]:
# Blank out every cell whose value is exactly the '-No Output-' placeholder
# (visible in NormLF in the preview above).
df = df.replace(to_replace='-No Output-', value='')
df.sample(n=3, random_state=0)

Unnamed: 0,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,MetaMap.CUI
69748,,ER-α,,er_α,,oestrogen receptor alpha,,estrogen receptor alpha,,UMLS,E0579571,E0579570,acronym,,,,,
287684,,F9,,f9,,mouse embryonal carcinoma,,,,ADAM,,,,0.4232,11.0,,,
349332,,NPr-A,,npr_a,,natriuretic peptide receptor,,natriuretic peptide receptor,,ADAM,,,,0.8165,1.0,,,


In [15]:
# Delete the LVG input and output scratch files.
for temp_file in ('uniq_LFs.temp', 'norm_LFs.temp'):
    os.remove(temp_file)

## Add Short Form Unique Identifier

#### Sort by SF

In [16]:
# Order rows by short form so identical SFs become adjacent.
# reset_index() keeps the previous row labels as a new 'index' column.
df = df.sort_values(by=['SF']).reset_index()

#### Assign SFUI

In [17]:
# Give each distinct SF a sequential integer, starting at 1.
# df is expected to be sorted by SF so equal values sit in adjacent rows: a
# row starts a new identifier whenever its SF differs from the row above it
# (the first row always does, because shift() leaves nothing to compare with).
# Assigning the whole column replaces the per-row `df['SFUI'].iat[...] = ...`
# writes, which are chained assignments (they do not update df under pandas
# Copy-on-Write) and were slow because of iterrows().
starts_new_sf = df['SF'].ne(df['SF'].shift())
df['SFUI'] = starts_new_sf.cumsum()

#### Format SFUI

In [18]:
# Turn each identifier into "S" followed by the value formatted with the
# "06" format spec (zero-padded to six characters for integers).
sfui_padded = df["SFUI"].map(lambda value: f"{value:06}")
df["SFUI"] = "S" + sfui_padded
df.head()

Unnamed: 0,index,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,MetaMap.CUI
0,261389,,$Can,S000001,_can,,Canadian dollars,,,,ADAM,,,,0.8365,18.0,,,
1,356046,,%,S000002,_,,percent,,percent,,Berman,,,,,,,,
2,120876,,%LN,S000003,_ln,,per cent luminal narrowing,,percent lumenal narrowing,,UMLS,E0665149,E0665148,abbreviation,,,,,
3,120874,,%LN,S000003,_ln,,percent luminal narrowing,,percent lumenal narrowing,,UMLS,E0665149,E0665148,abbreviation,,,,,
4,120873,,%LN,S000003,_ln,,percent lumenal narrowing,,percent lumenal narrowing,,UMLS,E0665149,E0665148,abbreviation,,,,,


## Add Long Form Unique Identifier

#### Sort by LF

In [19]:
# Order rows by long form so identical LFs become adjacent.
# reset_index() again keeps the previous row labels as an extra column
# (named 'level_0' when an 'index' column already exists).
df = df.sort_values(by=['LF']).reset_index()

#### Assign LFUI

In [20]:
assignment = 1
for index, row in df.iterrows():
    if index == 0:
        df['LFUI'].iat[index] = assignment
    elif df['LF'].at[index] == df['LF'].at[index-1]:
        df['LFUI'].iat[index] = assignment
    else:
        assignment += 1
        df['LFUI'].iat[index] = assignment

#### Format LFUI

In [21]:
# Turn each identifier into "L" followed by the value formatted with the
# "06" format spec (zero-padded to six characters for integers).
lfui_padded = df["LFUI"].map(lambda value: f"{value:06}")
df["LFUI"] = "L" + lfui_padded
df.head()

Unnamed: 0,level_0,index,GroupID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,MetaMap.CUI
0,622,361807,,///////////////\\\\\\\\\\\\\\\\\\\\,S000254,___________________________________,,,L000001,,,Berman,,,,,,,,
1,156677,259801,,LMS method,S037926,lms method,,method,L000002,method,,UMLS,E0751974,E0751973,acronym,,,,,
2,315362,153125,,beta 2M,S076344,beta 2m,,(2) microglobulin,L000003,,,UMLS,E0693511,E0213510,abbreviation,,,,,
3,379834,37471,,β2-m,S097863,β2_m,,(2) microglobulin,L000003,,,UMLS,E0429602,E0213510,acronym,,,,,
4,379787,37447,,β2-M,S097861,β2_m,,(2) microglobulin,L000003,,,UMLS,E0429602,E0213510,acronym,,,,,


## Export

In [22]:
# Save the enriched crosswalk as a pipe-delimited file with a header row and
# without the DataFrame index.
df.to_csv(
    'Step2Output_Clinical_Abbreviation_Acronym_Crosswalk.csv',
    sep='|',
    header=True,
    index=False,
)