# Step 2: Add Information

## Setup

In [1]:
import pandas as pd
from master_functions import *
import os

#### Load Dataset

In [2]:
# Load the Step 1 output crosswalk (pipe-delimited) straight from its GitHub raw URL.
# na_filter=False keeps empty fields as '' rather than NaN, and dtype=object stops
# pandas from inferring numeric types for any column.
df = pd.read_csv('https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/modules/Step1Output_Clinical_Abbreviation_Acronym_Crosswalk.csv',
                 sep='|',
                 header=0,
                 index_col=False,
                 na_filter=False,
                 dtype=object)

In [3]:
# Preview three rows; the fixed random_state makes the same rows appear on every run.
df.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
368728,,,T-ALL,,,,,T cell acute lymphoblastic leukemias,,,,ADAM,,,,0.7357,193,,
320521,,,Emax,,,,,maximal responses,,,,ADAM,,,,0.2113,24,,
311671,,,CTG,,,,,connective tissue graft,,,,ADAM,,,,0.7103,10,,


## Add Entry Identifier

#### Assign Entry Identifier

In [4]:
# Number the rows 1..N in their current order.
# Assigning the whole column at once replaces a row-by-row loop that wrote through
# `df['EntryID'].iat[i] = ...`; that chained assignment does not write back to `df`
# when pandas Copy-on-Write is active, and iterating with iterrows() is slow.
# The values are left as integers so they can still be zero-padded afterwards.
df['EntryID'] = range(1, len(df) + 1)

#### Format Entry Identifier

In [5]:
# Turn each numeric entry number into a six-digit, zero-padded string prefixed with 'E'.
padded_entry_numbers = df['EntryID'].map('{:06}'.format)
df['EntryID'] = 'E' + padded_entry_numbers
df.head(3)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
0,,E000001,AA,,,,,achievement age,,,,UMLS,E0000048,E0006859,acronym,,,,
1,,E000002,AA,,,,,Alcoholics Anonymous,,,,UMLS,E0000048,E0000204,acronym,,,,
2,,E000003,AA,,,,,alcohol abuse,,,,UMLS,E0000048,E0356324,acronym,,,,


## Add Normalized Short Forms

The normalized short form is created by:
1. converting all text to lowercase; 
2. stripping leading and trailing whitespace;
3. standardizing punctuation to an underscore.

In [6]:
# Fill NormSF by calling normalized_short_form on every SF value.
# NOTE(review): normalized_short_form is not defined in this notebook (presumably it comes
# from the master_functions star import); confirm it performs the three steps listed above.
df['NormSF'] = df['SF'].apply(normalized_short_form)

In [7]:
# Same seeded three-row preview as before, now showing the populated NormSF column.
df.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
368728,,E368729,T-ALL,,t_all,,,T cell acute lymphoblastic leukemias,,,,ADAM,,,,0.7357,193,,
320521,,E320522,Emax,,emax,,,maximal responses,,,,ADAM,,,,0.2113,24,,
311671,,E311672,CTG,,ctg,,,connective tissue graft,,,,ADAM,,,,0.7103,10,,


## Add Normalized Long Forms

Requires a local installation of the UMLS Lexical Variant Generation (LVG) program, downloadable [here](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html).

Additional documentation available [here](https://www.ncbi.nlm.nih.gov/books/NBK9680/). Installation instructions available [here](https://lexsrv2.nlm.nih.gov/LexSysGroup/Projects/lvg/current/docs/userDoc/install/install.html).

Here, we used version lvg2019 AB.

In [8]:
# Location of the LVG launcher script.
# Set the LVG_PATH environment variable to point at a local installation; the
# original hard-coded path is kept as the fallback so existing setups are unaffected.
lvg_path = os.environ.get(
    'LVG_PATH',
    'C:/Users/lvg2104/Documents/clinical-abbreviations/modules/lvg2019/bin/lvg.bat')

#### Identify Non-ASCII Characters

In [9]:
# Flag each row 'N' when its LF contains any character outside the 7-bit ASCII range,
# and 'Y' otherwise. The `== True` comparison also turns a missing match result into False.
has_non_ascii = df['LF'].str.contains('[^\x00-\x7F]') == True
df['ASCII'] = has_non_ascii.map({True: 'N', False: 'Y'})

#### Extract Unique LFs

In [10]:
# Collect the distinct LF values from the rows flagged as ASCII-only.
ascii_only = df['ASCII'] == 'Y'
uniq_LFs = pd.Series(df.loc[ascii_only, 'LF'].unique())

In [11]:
# Write the unique long forms, one per line with no header or index, to the file
# that is handed to LVG as its input.
# encoding='ascii' is safe only because non-ASCII LFs were excluded when uniq_LFs was built.
# NOTE(review): to_csv uses its default quoting, so an LF containing a comma or a double
# quote is written wrapped in quotes; confirm LVG treats such lines as intended.
uniq_LFs.to_csv('uniq_LFs.temp',
                index=False,
                header=False,
                encoding='ascii')

#### Call Lexical Variant Generation

In [12]:
# Invoke the lvg helper (not defined in this notebook; presumably from master_functions)
# with the unique-LF file as input and norm_LFs.temp as the requested output file.
# NOTE(review): the flow string is passed through as-is; its meaning is defined by LVG itself,
# so check the LVG documentation before changing it.
# The return value is stored in lvg_output but is not read by any later cell shown here.
lvg_output = lvg(input_file='uniq_LFs.temp',
                 flow='q0:g:rs:o:t:l:B:Ct:q7:q8',
                 output_file='norm_LFs.temp',
                 lvg_path=lvg_path)

#### Extract Normalized LFs

In [13]:
# Read the second pipe-delimited field of every LVG output line; it is used below as
# the normalized long form.
# na_filter=False and dtype=object mirror how the main crosswalk is loaded. Without them
# pandas silently converts terms such as 'NA', 'null' or 'nan' into NaN.
norm_LFs = pd.read_csv('norm_LFs.temp',
                       sep='|',
                       header=None,
                       usecols=[1],
                       na_filter=False,
                       dtype=object)

#### Match Unique & Normalized LFs

In [14]:
# norm_LFs was read with usecols=[1], so it is a one-column frame; take that column.
norm_LF_values = norm_LFs.iloc[:, 0]

# The pairing below is purely positional: row i of the LVG output is assumed to belong
# to row i of uniq_LFs. If LVG emitted a different number of lines than it was given,
# pandas would silently misalign or truncate the pairing, so fail loudly instead.
if len(norm_LF_values) != len(uniq_LFs):
    raise ValueError(
        'LVG output has {} rows but {} unique long forms were submitted; '
        'positional matching would be misaligned.'.format(len(norm_LF_values),
                                                          len(uniq_LFs)))

normed_df = pd.DataFrame({'LF': uniq_LFs, 'NormLF_temp': norm_LF_values})

#### Populate Database

In [15]:
# Left-join the normalized forms onto the crosswalk by LF, then copy them into NormLF.
# Rows whose LF has no entry in normed_df receive NaN from the left join.
df = df.merge(normed_df, how='left', on='LF')
df['NormLF'] = df.loc[:, 'NormLF_temp']

In [16]:
# The two helper columns are no longer needed once NormLF is filled in.
helper_columns = ['NormLF_temp', 'ASCII']
df = df.drop(columns=helper_columns)

In [17]:
# Same seeded three-row preview, now showing the populated NormLF column.
df.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
368728,,E368729,T-ALL,,t_all,,,T cell acute lymphoblastic leukemias,,T cell acute lymphoblastic leukemia,,ADAM,,,,0.7357,193,,
320521,,E320522,Emax,,emax,,,maximal responses,,-No Output-,,ADAM,,,,0.2113,24,,
311671,,E311672,CTG,,ctg,,,connective tissue graft,,connective tissue graft,,ADAM,,,,0.7103,10,,


#### Clean Environment

In [18]:
# Blank out the '-No Output-' placeholder wherever a cell equals it exactly, in any column.
df = df.replace(to_replace='-No Output-', value='')
df.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
368728,,E368729,T-ALL,,t_all,,,T cell acute lymphoblastic leukemias,,T cell acute lymphoblastic leukemia,,ADAM,,,,0.7357,193,,
320521,,E320522,Emax,,emax,,,maximal responses,,,,ADAM,,,,0.2113,24,,
311671,,E311672,CTG,,ctg,,,connective tissue graft,,connective tissue graft,,ADAM,,,,0.7103,10,,


In [19]:
# Delete the two scratch files that were exchanged with LVG.
for temp_file in ('uniq_LFs.temp', 'norm_LFs.temp'):
    os.remove(temp_file)

## Add Short Form Unique Identifier

#### Sort by SF

In [20]:
# Order the rows by SF and renumber the index 0..N-1 so that identical short forms
# sit on consecutive rows.
df = df.sort_values(by=['SF']).reset_index(drop=True)

#### Assign SFUI

In [21]:
# Give every run of identical, adjacent SF values one integer ID, counting up from 1.
# This relies on df already being sorted by SF so equal values are neighbours.
# A row starts a new ID whenever its SF differs from the row before it; the first row
# always does, because shift() leaves a missing value there.
# The vectorized form replaces an iterrows() loop that wrote through
# `df['SFUI'].iat[i] = ...`, a chained assignment that does not reach `df` under
# pandas Copy-on-Write, and that mixed label (.at) with positional (.iat) indexing.
starts_new_sf = df['SF'] != df['SF'].shift()
df['SFUI'] = starts_new_sf.cumsum()

#### Format SFUI

In [22]:
# Turn each numeric short-form ID into a six-digit, zero-padded string prefixed with 'S'.
sfui_digits = df['SFUI'].map('{:06}'.format)
df['SFUI'] = 'S' + sfui_digits
df.head(5)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
0,,E294485,$Can,S000001,_can,,,Canadian dollars,,,,ADAM,,,,0.8365,18.0,,
1,,E389142,%,S000002,_,,,percent,,percent,,Berman,,,,,,,
2,,E126776,%LN,S000003,_ln,,,percent lumenal narrowing,,percent lumenal narrowing,,UMLS,E0665149,E0665148,abbreviation,,,,
3,,E126777,%LN,S000003,_ln,,,percent luminal narrowing,,percent lumenal narrowing,,UMLS,E0665149,E0665148,abbreviation,,,,
4,,E126778,%LN,S000003,_ln,,,per cent lumenal narrowing,,percent lumenal narrowing,,UMLS,E0665149,E0665148,abbreviation,,,,


## Add Long Form Unique Identifier

#### Sort by LF

In [23]:
# Order the rows by LF and renumber the index 0..N-1 so that identical long forms
# sit on consecutive rows.
df = df.sort_values(by=['LF']).reset_index(drop=True)

#### Assign LFUI

In [24]:
# Give every run of identical, adjacent LF values one integer ID, counting up from 1.
# This relies on df already being sorted by LF so equal values are neighbours.
# A row starts a new ID whenever its LF differs from the row before it; the first row
# always does, because shift() leaves a missing value there.
# The vectorized form replaces an iterrows() loop that wrote through
# `df['LFUI'].iat[i] = ...`, a chained assignment that does not reach `df` under
# pandas Copy-on-Write, and that mixed label (.at) with positional (.iat) indexing.
starts_new_lf = df['LF'] != df['LF'].shift()
df['LFUI'] = starts_new_lf.cumsum()

#### Format LFUI

In [25]:
# Turn each numeric long-form ID into a six-digit, zero-padded string prefixed with 'L'.
lfui_digits = df['LFUI'].map('{:06}'.format)
df['LFUI'] = 'L' + lfui_digits
df.head(5)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
0,,E315532,DNIC,S019528,dnic,,,'diffuse noxious inhibitory controls',L000001,diffuse noxious inhibitory control,,ADAM,,,,0.7571,95.0,,
1,,E315587,DNR,S019572,dnr,,,'do not resuscitate',L000002,do not resuscitate,,ADAM,,,,0.5856,196.0,,
2,,E135924,PHNO,S054986,phno,,,(+)-4-propyl-9-hydroxynaphthoxazine,L000003,,,UMLS,E0672582,,acronym,,,,
3,,E354603,PHNO,S054986,phno,,,(+)-4-propyl-9-hydroxynaphthoxazine,L000003,,,ADAM,,,,0.5417,14.0,,
4,,E342751,MK-801,S044573,mk_801,,,"(+)-5-methyl-10,11-dihydro-5H-dibenzo[a,d]cycl...",L000004,,,ADAM,,,,0.0146,15.0,,


## Add Normalized Short Form Unique Identifier

#### Sort by NormSF

In [26]:
# Order the rows by NormSF and renumber the index 0..N-1 so that identical normalized
# short forms sit on consecutive rows.
df = df.sort_values(by=['NormSF']).reset_index(drop=True)

#### Assign NSFUI

In [27]:
# Give every run of identical, adjacent NormSF values one integer ID, counting up from 1.
# This relies on df already being sorted by NormSF so equal values are neighbours.
# A row starts a new ID whenever its NormSF differs from the row before it; the first
# row always does, because shift() leaves a missing value there.
# The vectorized form replaces an iterrows() loop that wrote through
# `df['NSFUI'].iat[i] = ...`, a chained assignment that does not reach `df` under
# pandas Copy-on-Write, and that mixed label (.at) with positional (.iat) indexing.
starts_new_norm_sf = df['NormSF'] != df['NormSF'].shift()
df['NSFUI'] = starts_new_norm_sf.cumsum()

#### Format NSFUI

In [28]:
# Turn each numeric normalized-short-form ID into a six-digit, zero-padded string prefixed with 'N'.
nsfui_digits = df['NSFUI'].map('{:06}'.format)
df['NSFUI'] = 'N' + nsfui_digits
df.head(5)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
0,,E287800,10 m walk test,S000333,10 m walk test,N000001,,ten-metre walk test,L155194,10 Meter Walk Test,,UMLS,E0765285,E0765283,abbreviation,,,,
1,,E287796,10 m walk test,S000333,10 m walk test,N000001,,10-metre walk test,L000561,10 Meter Walk Test,,UMLS,E0765285,E0765283,abbreviation,,,,
2,,E287799,10 m walk test,S000333,10 m walk test,N000001,,ten-meter walk test,L155193,10 Meter Walk Test,,UMLS,E0765285,E0765283,abbreviation,,,,
3,,E287795,10 m walk test,S000333,10 m walk test,N000001,,10-meter walk test,L000560,10 Meter Walk Test,,UMLS,E0765285,E0765283,abbreviation,,,,
4,,E287792,10 m walk test,S000333,10 m walk test,N000001,,ten metre walk test,L155188,10 Meter Walk Test,,UMLS,E0765285,E0765283,abbreviation,,,,


## Export

In [29]:
# Put the rows back in EntryID order and renumber the index before exporting.
df = df.sort_values(by=['EntryID']).reset_index(drop=True)

In [30]:
# Write the enriched crosswalk to the working directory as a pipe-delimited file,
# with a header row and without the DataFrame index.
df.to_csv('Step2Output_Clinical_Abbreviation_Acronym_Crosswalk.csv',
          index=False,
          header=True,
          sep='|')