# Generate Training Set (Part 2)

These training sets compare long forms within each normalized short form (NSF).

## Setup

In [1]:
import pandas as pd
import numpy as np
import itertools
from fuzzywuzzy import fuzz

#### Load Dataset

In [2]:
# Read the pipe-delimited Step 2 crosswalk. Every column is loaded as a plain
# Python object (string) and empty fields are kept as '' rather than NaN.
df = pd.read_csv(
    'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/modules/Step2Output_Clinical_Abbreviation_Acronym_Crosswalk.csv',
    dtype=object,
    na_filter=False,
    index_col=False,
    header=0,
    sep='|',
)

In [3]:
# Show three rows of the loaded crosswalk; the fixed seed keeps the preview stable.
df.sample(n=3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
33582,,E033583,α-GPDH,S103760,α_gpdh,N077282,,alpha-glycerophosphate dehydrogenase,L041419,alpha glycerophosphate dehydrogenase,,UMLS,E0412935,E0769306,acronym,,,,
70982,,E070983,PRPPs,S056625,prpps,N057988,,phosphoribosyl pyrophosphate synthetase,L128319,phosphoribosylpyrophosphate synthetase,,UMLS,E0571504,E0047486,acronym,,,,
192802,,E192803,IPH,S036667,iph,N037427,,intra-peritoneal haemorrhage,L097757,,,UMLS,E0703883,E0703882,acronym,,,,


## Initial Filtering

Identifies pairs of **highly similar long forms** that are **not equivalent** within each **NSF**.

We used the standard Levenshtein-distance similarity ratio (`fuzz.ratio`) from the [fuzzywuzzy](https://github.com/seatgeek/fuzzywuzzy) library.

#### Use Normalized Long Form Where Possible

In [4]:
# Take the normalized long form, substituting the raw long form wherever the
# normalized value is the empty string.
df['TrainLF'] = df['NormLF'].mask(df['NormLF'] == '', df['LF'])

In [5]:
# Same seeded three-row preview, now including the derived TrainLF column.
df.sample(n=3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,TrainLF
33582,,E033583,α-GPDH,S103760,α_gpdh,N077282,,alpha-glycerophosphate dehydrogenase,L041419,alpha glycerophosphate dehydrogenase,,UMLS,E0412935,E0769306,acronym,,,,,alpha glycerophosphate dehydrogenase
70982,,E070983,PRPPs,S056625,prpps,N057988,,phosphoribosyl pyrophosphate synthetase,L128319,phosphoribosylpyrophosphate synthetase,,UMLS,E0571504,E0047486,acronym,,,,,phosphoribosylpyrophosphate synthetase
192802,,E192803,IPH,S036667,iph,N037427,,intra-peritoneal haemorrhage,L097757,,,UMLS,E0703883,E0703882,acronym,,,,,intra-peritoneal haemorrhage


In [6]:
# NOTE(review): everything below runs on only the first MAX_ROWS rows of the
# crosswalk (taken in file order, before the sort by NSFUI). This looks like a
# development shortcut -- set MAX_ROWS to None to process the full dataset.
MAX_ROWS = 1000
df = df if MAX_ROWS is None else df.head(MAX_ROWS)
df.shape

(1000, 20)

#### Compute Similarities

In [8]:
# Order the rows by normalized short-form identifier so that entries sharing
# an NSFUI sit next to each other, then show the first five rows.
df = df.sort_values('NSFUI')
df.head()

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,TrainLF
67,,E000068,A.A.M.D.,S002919,a_a_m_d_,N002781,,American Association on Mental Deficiency,L004532,,,UMLS,E0000050,E0000277,acronym,,,,,American Association on Mental Deficiency
1,,E000002,AA,S003081,aa,N003003,,Alcoholics Anonymous,L004250,,,UMLS,E0000048,E0000204,acronym,,,,,Alcoholics Anonymous
2,,E000003,AA,S003081,aa,N003003,,alcohol abuse,L040702,alcohol abuse,,UMLS,E0000048,E0356324,acronym,,,,,alcohol abuse
3,,E000004,AA,S003081,aa,N003003,,alcohol-abuse,L040752,alcohol abuse,,UMLS,E0000048,E0356324,acronym,,,,,alcohol abuse
4,,E000005,AA,S003081,aa,N003003,,aortic aneurysm,L045559,aortic aneurysm,,UMLS,E0000048,E0009858,acronym,,,,,aortic aneurysm


In [None]:
# Accumulator for candidate long-form pairs (one dict per pair).
train = list()

In [None]:
# For every normalized short form (NSFUI), score each unordered pair of distinct
# long forms and keep the pairs whose similarity ratio exceeds 60.
#
# Rows are selected with groupby: `df.at[...]` is a scalar accessor and cannot
# take a boolean mask, and one groupby pass avoids rescanning the whole frame
# once per NSFUI.
for NSFUI, group_LFs in df.groupby('NSFUI', sort=False)['TrainLF']:
    # De-duplicate, then sort: iteration order of a bare set of strings is not
    # stable between interpreter runs, which would make the pair order (and
    # which string ends up as LF1 vs. LF2) irreproducible.
    uniq_LFs = sorted(set(group_LFs))
    for LF1, LF2 in itertools.combinations(uniq_LFs, 2):
        # fuzz.ratio is a similarity score (higher means more alike). The
        # 'distance' key name is kept because later cells filter on it.
        distance = fuzz.ratio(LF1, LF2)
        if distance > 60:
            train.append({'LF1': LF1, 'LF2': LF2, 'distance': distance})

In [None]:
# Convert the collected pair dicts into a DataFrame. The columns are pinned so
# the frame still exposes 'LF1', 'LF2' and 'distance' when no pair passed the
# similarity threshold (an empty list would otherwise yield a column-less frame
# and break the `train.distance` filter further down).
train = pd.DataFrame(train, columns=['LF1', 'LF2', 'distance'])

In [None]:
# Seeded three-row preview of the scored long-form pairs.
train.sample(n=3, random_state=0)

## Random Subset

Select a random subset of highly and moderately similar long forms for manual annotation.

In [None]:
# Build the annotation sheet:
#   1. keep pairs whose similarity score is below 98,
#   2. draw a 10% random sample -- seeded, like the other sample() calls in this
#      notebook, so a rerun exports the same rows,
#   3. drop the score column,
#   4. add an empty 'Synonym' column for the annotator to fill in.
TrainSet3 = (
    train[train.distance < 98]
    .sample(frac=0.10, random_state=0)
    .drop(columns=['distance'])
    .assign(Synonym='')
)

In [None]:
# Preview the annotation sheet built in this notebook. The original cell
# referenced `TrainSet1`, which is never defined here and raised a NameError.
TrainSet3.sample(3, random_state=0)

#### Export

In [None]:
# Write the annotation sheet as a pipe-delimited file with a header row and
# without the DataFrame index.
TrainSet3.to_csv(
    'TrainingSet3.csv',
    sep='|',
    header=True,
    index=False,
)