# Generate Training Set (Part 2)

This training set compares long forms within each NSF.

## Setup

In [1]:
import pandas as pd
import numpy as np
import itertools
from fuzzywuzzy import fuzz

#### Load Dataset

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/modules/Step2Output_Clinical_Abbreviation_Acronym_Crosswalk.csv',
                 sep='|',
                 header=0,
                 index_col=False,
                 na_filter=False,
                 dtype=object)

In [3]:
df.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
33582,,E033583,α-GPDH,S103760,α_gpdh,N076113,,alpha-glycerophosphate dehydrogenase,L041419,alpha glycerophosphate dehydrogenase,,UMLS,E0412935,E0769306,acronym,,,,
70982,,E070983,PRPPs,S056625,prpps,N057119,,phosphoribosyl pyrophosphate synthetase,L128319,phosphoribosylpyrophosphate synthetase,,UMLS,E0571504,E0047486,acronym,,,,
192802,,E192803,IPH,S036667,iph,N036838,,intra-peritoneal haemorrhage,L097757,,,UMLS,E0703883,E0703882,acronym,,,,


## Initial Filtering

Identifies **most similar string** that is **not equivalent** within each **NSF**.

We used the standard Levenshtein distance similarity ratio available [here](https://github.com/seatgeek/fuzzywuzzy).

#### Use Normalized Long Form Where Possible

In [4]:
df['TrainLF'] = np.where(df['NormLF']=='', df['LF'], df['NormLF'])

In [5]:
df.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,TrainLF
33582,,E033583,α-GPDH,S103760,α_gpdh,N076113,,alpha-glycerophosphate dehydrogenase,L041419,alpha glycerophosphate dehydrogenase,,UMLS,E0412935,E0769306,acronym,,,,,alpha glycerophosphate dehydrogenase
70982,,E070983,PRPPs,S056625,prpps,N057119,,phosphoribosyl pyrophosphate synthetase,L128319,phosphoribosylpyrophosphate synthetase,,UMLS,E0571504,E0047486,acronym,,,,,phosphoribosylpyrophosphate synthetase
192802,,E192803,IPH,S036667,iph,N036838,,intra-peritoneal haemorrhage,L097757,,,UMLS,E0703883,E0703882,acronym,,,,,intra-peritoneal haemorrhage


In [6]:
df.shape

(413964, 20)

#### Compute Similarities

In [7]:
df = df.sort_values(by=['NSFUI'])
df.head(5)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,NSFUI,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,TrainLF
287791,,E287792,10 m walk test,S000333,10 m walk test,N000001,,ten metre walk test,L155188,10 Meter Walk Test,,UMLS,E0765285,E0765283,abbreviation,,,,,10 Meter Walk Test
287790,,E287791,10 m walk test,S000333,10 m walk test,N000001,,ten meter walk test,L155187,10 Meter Walk Test,,UMLS,E0765285,E0765283,abbreviation,,,,,10 Meter Walk Test
287789,,E287790,10 m walk test,S000333,10 m walk test,N000001,,Ten Metre Walk Test,L033652,10 Meter Walk Test,,UMLS,E0765285,E0765283,abbreviation,,,,,10 Meter Walk Test
287788,,E287789,10 m walk test,S000333,10 m walk test,N000001,,Ten Meter Walk Test,L033651,10 Meter Walk Test,,UMLS,E0765285,E0765283,abbreviation,,,,,10 Meter Walk Test
287787,,E287788,10 m walk test,S000333,10 m walk test,N000001,,10 metre walk test,L000549,10 Meter Walk Test,,UMLS,E0765285,E0765283,abbreviation,,,,,10 Meter Walk Test


In [8]:
train = []

In [9]:
for NSFUI in df.NSFUI.unique():
    uniq_LFs = set(df.loc[df['NSFUI'] == NSFUI]['TrainLF'].values)
    for pair in itertools.combinations(uniq_LFs, 2):
        distance = fuzz.ratio(pair[0], pair[1])
        if distance > 80:
            pair_dictionary = {'LF1':pair[0], 'LF2':pair[1], 'distance':distance} 
            train.append(pair_dictionary)

In [10]:
train = pd.DataFrame(train)  

In [11]:
train.sample(3, random_state=0)

Unnamed: 0,LF1,LF2,distance
350681,protein kinase C betaII,protein kinase C β-II,86
134297,β-2 glycoprotein I,β(2)glycoprotein-I,83
306207,mast-cell-degranulating,mast cell-degranulating,96


## Random Subset

Select random subset of highly and moderately similar long forms for manual annotation.

In [12]:
TrainSet3 = train[train.distance < 99]
TrainSet3 = TrainSet3.sample(frac=0.10)
TrainSet3 = TrainSet3.drop(columns=['distance'])
TrainSet3['Synonym'] = ''

In [13]:
TrainSet3.sample(3, random_state=0)

Unnamed: 0,LF1,LF2,Synonym
212464,glucagon-like peptide,glucagon like peptide,
297868,light-harvesting chlorophyll a/b binding prote...,light harvesting chlorophyll protein II,
360159,peroxisome proliferator-activated receptor-γ2,peroxisome proliferator-activated receptor gam...,


#### Export

In [14]:
TrainSet3.to_csv('TrainingSet3.csv',
                 index=False,
                 header=True,
                 sep='|')