# Step 2.5: Generate Training Set

## Setup

In [1]:
import pandas as pd
import numpy as np
import itertools
from fuzzywuzzy import fuzz

#### Load Dataset

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/modules/Step2Output_Clinical_Abbreviation_Acronym_Crosswalk.csv',
                 sep='|',
                 header=0,
                 index_col=False,
                 na_filter=False,
                 dtype=object)

In [3]:
df.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI
33582,,E033583,α-GPDH,S103760,α_gpdh,,alpha-glycerophosphate dehydrogenase,L041419,alpha glycerophosphate dehydrogenase,,UMLS,E0412935,E0769306,acronym,,,,
70982,,E070983,PRPPs,S056625,prpps,,phosphoribosyl pyrophosphate synthetase,L128319,phosphoribosylpyrophosphate synthetase,,UMLS,E0571504,E0047486,acronym,,,,
192802,,E192803,IPH,S036667,iph,,intra-peritoneal haemorrhage,L097757,,,UMLS,E0703883,E0703882,acronym,,,,


## Initial Filtering

Identifies **most similar string** that is **not equivalent**.

We used the standard Levenshtein distance similarity ratio available [here](https://github.com/seatgeek/fuzzywuzzy).

#### Use Normalized Long Form Where Possible

In [4]:
df['TrainLF'] = np.where(df['NormLF']=='', df['LF'], df['NormLF'])

In [5]:
df.sample(3, random_state=0)

Unnamed: 0,GroupID,EntryID,SF,SFUI,NormSF,PrefSF,LF,LFUI,NormLF,PrefLF,Source,SFEUI,LFEUI,Type,Score,Count,Frequency,UMLS.CUI,TrainLF
33582,,E033583,α-GPDH,S103760,α_gpdh,,alpha-glycerophosphate dehydrogenase,L041419,alpha glycerophosphate dehydrogenase,,UMLS,E0412935,E0769306,acronym,,,,,alpha glycerophosphate dehydrogenase
70982,,E070983,PRPPs,S056625,prpps,,phosphoribosyl pyrophosphate synthetase,L128319,phosphoribosylpyrophosphate synthetase,,UMLS,E0571504,E0047486,acronym,,,,,phosphoribosylpyrophosphate synthetase
192802,,E192803,IPH,S036667,iph,,intra-peritoneal haemorrhage,L097757,,,UMLS,E0703883,E0703882,acronym,,,,,intra-peritoneal haemorrhage


#### Extract Unique Long Forms

In [6]:
uniq_LFs = set(df['TrainLF'].values)

#### Compute Similarities

In [7]:
train = []

In [8]:
for pair in itertools.combinations(uniq_LFs, 2):
    distance = fuzz.ratio(pair[0], pair[1])
    if distance > 80:
        pair_dictionary = {'LF1':pair[0], 'LF2':pair[1], 'distance':distance} 
        train.append(pair_dictionary)

In [9]:
train = pd.DataFrame(train)  

In [10]:
train.sample(3, random_state=0)

Unnamed: 0,LF1,LF2,distance
196922,β amyloid precursor protein cleaving enzyme-2,β amyloid precursor protein-cleaving enzyme 1,93
381491,interleukin-2 receptor α-subunit,interleukin-6 receptor α-chain,81
426807,GH binding protein,XPA binding protein 2,82


## Highly Similar

Assume highly similar long forms are synonymous (~1 character difference).

In [11]:
TrainSet2 = train[train.distance >= 98]
TrainSet2 = TrainSet2.drop(columns=['distance'])
TrainSet2['Synonym'] = 'Y'

In [12]:
TrainSet2.sample(3, random_state=0)

Unnamed: 0,LF1,LF2,Synonym
95314,peroxisome proliferator-activated receptor-gam...,peroxisome proliferator-activated receptor gam...,Y
119423,receptor-like protein tyrosine phosphatase-eta,receptor-like protein tyrosine phosphatase-delta,Y
401419,post-menopausal estrogen-progestin intervention,post-menopausal oestrogen-progestin intervention,Y


#### Export

In [13]:
TrainSet2.to_csv('TrainingSet2_Annotated.csv',
                 index=False,
                 header=True,
                 sep='|')

## Random Subset

Select random subset of moderately similar long forms for manual annotation.

In [14]:
TrainSet1 = train[train.distance < 98]
TrainSet1 = TrainSet1.sample(frac=0.10)
TrainSet1 = TrainSet1.drop(columns=['distance'])
TrainSet1['Synonym'] = ''

In [15]:
TrainSet1.sample(3, random_state=0)

Unnamed: 0,LF1,LF2,Synonym
226968,stress-activated protein kinase-4,Mitogen- and stress-activated protein kinase-1,
205483,dioleoylphosphatidylcholine,diphytanoylphosphatidylcholine,
56127,A disintegrin and metalloproteinase-3,a disintegrin and metalloprotease-6,


#### Export

In [16]:
TrainSet1.to_csv('TrainingSet1.csv',
                 index=False,
                 header=True,
                 sep='|')