# Step 2.5: Generate Training Set

## Setup

In [None]:
import pandas as pd
import numpy as np
import itertools
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt

#### Load Dataset

In [None]:
# Pipe-delimited crosswalk (presumably the Step 2 output, going by the file
# name). dtype=object with na_filter=False reads every field verbatim, so
# blank cells arrive as '' rather than NaN.
df = pd.read_csv(
    'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/modules/Step2Output_Clinical_Abbreviation_Acronym_Crosswalk.csv',
    sep='|',
    header=0,
    index_col=False,
    dtype=object,
    na_filter=False,
)

In [None]:
# Preview three rows; the fixed seed keeps the displayed rows stable.
df.sample(n=3, random_state=0)

## Initial Filtering

Identifies pairs of long forms that are **highly similar** (similarity ratio above 80) but **not identical**.

We use the Levenshtein-based similarity ratio (`fuzz.ratio`, scored from 0 to 100) available [here](https://github.com/seatgeek/fuzzywuzzy).

#### Use Normalized Long Form Where Possible

In [None]:
# Prefer the normalized long form; fall back to the raw long form when the
# normalized one is blank.
df['TrainLF'] = np.where(df['NormLF'] != '', df['NormLF'], df['LF'])

In [None]:
# Spot-check the new TrainLF column on the same three seeded rows.
df.sample(n=3, random_state=0)

#### Extract Unique Long Forms

In [None]:
# Collapse duplicate long forms so each string is compared only once.
uniq_LFs = set(df['TrainLF'].tolist())

#### Compute Similarities

In [None]:
# Accumulator for the candidate pairs; one dict per retained pair.
train = list()

In [None]:
# Score every unordered pair of unique long forms and keep the close ones.
# Iterating over a sorted copy makes the pair order (and which string lands
# in LF1 vs. LF2) identical on every run; iterating the set directly depends
# on string hashing, which varies between interpreter sessions.
for lf1, lf2 in itertools.combinations(sorted(uniq_LFs), 2):
    # fuzz.ratio is a 0-100 similarity score (higher = more alike). It is
    # stored under the 'distance' key because later cells filter on that name.
    similarity = fuzz.ratio(lf1, lf2)
    if similarity > 80:
        train.append({'LF1': lf1, 'LF2': lf2, 'distance': similarity})

In [None]:
# Convert the collected pair dicts into a table. Naming the columns explicitly
# keeps the LF1 / LF2 / distance schema even when no pair passed the
# threshold, so later `train.distance` lookups still resolve.
train = pd.DataFrame(train, columns=['LF1', 'LF2', 'distance'])

In [None]:
# Preview three seeded rows of the candidate-pair table.
train.sample(n=3, random_state=0)

## Highly Similar

Assume highly similar long forms are synonymous (~1 character difference).

In [None]:
# Pairs scoring 98 or higher are labelled as synonyms automatically; the
# score column is dropped from this set.
TrainSet2 = train.loc[train['distance'] >= 98].drop('distance', axis=1)
TrainSet2['Synonym'] = 'Y'

In [None]:
# Preview three seeded rows of the auto-labelled set.
TrainSet2.sample(n=3, random_state=0)

#### Export

In [None]:
# Write the auto-labelled pairs as a pipe-delimited file with a header row
# and no index column.
TrainSet2.to_csv(
    'TrainingSet2_Annotated.csv',
    sep='|',
    header=True,
    index=False,
)

## Random Subset

Select random subset of moderately similar long forms for manual annotation.

In [None]:
# Pairs scoring below 98 (and above 80, per the earlier threshold) are
# ambiguous, so a 10% sample is set aside for manual labelling.
TrainSet1 = train[train.distance < 98]
# Seeded like the notebook's other sample() calls so the draw is repeatable
# for a given `train`; unseeded, every run exported a different subset.
TrainSet1 = TrainSet1.sample(frac=0.10, random_state=0)
TrainSet1 = TrainSet1.drop(columns=['distance'])
TrainSet1['Synonym'] = ''  # blank label, to be filled in during annotation

In [None]:
# Preview three seeded rows of the set awaiting annotation.
TrainSet1.sample(n=3, random_state=0)

#### Export

In [None]:
# Write the unlabelled pairs as a pipe-delimited file with a header row and
# no index column.
TrainSet1.to_csv(
    'TrainingSet1.csv',
    sep='|',
    header=True,
    index=False,
)