# Generate Training Set (Part 2)

This training set compares long forms within each NSF.

## Setup

In [None]:
import pandas as pd
import numpy as np
import itertools
from fuzzywuzzy import fuzz

#### Load Dataset

In [None]:
# Crosswalk CSV hosted in the clinical-abbreviations repository on GitHub.
crosswalk_url = 'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/modules/Step2Output_Clinical_Abbreviation_Acronym_Crosswalk.csv'

# Pipe-delimited file whose first row is the header. Every column is read as
# object dtype, and na_filter=False keeps blank cells as '' rather than NaN.
df = pd.read_csv(
    crosswalk_url,
    sep='|',
    header=0,
    index_col=False,
    na_filter=False,
    dtype=object,
)

In [None]:
# Preview three rows; the fixed seed shows the same rows on every run.
df.sample(n=3, random_state=0)

## Initial Filtering

Identifies the **most similar strings** that are **not equivalent** within each **NSF** (pairs of distinct long forms whose similarity ratio is above 80).

We used the standard Levenshtein-based similarity ratio (`fuzz.ratio`) from the [fuzzywuzzy](https://github.com/seatgeek/fuzzywuzzy) library.

#### Use Normalized Long Form Where Possible

In [None]:
# Use the normalized long form, falling back to the raw LF wherever NormLF is blank.
df['TrainLF'] = df['NormLF'].mask(df['NormLF'] == '', df['LF'])

In [None]:
# Preview three seeded rows to spot-check the new TrainLF column.
df.sample(n=3, random_state=0)

In [None]:
# Show the (rows, columns) dimensions of df.
df.shape

#### Compute Similarities

In [None]:
# Order rows by NSFUI so entries sharing an identifier sit together,
# then show the first five rows.
df = df.sort_values('NSFUI')
df.head()

In [None]:
# Accumulator for the similar long-form pairs; one dict is appended per pair.
train = []

In [None]:
# Rebuild the pair list from scratch so re-running this cell never appends
# duplicates to results left over from a previous run.
train = []

# Group once by NSFUI instead of re-filtering the whole frame for every
# identifier. sort=False yields groups in order of first appearance, which is
# the same order df.NSFUI.unique() would give.
for _, long_forms in df.groupby('NSFUI', sort=False)['TrainLF']:
    # Deduplicate, then sort: iterating a raw set of strings can yield a
    # different order on each interpreter run, which would change which string
    # lands in LF1 vs LF2 and the order of the output rows.
    uniq_LFs = sorted(set(long_forms))
    for lf1, lf2 in itertools.combinations(uniq_LFs, 2):
        # fuzz.ratio is a 0-100 similarity score (higher = more alike). It is
        # stored under the key 'distance' because later cells read that column.
        similarity = fuzz.ratio(lf1, lf2)
        if similarity > 80:
            train.append({'LF1': lf1, 'LF2': lf2, 'distance': similarity})

In [None]:
# Name the columns explicitly so the frame still exposes LF1/LF2/distance when
# no pair passed the similarity threshold; an empty list would otherwise give a
# column-less frame and break the later `train.distance` filter.
train = pd.DataFrame(train, columns=['LF1', 'LF2', 'distance'])

In [None]:
# Preview three seeded rows of the candidate pairs.
train.sample(n=3, random_state=0)

## Random Subset

Select a random subset of highly and moderately similar long forms for manual annotation.

In [None]:
# Build the annotation sheet:
#   1. keep pairs scoring below 99, dropping the (near-)identical ones;
#   2. draw a 10% random sample, seeded like the other sample() calls in this
#      notebook so the exported set can be regenerated exactly;
#   3. drop the score column and add an empty 'Synonym' column to be filled in
#      during manual annotation.
TrainSet3 = (
    train[train['distance'] < 99]
    .sample(frac=0.10, random_state=0)
    .drop(columns=['distance'])
    .assign(Synonym='')
)

In [None]:
# Preview three seeded rows of the annotation subset.
TrainSet3.sample(n=3, random_state=0)

#### Export

In [None]:
# Write the annotation subset as a pipe-delimited file with a header row and
# without the DataFrame index.
TrainSet3.to_csv('TrainingSet3.csv', sep='|', header=True, index=False)