# Similarity Control using direct string compare

In [None]:
import pandas as pd
import configparser
import os
import jellyfish as jf
from fuzzywuzzy import utils
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
from nltk.tokenize import word_tokenize

# Load notebook settings from the INI file one directory above the notebook.
# NOTE: ConfigParser.read() silently skips files it cannot find, so a wrong
# path only surfaces later as a KeyError when an option is looked up.
config_file = '../config.ini'
config = configparser.ConfigParser()
config.read(config_file)

def calculate_score(distance, x, y):
    """Convert an edit distance into a percentage similarity score.

    The distance is normalised by the length of the longer of the two
    compared strings: a distance of 0 yields 100, and a distance equal to
    that length yields 0.

    Args:
        distance: Edit distance between ``x`` and ``y``.
        x: First compared string (only its length is used).
        y: Second compared string (only its length is used).

    Returns:
        ``round((1 - distance / longest) * 100)`` as an int. Two empty
        strings score 100 instead of raising ``ZeroDivisionError``.
    """
    longest = max(len(x), len(y))
    if longest == 0:
        # Nothing to normalise by; two empty strings are an exact match.
        return 100
    return round((1 - distance / longest) * 100)

def extract(query, choices, processor=utils.full_process, scorer=jf.levenshtein_distance, limit=2, distance=True):
    """Return the best-scoring matches for ``query`` among ``choices``.

    The query is normalised with ``processor`` and then stripped of stop
    words with the module-level ``remove_stop_words``; every choice is
    normalised with ``processor`` only. Each choice is then compared with
    the cleaned query using ``scorer``.

    Args:
        query: Raw query string.
        choices: pandas Series of candidate names; its index labels are
            returned with each match.
        processor: Callable applied to the query and to every choice.
        scorer: Callable ``scorer(cleaned_query, candidate)``. With
            ``distance=True`` its result is treated as an edit distance;
            with ``distance=False`` it is multiplied by 100 and rounded
            (presumably a similarity in [0, 1] -- confirm for new scorers).
        limit: Maximum number of matches to return.
        distance: Whether ``scorer`` returns a distance (see above).

    Returns:
        A list of ``(processed_name, score, index_label)`` tuples ordered
        by descending score, at most ``limit`` long.
    """
    # The cleaned query is the same for every candidate, so build it once
    # instead of once per row.
    cleaned_query = remove_stop_words(processor(query))

    tmp = choices.to_frame('name')
    tmp['name'] = tmp['name'].apply(processor)
    tmp['distance'] = tmp['name'].apply(lambda name: scorer(cleaned_query, str(name)))

    if distance:
        # Normalise against the string that was actually compared (the
        # cleaned query), not the raw query: the raw query can be longer,
        # which would inflate the score.
        tmp['score'] = [
            calculate_score(dist, cleaned_query, name)
            for dist, name in zip(tmp['distance'], tmp['name'])
        ]
    else:
        tmp['score'] = tmp['distance'].apply(lambda x: round(x * 100))

    results = tmp.sort_values(by=['score'], ascending=False).iloc[:limit]
    return [(row['name'], row['score'], key) for key, row in results.iterrows()]

def remove_stop_words(x):
    """Drop stop words from every token of ``x`` except the first.

    A string found in the module-level ``whitelist`` is returned unchanged.
    Otherwise ``x`` is split on single spaces, the first token is always
    kept, and each later token is kept only when it is not in the
    module-level ``stop_words`` list. The surviving tokens are re-joined
    with single spaces.
    """
    if x in whitelist:
        return x

    # str.split(" ") always yields at least one element, so the unpacking
    # below cannot fail, even for an empty string.
    head, *tail = x.split(" ")
    kept = [head] + [token for token in tail if token not in stop_words]
    return " ".join(kept)

In [None]:
# Cleaned BNF code table, read from the output directory named in the config.
df = pd.read_csv(
    os.path.join(config['DEFAULT']['output_dir'], 'bnf_code_clean.csv')
)

# Header-less single-column CSVs: take column 0 as a plain Python list.
# remove_stop_words() reads both names as module-level globals.
stop_words = pd.read_csv('../data/stop_words.csv', header=None)[0].values.tolist()
whitelist = pd.read_csv('../data/whitelist.csv', header=None)[0].values.tolist()

In [None]:
# Re-read the stop-word list from disk (same statement as in the data-loading
# cell); re-running this cell picks up edits to stop_words.csv.
stop_words = pd.read_csv(os.path.join('../data/stop_words.csv'), header=None)[0].values.tolist()

In [None]:
# Preview the first five stop words.
stop_words[:5]

In [None]:
# Test set whose 'bnf_chemical_substance' column is matched against ATC names.
df_test = pd.read_csv('../data/test_analysis_set.csv')

# RxNorm/ATC lookup table. Columns are renamed positionally, so the CSV must
# contain exactly these eight columns in this order.
atc_df = pd.read_csv('../data/rxnorm_atc_code_info.csv')
atc_df.columns = ['i', 'rxcui', 'rxaui', 'sab', 'tty', 'ATC code', 'ATC level name', 'suppress']
atc_df = atc_df.drop(columns='i')

# Stop-word-stripped variant of each ATC level name, used as the match target.
atc_df['name_without_sw'] = atc_df['ATC level name'].apply(remove_stop_words)

# Keep only rows whose term type is IN or RXN_IN
# (presumably ingredient-level entries -- confirm against the RxNorm TTY list).
slim_atc_df = atc_df[atc_df['tty'].isin(['IN', 'RXN_IN'])]

In [None]:
# Preview the first ten rows of the filtered ATC table.
slim_atc_df[:10]

In [None]:
# Normalise each BNF chemical substance name with fuzzywuzzy's full_process,
# the same default processor that extract() applies to queries and choices.
df_test['normalized_name'] = df_test['bnf_chemical_substance'].apply(utils.full_process)

In [None]:
# Preview the first five rows of the full ATC table.
atc_df[:5]

In [None]:
# Preview the first five rows of the test set.
df_test[:5]

In [None]:
# Candidate names as they appear in the ATC table (stop words still present).
# NOTE: a later cell rebinds `s` to the stop-word-stripped column; `tmp` is
# only a one-column DataFrame view of these names for inspection.
s = slim_atc_df['ATC level name']
tmp = s.to_frame('name')

In [None]:
# Use the stop-word-stripped names as the candidates passed to extract().
s = slim_atc_df['name_without_sw']

In [None]:
# Preview the first ten candidate names.
s[:10]

In [None]:
def control_scorer(a, b):
    """Baseline scorer: 1 when ``a`` and ``b`` are equal ignoring case, else 0.

    Intended for ``extract(..., scorer=control_scorer, distance=False)``,
    where the result is multiplied by 100 to give a score of 100 or 0.
    """
    return int(a.lower() == b.lower())

In [None]:
# Control run: exact (case-insensitive) matching only. control_scorer returns
# 1 or 0, so with distance=False every candidate scores 100 or 0.
extract('metoprolol with diuretic', s, scorer=control_scorer, limit=5, distance=False)

In [None]:
# Ten best matches for 'ferrous sulfate' using extract()'s default scorer
# (jellyfish Levenshtein distance), converted to a percentage score.
extract('ferrous sulfate', s, limit=10, distance=True)

In [None]:
def remove_stop_words(x):
    """Drop stop words from every token of ``x`` except the first.

    This cell re-declares the helper defined in the first cell with the same
    behaviour. A string found in the module-level ``whitelist`` is returned
    unchanged. Otherwise ``x`` is split on single spaces, the first token is
    always kept, and each later token is kept only when it is not in the
    module-level ``stop_words`` list.
    """
    if x in whitelist:
        return x

    # str.split(" ") always yields at least one element, so the unpacking
    # below cannot fail, even for an empty string.
    head, *tail = x.split(" ")
    kept = [head] + [token for token in tail if token not in stop_words]
    return " ".join(kept)

In [None]:
# Five best matches for the query; each entry is (processed name, score, index label).
m = extract('Medroxyprogesterone Acetate', s, limit=5, distance=True)

# Swap the processed name for the original ATC level name, looked up by the
# index label that extract() returns as the third tuple element.
key2 = 'ATC level name'
m_revised = [
    (slim_atc_df.loc[label, key2], score, label)
    for _, score, label in m
]
m_revised

In [None]:
# Spot-check stop-word removal on a two-token name. The comparison against
# stop_words is case-sensitive, so 'Acetate' is only dropped if the list
# contains it with this exact capitalisation.
remove_stop_words('Medroxyprogesterone Acetate')

In [None]:
# Spot-check stop-word removal on a mixed-case name (the match against
# stop_words is case-sensitive).
remove_stop_words('metoprolol Succinate')

In [None]:
# Look up the original ATC level name for index label 4928
# (presumably a label returned by one of the extract() calls above -- confirm).
slim_atc_df.loc[4928,'ATC level name']

In [None]:
# Preview the first five stop words again.
stop_words[:5]