In [None]:
import pandas as pd
import configparser
import os
import jellyfish as jf
from fuzzywuzzy import utils
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
import py_stringmatching as sm

# Read notebook settings from the INI file one directory above the notebook.
# `config` is used by a later cell to look up config['DEFAULT']['output_dir'].
# NOTE(review): ConfigParser.read() is documented to skip files it cannot open,
# so a wrong path would only surface at that later lookup - confirm the path.
config_file = '../config.ini'
config = configparser.ConfigParser()
config.read(config_file)

def calculate_score(distance, x, y):
    """Turn a distance between ``x`` and ``y`` into a percentage-style score.

    The distance is divided by the length of the longer input, subtracted
    from 1, scaled by 100 and rounded.

    Parameters
    ----------
    distance : number
        Distance between ``x`` and ``y`` (e.g. an edit distance).
    x, y : sized
        The two compared values; only their lengths are used.

    Returns
    -------
    int
        ``round((1 - distance / max(len(x), len(y))) * 100)``, or 100 when
        both inputs are empty.
    """
    longest = max(len(x), len(y))
    if longest == 0:
        # Two empty inputs are identical; without this guard the
        # normalisation below divides by zero.
        return 100
    return round((1 - distance / longest) * 100)

def extract(query, choices, processor=utils.full_process, scorer=jf.levenshtein_distance, limit=2, distance=True):
    """Rank ``choices`` against ``query`` and return the best matches.

    Parameters
    ----------
    query : str
        Text to look up.
    choices : pandas.Series
        Candidates; must support ``to_frame``. Index labels are returned
        with every match.
    processor : callable
        Applied to the query and to every choice before scoring.
    scorer : callable
        Called as ``scorer(processed_query, str(processed_choice))``.
    limit : int
        Number of top-scoring rows to return.
    distance : bool
        If true, the scorer output is converted with ``calculate_score``;
        otherwise it is multiplied by 100 and rounded.

    Returns
    -------
    list of tuple
        ``(processed_name, score, index_label)`` sorted by descending score.
    """
    # Process the query once instead of once per candidate row.
    processed_query = processor(query)

    candidates = choices.to_frame('name')
    candidates['name'] = candidates['name'].apply(lambda x: processor(x))
    candidates['distance'] = candidates['name'].apply(
        lambda x: scorer(processed_query, str(x))
    )

    if distance:
        def to_score(row):
            # Both strings empty after processing: treat as identical rather
            # than normalising by a zero length.
            if not processed_query and not row['name']:
                return 100
            # Normalise by the strings that were actually compared; the raw
            # query can differ in length from its processed form.
            return calculate_score(row['distance'], processed_query, row['name'])

        candidates['score'] = candidates.apply(to_score, axis=1)
    else:
        candidates['score'] = candidates['distance'].apply(lambda x: round(x * 100))

    top = candidates.sort_values(by=['score'], ascending=False).iloc[:limit]
    return [(row['name'], row['score'], key) for key, row in top.iterrows()]

def remove_stop_words(x):
    """Return ``x`` with every token found in ``stopwords`` removed.

    The text is split with ``word_tokenize`` and the surviving tokens are
    re-joined with single spaces.

    NOTE(review): ``word_tokenize`` and ``stopwords`` are neither imported nor
    defined in the visible cells; they must be in scope before this is called.
    """
    kept = []
    for token in word_tokenize(x):
        if token not in stopwords:
            kept.append(token)
    return " ".join(kept)

In [None]:
# Load bnf_code_clean.csv from the output directory named in the config file.
df = pd.read_csv(
    os.path.join(config['DEFAULT']['output_dir'], 'bnf_code_clean.csv')
)

In [None]:
# Test set of substances to match, and the RxNorm/ATC lookup table.
df_test = pd.read_csv('../data/test_analysis_set.csv')
atc_df = pd.read_csv('../data/rxnorm_atc_code_info.csv')

# Give the lookup table readable column names, then discard the 'i' column.
atc_df.columns = ['i', 'rxcui', 'rxaui', 'sab', 'tty', 'ATC code', 'ATC level name', 'suppress']
atc_df = atc_df.drop(columns='i')

In [None]:
# Preview the first five rows of the ATC lookup table.
atc_df.head()

In [None]:
# Store each substance name, passed through fuzzywuzzy's full_process, as 'normalized_name'.
df_test['normalized_name'] = df_test['bnf_chemical_substance'].apply(utils.full_process)

In [None]:
# Inspect df_test (now including 'normalized_name') via DataFrame.info().
df_test.info()

In [None]:
# `s` is the 'ATC level name' column; later cells pass it as the choices to match against.
s = atc_df['ATC level name']
# `tmp` is a one-column frame ('name') built from `s`, used for the manual scoring cells below.
tmp = s.to_frame('name')

In [None]:
# Score every ATC name against a fixed query; str() guards against non-string cells.
# NOTE(review): newer jellyfish releases appear to rename jaro_distance to
# jaro_similarity - confirm against the installed version.
tmp['distance'] = tmp['name'].map(
    lambda name: jf.jaro_distance('esomeprazole', str(name))
)

In [None]:
# Preview the first six scored rows.
tmp.head(6)

In [None]:
# Recompute the jellyfish score for the fixed query, then scale it to a rounded value out of 100.
tmp['distance'] = tmp['name'].map(
    lambda name: jf.jaro_distance('esomeprazole', str(name))
)
tmp['score'] = tmp['distance'].map(lambda value: round(value * 100))
# Alternative kept for reference (score via calculate_score):
#atc_df['score'] = atc_df.apply(lambda x: calculate_score(x['distance'], 'nizatidine', x['ATC level name']), axis=1)

In [None]:
# Order candidates best-first and show the top five.
tmp = tmp.sort_values(by=['score'], ascending=False)
tmp.head(5)

In [None]:
# Same scoring as the `tmp` cells, written onto atc_df itself.
# str() is applied to each name, matching the other scoring cells, so a
# non-string cell does not make the jellyfish call raise.
atc_df['distance'] = atc_df['ATC level name'].apply(lambda x: jf.jaro_distance('esomeprazole', str(x)))
atc_df['score'] = atc_df['distance'].apply(lambda x: round(x*100))
# Alternative kept for reference (score via calculate_score):
#atc_df['score'] = atc_df.apply(lambda x: calculate_score(x['distance'], 'nizatidine', x['ATC level name']), axis=1)

In [None]:
# Keep the first five rows of `tmp` for the tuple-building cell below.
results = tmp.iloc[:5]
results

In [None]:
# Build (name, score, index label) tuples from the rows in `results`.
best_results = [
    (row['name'], row['score'], idx)
    for idx, row in results.iterrows()
]

best_results

In [None]:
# Top five matches for 'Nizatidine'; distance=False scales the scorer output by 100 directly.
extract(
    'Nizatidine',
    s,
    scorer=jf.jaro_winkler,
    limit=5,
    distance=False,
)

In [None]:
# Same lookup through fuzzywuzzy's own process.extract, for comparison.
process.extract(
    'nizatidine',
    s,
    scorer=fuzz.ratio,
    limit=5,
)

In [None]:
# Cosine similarity between the q-gram tokens of two drug names.
scorer = sm.Cosine()
ws = sm.WhitespaceTokenizer()
qgram = sm.QgramTokenizer(prefix_pad='^', suffix_pad='!')
scorer.get_sim_score(
    qgram.tokenize('nizatidine'),
    qgram.tokenize('mesna'),
)

In [None]:
def set_extract(query, choices, processor=utils.full_process, scorer=sm.Cosine, tokenizer=sm.WhitespaceTokenizer, limit=2, distance=True):
    """Rank ``choices`` against ``query`` using a token-based similarity measure.

    The query and every choice are passed through ``processor``, tokenized,
    and compared with ``scorer().get_sim_score``.

    Parameters
    ----------
    query : str
        Text to look up.
    choices : pandas.Series
        Candidates; must support ``to_frame``. Index labels are returned
        with every match.
    processor : callable
        Applied to the query and to every choice before tokenizing.
    scorer : class or zero-argument factory
        Called once with no arguments; the result must expose
        ``get_sim_score(tokens_a, tokens_b)``.
    tokenizer : tokenizer instance or class
        Object exposing ``tokenize(str)``. A class (such as the default) is
        instantiated with no arguments.
    limit : int
        Number of top-scoring rows to return.
    distance : bool
        If true, the scorer output is converted with ``calculate_score``;
        otherwise it is multiplied by 100 and rounded.
        NOTE(review): ``get_sim_score`` presumably returns a similarity, in
        which case ``distance=False`` is the meaningful setting - confirm.

    Returns
    -------
    list of tuple
        ``(processed_name, score, index_label)`` sorted by descending score.
    """
    # The default argument is a tokenizer class while the code below needs an
    # instance; calling ``tokenize`` on the bare class fails, so build one.
    if isinstance(tokenizer, type):
        tokenizer = tokenizer()

    processed_query = processor(query)

    candidates = choices.to_frame('name')
    candidates['name'] = candidates['name'].apply(lambda x: processor(x))
    the_scorer = scorer()
    set1 = tokenizer.tokenize(processed_query)
    print(set1)  # shows the query tokens used for matching

    candidates['distance'] = candidates['name'].apply(
        lambda x: the_scorer.get_sim_score(set1, tokenizer.tokenize(str(x)))
    )

    if distance:
        def to_score(row):
            # Both strings empty after processing: treat as identical rather
            # than normalising by a zero length.
            if not processed_query and not row['name']:
                return 100
            # Normalise by the processed query, which is what was compared.
            return calculate_score(row['distance'], processed_query, row['name'])

        candidates['score'] = candidates.apply(to_score, axis=1)
    else:
        candidates['score'] = candidates['distance'].apply(lambda x: round(x * 100))

    top = candidates.sort_values(by=['score'], ascending=False).iloc[:limit]
    return [(row['name'], row['score'], key) for key, row in top.iterrows()]

In [None]:
# Top fifteen matches for 'metoprolol' using Tversky index over padded 2-grams.
set_extract(
    'metoprolol',
    s,
    scorer=sm.TverskyIndex,
    tokenizer=sm.QgramTokenizer(qval=2, prefix_pad='^', suffix_pad='!'),
    limit=15,
    distance=False,
)
#set_extract('nizatidine hydrochloride', s, scorer=sm.Cosine, tokenizer=sm.WhitespaceTokenizer(), limit=5, distance=False)

In [None]:
# Monge-Elkan raw score, with Jaro-Winkler as the inner measure, between the
# 2-gram tokens of two strings.
scorer = sm.JaroWinkler()
hybrid = sm.MongeElkan(sim_func=scorer.get_sim_score)
ws = sm.WhitespaceTokenizer()
qgram = sm.QgramTokenizer(qval=2, prefix_pad='^', suffix_pad='!')
hybrid.get_raw_score(
    qgram.tokenize('metoprolol tartarte'),
    qgram.tokenize('metoprolol succinate'),
)

In [None]:
def hybrid_extract(query, choices, processor=utils.full_process, scorer=sm.JaroWinkler, tokenizer=sm.WhitespaceTokenizer, hybrid=sm.MongeElkan, limit=2, distance=True, threshold=None):
    """Rank ``choices`` against ``query`` using a hybrid similarity measure.

    A hybrid measure is built around ``scorer().get_sim_score``; the query and
    every choice are processed, tokenized and compared with the hybrid's
    ``get_raw_score``.

    Parameters
    ----------
    query : str
        Text to look up.
    choices : pandas.Series
        Candidates; must support ``to_frame``. Index labels are returned
        with every match.
    processor : callable
        Applied to the query and to every choice before tokenizing.
    scorer : class or zero-argument factory
        Called once with no arguments; its ``get_sim_score`` is handed to the
        hybrid measure as ``sim_func``.
    tokenizer : tokenizer instance or class
        Object exposing ``tokenize(str)``. A class (such as the default) is
        instantiated with no arguments.
    hybrid : class
        Called as ``hybrid(sim_func=...)``, plus ``threshold=...`` when a
        threshold is supplied.
    limit : int
        Number of top-scoring rows to return.
    distance : bool
        If true, the hybrid output is converted with ``calculate_score``;
        otherwise it is multiplied by 100 and rounded.
    threshold : number or None
        Forwarded to ``hybrid`` whenever it is not ``None`` (including 0).

    Returns
    -------
    list of tuple
        ``(processed_name, score, index_label)`` sorted by descending score.
    """
    # The default argument is a tokenizer class while the code below needs an
    # instance; calling ``tokenize`` on the bare class fails, so build one.
    if isinstance(tokenizer, type):
        tokenizer = tokenizer()

    processed_query = processor(query)

    candidates = choices.to_frame('name')
    candidates['name'] = candidates['name'].apply(lambda x: processor(x))
    the_scorer = scorer()

    # Compare against None so an explicit threshold of 0 is still forwarded
    # instead of silently falling back to the hybrid's own default.
    if threshold is not None:
        the_hybrid = hybrid(sim_func=the_scorer.get_sim_score, threshold=threshold)
    else:
        the_hybrid = hybrid(sim_func=the_scorer.get_sim_score)
    set1 = tokenizer.tokenize(processed_query)
    print(set1)  # shows the query tokens used for matching

    candidates['distance'] = candidates['name'].apply(
        lambda x: the_hybrid.get_raw_score(set1, tokenizer.tokenize(str(x)))
    )

    if distance:
        def to_score(row):
            # Both strings empty after processing: treat as identical rather
            # than normalising by a zero length.
            if not processed_query and not row['name']:
                return 100
            # Normalise by the processed query, which is what was compared.
            return calculate_score(row['distance'], processed_query, row['name'])

        candidates['score'] = candidates.apply(to_score, axis=1)
    else:
        candidates['score'] = candidates['distance'].apply(lambda x: round(x * 100))

    top = candidates.sort_values(by=['score'], ascending=False).iloc[:limit]
    return [(row['name'], row['score'], key) for key, row in top.iterrows()]

In [None]:
# Earlier experiments, kept for reference:
#hybrid_extract('Procaine', s, scorer=sm.HammingDistance, tokenizer=sm.QgramTokenizer(qval=4, prefix_pad='^', suffix_pad='!'), hybrid=sm.MongeElkan, limit=15, distance=False)
#hybrid_extract('oxyprocaine', s, scorer=sm.HammingDistance, tokenizer=sm.WhitespaceTokenizer(), hybrid=sm.MongeElkan, limit=15, distance=False)
# Monge-Elkan over Editex, with tokens split on '/', '&' and spaces.
hybrid_extract(
    'Emtricitabine',
    s,
    scorer=sm.Editex,
    tokenizer=sm.DelimiterTokenizer(['/','&',' ']),
    hybrid=sm.MongeElkan,
    limit=15,
    distance=False,
)

In [None]:
#hybrid_extract('metoprolol/thiazides', s, scorer=sm.Jaro, tokenizer=sm.QgramTokenizer(qval=4, prefix_pad='^', suffix_pad='!'), hybrid=sm.SoftTfIdf, limit=15, distance=False, threshold=0.9)
# SoftTfIdf over Levenshtein with whitespace tokens and a 0.9 threshold.
hybrid_extract(
    'ferrous sulfate',
    s,
    scorer=sm.Levenshtein,
    tokenizer=sm.WhitespaceTokenizer(),
    hybrid=sm.SoftTfIdf,
    limit=15,
    distance=False,
    threshold=0.9,
)
#hybrid_extract('Emtricitabine & Tenofovir Alafenamide', s, scorer=sm.Levenshtein, tokenizer=sm.DelimiterTokenizer(['/','&',' ']), hybrid=sm.SoftTfIdf, limit=15, distance=False, threshold=0.95)

In [None]:
# Sanity check: jellyfish's levenshtein_distance on a small example pair.
score = jf.levenshtein_distance('Dave','David')
score