In [1]:
import Levenshtein
import requests
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

In [2]:
def get_drugname_from_rxcui(rxcui: str, sleep_time=1) -> str:
    response: "Optional[Response]" = None
    while response is None:
        try:
            response = requests.get(f"http://localhost:4000/REST/rxcui/{rxcui}.json")
            break

        except Exception as ce:
            time.sleep(sleep_time)
            continue
            
    if response.status_code != 200:
        print(f"Something went wrong... Status code = {response.status_code}")
    j_res = response.json()
    try:
        drugname = j_res["idGroup"]["name"]
    except KeyError:
        return "<NA>"
    return drugname

In [3]:
df = pd.read_csv("../results/cleaned_query_responses_newsep" ,sep="$")

In [4]:
# rxcuis = df["rxcui"]

In [5]:
# drugname_list = [get_drugname_from_rxcui(str(int(rxcui))) if not np.isnan(rxcui) else "<NA>" for rxcui in tqdm(rxcuis, position=0, leave=True)]

In [6]:
ext_df = pd.read_csv("../results/rxcui_with_concept_and_og_data.csv")

In [7]:
# ext_df = pd.DataFrame({
#     "rxcui": rxcuis,
#     "og_score": df["score"],
#     "og_query": df["query"],
#     "rxnorm_concept": drugname_list
# })
# ext_df.to_csv("../results/rxcui_with_concept_and_og_data.csv", index=False)

In [8]:
strings_a = ext_df["og_query"].to_list()
strings_b = ext_df["rxnorm_concept"].to_list()

In [9]:
# for a, b in zip(strings_a, strings_b):
#     dist = Levenshtein.distance(a, b)

# distances = [Levenshtein.distance(a, b) for a, b in zip(strings_a, strings_b)]

In [57]:
def editsim(a: str, b: str, ignore_order=False)->int:
    """
    returns a score from 0 - 100 indicating the similarity score based on the edit distance between two strings.
    NOTE: if clause at the beginning is specific for this notebook's experiments.
    """
    try:
        if a == "" or b == "<NA>":
            return 0

        longer, shorter = a, b
        if len(a) < len(b):
            longer, shorter = b, a

        lonlen = len(longer)
        
        if ignore_order:
            longer = " ".join(sorted(longer.split(" ")))
            shorter = " ".join(sorted(shorter.split(" ")))

        return int(((lonlen - Levenshtein.distance(longer, shorter)) / lonlen) * 100)
    except TypeError as te:
        return 0

In [11]:
new_scores = [editsim(a, b) for a, b in zip(strings_a, strings_b)]

In [12]:
new_score_series = pd.Series(data=new_scores, name="new_score")

In [18]:
final_df = pd.concat([ext_df, new_score_series], axis=1)

In [19]:
final_df

Unnamed: 0,rxcui,og_score,og_query,rxnorm_concept,new_score
0,1111700.0,50,ioscan,Iosat,50
1,36676.0,67,bicarbonate de sodium,sodium bicarbonate,19
2,236486.0,100,omeprazole magnesium,omeprazole magnesium,100
3,2283546.0,42,oral 1.0 df tablet,cholecalciferol 0.095 MG / folic acid 1 MG Ora...,25
4,21116.0,50,contol,eucalyptol,40
...,...,...,...,...,...
49995,997822.0,75,vitacain,Vivacaine,66
49996,372588.0,50,levofloxacin oral 1.0 df tablet,levofloxacin Oral Tablet,70
49997,4508.0,50,nuban,flutamide,22
49998,1094549.0,80,cold + flu relief nighttime,acetaminophen 325 MG / dextromethorphan hydrob...,13


In [59]:
new_scores_order = [editsim(a, b, True) for a, b in zip(strings_a, strings_b)]

In [61]:
new_score_order_series = pd.Series(data=new_scores_order, name="new_score_ord")

In [62]:
actual_final_df = pd.concat([final_df, new_score_order_series], axis=1)

In [63]:
actual_final_df

Unnamed: 0,rxcui,og_score,og_query,rxnorm_concept,new_score,new_score_ord
0,1111700.0,50,ioscan,Iosat,50,50
1,36676.0,67,bicarbonate de sodium,sodium bicarbonate,19,85
2,236486.0,100,omeprazole magnesium,omeprazole magnesium,100,100
3,2283546.0,42,oral 1.0 df tablet,cholecalciferol 0.095 MG / folic acid 1 MG Ora...,25,24
4,21116.0,50,contol,eucalyptol,40,40
...,...,...,...,...,...,...
49995,997822.0,75,vitacain,Vivacaine,66,66
49996,372588.0,50,levofloxacin oral 1.0 df tablet,levofloxacin Oral Tablet,70,29
49997,4508.0,50,nuban,flutamide,22,22
49998,1094549.0,80,cold + flu relief nighttime,acetaminophen 325 MG / dextromethorphan hydrob...,13,13


In [64]:
# actual_final_df.to_csv("../results/new_editsim_scores_incl_ord.csv", index=False)