In [1]:
import os
import sys
import csv
import tqdm
import pandas as pd
from itertools import permutations
import string

# from fuzzyset import FuzzySet
from cfuzzyset import cFuzzySet as FuzzySet

from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
# this is the deep cadrme output for 199 drugs (199 instead of 200 because we're missing 1 boundary file oops)
# I dropped this file into the dropbox
Deepcadrme_guess_terms = pd.read_csv("../data/deepcadrme_guess_terms.csv")

# turn all terms to lowercase
Deepcadrme_guess_terms['term'] = Deepcadrme_guess_terms['term'].str.lower()

In [4]:
pd.options.display.max_rows = 4000
Deepcadrme_guess_terms.head(500)

Unnamed: 0,file,term,start,len
0,ACTEMRA.xml,upper respiratory tract infections,410,34
1,ACTEMRA.xml,nasopharyngitis,446,15
2,ACTEMRA.xml,headache,463,8
3,ACTEMRA.xml,hypertension,473,12
4,ACTEMRA.xml,increased alt,487,13
5,ACTEMRA.xml,injection site reactions,502,24
6,ACTEMRA.xml,infections,1701,10
7,ACTEMRA.xml,upper respiratory tract infections,1948,34
8,ACTEMRA.xml,nasopharyngitis,1984,15
9,ACTEMRA.xml,headache,2001,8


In [5]:
# Load all the strings from meddra 23.1 and map them to the preferred term id
fh = open('../data/meddra_pt_llt_map_omop_v23.1.csv')
reader = csv.reader(fh)
header = next(reader)

meddra_strings = dict()
meddra_ids = dict()
meddra_sorted_fuzzyset = FuzzySet()

for pt_concept_id, pt_concept_name, pt_meddra_id, llt_concept_id, llt_concept_name, llt_meddra_id in reader:
        
    # PT
    sorted_pt_concept_name = ' '.join(sorted(pt_concept_name.lower().split()))
    meddra_sorted_fuzzyset.add(sorted_pt_concept_name)
    meddra_strings[sorted_pt_concept_name] = pt_meddra_id
    meddra_ids[pt_meddra_id] = sorted_pt_concept_name
    
    # LLT
    sorted_llt_concept_name = ' '.join(sorted(llt_concept_name.lower().split()))
    meddra_sorted_fuzzyset.add(sorted_llt_concept_name)
    meddra_strings[sorted_llt_concept_name] = pt_meddra_id
    meddra_ids[pt_meddra_id] = sorted_pt_concept_name

fh.close()

len(meddra_strings)    

59760

In [6]:
# Idenify exact matches 
Deepcadrme_guess_terms["match_method"] = "exact" 

# just match stuff in the dictionary
Deepcadrme_guess_terms['proposed_meddra_term'] = Deepcadrme_guess_terms['term'].map(meddra_strings)

for row in range(len(Deepcadrme_guess_terms)):
    term = Deepcadrme_guess_terms.term[row]
    if term in meddra_strings:
        # exact match found
        continue
    if pd.isna(term):
        # nan
        continue
    if term == '.': # i don't know why, some terms are just periods--maybe we need preprocessing before
        # nan
        continue
    Deepcadrme_guess_terms.loc[row, 'match_method'] = "fuzzy"



In [7]:
# these are the terms to match
Deepcadrme_guess_terms.term[Deepcadrme_guess_terms["match_method"] == "fuzzy"]

# these are the total number of terms
Deepcadrme_guess_terms.term

len(Deepcadrme_guess_terms.term[Deepcadrme_guess_terms["match_method"] == "fuzzy"]), len(Deepcadrme_guess_terms.term)

(12830, 27131)

In [8]:
exact_matches = len(Deepcadrme_guess_terms.term)-len(Deepcadrme_guess_terms.term[Deepcadrme_guess_terms["match_method"] == "fuzzy"])
exact_matches

14301

In [9]:
## a note here is that there are a few that are nans and punucation marks 
# that I took out, which is why 12830 + 14301 =/= 27131 exactly
# might need to do some preprocessing the next time we do this to prevent

## FuzzySet

In [10]:
match_scores = list()
match_labels = list()

# we iterate over the rows again
for row in tqdm.tqdm(range(len(Deepcadrme_guess_terms))):
    
    if Deepcadrme_guess_terms.match_method[row] == "exact":
        continue
    
    a = Deepcadrme_guess_terms.term[row]
    
    sorted_a = ' '.join(sorted(a.split()))
    
    fuzzyset_match = meddra_sorted_fuzzyset.get(sorted_a)
    
    match_scores.append(fuzzyset_match[0][0])
           
    pred_meddra_pt_id = meddra_strings[sorted(fuzzyset_match)[0][1]]
        
    if fuzzyset_match[0][0] >= 0.88: 
        Deepcadrme_guess_terms.loc[row, 'proposed_meddra_term'] = pred_meddra_pt_id

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 27131/27131 [03:02<00:00, 148.54it/s]


In [11]:
# some post processing of the resulting df 

# get rid of anything that isn't matched since we set the threshold to 0.88
Deepcadrme_guess_terms = Deepcadrme_guess_terms.dropna() 
Deepcadrme_guess_terms.reset_index(drop=True)

Unnamed: 0,file,term,start,len,match_method,proposed_meddra_term
0,ACTEMRA.xml,upper respiratory tract infections,410,34,fuzzy,10046306
1,ACTEMRA.xml,nasopharyngitis,446,15,exact,10028810
2,ACTEMRA.xml,headache,463,8,exact,10019211
3,ACTEMRA.xml,hypertension,473,12,exact,10020772
4,ACTEMRA.xml,increased alt,487,13,fuzzy,10001551
...,...,...,...,...,...,...
19764,ZYKADIA.xml,bradycardia,7104,11,exact,10006093
19765,ZYKADIA.xml,pancreatitis,8326,12,exact,10033645
19766,ZYKADIA.xml,alt increased,56695674,39,exact,10001551
19767,ZYKADIA.xml,ast increased,57505755,39,exact,10003481


In [16]:
# just for completeness and for sanity checks, add new column wtih string corresponding to the meddra term
# fill in the meddra term (the text of it) in the next column

# meddra_ids = {y: x for x, y in meddra_strings.items()}

Deepcadrme_guess_terms['proposed_meddra_strings'] = Deepcadrme_guess_terms['proposed_meddra_term'].map(meddra_ids)
Deepcadrme_guess_terms.tail(200)

Unnamed: 0,file,term,start,len,match_method,proposed_meddra_term,proposed_meddra_strings
26853,ZYDELIG.xml,pneumonia,7626,9,exact,10035664,pneumonia
26855,ZYDELIG.xml,diarrhea,8245,8,exact,10012735,diarrhoea
26856,ZYDELIG.xml,nausea,8354,6,exact,10028813,nausea
26857,ZYDELIG.xml,abdominal pain,8463,14,exact,10000081,abdominal pain
26858,ZYDELIG.xml,vomiting,8572,8,exact,10047700,vomiting
26859,ZYDELIG.xml,fatigue,8738,7,exact,10016256,fatigue
26860,ZYDELIG.xml,pyrexia,8847,7,exact,10037660,pyrexia
26861,ZYDELIG.xml,asthenia,8956,8,exact,10003549,asthenia
26862,ZYDELIG.xml,peripheral edema,9065,16,fuzzy,10030124,oedema peripheral
26863,ZYDELIG.xml,upper respiratory tract infection,9229,33,fuzzy,10046306,infection respiratory tract upper


In [13]:
# and save
Deepcadrme_guess_terms.to_csv("../data/deepcadrme_guess_terms_meddramatch.csv")