In [1]:
import pandas as pd
import os
import re
import csv

In [2]:
# Directory the notebook reads its input files from; "~" is expanded to the
# current user's home directory.
DATA_PATH = os.path.expanduser("~/data1/multMyeloma/data/")
# File name (inside DATA_PATH) of the EHR table that is loaded into MMehr_data.
EHR_FILENAME = "MMehr_data.csv"
# File name (inside DATA_PATH) of the icd10 -> icd9 conversion table that is
# loaded into icd10TOicd9.
ICDMAP_FILENAME = "icd10cmtoicd9gem.csv"

In [4]:
##Import ehr data of MM patients and conversion table from icd10 to icd9
# The code columns are forced to str. Left to dtype inference, all-digit codes
# can be parsed as numbers, which drops leading zeros and breaks the string
# handling (split, concatenation, dictionary lookups) done in later cells.
MMehr_data = pd.read_csv(os.path.join(DATA_PATH, EHR_FILENAME),
                         sep=",",
                         header=0,
                         index_col=False,
                         dtype={"CODE": str, "CODE_LABEL": str})

icd10TOicd9 = pd.read_csv(os.path.join(DATA_PATH, ICDMAP_FILENAME),
                          sep=",",
                          header=0,
                          index_col=False,
                          dtype={"icd10": str, "icd9": str})

In [5]:
##create a dictionary {'icd10_code':'icd9_code'}
# The two columns are paired row by row (positionally) rather than fetching each
# icd9 value with a label lookup: the result is the same on the default 0..n-1
# index, but it no longer depends on that index and avoids one Series lookup
# per row. When an icd10 code appears on several rows, the last row wins.
icd10icd9_dict = dict(zip(icd10TOicd9["icd10"], icd10TOicd9["icd9"]))

In [9]:
##map icd10 to icd9 when available
def _format_icd9(raw_code):
    """Insert the decimal point into an undotted icd9 code.

    E-codes keep four characters before the point (E8889 -> E888.9); every
    other code keeps three (20300 -> 203.00, V4589 -> V45.89). Codes that are
    too short to have a decimal part are returned unchanged.
    """
    raw_code = str(raw_code)
    split_at = 4 if raw_code[:1].upper() == "E" else 3
    if len(raw_code) <= split_at:
        return raw_code
    return raw_code[:split_at] + "." + raw_code[split_at:]


code_list = MMehr_data["CODE"].tolist()
label_list = MMehr_data["CODE_LABEL"].tolist()

# NOTE(review): CMS GEM files use the placeholder "NoDx" for "no icd9
# equivalent"; if this conversion table contains it, it is treated as a real
# code below -- confirm against the file.
for i, (med, label) in enumerate(zip(code_list, label_list)):
    # a missing code comes out of pandas as float NaN, which cannot be split
    if label != "icd10" or not isinstance(med, str):
        continue
    # the conversion table is keyed by icd10 codes without the decimal point
    trad = icd10icd9_dict.get(med.replace(".", ""))
    if trad is None or pd.isna(trad):
        # no usable icd9 equivalent: the record keeps its icd10 code and label
        continue
    code_list[i] = _format_icd9(trad)
    label_list[i] = 'icd9'

In [10]:
##add LABEL:CODE column and the modified columns with CODE and CODE_LABEL
# "<label>:<code>" for every record, built from the two lists in parallel
label_code_pairs = [label + ':' + code for label, code in zip(label_list, code_list)]

# each list is wrapped in a Series, so the assignment is aligned on the frame's index
for column, values in (("LABEL:CODE", label_code_pairs),
                       ("CODE", code_list),
                       ("CODE_LABEL", label_list)):
    MMehr_data[column] = pd.Series(values)

In [11]:
##eliminated those records in which the diagnosis could not be mapped to icd9
# idx marks the records that are still labelled 'icd10' after the mapping step.
idx = MMehr_data["CODE_LABEL"] == 'icd10'
# `~` is the boolean NOT for a mask; unary `-` on a boolean Series raises a
# TypeError on current numpy/pandas. `.copy()` makes the result an independent
# frame, so adding columns to it later does not act on a slice of the original.
MMehr_data = MMehr_data[~idx].copy()

In [12]:
##read annotation files and create a dictionary of dictionaries (e.g. {'cpt':{'code':[CUI, ONTOLOGY_ID, LABEL], ...}, ...}
ann_l = {}
for root, _, files in os.walk(DATA_PATH):
    for f in files:
        # the outer key is the text between the first '-' and the next '-' or '.'
        # of the file name (annotation-cpt.csv -> 'cpt'); names starting with
        # 'annotation' but containing no '-' cannot provide one and are skipped
        if not re.match('annotation', f) or '-' not in f:
            continue
        # join with the directory currently being walked: os.walk also descends
        # into sub-directories, where DATA_PATH + file name would not exist
        with open(os.path.join(root, f), newline='') as csvfile:
            rows = csv.reader(csvfile, delimiter=',', quotechar='|')
            # first field is the code, the remaining fields are stored as its
            # value; blank lines produce empty rows and are ignored
            d = {row[0]: row[1:] for row in rows if row}
        ann_l[f.split('-')[1].split('.')[0]] = d

In [42]:
#map the codes to the normalized labels and add a new column to the dataframe
rxnorm = []
for lab_code in MMehr_data["LABEL:CODE"]:
    # a missing value (float NaN) has no label/code to look up
    if not isinstance(lab_code, str):
        rxnorm.append('NA')
        continue
    # split on the first ':' only, so a code that itself contains ':' stays whole
    label, _, code = lab_code.partition(":")
    # a label with no annotation dictionary, a code absent from it, or an
    # annotation row with no fields after the code all fall back to 'NA';
    # otherwise the last annotation field is kept
    fields = ann_l.get(label, {}).get(code)
    rxnorm.append(fields[-1] if fields else 'NA')

MMehr_data["RXNORM"] = rxnorm