In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request
from tqdm import tqdm
import os 
import sys

!pip install mlflow
# Put the parent of the current working directory on the import path
# (presumably so that the `src` package imported right after resolves).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)


from src.features.preprocessing import ICD9DataPreprocessor

In [None]:
# Read the tab-separated KnowLife dump and show which relation types it contains.
knowlife_df = pd.read_csv('../data/knowlife_dump.tsv', delimiter='\t')
print(set(knowlife_df['relation']))

# Keep only the rows whose relation is exactly 'causes', renumbered from 0.
is_causes_relation = knowlife_df['relation'] == 'causes'
knowlife_df = knowlife_df.loc[is_causes_relation].reset_index(drop=True)
knowlife_df.head()

In [None]:
def convert_to_icd9(dxStr):
    """Insert the '.' separator into a dot-less code string.

    Codes starting with 'E' are split after their fourth character, all
    other codes after their third. A code that is not longer than its split
    position is returned unchanged.

    Args:
        dxStr: Code string without a '.' separator.

    Returns:
        The code with a '.' inserted, or ``dxStr`` itself when it is too
        short to need one.
    """
    split_at = 4 if dxStr.startswith('E') else 3
    if len(dxStr) <= split_at:
        return dxStr
    return dxStr[:split_at] + '.' + dxStr[split_at:]


def normalise_string(dstr):
    """Lower-case a value's string form and strip one trailing 's' per word.

    The value is converted with ``str`` first, so non-string inputs are
    accepted (a float NaN becomes the text 'nan'). Words are delimited by
    single spaces; a word loses its final character when it is longer than
    one character and ends in 's'.

    Args:
        dstr: Any value; its ``str`` representation is normalised.

    Returns:
        The normalised, space-joined string.
    """
    singular_words = []
    for word in str(dstr).lower().split(' '):
        if len(word) > 1 and word.endswith('s'):
            word = word[:-1]
        singular_words.append(word)
    return ' '.join(singular_words)

In [None]:
# Diagnosis descriptions: keep the code and both titles, then add a dotted
# `code` column derived from `icd9_code`.
description_df = pd.read_csv('../data/D_ICD_DIAGNOSES.csv')[['icd9_code', 'short_title', 'long_title']]
description_df['code'] = description_df['icd9_code'].map(convert_to_icd9)

# Hierarchy file: unique (name, code) pairs of the child entries.
icd9_df = (
    pd.read_csv('../data/icd9.csv')[['child_name', 'child_code']]
    .drop_duplicates()
    .rename(columns={'child_code': 'code', 'child_name': 'name'})
)

# Outer join so codes present in only one of the two sources are kept.
icd_df = description_df.merge(icd9_df, on='code', how='outer')
for title_column in ('short_title', 'long_title', 'name'):
    icd_df[title_column] = icd_df[title_column].apply(normalise_string)

icd_entities = icd_df[['name', 'code']]
icd_entities.head()

In [None]:
# Collect every entity that appears on either side of a relation into one
# (pattern_id, phrase) table.
#
# `DataFrame.append` was removed in pandas 2.0, so the two sides are stacked
# with `pd.concat` instead.
left_entities = pd.DataFrame({
    'pattern_id': knowlife_df['leftpatternentity'].tolist(),
    'phrase': knowlife_df['leftpatternphrase'].apply(normalise_string).tolist(),
})
right_entities = pd.DataFrame({
    'pattern_id': knowlife_df['rightpatternentity'].tolist(),
    'phrase': knowlife_df['rightpatternphrase'].apply(normalise_string).tolist(),
})
knowlife_entities = pd.concat([left_entities, right_entities], ignore_index=True)

# NOTE(review): normalise_string stringifies its input, so a missing phrase
# is already the text 'nan' here; dropna() therefore only removes rows with
# a missing pattern_id.
knowlife_entities = (
    knowlife_entities
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
knowlife_entities

In [None]:
# Distinct identifiers on each side of the mapping, with their counts
# (KnowLife pattern ids first, then ICD codes).
knowlife_ids = set(knowlife_entities['pattern_id'])
icd_ids = set(icd_entities['code'])

print(len(knowlife_ids))
print(len(icd_ids))

In [None]:
# Left join: every ICD entity is kept; `pattern_id` is NaN where no KnowLife
# phrase equals the normalised ICD name.
merged = icd_entities.merge(knowlife_entities, left_on='name', right_on='phrase', how='left')
row_count = len(merged)

# The printed values are fractions of the merged rows (0..1), despite the '%' label.
unmatched = merged['pattern_id'].isna()
print('% non-matched codes', len(merged[unmatched]) / row_count)

unmatched_not_other = unmatched & ~merged['name'].str.contains('other')
print('% non-matched codes that are not other', len(merged[unmatched_not_other]) / row_count)

unmatched_specific = unmatched_not_other & ~merged['name'].str.contains('unspecified')
print('% non-matched codes that are not other or unspecified', len(merged[unmatched_specific]) / row_count)

merged[unmatched_specific]

In [None]:
# Keep only rows without any missing value (i.e. rows that found a KnowLife
# match) and reduce them to unique (code, pattern_id, name) triples.
icd9_knowlife_mapping_df = (
    merged
    .dropna()
    .loc[:, ['code', 'pattern_id', 'name']]
    .drop_duplicates()
    .reset_index(drop=True)
)
icd9_knowlife_mapping_df

In [None]:
full_icd9_df = pd.read_csv('../data/icd9.csv')

# Keep only child codes that contain neither '-' nor '.' (presumably the
# three-digit category codes, going by the variable name). Literal matching
# (regex=False) replaces the former '\.' pattern, which was an invalid escape
# sequence in a non-raw string literal.
icd9_three_digits = full_icd9_df[~full_icd9_df['child_code'].str.contains('-', regex=False)]
icd9_three_digits = icd9_three_digits[~icd9_three_digits['child_code'].str.contains('.', regex=False)]

# Index the matched pairs once (code -> set of pattern ids) instead of
# rebuilding a list and re-filtering the frame for every hierarchy row.
patterns_by_code = {}
for matched_code, pattern_id in zip(icd9_knowlife_mapping_df['code'],
                                    icd9_knowlife_mapping_df['pattern_id']):
    patterns_by_code.setdefault(matched_code, set()).add(pattern_id)

# Every code that occurs as a parent or a child gets an entry; codes without
# a direct match map to an empty set. Each entry is a fresh set so later
# in-place updates cannot leak between codes.
icd9_knowlife_mapping = {}
for parent_code, child_code in zip(full_icd9_df['parent_code'], full_icd9_df['child_code']):
    icd9_knowlife_mapping[parent_code] = set(patterns_by_code.get(parent_code, ()))
    icd9_knowlife_mapping[child_code] = set(patterns_by_code.get(child_code, ()))

In [None]:
def find_all_parents(code, full_icd9_df):
    """Return all direct and transitive parents of ``code``.

    The hierarchy is walked iteratively and every code is expanded at most
    once, so the walk terminates even if the table contains a cycle (the
    earlier recursive version only guarded against a code being its own
    parent) and shared ancestors are not traversed repeatedly.

    Args:
        code: Code whose ancestors are wanted.
        full_icd9_df: DataFrame with ``parent_code`` and ``child_code``
            columns, one row per parent/child link.

    Returns:
        Set of all ancestor codes. ``code`` itself is included only if it is
        reachable as a parent (e.g. a row that lists it as its own parent).
    """
    ancestors = set()
    visited = {code}
    pending = [code]
    while pending:
        current = pending.pop()
        direct_parents = set(
            full_icd9_df[full_icd9_df['child_code'] == current]['parent_code']
        )
        ancestors.update(direct_parents)
        for parent in direct_parents:
            if parent not in visited:
                visited.add(parent)
                pending.append(parent)
    return ancestors

def find_all_children(code, full_icd9_df):
    """Return all direct and transitive children of ``code``.

    The hierarchy is walked iteratively and every code is expanded at most
    once, so the walk terminates even if the table contains a cycle (the
    earlier recursive version only guarded against a code being its own
    child) and shared descendants are not traversed repeatedly.

    Args:
        code: Code whose descendants are wanted.
        full_icd9_df: DataFrame with ``parent_code`` and ``child_code``
            columns, one row per parent/child link.

    Returns:
        Set of all descendant codes. ``code`` itself is included only if it
        is reachable as a child (e.g. a row that lists it as its own child).
    """
    descendants = set()
    visited = {code}
    pending = [code]
    while pending:
        current = pending.pop()
        direct_children = set(
            full_icd9_df[full_icd9_df['parent_code'] == current]['child_code']
        )
        descendants.update(direct_children)
        for child in direct_children:
            if child not in visited:
                visited.add(child)
                pending.append(child)
    return descendants

# For every code in icd9_three_digits, pool its own KnowLife matches with
# those of all its ancestors and descendants in the hierarchy.
icd9_knowlife_mapping_ex = {}
for code in set(icd9_three_digits['child_code']):
    # Start from a copy of the code's own matches so the base mapping stays untouched.
    code_patterns = icd9_knowlife_mapping_ex.setdefault(code, set(icd9_knowlife_mapping[code]))

    relatives = find_all_parents(code, full_icd9_df) | find_all_children(code, full_icd9_df)
    for relative in relatives:
        code_patterns |= icd9_knowlife_mapping[relative]

In [None]:
# One row per code: its pooled set of KnowLife pattern ids and the set's size.
matching_df = pd.DataFrame({
    'code': list(icd9_knowlife_mapping_ex.keys()),
    'knowlife': list(icd9_knowlife_mapping_ex.values()),
    'matches': [len(pattern_ids) for pattern_ids in icd9_knowlife_mapping_ex.values()],
})

# Total codes, codes without any match, codes with more than one match.
match_counts = matching_df['matches']
print(len(matching_df))
print(len(matching_df[match_counts.eq(0)]))
print(len(matching_df[match_counts.gt(1)]))

In [None]:
# Show the last few codes that ended up with no KnowLife match at all.
matching_df.loc[matching_df['matches'].eq(0)].tail()