In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import numpy as np

In [10]:
admissions = pd.read_csv('admissions.csv')
admissions = admissions[['HADM_ID','DIAGNOSIS']]

(58976, 2)

In [12]:
# check the number of rows with missing diagnoses:
admissions = admissions[admissions['DIAGNOSIS'].notnull()]
uniq_diag = admissions['DIAGNOSIS'].unique()    # 15691

Number of unique diagnoses:  15691


In [24]:
uniq_diag

array(['BENZODIAZEPINE OVERDOSE',
       'CORONARY ARTERY DISEASE\\CORONARY ARTERY BYPASS GRAFT/SDA',
       'BRAIN MASS', ..., 'R/O METASTASIS',
       'GASTROPARESIS\\PLACEMENT OF G-TUBE **REMOTE WEST**/SDA',
       'JOINT EFFUSION'], dtype=object)

### Application of the Topic Modelling Sub-Categories for Admission Diagnoses

In [26]:
vectorizer = TfidfVectorizer(stop_words='english')
uniq_diag_vec = vectorizer.fit_transform(uniq_diag)

# LDA
lda_adm = LatentDirichletAllocation(n_components=10, random_state=42)
lda_fit_adm = lda_adm.fit_transform(uniq_diag_vec)

In [28]:
feature_names = vectorizer.get_feature_names_out()
n_top_words = 30
for topic_idx, topic in enumerate(lda_adm.components_):
    top_words_idx = topic.argsort()[::-1][:n_top_words] # get the index, sort by highest of the top words for each topic
    top_words = [feature_names[i] for i in top_words_idx] # map the index to the word
    print(f"Topic {topic_idx}:", top_words)

Topic 0: ['cancer', 'fracture', 'lung', 'sda', 'obstruction', 'ca', 'hip', 'bowel', 'fx', 'metastatic', 'breast', 'spine', 'right', 'weakness', 'left', 'femur', 'hepatitis', 'small', 'rib', 'new', 'cervical', 'cell', 'fractures', 'pneumothorax', 'airway', 'mass', 'urosepsis', 'melanoma', 'skull', 'fall']
Topic 1: ['aortic', 'valve', 'artery', 'sda', 'replacement', 'coronary', 'mitral', 'stenosis', 'graft', 'bypass', 'disease', 'aneurysm', 'avr', 'ascending', 'procedure', 'pulmonary', 'regurgitation', 'atrial', 'aorta', 'repair', 'mvr', 'maze', 'redo', 'thoracic', 'mr', 'invasive', 'insufficiency', 'bental', 'minimally', 'severe']
Topic 2: ['sepsis', 'hypotension', 'fever', 'hematoma', 'gastrointestinal', 'bleed', 'pneumonia', 'subdural', 'telemetry', 'ketoacidosis', 'diabetic', 'anemia', 'overdose', 'shock', 'seizures', 'diarrhea', 'cva', 'septic', 'afib', 'pulmonary', 'hemoptysis', 'hypothermia', 'endocarditis', 'cholecystitis', 'epidural', 'sob', 'nstemi', 'embolis', 'thrive', 'arf']

## Topic Modelling on Procedures Pipeline

In [57]:
# hyperparameter tuning
n_groups = 20

In [131]:
proceds = pd.read_csv('PROCEDURES_ICD.csv')
d_proceds = pd.read_csv('D_ICD_PROCEDURES.csv')

def combine_procedures(d_icd_procedures):
    # Combine the descriptions for the same ICD code
    d_icd_procedures['SHORT_TITLE'] = d_icd_procedures.groupby('ICD9_CODE')['SHORT_TITLE'].transform(lambda x: ' | '.join(x))
    d_icd_procedures['LONG_TITLE'] = d_icd_procedures.groupby('ICD9_CODE')['LONG_TITLE'].transform(lambda x: ' | '.join(x))

    # Drop duplicated rows based on ICD9_CODE, keeping the first occurrence
    procedure_icd_cleaned = d_icd_procedures.drop_duplicates(subset='ICD9_CODE', keep='first')

    return procedure_icd_cleaned

def merge_procedure_names(procedures_icd, procedure_icd_cleaned):
    # Copy the DataFrame to avoid modifying the original data
    procedure_icd_cleaned = procedure_icd_cleaned.copy()

    # Convert the ICD9_CODE columns to string data type
    procedure_icd_cleaned['ICD9_CODE'] = procedure_icd_cleaned['ICD9_CODE'].astype(str)
    procedures_icd['ICD9_CODE'] = procedures_icd['ICD9_CODE'].astype(str)

    # Merge procedure with procedure_icd using a left merge
    procedure_merged = pd.merge(procedures_icd, procedure_icd_cleaned, on='ICD9_CODE', how='left')

    return procedure_merged

In [175]:
cleaned_proced_icd = combine_procedures(d_proceds)
proced_icd = merge_procedure_names(proceds, cleaned_proced_icd)
proced_icd

Unnamed: 0,ROW_ID_x,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,ROW_ID_y,SHORT_TITLE,LONG_TITLE
0,944,62641,154460,3,3404,496.0,Insert intercostal cath,Insertion of intercostal catheter for drainage
1,945,2592,130856,1,9671,3678.0,Cont inv mec ven <96 hrs,Continuous invasive mechanical ventilation for...
2,946,2592,130856,2,3893,801.0,Venous cath NEC,"Venous catheterization, not elsewhere classified"
3,947,55357,119355,1,9672,3679.0,Cont inv mec ven 96+ hrs,Continuous invasive mechanical ventilation for...
4,948,55357,119355,2,331,187.0,Spinal tap | Incision of lung | Spinal tap | I...,Spinal tap | Incision of lung | Spinal tap | I...
...,...,...,...,...,...,...,...,...
240090,228330,67415,150871,5,3736,1300.0,"Exc,destrct,exclus LAA","Excision, destruction, or exclusion of left at..."
240091,228331,67415,150871,6,3893,801.0,Venous cath NEC,"Venous catheterization, not elsewhere classified"
240092,228332,67415,150871,7,8872,3233.0,Dx ultrasound-heart,Diagnostic ultrasound of heart
240093,228333,67415,150871,8,3893,801.0,Venous cath NEC,"Venous catheterization, not elsewhere classified"


In [201]:
def preprocess_procedure(procedure_merged):
    # extract rows with nans as long title, will use for later
    proced_icd_nans = procedure_merged[procedure_merged['LONG_TITLE'].isnull()]
    # drop rows with nans as long title
    proced = procedure_merged[procedure_merged['LONG_TITLE'].notnull()]
    list_procedures = proced['LONG_TITLE'].unique()

    # topic modelling for dimensionality reduction
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(list_procedures)

    # LDA
    lda = LatentDirichletAllocation(n_components=20, random_state=42)
    lda_fit = lda.fit_transform(X)  
    
    topic_list = []
    for i in range(len(lda_fit)):
        topic_list.append(np.argmax(lda_fit[i]))

    # map the topic to the procedure
    topic_proced = pd.DataFrame({'topic':topic_list, 'procedure':list_procedures})
    proced['topic'] = proced['LONG_TITLE'].map(topic_proced.set_index('procedure')['topic'])
    
    # reduce the df down to only the columns we need
    proced_icd_topic = proced[['HADM_ID','topic']]  
    proced_20 = proced_icd_topic.pivot_table(index='HADM_ID', columns='topic', aggfunc=len, fill_value=0)
    proced_20.reset_index(inplace=True)

    # reconstruct the df with the nans
    proced_icd_nans2 = proced_icd_nans[['HADM_ID']]
    proced_icd_nans2['topic'] = 20
    proced_icd_nans2 = proced_icd_nans2.pivot_table(index='HADM_ID', columns='topic', aggfunc=len, fill_value=0)
    proced_icd_nans2.reset_index(inplace=True)

    # merge the two dfs
    proced_20 = proced_20.merge(proced_icd_nans2, on='HADM_ID', how='outer')
    proced_20.fillna(0, inplace=True)
    proced_20 = proced_20.astype(int)

    # add prefix to column names
    proced_20 = proced_20.add_prefix('p_')
    proced_20.rename(columns={'p_HADM_ID':'HADM_ID'}, inplace=True)
    proced_20 = proced_20.rename_axis(None, axis=1)
    return proced_20



In [202]:
preprocess_procedure(proced_icd)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  proced['topic'] = proced['LONG_TITLE'].map(topic_proced.set_index('procedure')['topic'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  proced_icd_nans2['topic'] = 20


Unnamed: 0,HADM_ID,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,...,p_11,p_12,p_13,p_14,p_15,p_16,p_17,p_18,p_19,p_20
0,100003,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
1,100006,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,100007,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,100009,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
4,100010,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52238,152949,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
52239,165728,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
52240,185955,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
52241,189269,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [113]:
feature_names = vectorizer.get_feature_names_out()
n_top_words = 10
for topic_idx, topic in enumerate(lda.components_):
    top_words_idx = topic.argsort()[::-1][:n_top_words] # get the index, sort by highest of the top words for each topic
    top_words = [feature_names[i] for i in top_words_idx] # map the index to the word
    print(f"Topic {topic_idx}:", top_words)

Topic 0: ['therapeutic', 'ear', 'injection', 'ultrasound', 'flap', 'diagnostic', 'bile', 'vaginal', 'external', 'pedicle']
Topic 1: ['biopsy', 'lung', 'needle', 'closed', 'percutaneous', 'arteriography', 'femoral', 'thoracoscopic', 'transfusion', 'lobectomy']
Topic 2: ['eyelid', 'defect', 'ventricular', 'septal', 'atrial', 'implantation', 'lead', 'repair', 'temporary', 'neurostimulator']
Topic 3: ['dilation', 'unilateral', 'biliary', 'catheterization', 'tract', 'lymphatic', 'complete', 'ligation', 'angioplasty', 'laryngectomy']
Topic 4: ['spinal', 'coronary', 'insertion', 'using', 'contrast', 'material', 'esophagus', 'phlebography', 'repair', 'ray']
Topic 5: ['hernia', 'repair', 'soft', 'prosthesis', 'stomach', 'unspecified', 'tissue', 'inguinal', 'skull', 'graft']
Topic 6: ['device', 'removal', 'tibia', 'fibula', 'bladder', 'application', 'external', 'fixator', 'devices', 'implanted']
Topic 7: ['procedure', 'injection', 'mouth', 'nerve', 'laparoscopic', 'control', 'hemorrhage', 'gastr

In [114]:
# example of a random string, grouped to whcih topic
example_str = 'cathether for heart fistula lesions and angioplasty of coronary artery'
example_vec = vectorizer.transform([example_str])
np.argmax(lda.transform(example_vec))


4

In [116]:
# reduce the df down to only the columns we need
proced_icd_topic = proced_icd[['HADM_ID','topic']]

In [117]:
# pivot the table
proced_icd2 = proced_icd_topic.pivot_table(index='HADM_ID', columns='topic', aggfunc=len, fill_value=0, prefix='p_')
proced_icd2.reset_index(inplace=True)

In [143]:
proced_icd_nans

Unnamed: 0,ROW_ID_x,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,ROW_ID_y,SHORT_TITLE,LONG_TITLE
10,954,16994,115444,1,3601,,,
31,975,42058,118025,1,3601,,,
37,981,42058,118025,7,3601,,,
128,1072,18687,110626,1,3601,,,
141,1085,11143,101985,3,374,,,
...,...,...,...,...,...,...,...,...
161227,146987,30611,168329,1,857,,,
161734,158772,12113,197423,1,485,,,
161785,155256,30146,111849,4,537,,,
163101,151679,5950,167106,1,458,,,


In [146]:
# recombine the nans
proced_icd_nans2 = proced_icd_nans[['HADM_ID']]
proced_icd_nans2['topic'] = 20
proced_icd_nans2 = proced_icd_nans2.pivot_table(index='HADM_ID', columns='topic', aggfunc=len, fill_value=0)
proced_icd_nans2.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  proced_icd_nans2['topic'] = 20


In [147]:
proced_icd_nans2

topic,HADM_ID,20
0,100078,1
1,100156,1
2,100161,1
3,100195,2
4,100335,1
...,...,...
1548,199535,2
1549,199714,2
1550,199864,1
1551,199881,1


In [148]:
# join the two dfs
proced_icd3 = proced_icd2.merge(proced_icd_nans2, on='HADM_ID', how='outer')
proced_icd3.fillna(0, inplace=True) 
proced_icd3

topic,HADM_ID,0,1,2,3,4,5,6,7,8,...,11,12,13,14,15,16,17,18,19,20
0,100003,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,100006,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,100009,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,100010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52238,152949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
52239,165728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
52240,185955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
52241,189269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [119]:
def preprocess_procedure(procedure_merged):
    procedure_subset = procedure_merged[["HADM_ID", "SHORT_TITLE"]]
    procedure_subset = procedure_subset.drop_duplicates(subset=["HADM_ID", "SHORT_TITLE"]).copy()

    procedure_subset['present'] = 1
    wide_procedure = procedure_subset.pivot(index='HADM_ID', columns='SHORT_TITLE', values='present')
    wide_procedure = wide_procedure.fillna(0).reset_index()

    # Ensure no column named NaN in wide_procedure
    wide_procedure.columns = [col if col is not np.nan else "unexpected_nan" for col in wide_procedure.columns]

    procedure_columns = wide_procedure.columns.difference(['HADM_ID'])
    wide_procedure[procedure_columns] = wide_procedure[procedure_columns].fillna(0)

    return wide_procedure


Unnamed: 0,HADM_ID,1 int mam-cor art bypass,2 int mam-cor art bypass,AICD check,Abd artery resec w repla,Abd contr phlebogram NEC,Abd region dx proc NEC,Abd varicos v liga-strip,Abd vein resect & anast,Abd vein resect w replac,...,Vitreous substitut injec,Vulvar biopsy,Vulvar/perin repair NEC,Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis | Wedg resec entropion rep | Amputation stump revis,Whole blood transfus NEC,Wide excision of lip les,Wound catheter irrigat,Wound irrigation NEC,Wrist joint biopsy,Wrist synovectomy
0,100003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100009,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52229,199993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52230,199994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52231,199995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52232,199998,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
