In [2]:
import pandas as pd
import numpy as np
import gzip
import pickle
import os
from tqdm import tqdm

In [8]:
# Define paths
data_path = '../data/'  # Adjust this path as needed
output_file = os.path.join(data_path, 'mimic3_data.pkl')
# 1. Load necessary files
print("Loading MIMIC-III files...")

# Load NOTEEVENTS - contains clinical notes including discharge summaries
notes_file = os.path.join(data_path, 'NOTEEVENTS.csv')
print(f"Loading notes from {notes_file}")
notes = pd.read_csv(
    notes_file,
    dtype={'HADM_ID': 'str', 'SUBJECT_ID': 'str'}
)

# Load DIAGNOSES_ICD - contains ICD-9 codes for each admission
diag_file = os.path.join(data_path, 'DIAGNOSES_ICD.csv')
print(f"Loading diagnoses from {diag_file}")
diagnoses = pd.read_csv(
    diag_file,
    dtype={'HADM_ID': 'str', 'SUBJECT_ID': 'str', 'ICD9_CODE': 'str'}
)

# 2. Extract discharge summaries
print("Extracting discharge summaries...")
discharge_notes = notes[notes['CATEGORY'] == 'Discharge summary']

# Keep only the most recent discharge summary for each admission (HADM_ID)
discharge_notes = discharge_notes.sort_values(by=['CHARTDATE', 'CHARTTIME'])
discharge_notes = discharge_notes.drop_duplicates(subset=['HADM_ID'], keep='last')

print(f"Found {len(discharge_notes)} discharge summaries")

# 3. Aggregate ICD-9 codes for each admission
print("Aggregating ICD-9 codes for each admission...")
diagnoses_grouped = diagnoses.groupby('HADM_ID')['ICD9_CODE'].apply(list).reset_index()

# 4. Merge discharge summaries with diagnoses
print("Merging data...")
merged_data = pd.merge(
    discharge_notes[['HADM_ID', 'SUBJECT_ID', 'TEXT']],
    diagnoses_grouped,
    on='HADM_ID',
    how='inner'
)

print(f"Final dataset size: {len(merged_data)} admissions")

# 5. Save processed data
print(f"Saving processed data to {output_file}")
merged_data.to_pickle(output_file)

# Display sample data
print("\nSample data:")
sample = merged_data.head(1)
print(f"HADM_ID: {sample['HADM_ID'].values[0]}")
print(f"SUBJECT_ID: {sample['SUBJECT_ID'].values[0]}")
print(f"Number of ICD codes: {len(sample['ICD9_CODE'].values[0])}")
print(f"Text length: {len(sample['TEXT'].values[0])} characters")
print("\nFirst few ICD codes:")
print(sample['ICD9_CODE'].values[0][:5])
print("\nText snippet:")
print(sample['TEXT'].values[0][:200] + "...")

# 6. Summary statistics
print("\nDataset statistics:")
icd_counts = merged_data['ICD9_CODE'].apply(len)
print(f"Average number of ICD codes per admission: {icd_counts.mean():.2f}")
print(f"Min number of ICD codes: {icd_counts.min()}")
print(f"Max number of ICD codes: {icd_counts.max()}")
print(f"Text length statistics:")
text_lengths = merged_data['TEXT'].apply(len)
print(f"Average text length: {text_lengths.mean():.2f} characters")
print(f"Min text length: {text_lengths.min()} characters")
print(f"Max text length: {text_lengths.max()} characters")

Loading MIMIC-III files...
Loading notes from ../data/NOTEEVENTS.csv


  notes = pd.read_csv(


Loading diagnoses from ../data/DIAGNOSES_ICD.csv
Extracting discharge summaries...
Found 52726 discharge summaries
Aggregating ICD-9 codes for each admission...
Merging data...
Final dataset size: 52726 admissions
Saving processed data to ../data/mimic3_data.pkl

Sample data:
HADM_ID: 118464
SUBJECT_ID: 82574
Number of ICD codes: 17
Text length: 13656 characters

First few ICD codes:
['4589', '28411', '1985', '19889', '6826']

Text snippet:
Admission Date:  [**2100-6-7**]              Discharge Date:   [**2100-6-9**]

Date of Birth:  [**2044-4-23**]             Sex:   M

Service: MEDICINE

Allergies:
No Known Allergies / Adverse Drug Rea...

Dataset statistics:
Average number of ICD codes per admission: 11.74
Min number of ICD codes: 1
Max number of ICD codes: 39
Text length statistics:
Average text length: 9759.75 characters
Min text length: 54 characters
Max text length: 55728 characters
