# MIMIC Data Processing, Split and JSONL Conversion

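This notebook pairs MIMIC discharge summaries with the ICD-10 diagnosis codes recorded for the same hospital admission (`hadm_id`), splits the resulting records into training and testing sets, and writes each set as a chat-format JSONL file for model training.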

In [1]:
# Import required libraries
import pandas as pd
import json
import random
from sklearn.model_selection import train_test_split
from datetime import datetime

## 1. Load and Merge MIMIC Data

In [2]:
# Read the discharge summaries CSV into a DataFrame
discharge_file_path = '../discharge.csv'  # Replace with your actual file path
discharge_data = pd.read_csv(filepath_or_buffer=discharge_file_path)

In [3]:
# Read the encounter -> HADM mapping CSV into a DataFrame
encounter_to_hadm_path = '../encounter_to_hadm.csv'  # Replace with your actual file path
encounter_to_hadm = pd.read_csv(filepath_or_buffer=encounter_to_hadm_path)

In [4]:
# Keep only discharge rows whose hadm_id appears in the mapping's 'value' column (inner join)
matched_data = discharge_data.merge(
    encounter_to_hadm,
    how='inner',
    left_on='hadm_id',
    right_on='value',
)

In [5]:
# Collect the hadm_id of every matched discharge row as a plain Python list
hadm_id_column = matched_data.loc[:, 'hadm_id']
hadm_id_list = hadm_id_column.tolist()

## 2. Load and Process ICD Diagnosis Codes

In [6]:
# Read the ICD diagnoses dictionary CSV into a DataFrame
icd_diagnoses_file_path = '../d_icd_diagnoses.csv'  # Replace with your actual file path
icd_diagnoses_data = pd.read_csv(filepath_or_buffer=icd_diagnoses_file_path)

In [7]:
# Read the per-admission diagnoses CSV into a DataFrame
diagnoses_file_path = '../diagnoses_icd.csv'  # Replace with your actual file path
diagnoses_data = pd.read_csv(filepath_or_buffer=diagnoses_file_path)

In [8]:
# Left-join the ICD dictionary onto the diagnoses so every diagnosis row is kept,
# whether or not the dictionary has an entry for its icd_code
diagnoses_data = diagnoses_data.merge(
    icd_diagnoses_data,
    how='left',
    on='icd_code',
)

In [9]:
# Filter ICD-10 diagnoses only.
# The preceding merge joins on icd_code alone, so a code string that exists in more
# than one ICD version in the dictionary yields one row per dictionary version.
# Requiring the dictionary side (icd_version_y) to be ICD-10 as well removes those
# duplicate rows and the mismatched titles they carry. Rows with no dictionary match
# (icd_version_y is NaN after the left join) are kept, as before.
is_icd10_diagnosis = diagnoses_data['icd_version_x'] == 10
dictionary_version = diagnoses_data['icd_version_y']
dictionary_is_icd10_or_missing = dictionary_version.isna() | (dictionary_version == 10)
diagnoses_data = diagnoses_data[is_icd10_diagnosis & dictionary_is_icd10_or_missing]

In [10]:
# Restrict the diagnoses to admissions that also have a matched discharge summary
in_cohort = diagnoses_data['hadm_id'].isin(hadm_id_list)
diagnoses_data = diagnoses_data[in_cohort]

## 3. Group Diagnoses by Patient Stay (HADM_ID)

In [11]:
def process_diagnoses_data(data):
    """
    Collapse diagnosis rows into one row per hadm_id.

    Rows are ordered by (hadm_id, seq_num) before grouping, so the list columns
    of each admission follow ascending seq_num.

    Returns a DataFrame with columns hadm_id, subject_id (first value in the
    group), diagnoses_list (long_title values), icd_codes and seq_nums.
    """
    ordered = data.sort_values(by=['hadm_id', 'seq_num'])

    # output column -> (source column, aggregation)
    aggregations = {
        'subject_id': ('subject_id', 'first'),
        'diagnoses_list': ('long_title', list),
        'icd_codes': ('icd_code', list),
        'seq_nums': ('seq_num', list),
    }

    per_admission = ordered.groupby('hadm_id').agg(**aggregations)
    return per_admission.reset_index()

In [12]:
# Build one row per admission with its ordered diagnosis lists
grouped_diagnoses = process_diagnoses_data(data=diagnoses_data)

In [13]:
# Render each admission's ICD code list as a single ', '-separated string
def convert_icd_codes_to_string(df):
    """
    Return a copy of df with an added 'icd_codes_str' column.

    Each value of 'icd_codes' is iterated; every code is converted with str(),
    stripped of surrounding whitespace, and the results are joined with ', '.
    The input DataFrame is left unmodified.
    """
    def _join_codes(codes):
        cleaned = [str(code).strip() for code in codes]
        return ', '.join(cleaned)

    # Work on a copy so the caller's frame does not gain the new column
    result = df.copy()
    result['icd_codes_str'] = result['icd_codes'].apply(_join_codes)
    return result

# Add the comma-separated 'icd_codes_str' column used as the training label
grouped_diagnoses = convert_icd_codes_to_string(df=grouped_diagnoses)

## 4. Merge Diagnosis Data with Discharge Summaries

In [14]:
# Inner-join the grouped diagnoses with the matched discharge rows on hadm_id
final_data = grouped_diagnoses.merge(
    matched_data,
    how='inner',
    on='hadm_id',
)

In [15]:
# Attach a whitespace-delimited word count to each note and drop notes above the limit
if 'text' in final_data.columns:
    word_limit = 2000  # Adjust as needed

    # str() makes the count work for non-string cells as well
    note_lengths = final_data['text'].apply(lambda note: len(str(note).split()))
    final_data['word_count'] = note_lengths

    # Filter out overly long notes (optional)
    within_limit = final_data['word_count'] <= word_limit
    filtered_data = final_data[within_limit]
    print(f"Final dataset has {len(filtered_data)} records")
    final_data = filtered_data

Final dataset has 5000 records


## 5. Split Data into Training and Testing Sets

In [16]:
# Simple train/test split using sklearn.
# An 80/20 split matches the record counts shown below this cell, the sizes embedded
# in the generated file names, and the default of process_mimic_to_jsonl.
train_df, test_df = train_test_split(
    final_data,
    test_size=0.2,  # 80% training, 20% testing
    random_state=42  # For reproducibility
)

print(f"Training set: {len(train_df)} records")
print(f"Testing set: {len(test_df)} records")

Training set: 4000 records
Testing set: 1000 records


## 6. Convert to JSONL Format for Model Training

In [17]:
def _is_missing(value):
    """Return True when value is None or a scalar pandas treats as missing (NaN, NA, NaT)."""
    if value is None:
        return True
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna returns an array for list-like input; such a value is not a missing scalar
        return False


def create_jsonl_entry(row):
    """
    Create a JSONL entry for model training

    Parameters:
    - row: mapping-like object (e.g. a DataFrame row) with 'text' and 'icd_codes_str'

    Returns:
    - dict with a "messages" list holding the system prompt, the discharge text as
      the user message and the ICD code string as the assistant message; or None
      (after printing the reason) when a field is absent or holds a missing value.
      Callers skip None entries.
    """
    try:
        text = row['text']
        codes = row['icd_codes_str']
    except (KeyError, IndexError, TypeError) as e:
        print(f"Error processing row: {e}")
        return None

    # A NaN would be serialized by json.dumps as the bare token NaN, which is not
    # valid JSON, and None would become a null message content; reject both.
    for field, value in (('text', text), ('icd_codes_str', codes)):
        if _is_missing(value):
            print(f"Error processing row: missing value for '{field}'")
            return None

    return {
        "messages": [
            {
                "role": "system",
                "content": "You generate accurate ICD-10 codes based on descriptions."
            },
            {
                "role": "user",
                "content": str(text)
            },
            {
                "role": "assistant",
                "content": str(codes)
            }
        ]
    }

In [18]:
# Write the training and testing splits as JSONL files, one JSON object per line.
# The timestamp is fixed here; use datetime.now().strftime("%Y%m%d_%H%M%S") for an actual timestamp
timestamp = "20240330_120000"

# File names embed the number of records in each split
train_file = f'mimic_train_{len(train_df)}_{timestamp}.jsonl'
test_file = f'mimic_test_{len(test_df)}_{timestamp}.jsonl'

# Training file first, then testing file; rows for which create_jsonl_entry
# returns a falsy value are skipped
for out_path, split_df in ((train_file, train_df), (test_file, test_df)):
    with open(out_path, 'w', encoding='utf-8') as f:
        for _, row in split_df.iterrows():
            entry = create_jsonl_entry(row)
            if entry:
                f.write(json.dumps(entry) + '\n')

print(f"JSONL files created:\n- {train_file}\n- {test_file}")

JSONL files created:
- mimic_train_4000_20240330_120000.jsonl
- mimic_test_1000_20240330_120000.jsonl


## 7. Expected JSONL Entry Format (Illustrative Sample)

In [19]:
# Print a hard-coded example showing the shape of one JSONL entry
# (the generated files themselves are not read here)
print("Sample entry from training JSONL file:")
sample_turns = (
    ("system", "You generate accurate ICD-10 codes based on descriptions."),
    ("user", "[DISCHARGE SUMMARY TEXT HERE]"),
    ("assistant", "I10, E785, Z87891, I2510"),
)
sample_entry = {
    "messages": [{"role": role, "content": content} for role, content in sample_turns]
}
print(json.dumps(sample_entry, indent=2))

Sample entry from training JSONL file:
{
  "messages": [
    {
      "role": "system",
      "content": "You generate accurate ICD-10 codes based on descriptions."
    },
    {
      "role": "user",
      "content": "[DISCHARGE SUMMARY TEXT HERE]"
    },
    {
      "role": "assistant",
      "content": "I10, E785, Z87891, I2510"
    }
  ]
}


## 8. Complete Processing Pipeline (All Steps in One Function)

In [20]:
def _write_jsonl(df, path):
    """Write one JSON object per row of df to path, skipping rows create_jsonl_entry rejects."""
    with open(path, 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            entry = create_jsonl_entry(row)
            if entry:
                f.write(json.dumps(entry) + '\n')


def process_mimic_to_jsonl(discharge_path, diagnoses_path, icd_dict_path, mapping_path, 
                          test_size=0.2, word_limit=2000, random_state=42):
    """
    Complete MIMIC data processing pipeline

    Loads the four CSV files, keeps ICD-10 diagnoses for admissions that have a
    matched discharge summary, groups the codes per admission in seq_num order,
    optionally drops long notes, splits into train/test and writes one JSONL
    file per split into the current working directory.

    Parameters:
    - discharge_path: Path to discharge.csv
    - diagnoses_path: Path to diagnoses_icd.csv
    - icd_dict_path: Path to d_icd_diagnoses.csv
    - mapping_path: Path to encounter_to_hadm.csv
    - test_size: Proportion of data to use for testing (default: 0.2)
    - word_limit: Maximum number of words in discharge text (default: 2000);
      a falsy value disables the filter
    - random_state: Random seed for reproducibility (default: 42)

    Returns:
    - train_file: Path to training JSONL file
    - test_file: Path to testing JSONL file
    """
    print("Starting MIMIC data processing...")

    # 1. Load data
    print("Loading data...")
    discharge_data = pd.read_csv(discharge_path)
    encounter_to_hadm = pd.read_csv(mapping_path)
    icd_diagnoses_data = pd.read_csv(icd_dict_path)
    diagnoses_data = pd.read_csv(diagnoses_path)

    # 2. Merge discharge data with HADM mapping
    matched_data = pd.merge(discharge_data, encounter_to_hadm, left_on='hadm_id', right_on='value')
    hadm_id_list = matched_data['hadm_id'].tolist()

    # 3. Process diagnoses data
    print("Processing diagnoses data...")
    # Join on (icd_code, icd_version): joining on icd_code alone pairs a diagnosis with
    # every dictionary version that reuses the same code string, which duplicates rows
    # and can attach the title of the wrong ICD version.
    diagnoses_data = pd.merge(
        diagnoses_data,
        icd_diagnoses_data,
        on=['icd_code', 'icd_version'],
        how='left'
    )
    diagnoses_data = diagnoses_data[diagnoses_data['icd_version'] == 10]
    diagnoses_data = diagnoses_data[diagnoses_data['hadm_id'].isin(hadm_id_list)]

    # 4. Group diagnoses by HADM_ID (sorted first so each list follows seq_num order)
    sorted_data = diagnoses_data.sort_values(['hadm_id', 'seq_num'])
    grouped_data = sorted_data.groupby('hadm_id').agg(
        subject_id=('subject_id', 'first'),
        diagnoses_list=('long_title', list),
        icd_codes=('icd_code', list),
        seq_nums=('seq_num', list)
    ).reset_index()

    # 5. Convert ICD codes to string
    grouped_data['icd_codes_str'] = grouped_data['icd_codes'].apply(
        lambda x: ', '.join(str(code).strip() for code in x)
    )

    # 6. Merge with discharge data
    print("Merging diagnoses with discharge summaries...")
    final_data = pd.merge(grouped_data, matched_data, on='hadm_id', how='inner')

    # 7. Filter by word count if needed
    if word_limit and 'text' in final_data.columns:
        final_data['word_count'] = final_data['text'].apply(lambda x: len(str(x).split()))
        final_data = final_data[final_data['word_count'] <= word_limit]
        print(f"After filtering by word count: {len(final_data)} records")

    # 8. Split data
    print("Splitting data into training and testing sets...")
    train_df, test_df = train_test_split(
        final_data, 
        test_size=test_size,
        random_state=random_state
    )
    print(f"Training set: {len(train_df)} records, Testing set: {len(test_df)} records")

    # 9. Convert to JSONL
    print("Converting to JSONL format...")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # File names embed the split size and a shared timestamp
    train_file = f'mimic_train_{len(train_df)}_{timestamp}.jsonl'
    _write_jsonl(train_df, train_file)

    test_file = f'mimic_test_{len(test_df)}_{timestamp}.jsonl'
    _write_jsonl(test_df, test_file)

    print(f"JSONL files created:\n- {train_file}\n- {test_file}")
    return train_file, test_file

In [21]:
# Example usage of process_mimic_to_jsonl. The call is wrapped in a string literal so
# that running this cell does not read the CSV files or write any JSONL output;
# remove the surrounding triple quotes to execute it.
'''
train_file, test_file = process_mimic_to_jsonl(
    discharge_path='../discharge.csv',
    diagnoses_path='../diagnoses_icd.csv',
    icd_dict_path='../d_icd_diagnoses.csv',
    mapping_path='../encounter_to_hadm.csv',
    test_size=0.2,
    word_limit=2000,
    random_state=42
)
'''