# Constants

In [1]:
DIAGNOSIS_CODES_PATH = '../data/diagnosis_codes.csv'
DISTINCT_CODES = 631
NUM_RECORDS = 58302
MAX_CODES = 43
TARGET_MEAN = 5.11

# Imports

In [2]:
import pandas as pd
import numpy as np
import random

# Exploration

We begin by inspecting the dataset to ensure it has been loaded correctly. Specifically, we confirm the presence of diagnosis codes and verify that they are stored as strings. This step is important to prepare the data for further processing and sampling.

In [3]:
df = pd.read_csv(DIAGNOSIS_CODES_PATH)
df.shape

(15361, 3)

In [4]:
df.head()

Unnamed: 0,ID,Diagnostic Code,Diagnosis Description
0,1,1,Cholera
1,2,10,Cholera D/T Vib Cholerae
2,3,11,Cholera D/T Vib El Tor
3,4,19,Cholera Nos
4,5,2,Typhoid/Paratyphoid Fev


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15361 entries, 0 to 15360
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ID                     15361 non-null  int64 
 1   Diagnostic Code        15361 non-null  object
 2   Diagnosis Description  15361 non-null  object
dtypes: int64(1), object(2)
memory usage: 360.1+ KB


# Transformation

We sample 631 unique diagnosis codes from the dataset and normalize them into standard ICD-9 format (`NNN.NN`) by inserting a decimal point after the third digit when needed. This formatting is essential for consistency, especially when comparing codes or generating synthetic data later on.

In [6]:
# Ensure 'Diagnostic Code' is treated as a string
df["Diagnostic Code"] = df["Diagnostic Code"].astype(str)

# Drop duplicate codes
unique_codes = df["Diagnostic Code"].drop_duplicates()

# Sanity check
print(f"Available unique codes: {len(unique_codes)}")
if len(unique_codes) < DISTINCT_CODES:
    raise ValueError(f"Not enough unique codes to sample {DISTINCT_CODES}.")

# Randomly sample the desired number of distinct diagnosis codes
sampled_codes = unique_codes.sample(n=DISTINCT_CODES, random_state=42).sort_values()

print(f"Sampled unique codes: {len(sampled_codes)}")

Available unique codes: 15361
Sampled unique codes: 631


In [7]:
def normalize_icd9(code: str) -> str:
    """
    Normalize ICD-9 codes by inserting a dot after the third digit if needed.
    Example: '0010' → '001.0', '0088' → '008.8'
    """
    code = code.zfill(3)  # pad with zeros if needed
    return code if len(code) <= 3 else f"{code[:3]}.{code[3:]}"

In [8]:
print(normalize_icd9("0020"))

002.0


In [9]:
sampled_codes_normalized = sampled_codes.apply(normalize_icd9)

In [10]:
sampled_codes_normalized.head()

47      006.8
88      008.8
99     010.03
101    010.05
102    010.06
Name: Diagnostic Code, dtype: object

In [11]:
# Save final codes for future use
sampled_codes_normalized.to_csv("../data/diagnosis_codes_final.csv", index=False)

To ensure that our normalization process is accurate, we manually verified several sampled and formatted ICD-9 codes using the official reference ranges available at: [AAPC ICD-9 Code Range](https://www.aapc.com/codes/icd9-codes-range/).

# Dataset Generation

In [17]:
# Generate number of codes per record
rng = np.random.default_rng(42)
code_counts = rng.poisson(lam=TARGET_MEAN, size=NUM_RECORDS)
code_counts = np.clip(code_counts, 1, MAX_CODES)

diagnosis_codes = sampled_codes_normalized.tolist()

# Create synthetic records using NumPy choice
synthetic_records = []
for count in code_counts:
    record = rng.choice(diagnosis_codes, size=count, replace=False).tolist()
    synthetic_records.append(record)

# Keeps each record as a comma-separated string
rows_as_strings = [",".join(record) for record in synthetic_records]

# Create a 1-column DataFrame just for previewing
df_preview = pd.DataFrame(rows_as_strings, columns=["Diagnosis Codes"])

# Preview
df_preview.head()

Unnamed: 0,Diagnosis Codes
0,"378.6,385.02,094.84,V45.69,952.01,782.4,726.2,..."
1,"094.84,250.50,312.1,361.31,952.01,885.0,600"
2,"790.6,131.0,805.8,298.0,011.24,491.8,637.3,655.8"
3,"759.0,801.02"
4,"838,357.5,V83.01,839.3,E83.20,270.1"


In [18]:
code_lengths = [len(r) for r in synthetic_records]
print("Total records:", len(synthetic_records))
print("Max codes per record:", max(code_lengths))
print("Average codes per record:", round(np.mean(code_lengths), 2))
print("Min codes per record:", min(code_lengths))

Total records: 58302
Max codes per record: 19
Average codes per record: 5.11
Min codes per record: 1


In [20]:
# Save final dataset
with open("../data/dataset_final.csv", "w") as f:
    for line in rows_as_strings:
        f.write(line + "\n")