In [None]:
import os
import time
from getpass import getpass

import numpy as np
import openai
import pandas as pd

In [None]:
# Display setting that helps when dataframes hold long string values
pd.options.display.max_colwidth = 0

In [None]:
# Read in ICD9 lookup table.
# ICD9_CODE is forced to text: later cells slice it character-by-character,
# and leaving the dtype to inference can turn all-digit codes into integers
# (dropping leading zeros and breaking the string slicing).
icd9_lookup = pd.read_csv("D_ICD_DIAGNOSES.csv.gz", dtype={"ICD9_CODE": str})

In [None]:
# Drop the supplementary ICD9 codes, i.e. those whose first character is E or V
suppl_mask = (
    icd9_lookup["ICD9_CODE"]
    .apply(lambda code: code[0])
    .isin(["E", "V"])
)
icd9_lookup = icd9_lookup.loc[~suppl_mask]

In [None]:
# Drop codes whose first two characters are 78 or 79 as well
# (per the original note, these are not tied to a particular disease)
nonspecific_mask = (
    icd9_lookup["ICD9_CODE"]
    .apply(lambda code: code[:2])
    .isin(["78", "79"])
)
icd9_lookup = icd9_lookup.loc[~nonspecific_mask]

In [None]:
# Get first three characters of each code.
# .assign returns a new frame, so the column is not written onto the filtered
# slice produced by the masking cells above (which triggers pandas'
# SettingWithCopyWarning); .str[:3] is the vectorized form of x[0:3].
icd9_lookup = icd9_lookup.assign(first_3_digits=icd9_lookup["ICD9_CODE"].str[:3])

In [None]:
# Distinct 3-character prefixes, in order of first appearance; these are the values to look up
lookup_vals = pd.unique(icd9_lookup["first_3_digits"])

In [None]:
# Take the key from the OPENAI_API_KEY environment variable when it is set;
# otherwise prompt for it with getpass so the secret is not echoed into the
# notebook's saved output (a plain input() prompt would echo it).
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [None]:
def _fetch_symptoms(code_prefix, max_attempts=5):
    """Ask the chat model for symptoms tied to a 3-character ICD-9 code prefix.

    Args:
        code_prefix: First three characters of an ICD-9 code; interpolated
            into the prompt as-is.
        max_attempts: Total number of tries allowed when the API raises an
            error treated as transient. Values below 1 are treated as 1.

    Returns:
        The text content of the first choice in the model's response.

    Raises:
        openai.error.OpenAIError: Re-raised once the final attempt fails with
            a transient error; non-transient errors propagate immediately.
    """
    prompt = f"""
        Please return a list of 5-10 symptoms associated with an ICD-9 code whose first three digits are {code_prefix}.
        These should be symptoms expected to appear in clinical notes for a patient with that diagnosis.
        Please return in the format: "Symptoms: <comma-separated list of symptoms>"
    """
    attempts_allowed = max(1, max_attempts)
    for attempt in range(1, attempts_allowed + 1):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "user", "content": prompt},
                ],
            )
            return response.choices[0].message["content"]
        except (
            openai.error.RateLimitError,
            openai.error.APIConnectionError,
            openai.error.Timeout,
            openai.error.ServiceUnavailableError,
            openai.error.APIError,
        ):
            # Only these error types are retried; anything else (e.g. a bad
            # key or malformed request) would fail again, so it is not caught.
            if attempt >= attempts_allowed:
                raise
            # Exponential backoff, capped so a single wait stays bounded
            time.sleep(min(2 ** attempt, 60))


# Loop through lookup values (i.e., first 3 digits of ICD9 code)
# and use ChatGPT to get list of symptoms for each.
# symptoms[i] holds the raw response text for lookup_vals[i].
symptoms = []
for i, temp_first_3 in enumerate(lookup_vals):
    if i % 100 == 0:
        print(f"On code {i} of {len(lookup_vals)}")

    symptoms.append(_fetch_symptoms(temp_first_3))

In [None]:
# Assemble the prefix -> symptoms table and write it out as CSV
# NOTE: each symptoms entry is still one unparsed string at this stage
symptom_columns = {"icd9_first_3": lookup_vals, "symptoms": symptoms}
output_df = pd.DataFrame(data=symptom_columns)
output_df.to_csv("icd9_symptom_map_v2.csv", index=False)