In [2]:
import os
import json
import pandas as pd
from zipfile import ZipFile

# === Step 1: Extract FHIR JSONs (Converted by name.zip)
# The archive is unpacked into a folder relative to the current working
# directory; Step 2 walks that folder.
extract_path = "extracted_fhir_jsons"
zip_path = "/home/user/Desktop/LLM_UA_Test Files/LLM_UA_Test Files/Structured Data/CCDA/Converted by name.zip"
with ZipFile(zip_path, mode='r') as fhir_archive:
    # Unpack every member of the archive under extract_path.
    fhir_archive.extractall(path=extract_path)

# === Step 2: Build LOINC Code - Display Name mapping from JSONs
# Walk every extracted *.json file and, for each Observation resource, record
# the display text of any coding whose system mentions "loinc.org".
# If the same code appears more than once, the last display seen wins.
loinc_display_map = {}
for root, _, files in os.walk(extract_path):
    for file in files:
        if not file.endswith(".json"):
            continue
        json_path = os.path.join(root, file)
        try:
            # JSON interchange text is UTF-8; do not rely on the locale default.
            with open(json_path, 'r', encoding="utf-8") as f:
                bundle = json.load(f)
            for entry in bundle.get("entry", []):
                resource = entry.get("resource", {})
                if resource.get("resourceType") != "Observation":
                    continue
                for coding in resource.get("code", {}).get("coding", []):
                    if "loinc.org" not in coding.get("system", ""):
                        continue
                    code = coding.get("code", "").strip()
                    display = coding.get("display", "").strip()
                    # Ignore empty values and the literal placeholder "unknown".
                    if code and display and display.lower() != "unknown":
                        loinc_display_map[code] = display
        except Exception as exc:
            # Still best-effort (an unreadable or oddly shaped file is skipped),
            # but the skip is now reported, and KeyboardInterrupt / SystemExit
            # are no longer swallowed as they were by the bare `except:`.
            print(f"Skipping {json_path}: {type(exc).__name__}: {exc}")
            continue

# === Step 3: Clean the V2 file using corrected LOINC display names
# The file name is relative, so it is resolved against the current working directory.
df_v2 = pd.read_csv(filepath_or_buffer="patient_observations_1_cleaned_v2.csv")

# Replace bad display names
def fix_display_name(row):
    """Return the display name to keep for one row.

    ``row`` must support ``row["LOINC Code"]`` and ``row["Display Name"]``.
    When the current name (stripped, lower-cased) is one of the placeholder
    values listed below, the name registered for the row's code in the
    module-level ``loinc_display_map`` is returned; if the code is not in
    that map, the row's current name is returned unchanged. Any other name
    is returned as-is.
    """
    replaceable_names = {
        "unknown", "nan", "",
        "bp systolic", "bp diastolic",
        "bmi (body mass index)",
        "body weight measured", "body height measured",
        "oxygen saturation by pulse oximetry",
        "respiratory rate",
    }

    loinc_code = str(row["LOINC Code"]).strip()
    current_name = row["Display Name"]
    # str() turns a missing (NaN) cell into "nan", which is in the set above.
    normalized_name = str(current_name).strip().lower()

    if normalized_name not in replaceable_names:
        return current_name
    return loinc_display_map.get(loinc_code, current_name)

# Run fix_display_name on every row and store the results back in the column.
corrected_display_names = df_v2.apply(fix_display_name, axis="columns")
df_v2["Display Name"] = corrected_display_names

# === Step 4: Save the final cleaned file
# Written without the index column, to a path relative to the working directory.
df_v2.to_csv(path_or_buf="patient_observations_1_cleaned_v2_final.csv", index=False)
print("Final cleaned CSV saved as: patient_observations_1_cleaned_v2_final.csv")





Final cleaned CSV saved as: patient_observations_1_cleaned_v2_final.csv


In [3]:
import os
import json
import pandas as pd

# === Step 1: Load the cleaned CSV with corrected Display Names
csv_path = "/home/user/Desktop/LLM_UA_Test Files/LLM_UA_Test Files/Structured Data/CCDA/patient_observations_1_cleaned_v2_final.csv"
df = pd.read_csv(csv_path)

# Build a mapping from LOINC Code → Display Name
# Rows with a missing code or a missing display name are excluded first:
# pandas represents a blank cell as NaN, and NaN is truthy, so a NaN display
# name would pass a plain truthiness check and end up in the output JSON as
# the non-standard token NaN. A NaN code would otherwise become the key "nan".
# Codes are stripped so they match lookups made with stripped codes.
# If a code appears on several rows, the last row's name wins.
rows_with_names = df.dropna(subset=["LOINC Code", "Display Name"])
loinc_to_display = dict(
    zip(
        rows_with_names["LOINC Code"].astype(str).str.strip(),
        rows_with_names["Display Name"],
    )
)

# === Step 2: Set input and output folders
# output_dir receives the rewritten copies of the JSON files found in source_dir.
output_dir = "/home/user/Desktop/LLM_UA_Test Files/LLM_UA_Test Files/Structured Data/CCDA/cleaned_patients_v3"
source_dir = "/home/user/Desktop/LLM_UA_Test Files/LLM_UA_Test Files/Structured Data/CCDA/cleaned_patients_v2"
# Create the output folder (and any missing parents); fine if it already exists.
os.makedirs(name=output_dir, exist_ok=True)

# === Step 3: Update JSONs with fixed Display Names
# For every *.json file in source_dir, overwrite each observation's
# "description" with the display name mapped to its code (when one exists),
# then write the result under the same file name in output_dir.
for filename in os.listdir(source_dir):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(source_dir, filename)
    # Read as UTF-8 explicitly instead of relying on the locale default.
    with open(file_path, "r", encoding="utf-8") as f:
        patient = json.load(f)

    for obs in patient.get("observations", []):
        code = str(obs.get("code", "")).strip()
        display_name = loinc_to_display.get(code)
        # Accept only non-empty strings. A missing CSV cell arrives as NaN,
        # which is truthy and would be serialized by json.dump as the
        # non-standard token NaN, so a bare truthiness check is not enough.
        if isinstance(display_name, str) and display_name:
            obs["description"] = display_name

    # Save to V3 output
    out_path = os.path.join(output_dir, filename)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(patient, f, indent=2)

print("All files updated with corrected Display Names and saved to cleaned_patients_v3/")




All files updated with corrected Display Names and saved to cleaned_patients_v3/
