In [1]:
import os, duckdb, pandas as pd, matplotlib.pyplot as plt

# Input/output locations, relative to the current working directory.
# They assume the notebook is executed from the notebooks/ folder.
PARQUET_ROOT = os.path.join("..", "data", "synthetic")
OUT_DIR = os.path.join("..", "reports", "figures")

# Fail fast when the working directory is not what the relative paths expect.
# Without this check the figures folder would be created in the wrong place
# before any query had a chance to report the missing input data.
if not os.path.isdir(PARQUET_ROOT):
    raise FileNotFoundError(
        f"Parquet root not found: {os.path.abspath(PARQUET_ROOT)} "
        "(run this notebook from the notebooks/ directory)"
    )
os.makedirs(OUT_DIR, exist_ok=True)

# No database path is passed, so DuckDB uses its default database
# (in-memory according to the DuckDB Python API docs).
con = duckdb.connect()
print("Connected to DuckDB. Parquet root:", PARQUET_ROOT)

Connected to DuckDB. Parquet root: ../data/synthetic


In [2]:
# KPI 1: number of records per year, gender and asthma control status,
# read from every part file under PARQUET_ROOT/year=*/month=*/.
q1 = f"""
SELECT year, gender, control_status, COUNT(*) AS n
FROM read_parquet('{PARQUET_ROOT}/year=*/month=*/part-*.parquet')
GROUP BY 1,2,3
ORDER BY 1,2,3
"""
kpi1 = con.execute(q1).fetch_df()
kpi1.to_csv(os.path.join(OUT_DIR, "control_status_by_year_gender.csv"), index=False)

# Pivot for grouped bar.
# The query result contains a group whose control_status is missing.
# pivot_table drops rows with a missing key by default, which would silently
# remove that group from the chart, so label it explicitly for plotting.
# The exported CSV and the displayed frame keep the original missing value.
kpi1_plot = kpi1.assign(control_status=kpi1["control_status"].fillna("Unknown"))
pivot1 = kpi1_plot.pivot_table(
    index=["year", "gender"],
    columns="control_status",
    values="n",
    aggfunc="sum",  # values are counts; the default (mean) would turn them into floats
    fill_value=0,
)

fig, ax = plt.subplots(figsize=(10, 5))
pivot1.plot(kind="bar", ax=ax)
ax.set_title("Asthma Control Status by Year & Gender")
ax.set_xlabel("Year, Gender")
ax.set_ylabel("Count")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, "control_status_by_year_gender.png"))
plt.close(fig)  # the figure is only needed on disk; avoid keeping it open in the kernel

kpi1.head()

Unnamed: 0,year,gender,control_status,n
0,2023,Female,Partly Controlled,55686
1,2023,Female,Poorly Controlled,5465
2,2023,Female,Well Controlled,514468
3,2023,Female,,49943
4,2023,Male,Partly Controlled,56036


In [3]:
# KPI 2: number of records per age band.
# A NULL age fails every `age <= ...` comparison and would otherwise fall
# through to the ELSE branch and be counted as '71+'; it gets its own
# 'Unknown' bucket instead. The labels happen to sort correctly as text
# ('0-18' < '19-35' < ... < '71+' < 'Unknown'), which is what ORDER BY 1 relies on.
q2 = f"""
SELECT CASE
    WHEN age IS NULL THEN 'Unknown'
    WHEN age <= 18 THEN '0-18'
    WHEN age <= 35 THEN '19-35'
    WHEN age <= 50 THEN '36-50'
    WHEN age <= 70 THEN '51-70'
    ELSE '71+'
END AS age_group,
COUNT(*) AS n
FROM read_parquet('{PARQUET_ROOT}/year=*/month=*/part-*.parquet')
GROUP BY 1
ORDER BY 1
"""
kpi2 = con.execute(q2).fetch_df()
kpi2.to_csv(os.path.join(OUT_DIR, "age_distribution.csv"), index=False)

# One bar per age band, in the order returned by the query.
fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(kpi2["age_group"], kpi2["n"])
ax.set_title("Age Distribution")
ax.set_xlabel("Age Group")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, "age_distribution.png"))
plt.close(fig)  # the figure is only needed on disk

kpi2

Unnamed: 0,age_group,n
0,0-18,823794
1,19-35,998544
2,36-50,881975
3,51-70,1178294
4,71+,1117393


In [4]:
# KPI 3: number of records per BMI category and asthma control status.
# A NULL bmi fails every `bmi < ...` comparison and would otherwise fall
# through to the ELSE branch and be counted as 'Obese'; it gets its own
# 'Unknown' category instead.
q3 = f"""
SELECT
  CASE
    WHEN bmi IS NULL THEN 'Unknown'
    WHEN bmi < 18.5 THEN 'Underweight'
    WHEN bmi < 25   THEN 'Normal'
    WHEN bmi < 30   THEN 'Overweight'
    ELSE 'Obese'
  END AS bmi_category,
  control_status,
  COUNT(*) AS n
FROM read_parquet('{PARQUET_ROOT}/year=*/month=*/part-*.parquet')
GROUP BY 1,2
ORDER BY 1,2
"""
kpi3 = con.execute(q3).fetch_df()
kpi3.to_csv(os.path.join(OUT_DIR, "bmi_by_control.csv"), index=False)

# The query result contains groups whose control_status is missing.
# pivot_table drops rows with a missing key by default, which would silently
# remove them from the chart, so label them explicitly for plotting.
# The exported CSV and the displayed frame keep the original missing value.
kpi3_plot = kpi3.assign(control_status=kpi3["control_status"].fillna("Unknown"))
pivot3 = kpi3_plot.pivot_table(
    index="bmi_category",
    columns="control_status",
    values="n",
    aggfunc="sum",  # values are counts; the default (mean) would turn them into floats
    fill_value=0,
)

# Show the categories from lowest to highest BMI rather than alphabetically.
bmi_order = ["Underweight", "Normal", "Overweight", "Obese", "Unknown"]
pivot3 = pivot3.reindex([cat for cat in bmi_order if cat in pivot3.index])

fig, ax = plt.subplots(figsize=(9, 5))
pivot3.plot(kind="bar", ax=ax)
ax.set_title("BMI Category vs Asthma Control Status")
ax.set_xlabel("BMI Category")
ax.set_ylabel("Count")
plt.setp(ax.get_xticklabels(), rotation=0)
fig.tight_layout()
fig.savefig(os.path.join(OUT_DIR, "bmi_by_control.png"))
plt.close(fig)  # the figure is only needed on disk

kpi3.head()

Unnamed: 0,bmi_category,control_status,n
0,Normal,Partly Controlled,34266
1,Normal,Poorly Controlled,1173
2,Normal,Well Controlled,1204742
3,Normal,,107365
4,Obese,Partly Controlled,358307


In [5]:
# KPI 4: how many records fall on each distinct er_visits value.
q4 = f"""
SELECT er_visits, COUNT(*) AS n
FROM read_parquet('{PARQUET_ROOT}/year=*/month=*/part-*.parquet')
GROUP BY 1
ORDER BY 1
"""
kpi4 = con.execute(q4).fetch_df()

# Persist the table next to the figure it feeds.
er_csv_path = os.path.join(OUT_DIR, "er_visits_distribution.csv")
kpi4.to_csv(er_csv_path, index=False)

# Bar chart drawn through the explicit Figure/Axes objects.
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(kpi4["er_visits"], kpi4["n"])
ax.set_title("ER Visits per Patient")
ax.set_xlabel("ER Visits (count)")
ax.set_ylabel("Patients")
fig.tight_layout()
er_png_path = os.path.join(OUT_DIR, "er_visits_distribution.png")
fig.savefig(er_png_path)
plt.close(fig)

kpi4.head()

Unnamed: 0,er_visits,n
0,0,3351553
1,1,1340915
2,2,267847
3,3,35863
4,4,3529


In [6]:
# Close the DuckDB connection opened in the first cell; `con` cannot be used
# for further queries after this point without reconnecting.
con.close()
print("Closed DuckDB connection.")

Closed DuckDB connection.
