# Setting environment

In [1]:
from pathlib import Path
import os
# Resolve the notebook's directory only once per kernel session. Reusing the
# value recorded by an earlier run keeps this cell idempotent: without it,
# every re-run would climb one directory higher and break the relative paths
# used further down.
cfd = globals().get("cfd", Path.cwd())
print(f"Current file directory: {cfd}")
# Relative paths in the following cells (data/..., code/...) are resolved
# against the parent of the notebook's directory.
new_cwd = cfd.parent
os.chdir(new_cwd)
print(f"Current working directory changed to: {Path.cwd()}")

Current file directory: C:\Users\Usuario\PycharmProjects\data-life-cycle-project-2025\code
Current working directory changed to: C:\Users\Usuario\PycharmProjects\data-life-cycle-project-2025


# Inspecting data

In [2]:
# Relative locations; they are resolved against the current working directory.
RAW_DATA_PATH = Path("data/raw/data-gov")
SOURCE_II_DIR = Path("data/asthma_prevalence")
# Raw CSV file that the following cells read.
CSV_PATH = RAW_DATA_PATH.joinpath("CHIS Data-Current Asthma Prevalence by County.csv")

In [3]:
import pandas as pd

# Read the raw comma-separated file, decoding it as cp1252, and display the
# resulting frame as the cell output.
df = pd.read_csv(filepath_or_buffer=CSV_PATH, sep=",", encoding="cp1252")
df

Unnamed: 0,COUNTY,YEARS,STRATA,AGE GROUP,CURRENT PREVALENCE,95% CONFIDENCE INTERVAL,COUNTIES GROUPED,COMMENT
0,California,2015–2016,Total population,All ages,8.7,(8.1–9.3),,
1,Alameda,2015–2016,Total population,All ages,9.1,(5.9–12.2),,
2,Alpine,2015–2016,Total population,All ages,9.3,(4.1–14.6),"Alpine, Amador, Calaveras, Inyo, Mariposa, Mon...",
3,Amador,2015–2016,Total population,All ages,9.3,(4.1–14.6),"Alpine, Amador, Calaveras, Inyo, Mariposa, Mon...",
4,Butte,2015–2016,Total population,All ages,9.4,(3.8–15.1),,Estimate is statistically unstable. Caution is...
...,...,...,...,...,...,...,...,...
1647,Tulare,2021-2022,Age groups,65+ years,7.6,(3.6-11.5),,
1648,Tuolumne,2021-2022,Age groups,65+ years,15.8,(9.4-22.2),"Alpine, Amador, Calaveras, Inyo, Mariposa, Mon...",
1649,Ventura,2021-2022,Age groups,65+ years,7.8,(3.8-11.8),,
1650,Yolo,2021-2022,Age groups,65+ years,6.2,(2.3-10.1),,Estimate is statistically unstable. Caution is...


In [16]:
# Column labels of the raw frame as a plain Python list (shown as cell output).
columns = list(df.columns)
columns

['COUNTY',
 'YEARS',
 'STRATA',
 'AGE GROUP',
 'CURRENT PREVALENCE',
 '95% CONFIDENCE INTERVAL',
 'COUNTIES GROUPED',
 'COMMENT']

From this inspection, we define the SQLite table schemas:
```sql
CREATE TABLE IF NOT EXISTS asthma_prevalence (
    id INTEGER PRIMARY KEY,
    county_id INTEGER,
    year_from INTEGER,
    year_to INTEGER,
    demographic_group_id INTEGER,
    current_prevalence FLOAT,
    ci_95_lower FLOAT,
    ci_95_upper FLOAT,
    comment TEXT,
    FOREIGN KEY (county_id) REFERENCES counties(county_id),
    FOREIGN KEY (demographic_group_id) REFERENCES demographic_group(id)
);

CREATE TABLE IF NOT EXISTS grouped_counties (
    -- Though it's a 1:N relationship, we create a separate table to allow for easier querying and future expansion
    county_id INTEGER,
    asthma_prevalence_id INTEGER,
    FOREIGN KEY (county_id) REFERENCES counties(county_id),
    FOREIGN KEY (asthma_prevalence_id) REFERENCES asthma_prevalence(id),
    PRIMARY KEY (county_id, asthma_prevalence_id)
);

CREATE TABLE IF NOT EXISTS demographic_group (
    id INTEGER PRIMARY KEY,
    strata TEXT,
    age_group TEXT, -- as is from the data
    age_min INTEGER,
    age_max INTEGER
);
```

# Transforming and loading data

In [4]:
import sqlite3

# SQLite database file (relative to the current working directory) plus the
# connection and cursor shared by the cells below.
DB_PATH = Path("data/unified.db")
conn = sqlite3.connect(database=DB_PATH)
cursor = conn.cursor()

## Create tables

In [20]:
# Run the SQL script stored next to the notebooks and commit the result.
# NOTE(review): presumably this file holds the CREATE TABLE statements shown
# in the markdown above - confirm the two stay in sync.
SCHEMA_PATH = Path("code/config/data-gov_schema.sql")
schema_sql = SCHEMA_PATH.read_text()
cursor.executescript(schema_sql)
conn.commit()

In [5]:
def getId_insertMissing(sql_conn, pk_name, table_name, check_values, insert_values):
    """
    Return the primary key of the row matching ``check_values``, inserting the row first if it is missing.

    Table and column names are interpolated into the SQL text (identifiers cannot be
    bound), so they must come from trusted code. All *values* are bound as parameters,
    which keeps quotes in the data harmless and stores ``None``/NaN as SQL NULL instead
    of the text ``'None'``/``'nan'``.

    :param sql_conn: open sqlite3 connection; it is committed after an insert
    :param pk_name: name of the primary-key column whose value is returned
    :param table_name: table to search and, if needed, insert into
    :param check_values: mapping ``column -> value`` identifying the row (must not be empty)
    :param insert_values: mapping ``column -> value`` written when no row matches; not modified
    :return: value of ``pk_name`` for the existing or newly inserted row
    :raises ValueError: if ``check_values`` is empty
    :raises IndexError: if no row matches ``check_values`` even after the insert
    """
    if not check_values:
        raise ValueError("check_values must contain at least one column")

    def _bindable(value):
        # Convert a value into something sqlite3 can bind.
        if value is None:
            return None
        if hasattr(value, "item") and callable(value.item):
            # Assumes objects exposing .item() are numpy-style scalars, which
            # sqlite3 cannot bind directly; unwrap them to plain Python values.
            value = value.item()
        if isinstance(value, float) and value != value:
            # NaN (pandas' missing-value marker) becomes NULL.
            return None
        return value

    # "IS" is SQLite's NULL-safe equality, so a NULL check value matches NULL columns.
    where_clause = ' AND '.join(f'{key} IS ?' for key in check_values)
    select_query = f"SELECT {pk_name} FROM {table_name} WHERE {where_clause}"
    select_params = [_bindable(value) for value in check_values.values()]

    c = sql_conn.cursor()
    found = c.execute(select_query, select_params).fetchone()
    if found is None:
        column_list = ",".join(insert_values.keys())
        placeholders = ', '.join(f':{key}' for key in insert_values.keys())
        insert_query = f'''INSERT INTO {table_name}({column_list}) VALUES({placeholders})'''
        # Bind a normalized copy so the caller's dictionary is left untouched.
        c.execute(insert_query, {key: _bindable(value) for key, value in insert_values.items()})
        sql_conn.commit()
        # Look the row up again rather than trusting lastrowid: pk_name need not be the rowid.
        found = c.execute(select_query, select_params).fetchone()
        if found is None:
            raise IndexError(
                f"row inserted into {table_name} does not match the lookup values {check_values!r}"
            )
    return found[0]

In [6]:
# In-memory copy of the counties table, used to look up county ids by name.
county_table = pd.read_sql(sql="SELECT * FROM counties", con=conn)
def extract_counties_in_string(counties_string):
    """
    Extract the ids of all known counties mentioned in a free-form string.

    The string is split into runs of ASCII letters and spaces (so commas and
    other punctuation act as separators); each run is stripped and looked up
    in the module-level ``county_table`` by its ``county_name`` column. Runs
    that do not match any county are ignored.

    :param counties_string: arbitrary text that may contain several county names;
        anything that is not a string (e.g. ``None`` or NaN) yields an empty list
    :return: list of ``county_id`` values, in order of appearance in the string
    """
    # Imported locally so the function does not depend on a later cell having
    # imported ``re`` before it is called.
    import re

    if not isinstance(counties_string, str):
        return []

    county_ids = []
    for candidate in re.findall(r'[A-Za-z ]+', counties_string):
        name = candidate.strip()
        if not name:
            continue
        matches = county_table.loc[county_table['county_name'] == name, 'county_id']
        if not matches.empty:
            # If a name matches several rows, the first one is used.
            county_ids.append(matches.values[0])
    return county_ids

In [7]:
# Optional reset: empties the three tables filled by this notebook so the load
# below can be repeated from scratch.
for statement in (
    "DELETE FROM asthma_prevalence",
    "DELETE FROM demographic_group",
    "DELETE FROM grouped_counties",
):
    cursor.execute(statement)
conn.commit()

In [8]:
import re
import pandas as pd
from tqdm import tqdm

# Load every county-level row of the CSV into asthma_prevalence, creating the
# demographic_group and grouped_counties rows it refers to along the way.
# Wrapping the iterator lets tqdm advance once per row, including skipped rows.
for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows", unit="row"):
    # Always handle NaNs / non-strings safely for regex
    county = "" if pd.isna(row["COUNTY"]) else str(row["COUNTY"]).strip()

    if county == "California":
        # Skip statewide data
        continue

    # --- demographic_group ---
    # The numbers found in the age-group label become age_min / age_max;
    # labels with fewer than two numbers leave the missing bound(s) as None.
    age_group = "" if pd.isna(row["AGE GROUP"]) else str(row["AGE GROUP"])
    ages = re.findall(r"\d+", age_group)

    demographic_group_id = int(getId_insertMissing(
        conn,
        pk_name="id",
        table_name="demographic_group",
        check_values={
            "strata": row["STRATA"],
            "age_group": row["AGE GROUP"],
        },
        insert_values={
            "strata": row["STRATA"],
            "age_group": row["AGE GROUP"],
            "age_min": int(ages[0]) if len(ages) > 0 else None,
            "age_max": int(ages[1]) if len(ages) > 1 else None,
        }
    ))

    # --- years ---
    # Four-digit numbers in the YEARS text: first -> year_from, second -> year_to.
    years_text = "" if pd.isna(row["YEARS"]) else str(row["YEARS"])
    years = re.findall(r"\d{4}", years_text)

    # --- 95% CI ---
    # Numeric tokens in the interval text: first -> lower bound, second -> upper bound.
    ci_text = "" if pd.isna(row["95% CONFIDENCE INTERVAL"]) else str(row["95% CONFIDENCE INTERVAL"])
    ci_95 = re.findall(r"[\d.]+", ci_text)

    # --- county_id ---
    matched_county_ids = extract_counties_in_string(county)
    if not matched_county_ids:
        # Same exception type as indexing the empty list, but naming the offending county.
        raise IndexError(f"County {county!r} is not present in the counties table")
    county_id = int(matched_county_ids[0])

    cursor.execute("""
        INSERT INTO asthma_prevalence (
            county_id,
            year_from,
            year_to,
            demographic_group_id,
            current_prevalence,
            ci_95_lower,
            ci_95_upper,
            comment
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """, (
        county_id,
        int(years[0]) if len(years) > 0 else None,
        int(years[1]) if len(years) > 1 else None,
        demographic_group_id,
        float(row["CURRENT PREVALENCE"]) if not pd.isna(row["CURRENT PREVALENCE"]) else None,
        float(ci_95[0]) if len(ci_95) > 0 else None,
        float(ci_95[1]) if len(ci_95) > 1 else None,
        row["COMMENT"] if not pd.isna(row["COMMENT"]) else None,
    ))
    prevalence_id = cursor.lastrowid

    # --- grouped counties ---
    # Link the new prevalence row to every known county named in COUNTIES GROUPED.
    grouped_counties = row["COUNTIES GROUPED"]
    if not pd.isna(grouped_counties):
        for cid in extract_counties_in_string(str(grouped_counties)):
            cursor.execute("""
                INSERT INTO grouped_counties (county_id, asthma_prevalence_id)
                VALUES (?, ?)
            """, (int(cid), prevalence_id))

# Persist the inserts: without this final commit, rows written after the last
# commit issued by getId_insertMissing would be lost when the connection closes.
conn.commit()


Processing rows: 100%|██████████| 1652/1652 [00:00<00:00, 1662.05row/s]
