In [1]:
# Load the joined patient tables from the stroke SQLite database into `df`.
import sqlite3
from contextlib import closing

import pandas as pd

# closing() guarantees the connection is released even if the query raises.
# (A bare sqlite3 connection used as a context manager only scopes the
# transaction; it does not close the handle.)
# NOTE(review): SELECT * appears to also pull each joined table's own `id`
# column, so the frame ends up with more than one `id` column — confirm
# whether those surrogate keys are wanted downstream.
with closing(sqlite3.connect("../data/stroke.db")) as conn:
    df = pd.read_sql_query("""
SELECT *
FROM patients
JOIN medical_conditions USING(patient_id)
JOIN lifestyle USING(patient_id)
""", conn)

df.head()

Unnamed: 0,patient_id,gender,age,ever_married,Residence_type,stroke,id,hypertension,heart_disease,avg_glucose_level,bmi,id.1,work_type,smoking_status
0,9046,Male,67.0,Yes,Urban,1,1,0,1,228.69,36.6,1,Private,formerly smoked
1,31112,Male,80.0,Yes,Rural,1,2,0,1,105.92,32.5,2,Private,never smoked
2,60182,Female,49.0,Yes,Urban,1,3,0,0,171.23,34.4,3,Private,smokes
3,1665,Female,79.0,Yes,Rural,1,4,1,0,174.12,24.0,4,Self-employed,never smoked
4,56669,Male,81.0,Yes,Urban,1,5,0,0,186.21,29.0,5,Private,formerly smoked


In [2]:
# Column groups used to build the schema.
# Columns listed as numeric in the schema.
numeric_cols = [
    "age",
    "hypertension",
    "heart_disease",
    "avg_glucose_level",
    "bmi",
]
# Columns whose distinct values are enumerated in the schema.
categorical_cols = [
    "gender",
    "ever_married",
    "Residence_type",
    "work_type",
    "smoking_status",
]

(numeric_cols, categorical_cols)

(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi'],
 ['gender', 'ever_married', 'Residence_type', 'work_type', 'smoking_status'])

In [3]:
# Assemble the schema: the numeric column names as-is, plus, for every
# categorical column, the sorted list of distinct non-null values found in df.
schema = {
    "numeric_cols": numeric_cols,
    "categorical_cols": {
        name: sorted(df[name].dropna().unique().tolist())
        for name in categorical_cols
    },
}

schema

{'numeric_cols': ['age',
  'hypertension',
  'heart_disease',
  'avg_glucose_level',
  'bmi'],
 'categorical_cols': {'gender': ['Female', 'Male', 'Other'],
  'ever_married': ['No', 'Yes'],
  'Residence_type': ['Rural', 'Urban'],
  'work_type': ['Govt_job',
   'Never_worked',
   'Private',
   'Self-employed',
   'children'],
  'smoking_status': ['Unknown', 'formerly smoked', 'never smoked', 'smokes']}}

In [4]:
# Write the schema dictionary to ../data/data_schema.json for Streamlit.
import json
from pathlib import Path

data_dir = Path("../data")
out_path = data_dir / "data_schema.json"

# Serialize first, then write in one call; the file is created or overwritten.
out_path.write_text(json.dumps(schema, indent=2))

out_path

PosixPath('../data/data_schema.json')