In [1]:
import git
import pandas as pd

# Working-tree directory of the git repository that contains this notebook.
# `search_parent_directories=True` lets GitPython walk upwards from "." until
# it finds the repository, so the notebook can live in any subdirectory.
# Every dataset path below is built from this value.
repo = git.Repo(".", search_parent_directories=True).working_tree_dir

## Helper Functions

In [2]:
# `dropped_columns` exists because some columns must be loaded for filtering before they can be dropped
def load_dataframe(
    raw_path: str,
    allowed_columns: "list | None" = None,
    column_filters: "dict | None" = None,
    column_dtypes: "dict | None" = None,
    sort_order: "list | None" = None,
    dropped_columns: "list | None" = None,
    column_renames: "dict | None" = None,
    value_renames: "dict | None" = None,
) -> pd.DataFrame:
    """
    Loads a CSV and returns a pandas DataFrame with various transformations

    The steps always run in this order: read -> filter rows -> cast dtypes ->
    rename columns -> sort -> rename values -> drop columns. As a consequence,
    `allowed_columns`, `column_filters` and `column_dtypes` must use the
    ORIGINAL column names from the CSV, whereas `sort_order`, `value_renames`
    and `dropped_columns` must use the names AFTER `column_renames` is applied.

    Every optional argument may be omitted, None, or empty; in each case the
    corresponding step is skipped.

    Parameters:
        raw_path (str): Path to the CSV file
        allowed_columns (list): List of columns to keep (if empty, keep all)
        column_filters (dict): Dictionary of {column: [allowed_values]};
            only rows whose value is in the list are kept
        column_dtypes (dict): Dictionary of {column: dtype}
        sort_order (list): List of columns to sort by
        dropped_columns (list): List of columns to drop (applied last, so a
            column can be used for filtering/sorting and then removed)
        column_renames (dict): Dictionary of {old_column_name: new_column_name}
        value_renames (dict): Dictionary of {column: {old_value: new_value}}

    Returns:
        pandas.DataFrame: The transformed data. The index is not reset, so it
        still carries the row labels from the raw file.

    Raises:
        KeyError: If a column named in `column_filters`, `column_dtypes`,
            `sort_order`, `value_renames` or `dropped_columns` does not exist
            at the point where that step runs.
    """

    # `usecols=None` tells read_csv to keep every column, so a missing or
    # empty `allowed_columns` naturally means "load everything".
    dataframe = pd.read_csv(raw_path, usecols=allowed_columns or None)

    # Row filters are applied one after another, i.e. they are AND-ed.
    for column, allowed_values in (column_filters or {}).items():
        dataframe = dataframe[dataframe[column].isin(allowed_values)]

    # Cast after filtering so that e.g. "category" only contains the values
    # that survived the filters. A single astype(mapping) call returns a new
    # frame instead of assigning columns into the filtered selection.
    if column_dtypes:
        dataframe = dataframe.astype(column_dtypes)

    if column_renames:
        dataframe = dataframe.rename(columns=column_renames)

    if sort_order:
        dataframe = dataframe.sort_values(sort_order)

    for column, value_map in (value_renames or {}).items():
        dataframe[column] = dataframe[column].replace(value_map)

    if dropped_columns:
        dataframe = dataframe.drop(columns=dropped_columns)

    return dataframe

## CDI

In [3]:
# Input (raw CDI export) and output (processed extract) locations.
cdi_raw_path = f"{repo}/datasets/raw/U.S._Chronic_Disease_Indicators_2023.csv"
cdi_processed_path = f"{repo}/datasets/processed/cdi.csv"

# Only these columns are read from the raw CSV.
cdi_cols = [
    "YearStart", "YearEnd",
    "LocationAbbr",
    "Topic", "Question",
    "DataValueUnit", "DataValueType", "DataValue",
    "StratificationCategory1", "Stratification1",
]

# Keep a row only if its Topic AND its DataValueType are listed here.
cdi_filters = dict(
    Topic=[
        "Cancer",
        "Cardiovascular Disease",
        "Chronic Kidney Disease",
        "Chronic Obstructive Pulmonary Disease",
    ],
    DataValueType=[
        "Number",
        "Crude Prevalence",
        "Age-adjusted Prevalence",
        "Average Annual Number",
        "Average Annual Age-adjusted Rate",
        "Average Annual Crude Rate",
    ],
)

# Every loaded column becomes a category except the measurement itself,
# which is stored as a nullable 32-bit float.
cdi_dtypes = {
    column: "Float32" if column == "DataValue" else "category"
    for column in cdi_cols
}

cdi_sort = ["YearEnd", "Topic", "Question", "LocationAbbr"]

cdi_df = load_dataframe(
    raw_path=cdi_raw_path,
    allowed_columns=cdi_cols,
    column_filters=cdi_filters,
    column_dtypes=cdi_dtypes,
    sort_order=cdi_sort,
)

# Persist without the pandas index; it only holds raw-file row numbers.
cdi_df.to_csv(cdi_processed_path, index=False)

  dataframe = pd.read_csv(


## CDI - US/Overall

In [4]:
# Same raw CDI export as the full CDI extract, narrowed to the national
# ("US") rows of the "Overall" stratification category.
cdi_us_overall_raw_path = f"{repo}/datasets/raw/U.S._Chronic_Disease_Indicators_2023.csv"
cdi_us_overall_processed_path = f"{repo}/datasets/processed/cdi_us_overall.csv"

cdi_us_overall_cols = [
    "YearEnd",
    "LocationAbbr",
    "Topic", "Question",
    "DataValueUnit", "DataValueType", "DataValue",
    "StratificationCategory1",
]

# These two columns are read only so they can be filtered on; after the
# filter each holds a single value, so they are dropped from the output.
cdi_us_overall_dropped_cols = ["LocationAbbr", "StratificationCategory1"]

cdi_us_overall_filters = dict(
    Topic=[
        "Chronic Kidney Disease",
        "Chronic Obstructive Pulmonary Disease",
    ],
    DataValueType=[
        "Number",
        "Crude Prevalence",
        "Age-adjusted Prevalence",
    ],
    LocationAbbr=["US"],
    StratificationCategory1=["Overall"],
)

# Cast every column that survives to the output: the measurement becomes a
# nullable 32-bit float, everything else a category.
cdi_us_overall_dtypes = {
    column: "Float32" if column == "DataValue" else "category"
    for column in cdi_us_overall_cols
    if column not in cdi_us_overall_dropped_cols
}

cdi_us_overall_sort = ["YearEnd", "Topic", "Question"]

cdi_us_overall_df = load_dataframe(
    raw_path=cdi_us_overall_raw_path,
    allowed_columns=cdi_us_overall_cols,
    column_filters=cdi_us_overall_filters,
    column_dtypes=cdi_us_overall_dtypes,
    sort_order=cdi_us_overall_sort,
    dropped_columns=cdi_us_overall_dropped_cols,
)

cdi_us_overall_df.to_csv(cdi_us_overall_processed_path, index=False)

  dataframe = pd.read_csv(


## BRFSS

In [5]:
# Input (raw BRFSS export) and output (processed extract) locations.
brfss_raw_path = f"{repo}/datasets/raw/BRFSS_Chronic_Health_Indicators_2024.csv"
brfss_processed_path = f"{repo}/datasets/processed/brfss.csv"

brfss_cols = [
    "Year",
    "Locationabbr",
    "Topic", "Question", "Response",
    "Break_Out",
    "Data_value", "Data_value_type",
]

# Keep only "Yes" responses for the listed topics at location "UW".
# NOTE(review): "UW" is presumably one of BRFSS's aggregate location codes
# rather than a typo for "US" - confirm against the raw file.
brfss_filters = dict(
    Locationabbr=["UW"],
    Topic=[
        "Asthma",
        "COPD",
        "Cardiovascular Disease",
        "Depression",
        "Diabetes",
        "Kidney",
        "Other Cancer",
        "Skin Cancer",
    ],
    Response=["Yes"],
)

brfss_sort = ["Topic", "Question", "Year"]

# Align the BRFSS column names with the CamelCase names used by the CDI data.
brfss_col_renames = dict(
    Locationabbr="LocationAbbr",
    Data_value="DataValue",
    Data_value_type="DataValueType",
)

brfss_val_renames = {
    # Replace the full survey wording with a short condition label.
    "Question": {
        "Adults who have been told they currently have asthma (variable calculated from one or more BRFSS questions)": "Current Asthma",
        "Adults who have ever been told they have asthma (variable calculated from one or more BRFSS questions)": "Lifetime Asthma",
        "Ever told you have COPD?": "Chronic Obstructive Pulmonary Disease",
        "Ever told you had a heart attack (myocardial infarction)?": "Myocardial Infarction",
        "Ever told you had a stroke?": "Stroke",
        "Ever told you had angina or coronary heart disease?": "Coronary Heart Disease",
        "Ever told you that you have a form of depression?": "Depression",
        "Have you ever been told by a doctor that you have diabetes?": "Diabetes",
        "Ever told you have kidney disease?": "Chronic Kidney Disease",
        "Ever told you had any other types of cancer?": "Other Cancers",
        "Ever told you had skin cancer?": "Skin Cancer",
    },
    # Merge the fine-grained topics into two broader groups; topics that are
    # not listed keep their original name.
    "Topic": {
        **dict.fromkeys(["Asthma", "COPD"], "Respiratory Diseases"),
        **dict.fromkeys(["Other Cancer", "Skin Cancer"], "Cancer"),
    },
}

brfss_df = load_dataframe(
    raw_path=brfss_raw_path,
    allowed_columns=brfss_cols,
    column_filters=brfss_filters,
    sort_order=brfss_sort,
    column_renames=brfss_col_renames,
    value_renames=brfss_val_renames,
)

brfss_df.to_csv(brfss_processed_path, index=False)

## Vaccine

In [6]:
# Input (raw daily vaccination series) and output (monthly totals) locations.
vaccine_raw_path = f"{repo}/datasets/raw/us-daily-covid-vaccine-doses-administered.csv"
vaccine_processed_path = f"{repo}/datasets/processed/vaccine.csv"

vaccine_col_renames = {
    "Entity": "State",
    "Daily doses administered (7-day rolling average)": "VaccinationsCount",
}
vaccine_dropped_cols = ["Code"]

vaccine_df = load_dataframe(
    raw_path=vaccine_raw_path,
    column_renames=vaccine_col_renames,
    dropped_columns=vaccine_dropped_cols,
)

# Collapse the daily rows into one row per State / Year / Month by summing
# the daily values, then order the result chronologically.
vaccine_df = (
    vaccine_df
    .assign(Day=pd.to_datetime(vaccine_df["Day"]))
    .assign(
        Year=lambda frame: frame["Day"].dt.year,
        Month=lambda frame: frame["Day"].dt.month,
    )
    .groupby(["State", "Year", "Month"], as_index=False)
    .agg({"VaccinationsCount": "sum"})
    .sort_values(["Year", "Month"])
)

vaccine_df.to_csv(vaccine_processed_path, index=False)

In [7]:
# Deaths: conditions contributing to COVID-19 deaths, monthly rows only.
deaths_raw_path = f"{repo}/datasets/raw/Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv"
deaths_processed_path = f"{repo}/datasets/processed/deaths.csv"

# Keep only the rows whose Group is "By Month".
deaths_filters = {"Group": ["By Month"]}

deaths_col_renames = {"Condition Group": "Topic", "Condition": "Question"}

deaths_sort_order = ["Year", "Month"]

# Removed after filtering; "Group" is needed for the filter above, which is
# why the whole file is loaded instead of passing a column allow-list.
deaths_dropped_cols = [
    "ICD10_codes",
    "Data As Of",
    "Start Date",
    "End Date",
    "Group",
    "Flag",
    "Number of Mentions",
]

# Rows without a death count are discarded first so that the three numeric
# columns can then be cast to plain integers.
deaths_df = (
    load_dataframe(
        raw_path=deaths_raw_path,
        column_filters=deaths_filters,
        dropped_columns=deaths_dropped_cols,
        column_renames=deaths_col_renames,
        sort_order=deaths_sort_order,
    )
    .dropna(subset=["COVID-19 Deaths"])
    .astype({"Year": int, "Month": int, "COVID-19 Deaths": int})
)

deaths_df.to_csv(deaths_processed_path, index=False)

## Disability

In [8]:
# Input (raw disability prevalence export) and output locations.
disability_raw_path = f"{repo}/datasets/raw/Prevalence_of_Disability_Status_and_Types_2022.csv"
disability_processed_path = f"{repo}/datasets/processed/disability.csv"

disability_cols = [
    "Year",
    "LocationAbbr",
    "Category", "Response",
    "Data_Value_Type", "Data_Value",
]

# Map this dataset's column names onto the Topic / Question / DataValue*
# names used by the other processed files.
disability_col_renames = dict(
    Response="Question",
    Category="Topic",
    Data_Value_Type="DataValueType",
    Data_Value="DataValue",
)

# Sorting runs after the rename, hence "Topic" rather than "Category".
disability_sort = ["Year", "LocationAbbr", "Topic"]

disability_df = load_dataframe(
    raw_path=disability_raw_path,
    allowed_columns=disability_cols,
    column_renames=disability_col_renames,
    sort_order=disability_sort,
)

disability_df.to_csv(disability_processed_path, index=False)