## Data Cleaning Module

In [1]:
## Import libraries and load the data

import pandas as pd
import numpy as np
import plotly.express as px
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Parse the CSV from disk only once. `data_raw` holds the parsed file and
# `data` starts as an independent copy of it, so changes made to one frame
# never show up in the other (same guarantee as two separate reads gave).
data_raw = pd.read_csv('U.S._Chronic_Disease_Indicators.csv')
data = data_raw.copy()


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html


### Identify Questions and Topics

In [2]:
## Exploring the data
# Start from a fresh copy of `data_raw` (loaded in the first cell) instead of
# parsing the CSV again. Rebinding `data` here keeps this cell re-runnable
# even though `data` is trimmed and renamed further down.
data = data_raw.copy()

## Get arrays with every distinct Question and Topic value
questions = data['Question'].unique()
topic = data['Topic'].unique()

## Dataset for nutrition
data_nutrition = data[data['Topic'] == 'Nutrition, Physical Activity, and Weight Status']


def _questions_for_topic(frame, topic_name):
    """Return the distinct 'Question' values of the rows whose 'Topic' equals ``topic_name``."""
    return frame.loc[frame['Topic'] == topic_name, 'Question'].unique()


## Getting the questions out of each subject
# The nutrition subset was already built above, so reuse it rather than filtering twice.
questions_nutrition = data_nutrition['Question'].unique()
questions_disability = _questions_for_topic(data, 'Disability')
questions_healthsts = _questions_for_topic(data, 'Health Status')
questions_alcohol = _questions_for_topic(data, 'Alcohol')
questions_sleep = _questions_for_topic(data, 'Sleep')
questions_mental = _questions_for_topic(data, 'Mental Health')


## Remove some unused columns
# Columns this notebook treats as unused: secondary/tertiary stratifications,
# geolocation, confidence limits, footnote symbols, responses and the ID columns.
columns_to_drop = [
    'StratificationCategory2', 'Stratification2',
    'StratificationCategory3', 'Stratification3',
    'Geolocation', 'YearEnd', 'Response',
    'DataValueFootnoteSymbol', 'LowConfidenceLimit', 'HighConfidenceLimit',
    'LocationID', 'TopicID', 'QuestionID', 'ResponseID', 'DataValueTypeID',
    'StratificationCategoryID1', 'StratificationID1',
    'StratificationCategoryID2', 'StratificationID2',
    'StratificationCategoryID3', 'StratificationID3',
]

# Drop those columns and switch to the shorter column names in one chained step.
data = (
    data
    .drop(columns=columns_to_drop)
    .rename(columns={
        'LocationDesc': 'State',
        'LocationAbbr': 'StateAbbr',
        'YearStart': 'Year',
        'DataSource': 'Source',
    })
)




## Data Organization for Model

In [7]:

## Write a function to process the data
def process_question_data(data_frame, column_name):
    """Average one question's 'Crude Prevalence' values per state, year and race/ethnicity.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Rows for a single question. The columns 'DataValueType', 'DataValue',
        'State', 'Year', 'StateAbbr', 'Stratification1' and
        'StratificationCategory1' are read.
    column_name : str
        Name given to the averaged 'DataValue' column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'State', 'Year', 'StateAbbr', 'Race/Ethnicity' and
        ``column_name``. Only groups whose 'StratificationCategory1' is
        'Race/Ethnicity' are kept; rows with any other 'DataValueType' than
        'Crude Prevalence' are discarded before averaging. The input frame is
        not modified.
    """
    grouping_keys = ['State', 'Year', 'StateAbbr', 'Stratification1', 'StratificationCategory1']

    # Crude prevalence is used since it has less missingness.
    is_crude = data_frame['DataValueType'] == 'Crude Prevalence'
    averaged = (
        data_frame[is_crude]
        .groupby(grouping_keys)['DataValue']
        .mean()
        .reset_index()
        .rename(columns={'DataValue': column_name})
    )

    # Keep the race/ethnicity stratification only; the category column is then
    # constant, so it is dropped and the stratum column gets a descriptive name.
    is_race = averaged['StratificationCategory1'] == 'Race/Ethnicity'
    return (
        averaged[is_race]
        .rename(columns={'Stratification1': 'Race/Ethnicity'})
        .drop(columns=['StratificationCategory1'])
    )


## Create the dataframes - these are the single question ones
def _rows_for_question(frame, question_text):
    """Return the rows of ``frame`` whose 'Question' contains ``question_text``.

    The text is matched as a literal, case-sensitive substring
    (``regex=False``), so punctuation in a question is never interpreted as a
    regular-expression operator. Rows with a missing 'Question' count as
    non-matching (``na=False``) instead of yielding an NA mask that boolean
    indexing rejects.
    """
    matches = frame['Question'].str.contains(question_text, regex=False, na=False)
    return frame[matches]


# One frame per question. Each name is assigned exactly once (the repeated
# assignments of data_health_status and data_activity_limit were removed).
data_obesity = _rows_for_question(data, 'Obesity among adults')
data_aerobic = _rows_for_question(data, 'Met aerobic physical activity guideline for substantial health benefits, adults')
data_disability = _rows_for_question(data, 'Adults with any disability')
data_depression = _rows_for_question(data, 'Depression among adults')
data_mental_distress = _rows_for_question(data, 'Frequent mental distress among adults')
data_mental_unhealthy = _rows_for_question(data, 'Average mentally unhealthy days among adults')
data_alcohol_percap = _rows_for_question(data, 'Per capita alcohol consumption among people aged 14 years and older')
data_alcohol_binge = _rows_for_question(data, 'Binge drinking prevalence among adults')
data_sleep = _rows_for_question(data, 'Short sleep duration among adults')
data_veggies = _rows_for_question(data, 'Consumed vegetables less than one time daily among adults')
data_fruit = _rows_for_question(data, 'Consumed fruit less than one time daily among adults')
data_chronic_health = _rows_for_question(data, '2 or more chronic conditions among adults')
data_life_exp = _rows_for_question(data, 'Life expectancy at birth')
data_health_status = _rows_for_question(data, 'Fair or poor self-rated health status among adults')
data_activity_limit = _rows_for_question(data, 'Recent activity limitation among adults')
data_phys_unhealthy = _rows_for_question(data, 'Average recent physically unhealthy days among adults')
data_phys_distress = _rows_for_question(data, 'Frequent physical distress among adults')
data_diabetes = _rows_for_question(data, 'Diabetes among adults')
data_asthma = _rows_for_question(data, 'Current asthma among adults')
data_dentist = _rows_for_question(data, 'Visited dentist or dental clinic in the past year among adults')
data_blood_pressure = _rows_for_question(data, 'High blood pressure among adults')
data_joint_pain = _rows_for_question(data, 'Severe joint pain among adults with arthritis')
data_inactivity = _rows_for_question(data, 'Physical inactivity among adults with arthritis')
data_cholesterol = _rows_for_question(data, 'High cholesterol among adults who have been screened')
data_no_activity = _rows_for_question(data, 'No leisure-time physical activity among adults')
data_unemployment = _rows_for_question(data, 'Unemployment rate among people 16 years and older in the labor force')
data_copd = _rows_for_question(data, 'Chronic obstructive pulmonary disease among adults')
data_checkup = _rows_for_question(data, 'Routine checkup within the past year among adults')
data_smoking = _rows_for_question(data, 'Current cigarette smoking among adults')
data_medication = _rows_for_question(data, 'Taking medicine for high cholesterol among adults')
data_poverty = _rows_for_question(data, 'Living below 150% of the poverty threshold among all people')
data_food_insecure = _rows_for_question(data, 'Food insecure in the past 12 months among households')
data_teeth = _rows_for_question(data, 'No teeth lost among adults aged 18-64 years')
data_transport = _rows_for_question(data, 'Lack of reliable transportation in the past 12 months among adults')
data_support = _rows_for_question(data, 'Lack of social and emotional support needed among adults')
data_bills = _rows_for_question(data, 'Unable to pay mortgage, rent, or utility bills in the past 12 months among adults')



## Call The Function to get data
# Each per-question frame is rebound to its processed form; `column_name`
# is the label the averaged value column carries in the merged table.
data_obesity = process_question_data(data_frame=data_obesity, column_name='Obesity Rate')
data_aerobic = process_question_data(data_frame=data_aerobic, column_name='met aerobic fitness level')
data_disability = process_question_data(data_frame=data_disability, column_name='disability rate')
data_depression = process_question_data(data_frame=data_depression, column_name='depression rate')
data_mental_distress = process_question_data(data_frame=data_mental_distress, column_name='mental distress rate')
data_mental_unhealthy = process_question_data(data_frame=data_mental_unhealthy, column_name='unhealthy mental days')
data_alcohol_binge = process_question_data(data_frame=data_alcohol_binge, column_name='binge drinking rate')
data_alcohol_percap = process_question_data(data_frame=data_alcohol_percap, column_name='per capita alcohol consumption')
data_sleep = process_question_data(data_frame=data_sleep, column_name='short sleep duration rate')
data_veggies = process_question_data(data_frame=data_veggies, column_name='veggie consumption rate')
data_fruit = process_question_data(data_frame=data_fruit, column_name='fruit consumption rate')
data_chronic_health = process_question_data(data_frame=data_chronic_health, column_name='2 or more chronic conditions rate')
data_life_exp = process_question_data(data_frame=data_life_exp, column_name='life expectancy')
data_health_status = process_question_data(data_frame=data_health_status, column_name='fair or poor health rate')
data_activity_limit = process_question_data(data_frame=data_activity_limit, column_name='activity limitation rate')
data_phys_unhealthy = process_question_data(data_frame=data_phys_unhealthy, column_name='physically unhealthy days')
data_phys_distress = process_question_data(data_frame=data_phys_distress, column_name='physical distress rate')
data_diabetes = process_question_data(data_frame=data_diabetes, column_name='diabetes rate')
data_asthma = process_question_data(data_frame=data_asthma, column_name='asthma rate')
data_dentist = process_question_data(data_frame=data_dentist, column_name='dentist visit rate')
data_blood_pressure = process_question_data(data_frame=data_blood_pressure, column_name='high blood pressure rate')
data_joint_pain = process_question_data(data_frame=data_joint_pain, column_name='severe joint pain rate')
data_inactivity = process_question_data(data_frame=data_inactivity, column_name='inactivity rate')
data_cholesterol = process_question_data(data_frame=data_cholesterol, column_name='high cholesterol rate')
data_no_activity = process_question_data(data_frame=data_no_activity, column_name='no activity rate')
data_unemployment = process_question_data(data_frame=data_unemployment, column_name='unemployment rate')
data_copd = process_question_data(data_frame=data_copd, column_name='copd rate')
data_checkup = process_question_data(data_frame=data_checkup, column_name='checkup rate')
data_smoking = process_question_data(data_frame=data_smoking, column_name='smoking rate')
data_medication = process_question_data(data_frame=data_medication, column_name='medication rate')
data_poverty = process_question_data(data_frame=data_poverty, column_name='poverty rate')
data_food_insecure = process_question_data(data_frame=data_food_insecure, column_name='food insecurity rate')
data_teeth = process_question_data(data_frame=data_teeth, column_name='teeth rate')
data_transport = process_question_data(data_frame=data_transport, column_name='transport rate')
data_support = process_question_data(data_frame=data_support, column_name='support rate')
data_bills = process_question_data(data_frame=data_bills, column_name='bills rate')



## merge the dataframes together for model
data_frames = [
    data_obesity, data_aerobic, data_disability, data_depression,
    data_mental_distress, data_mental_unhealthy, data_alcohol_binge, data_alcohol_percap,
    data_sleep, data_veggies, data_fruit, data_chronic_health,
    data_life_exp, data_health_status, data_activity_limit, data_phys_unhealthy,
    data_phys_distress, data_diabetes, data_asthma, data_dentist,
    data_blood_pressure, data_joint_pain, data_inactivity, data_cholesterol,
    data_no_activity, data_unemployment, data_copd, data_checkup,
    data_smoking, data_medication, data_poverty, data_food_insecure,
    data_teeth, data_transport, data_support, data_bills,
]

# Outer-join every frame onto the first one, one at a time, on the shared
# state / race-ethnicity / year keys so no key combination is lost.
merge_keys = ['State', 'StateAbbr', 'Race/Ethnicity', 'Year']
merged_data = data_frames[0]
for df in data_frames[1:]:
    merged_data = merged_data.merge(df, on=merge_keys, how='outer')


## remove rows where obesity rate is nan - this is the outcome so do not want to impute
merged_data = merged_data.dropna(subset=['Obesity Rate'])



