## Final Project - Machine Learning Model

In [15]:
## load the data

import pandas as pd
import numpy as np
import plotly.express as px
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import matplotlib.pyplot as plt
import seaborn as sns
import os
from joblib import dump






In [16]:
data = pd.read_csv('U.S._Chronic_Disease_Indicators.csv')
data_raw = pd.read_csv('U.S._Chronic_Disease_Indicators.csv')

### From Data Cleaning Module

In [17]:
##Exploring the data
data = pd.read_csv('U.S._Chronic_Disease_Indicators.csv')

##get me alist with all Question possibilities
questions = data['Question'].unique()
topic = data['Topic'].unique()

## dataset for nutrition
data_nutrition = data[data['Topic'] == 'Nutrition, Physical Activity, and Weight Status']

## Getting the questions out of each subject
questions_nutrition = data[data['Topic'] == 'Nutrition, Physical Activity, and Weight Status']['Question'].unique()
questions_disability = data[data['Topic'] == 'Disability']['Question'].unique()
questions_healthsts = data[data['Topic'] == 'Health Status']['Question'].unique()
questions_alcohol = data[data['Topic'] == 'Alcohol']['Question'].unique()
questions_sleep = data[data['Topic'] == 'Sleep']['Question'].unique()
questions_mental = data[data['Topic'] == 'Mental Health']['Question'].unique()


## Remove some unused columns
columns_to_drop = ['StratificationCategory2', 'Stratification2', 'StratificationCategory3', 'Stratification3', 'Geolocation', 'YearEnd', 'Response',
                   'DataValueFootnoteSymbol', 'LowConfidenceLimit', 'HighConfidenceLimit', 'LocationID', 'TopicID', 'QuestionID', 'ResponseID',
                   'DataValueTypeID', 'StratificationCategoryID1', 'StratificationID1', 'StratificationCategoryID2', 'StratificationID2',
                   'StratificationCategoryID3', 'StratificationID3']
data =  data.drop(columns=columns_to_drop)

data = data.rename(columns={
    'LocationDesc': 'State',
    'LocationAbbr': 'StateAbbr',
    'YearStart': 'Year',
    'DataSource': 'Source',
})




## Data Organization for Model

In [18]:
## Set up the working dataframe, doing this by parts then remerging, use functions so you can rip through them quickly

## Create the dataframes - these are the single question ones
data_obesity = data[data['Question'].str.contains('Obesity among adults')]
data_aerobic = data[data['Question'].str.contains('Met aerobic physical activity guideline for substantial health benefits, adults')]
data_disability = data[data['Question'].str.contains('Adults with any disability')]
data_depression = data[data['Question'].str.contains('Depression among adults')]
data_mental_distress = data[data['Question'].str.contains('Frequent mental distress among adults')]
data_mental_unhealthy = data[data['Question'].str.contains('Average mentally unhealthy days among adults')]
data_alcohol_percap = data[data['Question'].str.contains('Per capita alcohol consumption among people aged 14 years and older')]
data_alcohol_binge = data[data['Question'].str.contains('Binge drinking prevalence among adults')]
data_sleep = data[data['Question'].str.contains('Short sleep duration among adults')]
data_veggies = data[data['Question'].str.contains('Consumed vegetables less than one time daily among adults')]
data_fruit = data[data['Question'].str.contains('Consumed fruit less than one time daily among adults')]
data_chronic_health = data[data['Question'].str.contains('2 or more chronic conditions among adults')]
data_life_exp = data[data['Question'].str.contains('Life expectancy at birth')]
data_health_status = data[data['Question'].str.contains('Fair or poor self-rated health status among adults')]
data_activity_limit = data[data['Question'].str.contains('Recent activity limitation among adults')]
data_phys_unhealthy = data[data['Question'].str.contains('Average recent physically unhealthy days among adults')]
data_phys_distress = data[data['Question'].str.contains('Frequent physical distress among adults')]
data_diabetes = data[data['Question'].str.contains('Diabetes among adults')]
data_asthma = data[data['Question'].str.contains('Current asthma among adults')]
data_dentist = data[data['Question'].str.contains('Visited dentist or dental clinic in the past year among adults')]
data_blood_pressure = data[data['Question'].str.contains('High blood pressure among adults')]
data_joint_pain = data[data['Question'].str.contains('Severe joint pain among adults with arthritis')]
data_health_status = data[data['Question'].str.contains('Fair or poor self-rated health status among adults')]
data_inactivity = data[data['Question'].str.contains('Physical inactivity among adults with arthritis')]
data_cholesterol = data[data['Question'].str.contains('High cholesterol among adults who have been screened')]
data_no_activity = data[data['Question'].str.contains('No leisure-time physical activity among adults')]
data_activity_limit = data[data['Question'].str.contains('Recent activity limitation among adults')]
data_unemployment = data[data['Question'].str.contains('Unemployment rate among people 16 years and older in the labor force')]
data_copd = data[data['Question'].str.contains('Chronic obstructive pulmonary disease among adults')]
data_checkup = data[data['Question'].str.contains('Routine checkup within the past year among adults')]
data_smoking = data[data['Question'].str.contains('Current cigarette smoking among adults')]
data_medication = data[data['Question'].str.contains('Taking medicine for high cholesterol among adults')]
data_poverty = data[data['Question'].str.contains('Living below 150% of the poverty threshold among all people')]
data_food_insecure = data[data['Question'].str.contains('Food insecure in the past 12 months among households')]
data_teeth = data[data['Question'].str.contains('No teeth lost among adults aged 18-64 years')]
data_transport = data[data['Question'].str.contains('Lack of reliable transportation in the past 12 months among adults')]
data_support = data[data['Question'].str.contains('Lack of social and emotional support needed among adults')]
data_bills = data[data['Question'].str.contains('Unable to pay mortgage, rent, or utility bills in the past 12 months among adults')]



## Create the function for layering in new columns (ie: obesity rate)

def process_question_data(data_frame, column_name):
    # Filter by 'DataValueType' and calculate mean by grouping
    data_frame = data_frame[data_frame['DataValueType'] == 'Crude Prevalence']  ##use this since less missingness
    data_frame = data_frame.groupby(['State', 'Year', 'StateAbbr', 'Stratification1', 'StratificationCategory1'])['DataValue'].mean()
    data_frame = data_frame.reset_index()

    # Rename columns
    data_frame = data_frame.rename(columns={'DataValue': column_name})

    # Filter by 'StratificationCategory1' and rename columns
    filtered_data = data_frame[data_frame['StratificationCategory1'] == 'Race/Ethnicity']
    filtered_data = filtered_data.rename(columns={'Stratification1': 'Race/Ethnicity'})
    filtered_data = filtered_data.drop(columns=['StratificationCategory1'])

    return filtered_data



## Call The Functions
data_obesity = process_question_data(data_obesity, 'Obesity Rate')
data_aerobic = process_question_data(data_aerobic, 'met aerobic fitness level')
data_disability = process_question_data(data_disability, 'disability rate')
data_depression = process_question_data(data_depression, 'depression rate')
data_mental_distress = process_question_data(data_mental_distress, 'mental distress rate')
data_mental_unhealthy = process_question_data(data_mental_unhealthy, 'unhealthy mental days')
data_alcohol_binge = process_question_data(data_alcohol_binge, 'binge drinking rate')
data_alcohol_percap = process_question_data(data_alcohol_percap, 'per capita alcohol consumption')
data_sleep = process_question_data(data_sleep, 'short sleep duration rate')
data_veggies = process_question_data(data_veggies, 'veggie consumption rate')
data_fruit = process_question_data(data_fruit, 'fruit consumption rate')
data_chronic_health = process_question_data(data_chronic_health, '2 or more chronic conditions rate')
data_life_exp = process_question_data(data_life_exp, 'life expectancy')
data_health_status = process_question_data(data_health_status, 'fair or poor health rate')
data_activity_limit = process_question_data(data_activity_limit, 'activity limitation rate')
data_phys_unhealthy = process_question_data(data_phys_unhealthy, 'physically unhealthy days')
data_phys_distress = process_question_data(data_phys_distress, 'physical distress rate')
data_diabetes = process_question_data(data_diabetes, 'diabetes rate')
data_asthma = process_question_data(data_asthma, 'asthma rate')
data_dentist = process_question_data(data_dentist, 'dentist visit rate')
data_blood_pressure = process_question_data(data_blood_pressure, 'high blood pressure rate')
data_joint_pain = process_question_data(data_joint_pain, 'severe joint pain rate')
data_inactivity = process_question_data(data_inactivity, 'inactivity rate')
data_cholesterol = process_question_data(data_cholesterol, 'high cholesterol rate')
data_no_activity = process_question_data(data_no_activity, 'no activity rate')
data_unemployment = process_question_data(data_unemployment, 'unemployment rate')
data_copd = process_question_data(data_copd, 'copd rate')
data_checkup = process_question_data(data_checkup, 'checkup rate')
data_smoking = process_question_data(data_smoking, 'smoking rate')
data_medication = process_question_data(data_medication, 'medication rate')
data_poverty = process_question_data(data_poverty, 'poverty rate')
data_food_insecure = process_question_data(data_food_insecure, 'food insecurity rate')
data_teeth = process_question_data(data_teeth, 'teeth rate')
data_transport = process_question_data(data_transport, 'transport rate')
data_support = process_question_data(data_support, 'support rate')
data_bills = process_question_data(data_bills, 'bills rate')



## merge the dataframes, do this with a loop
data_frames = [data_obesity, data_aerobic, data_disability, data_depression, data_mental_distress, data_mental_unhealthy, data_alcohol_binge, data_alcohol_percap,
               data_sleep, data_veggies, data_fruit, data_chronic_health, data_life_exp, data_health_status, data_activity_limit, data_phys_unhealthy, data_phys_distress,
               data_diabetes, data_asthma, data_dentist, data_blood_pressure, data_joint_pain, data_inactivity, data_cholesterol, data_no_activity, data_unemployment,
               data_copd, data_checkup, data_smoking, data_medication, data_poverty, data_food_insecure, data_teeth, data_transport, data_support, data_bills,]

merged_data = data_frames[0]
for df in data_frames[1:]:
    merged_data = pd.merge(merged_data, df, on=['State', 'StateAbbr', 'Race/Ethnicity', 'Year'], how='outer')


## remove rows where obesity rate is nan - this is the outcome
merged_data = merged_data[merged_data['Obesity Rate'].notna()]





In [19]:
data_obesity

Unnamed: 0,State,Year,StateAbbr,Race/Ethnicity,Obesity Rate
3,Alabama,2019,AL,"American Indian or Alaska Native, non-Hispanic",30.9
4,Alabama,2019,AL,"Asian, non-Hispanic",
5,Alabama,2019,AL,"Black, non-Hispanic",46.6
7,Alabama,2019,AL,"Hawaiian or Pacific Islander, non-Hispanic",
8,Alabama,2019,AL,Hispanic,35.7
...,...,...,...,...,...
2852,Wyoming,2022,WY,"Black, non-Hispanic",
2854,Wyoming,2022,WY,"Hawaiian or Pacific Islander, non-Hispanic",
2855,Wyoming,2022,WY,Hispanic,46.3
2857,Wyoming,2022,WY,"Multiracial, non-Hispanic",


In [20]:
merged_data


Unnamed: 0,State,Year,StateAbbr,Race/Ethnicity,Obesity Rate,met aerobic fitness level,disability rate,depression rate,mental distress rate,unhealthy mental days,...,copd rate,checkup rate,smoking rate,medication rate,poverty rate,food insecurity rate,teeth rate,transport rate,support rate,bills rate
0,Alabama,2019,AL,"American Indian or Alaska Native, non-Hispanic",30.9,49.4,54.5,25.7,20.2,,...,17.1,68.3,30.5,31.3,23.9,,,,,
2,Alabama,2019,AL,"Black, non-Hispanic",46.6,39.7,36.2,17.0,16.0,,...,7.0,86.3,20.4,32.5,36.7,,,,,
4,Alabama,2019,AL,Hispanic,35.7,49.6,24.3,24.7,,,...,,59.9,20.4,27.3,42.6,,,,,
5,Alabama,2019,AL,"Multiracial, non-Hispanic",32.8,41.8,49.5,46.6,23.2,,...,,77.4,31.4,39.1,31.5,,,,,
6,Alabama,2019,AL,"White, non-Hispanic",32.6,48.0,35.1,26.7,18.1,,...,11.0,77.0,19.8,39.0,19.7,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,Wyoming,2021,WY,"Multiracial, non-Hispanic",43.5,,35.6,46.4,27.6,,...,,62.0,32.0,,18.3,,,,,
1532,Wyoming,2021,WY,"White, non-Hispanic",31.3,,28.5,20.7,15.0,,...,6.4,69.3,16.2,32.4,18.1,,,,,
1533,Wyoming,2022,WY,"American Indian or Alaska Native, non-Hispanic",40.5,,30.6,21.9,,,...,,75.7,,,,,,,,
1537,Wyoming,2022,WY,Hispanic,46.3,,29.8,21.6,17.9,,...,,65.4,11.0,,,,67.5,,10.4,13.8


## ML Set Up

We are predicting Obesity rate based off of other health metric.  Make sure to use metrics that indicate health (ie: no mortality rate from a disease but rather the prevalence of the disease).

1) Linear
2) Random Forest
3) XGBoost

In [21]:
## load sklearn packages
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score



## merge data & Create X and Y
data_ml = merged_data
Y = data_ml['Obesity Rate']
X = data_ml.drop(columns=['State', 'StateAbbr', 'Year', 'Race/Ethnicity', 'Obesity Rate'])

##linear can't hande NoN so need to impute

##first remove columns that have more that 80% missing data
X_lin = X.replace(np.nan, 'NaN', regex=True)
missing_data = X.isnull().mean()
cols_to_drop = missing_data[missing_data > 0.8].index
X_lin = X.drop(columns=cols_to_drop)
imputer = SimpleImputer(strategy='mean')
X_lin = pd.DataFrame(imputer.fit_transform(X_lin), columns=X_lin.columns)

### Linear Model

In [22]:
# Set up training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_lin, Y, test_size=0.2, random_state=42)

# set up linear regression model
linear_regressor = LinearRegression()

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(linear_regressor, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
cv_scores_rmse = (-cv_scores) ** 0.5  # Convert negative MSE to RMSE

# Fit the model & make predictions
linear_regressor.fit(X_train, y_train)
y_pred = linear_regressor.predict(X_test)

# Calculate the various metrics
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
test_r2 = r2_score(y_test, y_pred)

print("Cross-Validation RMSE Scores:", cv_scores_rmse)
print("Mean CV RMSE:", cv_scores_rmse.mean())
print("Test RMSE:", test_rmse)
print("Test R2 Score:", test_r2)



# Save the model
dump(linear_regressor, 'linear_regressor_model.pkl')
print("Model saved as linear_regressor_model.pkl")


Cross-Validation RMSE Scores: [6.3755119  6.15119402 7.06670772 6.75798881 7.0518673 ]
Mean CV RMSE: 6.680653950275212
Test RMSE: 6.206403697100109
Test R2 Score: 0.5649290700189126
Model saved as linear_regressor_model.pkl


In [23]:
X_train.columns

Index(['met aerobic fitness level', 'disability rate', 'depression rate',
       'mental distress rate', 'binge drinking rate',
       'short sleep duration rate', 'veggie consumption rate',
       'fruit consumption rate', '2 or more chronic conditions rate',
       'fair or poor health rate', 'physical distress rate', 'diabetes rate',
       'asthma rate', 'dentist visit rate', 'high blood pressure rate',
       'severe joint pain rate', 'inactivity rate', 'high cholesterol rate',
       'no activity rate', 'unemployment rate', 'copd rate', 'checkup rate',
       'smoking rate', 'medication rate', 'poverty rate', 'teeth rate'],
      dtype='object')

In [24]:
## get the top feature importnaces
feature_importances = pd.Series(linear_regressor.coef_, index=X_train.columns).sort_values(ascending=False)
print(feature_importances)


medication rate                      0.698961
high blood pressure rate             0.349696
diabetes rate                        0.341789
poverty rate                         0.320371
binge drinking rate                  0.222938
disability rate                      0.174625
veggie consumption rate              0.143248
no activity rate                     0.111730
fruit consumption rate               0.110960
asthma rate                          0.110584
depression rate                      0.099033
checkup rate                         0.084634
short sleep duration rate            0.075299
mental distress rate                 0.056361
fair or poor health rate             0.048210
inactivity rate                     -0.015896
unemployment rate                   -0.040542
copd rate                           -0.042706
dentist visit rate                  -0.062619
smoking rate                        -0.078246
severe joint pain rate              -0.081704
met aerobic fitness level         

### Random Forest

In [25]:

# Set up training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Set parameters for the random forest model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf_regressor, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
cv_scores_rmse = (-cv_scores) ** 0.5  # Convert negative MSE to RMSE

# Fit the model & make predictions
rf_regressor.fit(X_train, y_train)
y_pred = rf_regressor.predict(X_test)

# Calculate the various metrics
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
test_r2 = r2_score(y_test, y_pred)

# print out the metrics
print("Cross-Validation RMSE Scores:", cv_scores_rmse)
print("Mean CV RMSE:", cv_scores_rmse.mean())
print("Test RMSE:", test_rmse)
print("Test R2 Score:", test_r2)



ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jianxiongshen/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jianxiongshen/anaconda3/lib/python3.11/site-packages/sklearn/ensemble/_forest.py", line 345, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/Users/jianxiongshen/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jianxiongshen/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1106, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/Users/jianxiongshen/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/Users/jianxiongshen/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 161, in _assert_all_finite
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


In [15]:
## Get some metrics here such as msot influential features
feature_importances = rf_regressor.feature_importances_
feature_names = X.columns
feature_importances_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
feature_importances_df = feature_importances_df.sort_values('importance', ascending=False)
feature_importances_df.head(10)


Unnamed: 0,feature,importance
1,disability rate,0.380376
16,diabetes rate,0.118033
12,fair or poor health rate,0.058082
23,no activity rate,0.051369
2,depression rate,0.049334
27,smoking rate,0.041683
29,poverty rate,0.028467
31,teeth rate,0.028341
10,2 or more chronic conditions rate,0.025769
26,checkup rate,0.024689


## XGBoost

In [28]:
# Set up training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Set parameters for the XGBoost model
xgb_regressor = xgb.XGBRegressor(n_estimators=100, random_state=42)

# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgb_regressor, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
cv_scores_rmse = (-cv_scores) ** 0.5  # Convert negative MSE to RMSE

# Fit the model & make predictions
xgb_regressor.fit(X_train, y_train)
y_pred = xgb_regressor.predict(X_test)

# Calculate the various metrics
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
test_r2 = r2_score(y_test, y_pred)

# Print out the metrics
print("Cross-Validation RMSE Scores:", cv_scores_rmse)
print("Mean CV RMSE:", cv_scores_rmse.mean())
print("Test RMSE:", test_rmse)
print("Test R2 Score:", test_r2)

dump(xgb_regressor, 'xgb_regressor_model.pkl')


Cross-Validation RMSE Scores: [6.11977232 6.53564812 6.4947051  5.77714743 6.00143056]
Mean CV RMSE: 6.185740703498733
Test RMSE: 5.790605699961263
Test R2 Score: 0.6212714795827654


['xgb_regressor_model.pkl']

In [31]:
X_train

Unnamed: 0,met aerobic fitness level,disability rate,depression rate,mental distress rate,unhealthy mental days,binge drinking rate,per capita alcohol consumption,short sleep duration rate,veggie consumption rate,fruit consumption rate,...,copd rate,checkup rate,smoking rate,medication rate,poverty rate,food insecurity rate,teeth rate,transport rate,support rate,bills rate
43,,22.2,,,,,,,27.4,53.1,...,,52.0,,23.0,9.8,,,,,
412,,32.2,18.8,13.2,,15.4,,,14.2,41.2,...,6.5,79.7,12.5,34.7,14.0,,,,,
746,,34.4,26.9,24.9,,19.3,,,23.2,35.4,...,,68.7,22.7,27.4,28.9,,,,,
977,,31.4,14.3,11.9,,19.7,,27.5,,,...,3.2,64.4,8.6,,,,61.0,14.2,12.5,26.8
1446,,28.5,15.6,15.3,,16.1,,,30.8,31.0,...,2.0,65.2,8.2,24.3,27.2,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,64.5,27.8,21.2,12.5,,15.0,,,23.2,36.7,...,,71.4,23.9,30.7,20.5,,,,,
396,41.6,24.1,14.0,9.8,,19.3,,,42.8,34.0,...,1.9,68.8,12.2,25.5,25.8,,,,,
1301,,26.0,20.6,13.7,,16.3,,30.6,,,...,7.7,76.0,14.9,,,,67.8,,,
638,,13.4,14.3,7.4,,9.0,,30.5,,,...,,74.8,5.2,,,,78.3,6.3,10.4,


In [35]:
cols_to_drop

Index(['unhealthy mental days', 'per capita alcohol consumption',
       'life expectancy', 'activity limitation rate',
       'physically unhealthy days', 'food insecurity rate', 'transport rate',
       'support rate', 'bills rate'],
      dtype='object')

In [27]:
## get the top feature importances
feature_importances = xgb_regressor.feature_importances_
feature_names = X.columns
feature_importances_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
feature_importances_df = feature_importances_df.sort_values('importance', ascending=False)
feature_importances_df.head(10)

Unnamed: 0,feature,importance
1,disability rate,0.134688
16,diabetes rate,0.118427
12,fair or poor health rate,0.105206
29,poverty rate,0.080322
27,smoking rate,0.070418
23,no activity rate,0.062174
31,teeth rate,0.049359
17,asthma rate,0.046743
26,checkup rate,0.026741
19,high blood pressure rate,0.026573


### Summary of Findings

1) Disability Rate
2) Diabetes
3) ...