## Import packages and define source and result file locations

In [1]:
# package imports go here
import pandas as pd
import numpy as np
import fastparquet as fp
import os
import sys
import pickle
import importlib

sys.path.insert(1, '../pkgs')
import ml_functions as mlfuncs
# import ml_clean_feature as mlclean
import ml_clean_config as mlconfigs

In [2]:
# Survey year plus the input/output locations used throughout the notebook.
year = 2021

# Input: the BRFSS parquet file for `year`; its path is produced by
# mlfuncs.brfss_parquet_file from the source directory.
source_path = "../data/brfss/"
source_file = mlfuncs.brfss_parquet_file(source_path, year)

# Output: parquet files written after the initial and the final cleaning pass.
result_path = "../data/"
clean_file_init = f"{result_path}brfss_{year}_clean_init.parquet.gzip"
clean_file_final = f"{result_path}brfss_{year}_clean_final.parquet.gzip"

## Analyze the set of all candidate features

- Reduce full set of features to just the candidate features
- Check for features that have too many null values (>70K)
- Remove features that have too many null values
- Move on to cleaning of each feature

In [3]:
# Full candidate dataset: every BRFSS column considered for the diabetes model.
# The order of this list fixes the column order of the feature dataframe.
diabetes_features_2021_all_candidates = [
    "GENHLTH", "PHYSHLTH", "MENTHLTH", "PRIMINSR",
    "PERSDOC3", "CHECKUP1", "EXERANY2", "BPHIGH6",
    "BPMEDS", "CHOLCHK3", "TOLDHI3", "CHOLMED3",
    "CVDCRHD4", "CVDSTRK3", "ADDEPEV3", "CHCKDNY2",
    "DIABETE4", "MARITAL", "EDUCA", "RENTHOM1",
    "EMPLOY1", "INCOME3", "WEIGHT2", "DEAF",
    "BLIND", "DIFFWALK", "FLUSHOT7", "PREDIAB1",
    "CHKHEMO3", "EYEEXAM1", "TOLDCFS", "HAVECFS",
    "TOLDHEPC", "HAVEHEPB", "HPVADVC4", "SHINGLE2",
    "CIMEMLOS", "CDDISCUS", "MSCODE", "_IMPRACE",
    "_RFHLTH", "_HLTHPLN", "_TOTINDA", "_MICHD",
    "_PRACE1", "_RACE", "_RACEGR3", "_SEX",
    "_AGEG5YR", "WTKG3", "_BMI5", "_BMI5CAT",
    "_EDUCAG", "_INCOMG1", "_SMOKER3", "_RFSMOK3",
    "_CURECI1", "_DRNKWK1", "_RFDRHV7", "FTJUDA2_",
    "FRUTDA2_", "GRENDA1_", "FRNCHDA_", "POTADA1_",
    "VEGEDA2_", "_FRUTSU1", "_VEGESU1", "_FRTLT1A",
]


In [4]:
# Read in full BRFSS 2021 dataset
# (every column is loaded here; the candidate subset is selected in the next cell)
df = pd.read_parquet(source_file, engine="fastparquet")


In [5]:
# Create feature dataframe with only candidate diabetes features
# .copy() makes feature_df independent of df, so the in-place edits below leave df untouched
feature_df = df[diabetes_features_2021_all_candidates].copy()

In [6]:
# Describe the data
feature_df.describe()


Unnamed: 0,GENHLTH,PHYSHLTH,MENTHLTH,PRIMINSR,PERSDOC3,CHECKUP1,EXERANY2,BPHIGH6,BPMEDS,CHOLCHK3,...,_RFDRHV7,FTJUDA2_,FRUTDA2_,GRENDA1_,FRNCHDA_,POTADA1_,VEGEDA2_,_FRUTSU1,_VEGESU1,_FRTLT1A
count,438689.0,438690.0,438691.0,438690.0,438691.0,438691.0,438691.0,438691.0,172133.0,438691.0,...,438693.0,394344.0,394742.0,394443.0,393928.0,390253.0,390165.0,387606.0,378566.0,438693.0
mean,2.524761,63.190139,59.923347,10.614445,1.57887,1.470787,1.258043,2.238437,1.187861,2.720555,...,1.692548,44.59572,134.1733,78.90303,25.71366,28.51175,139.6707,178.343,271.5442,2.270561
std,1.082066,36.222075,37.47268,24.795124,0.89225,1.128487,0.522586,1.053716,0.507136,1.717564,...,2.163298,365.2686,500.6682,463.4876,173.6687,178.7448,651.1466,691.2931,1036.227,2.485479
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,5.397605e-79,5.397605e-79,5.397605e-79,5.397605e-79,5.397605e-79,5.397605e-79,5.397605e-79,5.397605e-79,1.0
25%,2.0,25.0,15.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,1.0,5.397605e-79,43.0,14.0,3.0,7.0,43.0,57.0,114.0,1.0
50%,2.0,88.0,88.0,3.0,1.0,1.0,1.0,3.0,1.0,2.0,...,1.0,3.0,100.0,43.0,14.0,14.0,100.0,100.0,167.0,1.0
75%,3.0,88.0,88.0,3.0,2.0,1.0,1.0,3.0,1.0,3.0,...,1.0,33.0,100.0,71.0,29.0,29.0,100.0,200.0,229.0,2.0
max,9.0,99.0,99.0,99.0,9.0,9.0,9.0,9.0,9.0,9.0,...,9.0,9900.0,9900.0,9900.0,9900.0,9900.0,9900.0,19800.0,39600.0,9.0


In [7]:
# Determine features with too many null values (>70K)
max_allowed_nulls = 70000

# Null count per column, computed in a single pass over the frame.
null_counts = feature_df.isna().sum()

# Report every column and collect the ones above the threshold.
maxed_nulls = []
for cnt, (column, tot) in enumerate(null_counts.items(), start=1):
    print(f"Feature[{cnt}]: {column}: {tot}")
    if tot > max_allowed_nulls:
        maxed_nulls.append(f'{column}')

print (f"features that exceeded max nulls: \n{maxed_nulls}")

Feature[1]: GENHLTH: 4
Feature[2]: PHYSHLTH: 3
Feature[3]: MENTHLTH: 2
Feature[4]: PRIMINSR: 3
Feature[5]: PERSDOC3: 2
Feature[6]: CHECKUP1: 2
Feature[7]: EXERANY2: 2
Feature[8]: BPHIGH6: 2
Feature[9]: BPMEDS: 266560
Feature[10]: CHOLCHK3: 2
Feature[11]: TOLDHI3: 60836
Feature[12]: CHOLMED3: 61571
Feature[13]: CVDCRHD4: 2
Feature[14]: CVDSTRK3: 2
Feature[15]: ADDEPEV3: 3
Feature[16]: CHCKDNY2: 3
Feature[17]: DIABETE4: 3
Feature[18]: MARITAL: 5
Feature[19]: EDUCA: 5
Feature[20]: RENTHOM1: 8
Feature[21]: EMPLOY1: 3588
Feature[22]: INCOME3: 8847
Feature[23]: WEIGHT2: 11816
Feature[24]: DEAF: 14602
Feature[25]: BLIND: 15744
Feature[26]: DIFFWALK: 18009
Feature[27]: FLUSHOT7: 27648
Feature[28]: PREDIAB1: 324076
Feature[29]: CHKHEMO3: 416464
Feature[30]: EYEEXAM1: 416468
Feature[31]: TOLDCFS: 438693
Feature[32]: HAVECFS: 438693
Feature[33]: TOLDHEPC: 427889
Feature[34]: HAVEHEPB: 427914
Feature[35]: HPVADVC4: 425379
Feature[36]: SHINGLE2: 425141
Feature[37]: CIMEMLOS: 405542
Feature[38]: CDD

In [8]:
# Final set of 2021 features: the candidates minus the columns that exceeded
# the null threshold. Used when creating diabetes_features.md in step 3.
diabetes_features_2021 = [
    feature
    for feature in diabetes_features_2021_all_candidates
    if feature not in maxed_nulls
]

print(diabetes_features_2021)

['GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'PRIMINSR', 'PERSDOC3', 'CHECKUP1', 'EXERANY2', 'BPHIGH6', 'CHOLCHK3', 'TOLDHI3', 'CHOLMED3', 'CVDCRHD4', 'CVDSTRK3', 'ADDEPEV3', 'CHCKDNY2', 'DIABETE4', 'MARITAL', 'EDUCA', 'RENTHOM1', 'EMPLOY1', 'INCOME3', 'WEIGHT2', 'DEAF', 'BLIND', 'DIFFWALK', 'FLUSHOT7', '_IMPRACE', '_RFHLTH', '_HLTHPLN', '_TOTINDA', '_MICHD', '_PRACE1', '_RACE', '_RACEGR3', '_SEX', '_AGEG5YR', 'WTKG3', '_BMI5', '_BMI5CAT', '_EDUCAG', '_INCOMG1', '_SMOKER3', '_RFSMOK3', '_CURECI1', '_DRNKWK1', '_RFDRHV7', 'FTJUDA2_', 'FRUTDA2_', 'GRENDA1_', 'FRNCHDA_', 'POTADA1_', 'VEGEDA2_', '_FRUTSU1', '_VEGESU1', '_FRTLT1A']


In [9]:
# Drop features with too many null values.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell re-runnable: a second run no longer raises KeyError for columns that
# were already removed by the first run.
feature_df = feature_df.drop(columns=maxed_nulls, errors="ignore")
feature_df

Unnamed: 0,GENHLTH,PHYSHLTH,MENTHLTH,PRIMINSR,PERSDOC3,CHECKUP1,EXERANY2,BPHIGH6,CHOLCHK3,TOLDHI3,...,_RFDRHV7,FTJUDA2_,FRUTDA2_,GRENDA1_,FRNCHDA_,POTADA1_,VEGEDA2_,_FRUTSU1,_VEGESU1,_FRTLT1A
0,5.0,20.0,10.0,3.0,1.0,2.0,2.0,3.0,2.0,1.0,...,1.0,5.397605e-79,100.0,5.700000e+01,4.300000e+01,14.0,100.0,100.0,214.0,1.0
1,3.0,88.0,88.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,...,1.0,5.397605e-79,100.0,1.400000e+01,5.397605e-79,14.0,100.0,100.0,128.0,1.0
2,2.0,88.0,88.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,...,1.0,5.397605e-79,100.0,5.397605e-79,1.400000e+01,14.0,43.0,100.0,71.0,1.0
3,2.0,88.0,10.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,...,1.0,7.100000e+01,43.0,1.000000e+01,5.700000e+01,27.0,71.0,114.0,165.0,1.0
4,5.0,30.0,88.0,3.0,1.0,1.0,1.0,4.0,2.0,1.0,...,1.0,5.397605e-79,100.0,1.000000e+02,2.900000e+01,29.0,100.0,100.0,258.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438688,2.0,88.0,20.0,88.0,3.0,4.0,1.0,1.0,6.0,2.0,...,1.0,1.400000e+01,143.0,2.900000e+01,1.400000e+01,50.0,300.0,157.0,393.0,1.0
438689,3.0,88.0,88.0,77.0,1.0,1.0,2.0,1.0,2.0,2.0,...,1.0,1.000000e+02,100.0,1.000000e+02,1.400000e+01,14.0,29.0,200.0,157.0,1.0
438690,2.0,88.0,88.0,10.0,1.0,1.0,1.0,3.0,2.0,1.0,...,1.0,5.397605e-79,200.0,2.900000e+01,5.397605e-79,14.0,100.0,200.0,143.0,1.0
438691,2.0,88.0,88.0,3.0,2.0,1.0,1.0,1.0,2.0,2.0,...,1.0,5.397605e-79,100.0,4.300000e+01,5.397605e-79,13.0,100.0,100.0,156.0,1.0


In [10]:
# Keep only the rows that have a value in every remaining column
feature_df = feature_df.dropna()

In [11]:
feature_df

Unnamed: 0,GENHLTH,PHYSHLTH,MENTHLTH,PRIMINSR,PERSDOC3,CHECKUP1,EXERANY2,BPHIGH6,CHOLCHK3,TOLDHI3,...,_RFDRHV7,FTJUDA2_,FRUTDA2_,GRENDA1_,FRNCHDA_,POTADA1_,VEGEDA2_,_FRUTSU1,_VEGESU1,_FRTLT1A
0,5.0,20.0,10.0,3.0,1.0,2.0,2.0,3.0,2.0,1.0,...,1.0,5.397605e-79,100.0,5.700000e+01,4.300000e+01,14.0,100.0,100.0,214.0,1.0
2,2.0,88.0,88.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,...,1.0,5.397605e-79,100.0,5.397605e-79,1.400000e+01,14.0,43.0,100.0,71.0,1.0
3,2.0,88.0,10.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,...,1.0,7.100000e+01,43.0,1.000000e+01,5.700000e+01,27.0,71.0,114.0,165.0,1.0
4,5.0,30.0,88.0,3.0,1.0,1.0,1.0,4.0,2.0,1.0,...,1.0,5.397605e-79,100.0,1.000000e+02,2.900000e+01,29.0,100.0,100.0,258.0,1.0
5,3.0,88.0,88.0,3.0,1.0,1.0,2.0,3.0,2.0,2.0,...,1.0,5.397605e-79,29.0,1.400000e+01,5.397605e-79,14.0,14.0,29.0,42.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438687,4.0,88.0,88.0,3.0,1.0,3.0,1.0,1.0,4.0,1.0,...,1.0,2.000000e+02,50.0,2.000000e+02,1.400000e+01,14.0,71.0,250.0,299.0,1.0
438688,2.0,88.0,20.0,88.0,3.0,4.0,1.0,1.0,6.0,2.0,...,1.0,1.400000e+01,143.0,2.900000e+01,1.400000e+01,50.0,300.0,157.0,393.0,1.0
438690,2.0,88.0,88.0,10.0,1.0,1.0,1.0,3.0,2.0,1.0,...,1.0,5.397605e-79,200.0,2.900000e+01,5.397605e-79,14.0,100.0,200.0,143.0,1.0
438691,2.0,88.0,88.0,3.0,2.0,1.0,1.0,1.0,2.0,2.0,...,1.0,5.397605e-79,100.0,4.300000e+01,5.397605e-79,13.0,100.0,100.0,156.0,1.0


In [12]:
feature_df.dtypes

GENHLTH     float64
PHYSHLTH    float64
MENTHLTH    float64
PRIMINSR    float64
PERSDOC3    float64
CHECKUP1    float64
EXERANY2    float64
BPHIGH6     float64
CHOLCHK3    float64
TOLDHI3     float64
CHOLMED3    float64
CVDCRHD4    float64
CVDSTRK3    float64
ADDEPEV3    float64
CHCKDNY2    float64
DIABETE4    float64
MARITAL     float64
EDUCA       float64
RENTHOM1    float64
EMPLOY1     float64
INCOME3     float64
WEIGHT2     float64
DEAF        float64
BLIND       float64
DIFFWALK    float64
FLUSHOT7    float64
_IMPRACE    float64
_RFHLTH     float64
_HLTHPLN    float64
_TOTINDA    float64
_MICHD      float64
_PRACE1     float64
_RACE       float64
_RACEGR3    float64
_SEX        float64
_AGEG5YR    float64
WTKG3       float64
_BMI5       float64
_BMI5CAT    float64
_EDUCAG     float64
_INCOMG1    float64
_SMOKER3    float64
_RFSMOK3    float64
_CURECI1    float64
_DRNKWK1    float64
_RFDRHV7    float64
FTJUDA2_    float64
FRUTDA2_    float64
GRENDA1_    float64
FRNCHDA_    float64


In [13]:
# Write the initially cleaned file (high-null columns and rows with nulls removed)

feature_df.to_parquet(clean_file_init, compression='gzip', engine="fastparquet")

In [14]:
# Read the initially cleaned file back. From here on df holds the initially
# cleaned data (it no longer refers to the full BRFSS dataset).
df = pd.read_parquet(clean_file_init, engine="fastparquet")

#### Initial cleaning results
Resulting dataframe has 297898 Rows with 54 Features and 1 target (DIABETE4)

---

### Evaluate and clean each column:

- Refer to the feature report ([diabetes_features.md](../diabetes_features.md)) generated in step 3 of data cleaning.


In [15]:
# GENHLTH: General Health
# Keep values 1..5 (Excellent to Poor)
# Remove values 7, 9 (Don't know, Refused); BLANK rows were already removed by dropna() above
# NOTE(review): the line below is an early draft and is not valid Python if
# uncommented (chained `= [...] =` assignments); the experiment cell further
# down exercises the same replace() on GENHLTH.
#feature_df = ['GENHLTH'] = feature_df = ['GENHLTH'].replace({2:0, 3:0, 1:2, 4:1})


In [16]:
# reload any changes to mlconfigs
# (importlib.reload re-executes the already imported module, so edits to it
# take effect without restarting the kernel)
importlib.reload(mlconfigs)

# Load clean_configs to drive the cleaning operations
# (looked up by feature/column name in the cleaning cells below)
clean_configs = mlconfigs.clean_configurations()

In [17]:
# Try each cleaning operation on one column, each on its own frame, so the
# effect of every operation can be compared against the untouched original.
column = 'GENHLTH'
params_drop = [7,9]
params_translate = {2:0, 3:0, 1:2, 4:1}
params_scale = {'div':10, 'round':2}

original_values = feature_df[column]

# Drop: keep only the rows whose value is not listed in params_drop.
feature_df_drop = feature_df.loc[~original_values.isin(params_drop)]

# Translate: map values through params_translate (unlisted values are kept).
feature_df_translate = feature_df.copy()
feature_df_translate[column] = original_values.replace(params_translate)

# Scale: divide by 'div', then round to 'round' decimals.
feature_df_scale = feature_df.copy()
scaled_values = original_values.div(params_scale['div'])
feature_df_scale[column] = scaled_values.round(params_scale['round'])

# Labels carry their own padding so the printed columns line up.
comparisons = (
    ("Original:  ", feature_df),
    ("Drop Rows: ", feature_df_drop),
    ("Translate: ", feature_df_translate),
    ("Scale:     ", feature_df_scale),
)
for label, frame in comparisons:
    print(f"{label}{frame[column].unique()}")


Original:  [5. 2. 3. 4. 1. 7. 9.]
Drop Rows: [5. 2. 3. 4. 1.]
Translate: [5. 0. 1. 2. 7. 9.]
Scale:     [0.5 0.2 0.3 0.4 0.1 0.7 0.9]


## Iterative Cleaning 

- define cleaning parameters for config file

In [18]:
# Pick up any edits made to mlconfigs since it was imported, then rebuild the
# per-feature cleaning configuration that drives the steps below.
importlib.reload(mlconfigs)
clean_configs = mlconfigs.clean_configurations()

# Work on a fresh copy of df; only `column` is cleaned in this cell.
feature_df = df.copy()
column = 'WTKG3'


def _report_unique(frame, name, stage):
    """Print the sorted unique values of frame[name].

    Only the first 50 values are printed when there are more than 50;
    otherwise the full list is printed followed by the value counts.
    `stage` is the label placed at the start of the message.
    """
    feature_list = np.sort(frame[name].unique())
    if len(feature_list) <= 50:
        print(f"  {stage} Unique features in [{name}]:  {feature_list}")
        print(f"        Value Counts:\n{frame[name].value_counts()}")
        return
    first_50 = feature_list[:50]
    print(f"  {stage} Unique features in [{name}]:  \n********** More than {len(first_50)} features, list is truncated to first 50 **********\n{first_50}")


if column not in clean_configs:
    print(f"Feature DOES NOT exist: {column}")
else:
    print(f"\n\nCleaning Feature: {column}")

    # NOTE: COMMENT OUT THE FOLLOWING LINE ONCE FEATURE PARAMETERS ARE REFINED
    _report_unique(feature_df, column, "Initial")

    clean_config = clean_configs[column]

    # Steps run in this fixed order; a step is skipped when its key is absent
    # and only reported when its parameters are empty.
    for step in ('values_to_drop', 'translate', 'translate2', 'scale'):
        if step not in clean_config:
            continue
        params = clean_config[step]
        if not params:
            print(f"  {step}: ********* NO Parameters were specified *********")
            continue
        print(f"  {step}: {params}")
        if step == 'values_to_drop':
            # Expecting a list of values; rows holding any of them are removed
            feature_df = feature_df[~feature_df[column].isin(params)]
        elif step == 'scale':
            # Expecting a dictionary with divisor ('div') and rounding ('round') values
            feature_df[column] = feature_df[column].div(params['div']).round(params['round'])
        else:
            # Expecting a dictionary of translations (from:to values)
            feature_df[column] = feature_df[column].replace(params)

    _report_unique(feature_df, column, "FINAL")



Cleaning Feature: WTKG3
  Initial Unique features in [WTKG3]:  
********** More than 50 features, list is truncated to first 50 **********
[2631. 2676. 2722. 2767. 3000. 3084. 3175. 3266. 3311. 3357. 3402. 3447.
 3493. 3538. 3583. 3629. 3674. 3719. 3765. 3810. 3856. 3901. 3946. 3992.
 4000. 4037. 4082. 4128. 4173. 4200. 4218. 4264. 4300. 4309. 4354. 4400.
 4445. 4491. 4500. 4536. 4581. 4600. 4627. 4672. 4700. 4717. 4763. 4800.
 4808. 4853.]
  values_to_drop: ********* NO Parameters were specified *********
  translate: ********* NO Parameters were specified *********
  scale: {'div': 100, 'round': 2}
  FINAL Unique features in [WTKG3]:  
********** More than 50 features, list is truncated to first 50 **********
[26.31 26.76 27.22 27.67 30.   30.84 31.75 32.66 33.11 33.57 34.02 34.47
 34.93 35.38 35.83 36.29 36.74 37.19 37.65 38.1  38.56 39.01 39.46 39.92
 40.   40.37 40.82 41.28 41.73 42.   42.18 42.64 43.   43.09 43.54 44.
 44.45 44.91 45.   45.36 45.81 46.   46.27 46.72 47.   47.17

## Final Cleaning Routine

In [19]:
def _show_unique_values(frame, name, stage):
    """Print the sorted unique values of frame[name] (first 50 if there are more).

    `stage` is the label placed at the start of the message
    ("Initial" before cleaning, "FINAL" after).
    """
    feature_list = np.sort(frame[name].unique())
    if len(feature_list) > 50:
        first_50 = feature_list[:50]
        print(f"  {stage} Unique features in [{name}]:  \n********** More than {len(first_50)} features, list is truncated to first 50 **********\n{first_50}")
    else:
        print(f"  {stage} Unique features in [{name}]:  {feature_list}")


def _clean_column(frame, name, clean_config):
    """Apply the configured cleaning steps to one column and return the frame.

    Steps run in the order values_to_drop, translate, translate2, scale.
    A step whose key is missing from `clean_config` is skipped; a step whose
    parameters are empty is reported and skipped.

    Parameters:
        frame: dataframe holding the column; the column is updated on this
            object unless rows are dropped, in which case a new frame is made.
        name: column to clean.
        clean_config: dict with any of the keys
            'values_to_drop' (list of values whose rows are removed),
            'translate' / 'translate2' (dict of from:to values),
            'scale' (dict with 'div' divisor and 'round' decimals).

    Returns:
        The cleaned dataframe (callers must use the returned object).
    """
    for step in ('values_to_drop', 'translate', 'translate2', 'scale'):
        if step not in clean_config:
            continue
        params = clean_config[step]
        if not params:
            print(f"  {step}: ********* NO Parameters were specified *********")
            continue
        print(f"  {step}: {params}")
        if step == 'values_to_drop':
            # .copy() gives an independent frame so the column assignments
            # below do not write to a filtered slice (SettingWithCopyWarning).
            frame = frame[~frame[name].isin(params)].copy()
        elif step == 'scale':
            # Previously the scale parameters were printed but never applied.
            frame[name] = frame[name].div(params['div']).round(params['round'])
        else:
            frame[name] = frame[name].replace(params)
    return frame


# Always start from the initially cleaned data held in df (loaded from
# clean_file_init). This keeps the cell independent of the single-column
# experiment above and safe to re-run: translations and scaling are applied
# exactly once per column.
feature_df = df.copy()

for column in list(feature_df.columns):
    if column not in clean_configs:
        # No entry in clean_configs for this column; it is left unchanged.
        print(f"Feature DOES NOT exist: {column}")
        continue

    print(f"\n\nCleaning Feature: {column}")
    # NOTE: COMMENT OUT THE FOLLOWING LINE ONCE FEATURE PARAMETERS ARE REFINED
    _show_unique_values(feature_df, column, "Initial")
    feature_df = _clean_column(feature_df, column, clean_configs[column])
    _show_unique_values(feature_df, column, "FINAL")





Cleaning Feature: GENHLTH
  Initial Unique features in [GENHLTH]:  [1. 2. 3. 4. 5. 7. 9.]
  values_to_drop: [7, 9]
  translate: {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}
  scale: ********* NO Parameters were specified *********
  FINAL Unique features in [GENHLTH]:  [0. 1. 2. 3. 4.]


Cleaning Feature: PHYSHLTH
  Initial Unique features in [PHYSHLTH]:  [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 77. 88. 99.]
  values_to_drop: [77, 99]
  translate: {88: 0}
  scale: ********* NO Parameters were specified *********
  FINAL Unique features in [PHYSHLTH]:  [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30.]


Cleaning Feature: MENTHLTH
  Initial Unique features in [MENTHLTH]:  [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 77. 88. 99.]
  values_to_drop: [77, 99]
  translate: 

### Summary after all the features have been cleaned

In [20]:
feature_df

Unnamed: 0_level_0,GENHLTH,PHYSHLTH,MENTHLTH,PRIMINSR,PERSDOC3,CHECKUP1,EXERANY2,BPHIGH6,CHOLCHK3,TOLDHI3,...,_RFDRHV7,FTJUDA2_,FRUTDA2_,GRENDA1_,FRNCHDA_,POTADA1_,VEGEDA2_,_FRUTSU1,_VEGESU1,_FRTLT1A
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.0,20.0,10.0,3.0,1.0,2.0,0.0,0.0,1.0,1.0,...,1.0,5.397605e-79,100.0,5.700000e+01,4.300000e+01,1.400000e+01,100.0,100.0,214.0,1.0
2,1.0,0.0,0.0,2.0,2.0,1.0,0.0,3.0,1.0,0.0,...,1.0,5.397605e-79,100.0,5.397605e-79,1.400000e+01,1.400000e+01,43.0,100.0,71.0,1.0
3,1.0,0.0,10.0,2.0,1.0,1.0,1.0,3.0,1.0,1.0,...,1.0,7.100000e+01,43.0,1.000000e+01,5.700000e+01,2.700000e+01,71.0,114.0,165.0,1.0
5,2.0,0.0,0.0,3.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,5.397605e-79,29.0,1.400000e+01,5.397605e-79,1.400000e+01,14.0,29.0,42.0,2.0
9,2.0,25.0,5.0,3.0,2.0,1.0,1.0,3.0,1.0,0.0,...,1.0,5.397605e-79,100.0,2.900000e+01,2.900000e+01,4.300000e+01,57.0,100.0,158.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438685,2.0,0.0,0.0,10.0,1.0,3.0,1.0,3.0,6.0,1.0,...,1.0,5.397605e-79,57.0,4.300000e+01,5.397605e-79,7.000000e+00,100.0,57.0,150.0,2.0
438686,1.0,5.0,0.0,1.0,2.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.000000e+02,100.0,2.900000e+01,5.397605e-79,5.397605e-79,43.0,200.0,72.0,1.0
438687,3.0,0.0,0.0,3.0,1.0,3.0,1.0,3.0,3.0,1.0,...,1.0,2.000000e+02,50.0,2.000000e+02,1.400000e+01,1.400000e+01,71.0,250.0,299.0,1.0
438690,1.0,0.0,0.0,10.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,5.397605e-79,200.0,2.900000e+01,5.397605e-79,1.400000e+01,100.0,200.0,143.0,1.0


### Rename Columns:



In [21]:
feature_df.columns

Index(['GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'PRIMINSR', 'PERSDOC3', 'CHECKUP1',
       'EXERANY2', 'BPHIGH6', 'CHOLCHK3', 'TOLDHI3', 'CHOLMED3', 'CVDCRHD4',
       'CVDSTRK3', 'ADDEPEV3', 'CHCKDNY2', 'DIABETE4', 'MARITAL', 'EDUCA',
       'RENTHOM1', 'EMPLOY1', 'INCOME3', 'WEIGHT2', 'DEAF', 'BLIND',
       'DIFFWALK', 'FLUSHOT7', '_IMPRACE', '_RFHLTH', '_HLTHPLN', '_TOTINDA',
       '_MICHD', '_PRACE1', '_RACE', '_RACEGR3', '_SEX', '_AGEG5YR', 'WTKG3',
       '_BMI5', '_BMI5CAT', '_EDUCAG', '_INCOMG1', '_SMOKER3', '_RFSMOK3',
       '_CURECI1', '_DRNKWK1', '_RFDRHV7', 'FTJUDA2_', 'FRUTDA2_', 'GRENDA1_',
       'FRNCHDA_', 'POTADA1_', 'VEGEDA2_', '_FRUTSU1', '_VEGESU1', '_FRTLT1A'],
      dtype='object')

In [22]:
# From 3_bfrss_features_1_create_feature_report
    # {'GENHLTH': 'General Health',
    # 'PHYSHLTH': 'Number of Days Physical Health Not Good',
    # 'MENTHLTH': 'Number of Days Mental Health Not Good',
    # 'PRIMINSR': 'What is Primary Source of Health Insurance?',
    # 'PERSDOC3': 'Have Personal Health Care Provider?',
    # 'CHECKUP1': 'Length of time since last routine checkup',
    # 'EXERANY2': 'Exercise in Past 30 Days',
    # 'BPHIGH6': 'Ever Told Blood Pressure High',
    # 'CHOLCHK3': 'How Long since Cholesterol Checked',
    # 'TOLDHI3': 'Ever Told Cholesterol Is High',
    # 'CHOLMED3': 'Currently taking medicine for high cholesterol',
    # 'CVDCRHD4': 'Ever Diagnosed with Angina or Coronary Heart Disease',
    # 'CVDSTRK3': 'Ever Diagnosed with a Stroke',
    # 'ADDEPEV3': '(Ever told) you had a depressive disorder',
    # 'CHCKDNY2': 'Ever told you have kidney disease?',
    # 'DIABETE4': '(Ever told) you had diabetes',
    # 'MARITAL': 'Marital Status',
    # 'EDUCA': 'Education Level',
    # 'RENTHOM1': 'Own or Rent Home',
    # 'EMPLOY1': 'Employment Status',
    # 'INCOME3': 'Income Level',
    # 'WEIGHT2': 'Reported Weight in Pounds',
    # 'DEAF': 'Are you deaf or do you have serious difficulty hearing?',
    # 'BLIND': 'Blind or Difficulty seeing',
    # 'DIFFWALK': 'Difficulty Walking or Climbing Stairs',
    # 'FLUSHOT7': 'Adult flu shot/spray past 12 mos',
    # '_IMPRACE': 'Imputed race/ethnicity value',
    # '_RFHLTH': 'Adults with good or better health',
    # '_HLTHPLN': 'Have any health insurance',
    # '_TOTINDA': 'Leisure Time Physical Activity Calculated Variable',
    # '_MICHD': 'Ever had CHD or MI\u200c',
    # '_PRACE1': 'Computed Preferred Race',
    # '_RACE': 'Computed Race-Ethnicity grouping',
    # '_RACEGR3': 'Computed Five level race/ethnicity category.',
    # '_SEX': 'Calculated sex variable',
    # '_AGEG5YR': 'Reported age in five-year age categories calculated variable',
    # 'WTKG3': 'Computed Weight in Kilograms',
    # '_BMI5': 'Computed body mass index',
    # '_BMI5CAT': 'Computed body mass index categories',
    # '_EDUCAG': 'Computed level of education completed categories',
    # '_INCOMG1': 'Computed income categories',
    # '_SMOKER3': 'Computed Smoking Status',
    # '_RFSMOK3': 'Current Smoking Calculated Variable',
    # '_CURECI1': 'Current E-cigarette User Calculated Variable',
    # '_DRNKWK1': 'Computed number of drinks of alcohol beverages per week',
    # '_RFDRHV7': 'Heavy Alcohol Consumption Calculated Variable',
    # 'FTJUDA2_': 'Computed Fruit Juice intake in times per day',
    # 'FRUTDA2_': 'Computed Fruit intake in times per day',
    # 'GRENDA1_': 'Computed Dark Green Vege',
    # 'FRNCHDA_': 'Computed French Fry intake in times per day',
    # 'POTADA1_': 'Computed Potato Servings per day',
    # 'VEGEDA2_': 'Computed Other Vege',
    # '_FRUTSU1': 'Total fruits consumed per day',
    # '_VEGESU1': 'Total vege',
    # '_FRTLT1A': 'Consume Fruit 1 or more times per day'}

In [23]:
# Defined in 3_brfss_1_create_feature_report.ipynb (copied from there)
# Maps each BRFSS column name to the readable column name used in the final
# cleaned dataset. The values become the column names of the output parquet
# file, so any rename here also has to be made wherever that file is read.
feature_dict = {'GENHLTH': 'general_health',
                'PHYSHLTH': 'days_health_not_good',
                'MENTHLTH': 'days_mental_not_good',
                'PRIMINSR': 'primary_insurance_source',
                'PERSDOC3': 'personal_provider',
                'CHECKUP1': 'years_since_checkup',
                'EXERANY2': 'exercise',
                'BPHIGH6': 'told_high_blood_pressure',
                'CHOLCHK3': 'year_chol_chckd',
                'TOLDHI3': 'high_cholesteral',  # NOTE(review): spelling ("cholesterol") - confirm before renaming
                'CHOLMED3': 'high_cholesteral_meds',  # NOTE(review): same spelling as above
                'CVDCRHD4': 'ever_heart_disease',
                'CVDSTRK3': 'ever_stroke',
                'ADDEPEV3': 'depressive_disorder',
                'CHCKDNY2': 'ekidney_disease',  # NOTE(review): leading 'e' looks like a typo - confirm
                'DIABETE4': 'diabetes',
                'MARITAL': 'marital',
                'EDUCA': 'education_level',
                'RENTHOM1': 'own_home',
                'EMPLOY1': 'employment',
                'INCOME3': 'income_level',
                'WEIGHT2': 'weight-lbs',  # NOTE(review): hyphen, unlike the snake_case names around it
                'DEAF': 'hearing',
                'BLIND': 'sight',
                'DIFFWALK': 'diffwalk',
                'FLUSHOT7': 'flu_shot',
                '_IMPRACE': 'race_grp5',
                '_RFHLTH': 'good_health',
                '_HLTHPLN': 'have_insurance',
                '_TOTINDA': 'activity_level',
                '_MICHD': 'heart_chd_mi',
                '_PRACE1': 'race_grp8',
                '_RACE': 'race_grp8a',
                '_RACEGR3': 'race_grp5a',
                '_SEX': 'sex',
                '_AGEG5YR': 'age_5yr',
                'WTKG3': 'weight_kilos',
                '_BMI5': 'body_mass_index',
                '_BMI5CAT': 'BMI_cats',
                '_EDUCAG': 'ceduc_cats',  # NOTE(review): leading 'c' may be a typo for 'educ_cats' - confirm
                '_INCOMG1': 'income _cats',  # NOTE(review): contains a space before '_cats' - confirm
                '_SMOKER3': 'smoking_status',
                '_RFSMOK3': 'smoking_yn',
                '_CURECI1': 'ecig_yn',
                '_DRNKWK1': 'drinks_week',
                '_RFDRHV7': 'drinks_cats',
                'FTJUDA2_': 'fruit_juice',
                'FRUTDA2_': 'fruit_times_perday',
                'GRENDA1_': 'darkG_vege',
                'FRNCHDA_': 'french_fry',
                'POTADA1_': 'potato',
                'VEGEDA2_': 'other_vege',
                '_FRUTSU1': 'tot_fruits',
                '_VEGESU1': 'tot_vege',
                '_FRTLT1A': 'fruit_1_yn'}

In [24]:
# Rename the columns from the BRFSS names to the readable names in feature_dict
# (rename returns a new frame; columns without an entry keep their name)
feature_df = feature_df.rename(columns=feature_dict)

In [25]:
feature_df.columns

Index(['general_health', 'days_health_not_good', 'days_mental_not_good',
       'primary_insurance_source', 'personal_provider', 'years_since_checkup',
       'exercise', 'told_high_blood_pressure', 'year_chol_chckd',
       'high_cholesteral', 'high_cholesteral_meds', 'ever_heart_disease',
       'ever_stroke', 'depressive_disorder', 'ekidney_disease', 'diabetes',
       'marital', 'education_level', 'own_home', 'employment', 'income_level',
       'weight-lbs', 'hearing', 'sight', 'diffwalk', 'flu_shot', 'race_grp5',
       'good_health', 'have_insurance', 'activity_level', 'heart_chd_mi',
       'race_grp8', 'race_grp8a', 'race_grp5a', 'sex', 'age_5yr',
       'weight_kilos', 'body_mass_index', 'BMI_cats', 'ceduc_cats',
       'income _cats', 'smoking_status', 'smoking_yn', 'ecig_yn',
       'drinks_week', 'drinks_cats', 'fruit_juice', 'fruit_times_perday',
       'darkG_vege', 'french_fry', 'potato', 'other_vege', 'tot_fruits',
       'tot_vege', 'fruit_1_yn'],
      dtype='object'

### Clean Types

- All columns are float64, which is fine

In [26]:
feature_df.dtypes

general_health              float64
days_health_not_good        float64
days_mental_not_good        float64
primary_insurance_source    float64
personal_provider           float64
years_since_checkup         float64
exercise                    float64
told_high_blood_pressure    float64
year_chol_chckd             float64
high_cholesteral            float64
high_cholesteral_meds       float64
ever_heart_disease          float64
ever_stroke                 float64
depressive_disorder         float64
ekidney_disease             float64
diabetes                    float64
marital                     float64
education_level             float64
own_home                    float64
employment                  float64
income_level                float64
weight-lbs                  float64
hearing                     float64
sight                       float64
diffwalk                    float64
flu_shot                    float64
race_grp5                   float64
good_health                 

### Write final cleaned dataset to parquet file

In [27]:
# Print df to compare when read back
feature_df

Unnamed: 0_level_0,general_health,days_health_not_good,days_mental_not_good,primary_insurance_source,personal_provider,years_since_checkup,exercise,told_high_blood_pressure,year_chol_chckd,high_cholesteral,...,drinks_cats,fruit_juice,fruit_times_perday,darkG_vege,french_fry,potato,other_vege,tot_fruits,tot_vege,fruit_1_yn
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.0,20.0,10.0,3.0,1.0,2.0,0.0,0.0,1.0,1.0,...,1.0,5.397605e-79,100.0,5.700000e+01,4.300000e+01,1.400000e+01,100.0,100.0,214.0,1.0
2,1.0,0.0,0.0,2.0,2.0,1.0,0.0,3.0,1.0,0.0,...,1.0,5.397605e-79,100.0,5.397605e-79,1.400000e+01,1.400000e+01,43.0,100.0,71.0,1.0
3,1.0,0.0,10.0,2.0,1.0,1.0,1.0,3.0,1.0,1.0,...,1.0,7.100000e+01,43.0,1.000000e+01,5.700000e+01,2.700000e+01,71.0,114.0,165.0,1.0
5,2.0,0.0,0.0,3.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,5.397605e-79,29.0,1.400000e+01,5.397605e-79,1.400000e+01,14.0,29.0,42.0,2.0
9,2.0,25.0,5.0,3.0,2.0,1.0,1.0,3.0,1.0,0.0,...,1.0,5.397605e-79,100.0,2.900000e+01,2.900000e+01,4.300000e+01,57.0,100.0,158.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438685,2.0,0.0,0.0,10.0,1.0,3.0,1.0,3.0,6.0,1.0,...,1.0,5.397605e-79,57.0,4.300000e+01,5.397605e-79,7.000000e+00,100.0,57.0,150.0,2.0
438686,1.0,5.0,0.0,1.0,2.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.000000e+02,100.0,2.900000e+01,5.397605e-79,5.397605e-79,43.0,200.0,72.0,1.0
438687,3.0,0.0,0.0,3.0,1.0,3.0,1.0,3.0,3.0,1.0,...,1.0,2.000000e+02,50.0,2.000000e+02,1.400000e+01,1.400000e+01,71.0,250.0,299.0,1.0
438690,1.0,0.0,0.0,10.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,5.397605e-79,200.0,2.900000e+01,5.397605e-79,1.400000e+01,100.0,200.0,143.0,1.0


In [28]:
# Write final cleaned dataset to parquet file
feature_df.to_parquet(clean_file_final, compression='gzip', engine="fastparquet")
# Read final cleaned dataset from parquet file
# NOTE: this rebinds df, which until now held the data read from clean_file_init;
# re-run the earlier read cell before re-running the cleaning cells.
df = pd.read_parquet(clean_file_final, engine="fastparquet")

In [29]:
df

Unnamed: 0_level_0,general_health,days_health_not_good,days_mental_not_good,primary_insurance_source,personal_provider,years_since_checkup,exercise,told_high_blood_pressure,year_chol_chckd,high_cholesteral,...,drinks_cats,fruit_juice,fruit_times_perday,darkG_vege,french_fry,potato,other_vege,tot_fruits,tot_vege,fruit_1_yn
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.0,20.0,10.0,3.0,1.0,2.0,0.0,0.0,1.0,1.0,...,1.0,5.397605e-79,100.0,5.700000e+01,4.300000e+01,1.400000e+01,100.0,100.0,214.0,1.0
2,1.0,0.0,0.0,2.0,2.0,1.0,0.0,3.0,1.0,0.0,...,1.0,5.397605e-79,100.0,5.397605e-79,1.400000e+01,1.400000e+01,43.0,100.0,71.0,1.0
3,1.0,0.0,10.0,2.0,1.0,1.0,1.0,3.0,1.0,1.0,...,1.0,7.100000e+01,43.0,1.000000e+01,5.700000e+01,2.700000e+01,71.0,114.0,165.0,1.0
5,2.0,0.0,0.0,3.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,5.397605e-79,29.0,1.400000e+01,5.397605e-79,1.400000e+01,14.0,29.0,42.0,2.0
9,2.0,25.0,5.0,3.0,2.0,1.0,1.0,3.0,1.0,0.0,...,1.0,5.397605e-79,100.0,2.900000e+01,2.900000e+01,4.300000e+01,57.0,100.0,158.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438685,2.0,0.0,0.0,10.0,1.0,3.0,1.0,3.0,6.0,1.0,...,1.0,5.397605e-79,57.0,4.300000e+01,5.397605e-79,7.000000e+00,100.0,57.0,150.0,2.0
438686,1.0,5.0,0.0,1.0,2.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.000000e+02,100.0,2.900000e+01,5.397605e-79,5.397605e-79,43.0,200.0,72.0,1.0
438687,3.0,0.0,0.0,3.0,1.0,3.0,1.0,3.0,3.0,1.0,...,1.0,2.000000e+02,50.0,2.000000e+02,1.400000e+01,1.400000e+01,71.0,250.0,299.0,1.0
438690,1.0,0.0,0.0,10.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,5.397605e-79,200.0,2.900000e+01,5.397605e-79,1.400000e+01,100.0,200.0,143.0,1.0
