 # Catálogo variables
 
 **Dependent variable:**

* _MICHD - Respondents that have ever reported having coronary heart disease (CHD) or myocardial infarction (MI);

**Independent variables:**

* _BMI5 - Body Mass Index (BMI);
* SMOKE100 - Have you smoked at least 100 cigarettes in your entire life? [Note: 5 packs = 100 cigarettes];
* _RFDRHV7 - Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week);
* CVDSTRK3 - (Ever told) (you had) a stroke;
* PHYSHLTH - Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good?;
* MENTHLTH - Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good?;
* DIFFWALK - Do you have serious difficulty walking or climbing stairs?;
* SEXVAR - Are you male or female?;
* _AGEG5YR - Fourteen-level age category;
* _IMPRACE - Imputed race/ethnicity value (This value is the reported race/ethnicity or an imputed race/ethnicity, if the respondent refused to give a race/ethnicity. The value of the imputed race/ethnicity will be the most common race/ethnicity response for that region of the state) 7;
* DIABETE4 - (Ever told) (you had) diabetes? (If ´Yes´ and respondent is female, ask ´Was this only when you were pregnant?´. If Respondent says pre-diabetes or borderline diabetes, use response code 4.);
* _TOTINDA - Adults who reported doing physical activity or exercise during the past 30 days other than their regular job;
* GENHLTH - Would you say that in general your health is;
* SLEPTIM1 - On average, how many hours of sleep do you get in a 24-hour period?;
* ASTHMA3 - (Ever told) (you had) asthma?;
* CHCKDNY2 - Not including kidney stones, bladder infection or incontinence, were you ever told you had kidney disease?;
* CHCSCNCR - (Ever told) (you had) skin cancer?

In [1]:
import pandas as pd
import numpy as np 
from sty import bg, fg,rs
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import re
import datetime
from tqdm import tqdm
from sty import fg, bg, rs
from matplotlib import cm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
from scipy.stats import *
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.utils import compute_sample_weight
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel

################################################ FILTER WARNINGS #################################################
import warnings
warnings.filterwarnings("ignore")

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

> https://www.cdc.gov/brfss/annual_data/2020/pdf/codebook20_llcp-v2-508.pdf

> https://www.kaggle.com/code/nkitgupta/advance-data-preprocessing

> https://www.kaggle.com/code/luanrd/hyperparameters-optimization-for-diferent-models

> https://www.kaggle.com/code/vsevolodcherepanov/short-xgbc-test-cross-val-accuracy-0-916

> https://www.kaggle.com/code/egemenuurdalg/heart-disease-prediction

> https://www.kaggle.com/code/luanrd/hyperparameters-optimization-for-diferent-models

In [2]:
data_folder = 'data/heart_2020.csv'
work_dir = os.getcwd()
parent_dir = os.path.dirname(os.path.normpath(work_dir))
data_dir = os.path.join(parent_dir,data_folder)
df = pd.read_csv(data_dir)


In [3]:
class NameDropper(BaseEstimator,TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        columns     = ["_MICHD", "_BMI5", "_BMI5CAT", "SMOKE100", "_RFDRHV7", "CVDSTRK3", "PHYSHLTH",
             "MENTHLTH", "DIFFWALK", "SEXVAR", "_AGEG5YR", "_IMPRACE", "DIABETE4",
            "_TOTINDA", "GENHLTH", "SLEPTIM1", "ASTHMA3", "CHCKDNY2", "CHCSCNCR"]
        try:
            assert type(columns) == list ,   "Proporciona una lista de Variables presente en el Dataset"
            assert type(X) == pd.DataFrame , "Proporciona un Dataframe de pandas"
            return X[columns]
        except AssertionError as msg:
            print(msg)


class Transformer(BaseEstimator,TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        '''renaming
           dropping na
           recodification '''
        try:
            X["_MICHD"] = X["_MICHD"].replace({2: "No", 1: "Yes"})

            X["_BMI5CAT"] = X["_BMI5CAT"].replace({
                1: "Underweight (BMI < 18.5)",
                2: "Normal weight (18.5 <= BMI < 25.0)",
                3: "Overweight (25.0 <= BMI < 30.0)",
                4: "Obese (30.0 <= BMI < +Inf)"
            })

            binary_vars = ["SMOKE100", "CVDSTRK3", "DIFFWALK", "_TOTINDA", "ASTHMA3", "CHCKDNY2", "CHCSCNCR"]
            X[binary_vars] = X[binary_vars].replace({
                1: "Yes",
                2: "No",
                7: np.NaN,
                9: np.NaN
            })

            X["_RFDRHV7"] = X["_RFDRHV7"].replace({
                1: "No",
                2: "Yes",
                9: np.NaN
            })

            multi_vars = ["PHYSHLTH", "MENTHLTH"]
            X[multi_vars] = X[multi_vars].replace({
                88: 0,
                77: np.NaN,
                99: np.NaN
            })

            X["SEXVAR"] = X["SEXVAR"].replace({1: "Male", 2: "Female"})

            X["_AGEG5YR"] = X["_AGEG5YR"].replace({
                1: "18-24",
                2: "25-29",
                3: "30-34",
                4: "35-39",
                5: "40-44",
                6: "45-49",
                7: "50-54",
                8: "55-59",
                9: "60-64",
                10: "65-69",
                11: "70-74",
                12: "75-79",
                13: "80 or older",
                14: np.NaN
            })

            X["_IMPRACE"] = X["_IMPRACE"].replace({
                1: "White",
                2: "Black",
                3: "Asian",
                4: "American Indian/Alaskan Native",
                5: "Hispanic",
                6: "Other"
            })

            X["DIABETE4"] = X["DIABETE4"].replace({
                1: "Yes",
                2: "Yes (during pregnancy)",
                3: "No",
                4: "No, borderline diabetes",
                7: np.NaN,
                9: np.NaN
            })

            X["GENHLTH"] = X["GENHLTH"].replace({
                1: "Excellent",
                2: "Very good",
                3: "Good",
                4: "Fair",
                5: "Poor",
                7: np.NaN,
                9: np.NaN
            })

            X["SLEPTIM1"] = X["SLEPTIM1"].replace({
                77: np.NaN,
                99: np.NaN
            })
            X = X.dropna()
            X = X.rename({
                        "_MICHD": "HeartDisease",
                        "_BMI5"   : "BMI",
                        "_BMI5CAT": "BMICategory",
                        "SMOKE100": "Smoking",
                        "_RFDRHV7": "AlcoholDrinking",
                        "CVDSTRK3": "Stroke",
                        "PHYSHLTH": "PhysicalHealth",
                        "MENTHLTH": "MentalHealth",
                        "DIFFWALK": "DiffWalking",
                        "SEXVAR": "Sex",
                        "_AGEG5YR": "AgeCategory",
                        "_IMPRACE": "Race",
                        "DIABETE4": "Diabetic",
                        "_TOTINDA": "PhysicalActivity",
                        "GENHLTH": "GenHealth",
                        "SLEPTIM1": "SleepTime",
                        "ASTHMA3": "Asthma",
                        "CHCKDNY2": "KidneyDisease",
                        "CHCSCNCR": "SkinCancer"
                            }, axis=1)
            #X["HeartDisease"] = X["HeartDisease"].replace({"No": 0, "Yes": 1})
            #X = pd.get_dummies(X[1:])
            return X

        except Exception as msg:
            print(msg)

In [4]:
pipe = Pipeline([
      ('dropper' ,      NameDropper()),
      ('transformer' ,  Transformer())
])

df = pipe.fit_transform(df)

In [5]:
print(df.shape)
print(df.columns)
df.info()

(319795, 19)
Index(['HeartDisease', 'BMI', 'BMICategory', 'Smoking', 'AlcoholDrinking',
       'Stroke', 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex',
       'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth',
       'SleepTime', 'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 319795 entries, 0 to 401955
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   BMICategory       319795 non-null  object 
 3   Smoking           319795 non-null  object 
 4   AlcoholDrinking   319795 non-null  object 
 5   Stroke            319795 non-null  object 
 6   PhysicalHealth    319795 non-null  float64
 7   MentalHealth      319795 non-null  float64
 8   DiffWalking       319795 non-null  object 
 9   Sex               319795 non-null  o

In [6]:
df

Unnamed: 0,HeartDisease,BMI,BMICategory,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,1660.0,Underweight (BMI < 18.5),Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
4,No,2034.0,Normal weight (18.5 <= BMI < 25.0),No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
5,No,2658.0,Overweight (25.0 <= BMI < 30.0),Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
6,No,2421.0,Normal weight (18.5 <= BMI < 25.0),No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
8,No,2371.0,Normal weight (18.5 <= BMI < 25.0),No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401951,Yes,2741.0,Overweight (25.0 <= BMI < 30.0),Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
401952,No,2984.0,Overweight (25.0 <= BMI < 30.0),Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
401953,No,2424.0,Normal weight (18.5 <= BMI < 25.0),No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
401954,No,3281.0,Obese (30.0 <= BMI < +Inf),No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [7]:
def procces_data(df):
    '''Procesamos los datos provenientes de un Dataframe de Pandas
       y lo transformamos para la ingesta en el modelo. Además
       con el train/test split separamos en conjuntos de test y entrenamiento'''

    assert type(df) == pd.core.frame.DataFrame

    for column in df.columns:                                                                       #Eliminamos columnas desconocidas
	    if 'Unnamed' in column:
             df.drop(column, axis=1, inplace=True)


    categoric_vars = ['Sex', 'AgeCategory', 'Race', 'Diabetic', 'GenHealth']
    target_var = ['HeartDisease']
    numeric_vars = ['MentalHealth', 'BMI', 'PhysicalHealth', 'SleepTime']
    yes_no_vars =  ["Smoking","AlcoholDrinking","Stroke","DiffWalking","PhysicalActivity","Asthma","KidneyDisease","SkinCancer"]

    age_order = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older']
    gen_health_order = ['Poor', 'Fair', 'Good', 'Very good', 'Excellent']
    diabetic_order = ['No', 'Yes (during pregnancy)', 'No, borderline diabetes', 'Yes']
    sex_order = ['Male', "Female"]
    race_order = ["White","Black","Asian","American Indian/Alaskan Native","Hispanic","Other"]

    
    ordinal_encoder = OrdinalEncoder(categories=[sex_order,age_order,race_order,diabetic_order, gen_health_order])
    encoded_ordinal = ordinal_encoder.fit_transform(df[categoric_vars])
    #encoded_ordinal = pd.DataFrame(encoded_ordinal)
    print(encoded_ordinal.shape)

    df[yes_no_vars] = df[yes_no_vars].replace({'Yes':1,'No':0})
    yes_no =  df[yes_no_vars]
    #yes_no = pd.DataFrame(yes_no)
    print(yes_no.shape)

    
    #scaler = StandardScaler()
    #scaled = scaler.fit_transform(df[numeric_vars])
    #scaled = pd.DataFrame(scaled)
    #print(scaled.shape)

    encoder = LabelEncoder()              
    target = np.array(df[target_var])                                            
    encoded_target = encoder.fit_transform(target)                  
    #Diccionario mapping de la codificación 
    encoded_target = pd.DataFrame(encoded_target)
    print(encoded_target.shape)
    mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    print(mapping)


    
    
    X =  np.concatenate([scaled, yes_no,encoded_ordinal], axis= 1)
    Y = encoded_target
    print(X.shape)
                                                           
    features = numeric_vars+yes_no_vars+categoric_vars
    #Split Test train 33%test
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=1) 

    ## para conjuntos desbalanceados vamos a obtener los classes_weight para cada clase con el fin de tenerlo en cuenta en el clasificador

    class_weights = compute_sample_weight(
       'balanced',
        y_train
    )

    print(X_train.shape, y_train.shape)

    X_train = pd.DataFrame(data=X_train, columns=features)
    X_test = pd.DataFrame(data=X_test, columns=features) 
    y_train = pd.DataFrame(data=y_train)
    y_test = pd.DataFrame(data=y_test)
     
    return X_train, y_train,X_test,y_test, class_weights,features

In [9]:
X_train, y_train,X_test,y_test, class_weights,features = procces_data(df)



(319795, 5)
(319795, 8)
(319795, 4)
(319795, 1)
{'No': 0, 'Yes': 1}
(319795, 17)
(214262, 17) (214262, 1)


In [14]:
X_train.head(1)

Unnamed: 0,MentalHealth,BMI,PhysicalHealth,SleepTime,Smoking,AlcoholDrinking,Stroke,DiffWalking,PhysicalActivity,Asthma,KidneyDisease,SkinCancer,Sex,AgeCategory,Race,Diabetic,GenHealth
0,-0.490039,2.738256,0.456341,-0.763977,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,10.0,0.0,2.0,2.0


In [10]:
params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5]
    }

## Etapa 1 =====> GreedSearch
print(fg.yellow+'==============> Hyperparameter Tuning Step'+fg.rs)
clf = XGBClassifier(use_label_encoder=False, eval_metric = "mlogloss",tree_method='gpu_hist')
greed_search = GridSearchCV(clf,param_grid=params, cv=10, verbose=True)
greed_search.fit(X_train,y_train, sample_weight=class_weights)
b_params, b_estimator = greed_search.best_params_ , greed_search.best_estimator_ 

y_pred = b_estimator.predict(X_test) 
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions) 

print('Estos son los mejores hiperparámetros:','\n',fg.blue + str(b_params)+fg.rs)
print(fg.blue + "Accuracy del mejor modelo: %.2f%%" % (accuracy * 100.0) + fg.rs)

Fitting 10 folds for each of 405 candidates, totalling 4050 fits
Estos son los mejores hiperparámetros: 
 [34m{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.8}[39m
[34mAccuracy del mejor modelo: 74.45%[39m


In [11]:
b_estimator.save_model(r"D:\[3]--Python Workdir--[3]\Data - ML- DL- projects\xgb_v1_deploy.json")