#CASO 5 HEALTH ANALYTICS

###1. Importación de modulos

In [1]:
import pandas as pd ## Manejo de dataframes o set de datos
import warnings
import numpy as np ## Todo lo referente a trabajar con vectores y matrices
from scipy import stats ## Herramientas y algoritmos matemáticos para python

### 2. Lectura de datos

#### 2.1 Carga de datos

In [2]:
# Leemos la data de desarrollo de modelos
desarrll = pd.read_csv("HealthAnalytics.csv")

In [3]:
# Vemos la dimensionalidad de los datos
desarrll.shape

(43400, 12)

In [4]:
# Visualizamos los datos
desarrll.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [5]:
desarrll.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,43400.0,43400.0,43400.0,43400.0,43400.0,41938.0,43400.0
mean,36326.14235,42.217894,0.093571,0.047512,104.48275,28.605038,0.018041
std,21072.134879,22.519649,0.291235,0.212733,43.111751,7.77002,0.133103
min,1.0,0.08,0.0,0.0,55.0,10.1,0.0
25%,18038.5,24.0,0.0,0.0,77.54,23.2,0.0
50%,36351.5,44.0,0.0,0.0,91.58,27.7,0.0
75%,54514.25,60.0,0.0,0.0,112.07,32.9,0.0
max,72943.0,82.0,1.0,1.0,291.05,97.6,1.0


In [6]:
desarrll.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43400 entries, 0 to 43399
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 43400 non-null  int64  
 1   gender             43400 non-null  object 
 2   age                43400 non-null  float64
 3   hypertension       43400 non-null  int64  
 4   heart_disease      43400 non-null  int64  
 5   ever_married       43400 non-null  object 
 6   work_type          43400 non-null  object 
 7   Residence_type     43400 non-null  object 
 8   avg_glucose_level  43400 non-null  float64
 9   bmi                41938 non-null  float64
 10  smoking_status     30108 non-null  object 
 11  stroke             43400 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 4.0+ MB


#### 2.2. Buenas Prácticas

In [7]:
#Verificamos los espacios y caracteres especiales en los nombres de las columnas
desarrll.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

####2.3. AED

In [8]:
desarrll.describe(include='all')
# Buscamos completitud de los datos.
# Conclusiones:
# 1.- No tenemos completitud de los datos
# 2.- Tenemos valores outliers
# 3.- Realizar el procesamiento a números (Encoding)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,43400.0,43400,43400.0,43400.0,43400.0,43400,43400,43400,43400.0,41938.0,30108,43400.0
unique,,3,,,,2,5,2,,,3,
top,,Female,,,,Yes,Private,Urban,,,never smoked,
freq,,25665,,,,27938,24834,21756,,,16053,
mean,36326.14235,,42.217894,0.093571,0.047512,,,,104.48275,28.605038,,0.018041
std,21072.134879,,22.519649,0.291235,0.212733,,,,43.111751,7.77002,,0.133103
min,1.0,,0.08,0.0,0.0,,,,55.0,10.1,,0.0
25%,18038.5,,24.0,0.0,0.0,,,,77.54,23.2,,0.0
50%,36351.5,,44.0,0.0,0.0,,,,91.58,27.7,,0.0
75%,54514.25,,60.0,0.0,0.0,,,,112.07,32.9,,0.0


###3. Recodificación de variables

####3.1. Completitud de los datos

In [9]:
# Revisamos los valores nulos
desarrll.isnull().sum()

Unnamed: 0,0
id,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,1462


In [10]:
# Revisamos los valores perdidos
desarrll.isnull().sum()/desarrll.shape[0]*100

Unnamed: 0,0
id,0.0
gender,0.0
age,0.0
hypertension,0.0
heart_disease,0.0
ever_married,0.0
work_type,0.0
Residence_type,0.0
avg_glucose_level,0.0
bmi,3.368664


In [11]:
# No olvidemos separar las variables cualitativas para poder trabajarlas eficientemente

columnas_categoricas = ["gender","ever_married","work_type","Residence_type","smoking_status"]
columnas_numericas   = ["age","hypertension","heart_disease","avg_glucose_level","bmi", "stroke"]

In [12]:
# Usamos los metodos de imputacion aprendidos!
from sklearn.impute import SimpleImputer

# Generamos el imputador iterativo - Imputacion Univariada Numerica
imp_univ_num = SimpleImputer(missing_values=np.nan, strategy='median')

# Generamos el imputador iterativo - Imputacion Univariada Categorica
imp_univ_cat = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [13]:
# Generamos los subset de variables categoricas - continuas!
data_impt_cat = desarrll[columnas_categoricas]

# Generamos los subset de variables numericas
data_impt_num = desarrll[columnas_numericas]

In [14]:
data_impt_cat.head()

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
0,Male,No,children,Rural,
1,Male,Yes,Private,Urban,never smoked
2,Female,No,Private,Urban,
3,Female,Yes,Private,Rural,formerly smoked
4,Male,No,Never_worked,Rural,


In [15]:
data_impt_num.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,3.0,0,0,95.12,18.0,0
1,58.0,1,0,87.96,39.2,0
2,8.0,0,0,110.89,17.6,0
3,70.0,0,0,69.04,35.9,0
4,14.0,0,0,161.28,19.1,0


In [16]:
# Realizamos la imputación univariada en una nueva base de datos - Variables Numericas
imp_univ_num.fit(data_impt_num)
imputed_data_univ_num = pd.DataFrame(data=imp_univ_num.transform(data_impt_num),
                             columns=data_impt_num.columns,dtype='float')

# Realizamos la imputación univariada en una nueva base de datos - Variables Categoricas
imp_univ_cat.fit(data_impt_cat)
imputed_data_univ_cat = pd.DataFrame(data=imp_univ_cat.transform(data_impt_cat),
                             columns=data_impt_cat.columns,dtype='object')

In [17]:
data_impt_num.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,3.0,0,0,95.12,18.0,0
1,58.0,1,0,87.96,39.2,0
2,8.0,0,0,110.89,17.6,0
3,70.0,0,0,69.04,35.9,0
4,14.0,0,0,161.28,19.1,0


In [18]:
data_impt_cat.head()

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
0,Male,No,children,Rural,
1,Male,Yes,Private,Urban,never smoked
2,Female,No,Private,Urban,
3,Female,Yes,Private,Rural,formerly smoked
4,Male,No,Never_worked,Rural,


In [19]:
# Consolidamos los subset
desarrll_imp = pd.concat([imputed_data_univ_cat,imputed_data_univ_num],axis=1)
desarrll_imp.head()

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,Male,No,children,Rural,never smoked,3.0,0.0,0.0,95.12,18.0,0.0
1,Male,Yes,Private,Urban,never smoked,58.0,1.0,0.0,87.96,39.2,0.0
2,Female,No,Private,Urban,never smoked,8.0,0.0,0.0,110.89,17.6,0.0
3,Female,Yes,Private,Rural,formerly smoked,70.0,0.0,0.0,69.04,35.9,0.0
4,Male,No,Never_worked,Rural,never smoked,14.0,0.0,0.0,161.28,19.1,0.0


In [20]:
# Comprobamos la completitud de los datos!
desarrll_imp.isnull().sum()

Unnamed: 0,0
gender,0
ever_married,0
work_type,0
Residence_type,0
smoking_status,0
age,0
hypertension,0
heart_disease,0
avg_glucose_level,0
bmi,0


#### 3.2. Recodificacion de los datos

In [21]:
# LabelEncoder de los datos!
from sklearn.preprocessing import LabelEncoder
# Preprocesamiento con LabelEncoderfrom
for c in columnas_categoricas:
    print(str(c))
    le = LabelEncoder()
    le.fit(desarrll_imp[str(c)])
    desarrll_imp[str(c)]=le.transform(desarrll_imp[str(c)])

gender
ever_married
work_type
Residence_type
smoking_status


####3.3. Tratamiento de outliers

In [22]:
desarrll_imp.describe(include='all',percentiles=[0.01,0.05,0.25,0.5,0.75,0.95,0.99])

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,43400.0,43400.0,43400.0,43400.0,43400.0,43400.0,43400.0,43400.0,43400.0,43400.0,43400.0
mean,0.408894,0.643733,2.185438,0.50129,0.978548,42.217894,0.093571,0.047512,104.48275,28.574551,0.018041
std,0.492151,0.478901,1.093158,0.500004,0.568678,22.519649,0.291235,0.212733,43.111751,7.639769,0.133103
min,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.0,55.0,10.1,0.0
1%,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,56.3,15.1,0.0
5%,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,61.2,17.5,0.0
25%,0.0,0.0,2.0,0.0,1.0,24.0,0.0,0.0,77.54,23.4,0.0
50%,0.0,1.0,2.0,1.0,1.0,44.0,0.0,0.0,91.58,27.7,0.0
75%,1.0,1.0,3.0,1.0,1.0,60.0,0.0,0.0,112.07,32.6,0.0
95%,1.0,1.0,4.0,1.0,2.0,78.0,1.0,0.0,212.0415,42.4,0.0


In [23]:
# Creamos una funcion para poder visualizar los percentiles
def Cuantiles(lista):
    c = [0,1,5,10,20,30,40,50,60,70,80,90,92.5,95,97.5,99,100]
    matrix = pd.concat([pd.DataFrame(c),pd.DataFrame(np.percentile(lista.dropna(),c))],axis = 1)
    matrix.columns = ["Cuantil","Valor_Cuantil"]
    return(matrix)

In [24]:
# Analizamos las variables numericas
# Variable
Cuantiles(desarrll_imp["avg_glucose_level"]).transpose()
# Nos hacemos la pregunta, podríamos acotar la variable

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Cuantil,0.0,1.0,5.0,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,92.5,95.0,97.5,99.0,100.0
Valor_Cuantil,55.0,56.3,61.2,66.18,74.35,80.37,85.626,91.58,98.24,106.37,120.29,180.952,198.9,212.0415,226.03,239.1902,291.05


In [25]:
# Trabajar con topes de la distribución!
## ApplicantIncome
cuantil_1 = np.percentile(desarrll_imp["avg_glucose_level"],1)
cuantil_97 = np.percentile(desarrll_imp["avg_glucose_level"],97.5)

# Reemplazamos el valor minimo y maximo
desarrll_imp.loc[desarrll_imp["avg_glucose_level"]<cuantil_1,"avg_glucose_level"] = cuantil_1
desarrll_imp.loc[desarrll_imp["avg_glucose_level"]>cuantil_97,"avg_glucose_level"] = cuantil_97

In [26]:
# Analizamos las variables numericas
# Variable
Cuantiles(desarrll_imp["avg_glucose_level"]).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Cuantil,0.0,1.0,5.0,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,92.5,95.0,97.5,99.0,100.0
Valor_Cuantil,56.3,56.3,61.2,66.18,74.35,80.37,85.626,91.58,98.24,106.37,120.29,180.952,198.9,212.0415,226.03,226.03,226.03


In [27]:
# Analizamos las variables numericas
# Variable
Cuantiles(desarrll_imp["bmi"]).transpose()
# Nos hacemos la pregunta, podríamos acotar la variable?

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Cuantil,0.0,1.0,5.0,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,92.5,95.0,97.5,99.0,100.0
Valor_Cuantil,10.1,15.1,17.5,19.5,22.3,24.4,26.3,27.7,29.4,31.4,34.1,38.4,40.0,42.4,46.4,51.6,97.6


In [28]:
# Trabajar con topes de la distribución!
## ApplicantIncome
cuantil_1 = np.percentile(desarrll_imp["bmi"],1)
cuantil_97 = np.percentile(desarrll_imp["bmi"],97.5)

# Reemplazamos el valor minimo y maximo
desarrll_imp.loc[desarrll_imp["bmi"]<cuantil_1,"bmi"] = cuantil_1
desarrll_imp.loc[desarrll_imp["bmi"]>cuantil_97,"bmi"] = cuantil_97

In [29]:
# Analizamos las variables numericas
# Variable
Cuantiles(desarrll_imp["bmi"]).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Cuantil,0.0,1.0,5.0,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,92.5,95.0,97.5,99.0,100.0
Valor_Cuantil,15.1,15.1,17.5,19.5,22.3,24.4,26.3,27.7,29.4,31.4,34.1,38.4,40.0,42.4,46.4,46.4,46.4


In [30]:
desarrll_imp.shape

(43400, 11)

In [31]:
desarrll_imp.head()

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,1,0,4,0,1,3.0,0.0,0.0,95.12,18.0,0.0
1,1,1,2,1,1,58.0,1.0,0.0,87.96,39.2,0.0
2,0,0,2,1,1,8.0,0.0,0.0,110.89,17.6,0.0
3,0,1,2,0,0,70.0,0.0,0.0,69.04,35.9,0.0
4,1,0,1,0,1,14.0,0.0,0.0,161.28,19.1,0.0


### 4. Ingeniería y selección de variables

#### 4.1. Ingeniería de variables

In [32]:
# 1. Grupo de edad según riesgo clínico
desarrll_imp['age_group'] = pd.cut(desarrll_imp['age'],bins=[0, 44, 64, np.inf],labels=['young', 'middle_age', 'senior'])

# 2. Categoría de IMC según OMS
desarrll_imp['bmi_category'] = pd.cut(desarrll_imp['bmi'], bins=[0, 18.5, 24.9, 29.9, np.inf], labels=['underweight', 'normal', 'overweight', 'obese'])

# 3. Categoría de glucosa según guías médicas
desarrll_imp['glucose_category'] = pd.cut(desarrll_imp['avg_glucose_level'], bins=[0, 99, 125, np.inf], labels=['normal', 'prediabetes', 'diabetes'])

# 4. Riesgo combinado de hipertensión + enfermedad cardíaca
desarrll_imp['multi_condition_risk'] = np.where((desarrll_imp['hypertension'] == 1) & (desarrll_imp['heart_disease'] == 1), 1, 0)

# 5. Riesgo metabólico (glucosa alta + sobrepeso/obesidad)
desarrll_imp['metabolic_risk'] = np.where((desarrll_imp['avg_glucose_level'] >= 100) & (desarrll_imp['bmi'] >= 25), 1, 0)

# 6. Combines hypertension & heart disease to flag patients with compounded health risks
desarrll_imp['health_risk'] = desarrll_imp.apply(lambda row: 'High' if row['hypertension'] == '1' and row['heart_disease'] == '1'
                             else 'Moderate' if row['hypertension'] == '1' or row['heart_disease'] == '1'
                             else 'Low', axis=1)
# 7. Riesgo total por presencia de alguna condición
desarrll_imp['cardio_metabolic_risk'] = np.where((desarrll_imp['hypertension'] == 1) |(desarrll_imp['heart_disease'] == 1) |(desarrll_imp['avg_glucose_level'] >= 100) |(desarrll_imp['bmi'] >= 25), 1, 0)

# 8. Interacción edad x hipertensión
desarrll_imp['age_x_hypertension'] = desarrll_imp['age'] * desarrll_imp['hypertension']

# 9. Interacción edad x glucosa
desarrll_imp['age_x_glucose'] = desarrll_imp['age'] * desarrll_imp['avg_glucose_level']

# 10. Interacción tabaquismo x IMC
desarrll_imp['smoking_x_bmi'] = np.where(desarrll_imp['smoking_status'] == '2', desarrll_imp['bmi'], 0)

# 11. Interacción tabaquismo x glucosa
desarrll_imp['smoking_x_glucose'] = np.where(desarrll_imp['smoking_status'] == '2', desarrll_imp['avg_glucose_level'], 0)
# Verificar nuevas columnas
desarrll_imp.head()

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status,age,hypertension,heart_disease,avg_glucose_level,bmi,...,bmi_category,glucose_category,multi_condition_risk,metabolic_risk,health_risk,cardio_metabolic_risk,age_x_hypertension,age_x_glucose,smoking_x_bmi,smoking_x_glucose
0,1,0,4,0,1,3.0,0.0,0.0,95.12,18.0,...,underweight,normal,0,0,Low,0,0.0,285.36,0.0,0.0
1,1,1,2,1,1,58.0,1.0,0.0,87.96,39.2,...,obese,normal,0,0,Low,1,58.0,5101.68,0.0,0.0
2,0,0,2,1,1,8.0,0.0,0.0,110.89,17.6,...,underweight,prediabetes,0,0,Low,1,0.0,887.12,0.0,0.0
3,0,1,2,0,0,70.0,0.0,0.0,69.04,35.9,...,obese,normal,0,0,Low,1,0.0,4832.8,0.0,0.0
4,1,0,1,0,1,14.0,0.0,0.0,161.28,19.1,...,normal,diabetes,0,0,Low,1,0.0,2257.92,0.0,0.0


In [33]:
desarrll_imp.shape

(43400, 22)

In [34]:
from sklearn.preprocessing import LabelEncoder
# Encode the new categorical columns
categorical_features_for_encoding = ['age_group', 'bmi_category', 'glucose_category', 'health_risk']
for col in categorical_features_for_encoding:
    if col in desarrll_imp.columns:
        le = LabelEncoder()
        # Convert to string to handle potential NaN values after creating categories
        desarrll_imp[col] = le.fit_transform(desarrll_imp[col].astype(str))

In [35]:
desarrll_imp.head()

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status,age,hypertension,heart_disease,avg_glucose_level,bmi,...,bmi_category,glucose_category,multi_condition_risk,metabolic_risk,health_risk,cardio_metabolic_risk,age_x_hypertension,age_x_glucose,smoking_x_bmi,smoking_x_glucose
0,1,0,4,0,1,3.0,0.0,0.0,95.12,18.0,...,3,1,0,0,0,0,0.0,285.36,0.0,0.0
1,1,1,2,1,1,58.0,1.0,0.0,87.96,39.2,...,1,1,0,0,0,1,58.0,5101.68,0.0,0.0
2,0,0,2,1,1,8.0,0.0,0.0,110.89,17.6,...,3,2,0,0,0,1,0.0,887.12,0.0,0.0
3,0,1,2,0,0,70.0,0.0,0.0,69.04,35.9,...,1,1,0,0,0,1,0.0,4832.8,0.0,0.0
4,1,0,1,0,1,14.0,0.0,0.0,161.28,19.1,...,0,0,0,0,0,1,0.0,2257.92,0.0,0.0


#### 4.2. Selección de variables

In [36]:
# Seleccion por Random Forest
from sklearn.ensemble import RandomForestClassifier                                  # Paso01: Instancio el algoritmo
forest = RandomForestClassifier()                                                    # Paso02: Configuro el algoritmo
forest.fit(desarrll_imp.drop('stroke',axis=1), desarrll_imp.stroke)                  # Paso03: Ajuste el algoritmo
importances = forest.feature_importances_                                            # Paso04: Importancia!

In [37]:
# Seleccion por Random Forest
TablaImportancia = pd.concat([pd.DataFrame({'Driver':list(desarrll_imp.drop('stroke',axis=1).columns)}),
                              pd.DataFrame({'Importancia':list(forest.feature_importances_)})], axis = 1)
ImportanciaVariables = TablaImportancia[['Driver','Importancia']].sort_values('Importancia', ascending = False).reset_index(drop = True)
ImportanciaVariables

Unnamed: 0,Driver,Importancia
0,age_x_glucose,0.221659
1,avg_glucose_level,0.209286
2,bmi,0.176152
3,age,0.125174
4,smoking_status,0.043301
5,work_type,0.04145
6,age_x_hypertension,0.034778
7,Residence_type,0.028894
8,gender,0.027246
9,bmi_category,0.021347


In [38]:
! pip install boruta

Collecting boruta
  Downloading Boruta-0.4.3-py3-none-any.whl.metadata (8.8 kB)
Downloading Boruta-0.4.3-py3-none-any.whl (57 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: boruta
Successfully installed boruta-0.4.3


In [39]:
# Seleccion Boruta (Permutaciones de arboles)
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

rfc = RandomForestClassifier()
boruta_selector = BorutaPy(rfc, n_estimators='auto',perc = 50,alpha = 0.05,verbose=2, random_state=2025)

x=desarrll_imp.drop('stroke',axis=1)
y=desarrll_imp.stroke.values

boruta_selector.fit(x,y)

print("==============BORUTA==============")
print (boruta_selector.n_features_)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	7
Tentative: 	3
Rejected: 	11
Iteration: 	9 / 100
Confirmed: 	7
Tentative: 	3
Rejected: 	11
Iteration: 	10 / 100
Confirmed: 	7
Tentative: 	3
Rejected: 	11
Iteration: 	11 / 100
Confirmed: 	7
Tentative: 	3
Rejected: 	11
Iteration: 	12 / 100
Confirmed: 	7
Tentative: 	3
Rejected: 	11
Iteration: 	13 / 100
Confirmed: 	7
Tentative: 	3
Rejected: 	11
Iteration: 	14 / 100
Confirmed: 	7
Tentative: 	3
Rejected: 	11
Iteration: 	15 / 100
Confirmed: 	7
Tentative: 	3
Rejected: 	11
Iteration: 	16 / 100
Confirmed: 	7
Tentative: 	2
Rejected: 	12
I

In [40]:
# Elegimos las variables mas relevantes!
variables_importantes = x.columns[boruta_selector.support_].tolist()
print(variables_importantes)

['work_type', 'smoking_status', 'age', 'avg_glucose_level', 'bmi', 'age_x_hypertension', 'age_x_glucose']


### **5. Modelamiento de Datos o Creación del Algoritmo de ML**

#### 5.1. Parsimonia de datos

In [41]:
#data completa
desarrll_imp.shape

(43400, 22)

In [42]:
#data parsimoniosa
desarrll_imp_subset = desarrll_imp[variables_importantes]
desarrll_imp_subset.shape

(43400, 7)

#### 5.2. Particion Muestral

In [43]:
# Creación de la data de train y la data de test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(desarrll_imp_subset,
                                                    desarrll_imp.stroke,
                                                    test_size = 0.30,
                                                    random_state=2025)

#### 5.2. Balanceo de datos

In [44]:
# Balanceamos la data con SMOTE para el desbalanceo de clases
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=2025)

# Aplica SMOTE solo al conjunto de entrenamiento
X_train, y_train = smote.fit_resample(X_train, y_train)

# Vemos la dimensionalidad de los datos luego del balanceo
print("Dimensionalidad de X_train despues de SMOTE:", X_train.shape)
print("Dimensionalidad de y_train despues de SMOTE:", y_train.shape)

Dimensionalidad de X_train despues de SMOTE: (59642, 7)
Dimensionalidad de y_train despues de SMOTE: (59642,)


#### 5.2. Algoritmos Machine Learning

In [45]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train,y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
from sklearn import metrics as metrics
# Paso N°04: Predecir con el algoritmo entrenado para validar
y_pred_test= lr.predict(X_test)

In [47]:
# Matriz de confusión
print("Matriz confusion: Test")
cm_test = metrics.confusion_matrix(y_test, y_pred_test)
print(cm_test)

# Accuracy
print("Accuracy: Test")
accuracy_test = metrics.accuracy_score(y_test, y_pred_test)
print(accuracy_test)

#Precision
print("Precision: Test")
precision_test = metrics.precision_score(y_test, y_pred_test, zero_division=1)
print(precision_test)

# Recall
print("Recall: Test")
recall_test = metrics.recall_score(y_test, y_pred_test)
print(recall_test)

Matriz confusion: Test
[[9492 3304]
 [  49  175]]
Accuracy: Test
0.7424731182795699
Precision: Test
0.05030181086519115
Recall: Test
0.78125


In [48]:
## Modelos Supervisados : Arbol CART
from sklearn.tree import DecisionTreeClassifier

cart = DecisionTreeClassifier(criterion='entropy',
                              max_depth=4,
                              max_features="sqrt")

cart.fit(X_train, y_train) # Entrenamos el algoritmo

In [49]:
# Predecir con el algoritmo entrenado para validar
y_pred_test= cart.predict(X_test) # Prediccion sobre el test

In [50]:
#Comparar el valor pronosticado con el valor real

from sklearn import metrics as metrics
# Matriz de confusion

print("Matriz confusion: Test")
cm_test = metrics.confusion_matrix(y_test,y_pred_test)
print(cm_test)

print("Accuracy: Test")
accuracy_test=metrics.accuracy_score(y_test,y_pred_test)
print(accuracy_test)

print("Precision: Test")
precision_test=metrics.precision_score(y_test,y_pred_test, zero_division=1)
print(precision_test)

print("Recall: Test")
recall_test=metrics.recall_score(y_test,y_pred_test)
print(recall_test)

Matriz confusion: Test
[[8886 3910]
 [  34  190]]
Accuracy: Test
0.6970814132104455
Precision: Test
0.046341463414634146
Recall: Test
0.8482142857142857


In [51]:
## Modelos Supervisados : Random Forest ##
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(criterion='entropy',
                              max_depth=4,
                              max_features="sqrt")
rf.fit(X_train, y_train) # Entrenando un algoritmo


In [52]:
# Paso N°04: Predecir con el algoritmo entrenado para validar
y_pred_test= rf.predict(X_test) # Prediccion sobre el test

In [53]:
# Paso N°05: Comparar el valor pronosticado con el valor real
from sklearn import metrics as metrics
# Matriz de confusion

print("Matriz confusion: Test")
cm_test = metrics.confusion_matrix(y_test,y_pred_test)
print(cm_test)

print("Accuracy: Test")
accuracy_test=metrics.accuracy_score(y_test,y_pred_test)
print(accuracy_test)

print("Precision: Test")
precision_test=metrics.precision_score(y_test,y_pred_test,zero_division=1)
print(precision_test)

print("Recall: Test")
recall_test=metrics.recall_score(y_test,y_pred_test)
print(recall_test)

Matriz confusion: Test
[[9424 3372]
 [  46  178]]
Accuracy: Test
0.7374807987711214
Precision: Test
0.05014084507042253
Recall: Test
0.7946428571428571


In [54]:
## Modelos Supervisados : AdaBoost ##
from sklearn.ensemble import AdaBoostClassifier # Paso01: Instancio
AdaBoost=AdaBoostClassifier(learning_rate=0.9, n_estimators=10000) # Paso02: Especifico
AdaBoost.fit(X_train, y_train) # Paso03: Entrenamiento algoritmo

In [55]:
# Paso N°04: Predecir con el algoritmo entrenado para validar
y_pred_test= AdaBoost.predict(X_test) # Prediccion sobre el test

In [56]:
# Paso N°05: Comparar el valor pronosticado con el valor real
from sklearn import metrics as metrics
# Matriz de confusion

print("Matriz confusion: Test")
cm_test = metrics.confusion_matrix(y_test,y_pred_test)
print(cm_test)

print("Accuracy: Test")
accuracy_test=metrics.accuracy_score(y_test,y_pred_test)
print(accuracy_test)

print("Precision: Test")
precision_test=metrics.precision_score(y_test,y_pred_test,zero_division=1)
print(precision_test)

print("Recall: Test")
recall_test=metrics.recall_score(y_test,y_pred_test)
print(recall_test)

Matriz confusion: Test
[[11356  1440]
 [  156    68]]
Accuracy: Test
0.8774193548387097
Precision: Test
0.04509283819628647
Recall: Test
0.30357142857142855
