In [16]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

heartAttackPrediction_India = pd.read_csv('../Codigo/heart_attack_prediction_india.csv')
heartAttackPrediction_India.fillna(0, inplace=True)
heartAttackPrediction_India.head()

Unnamed: 0,Patient_ID,State_Name,Age,Gender,Diabetes,Hypertension,Obesity,Smoking,Alcohol_Consumption,Physical_Activity,...,Diastolic_BP,Air_Pollution_Exposure,Family_History,Stress_Level,Healthcare_Access,Heart_Attack_History,Emergency_Response_Time,Annual_Income,Health_Insurance,Heart_Attack_Risk
0,1,Rajasthan,42,Female,0,0,1,1,0,0,...,119,1,0,4,0,0,157,611025,0,0
1,2,Himachal Pradesh,26,Male,0,0,0,0,1,1,...,115,0,0,7,0,0,331,174527,0,0
2,3,Assam,78,Male,0,0,1,0,0,1,...,117,0,1,10,1,0,186,1760112,1,0
3,4,Odisha,58,Male,1,0,1,0,0,1,...,65,0,0,1,1,1,324,1398213,0,0
4,5,Karnataka,22,Male,0,0,0,0,0,1,...,109,0,0,9,0,0,209,97987,0,1


In [7]:
# Variables categóricas binarias (0/1)
binary_vars = [
    'Diabetes', 'Hypertension', 'Obesity', 'Smoking', 'Alcohol_Consumption',
    'Physical_Activity', 'Air_Pollution_Exposure', 'Family_History',
    'Healthcare_Access', 'Heart_Attack_History', 'Health_Insurance', 'Heart_Attack_Risk'
]

# Variables numéricas continuas
numeric_vars = [
    'Age', 'Diet_Score', 'Cholesterol_Level', 'Triglyceride_Level', 'LDL_Level',
    'HDL_Level', 'Systolic_BP', 'Diastolic_BP', 'Stress_Level',
    'Emergency_Response_Time', 'Annual_Income'
]

# Recuento de valores nulos por columna
heartAttackPrediction_India.isnull().sum()

Patient_ID                 0
State_Name                 0
Age                        0
Gender                     0
Diabetes                   0
Hypertension               0
Obesity                    0
Smoking                    0
Alcohol_Consumption        0
Physical_Activity          0
Diet_Score                 0
Cholesterol_Level          0
Triglyceride_Level         0
LDL_Level                  0
HDL_Level                  0
Systolic_BP                0
Diastolic_BP               0
Air_Pollution_Exposure     0
Family_History             0
Stress_Level               0
Healthcare_Access          0
Heart_Attack_History       0
Emergency_Response_Time    0
Annual_Income              0
Health_Insurance           0
Heart_Attack_Risk          0
dtype: int64

In [8]:
# Statistical summary
statistical_summary = heartAttackPrediction_India.drop(columns=['Patient_ID','Diabetes','Hypertension','Obesity','Smoking',
                                                                'Alcohol_Consumption','Physical_Activity','Heart_Attack_Risk',
                                                                'Family_History','Air_Pollution_Exposure','Health_Insurance',
                                                                'Heart_Attack_History','Healthcare_Access']).describe()
statistical_summary

Unnamed: 0,Age,Diet_Score,Cholesterol_Level,Triglyceride_Level,LDL_Level,HDL_Level,Systolic_BP,Diastolic_BP,Stress_Level,Emergency_Response_Time,Annual_Income
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,49.3949,5.0217,224.753,174.7333,123.8721,49.3355,134.7259,89.312,5.5188,206.3834,1022062.0
std,17.280301,3.156394,43.359172,71.163447,43.410766,17.399897,25.849077,17.396486,2.866264,112.391711,560597.8
min,20.0,0.0,150.0,50.0,50.0,20.0,90.0,60.0,1.0,10.0,50353.0
25%,35.0,2.0,187.0,114.0,86.0,34.0,112.0,74.0,3.0,110.0,535783.8
50%,49.0,5.0,226.0,174.0,124.0,49.0,135.0,89.0,6.0,206.0,1021383.0
75%,64.0,8.0,262.0,236.0,161.0,65.0,157.0,104.0,8.0,304.0,1501670.0
max,79.0,10.0,299.0,299.0,199.0,79.0,179.0,119.0,10.0,399.0,1999714.0


In [9]:
# Number of Heart attack risk
heartAttackPrediction_India['Heart_Attack_Risk'].value_counts()

Heart_Attack_Risk
0    6993
1    3007
Name: count, dtype: int64

In [None]:
x = heartAttackPrediction_India.drop(columns=['Heart_Attack_Risk','Patient_ID'])
y = heartAttackPrediction_India['Heart_Attack_Risk']

xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=2)

print(xtrain.shape)
print(xtest.shape)

(8000, 24)
(2000, 24)


In [11]:
ct = ColumnTransformer(transformers=[('onehot',OneHotEncoder(drop='first'),['State_Name','Gender']),
                                     ('normal',StandardScaler(),['Diastolic_BP','Annual_Income','Emergency_Response_Time',
                                                                 'Systolic_BP','Cholesterol_Level', 'Triglyceride_Level', 
                                                                 'LDL_Level','HDL_Level'])],remainder='passthrough')

In [12]:
xtrain1 = ct.fit_transform(xtrain)
xtest1 = ct.transform(xtest)

In [13]:
print(xtrain1.shape)
print(xtest1.shape)

(8000, 50)
(2000, 50)


In [14]:
lr = LogisticRegression()
lr.fit(xtrain1,ytrain)
ypred = lr.predict(xtest1)

print('accuracy_score',accuracy_score(ytest,ypred))
print('Confusion_matrix',confusion_matrix(ytest,ypred))

accuracy_score 0.7175
Confusion_matrix [[1435    0]
 [ 565    0]]


In [15]:
sv = SVC(kernel = 'rbf')
sv.fit(xtrain1,ytrain)
ypred = sv.predict(xtest1)
print('accuracy_score',accuracy_score(ytest,ypred))
print('Confusion_matrix',confusion_matrix(ytest,ypred))

accuracy_score 0.7175
Confusion_matrix [[1435    0]
 [ 565    0]]


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Separación X/y
X = heartAttackPrediction_India.drop(columns=['Heart_Attack_Risk', 'Patient_ID'])
y = heartAttackPrediction_India['Heart_Attack_Risk']

# Train/test split
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Columnas
numeric_features = ['Diastolic_BP', 'Annual_Income', 'Emergency_Response_Time',
                    'Systolic_BP', 'Cholesterol_Level', 'Triglyceride_Level', 
                    'LDL_Level', 'HDL_Level']
categorical_features = ['State_Name', 'Gender']

# Preprocesador
preprocessor = ColumnTransformer([
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
    ('scale', StandardScaler(), numeric_features)
], remainder='passthrough')

# Pipeline completo
pipe = ImbPipeline(steps=[
    ('preprocessing', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression())
])

# Búsqueda de hiperparámetros
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga'],
    'classifier__class_weight': [None, 'balanced'],
    'classifier__max_iter': [100, 300, 500]
}

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
grid.fit(xtrain, ytrain)

# Mejor modelo
best_model = grid.best_estimator_
ypred = best_model.predict(xtest)
yproba = best_model.predict_proba(xtest)[:, 1]

# Resultados
print(f"✅ Mejor configuración: {grid.best_params_}")
print(f"🔍 AUC: {roc_auc_score(ytest, yproba):.4f}")
print(f"📋 Reporte de clasificación:\n{classification_report(ytest, ypred)}")
print(f"📊 Matriz de confusión:\n{confusion_matrix(ytest, ypred)}")

Fitting 5 folds for each of 120 candidates, totalling 600 fits
✅ Mejor configuración: {'classifier__C': 0.01, 'classifier__class_weight': 'balanced', 'classifier__max_iter': 300, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
🔍 AUC: 0.4955
📋 Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.70      0.49      0.57      1399
           1       0.30      0.52      0.38       601

    accuracy                           0.50      2000
   macro avg       0.50      0.50      0.48      2000
weighted avg       0.58      0.50      0.52      2000

📊 Matriz de confusión:
[[679 720]
 [288 313]]
