In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Read the dataset from disk and replace every missing value with 0.
heartAttackPrediction = pd.read_csv(
    '../Codigo/heart_attack_prediction_dataset.csv'
).fillna(0)

# Show the first rows of the loaded frame.
heartAttackPrediction.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
heartAttackPrediction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Patient ID                       8763 non-null   object 
 1   Age                              8763 non-null   int64  
 2   Sex                              8763 non-null   object 
 3   Cholesterol                      8763 non-null   int64  
 4   Blood Pressure                   8763 non-null   object 
 5   Heart Rate                       8763 non-null   int64  
 6   Diabetes                         8763 non-null   int64  
 7   Family History                   8763 non-null   int64  
 8   Smoking                          8763 non-null   int64  
 9   Obesity                          8763 non-null   int64  
 10  Alcohol Consumption              8763 non-null   int64  
 11  Exercise Hours Per Week          8763 non-null   float64
 12  Diet                

In [2]:
# Class balance of the target: number of rows for each 'Heart Attack Risk' value.
heartAttackPrediction['Heart Attack Risk'].value_counts()

Heart Attack Risk
0    5624
1    3139
Name: count, dtype: int64

In [12]:
# Create 'Systolic BP' and 'Diastolic BP' from the "systolic/diastolic" string
# stored in 'Blood Pressure', and place them right after that column.
bp_columns = ['Systolic BP', 'Diastolic BP']
heartAttackPrediction[bp_columns] = (
    heartAttackPrediction['Blood Pressure'].str.split('/', expand=True).astype(int)
)

# Build the new column order from the columns that are NOT the derived BP
# columns, then insert the BP columns after 'Blood Pressure'.
# The previous slice `cols[bp_index + 1:-2]` assumed the two new columns had
# just been appended at the end of the frame. When the cell is re-run they
# already sit next to 'Blood Pressure', so that slice dropped the last two
# real columns (including the 'Heart Attack Risk' target) and duplicated the
# BP columns. Filtering by name makes the cell safe to run any number of times.
cols = [c for c in heartAttackPrediction.columns if c not in bp_columns]
bp_index = cols.index('Blood Pressure')
cols = cols[:bp_index + 1] + bp_columns + cols[bp_index + 1:]
heartAttackPrediction = heartAttackPrediction[cols]

heartAttackPrediction.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Systolic BP,Diastolic BP,Heart Rate,Diabetes,Family History,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,158,88,72,0,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,165,93,98,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,174,99,72,1,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,163,100,73,1,1,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,91,88,93,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [20]:
# Target vector.
y = heartAttackPrediction['Heart Attack Risk']

# Feature matrix: every column except the target, the patient identifier and
# the raw 'Blood Pressure' string.
x = heartAttackPrediction.drop(
    ['Heart Attack Risk', 'Patient ID', 'Blood Pressure'],
    axis=1,
)

# Hold out 20% of the rows for testing, with a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y,
    test_size=0.2,
    random_state=2,
)

# Report the shape of each feature split.
for part in (xtrain, xtest):
    print(part.shape)

(7010, 25)
(1753, 25)


In [21]:
# Columns that are one-hot encoded (first level of each dropped).
onehot_columns = ['Sex', 'Diet', 'Country', 'Continent', 'Hemisphere']

# Columns that are standardised with StandardScaler.
scaled_columns = [
    'Age',
    'Cholesterol',
    'Heart Rate',
    'Exercise Hours Per Week',
    'Sedentary Hours Per Day',
    'Income',
    'BMI',
    'Triglycerides',
    'Physical Activity Days Per Week',
    'Sleep Hours Per Day',
    'Stress Level',
    'Systolic BP',
    'Diastolic BP',
]

# Any column not listed above is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first'), onehot_columns),
        ('scaler', StandardScaler(), scaled_columns),
    ],
    remainder='passthrough',
)

In [22]:
# Fit the transformer on the training split, then apply the fitted transformer
# to the test split (the left element is evaluated first, so the fit happens
# before the test transform).
xtrain1, xtest1 = ct.fit_transform(xtrain), ct.transform(xtest)

In [23]:
# Report the shape of each transformed split.
for transformed in (xtrain1, xtest1):
    print(transformed.shape)

(7010, 48)
(1753, 48)


In [24]:
# Logistic regression with default settings, fitted on the transformed
# training split (fit returns the estimator itself).
lr = LogisticRegression().fit(xtrain1, ytrain)
ypred = lr.predict(xtest1)

# Score the predictions on the held-out split.
lr_accuracy = accuracy_score(ytest, ypred)
lr_confusion = confusion_matrix(ytest, ypred)

print('accuracy_score', lr_accuracy)
print('Confusion_matrix', lr_confusion)

accuracy_score 0.6451796919566457
Confusion_matrix [[1131    0]
 [ 622    0]]


In [25]:
# Support vector classifier with an RBF kernel, fitted on the transformed
# training split (fit returns the estimator itself).
sv = SVC(kernel='rbf').fit(xtrain1, ytrain)
ypred = sv.predict(xtest1)

# Score the predictions on the held-out split.
sv_accuracy = accuracy_score(ytest, ypred)
sv_confusion = confusion_matrix(ytest, ypred)

print('accuracy_score', sv_accuracy)
print('Confusion_matrix', sv_confusion)

accuracy_score 0.6451796919566457
Confusion_matrix [[1131    0]
 [ 622    0]]


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Separate features (X) from the target (y); the patient identifier is not
# used as a feature.
non_feature_columns = ['Heart_Attack_Risk', 'Patient_ID']
X = heartAttackPrediction_India.drop(columns=non_feature_columns)
y = heartAttackPrediction_India['Heart_Attack_Risk']

# Train/test split: 20% held out, stratified on the target, fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Columns to standardise.
numeric_features = [
    'Diastolic_BP',
    'Annual_Income',
    'Emergency_Response_Time',
    'Systolic_BP',
    'Cholesterol_Level',
    'Triglyceride_Level',
    'LDL_Level',
    'HDL_Level',
]

# Columns to one-hot encode.
categorical_features = ['State_Name', 'Gender']

# Preprocessor: one-hot encode the categorical columns, scale the numeric
# ones, and pass every other column through unchanged.
preprocessing_steps = [
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
    ('scale', StandardScaler(), numeric_features),
]
preprocessor = ColumnTransformer(preprocessing_steps, remainder='passthrough')

# Full pipeline: preprocessing -> SMOTE oversampling -> logistic regression.
pipe = ImbPipeline(steps=[
    ('preprocessing', preprocessor),
    ('smote', SMOTE(random_state=42)),
    # The grid below uses the 'liblinear' and 'saga' solvers, which shuffle the
    # data using random_state. Without a seed the search results change from
    # run to run, so the classifier is seeded with the same value as SMOTE.
    ('classifier', LogisticRegression(random_state=42))
])

# Hyperparameter search space for the classifier step
# (5 * 2 * 2 * 2 * 3 = 120 candidate configurations).
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga'],
    'classifier__class_weight': [None, 'balanced'],
    'classifier__max_iter': [100, 300, 500]
}

# 5-fold cross-validated grid search scored by ROC AUC, using all CPU cores.
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
grid.fit(xtrain, ytrain)

# Best model found by the grid search, refitted by GridSearchCV.
best_model = grid.best_estimator_

# Hard predictions and positive-class probabilities on the held-out split.
ypred = best_model.predict(xtest)
yproba = best_model.predict_proba(xtest)[:, 1]

# Compute the evaluation metrics once, then print them.
test_auc = roc_auc_score(ytest, yproba)
test_report = classification_report(ytest, ypred)
test_confusion = confusion_matrix(ytest, ypred)

# Results
print(f"✅ Mejor configuración: {grid.best_params_}")
print(f"🔍 AUC: {test_auc:.4f}")
print(f"📋 Reporte de clasificación:\n{test_report}")
print(f"📊 Matriz de confusión:\n{test_confusion}")

Fitting 5 folds for each of 120 candidates, totalling 600 fits
✅ Mejor configuración: {'classifier__C': 0.01, 'classifier__class_weight': 'balanced', 'classifier__max_iter': 300, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
🔍 AUC: 0.4955
📋 Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.70      0.49      0.57      1399
           1       0.30      0.52      0.38       601

    accuracy                           0.50      2000
   macro avg       0.50      0.50      0.48      2000
weighted avg       0.58      0.50      0.52      2000

📊 Matriz de confusión:
[[679 720]
 [288 313]]
