In [1]:
import os
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer, StandardScaler
from xgboost import XGBClassifier

In [6]:
# Directory holding train.csv / test.csv. Defaults to the original author's
# checkout; set the HEART_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "HEART_DATA_DIR",
        r"C:\Users\hi\Desktop\Github Repositories\heart-disease-analysis-prediction\Data",
    )
)

train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")


In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence


In [7]:
TARGET_COL = 'Heart Disease'
LABEL_TO_INT = {'Presence': 1, 'Absence': 0}

# Features: every column except `id` and the target.
X = train.drop(columns=['id', TARGET_COL])

# Target: encode the string label as 1 (Presence) / 0 (Absence).
y = train[TARGET_COL].map(LABEL_TO_INT)

# Series.map yields NaN for any value missing from the mapping; fail here
# instead of silently passing NaN targets on to the model.
unmapped = train.loc[y.isna(), TARGET_COL].unique()
if len(unmapped):
    raise ValueError(
        f"Unexpected values in '{TARGET_COL}': {unmapped.tolist()}"
    )

In [12]:
# Display the encoded target (1 = 'Presence', 0 = 'Absence') to check the mapping.
y

0         1
1         0
2         0
3         0
4         1
         ..
629995    0
629996    0
629997    1
629998    1
629999    0
Name: Heart Disease, Length: 630000, dtype: int64

In [14]:
# Columns that get a Yeo-Johnson power transform instead of plain scaling.
skewed_col = ['Cholesterol','Max HR','ST depression','Age','BP']

# Every remaining feature, in original column order. Each column is routed to
# exactly one transformer; scaling all of X.columns as well as power-
# transforming the skewed ones would emit the skewed features twice.
other_col = [col for col in X.columns if col not in skewed_col]

preprocessing = ColumnTransformer(
    transformers=[
        ('num_col', StandardScaler(), other_col),
        # PowerTransformer standardizes its output by default (standardize=True),
        # so these columns need no separate StandardScaler pass.
        ('skew', PowerTransformer(method='yeo-johnson'), skewed_col),
    ]
)

In [15]:
# Base learners for the stacking ensemble, as (name, estimator) pairs.
# All three use the same seed so repeated runs are comparable.
base_model = [
    ('rf', RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=42)),
    ('lr', LogisticRegression(random_state=42, max_iter=1000)),
    (
        'xgbc',
        XGBClassifier(
            n_estimators=1000,
            max_depth=10,
            learning_rate=0.001,
            random_state=42,
        ),
    ),
]

In [16]:
# Meta-learner fitted on the base models' cross-validated predictions.
# Seeded with the same value the seeded base estimators use.
meta_model = XGBClassifier(
    n_estimators=1000,
    max_depth=10,
    learning_rate=0.001,
    random_state=42,
)

stack = StackingClassifier(
    estimators=base_model,
    final_estimator=meta_model,
    cv=5,       # 5-fold CV produces the out-of-fold inputs for the meta-learner
    n_jobs=-1,  # use all available cores
)

In [17]:
# End-to-end model: preprocess -> oversample with SMOTE -> stacked classifier.
# This is the imblearn Pipeline, which is what allows a sampler step.
# NOTE(review): SMOTE runs before the 'clsf' step is fitted, so the stacker's
# internal cv=5 splits appear to operate on already-oversampled data -- confirm
# this is intended, since synthetic points may straddle those folds.
model = Pipeline([
    ('preprocessing', preprocessing),
    ('smote', SMOTE(random_state=42)),
    ('clsf', stack),
])

In [None]:
# Fit the whole pipeline (preprocessing, SMOTE, stacked classifier) on the
# full training features and encoded target.
model.fit(X, y)