In [33]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score

In [34]:
# Download the cardiovascular-disease data from OpenML. The string is the
# dataset *name* (not a numeric data id); as_frame=True returns pandas objects.
dataset = fetch_openml(name='Cardiovascular-Disease-dataset', as_frame=True)

# Feature table and target column as exposed by the returned bunch.
X, y = dataset.data, dataset.target

In [35]:
# Preview the feature frame; as the last expression of the cell it is rendered as the cell output.
X

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,18393,2,168,62.0,110.0,80.0,1,1,0,0,1
1,20228,1,156,85.0,140.0,90.0,3,1,0,0,1
2,18857,1,165,64.0,130.0,70.0,3,1,0,0,0
3,17623,2,169,82.0,150.0,100.0,1,1,0,0,1
4,17474,1,156,56.0,100.0,60.0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120.0,80.0,1,1,1,0,1
69996,22601,1,158,126.0,140.0,90.0,2,2,0,0,1
69997,19066,2,183,105.0,180.0,90.0,3,1,0,1,0
69998,22431,1,163,72.0,135.0,80.0,1,2,0,0,0


In [37]:
from sklearn.model_selection import train_test_split

# Two-stage split: first hold out 30 % of the rows, then cut that holdout in
# half, which yields roughly 70 % train / 15 % validation / 15 % test.
# Both calls use a fixed seed so the partition is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
# Manual preprocessing: scale the numeric columns, one-hot encode the
# categorical ones, and keep every other column unchanged.

# Column groups (also reused when building the ColumnTransformer).
num_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
cat_cols = ['gender', 'cholesterol', 'gluc']
# Columns in neither group are carried over untouched.  Previously they were
# silently dropped here, so this matrix had fewer features than the one the
# ColumnTransformer (remainder='passthrough') produces.
passthrough_cols = [col for col in X_train.columns if col not in num_cols + cat_cols]

# Initialize the scaler and encoder
scaler = StandardScaler()
onehot = OneHotEncoder(sparse_output=False)

# Numeric columns: statistics are learned from the training split only and
# then reused for validation/test, so no information leaks from held-out data.
X_train_num = scaler.fit_transform(X_train[num_cols])
X_val_num = scaler.transform(X_val[num_cols])
X_test_num = scaler.transform(X_test[num_cols])

# Categorical columns: categories are likewise learned from the training split.
X_train_cat = onehot.fit_transform(X_train[cat_cols])
X_val_cat = onehot.transform(X_val[cat_cols])
X_test_cat = onehot.transform(X_test[cat_cols])

# Combine: scaled numeric | one-hot categorical | untouched remainder.
# This is the same column order ColumnTransformer uses (transformers in the
# order given, remainder last).
X_train_transformed = np.hstack((X_train_num, X_train_cat, X_train[passthrough_cols].to_numpy()))
X_val_transformed = np.hstack((X_val_num, X_val_cat, X_val[passthrough_cols].to_numpy()))
X_test_transformed = np.hstack((X_test_num, X_test_cat, X_test[passthrough_cols].to_numpy()))

In [46]:
from sklearn.compose import ColumnTransformer

# Declarative form of the preprocessing: each entry is
# (step name, transformer, columns it applies to).
preprocessing_steps = [
    ('scaler', StandardScaler(), num_cols),
    ('ohe', OneHotEncoder(sparse_output=False), cat_cols),
]

# Columns not claimed by any step are kept unchanged (remainder='passthrough').
column_transformer = ColumnTransformer(
    transformers=preprocessing_steps,
    remainder='passthrough',
)

In [50]:
# Learn scaling statistics and categories from the training split only ...
X_train_transformed = column_transformer.fit_transform(X_train)

# ... then apply the already-fitted transformer to the held-out splits.
X_val_transformed, X_test_transformed = (
    column_transformer.transform(held_out) for held_out in (X_val, X_test)
)

In [51]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with a fixed seed, fitted on the
# transformed training features (fit() returns the estimator itself).
logistic_classifier = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_transformed, y_train
)

# Class predictions for both held-out splits.
y_val_pred = logistic_classifier.predict(X_val_transformed)
y_test_pred = logistic_classifier.predict(X_test_transformed)

# Accuracy = fraction of rows whose predicted label matches the true label.
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Report validation first, then test.
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Validation Accuracy: 0.7179
Test Accuracy: 0.7219
