In [1]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
import numpy as np


In [18]:
# Sample Data
columns = ["age", "gender", "height", "weight", "ap_hi", "ap_lo", "cholesterol", "gluc", "smoke", "alco", "active"]
data = [
    [18393, 2, 168, 62.0, 110.0, 80.0, 1, 1, 0, 0, 1],
    [20228, 1, 156, 85.0, 140.0, 90.0, 3, 1, 0, 0, 1],
    [np.nan, 1, 165, 64.0, 130.0, 70.0, 3, 1, 0, 0, 0],
    [17623, 2, 169, 82.0, 150.0, np.nan, 1, 1, 0, 0, 1],
    [17474, 1, 156, 56.0, 100.0, 60.0, 1, 1, 0, 0, 0],
    [np.nan, np.nan, 172, 75.0, 120.0, 80.0, 2, 1, 0, 0, 1],
    [18050, 1, 160, 70.0, 110.0, 70.0, 1, 1, 0, 0, 1],
    [18500, 2, 175, 85.0, 130.0, 85.0, 2, 1, 0, 0, 0],
    [20000, 1, np.nan, 67.0, 140.0, 90.0, 3, 1, 0, 0, 1],
    [19500, 2, 168, 77.0, 125.0, np.nan, 2, 1, 0, 0, 1],
    [18800, 1, 162, 65.0, 115.0, 75.0, 1, 1, 0, 0, 0],
    [19200, 2, 170, 80.0, 130.0, 85.0, 2, 1, 0, 0, 1],
    [20150, 1, 164, 69.0, 140.0, 90.0, 3, 1, 0, 0, 1],
    [19600, 2, 172, 76.0, 125.0, 80.0, 2, 1, 0, 0, 1],
    [18950, 1, 158, 62.0, 110.0, 70.0, 1, 1, 0, 0, 0],
    [19100, 2, 169, 74.0, 120.0, 78.0, 2, 1, 0, 0, 1],
]
# Generate target variable
y = pd.Series([0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1], name='cardio')


df = pd.DataFrame(data, columns=columns)

In [19]:
df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,18393.0,2.0,168.0,62.0,110.0,80.0,1,1,0,0,1
1,20228.0,1.0,156.0,85.0,140.0,90.0,3,1,0,0,1
2,,1.0,165.0,64.0,130.0,70.0,3,1,0,0,0
3,17623.0,2.0,169.0,82.0,150.0,,1,1,0,0,1
4,17474.0,1.0,156.0,56.0,100.0,60.0,1,1,0,0,0
5,,,172.0,75.0,120.0,80.0,2,1,0,0,1
6,18050.0,1.0,160.0,70.0,110.0,70.0,1,1,0,0,1
7,18500.0,2.0,175.0,85.0,130.0,85.0,2,1,0,0,0
8,20000.0,1.0,,67.0,140.0,90.0,3,1,0,0,1
9,19500.0,2.0,168.0,77.0,125.0,,2,1,0,0,1


In [25]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Select categorical columns
categorical_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(df[categorical_cols], y, test_size=0.3, random_state=42)

# Impute missing values
imputer = SimpleImputer(strategy='most_frequent')
X_train_cat_imputed = imputer.fit_transform(X_train_cat)
X_test_cat_imputed = imputer.transform(X_test_cat)

# One-hot encode categorical variables
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_cat_encoded = encoder.fit_transform(X_train_cat_imputed)
X_test_cat_encoded = encoder.transform(X_test_cat_imputed)

# Train the logistic regression model
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_cat_encoded, y_train_cat)

# Predict and evaluate
y_pred_cat = classifier.predict(X_test_cat_encoded)
accuracy = accuracy_score(y_test_cat, y_pred_cat)
print(f'Accuracy: {accuracy}')

Accuracy: 0.6


In [None]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline( steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('classifier', LogisticRegression(max_iter=1000))
])

pipeline.fit(X_train_cat, y_train_cat)

pipeline.predict(X_test_cat)
accuracy = accuracy_score(y_test_cat, y_pred_cat)
print(f'Accuracy: {accuracy}')

Accuracy: 0.6


In [32]:
dataset = fetch_openml('Cardiovascular-Disease-dataset', as_frame=True)  # OpenML ID for Cardiovascular Disease dataset
X = dataset.data
y = dataset.target

In [33]:
X

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,18393,2,168,62.0,110.0,80.0,1,1,0,0,1
1,20228,1,156,85.0,140.0,90.0,3,1,0,0,1
2,18857,1,165,64.0,130.0,70.0,3,1,0,0,0
3,17623,2,169,82.0,150.0,100.0,1,1,0,0,1
4,17474,1,156,56.0,100.0,60.0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120.0,80.0,1,1,1,0,1
69996,22601,1,158,126.0,140.0,90.0,2,2,0,0,1
69997,19066,2,183,105.0,180.0,90.0,3,1,0,1,0
69998,22431,1,163,72.0,135.0,80.0,1,2,0,0,0


In [34]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [38]:
numerical_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
categorical_cols = ['gender', 'cholesterol', 'gluc']

In [42]:
# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Define the model pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [49]:
preprocessor

In [None]:
model_pipeline.fit(X_train, y_train)

model_pipeline.predict(X_val)

accuracy = accuracy_score(y_val, model_pipeline.predict(X_val))
print(f'Accuracy: {accuracy}')

Accuracy: 0.7173333333333334


In [45]:
model_pipeline

In [47]:
from sklearn.model_selection import GridSearchCV
# param grid
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__C': [0.1, 1.0, 10],
    'classifier__penalty': ['l1', 'l2']
}

# Define the model pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Define the grid search
grid = GridSearchCV(model_pipeline, param_grid=param_grid, cv=5, verbose=3)

grid.fit(X_train, y_train)

# Get the best parameters
print(grid.best_params_)
# Get the best score
print(grid.best_score_)

# Predict and evaluate
y_pred = grid.best_estimator_.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END classifier__C=0.1, classifier__penalty=l1, preprocessor__num__imputer__strategy=mean;, score=nan total time=   0.1s
[CV 2/5] END classifier__C=0.1, classifier__penalty=l1, preprocessor__num__imputer__strategy=mean;, score=nan total time=   0.1s
[CV 3/5] END classifier__C=0.1, classifier__penalty=l1, preprocessor__num__imputer__strategy=mean;, score=nan total time=   0.1s
[CV 4/5] END classifier__C=0.1, classifier__penalty=l1, preprocessor__num__imputer__strategy=mean;, score=nan total time=   0.1s
[CV 5/5] END classifier__C=0.1, classifier__penalty=l1, preprocessor__num__imputer__strategy=mean;, score=nan total time=   0.0s
[CV 1/5] END classifier__C=0.1, classifier__penalty=l1, preprocessor__num__imputer__strategy=median;, score=nan total time=   0.1s
[CV 2/5] END classifier__C=0.1, classifier__penalty=l1, preprocessor__num__imputer__strategy=median;, score=nan total time=   0.1s
[CV 3/5] END classifier__C=0.1, 

30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\ProgramData\Anaconda3\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimato

{'classifier__C': 10, 'classifier__penalty': 'l2', 'preprocessor__num__imputer__strategy': 'mean'}
0.7147755102040817
Accuracy: 0.7206666666666667


In [48]:
from sklearn.ensemble import RandomForestClassifier

# Define the model pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Define the parameter grid
param_grid = {
    'classifier__n_estimators': [100, 300],
    'classifier__max_depth': [20, None]
}

# Define the grid search
grid = GridSearchCV(model_pipeline, param_grid=param_grid, cv=3, verbose=2)
grid.fit(X_train, y_train)

# Get the best parameters
print(grid.best_params_)
# Get the best score
print(grid.best_score_)
# Predict and evaluate
y_pred = grid.best_estimator_.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END classifier__max_depth=20, classifier__n_estimators=100; total time=   7.7s
[CV] END classifier__max_depth=20, classifier__n_estimators=100; total time=   7.2s
[CV] END classifier__max_depth=20, classifier__n_estimators=100; total time=   6.3s
[CV] END classifier__max_depth=20, classifier__n_estimators=300; total time=  19.1s
[CV] END classifier__max_depth=20, classifier__n_estimators=300; total time=  22.0s
[CV] END classifier__max_depth=20, classifier__n_estimators=300; total time=  21.1s
[CV] END classifier__max_depth=None, classifier__n_estimators=100; total time=   8.4s
[CV] END classifier__max_depth=None, classifier__n_estimators=100; total time=   8.1s
[CV] END classifier__max_depth=None, classifier__n_estimators=100; total time=   7.5s
[CV] END classifier__max_depth=None, classifier__n_estimators=300; total time=  24.3s
[CV] END classifier__max_depth=None, classifier__n_estimators=300; total time=  25.5s
[CV] E