In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
heart_data = pd.read_csv('../data/raw/heart.csv')

# Define features and target
X = heart_data.drop(columns='HeartDisease')
Y = heart_data['HeartDisease']

# Split the data
np.random.seed(67)
X_train, X_test, Y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=67, stratify=Y)

# Define the numeric and categorical features
numeric_features = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
string_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), string_features)
    ]
)

# Create the logistic regression pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train the model
model.fit(X_train, Y_train)

# Make predictions on the test set
Y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, Y_pred)
precision = precision_score(y_test, Y_pred)
recall = recall_score(y_test, Y_pred)
f1 = f1_score(y_test, Y_pred)
roc_auc = roc_auc_score(y_test, Y_pred)

# Display results
print("Model Performance Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, Y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


Model Performance Metrics:
Accuracy: 0.86
Precision: 0.90
Recall: 0.84
F1 Score: 0.87
ROC-AUC: 0.86

Confusion Matrix:
[[72 10]
 [16 86]]


In [3]:
# Load the dataset
heart_data = pd.read_csv('../data/raw/heart.csv')

# Define features and target
X = heart_data.drop(columns='HeartDisease')
Y = heart_data['HeartDisease']

# Split the data
np.random.seed(67)
X_train, X_test, Y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=67, stratify=Y)

# Define the numeric and categorical features
numeric_features = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
string_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[('num', Pipeline([('power_transformer', PowerTransformer()),
                                    ('scalar_transformer',StandardScaler())]), numeric_features) ,
                                    ('cat', OneHotEncoder(handle_unknown='ignore'), string_features)
                 ])



# Create the logistic regression pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train the model
model.fit(X_train, Y_train)

# Make predictions on the test set
Y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, Y_pred)
precision = precision_score(y_test, Y_pred)
recall = recall_score(y_test, Y_pred)
f1 = f1_score(y_test, Y_pred)
roc_auc = roc_auc_score(y_test, Y_pred)

# Display results
print("Model Performance Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC-AUC: {roc_auc:.2f}")

# Display confusion matrix
conf_matrix = confusion_matrix(y_test, Y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


Model Performance Metrics:
Accuracy: 0.86
Precision: 0.90
Recall: 0.85
F1 Score: 0.87
ROC-AUC: 0.87

Confusion Matrix:
[[72 10]
 [15 87]]


---

The regularization parameter, CC, controls the trade-off between fitting the training data well and keeping the model simple (to prevent overfitting). Lower values of CC apply stronger regularization.

In [4]:
# Use a different C value in Logistic Regression
model.set_params(classifier__C=0.1)
model.fit(X_train, Y_train)


- Use Cross-Validation with Grid Search

Using cross-validation and grid search will allow you to systematically test different parameter values and select the best combination. 

In [5]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['lbfgs', 'liblinear'],
    'classifier__penalty': ['l2'],
}

# Create GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, Y_train)

# Print the best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# Use the best model
best_model = grid_search.best_estimator_
Y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, Y_pred)
print("Test Accuracy with Best Model:", accuracy)


Best Parameters: {'classifier__C': 1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Best Cross-Validation Accuracy: 0.8637312459230267
Test Accuracy with Best Model: 0.8641304347826086


3. Add Polynomial Features

In [6]:
from sklearn.preprocessing import PolynomialFeatures

# Add polynomial features for numeric data
poly_transformer = ColumnTransformer(
    transformers=[
        ('num', Pipeline([('scaler', StandardScaler()), ('poly', PolynomialFeatures(degree=2))]), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), string_features)
    ]
)

# Update the pipeline with polynomial features
model = Pipeline(steps=[
    ('preprocessor', poly_transformer),
    ('classifier', LogisticRegression(max_iter=1000, C=0.1))
])

model.fit(X_train, Y_train)


- Try Different Solvers

In [7]:
# Try different solver
model.set_params(classifier__solver='liblinear')
model.fit(X_train, Y_train)


- Adjust Class Weights for Imbalanced Data

In [8]:
model.set_params(classifier__class_weight='balanced')
model.fit(X_train, Y_train)


- Evaluate on Different Metrics and Tune Threshold

In [9]:
# Predict probabilities instead of labels
Y_pred_proba = model.predict_proba(X_test)[:, 1]

# Adjust threshold
threshold = 0.4  # Adjust based on model needs
Y_pred_new = (Y_pred_proba >= threshold).astype(int)

# Evaluate with new threshold
new_accuracy = accuracy_score(y_test, Y_pred_new)
print("Accuracy with threshold 0.4:", new_accuracy)


Accuracy with threshold 0.4: 0.8858695652173914


- **References**

https://www.v7labs.com/blog/logistic-regression