In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
df = pd.read_csv('Heart_Disease_Prediction.csv')

# --- Verify target column before encoding ---
print("Target column before encoding:")
print(df['Heart Disease'].value_counts())

# Correctly map target variable 'Presence' -> 1 and 'Absence' -> 0
df['Heart Disease'] = df['Heart Disease'].map({'Presence': 1, 'Absence': 0})

# Verify target column after encoding
print("Target column after encoding:")
print(df['Heart Disease'].value_counts())

# --- Data Preprocessing ---
# Handling missing values, outliers, encoding features, and scaling
# Your preprocessing steps should go here...

# Split into features (X) and target (y)
X = df.drop(columns=['Heart Disease'])
y = df['Heart Disease']

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (important for SVM)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Training (using SVM and hyperparameter tuning)
svm = SVC(probability=True, class_weight='balanced')
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': [3, 4, 5]
}

grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Best model from GridSearchCV
best_model = grid_search.best_estimator_

# --- Model Evaluation ---
# Check target distribution before and after resampling (if resampling is applied)
print("Target distribution before resampling:")
print(y.value_counts())  # Print the original distribution

# Check prediction probabilities before evaluating
y_pred_prob = best_model.predict_proba(X_test_scaled)
print("Prediction probabilities (first 5):")
print(y_pred_prob[:5])  # First 5 prediction probabilities for the test set

# Evaluate the model on the training set
y_train_pred = best_model.predict(X_train_scaled)
print("Train Set Accuracy:")
print(accuracy_score(y_train, y_train_pred))

# Evaluate the model on the test set
y_test_pred = best_model.predict(X_test_scaled)
print("Test Set Accuracy:")
print(accuracy_score(y_test, y_test_pred))

# Check classification report for the test set
print("Classification Report:")
print(classification_report(y_test, y_test_pred))

# Final model evaluation
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

# --- Save Preprocessed Data ---
# Save the cleaned dataset (if needed for future use)
df.to_csv('Heart_Disease_Prediction_Cleaned.csv', index=False)
print("Preprocessed data saved as 'Heart_Disease_Prediction_Cleaned.csv'.")



Target column before encoding:
Heart Disease
Absence     150
Presence    120
Name: count, dtype: int64
Target column after encoding:
Heart Disease
0    150
1    120
Name: count, dtype: int64
Target distribution before resampling:
Heart Disease
0    150
1    120
Name: count, dtype: int64
Prediction probabilities (first 5):
[[0.35267537 0.64732463]
 [0.46754257 0.53245743]
 [0.94606448 0.05393552]
 [0.92191865 0.07808135]
 [0.75578131 0.24421869]]
Train Set Accuracy:
0.8425925925925926
Test Set Accuracy:
0.9074074074074074
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.97      0.93        33
           1       0.94      0.81      0.87        21

    accuracy                           0.91        54
   macro avg       0.92      0.89      0.90        54
weighted avg       0.91      0.91      0.91        54

Model Accuracy: 0.9074
Confusion Matrix:
[[32  1]
 [ 4 17]]
Preprocessed data saved as 'Heart_Disease_Prediction_Cleaned.cs