In [3]:
# Import necessary libraries
import pandas as pd
import joblib  # For saving and loading models
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the cleaned dataset
df = pd.read_csv('Heart_Disease_Prediction_Cleaned.csv')

# Split into features (X) and target (y)
X = df.drop(columns=['Heart Disease'])  # Features
y = df['Heart Disease']  # Target

# Split into train and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for SVM)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Model Implementation and Hyperparameter Tuning ---
# Create a Support Vector Machine classifier
svm = SVC(probability=True, class_weight='balanced')

# Define the hyperparameter grid for GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Types of kernels
    'gamma': ['scale', 'auto'],  # Kernel coefficient
    'degree': [3, 4, 5]  # Degree of polynomial kernel (if applicable)
}

# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')  # 5-fold cross-validation
grid_search.fit(X_train_scaled, y_train)

# Best model after hyperparameter tuning
best_model = grid_search.best_estimator_

# --- Model Evaluation ---
# Evaluate on training set
y_train_pred = best_model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Train Set Accuracy: {train_accuracy:.4f}")

# Evaluate on test set
y_test_pred = best_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# Print classification report for the test set
print("Classification Report (Test Set):")
print(classification_report(y_test, y_test_pred))

# Confusion Matrix
print("Confusion Matrix (Test Set):")
print(confusion_matrix(y_test, y_test_pred))

# --- Final Model Performance ---
print(f"Best Model Parameters from GridSearchCV: {grid_search.best_params_}")
print(f"Best Model Accuracy: {grid_search.best_score_:.4f}")

# --- Save the model using joblib ---
# Save the best trained model and scaler
joblib.dump(best_model, 'svm_heart_disease_model.joblib')  # Save model
joblib.dump(scaler, 'scaler.joblib')  # Save scaler (for future predictions)

print("Model and scaler saved successfully!")


Train Set Accuracy: 0.8426
Test Set Accuracy: 0.9074
Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.89      0.97      0.93        33
           1       0.94      0.81      0.87        21

    accuracy                           0.91        54
   macro avg       0.92      0.89      0.90        54
weighted avg       0.91      0.91      0.91        54

Confusion Matrix (Test Set):
[[32  1]
 [ 4 17]]
Best Model Parameters from GridSearchCV: {'C': 0.1, 'degree': 3, 'gamma': 'scale', 'kernel': 'sigmoid'}
Best Model Accuracy: 0.8289
Model and scaler saved successfully!
