In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import xgboost as xgb

# Load data set into pandas
df = pd.read_csv("Cleaned_dataset.csv")

# Encode two-valued Yes/No columns as 1/0. Two-valued columns that use other
# labels are left unchanged by this step.
binary_columns = [column for column in df.columns if df[column].nunique() == 2]
for col in binary_columns:
    if set(df[col].unique()) == {"Yes", "No"}:
        df[col] = df[col].map({"Yes": 1, "No": 0})

categorical_columns = ['Sex', 'GeneralHealth', 'LastCheckupTime', 'RemovedTeeth', 'HadDiabetes',
                       'SmokerStatus', 'ECigaretteUsage', 'RaceEthnicityCategory', 'AgeCategory',
                       'HighRiskLastYear', 'CovidPos']

# Extract the categorical columns into a new DataFrame
Categorical_col = df[categorical_columns]

# Use pd.get_dummies to transform the categorical columns
Transform_df = pd.get_dummies(Categorical_col, dtype=int)

# Select numeric columns
numeric_columns = df.select_dtypes(include=["float64"]).columns
df_numeric = df[numeric_columns]

# Decide which rows are train/test BEFORE computing any scaling statistics.
# The split depends only on the row count, the seed and the stratification
# labels, so these positions select the same rows a later split of (X, y)
# with the same arguments would.
train_rows, test_rows = train_test_split(
    list(range(len(df))),
    test_size=0.3,
    random_state=42,
    stratify=df["HadHeartAttack"],
)

# Fit the scaler on the training rows only so that the test rows do not
# influence the mean/variance used for standardisation (no data leakage),
# then apply that transformation to every row.
scaler = StandardScaler()
scaler.fit(df_numeric.iloc[train_rows])
scaled_numeric_df = pd.DataFrame(
    scaler.transform(df_numeric),
    columns=numeric_columns,
    index=df_numeric.index,  # keep row labels so the concat below aligns
)

# Drop the original columns
df = df.drop(columns=categorical_columns + numeric_columns.tolist())

# Combine the original DataFrame and the dummy variables DataFrame
merged_df = pd.concat([df, Transform_df, scaled_numeric_df], axis=1)
merged_df = merged_df.drop(columns='State')


# Split the data into features and target variable
X = merged_df.drop(columns="HadHeartAttack")
y = merged_df["HadHeartAttack"]

# Materialise the training and testing sets from the positions chosen above
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Hyperparameter tuning for Random Forest
# NOTE: 'auto' is deliberately not listed for max_features. For
# RandomForestClassifier it was only an alias of 'sqrt', was deprecated in
# scikit-learn 1.1 and is rejected from 1.3 on, so keeping it would either
# duplicate the 'sqrt' candidates or make half of the fits fail.
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Exhaustive search with 3-fold cross-validation on the training split,
# using all available cores.
rf_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), 
                              param_grid=rf_param_grid, 
                              cv=3, 
                              n_jobs=-1, 
                              verbose=2)

rf_grid_search.fit(X_train, y_train)
# best_estimator_ is the winning configuration refitted on the data passed to fit()
best_rf = rf_grid_search.best_estimator_

# Evaluate the tuned Random Forest model
y_pred_test_rf = best_rf.predict(X_test)
print("Tuned Random Forest Test Set Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred_test_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_test_rf))

Fitting 3 folds for each of 324 candidates, totalling 972 fits


In [None]:
# Grid search over Gradient Boosting hyperparameters
gb_param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    subsample=[0.8, 0.9, 1.0],
)

# 3-fold cross-validated exhaustive search, parallelised over all cores
gb_grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    gb_param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
# fit() returns the search object itself, so the refitted winner can be read directly
best_gb = gb_grid_search.fit(X_train, y_train).best_estimator_

# Score the tuned Gradient Boosting model on the held-out test split
y_pred_test_gb = best_gb.predict(X_test)
print("Tuned Gradient Boosting Test Set Evaluation:")
for heading, value in (
    ("Accuracy:", accuracy_score(y_test, y_pred_test_gb)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test_gb)),
    ("Classification Report:\n", classification_report(y_test, y_pred_test_gb)),
):
    print(heading, value)

In [None]:
# Bar chart of test-set accuracy for the two tuned models
labels = ['Random Forest', 'Gradient Boosting']
accuracies = [
    accuracy_score(y_test, predictions)
    for predictions in (y_pred_test_rf, y_pred_test_gb)
]

fig, ax = plt.subplots()
ax.bar(labels, accuracies, color=['orange', 'green'])
ax.set_xlabel('Models')
ax.set_ylabel('Accuracy')
ax.set_title('Comparison of Model Accuracy on Test Set')
# Accuracy is a proportion, so pin the axis to the full 0-1 range
ax.set_ylim(0, 1)
plt.show()



In [None]:
# Plot the comparison of precision, recall, and f1-score
labels = ['Precision', 'Recall', 'F1-score']


def _class_scores(y_pred, pos_label):
    """Return [precision, recall, f1] on the test set, treating `pos_label` as the positive class."""
    return [
        metric(y_test, y_pred, pos_label=pos_label)
        for metric in (precision_score, recall_score, f1_score)
    ]


class_0_scores_rf = _class_scores(y_pred_test_rf, 0)
class_1_scores_rf = _class_scores(y_pred_test_rf, 1)
class_0_scores_gb = _class_scores(y_pred_test_gb, 0)
class_1_scores_gb = _class_scores(y_pred_test_gb, 1)

x = list(range(len(labels)))
# Four bars share each tick and ticks are 1.0 apart, so the bar width must
# stay below 0.25; otherwise the last bar of one metric group is drawn on top
# of the first bar of the next group.
width = 0.2

fig, ax = plt.subplots()
rects1 = ax.bar([i - width for i in x], class_0_scores_rf, width, label='Class 0 - RF')
rects2 = ax.bar(x, class_1_scores_rf, width, label='Class 1 - RF')
rects3 = ax.bar([i + width for i in x], class_0_scores_gb, width, label='Class 0 - GB')
rects4 = ax.bar([i + 2*width for i in x], class_1_scores_gb, width, label='Class 1 - GB')

ax.set_xlabel('Metrics')
ax.set_ylabel('Scores')
ax.set_title('Comparison of Precision, Recall, and F1-score by Class and Model')
# Bar centres sit at -w, 0, +w and +2w around each tick position, so the
# middle of every group is offset by w/2.
ax.set_xticks([i + width/2 for i in x])
ax.set_xticklabels(labels)
ax.legend()

plt.show()