In [None]:
#Import machine learning libraries including scikit-learn, numpy, pandas, matplotlib, seaborn, and xgboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as sk
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split

## Exploratory data analysis - Missing values, data types, and descriptive stats


In [None]:
# Load the heart-disease dataset from the CSV file.
df = pd.read_csv('UCI_Heart_Disease_Dataset_Combined.csv')

# Lower-case every column name first so the mapping below can match.
df.columns = df.columns.str.lower()

# Run-together column names -> snake_case equivalents, applied in one pass.
snake_case_names = {
    'chestpaintype': 'chest_pain_type',
    'restingecg': 'resting_ecg',
    'fastingbs': 'fasting_bs',
    'maxhr': 'max_hr',
    'exerciseangina': 'exercise_angina',
    'oldpeak': 'old_peak',
    'heartdisease': 'heart_disease',
    'restingbp': 'resting_bp',
}
df.rename(columns=snake_case_names, inplace=True)

# First five rows, then summary statistics.
print(df.head())
print(df.describe())

# Final normalisation: lower-case again and turn any spaces into underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')


In [None]:
# Basic exploratory checks on the loaded frame.

# Number of missing values in each column
missing_counts = df.isnull().sum()
print(missing_counts)

# dtype of each column
column_types = df.dtypes
print(column_types)

# How many rows fall into each value of the target
target_counts = df['heart_disease'].value_counts()
print(target_counts)

In [None]:
# Histograms of the frame's columns: 20 bins each on a 20x15 figure.
histogram_options = {'bins': 20, 'figsize': (20, 15)}
df.hist(**histogram_options)
plt.show()

In [None]:
"""
This code generates a boxplot for the variables age, resting_bp, cholesterol, max_hr, and old_peak
with respect to the target variable heart_disease. The boxplot provides a visual representation of the
distribution of these variables for different values of heart_disease.

Parameters:
    - x: The target variable heart_disease.
    - y: The variables age, resting_bp, cholesterol, max_hr, and old_peak.
    - data: The dataframe containing the data.


Returns:
    None
"""

plt.figure(figsize=(20,15))
plt.subplot(2,3,1)
sns.boxplot(x='heart_disease', y='age', data=df)
plt.subplot(2,3,2)
sns.boxplot(x='heart_disease', y='resting_bp', data=df)
plt.subplot(2,3,3)
sns.boxplot(x='heart_disease', y='cholesterol', data=df)
plt.subplot(2,3,4)
sns.boxplot(x='heart_disease', y='max_hr', data=df)
plt.subplot(2,3,5)
sns.boxplot(x='heart_disease', y='old_peak', data=df)
plt.show()

In [None]:
# One bar plot per feature below, each on its own figure, with the feature on
# the x axis and heart_disease on the y axis.
barplot_features = ['sex', 'chest_pain_type', 'fasting_bs', 'resting_ecg', 'exercise_angina']
for feature in barplot_features:
    plt.figure(figsize=(10,5))
    sns.barplot(x=feature, y='heart_disease', data=df)

In [None]:
# Pairwise correlation matrix of the frame, drawn as an annotated heat map.
corr = df.corr()
heatmap_options = {'annot': True, 'cmap': 'coolwarm'}
plt.figure(figsize=(15,10))
sns.heatmap(corr, **heatmap_options)
plt.show()

In [None]:
# Show the (renamed) column labels.
column_names = df.columns
print(column_names)

## One of the important measures of accuracy will be the false positive rate. We write a function to calculate the false positive rate and a function to cross-validate it.

In [None]:
# Error-rate helpers for binary targets encoded as 0 (negative) / 1 (positive).
def calculate_fnr(y_true, y_pred):
    """Return the false negative rate, FN / (FN + TP).

    Parameters:
        y_true: array-like of true labels, encoded 0/1.
        y_pred: array-like of predicted labels, same length as ``y_true``.

    Returns:
        The fraction of actual positives that were not predicted positive,
        or NaN when ``y_true`` contains no positives (the rate is undefined).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    actual_positive = y_true == 1
    fn = np.count_nonzero(actual_positive & (y_pred != 1))
    tp = np.count_nonzero(actual_positive & (y_pred == 1))
    if fn + tp == 0:
        return float('nan')
    return fn / (fn + tp)


def calculate_fpr(y_true, y_pred):
    """Return the false positive rate, FP / (FP + TN).

    NOTE(review): the later cells call ``calculate_fpr`` but no definition is
    present in the notebook; this one is reconstructed from those call sites.

    Parameters:
        y_true: array-like of true labels, encoded 0/1.
        y_pred: array-like of predicted labels, same length as ``y_true``.

    Returns:
        The fraction of actual negatives that were predicted positive, or NaN
        when ``y_true`` contains no negatives.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    actual_negative = y_true == 0
    fp = np.count_nonzero(actual_negative & (y_pred != 0))
    tn = np.count_nonzero(actual_negative & (y_pred == 0))
    if fp + tn == 0:
        return float('nan')
    return fp / (fp + tn)


def calculate_mean_fpr(model, X, y, cv=5):
    """Return the false positive rate averaged over cross-validation folds.

    NOTE(review): reconstructed from the call sites
    ``calculate_mean_fpr(model, X_train, y_train, 5)`` — confirm that a
    per-fold mean is the intended definition.

    Parameters:
        model: unfitted or fitted scikit-learn estimator (it is cloned per fold).
        X: feature matrix.
        y: 0/1 target vector.
        cv: number of folds, or any splitter accepted by ``cross_val_score``.
    """
    # Imported here so the helper works even if the cell that imports these
    # names has not been run yet.
    from sklearn.metrics import make_scorer
    from sklearn.model_selection import cross_val_score

    fold_fpr = cross_val_score(model, X, y, cv=cv, scoring=make_scorer(calculate_fpr))
    return fold_fpr.mean()

## Running the models

### We want to run all the models on the same training data and finally test them using a test set that is not used in training. We split the data and keep the train set the same throughout.

In [None]:
# Table of cross-validated scores; each model later adds one column holding
# [accuracy, RMSE, false positive rate] in the order listed under "Metric".
results = {"Metric": ["Accuracy", "RMSE", "False Positive Rate"]}

# Separate the target from the features, then hold out 40% of the rows as the
# test set (fixed seed so the split is reproducible).
from sklearn.model_selection import train_test_split
y = df['heart_disease']
X = df.drop(columns='heart_disease')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

def calculate_fnr(y_true, y_pred):
    """Return the false negative rate, FN / (FN + TP), for 0/1 labels.

    The denominator is the number of actual positives (false negatives plus
    true positives); true negatives play no part in this rate.

    Parameters:
        y_true: array-like of true labels, encoded 0/1.
        y_pred: array-like of predicted labels, same length as ``y_true``.

    Returns:
        The fraction of actual positives that were not predicted positive,
        or NaN when ``y_true`` contains no positives.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    actual_positive = y_true == 1
    fn = np.count_nonzero(actual_positive & (y_pred != 1))
    tp = np.count_nonzero(actual_positive & (y_pred == 1))
    if fn + tp == 0:
        return float('nan')
    fnr = fn / (fn + tp)
    return fnr

# Wrap calculate_fnr as a scikit-learn scorer; greater_is_better=False marks
# it as a loss, so lower false negative rates rank as better.
fnr_scorer = make_scorer(score_func=calculate_fnr, greater_is_better=False)

### 1. Baseline Models - Linear Regression and Naive regression

In [None]:
# Run the naive model only on training data, print the accuracy, RMSE, and false negative rate. do not store anything in the dictionary, do that later after cross validation


In [None]:
# Linear regression baseline: fit on the training split, report the in-sample
# error, then cross-validate. Accuracy and false positive rate are recorded as
# NaN because they are not computed for this model.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score
linear = LinearRegression()
linear.fit(X_train, y_train)
# Turn the continuous regression output into 0/1 labels: round, then clip any
# value that landed outside the [0, 1] range.
y_pred = linear.predict(X_train)
y_pred = np.round(y_pred)
y_pred = np.clip(y_pred, 0, 1)
print('Linear Regression Model')
# NOTE: mean_squared_error returns the MSE (no square root is taken); the
# 'RMSE' label is kept so this row stays comparable with the other models.
print('RMSE:', mean_squared_error(y_train, y_pred))

# Cross-validate. The scorer returns negated MSE, hence the sign flip.
acc_linear = np.nan
rmse_linear = cross_val_score(linear, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print('Cross Validation RMSE:', -rmse_linear.mean())
fp_linear = np.nan

# Store the fold average (a scalar), matching what every other model stores.
results["Linear Regression"] = [acc_linear, -rmse_linear.mean(), fp_linear]

# make combination of polynomial transformation of the features to the 2nd degree and run all those combinations through the linear regression model

## Random forest model

In [None]:
# Random forest: fit on the training split, report in-sample and
# cross-validated metrics, then inspect the feature importances.
# NOTE: 'RMSE' values come from mean_squared_error / neg_mean_squared_error,
# i.e. they are mean squared errors without a square root.
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_train)
print('Random Forest Model')
print('Accuracy:', accuracy_score(y_train, y_pred))
print('RMSE:', mean_squared_error(y_train, y_pred))
print('False Positive Rate:', calculate_fpr(y_train, y_pred))

# 5-fold cross-validation on the training split.
acc_random_forest = cross_val_score(random_forest, X_train, y_train, cv=5, scoring='accuracy')
print('Cross Validation Accuracy:', acc_random_forest.mean())
rmse_random_forest = cross_val_score(random_forest, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print('Cross Validation RMSE:', -rmse_random_forest.mean())
fp_random_forest = calculate_mean_fpr(random_forest, X_train, y_train, 5)
print('Cross Validation False Positive Rate:', fp_random_forest)

# Store the cross-validated scores.
results["Random Forest"] = [acc_random_forest.mean(), -rmse_random_forest.mean(), fp_random_forest]

# Rank the features by importance, most important first, showing the column
# name rather than its positional index.
importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print(f"{rank}. {X_train.columns[idx]} ({importances[idx]:f})")

# Plot the feature importances of the random forest model
plt.figure(figsize=(20,10))
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.show()

# Feature-engineering experiment: refit without the least important features.
# The reduced frames and the refitted model get their own names so that
# X_train / X_test stay identical for every model (the linear model fitted
# earlier still expects the full column set at test time) and `random_forest`
# remains the model whose cross-validated scores were stored above.
low_importance_features = ['fasting_bs', 'resting_ecg']
X_train_reduced = X_train.drop(columns=low_importance_features)
X_test_reduced = X_test.drop(columns=low_importance_features)  # kept for evaluating the reduced model later

random_forest_reduced = RandomForestClassifier(random_state=42)
random_forest_reduced.fit(X_train_reduced, y_train)
y_pred = random_forest_reduced.predict(X_train_reduced)
print('Random Forest Model (reduced features)')
print('Accuracy:', accuracy_score(y_train, y_pred))
print('RMSE:', mean_squared_error(y_train, y_pred))
print('False Positive Rate:', calculate_fpr(y_train, y_pred))


## Gradient Boosting Model

In [None]:
# Gradient boosting: in-sample metrics first, then 5-fold cross-validation on
# the training split; only the cross-validated scores go into `results`.
from sklearn.ensemble import GradientBoostingClassifier

gradient_boosting = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
y_pred = gradient_boosting.predict(X_train)
print('Gradient Boosting Model')
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy:', train_accuracy)
train_error = mean_squared_error(y_train, y_pred)
print('RMSE:', train_error)
train_fpr = calculate_fpr(y_train, y_pred)
print('False Positive Rate:', train_fpr)

# Cross-validated accuracy, error (scorer is negated, so flip the sign) and
# false positive rate.
acc_gradient_boosting = cross_val_score(gradient_boosting, X_train, y_train, scoring='accuracy', cv=5)
cv_accuracy = acc_gradient_boosting.mean()
print('Cross Validation Accuracy:', cv_accuracy)
rmse_gradient_boosting = cross_val_score(gradient_boosting, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
cv_error = -rmse_gradient_boosting.mean()
print('Cross Validation RMSE:', cv_error)
fp_gradient_boosting = calculate_mean_fpr(gradient_boosting, X_train, y_train, 5)
print('Cross Validation False Positive Rate:', fp_gradient_boosting)

# Record this model's column and show the table collected so far.
results["Gradient Boosting"] = [cv_accuracy, cv_error, fp_gradient_boosting]
results_df = pd.DataFrame(results)
print(results_df)

## XGBoost Model

In [None]:
# XGBoost: in-sample metrics first, then 5-fold cross-validation on the
# training split; only the cross-validated scores go into `results`.
from xgboost import XGBClassifier

xgboost = XGBClassifier(random_state=42)
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_train)
print('XGBoost Model')
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy:', train_accuracy)
train_error = mean_squared_error(y_train, y_pred)
print('RMSE:', train_error)
train_fpr = calculate_fpr(y_train, y_pred)
print('False Positive Rate:', train_fpr)

# Cross-validated accuracy, error (negated scorer) and false positive rate.
acc_xgboost = cross_val_score(xgboost, X_train, y_train, scoring='accuracy', cv=5)
cv_accuracy = acc_xgboost.mean()
print('Cross Validation Accuracy:', cv_accuracy)
rmse_xgboost = cross_val_score(xgboost, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
cv_error = -rmse_xgboost.mean()
print('Cross Validation RMSE:', cv_error)
fp_xgboost = calculate_mean_fpr(xgboost, X_train, y_train, 5)
print('Cross Validation False Positive Rate:', fp_xgboost)

# Record this model's column, then show everything collected so far.
results['XGBoost'] = [cv_accuracy, cv_error, fp_xgboost]
print(results)

## Support Vector Machine (SVM) model

In [None]:
# Support vector machine: in-sample metrics first, then 5-fold
# cross-validation on the training split.
# NOTE: 'RMSE' values come from mean_squared_error / neg_mean_squared_error,
# i.e. they are mean squared errors without a square root.
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_train)
print('Support Vector Machine Model')
print('Accuracy:', accuracy_score(y_train, y_pred))
print('RMSE:', mean_squared_error(y_train, y_pred))
tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train, y_pred).ravel()

# False positive rate on the training data: FP / (FP + TN)
fpr_train = fp_train / (fp_train + tn_train)
print('FPR:', fpr_train)

# Cross-validated accuracy and error.
acc_svm = cross_val_score(svm, X_train, y_train, cv=5, scoring='accuracy')
print('Cross Validation Accuracy:', acc_svm.mean())
rmse_svm = cross_val_score(svm, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print('Cross Validation RMSE:', -rmse_svm.mean())

# Cross-validated false positive rate, computed from the out-of-fold
# predictions as FP / (FP + TN). (1 - ROC AUC is not a false positive rate.)
y_pred_cv = cross_val_predict(svm, X_train, y_train, cv=5)
tn_cv, fp_cv, fn_cv, tp_cv = confusion_matrix(y_train, y_pred_cv).ravel()
fp_svm = fp_cv / (fp_cv + tn_cv)
print('Cross Validation False Positive Rate:', fp_svm)

# Record this model's column, then show everything collected so far.
results['Support Vector Machine'] = [acc_svm.mean(), -rmse_svm.mean(), fp_svm]
print(results)

## Decision Tree Model

In [None]:
# Decision tree: in-sample metrics first, then 5-fold cross-validation on the
# training split.
# NOTE: 'RMSE' values come from mean_squared_error / neg_mean_squared_error,
# i.e. they are mean squared errors without a square root.
from sklearn.model_selection import cross_val_predict
from sklearn.tree import DecisionTreeClassifier
# Seeded like the other tree-based models so reruns give the same tree.
decision = DecisionTreeClassifier(random_state=42)
decision.fit(X_train, y_train)
y_pred = decision.predict(X_train)
print('Decision Tree Model')
print('Accuracy:', accuracy_score(y_train, y_pred))
print('RMSE:', mean_squared_error(y_train, y_pred))
tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train, y_pred).ravel()

# False positive rate on the training data: FP / (FP + TN)
fpr_train = fp_train / (fp_train + tn_train)
print('FPR:', fpr_train)

# Cross-validated accuracy and error.
acc_decision = cross_val_score(decision, X_train, y_train, cv=5, scoring='accuracy')
print('Cross Validation Accuracy:', acc_decision.mean())
rmse_decision = cross_val_score(decision, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print('Cross Validation RMSE:', -rmse_decision.mean())

# Cross-validated false positive rate, computed from the out-of-fold
# predictions as FP / (FP + TN). (1 - ROC AUC is not a false positive rate.)
y_pred_cv = cross_val_predict(decision, X_train, y_train, cv=5)
tn_cv, fp_cv, fn_cv, tp_cv = confusion_matrix(y_train, y_pred_cv).ravel()
fp_decision = fp_cv / (fp_cv + tn_cv)
print('Cross Validation False Positive Rate:', fp_decision)

# Record this model's column, then show everything collected so far.
results['Decision Tree'] = [acc_decision.mean(), -rmse_decision.mean(), fp_decision]
print(results)

## KNN Model

In [None]:
# KNN: sweep the neighbour count from 1 to 20, plot the mean 5-fold accuracy
# for each, then evaluate the chosen setting.
from sklearn.neighbors import KNeighborsClassifier

neighbor_counts = range(1, 21)
accs = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=5, scoring='accuracy'
    ).mean()
    for k in neighbor_counts
]
plt.plot(neighbor_counts, accs)
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.title('Number of Neighbors vs Accuracy')
plt.show()

# 7 neighbours was read off the plot as the best setting; fit that model and
# report its in-sample metrics.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
y_pred = knn.predict(X_train)
print('KNN Model')
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy:', train_accuracy)
train_error = mean_squared_error(y_train, y_pred)
print('RMSE:', train_error)
train_fpr = calculate_fpr(y_train, y_pred)
print('False Positive Rate:', train_fpr)

# Cross-validated accuracy, error (negated scorer) and false positive rate.
acc_knn = cross_val_score(knn, X_train, y_train, scoring='accuracy', cv=5)
cv_accuracy = acc_knn.mean()
print('Cross Validation Accuracy:', cv_accuracy)
rmse_knn = cross_val_score(knn, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
cv_error = -rmse_knn.mean()
print('Cross Validation RMSE:', cv_error)
fp_knn = calculate_mean_fpr(knn, X_train, y_train, 5)
print('Cross Validation False Positive Rate:', fp_knn)

# Record this model's column, then show everything collected so far.
results['KNN'] = [cv_accuracy, cv_error, fp_knn]
print(results)

## Logistic Regression

In [None]:
# Logistic regression (max_iter raised to 1000): in-sample metrics first,
# then 5-fold cross-validation on the training split.
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = logistic.predict(X_train)
print('Logistic Regression Model')
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy:', train_accuracy)
train_error = mean_squared_error(y_train, y_pred)
print('RMSE:', train_error)
train_fpr = calculate_fpr(y_train, y_pred)
print('False Positive Rate:', train_fpr)

# Cross-validated accuracy, error (negated scorer) and false positive rate.
acc_logistic = cross_val_score(logistic, X_train, y_train, scoring='accuracy', cv=5)
cv_accuracy = acc_logistic.mean()
print('Cross Validation Accuracy:', cv_accuracy)
rmse_logistic = cross_val_score(logistic, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
cv_error = -rmse_logistic.mean()
print('Cross Validation RMSE:', cv_error)
fp_logistic = calculate_mean_fpr(logistic, X_train, y_train, 5)
print('Cross Validation False Positive Rate:', fp_logistic)

# Record this model's column, then show everything collected so far.
results['Logistic Regression'] = [cv_accuracy, cv_error, fp_logistic]
print(results)

# Neural Network Model

In [None]:
# Neural network (MLP): in-sample metrics first, then 5-fold cross-validation
# on the training split.
from sklearn.neural_network import MLPClassifier

neural = MLPClassifier(random_state=42).fit(X_train, y_train)
y_pred = neural.predict(X_train)
print('Neural Network Model')
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy:', train_accuracy)
train_error = mean_squared_error(y_train, y_pred)
print('RMSE:', train_error)
train_fpr = calculate_fpr(y_train, y_pred)
print('False Positive Rate:', train_fpr)

# Cross-validated accuracy, error (negated scorer) and false positive rate.
acc_neural = cross_val_score(neural, X_train, y_train, scoring='accuracy', cv=5)
cv_accuracy = acc_neural.mean()
print('Cross Validation Accuracy:', cv_accuracy)
rmse_neural = cross_val_score(neural, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
cv_error = -rmse_neural.mean()
print('Cross Validation RMSE:', cv_error)
fp_neural = calculate_mean_fpr(neural, X_train, y_train, 5)
print('Cross Validation False Positive Rate:', fp_neural)

# Record this model's column, then show everything collected so far.
results['Neural Network'] = [cv_accuracy, cv_error, fp_neural]
print(results)

## Naive Bayes Model

In [None]:
# Gaussian naive Bayes: in-sample metrics first, then 5-fold cross-validation
# on the training split.
from sklearn.naive_bayes import GaussianNB

naive_bayes = GaussianNB().fit(X_train, y_train)
y_pred = naive_bayes.predict(X_train)
print('Naive Bayes Model')
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy:', train_accuracy)
train_error = mean_squared_error(y_train, y_pred)
print('RMSE:', train_error)
train_fpr = calculate_fpr(y_train, y_pred)
print('False Positive Rate:', train_fpr)

# Cross-validated accuracy, error (negated scorer) and false positive rate.
acc_naive_bayes = cross_val_score(naive_bayes, X_train, y_train, scoring='accuracy', cv=5)
cv_accuracy = acc_naive_bayes.mean()
print('Cross Validation Accuracy:', cv_accuracy)
rmse_naive_bayes = cross_val_score(naive_bayes, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
cv_error = -rmse_naive_bayes.mean()
print('Cross Validation RMSE:', cv_error)
fp_naive_bayes = calculate_mean_fpr(naive_bayes, X_train, y_train, 5)
print('Cross Validation False Positive Rate:', fp_naive_bayes)

# Record this model's column, then show everything collected so far.
results['Naive Bayes'] = [cv_accuracy, cv_error, fp_naive_bayes]
print(results)

## Plotting the metrics based on the train data

In [None]:
# Build the comparison table: one row per model, one column per metric.
# 'Metric' becomes the index before transposing, and any model with a missing
# score (NaN) is dropped.
results_df = pd.DataFrame(results).set_index('Metric').T.dropna()


In [None]:
# Bar chart of the cross-validated accuracy of each model.
# Cast to float so the plot and the labels work on a numeric column.
accuracy_scores = results_df['Accuracy'].astype(float)

plt.figure(figsize=(12, 6))
sns.barplot(x=results_df.index, y=accuracy_scores)
plt.xticks(rotation=45, ha='right')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Accuracy of Different Models')
plt.tight_layout()

# Label each bar with its value. The Series is indexed by model name, so walk
# it by position with enumerate instead of integer-key lookups.
for position, value in enumerate(accuracy_scores):
    plt.text(position, value, round(value, 2), ha='center', va='bottom')

plt.show()

In [None]:
# Bar chart of the cross-validated error of each model.
# Cast to float so the plot and the labels work on a numeric column.
rmse_scores = results_df['RMSE'].astype(float)

plt.figure(figsize=(12, 6))
sns.barplot(x=results_df.index, y=rmse_scores)
plt.xticks(rotation=45, ha='right')
plt.xlabel('Model')
plt.ylabel('RMSE')
plt.title('RMSE of Different Models')
plt.tight_layout()

# Label each bar with its value. The Series is indexed by model name, so walk
# it by position with enumerate instead of integer-key lookups.
for position, value in enumerate(rmse_scores):
    plt.text(position, value, round(value, 2), ha='center', va='bottom')

plt.show()


In [None]:
# Bar chart of the cross-validated false positive rate of each model.
# Cast to float so the plot and the labels work on a numeric column.
fpr_scores = results_df['False Positive Rate'].astype(float)

plt.figure(figsize=(12, 6))
sns.barplot(x=results_df.index, y=fpr_scores)
plt.xticks(rotation=45, ha='right')
plt.xlabel('Model')
plt.ylabel('False Positive Rate')
plt.title('False Positive Rate of Different Models')
plt.tight_layout()

# Label each bar with its value. The Series is indexed by model name, so walk
# it by position with enumerate instead of integer-key lookups.
for position, value in enumerate(fpr_scores):
    plt.text(position, value, round(value, 2), ha='center', va='bottom')

plt.show()


In [None]:
# Show the model-by-metric comparison table.
comparison_table = results_df
print(comparison_table)

## Running the models on the test data.

### The test data has been left untouched until now. Running the numbers again on the test data will validate our findings about which model is the best.

In [None]:
# Evaluate every fitted model on the held-out test split and collect the
# scores in `results_test`, one entry per model holding
# [accuracy, RMSE, false positive rate].
# NOTE: 'RMSE' values come from mean_squared_error, i.e. they are mean squared
# errors without a square root.
results_test = {"Metric": ["Accuracy", "RMSE", "False Positive Rate"]}

# NOTE(review): `naive` is not fitted in any of the cells shown above — confirm
# the naive baseline exists before running this cell.
fitted_models = [
    ("Naive", naive),
    ("Linear Regression", linear),
    ("Random Forest", random_forest),
    ("Gradient Boosting", gradient_boosting),
    ("XGBoost", xgboost),
    ("Support Vector Machine", svm),
    ("Decision Tree", decision),
    ("KNN", knn),
    ("Logistic Regression", logistic),
    ("Neural Network", neural),
    ("Naive Bayes", naive_bayes),
]

for model_name, model in fitted_models:
    y_pred = model.predict(X_test)
    print(model_name + ' Model')

    if model is linear:
        # The regression output is continuous: round and clip it to 0/1.
        # Only the error is reported for this model; accuracy and false
        # positive rate are recorded as NaN.
        y_pred = np.clip(np.round(y_pred), 0, 1)
        test_error = mean_squared_error(y_test, y_pred)
        print('RMSE:', test_error)
        results_test[model_name] = [np.nan, test_error, np.nan]
        continue

    # Each metric is computed once and reused for both the printout and the
    # stored result.
    test_accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy:', test_accuracy)
    test_error = mean_squared_error(y_test, y_pred)
    print('RMSE:', test_error)
    test_fpr = calculate_fpr(y_test, y_pred)
    print('False Positive Rate:', test_fpr)
    results_test[model_name] = [test_accuracy, test_error, test_fpr]

# print the results dictionary
print(results_test)

## Plotting the test data results

In [None]:
# Build the test-set comparison table: one row per model, one column per
# metric. 'Metric' becomes the index before transposing, and any model with a
# missing score (NaN) is dropped.
results_test_df = pd.DataFrame(results_test).set_index('Metric').T.dropna()

In [None]:
# One bar chart per metric for the test-set results, each bar labelled with
# its value rounded to two decimals.
for metric in ['Accuracy', 'RMSE', 'False Positive Rate']:
    # Cast to float so the plot and the labels work on a numeric column.
    scores = results_test_df[metric].astype(float)

    plt.figure(figsize=(12, 6))
    sns.barplot(x=results_test_df.index, y=scores)
    plt.xticks(rotation=45, ha='right')
    plt.xlabel('Model')
    plt.ylabel(metric)
    plt.title(f'{metric} of Different Models on Test Data')
    plt.tight_layout()

    # The Series is indexed by model name, so walk it by position with
    # enumerate instead of integer-key lookups.
    for position, value in enumerate(scores):
        plt.text(position, value, round(value, 2), ha='center', va='bottom')

    plt.show()

## Do these results generalise on other datasets?

In [None]:
# Load the second dataset used to check whether the findings carry over.
df2 = pd.read_csv('Heart_Disease_Prediction.csv')

# First rows of the frame
preview = df2.head()
print(preview)

# Distinct values of the 'Heart Disease' column
target_values = df2['Heart Disease'].unique()
print(target_values)

# dtype of each column
column_types = df2.dtypes
print(column_types)




In [None]:
# Encode the 'Heart Disease' target as integers: 'Presence' -> 1, 'Absence' -> 0.
# The labels are mapped explicitly and cast to int so the column ends up with
# an integer dtype instead of relying on `replace` to downcast an object column.
target_codes = {'Presence': 1, 'Absence': 0}
target_labels = df2['Heart Disease'].astype(str)
encoded_target = target_labels.map(target_codes)

# Any label outside the mapping becomes NaN; fail loudly rather than pass
# unencoded values on to the models.
if encoded_target.isna().any():
    unexpected_labels = sorted(set(target_labels) - set(target_codes))
    raise ValueError(f"Unexpected 'Heart Disease' labels: {unexpected_labels}")

df2['Heart Disease'] = encoded_target.astype(int)

# head of the dataset
print(df2.head())


In [None]:
# Repeat the tree-ensemble experiments on the second dataset.

# Scores for the second dataset, one entry per model in the order listed
# under "Metric".
results_new = {"Metric": ["Accuracy", "RMSE", "False Positive Rate"]}

# Separate the target from the features and hold out 40% of the rows.
y = df2['Heart Disease']
X = df2.drop(columns='Heart Disease')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# --- Random forest ---
random_forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = random_forest.predict(X_train)
print('Random Forest Model')
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy:', train_accuracy)
train_error = mean_squared_error(y_train, y_pred)
print('RMSE:', train_error)
train_fpr = calculate_fpr(y_train, y_pred)
print('False Positive Rate:', train_fpr)

# Cross-validated accuracy, error (negated scorer) and false positive rate.
acc_random_forest = cross_val_score(random_forest, X_train, y_train, scoring='accuracy', cv=5)
cv_accuracy = acc_random_forest.mean()
print('Cross Validation Accuracy:', cv_accuracy)
rmse_random_forest = cross_val_score(random_forest, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
cv_error = -rmse_random_forest.mean()
print('Cross Validation RMSE:', cv_error)
fp_random_forest = calculate_mean_fpr(random_forest, X_train, y_train, 5)
print('Cross Validation False Positive Rate:', fp_random_forest)

results_new["Random Forest"] = [cv_accuracy, cv_error, fp_random_forest]

# --- Gradient boosting ---
gradient_boosting = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
y_pred = gradient_boosting.predict(X_train)
print('Gradient Boosting Model')
train_accuracy = accuracy_score(y_train, y_pred)
print('Accuracy:', train_accuracy)
train_error = mean_squared_error(y_train, y_pred)
print('RMSE:', train_error)
train_fpr = calculate_fpr(y_train, y_pred)
print('False Positive Rate:', train_fpr)

# Cross-validated accuracy and error (negated scorer).
acc_gradient_boosting = cross_val_score(gradient_boosting, X_train, y_train, scoring='accuracy', cv=5)
cv_accuracy = acc_gradient_boosting.mean()
print('Cross Validation Accuracy:', cv_accuracy)
rmse_gradient_boosting = cross_val_score(gradient_boosting, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
cv_error = -rmse_gradient_boosting.mean()
print('Cross Validation RMSE:', cv_error)


