In [None]:
# Build a random forest classifier to predict the risk of heart disease based on a dataset of patient
# information. The dataset contains 303 instances with 14 features, including age, sex, chest pain type,
# resting blood pressure, serum cholesterol, and maximum heart rate achieved.
# Dataset link: https://drive.google.com/file/d/1bGoIE4Z2kG5nyh-fGZAJ7LH0ki3UfmSJ/view?usp=sharing

# Q1. Preprocess the dataset by handling missing values, encoding categorical variables, and scaling the
# numerical features if necessary.

# The code below uses the scikit-learn library to build a random forest classifier that predicts the risk of heart disease from this dataset:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report

# Read the patient records from the local CSV and discard any incomplete rows
df = pd.read_csv('heart.csv').dropna()

# Replace each categorical column with integer codes; every column gets its own freshly fitted encoder
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
for column_name in categorical_cols:
    df[column_name] = LabelEncoder().fit_transform(df[column_name])

# Separate the predictors from the label, then hold out 20% of the rows for testing
X, y = df.drop(columns='target'), df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise every feature column; the scaler is fitted on the training rows only
# and then applied unchanged to the test rows
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Fit a 100-tree random forest on the standardised training data
rfc = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and show the per-class report
y_pred = rfc.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Q2. Split the dataset into a training set (70%) and a test set (30%).

# Python code that splits the dataset into a training set (70%) and a test set (30%) using scikit-learn's 'train_test_split' function:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Read the patient records from the local CSV, keeping only fully populated rows
df = pd.read_csv('heart.csv').dropna()

# Integer-encode the categorical columns, one independent LabelEncoder per column
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
for column_name in categorical_cols:
    encoder = LabelEncoder()
    df[column_name] = encoder.fit_transform(df[column_name])

# Predictors are every column except the label
X, y = df.drop(columns='target'), df['target']

# 70% of the rows go to training, 30% to testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion
rfc = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Keep the test-set predictions available for later inspection
y_pred = rfc.predict(X_test)

# Mean accuracy on the held-out rows
score = rfc.score(X_test, y_test)
print("Accuracy:", score)

In [None]:
# Q3. Train a random forest classifier on the training set using 100 trees and a maximum depth of 10 for each
# tree. Use the default values for other hyperparameters.

# Python code that trains a random forest classifier on the training set with 100 trees and a maximum depth of 10 for each tree:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Read the patient records from the local CSV, keeping only fully populated rows
df = pd.read_csv('heart.csv').dropna()

# Integer-encode the categorical columns, one independent LabelEncoder per column
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
for column_name in categorical_cols:
    df[column_name] = LabelEncoder().fit_transform(df[column_name])

# Predictors are every column except the label; hold out 30% of the rows for testing
X = df.drop(columns='target')
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Q3 configuration: 100 trees, each limited to depth 10; all other hyperparameters are
# left at their defaults (the seed is fixed only for repeatability)
forest_settings = {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
rfc = RandomForestClassifier(**forest_settings)
rfc.fit(X_train, y_train)

# Keep the test-set predictions available for later inspection
y_pred = rfc.predict(X_test)

# Mean accuracy on the held-out rows
score = rfc.score(X_test, y_test)
print("Accuracy:", score)

In [None]:
# Q4. Evaluate the performance of the model on the test set using accuracy, precision, recall, and F1 score.

# Python code that trains a random forest classifier on the training set with 100 trees and a maximum depth of 10 for each tree, 
# and then evaluates its performance on the test set using accuracy, precision, recall, and F1 score

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the patient records from the local CSV, keeping only fully populated rows
df = pd.read_csv('heart.csv').dropna()

# Integer-encode the categorical columns, one independent LabelEncoder per column
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
for column_name in categorical_cols:
    df[column_name] = LabelEncoder().fit_transform(df[column_name])

# Predictors are every column except the label; hold out 30% of the rows for testing
X, y = df.drop(columns='target'), df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 100 trees, each limited to depth 10
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows; all four metrics below are computed from these
y_pred = rfc.predict(X_test)

# The metric values are kept in named variables because the following cell reads them
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report the metrics in a fixed order
metric_report = (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

In [None]:
# Recompute all four test-set metrics from y_test / y_pred, which are produced by the Q4 cell above.
# The accuracy assignment has to be live: with it commented out, the print below reports whatever
# value `accuracy` was last given elsewhere in the session instead of one derived from the
# current predictions, while the other three metrics are recomputed here.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
# Q5. Use the feature importance scores to identify the top 5 most important features in predicting heart
# disease risk. Visualise the feature importances using a bar chart.

# The code below trains a random forest classifier on the dataset, computes the feature importances, and visualizes them
# using a bar chart to identify the top 5 most important features:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Read the patient records from the local CSV, keeping only fully populated rows
df = pd.read_csv('heart.csv').dropna()

# Integer-encode the categorical columns, one independent LabelEncoder per column
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
for column_name in categorical_cols:
    df[column_name] = LabelEncoder().fit_transform(df[column_name])

# Fit the forest on the complete table (no hold-out here: only the importances are of interest)
X, y = df.drop(columns='target'), df['target']
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42).fit(X, y)

# Importance score per column, plus the column positions ordered from most to least important
importances = rfc.feature_importances_
feature_names = X.columns
indices = importances.argsort()[::-1]

# List the five highest-ranked features with their scores
print("Top 5 most important features:")
ranked_names = feature_names[indices]
ranked_scores = importances[indices]
for rank in range(5):
    print(f"{rank+1}. {ranked_names[rank]}: {ranked_scores[rank]}")

# Bar chart of every feature's importance, tallest bar first
bar_positions = range(X.shape[1])
fig, ax = plt.subplots()
ax.set_title("Feature Importances")
ax.bar(bar_positions, ranked_scores)
ax.set_xticks(bar_positions)
ax.set_xticklabels(ranked_names, rotation=90)
fig.tight_layout()
plt.show()

In [None]:
# Compute the feature importances of the forest fitted in the previous cell and visualize them using a bar chart:

# Uses `rfc`, `X` and `plt` from the cell above.
# Importance score per column, plus the column positions ordered from most to least important
importances = rfc.feature_importances_
feature_names = X.columns
indices = importances.argsort()[::-1]

# List the five highest-ranked features with their scores
print("Top 5 most important features:")
for rank in range(1, 6):
    position = indices[rank - 1]
    print(f"{rank}. {feature_names[position]}: {importances[position]}")

# Bar chart of every feature's importance, tallest bar first
n_features = X.shape[1]
fig, ax = plt.subplots()
ax.set_title("Feature Importances")
ax.bar(range(n_features), importances[indices])
ax.set_xticks(range(n_features))
ax.set_xticklabels(feature_names[indices], rotation=90)
fig.tight_layout()
plt.show()

In [None]:
# Q6. Tune the hyperparameters of the random forest classifier using grid search or random search. Try
# different values of the number of trees, maximum depth, minimum samples split, and minimum samples
# leaf. Use 5-fold cross-validation to evaluate the performance of each set of hyperparameters.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# Load the data: turn the Google Drive share link into a direct-download URL
url = "https://drive.google.com/file/d/1bGoIE4Z2kG5nyh-fGZAJ7LH0ki3UfmSJ/view?usp=sharing"
file_id = url.split("/")[-2]
dwn_url = 'https://drive.google.com/uc?id=' + file_id
df = pd.read_csv(dwn_url)

# Preprocess the data with the same steps as the earlier questions: drop incomplete rows and
# integer-encode the categorical columns. X and y are built from the frame loaded in THIS cell,
# so the cell no longer depends on variables left behind by a previous cell.
df = df.dropna()
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
X = df.drop('target', axis=1)
# Category codes are numbered in sorted order of each column's distinct values
X = X.assign(**{col: X[col].astype('category').cat.codes for col in categorical_cols})
y = df['target']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define the random forest classifier; the seed is fixed so that repeated searches
# select the same hyperparameters and report the same scores
rfc = RandomForestClassifier(random_state=42)

# Define the grid search object (5-fold cross-validation, all CPU cores)
grid_search = GridSearchCV(rfc, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the grid search object to the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding mean cross-validated accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

In [None]:
# Q7. Report the best set of hyperparameters found by the search and the corresponding performance
# metrics. Compare the performance of the tuned model with the default model.

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# train_test_split is required for the split below but is not in this cell's import list
from sklearn.model_selection import train_test_split

# Load the dataset
url = "https://drive.google.com/uc?id=1bGoIE4Z2kG5nyh-fGZAJ7LH0ki3UfmSJ"
data = pd.read_csv(url)

# Preprocess the data with the same steps as the earlier questions: drop incomplete rows and
# integer-encode the categorical columns
data = data.dropna()
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
X = data.drop('target', axis=1)
# Category codes are numbered in sorted order of each column's distinct values
X = X.assign(**{col: X[col].astype('category').cat.codes for col in categorical_cols})
y = data['target']

# Split the dataset into training and test sets. Building the split from `data` here means
# both models below are trained and scored on this cell's data, not on variables left
# behind by an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the parameter grid for the grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a random forest classifier
rf = RandomForestClassifier(random_state=42)

# Perform a grid search with 5-fold cross-validation
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best set of hyperparameters and corresponding performance metrics.
# grid_search.predict uses the best estimator found by the search.
print("Best parameters: ", grid_search.best_params_)
y_pred = grid_search.predict(X_test)
print("Accuracy score: ", accuracy_score(y_test, y_pred))
print("Precision score: ", precision_score(y_test, y_pred))
print("Recall score: ", recall_score(y_test, y_pred))
print("F1 score: ", f1_score(y_test, y_pred))

# Compare with the default model, reporting the same four metrics on the same test set
# so the tuned and default models can be compared like for like
rf_default = RandomForestClassifier(random_state=42)
rf_default.fit(X_train, y_train)
y_pred_default = rf_default.predict(X_test)
print("Default model accuracy score: ", accuracy_score(y_test, y_pred_default))
print("Default model precision score: ", precision_score(y_test, y_pred_default))
print("Default model recall score: ", recall_score(y_test, y_pred_default))
print("Default model F1 score: ", f1_score(y_test, y_pred_default))

# Q8. Interpret the model by analysing the decision boundaries of the random forest classifier. Plot the
# decision boundaries on a scatter plot of two of the most important features. Discuss the insights and
# limitations of the model for predicting heart disease risk.

+ Interpreting the decision boundaries of a random forest classifier can be challenging since it involves analysing the combined effect of multiple decision trees. One way to visualise the decision boundaries is to use a scatter plot of two of the most important features and overlay the decision boundaries of the model.

+ To do this, we can first train a random forest classifier with the best set of hyperparameters found in the previous step. We can then extract the two most important features from the feature importance scores and create a scatter plot of these two features for the test set. Because a fitted forest only accepts inputs with the same number of features it was trained on, the boundary is drawn from a forest fitted on just these two features: we use its predict_proba method to estimate the class probability at every point of a grid covering the plotted range, and then draw the 0.5-probability contour over the scatter plot as the decision boundary.


In [None]:
# Code snippet implementing the approach described above

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

# pandas is used below (read_csv / get_dummies) but is not in this cell's import list
import pandas as pd

# load the dataset: turn the Google Drive share link into a direct-download URL
url = "https://drive.google.com/file/d/1bGoIE4Z2kG5nyh-fGZAJ7LH0ki3UfmSJ/view?usp=sharing"
file_id = url.split('/')[-2]
dwn_url = 'https://drive.google.com/uc?id=' + file_id
df = pd.read_csv(dwn_url)

# split the data into features and target
X = df.drop('target', axis=1)
y = df['target']

# preprocess the data: one-hot encode the multi-level categoricals, fill gaps with the
# column mean, then standardise every column
X = pd.get_dummies(X, columns=['cp', 'thal', 'slope'])
X = X.fillna(X.mean())
# Column names AFTER one-hot encoding. The importance indices below refer to these columns,
# not to df.columns, whose order and length differ once get_dummies has run.
feature_names = X.columns
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# train a random forest classifier with the best hyperparameters found by grid search
rfc = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=10, min_samples_leaf=4, random_state=42)
rfc.fit(X_train, y_train)

# select the two most important features
importances = rfc.feature_importances_
indices = np.argsort(importances)[::-1]
feat1 = indices[0] # index of the most important feature
feat2 = indices[1] # index of the second most important feature

# A forest fitted on every column cannot be evaluated on a 2-column grid (scikit-learn raises
# ValueError on a feature-count mismatch). The boundary is therefore drawn from a second forest
# with the same hyperparameters, fitted on the two selected features only.
top_two = [feat1, feat2]
rfc_2d = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=10, min_samples_leaf=4, random_state=42)
rfc_2d.fit(X_train[:, top_two], y_train)

# create a scatter plot of the two features
plt.figure()
plt.scatter(X_test[:, feat1], X_test[:, feat2], c=y_test, cmap='bwr')
plt.xlabel(feature_names[feat1])
plt.ylabel(feature_names[feat2])
plt.title("Scatter Plot of Two Most Important Features")

# build the evaluation grid from the observed range of each feature (plus a small margin)
# so the boundary covers every plotted point whatever the features' scaled range is
margin = 0.5
x_min, x_max = X_scaled[:, feat1].min() - margin, X_scaled[:, feat1].max() + margin
y_min, y_max = X_scaled[:, feat2].min() - margin, X_scaled[:, feat2].max() + margin
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))

# probability of the second class (column 1 of predict_proba) at every grid point
Z = rfc_2d.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)

# overlay the decision boundary (the 0.5-probability contour) on the scatter plot
plt.contour(xx, yy, Z, levels=[0.5], colors='k')
plt.show()