In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from scipy.stats import randint

In [4]:
df = pd.read_csv('/content/heart.csv')
df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
5,58,0,0,100,248,0,0,122,0,1.0,1,0,2,1
6,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0
7,55,1,0,160,289,0,0,145,1,0.8,1,1,3,0
8,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
9,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0


In [5]:
# Assume the last column is the target (y) and the rest are features (X)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [6]:
# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
# Step 3: Train a model without hyperparameter tuning (baseline model)
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

In [8]:
# Predict and evaluate the baseline model
y_pred_baseline = rf_clf.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
print(f"Baseline model accuracy: {baseline_accuracy}")

Baseline model accuracy: 0.9805194805194806


In [9]:
# Step 4: Perform Hyperparameter Tuning with GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [10]:
# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [11]:
# Best hyperparameters found by GridSearchCV
print("Best hyperparameters (GridSearchCV):", grid_search.best_params_)

Best hyperparameters (GridSearchCV): {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [12]:
# Evaluate the model with the best hyperparameters
best_rf_clf = grid_search.best_estimator_
y_pred_tuned = best_rf_clf.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
print(f"Tuned model accuracy (GridSearchCV): {tuned_accuracy}")

Tuned model accuracy (GridSearchCV): 0.9902597402597403


In [13]:
# Define a distribution of hyperparameters for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(50, 300),
    'max_depth': [10, 20, 30, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20)
}

In [14]:
# Use RandomizedSearchCV to find the best parameters
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                   param_distributions=param_dist,
                                   n_iter=50, cv=5, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

In [15]:
# Best hyperparameters found by RandomizedSearchCV
print("Best hyperparameters (RandomizedSearchCV):", random_search.best_params_)

Best hyperparameters (RandomizedSearchCV): {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 253}


In [16]:
# Evaluate the model with the best hyperparameters
best_rf_random = random_search.best_estimator_
y_pred_random_tuned = best_rf_random.predict(X_test)
random_tuned_accuracy = accuracy_score(y_test, y_pred_random_tuned)
print(f"Tuned model accuracy (RandomizedSearchCV): {random_tuned_accuracy}")

Tuned model accuracy (RandomizedSearchCV): 0.9805194805194806
