# Classification with Sklearn RandomForestClassifier

In [43]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [44]:
# Load the Heart Disease dataset from the local CSV copy.
# Source: UCI Machine Learning Repository,
#   https://archive.ics.uci.edu/ml/datasets/Heart+Disease
# The columns hold information about each patient; 'target' is the label column,
# indicating whether the patient has heart disease or not.
heart_disease = pd.read_csv(filepath_or_buffer='data/classification/heart-disease.csv')
# Preview the first five rows as a quick sanity check on the load.
heart_disease.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [45]:
# Separate the label from the predictors:
#   y -> the 'target' column
#   X -> every remaining column of the frame (drop returns a new frame,
#        so heart_disease itself is left untouched)
y = heart_disease['target']
X = heart_disease.drop(columns='target')

In [46]:
# Hold out 20% of the rows for evaluation and train on the other 80%;
# the fixed random_state keeps the shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest (seeded for repeatable results) and fit it on the
# training portion. fit() returns the estimator itself, so construction and
# training are chained into a single assignment.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Mean accuracy on the held-out test set; as the last expression it is the cell output.
clf.score(X_test, y_test)

0.8360655737704918

In [56]:
# Ask the fitted classifier for a predicted class label
# for every row of the held-out test features.
y_pred = clf.predict(X_test)

In [57]:
# Tabulate the true test labels next to the model's predictions so that
# agreements and mismatches can be compared row by row.
comparison_columns = {'actual': y_test, 'predicted': y_pred}
pred_results = pd.DataFrame(comparison_columns)
# Display only the first 20 rows rather than the whole table.
pred_results.head(20)

Unnamed: 0,actual,predicted
179,0,0
228,0,1
111,1,1
246,0,0
60,1,1
9,1,1
119,1,1
223,0,0
268,0,0
33,1,0
