In [1]:
from platform import python_version

print(python_version())

3.12.3


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('Heart_disease.csv')

# Shuffle and split the data
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Save the data to CSV files
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

print(f'Training set: {len(train_data)} instances')
print(f'Validation set: {len(val_data)} instances')
print(f'Testing set: {len(test_data)} instances')

Training set: 177576 instances
Validation set: 38052 instances
Testing set: 38052 instances


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

In [4]:
# Load data
data = pd.read_csv('Heart_disease.csv')
X = data.drop('HeartDiseaseorAttack', axis=1)
y = data['HeartDiseaseorAttack']

In [5]:
# Split data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

In [6]:
# Function to evaluate model
def evaluate_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    sensitivity = recall_score(y_val, y_pred)
    specificity = recall_score(y_val, y_pred, pos_label=0)
    return accuracy, sensitivity, specificity

In [7]:
# Original Data
rf = RandomForestClassifier(random_state=42)
dt = DecisionTreeClassifier(random_state=42)
results = {}

In [8]:
results['Original_RF'] = evaluate_model(rf, X_train, y_train, X_val, y_val)
results['Original_DT'] = evaluate_model(dt, X_train, y_train, X_val, y_val)



In [9]:
sampler = RandomOverSampler(random_state=11)

In [10]:
#First 
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

In [11]:
df = pd.read_csv('Heart_disease.csv')

In [12]:
# Split data into features (X) and target (y)
X = df.drop('HeartDiseaseorAttack', axis=1)
y = df['HeartDiseaseorAttack']

In [13]:
# Split data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [14]:
# Display the shape of each set
print(f"Training set: {X_train.shape[0]} instances")
print(f"Validation set: {X_val.shape[0]} instances")
print(f"Testing set: {X_test.shape[0]} instances")

Training set: 177576 instances
Validation set: 38052 instances
Testing set: 38052 instances


In [15]:
# Initialize models
rf_model = RandomForestClassifier(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)

In [16]:
# Train Random Forest model
rf_model.fit(X_train, y_train)


In [17]:
# Train Decision Tree model
dt_model.fit(X_train, y_train)

In [18]:
# Predictions on validation set
rf_val_preds = rf_model.predict(X_val)
dt_val_preds = dt_model.predict(X_val)

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Evaluate model performance
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    # Compute confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    return accuracy, precision, recall, f1, tp, tn, fp, fn

# Example usage
# y_true = [actual labels]
# y_pred = [predicted labels]
# accuracy, precision, recall, f1, tp, tn, fp, fn = evaluate_model(y_true, y_pred)


In [20]:
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    return accuracy, precision, recall, f1


In [21]:
# Calculate metrics for Decision Tree
dt_accuracy, dt_precision, dt_recall, dt_f1 = evaluate_model(y_val, dt_val_preds)


In [22]:
# Evaluate Random Forest model
rf_accuracy, rf_precision, rf_recall, rf_f1 = evaluate_model(y_val, rf_val_preds)

# Evaluate Decision Tree model
dt_accuracy, dt_precision, dt_recall, dt_f1 = evaluate_model(y_val, dt_val_preds)


In [23]:
results = {
    'Model': ['Random Forest', 'Decision Tree'],
    'Accuracy': [rf_accuracy, dt_accuracy],
    'Precision': [rf_precision, dt_precision],
    'Recall': [rf_recall, dt_recall],
    'F1 Score': [rf_f1, dt_f1]
}
results_df = pd.DataFrame(results)
print(results_df)

           Model  Accuracy  Precision    Recall  F1 Score
0  Random Forest  0.903553    0.43983  0.116109  0.183719
1  Decision Tree  0.851887    0.24994  0.292100  0.269380
