In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Load the diabetes dataset
diabetes_data = pd.read_csv('/content/diabetes.csv')  # Replace with your file path

# Split the data into features and target
X = diabetes_data.drop(columns=['Outcome'])  # Assuming 'Outcome' is the target variable
y = diabetes_data['Outcome']

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a decision tree classifier
model = DecisionTreeClassifier(random_state=42)

# List to store the performance metrics
performance_metrics = []

# Function to perform backward feature elimination
def backward_feature_elimination(X_train, X_test, y_train, y_test):
    # Start with all features
    current_features = X_train.columns.tolist()

    while len(current_features) > 0:
        # Train the model with the current set of features
        model.fit(X_train[current_features], y_train)

        # Make predictions and calculate accuracy
        y_pred = model.predict(X_test[current_features])
        accuracy = accuracy_score(y_test, y_pred)
        performance_metrics.append((len(current_features), accuracy))

        # Get feature importances
        importances = model.feature_importances_

        # Create a DataFrame for importances
        importance_df = pd.DataFrame({'Feature': current_features, 'Importance': importances})

        # Find the least important feature
        least_important = importance_df.nsmallest(1, 'Importance')

        # Remove the least important feature
        current_features.remove(least_important['Feature'].values[0])

        # If performance decreases, break the loop
        if len(performance_metrics) > 1 and accuracy < performance_metrics[-2][1]:
            break

    return current_features

# Execute backward feature elimination
final_features = backward_feature_elimination(X_train, X_test, y_train, y_test)

# Print the final set of features
print("The final features are:", final_features)

# Optional: Print the performance metrics throughout the process
print("Performance metrics (number of features, accuracy):")
print(performance_metrics)


The final features are: ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Performance metrics (number of features, accuracy):
[(8, 0.7467532467532467), (7, 0.7402597402597403)]
