In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the Dataset: Get the diabetes data.
file_path = '/content/diabetes.csv'  # Ensure this file is in your working directory
diabetes_data = pd.read_csv(file_path)

# Split the Data: Use 80% for training and 20% for testing.
X = diabetes_data.drop(columns='Outcome')  # Features
y = diabetes_data['Outcome']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Select Features:
selected_features = []  # Start with no features
best_accuracy = 0

# Add one feature at a time.
while True:
    current_best_accuracy = 0
    best_feature = None

    for feature in X.columns:
        if feature not in selected_features:
            # Train a model and check if it gets better
            model = LogisticRegression(max_iter=200)
            model.fit(X_train[selected_features + [feature]], y_train)
            y_pred = model.predict(X_test[selected_features + [feature]])
            accuracy = accuracy_score(y_test, y_pred)

            # Keep features that improve the model
            if accuracy > current_best_accuracy:
                current_best_accuracy = accuracy
                best_feature = feature

    # If a new feature improves the accuracy, add it to the selected features
    if best_feature is not None and current_best_accuracy > best_accuracy:
        selected_features.append(best_feature)
        best_accuracy = current_best_accuracy
        print(f"Added feature: {best_feature}, New Accuracy: {best_accuracy:.4f}")
    else:
        break

# Count the Features: See how many features you ended up with.
best_n_features = len(selected_features)  # Count of selected features
print(f"The best number of features is {best_n_features}.")
print("Selected features:", selected_features)


Added feature: Glucose, New Accuracy: 0.7532
Added feature: Pregnancies, New Accuracy: 0.7727
Added feature: DiabetesPedigreeFunction, New Accuracy: 0.7857
Added feature: SkinThickness, New Accuracy: 0.7987
The best number of features is 4.
Selected features: ['Glucose', 'Pregnancies', 'DiabetesPedigreeFunction', 'SkinThickness']
