In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Configuration: everything a reader might want to tune lives here.
DATA_PATH = '/content/diabetes.csv'  # Ensure the file path is correct
TARGET_COLUMN = 'Outcome'  # Assuming 'Outcome' is the target variable
TEST_SIZE = 0.2
SEED = 42


def fit_and_score(features, target, test_size=TEST_SIZE, seed=SEED):
    """Split the data, fit a random forest, and score it on the held-out part.

    Parameters
    ----------
    features : DataFrame of predictor columns.
    target : Series of labels aligned with ``features``.
    test_size : fraction of rows held out for evaluation.
    seed : random state used for both the split and the classifier.

    Returns
    -------
    tuple
        ``(model, (X_train, X_test, y_train, y_test), y_pred, accuracy)``
        where ``accuracy`` is ``accuracy_score`` on the held-out rows.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, target, test_size=test_size, random_state=seed
    )
    clf = RandomForestClassifier(random_state=seed)
    clf.fit(X_tr, y_tr)
    predictions = clf.predict(X_te)
    return clf, (X_tr, X_te, y_tr, y_te), predictions, accuracy_score(y_te, predictions)


# Load the diabetes dataset
data = pd.read_csv(DATA_PATH)

# Step 1: Calculate the variance of each column (target included, for inspection)
variances = data.var()
print("Feature Variances:")
print(variances)  # Print variance for analysis

# Step 2: Set a threshold for low variance
# Raw variance depends on each column's units, so tune this cutoff against
# the variances printed above.
threshold = 0.1
# Only predictor columns are candidates for removal. The target is excluded so
# that raising the threshold can never drop the label column, which would make
# the feature/target separation below fail with a KeyError.
feature_variances = variances.drop(labels=TARGET_COLUMN)
low_variance_features = feature_variances[feature_variances < threshold].index.tolist()

# Step 3: Remove low variance features
data_filtered = data.drop(columns=low_variance_features)

# Separate features and target
X = data_filtered.drop(columns=TARGET_COLUMN)
y = data_filtered[TARGET_COLUMN]
X_original = data.drop(columns=TARGET_COLUMN)

# Steps 4-5: Split, train and evaluate on the filtered features
model, (X_train, X_test, y_train, y_test), y_pred, accuracy_after = fit_and_score(X, y)

# Baseline: same procedure on the unfiltered features. Dropping columns does
# not change the row count, so the shared seed yields the same row partition.
(
    model_original,
    (X_train_original, X_test_original, y_train_original, y_test_original),
    y_pred_original,
    accuracy_before,
) = fit_and_score(X_original, y)

# Output results in the desired format
print(f'Removed Features: {low_variance_features}')
print(f'Accuracy Before Filtering: {accuracy_before * 100:.2f}% (A_before = {accuracy_before:.2f})')
print(f'Accuracy After Filtering: {accuracy_after * 100:.2f}% (A_after = {accuracy_after:.2f})')
print(f'Features Used in Final Model: {X.shape[1]} (down from {X_original.shape[1]})')
print(f'Final Features Used: {X.columns.tolist()}')  # Display the names of features used


Feature Variances:
Pregnancies                    11.354056
Glucose                      1022.248314
BloodPressure                 374.647271
SkinThickness                 254.473245
Insulin                     13281.180078
BMI                            62.159984
DiabetesPedigreeFunction        0.109779
Age                           138.303046
Outcome                         0.227483
dtype: float64
Removed Features: []
Accuracy Before Filtering: 72.08% (A_before = 0.72)
Accuracy After Filtering: 72.08% (A_after = 0.72)
Features Used in Final Model: 8 (down from 8)
Final Features Used: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
