In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------------------------------------------------------------------------
# Feature selection with a Random Forest, then re-fit on the top features.
#
# Pipeline: load CSV -> split train/test -> fit forest on all features ->
# rank features by impurity-based importance -> refit on the TOP_K most
# important features -> report MAE / MSE / R² on the held-out split.
# ---------------------------------------------------------------------------

# Tunable settings, gathered in one place so both models share the same
# seed / forest size and a reader can change them without hunting inline.
TARGET_COLUMN = 'Outcome'   # name of the column to predict (change if different)
TEST_SIZE = 0.2             # fraction of rows held out for evaluation
SEED = 42                   # shared seed for the split and both forests
N_ESTIMATORS = 100          # number of trees in each forest
TOP_K = 5                   # how many of the most important features to keep

# Load the dataset
# NOTE: absolute Colab-style path; adjust when running in another environment.
file_path = '/content/diabetes.csv'
diabetes_data = pd.read_csv(file_path)

# Fail early, with a readable message, if the assumed target column is absent
# (same exception type that DataFrame.drop would raise, but more informative).
if TARGET_COLUMN not in diabetes_data.columns:
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found in {file_path}; "
        f"available columns: {diabetes_data.columns.tolist()}"
    )

# Separate predictors from the target
X = diabetes_data.drop(columns=[TARGET_COLUMN])
y = diabetes_data[TARGET_COLUMN]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)
# Sanity check: the two partitions together cover every row exactly once.
assert len(X_train) + len(X_test) == len(X)

# Train a Random Forest model on all features
# NOTE(review): if the target is a binary 0/1 label (the column name suggests
# so -- TODO confirm), RandomForestClassifier with classification metrics
# would be the more natural choice. The regressor is kept so that the
# reported MAE / MSE / R² values remain comparable with earlier runs.
rf_model = RandomForestRegressor(n_estimators=N_ESTIMATORS, random_state=SEED)
rf_model.fit(X_train, y_train)

# Get feature importance, most important first
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Select the top features (head() simply returns fewer rows if the dataset
# has fewer than TOP_K predictors)
top_5_features = feature_importance_df['Feature'].head(TOP_K)
# Index with an explicit list of column labels rather than a Series.
top_feature_names = top_5_features.tolist()
X_train_top5 = X_train[top_feature_names]
X_test_top5 = X_test[top_feature_names]

# Train a new Random Forest model using just the top features
rf_model_top5 = RandomForestRegressor(n_estimators=N_ESTIMATORS, random_state=SEED)
rf_model_top5.fit(X_train_top5, y_train)

# Make predictions and evaluate performance on the held-out split
y_pred_top5 = rf_model_top5.predict(X_test_top5)

mae = mean_absolute_error(y_test, y_pred_top5)
mse = mean_squared_error(y_test, y_pred_top5)
r2 = r2_score(y_test, y_pred_top5)

print(f"Top 5 Features: {top_5_features.tolist()}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R² : {r2}")


Top 5 Features: ['Glucose', 'BMI', 'Age', 'DiabetesPedigreeFunction', 'BloodPressure']
Mean Absolute Error (MAE): 0.3057792207792208
Mean Squared Error (MSE): 0.1700642857142857
R² : 0.25927555555555537
