In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the diabetes CSV into a DataFrame
diabetes_data = pd.read_csv("/content/diabetes.csv")

# Step 1: Pairwise Pearson correlation between every pair of columns
correlation_matrix = diabetes_data.corr(method="pearson")

# Show the heading and the square correlation table, one per line
print("Correlation Matrix:", correlation_matrix, sep="\n")

# Step 2: Identify highly correlated feature pairs (|correlation| > 0.8)
# The target column is left out of the scan: a feature that tracks 'Outcome'
# closely is informative rather than redundant, so it must never end up in a
# pair that is later used to drop columns. errors='ignore' keeps this working
# if 'Outcome' is absent from the matrix.
feature_corr = correlation_matrix.drop(index='Outcome', columns='Outcome', errors='ignore')

high_correlation_pairs = []
for i, feature_i in enumerate(feature_corr.columns):
    # Only visit cells below the diagonal so each pair is reported once and
    # a feature is never compared with itself.
    for j in range(i):
        if abs(feature_corr.iloc[i, j]) > 0.8:
            feature_j = feature_corr.columns[j]
            high_correlation_pairs.append((feature_i, feature_j))

# Print the highly correlated feature pairs
print("\nHighly Correlated Feature Pairs (Correlation > 0.8):")
print(high_correlation_pairs)

# Step 3: Remove one feature from each pair of highly correlated features
# The second member of each pair is dropped. Pairs that involve the target are
# skipped so that neither 'Outcome' itself (needed by the later steps) nor a
# feature that merely correlates with it is discarded. dict.fromkeys removes
# repeated names while preserving their first-seen order.
features_to_remove = list(dict.fromkeys(
    pair[1] for pair in high_correlation_pairs if 'Outcome' not in pair
))
diabetes_data_filtered = diabetes_data.drop(columns=features_to_remove)

# Step 4: Build train/test partitions for the full and the filtered feature sets
# Both calls share the same test fraction and seed.
y = diabetes_data['Outcome']
split_options = {'test_size': 0.2, 'random_state': 42}

# Full feature set: every column except the target
X = diabetes_data.drop(columns=['Outcome'])
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Filtered feature set: the remaining columns after the correlated ones were dropped
X_filtered = diabetes_data_filtered.drop(columns=['Outcome'])
(X_train_filtered, X_test_filtered,
 y_train_filtered, y_test_filtered) = train_test_split(X_filtered, y, **split_options)

# Step 5: Baseline model -- fit on all features, then score on the held-out rows
model_before = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_before = model_before.predict(X_test)
accuracy_before = accuracy_score(y_true=y_test, y_pred=y_pred_before)

# Step 6: Same classifier settings, fitted on the filtered feature set
model_after = RandomForestClassifier(random_state=42).fit(X_train_filtered, y_train_filtered)
y_pred_after = model_after.predict(X_test_filtered)
accuracy_after = accuracy_score(y_true=y_test_filtered, y_pred=y_pred_after)

# Step 7: Report both accuracies as percentages with two decimal places
accuracy_report = (
    ("\nModel Accuracy Before Removing Correlated Features: {:.2f}%", accuracy_before),
    ("Model Accuracy After Removing Correlated Features: {:.2f}%", accuracy_after),
)
for template, accuracy in accuracy_report:
    print(template.format(accuracy * 100))


Correlation Matrix:
                          Pregnancies   Glucose  BloodPressure  SkinThickness  \
Pregnancies                  1.000000  0.129459       0.141282      -0.081672   
Glucose                      0.129459  1.000000       0.152590       0.057328   
BloodPressure                0.141282  0.152590       1.000000       0.207371   
SkinThickness               -0.081672  0.057328       0.207371       1.000000   
Insulin                     -0.073535  0.331357       0.088933       0.436783   
BMI                          0.017683  0.221071       0.281805       0.392573   
DiabetesPedigreeFunction    -0.033523  0.137337       0.041265       0.183928   
Age                          0.544341  0.263514       0.239528      -0.113970   
Outcome                      0.221898  0.466581       0.065068       0.074752   

                           Insulin       BMI  DiabetesPedigreeFunction  \
Pregnancies              -0.073535  0.017683                 -0.033523   
Glucose              