In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --- Data loading -------------------------------------------------------------
# Read the diabetes CSV into a DataFrame.
file_path = '/content/diabetes.csv'
diabetes_data = pd.read_csv(file_path)

# --- Zero-value audit ---------------------------------------------------------
# Columns in which this analysis treats a 0 as an invalid entry.
invalid_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Per-column tally of cells that are exactly zero.
zero_counts = diabetes_data[invalid_columns].eq(0).sum()

# The same tally expressed as a percentage of all rows in the dataset.
total_rows = len(diabetes_data)
missing_percentage = zero_counts / total_rows * 100

# Report the raw counts first, then the percentages (separated by a blank line).
print("Number of zero values:", zero_counts, sep="\n")
print("\nPercentage of zero values:", missing_percentage, sep="\n")

# Steps 3-5: compare random-forest test accuracy with and without the columns
# that contain a large share of zero values. Both evaluations run through the
# same helper so they use an identical split/model recipe and differ only in
# the feature set.
#
# NOTE(review): each accuracy comes from a single hold-out split, so a small
# difference between the two numbers may be within noise -- consider
# cross-validation before drawing conclusions.

TARGET_COLUMN = 'Outcome'
ZERO_PERCENTAGE_THRESHOLD = 30  # columns with a higher percentage of zeros are dropped
TEST_SIZE = 0.2
RANDOM_STATE = 42


def fit_and_score_random_forest(features, target, test_size=TEST_SIZE, random_state=RANDOM_STATE):
    """Train a random forest on a hold-out split and score it on the held-out rows.

    Parameters
    ----------
    features : pandas.DataFrame
        Predictor columns.
    target : pandas.Series
        Labels aligned row-for-row with ``features``.
    test_size : float, default ``TEST_SIZE``
        Passed to ``train_test_split`` as the held-out fraction.
    random_state : int, default ``RANDOM_STATE``
        Seed passed to both ``train_test_split`` and ``RandomForestClassifier``.

    Returns
    -------
    tuple
        ``(model, splits, predictions, accuracy)`` where ``splits`` is the
        ``(X_train, X_test, y_train, y_test)`` tuple from ``train_test_split``,
        ``predictions`` are the model's predictions for ``X_test`` and
        ``accuracy`` is ``accuracy_score(y_test, predictions)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    return model, (X_train, X_test, y_train, y_test), predictions, accuracy


# Step 3: Model accuracy before removing any columns
X_full = diabetes_data.drop(columns=[TARGET_COLUMN])
y_full = diabetes_data[TARGET_COLUMN]

model_full, splits_full, y_pred_full, accuracy_full = fit_and_score_random_forest(X_full, y_full)
# Unpacked under the original names so any later cell that reads them keeps working.
X_train_full, X_test_full, y_train_full, y_test_full = splits_full

print("\nModel Accuracy before removing columns: {:.2f}%".format(accuracy_full * 100))

# Step 4: Remove columns whose percentage of zero values exceeds the threshold.
# The comparison is strict (>), so a column exactly at the threshold is kept.
columns_to_remove = missing_percentage[missing_percentage > ZERO_PERCENTAGE_THRESHOLD].index
reduced_data = diabetes_data.drop(columns=columns_to_remove)

# Display reduced dataset columns
print("\nReduced dataset columns:")
print(reduced_data.columns)

# Step 5: Model accuracy after removing the zero-heavy columns.
# Zeros in the columns that remain are left as-is and seen by the model as values.
X_reduced = reduced_data.drop(columns=[TARGET_COLUMN])
y_reduced = reduced_data[TARGET_COLUMN]

model_reduced, splits_reduced, y_pred_reduced, accuracy_reduced = fit_and_score_random_forest(
    X_reduced, y_reduced
)
X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = splits_reduced

# Display model accuracy
print("\nModel Accuracy after removing columns: {:.2f}%".format(accuracy_reduced * 100))


Number of zero values:
Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64

Percentage of zero values:
Glucose           0.651042
BloodPressure     4.557292
SkinThickness    29.557292
Insulin          48.697917
BMI               1.432292
dtype: float64

Model Accuracy before removing columns: 72.08%

Reduced dataset columns:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'BMI',
       'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

Model Accuracy after removing columns: 74.03%
