In [161]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSE_303/Datasets/diabetes.csv')
y = df['Outcome']
df = df.drop('Outcome', axis=1)

In [162]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3, random_state=42)

model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy Before: {accuracy}')

Accuracy Before: 0.7012987012987013


In [163]:
# 1. Missing Values Ratio (Threshold 30%)

missing_ratio = df.eq(0).sum() / len(df) * 100
print('Missing ratio per feature:\n', missing_ratio)

missing_ratio
X = df.loc[:, missing_ratio <= 30]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'\nAccuracy After removing features with >30% missing values: {accuracy}')

Missing ratio per feature:
 Pregnancies                 14.453125
Glucose                      0.651042
BloodPressure                4.557292
SkinThickness               29.557292
Insulin                     48.697917
BMI                          1.432292
DiabetesPedigreeFunction     0.000000
Age                          0.000000
dtype: float64

Accuracy After removing features with >30% missing values: 0.70995670995671


In [164]:
# 3. High Correlation Filter (Correlation > 0.8)

import numpy as np

# Calculate the correlation matrix
corr_matrix = df.corr().abs()

# Identify pairs of highly correlated features (correlation > 0.8)
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.8)]

print(f'Highly correlated features to drop: {to_drop}')

# Drop highly correlated features
df_reduced = df.drop(columns=to_drop)

# Split data and train model using the reduced dataset
X_train, X_test, y_train, y_test = train_test_split(df_reduced, y, test_size=0.2, random_state=42)

# Train the model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate the accuracy on the reduced feature set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy after removing highly correlated features: {accuracy}')


Highly correlated features to drop: []
Accuracy after removing highly correlated features: 0.7467532467532467


In [165]:
# 5. Low Variance Filter

from sklearn.feature_selection import VarianceThreshold

# Apply low variance filter
selector = VarianceThreshold(threshold=0.01)
X_reduced = selector.fit_transform(df)
y = y

# Train model
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy after applying low variance filter: {accuracy}')


Accuracy after applying low variance filter: 0.7467532467532467


In [166]:
# 7. Forward Feature Selection

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

# Forward feature selection using logistic regression
model = LogisticRegression(max_iter=1000)
sfs = SequentialFeatureSelector(model, n_features_to_select="auto", direction='forward')


X = df
y = y

# Apply forward selection
sfs.fit(X, y)
X_selected = sfs.transform(X)

# Train model on selected features
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with forward feature selection: {accuracy}')


Accuracy with forward feature selection: 0.7532467532467533


In [167]:
# 9. Backward Feature Elimination

from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

# Recursive feature elimination using decision tree classifier
model = DecisionTreeClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=5)  # Select top 5 features

X = df
y = y

# Fit RFE
rfe.fit(X, y)
X_reduced = rfe.transform(X)

# Train model on selected features
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy after backward feature elimination: {accuracy}')


Accuracy after backward feature elimination: 0.7077922077922078


In [168]:
# 11. Random Forest Feature Importance

from sklearn.ensemble import RandomForestClassifier

# Train random forest classifier
model = RandomForestClassifier(random_state=42)
X = df
y = y

model.fit(X, y)

# Feature importance
importances = model.feature_importances_
indices = np.argsort(importances)[-5:]  # Keep top 5 features

# Select top 5 important features
X_reduced = X.iloc[:, indices]

# Train model on top 5 features
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

feature_names = df.columns
top_features = feature_names[indices][:5]
print("Top 5 features:",top_features)
X_train_reduced = X_train[top_features]
X_test_reduced = X_test[top_features]

rf_reduced = RandomForestClassifier(n_estimators=100, random_state=42)
rf_reduced.fit(X_train_reduced, y_train)

y_pred_reduced = rf_reduced.predict(X_test_reduced)
accuracy_reduced = accuracy_score(y_test, y_pred_reduced)

print(f"Accuracy with all features: {accuracy_score(y_test, model.predict(X_test))}")
print(f"Accuracy with top 5 features: {accuracy_reduced}")

Top 5 features: Index(['BloodPressure', 'DiabetesPedigreeFunction', 'Age', 'BMI', 'Glucose'], dtype='object')
Accuracy with all features: 0.7532467532467533
Accuracy with top 5 features: 0.7532467532467533
