In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

data_path = r"C:\Users\justino\Desktop\Boot Camp\First Week\.venv\data_refined_BC.csv"
df = pd.read_csv(data_path)

# Encode the target column ('diagnosis'): M -> malignant, B -> benign
le = LabelEncoder()
df['diagnosis'] = le.fit_transform(df['diagnosis'])

# 2. Feature Selection using Correlation
target = 'diagnosis'
correlations = {}
for feature in df.columns:
    if feature != target:
        correlations[feature] = df[feature].corr(df[target])

# Convert to DataFrame for easier handling
corr_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['Correlation'])
# Define a threshold for significance (adjust as necessary)
threshold = 0.1
important_features = corr_df[abs(corr_df['Correlation']) > threshold].index.tolist()
print("Important features based on correlation:", important_features)

# Prepare datasets
X = df.drop(columns=[target])
y = df[target]

# Full feature set
X_full = X.copy()

# Reduced feature set based on correlation threshold
X_reduced = X[important_features]

# Standardize features (especially important for KNN and SVC)
scaler_full = StandardScaler()
X_full_scaled = scaler_full.fit_transform(X_full)

scaler_reduced = StandardScaler()
X_reduced_scaled = scaler_reduced.fit_transform(X_reduced)

# Splitting the Data
# split off 80% for training and 20% for temporary testing/validation
X_train_full, X_temp_full, y_train, y_temp = train_test_split(
    X_full_scaled, y, test_size=0.2, random_state=42, stratify=y)
X_train_reduced, X_temp_reduced, _, _ = train_test_split(
    X_reduced_scaled, y, test_size=0.2, random_state=42, stratify=y)

# split the temporary set equally into 10% validation and 10% test sets
X_val_full, X_test_full, y_val, y_test = train_test_split(
    X_temp_full, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
X_val_reduced, X_test_reduced, _, _ = train_test_split(
    X_temp_reduced, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Utility function to train and evaluate a model
def evaluate_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    cm = confusion_matrix(y_val, y_pred)
    return acc, cm

# Training Classifiers

# --- KNN Classifier with Cross-Validation to choose optimal k ---
knn_params = {'n_neighbors': list(range(1, 31))}

# Full feature set KNN
knn_full_cv = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5)
knn_full_cv.fit(X_train_full, y_train)
best_k_full = knn_full_cv.best_params_['n_neighbors']
print("Optimal k for full features:", best_k_full)
knn_model_full = KNeighborsClassifier(n_neighbors=best_k_full)
acc_knn_full, cm_knn_full = evaluate_model(knn_model_full, X_train_full, y_train, X_val_full, y_val)

# Reduced feature set KNN
knn_reduced_cv = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5)
knn_reduced_cv.fit(X_train_reduced, y_train)
best_k_reduced = knn_reduced_cv.best_params_['n_neighbors']
print("Optimal k for reduced features:", best_k_reduced)
knn_model_reduced = KNeighborsClassifier(n_neighbors=best_k_reduced)
acc_knn_reduced, cm_knn_reduced = evaluate_model(knn_model_reduced, X_train_reduced, y_train, X_val_reduced, y_val)

# --- Random Forest Classifier ---
rf_full = RandomForestClassifier(random_state=42)
acc_rf_full, cm_rf_full = evaluate_model(rf_full, X_train_full, y_train, X_val_full, y_val)

rf_reduced = RandomForestClassifier(random_state=42)
acc_rf_reduced, cm_rf_reduced = evaluate_model(rf_reduced, X_train_reduced, y_train, X_val_reduced, y_val)

# --- Support Vector Classifier (SVC) ---
svc_full = SVC(random_state=42)
acc_svc_full, cm_svc_full = evaluate_model(svc_full, X_train_full, y_train, X_val_full, y_val)

svc_reduced = SVC(random_state=42)
acc_svc_reduced, cm_svc_reduced = evaluate_model(svc_reduced, X_train_reduced, y_train, X_val_reduced, y_val)

# Print accuracy results for full features
print("\nAccuracy with Full Features:")
print("KNN:", acc_knn_full)
print("Random Forest:", acc_rf_full)
print("SVC:", acc_svc_full)

# Print accuracy results for reduced features
print("\nAccuracy with Reduced Features (Correlation-Based):")
print("KNN:", acc_knn_reduced)
print("Random Forest:", acc_rf_reduced)
print("SVC:", acc_svc_reduced)



# Challenge: Alternative Feature Reduction using RFE
# Using Logistic Regression as the estimator for RFE
estimator = LogisticRegression(max_iter=1000, random_state=42)
# For example, select the top 5 features
selector = RFE(estimator, n_features_to_select=5, step=1)
selector.fit(X_full_scaled, y)
selected_features_rfe = X_full.columns[selector.support_].tolist()
print("\nFeatures selected by RFE:", selected_features_rfe)

# Build new dataset based on RFE-selected features
X_rfe = X[selected_features_rfe]
scaler_rfe = StandardScaler()
X_rfe_scaled = scaler_rfe.fit_transform(X_rfe)

# Split the RFE-based data (using the same proportions)
X_train_rfe, X_temp_rfe, y_train, y_temp = train_test_split(
    X_rfe_scaled, y, test_size=0.2, random_state=42, stratify=y)
X_val_rfe, X_test_rfe, y_val, y_test = train_test_split(
    X_temp_rfe, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Train classifiers on RFE-based features

# KNN with cross-validation
knn_rfe_cv = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5)
knn_rfe_cv.fit(X_train_rfe, y_train)
best_k_rfe = knn_rfe_cv.best_params_['n_neighbors']
print("Optimal k for RFE features:", best_k_rfe)
knn_model_rfe = KNeighborsClassifier(n_neighbors=best_k_rfe)
acc_knn_rfe, cm_knn_rfe = evaluate_model(knn_model_rfe, X_train_rfe, y_train, X_val_rfe, y_val)

# Random Forest and SVC on RFE features
rf_rfe = RandomForestClassifier(random_state=42)
acc_rf_rfe, cm_rf_rfe = evaluate_model(rf_rfe, X_train_rfe, y_train, X_val_rfe, y_val)

svc_rfe = SVC(random_state=42)
acc_svc_rfe, cm_svc_rfe = evaluate_model(svc_rfe, X_train_rfe, y_train, X_val_rfe, y_val)

print("\nAccuracy with RFE-Based Features:")
print("KNN:", acc_knn_rfe)
print("Random Forest:", acc_rf_rfe)
print("SVC:", acc_svc_rfe)


Important features based on correlation: ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'radius_se', 'perimeter_se', 'area_se', 'compactness_se', 'concavity_se', 'concave points_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']
Optimal k for full features: 3
Optimal k for reduced features: 3

Accuracy with Full Features:
KNN: 0.8947368421052632
Random Forest: 0.9824561403508771
SVC: 0.9649122807017544

Accuracy with Reduced Features (Correlation-Based):
KNN: 0.9298245614035088
Random Forest: 0.9649122807017544
SVC: 0.9473684210526315

Features selected by RFE: ['radius_se', 'radius_worst', 'texture_worst', 'area_worst', 'concave points_worst']
Optimal k for RFE features: 3

Accuracy with RFE-Based Features:
KNN: 0.9649122807017