In [1]:
import os

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Location of the input CSV. When the DATA_CSV_PATH environment variable is
# set it takes precedence, so the notebook can run on another machine without
# editing this cell; otherwise the original local path is used unchanged.
file_path = os.environ.get(
    "DATA_CSV_PATH",
    r"C:\Users\ma007\Downloads\archive (6)\data.csv",
)
df = pd.read_csv(file_path)

# Remove the 'id' and 'Unnamed: 32' columns when the file contains them.
# Filtering against df.columns first means a file without these columns is
# accepted instead of making DataFrame.drop raise a KeyError.
drop_cols = [col for col in ['id', 'Unnamed: 32'] if col in df.columns]
df = df.drop(columns=drop_cols)

# Features and Target
# X holds every column except 'diagnosis'; y encodes 'M' as 1 and 'B' as 0.
X = df.drop(columns=['diagnosis'])
y = df['diagnosis'].map({'M': 1, 'B': 0})

# Series.map yields NaN for any value that is not a key of the mapping (and
# for values that were already missing). Fail here with an explicit message
# rather than letting NaN targets flow into the split and the model fits.
unmapped = y.isna()
if unmapped.any():
    bad_values = df.loc[unmapped, 'diagnosis'].unique().tolist()
    raise ValueError(
        "Column 'diagnosis' contains values other than 'M'/'B': "
        f"{bad_values!r} ({int(unmapped.sum())} row(s))"
    )

# 80/20 hold-out split; random_state pins the shuffle so reruns give the
# same partition.
# NOTE(review): the split is not stratified on y. Consider stratify=y if the
# class ratio must match between train and test (this would change results).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features for the models trained on scaled inputs
# (Logistic Regression and KNN). The scaler is fitted on the training split
# only, and that same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` on one split and score its predictions on another.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place.
    X_fit, y_fit
        Features and labels used for fitting.
    X_eval, y_eval
        Features to predict on and the true labels to score against.

    Returns
    -------
    tuple
        ``(predictions, accuracy, mae)`` where ``accuracy`` comes from
        ``accuracy_score`` and ``mae`` from ``mean_absolute_error``, both
        computed on ``y_eval`` versus the predictions.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    accuracy = accuracy_score(y_eval, predictions)
    mae = mean_absolute_error(y_eval, predictions)
    return predictions, accuracy, mae


# --- Baseline: Logistic Regression ---
log_model = LogisticRegression(max_iter=5000)
y_pred_log, baseline_acc, baseline_mae = _fit_and_score(
    log_model, X_train_scaled, y_train, X_test_scaled, y_test
)

# --- Alternative Model 1: KNN ---
knn_model = KNeighborsClassifier(n_neighbors=5)
y_pred_knn, knn_acc, knn_mae = _fit_and_score(
    knn_model, X_train_scaled, y_train, X_test_scaled, y_test
)

# --- Alternative Model 2: Decision Tree ---
dt_model = DecisionTreeClassifier(random_state=42)
# no scaling needed for tree, so it is fitted on the unscaled features
y_pred_dt, dt_acc, dt_mae = _fit_and_score(
    dt_model, X_train, y_train, X_test, y_test
)

# --- Model Comparison Table ---
# Labels are encoded as 0/1, so each absolute error is 1 for a misclassified
# row and 0 otherwise: the MAE column is the error rate, i.e. 1 - Accuracy.
comparison_df = pd.DataFrame({
    "Model": ["Logistic Regression (Baseline)", "KNN", "Decision Tree"],
    "Accuracy": [baseline_acc, knn_acc, dt_acc],
    "MAE": [baseline_mae, knn_mae, dt_mae]
})

print("\nModel Comparison Table:")
print(comparison_df)



Model Comparison Table:
                            Model  Accuracy       MAE
0  Logistic Regression (Baseline)  0.973684  0.026316
1                             KNN  0.947368  0.052632
2                   Decision Tree  0.947368  0.052632
