In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)
import joblib


In [2]:
# Read the WDBC file from the UCI repository; the file has no header row,
# so column names are supplied explicitly (ID, Diagnosis, then 30 features).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
columns = ["ID", "Diagnosis", *(f"feature_{i}" for i in range(1, 31))]

# Encode the label (M -> 1, B -> 0) and discard the ID column in one chain.
data_frames = (
    pd.read_csv(url, header=None, names=columns)
    .assign(Diagnosis=lambda frame: frame["Diagnosis"].map({"M": 1, "B": 0}))
    .drop(columns="ID")
)

# Separate the target from the predictors.
y = data_frames["Diagnosis"]
X = data_frames.drop(columns="Diagnosis")


In [3]:

# Split first, then scale.
# The scaler has to learn its mean/variance from the training rows only.
# Fitting it on the full dataset before splitting leaks test-set statistics
# into training and makes the held-out metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit on train, then apply the same (frozen) transformation to test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaling, kept so that any cell
# still referring to `X_scaled` continues to work.
X_scaled = scaler.transform(X)


In [4]:
# Candidate classifiers, keyed by the display name used in the results table
# and in the saved file names.
# Every estimator with internal randomness gets an explicit random_state so
# that a fresh Restart-and-Run-All reproduces the same metrics and the same
# saved models. Logistic Regression (lbfgs), KNN and Gaussian Naive Bayes take
# no seed here.
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, solver="lbfgs"),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

In [5]:
# Create the output directory for the serialized artifacts.
# os.makedirs creates missing parent directories, so a single call covers
# both "project-folder" and "project-folder/model".
os.makedirs("project-folder/model", exist_ok=True)

# The models below are trained on standardized features, so the fitted scaler
# must be saved with them; without it the pickled models cannot be applied to
# raw feature values later.
joblib.dump(scaler, "project-folder/model/scaler.pkl")

# One dict of metrics per model; turned into a DataFrame once at the end.
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # AUC needs scores for the positive class (column 1); estimators without
    # predict_proba get AUC = None instead of raising.
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    calculated_metrics = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob) if y_prob is not None else None,
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }
    results.append(calculated_metrics)

    # Save trained model (spaces in the display name become underscores)
    joblib.dump(model, f"project-folder/model/{name.replace(' ', '_')}.pkl")

results_data_frames = pd.DataFrame(results)
print(results_data_frames)


                 Model  Accuracy       AUC  Precision    Recall        F1  \
0  Logistic Regression  0.982456  0.998089   0.968750  0.984127  0.976378   
1        Decision Tree  0.918129  0.925265   0.845070  0.952381  0.895522   
2                  KNN  0.959064  0.977660   0.951613  0.936508  0.944000   
3          Naive Bayes  0.935673  0.992651   0.919355  0.904762  0.912000   
4        Random Forest  0.970760  0.996620   0.983333  0.936508  0.959350   
5              XGBoost  0.964912  0.995003   0.938462  0.968254  0.953125   

        MCC  
0  0.962501  
1  0.832611  
2  0.911818  
3  0.861382  
4  0.937219  
5  0.925387  
