
# Machine Learning Classification Assignment

## Dataset: Breast Cancer Classification (Built-in Dataset)

This notebook includes:
1. Dataset Overview
2. Data Preprocessing and Model Training (6 Models)
3. Model Comparison
4. Prediction Example
5. Model Saving

Dataset Source: scikit-learn built-in dataset (`sklearn.datasets.load_breast_cancer`)  
Samples: 569  
Features: 30  
Type: Binary Classification


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import matthews_corrcoef, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import joblib
import os


## 1. Dataset Overview

In [None]:

# Load the bundled breast-cancer dataset into a feature frame and a label series.
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Single frame holding the features plus the label column, used by later cells.
df = X.assign(target=y)

print("Dataset Shape:", df.shape)
print("\nFirst 5 Rows:")
display(df.head())

# Count each class once and reuse the result for both the text and the chart.
class_counts = df["target"].value_counts()

print("\nClass Distribution:")
print(class_counts)

ax = class_counts.plot(kind="bar")
ax.set_title("Target Class Distribution")
ax.set_xlabel("Class (0=Malignant, 1=Benign)")
ax.set_ylabel("Count")
plt.show()


## 2. Data Preprocessing and Model Training

In [None]:

# Split the data, standardize the features, then fit and score six classifiers.
X = df.drop("target", axis=1)
y = df["target"]

# One seed shared by the split and every stochastic estimator, so that
# Restart & Run All reproduces the same metrics table.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Fit the scaler on the training split only; the test split is merely transformed,
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    # `use_label_encoder` is deprecated in XGBoost and only triggers a warning
    # on recent releases, so it is no longer passed.
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=RANDOM_STATE),
}

# Models trained and evaluated on the standardized features; every other model
# uses the raw feature values.
SCALED_MODELS = {"Logistic Regression", "KNN"}


def evaluate_model(name, model, X_fit, X_eval, y_fit, y_eval):
    """Fit `model` on the training data and score it on the evaluation data.

    Parameters
    ----------
    name : str
        Label stored in the first position of the returned row.
    model : estimator
        Object exposing `fit`, `predict` and `predict_proba`.
    X_fit, y_fit
        Features and labels used for fitting.
    X_eval, y_eval
        Features and labels used for scoring.

    Returns
    -------
    list
        `[name, accuracy, auc, precision, recall, f1, mcc]`.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    # AUC is computed from the second column of predict_proba.
    y_prob = model.predict_proba(X_eval)[:, 1]

    # NOTE: precision/recall/F1 use scikit-learn's default positive label (1),
    # which is the benign class in this dataset.
    return [
        name,
        accuracy_score(y_eval, y_pred),
        roc_auc_score(y_eval, y_prob),
        precision_score(y_eval, y_pred),
        recall_score(y_eval, y_pred),
        f1_score(y_eval, y_pred),
        matthews_corrcoef(y_eval, y_pred),
    ]


results = []

for name, model in models.items():
    if name in SCALED_MODELS:
        row = evaluate_model(name, model, X_train_scaled, X_test_scaled, y_train, y_test)
    else:
        row = evaluate_model(name, model, X_train, X_test, y_train, y_test)
    results.append(row)

print("Training Completed Successfully!")


## 3. Model Comparison

In [None]:

# Tabulate the per-model metrics, ranked from highest to lowest accuracy.
metric_columns = ["Model", "Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"]
results_df = pd.DataFrame(results, columns=metric_columns).sort_values(
    by="Accuracy", ascending=False
)
display(results_df)

# Horizontal bar chart of test accuracy per model.
fig, ax = plt.subplots()
ax.barh(results_df["Model"], results_df["Accuracy"])
ax.set_xlabel("Accuracy")
ax.set_title("Model Accuracy Comparison")
plt.show()


## 4. Prediction Example

In [None]:

# Choose the model with the highest test accuracy from the comparison table
# instead of assuming a particular winner.
best_name = results_df.loc[results_df["Accuracy"].idxmax(), "Model"]
best_model = models[best_name]

sample = X_test.iloc[0:1]

# Logistic Regression and KNN were fitted on standardized features, so the
# sample must go through the same fitted scaler before prediction.
if best_name in ("Logistic Regression", "KNN"):
    sample_input = scaler.transform(sample)
else:
    sample_input = sample

prediction = best_model.predict(sample_input)

print("Best Model:", best_name)
print("Predicted Class:", prediction[0])
print("Actual Class:", y_test.iloc[0])


## 5. Save Models

In [None]:

# Persist every trained model, plus the fitted scaler, under ./model.
os.makedirs("model", exist_ok=True)

for name, model in models.items():
    joblib.dump(model, f"model/{name}.pkl")

# Logistic Regression and KNN were trained on standardized features; without the
# fitted scaler their saved models cannot be applied correctly to new raw data.
joblib.dump(scaler, "model/scaler.pkl")

print("All models saved successfully!")
