In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Load the breast-cancer CSV from the local kagglehub cache.
csv_path = "~/.cache/kagglehub/datasets/uciml/breast-cancer-wisconsin-data/versions/2/data.csv"
df = pd.read_csv(csv_path)

target_col = "diagnosis"

# Features are every column except the label, the "id" column and "Unnamed: 32"
# (presumably an empty artifact column in this CSV -- TODO confirm).
X = df.drop(columns=[target_col, "id", "Unnamed: 32"])

# Encode the label as 1 (M / Malignant) or 0 (B / Benign).
y = df[target_col].map({"M": 1, "B": 0, "Malignant": 1, "Benign": 0})

# Series.map yields NaN for any label missing from the mapping; fail loudly
# here rather than letting NaN targets reach the stratified split below.
unmapped = df.loc[y.isna(), target_col].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected values in {target_col!r}: {unmapped.tolist()}")

# Hold out 20% of the rows as a stratified test set. Only the training part is
# used for cross-validation; X_test / y_test are not touched in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Seeded 5-fold stratified splitter, reused by the other model cells so that
# every model is scored on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Baseline model: a single decision tree. The scaler lives inside the pipeline
# so it is re-fit on each training fold. random_state is set because the tree
# permutes features at every split; without it the scores vary between runs.
pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("decision_tree", DecisionTreeClassifier(random_state=42)),
    ]
)

scores = cross_validate(pipe, X_train, y_train, cv=cv, scoring="accuracy", return_train_score=True)

# Per-fold accuracies on the validation folds and on the training folds.
val = scores["test_score"]
trn = scores["train_score"]

print(f"val: {np.mean(val)}, train: {np.mean(trn)}")

val: 0.9274725274725274, train: 1.0


In [None]:
# Random forest with default hyper-parameters (seeded), scored with the same
# cross-validation splitter and training data as the other model cells.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("random_forest", RandomForestClassifier(random_state=42)),
])

scores = cross_validate(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=cv,
    return_train_score=True,
)

# Per-fold accuracies: validation folds first, training folds second.
val, trn = scores["test_score"], scores["train_score"]

print(f"val: {np.mean(val)}, train: {np.mean(trn)}")

val: [0.95604396 0.98901099 0.94505495 0.96703297 0.96703297], train: 1.0


In [None]:
# Gradient boosting over 100 depth-1 trees with learning rate 1.0, scored with
# the same cross-validation splitter and training data as the other model cells.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("gdbt", GradientBoostingClassifier(
        max_depth=1,
        learning_rate=1.0,
        n_estimators=100,
        random_state=0,
    )),
])

scores = cross_validate(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=cv,
    return_train_score=True,
)

# Per-fold accuracies: validation folds first, training folds second.
val, trn = scores["test_score"], scores["train_score"]

print(f"val: {np.mean(val)}, train: {np.mean(trn)}")

val: 0.9670329670329672, train: 1.0
