In [24]:
# === Cell 0. Imports, warnings, MLflow setup ===
import warnings, os, json
warnings.filterwarnings("ignore", category=UserWarning, message="Bins whose width are too small")
warnings.filterwarnings("ignore", category=FutureWarning, message="`fit_params` is deprecated")

import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, KBinsDiscretizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin

from category_encoders import TargetEncoder

import mlflow, mlflow.sklearn
from mlflow.models import infer_signature

# MLflow (UI у тебя на http://127.0.0.1:5000)
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("LR2 - Experiments")

# Папка для артефактов ноутбука
os.makedirs("artifacts", exist_ok=True)

print("✅ Init done")


✅ Init done


In [25]:
# === Cell 1. Load data ===
DATA_PATH = Path("/home/user/my_proj/data/dataset.csv")  # <-- твой реальный путь (по tree)
assert DATA_PATH.exists(), f"Нет файла: {DATA_PATH}"

df = pd.read_csv(DATA_PATH)
print("df.shape:", df.shape)
df.head()


df.shape: (303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [26]:
# === Cell 2. Split & feature lists ===
TARGET = "target"
assert TARGET in df.columns, f"В датасете нет колонки '{TARGET}'"

X = df.drop(columns=[TARGET])
y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

def auto_cats(df, exclude_cols=(), max_unique_abs=12, max_unique_ratio=0.05):
    cats = []
    n = len(df)
    thr = max(2, min(max_unique_abs, int(max_unique_ratio*n)))  # <= min(12, 5% от строк)
    for c in df.columns:
        if c in exclude_cols: continue
        u = df[c].nunique(dropna=True)
        if u <= thr:
            cats.append(c)
    return cats

# предпочтительный список для heart-disease; оставим только существующие
preferred = ["sex","cp","fbs","restecg","exang","slope","ca","thal"]
categorical_features = [c for c in preferred if c in X_train.columns]

# если пересечение пустое — включим авто-детект (категориальные = мало уникальных)
if not categorical_features:
    categorical_features = auto_cats(X_train, exclude_cols=[TARGET])

numeric_features = [c for c in X_train.columns if c not in categorical_features]

# Для FE:
poly_cols  = [c for c in ["age","trestbps","chol"] if c in numeric_features]
kbins_cols = [c for c in ["oldpeak","thalach"]     if c in numeric_features]
num_std_cols = [c for c in numeric_features if c not in poly_cols+kbins_cols]

print("Категориальные:", categorical_features)
print("Числовые (примеры):", numeric_features[:8])
print("poly_cols:", poly_cols, "kbins_cols:", kbins_cols, "num_std_cols:", num_std_cols[:6])


Категориальные: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
Числовые (примеры): ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
poly_cols: ['age', 'trestbps', 'chol'] kbins_cols: ['oldpeak', 'thalach'] num_std_cols: []


In [28]:
# === Cell 3. Baseline pipeline (TE -> CT -> RF), metrics & MLflow log ===

# шагаем: (1) TargetEncoder по именам (на DataFrame), (2) ColumnTransformer для числовых
te = TargetEncoder(cols=categorical_features) if categorical_features else "drop"

poly = Pipeline([("scale", StandardScaler()),
                 ("poly", PolynomialFeatures(degree=2, include_bias=False))])

kbins = Pipeline([("kb", KBinsDiscretizer(n_bins=3, encode="onehot-dense", strategy="uniform"))])

transformers = []
if num_std_cols:
    transformers.append(("num_std", StandardScaler(), num_std_cols))
if poly_cols:
    transformers.append(("poly", poly, poly_cols))
if kbins_cols:
    transformers.append(("kbins", kbins, kbins_cols))
if categorical_features:
    # после TE эти колонки уже числовые → просто пропускаем
    transformers.append(("cat_passthrough", "passthrough", categorical_features))

ct = ColumnTransformer(transformers, remainder="drop")

baseline_pipe = Pipeline([
    ("te", te),
    ("fe", ct),
    ("model", RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])

baseline_pipe.fit(X_train, y_train)
y_proba = baseline_pipe.predict_proba(X_test)[:,1]
y_pred  = (y_proba >= 0.5).astype(int)

metrics_baseline = {
    "precision": float(precision_score(y_test, y_pred)),
    "recall":    float(recall_score(y_test, y_pred)),
    "f1":        float(f1_score(y_test, y_pred)),
    "roc_auc":   float(roc_auc_score(y_test, y_proba)),
}
metrics_baseline
# === Cell 3b. MLflow log (baseline) + артефакт числа фич ===
# посчитаем число признаков после FE на train (TE -> CT)
Xtr_te = baseline_pipe.named_steps['te'].fit_transform(X_train, y_train) if categorical_features else X_train
Xtr_fe = baseline_pipe.named_steps['fe'].fit_transform(Xtr_te, y_train)
with open("artifacts/fe_shape.json","w") as f:
    json.dump({"n_features": int(Xtr_fe.shape[1])}, f)

with mlflow.start_run(run_name="baseline_TE_CT_RF"):
    mlflow.log_metrics(metrics_baseline)
    mlflow.log_params({
        "task": "classification",
        "n_estimators": 200,
        "poly_cols": ",".join(poly_cols),
        "kbins_cols": ",".join(kbins_cols),
        "categoricals": ",".join(categorical_features),
    })
    mlflow.log_artifact("artifacts/fe_shape.json")

    sig = infer_signature(X_train, baseline_pipe.predict_proba(X_train)[:,1])
    mlflow.sklearn.log_model(baseline_pipe, "model", signature=sig, input_example=X_train.head(5))

print("✅ Baseline залогирован в MLflow")


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 524.74it/s] 
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
2025/10/19 21:45:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline_TE_CT_RF at: http://127.0.0.1:5000/#/experiments/1/runs/04b6c1e398dc44c485f8b9fda3288fad.
2025/10/19 21:45:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.


✅ Baseline залогирован в MLflow


In [31]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# (опц.) мини-сэмплинг train на время подбора — ускоряет в 3-5 раз:
X_tr_small = X_train.sample(frac=0.7, random_state=42)
y_tr_small = y_train.loc[X_tr_small.index]

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 80, 160, step=40),  # меньше деревьев
        "max_depth":    trial.suggest_int("max_depth", 3, 12),
        "max_features": trial.suggest_float("max_features", 0.5, 0.95),
    }
    est = RandomForestClassifier(random_state=42, n_jobs=1, **params)  # n_jobs=1
    pipe = Pipeline([
        ("te", baseline_pipe.named_steps['te']),
        ("fe", baseline_pipe.named_steps['fe']),
        ("model", est),
    ])
    # cv=2 и n_jobs=1 → стабильно и быстро
    return cross_val_score(pipe, X_tr_small, y_tr_small, cv=2, scoring="f1", n_jobs=1).mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=6, timeout=60)  # 6 трайлов или 60 секунд
print("Best f1:", study.best_value)
print("Best params:", study.best_params)


[I 2025-10-19 21:53:56,720] A new study created in memory with name: no-name-0db56c2d-f5c2-40d8-b15f-67ee8be3a2c1
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column

Best f1: 0.7773799234473392
Best params: {'n_estimators': 160, 'max_depth': 5, 'max_features': 0.5489790771229004}


In [33]:
# === Cell 6A. Best model (без SFS) + MLflow log ===
best_model = Pipeline([
    ("te", baseline_pipe.named_steps['te']),
    ("fe", baseline_pipe.named_steps['fe']),
    ("model", RandomForestClassifier(random_state=42, n_jobs=-1, **study.best_params)),
]).fit(X_train, y_train)

y_proba_best = best_model.predict_proba(X_test)[:,1]
y_pred_best  = (y_proba_best >= 0.5).astype(int)
metrics_best = {
    "precision": float(precision_score(y_test, y_pred_best)),
    "recall":    float(recall_score(y_test, y_pred_best)),
    "f1":        float(f1_score(y_test, y_pred_best)),
    "roc_auc":   float(roc_auc_score(y_test, y_proba_best)),
}
metrics_best
with mlflow.start_run(run_name="optuna_best_RF_TE_CT"):
    mlflow.log_metrics(metrics_best)
    mlflow.log_params(study.best_params)
    if os.path.exists("artifacts/fe_shape.json"):
        mlflow.log_artifact("artifacts/fe_shape.json")
    sig = infer_signature(X_train, best_model.predict_proba(X_train)[:,1])
    mlflow.sklearn.log_model(best_model, "model", signature=sig, input_example=X_train.head(5))

print("✅ Optuna best (без SFS) залогирован")


  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.

✅ Optuna best (без SFS) залогирован


In [34]:
# === Cell 7. Final full fit (на всех данных) + артефакты ===
# используем best_model из предыдущей ячейки (A или B)

final_model = best_model.fit(X, y)

with open("artifacts/source_columns.json","w") as f:
    json.dump(list(X.columns), f)

with mlflow.start_run(run_name="final_full_fit_production"):
    # метрики на full fit уже не считаем согласно заданию
    if os.path.exists("../requirements.txt"):
        mlflow.log_artifact("../requirements.txt")
    mlflow.log_artifact("artifacts/source_columns.json")
    sig_full = infer_signature(X, final_model.predict_proba(X)[:,1])
    mlflow.sklearn.log_model(final_model, "model", signature=sig_full, input_example=X.head(5))

print("✅ Финальная модель на всей выборке залогирована. Сделай её Production в UI.")


  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.

✅ Финальная модель на всей выборке залогирована. Сделай её Production в UI.
