In [1]:
# Improvement’da train qilamiz:
# OVR LogisticRegression (2 variant), OVR LinearSVC, OVR SGD (log_loss + hinge), OVR ComplementNB

# har biriga VAL’da per-label threshold tuning + model+thresholds+metrics save.

# CELL 1 — Imports
from __future__ import annotations

from pathlib import Path
from datetime import datetime
import json

import numpy as np
import pandas as pd
from scipy import sparse
import joblib

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import ComplementNB

In [2]:
#CELL 2 — Config + PROJECT_ROOT + Load Engineered Data
# =========================
# CONFIG
# =========================
FE_VERSION = "fe_v1_fs_chi2_v1"  # Data/Feature_Selected/fe_v1_fs_chi2_v1
ENGINEERED_DIR_NAME = "Feature_Selected"

# Output run id (folder)
RUN_ID = datetime.now().strftime("impr_%Y%m%d_%H%M%S")

# Where to save
# - models + thresholds + metrics
# - summary
ART_MODELS_DIR_REL = Path("artifacts") / "models" / "improvement" / RUN_ID
RES_DIR_REL = Path("results") / "improvement" / RUN_ID

# Threshold tuning grid size (har label uchun)
N_THR = 61  # 61 quantile threshold (tez va yetarli)

RANDOM_STATE = 42
N_JOBS = 1  # Windows/OneDrive muammolari uchun 1


def find_project_root_by_engineered(start: Path | None = None) -> tuple[Path, Path]:
    """
    PROJECT_ROOT ni Data/Engineered_data/<FE_VERSION>/X_train.npz bor joydan topadi.
    Returns: (project_root, engineered_dir)
    """
    start = start or Path.cwd()
    checked = []
    for p in [start] + list(start.parents):
        ed = p / "Data" / ENGINEERED_DIR_NAME / FE_VERSION
        checked.append(ed)
        if (ed / "X_train.npz").exists() and (ed / "Y_train.npy").exists() and (ed / "engineered_meta.json").exists():
            return p, ed

    raise FileNotFoundError(
        "Engineered data topilmadi (parents bo‘ylab qidirildi).\n"
        f"Start: {start.resolve()}\n"
        "Oxirgi 10 tekshirilgan yo‘l:\n" + "\n".join(str(x) for x in checked[-10:]) +
        "\n\nYECHIM:\n"
        "Avval 09_feature_engineering ni run qiling (Data/Engineered_data/fe_v1/ chiqishi kerak)."
    )


PROJECT_ROOT, ENGINEERED_DIR = find_project_root_by_engineered()
print("PROJECT_ROOT:", PROJECT_ROOT.resolve())
print("ENGINEERED_DIR:", ENGINEERED_DIR.resolve())

ART_MODELS_DIR = PROJECT_ROOT / ART_MODELS_DIR_REL
RES_DIR = PROJECT_ROOT / RES_DIR_REL
ART_MODELS_DIR.mkdir(parents=True, exist_ok=True)
RES_DIR.mkdir(parents=True, exist_ok=True)

# ---- Load meta ----
with open(ENGINEERED_DIR / "engineered_meta.json", "r", encoding="utf-8") as f:
    meta = json.load(f)

y_cols = meta["y_cols"]
TEXT_COL = meta.get("text_col", "REAC_pt_symptom_v2")
print("Num labels:", len(y_cols))

# ---- Load X ----
X_train = sparse.load_npz(ENGINEERED_DIR / "X_train.npz").tocsr()
X_val   = sparse.load_npz(ENGINEERED_DIR / "X_val.npz").tocsr()
X_test  = sparse.load_npz(ENGINEERED_DIR / "X_test.npz").tocsr()

# ---- Load Y ----
Y_train = np.load(ENGINEERED_DIR / "Y_train.npy")
Y_val   = np.load(ENGINEERED_DIR / "Y_val.npy")
Y_test  = np.load(ENGINEERED_DIR / "Y_test.npy")

print("X:", X_train.shape, X_val.shape, X_test.shape)
print("Y:", Y_train.shape, Y_val.shape, Y_test.shape)

# ---- Load IDs (optional) ----
ids_train_path = ENGINEERED_DIR / "ids_train.csv"
ids_val_path   = ENGINEERED_DIR / "ids_val.csv"
ids_test_path  = ENGINEERED_DIR / "ids_test.csv"

IDS_AVAILABLE = ids_test_path.exists()
if IDS_AVAILABLE:
    ids_test = pd.read_csv(ids_test_path)
    print("IDs:", ids_test.columns.tolist())
else:
    ids_test = None
    print("IDs not found (ok).")

PROJECT_ROOT: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract
ENGINEERED_DIR: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Feature_Selected\fe_v1_fs_chi2_v1
Num labels: 21
X: (201176, 33671) (24410, 33671) (24164, 33671)
Y: (201176, 21) (24410, 21) (24164, 21)
IDs: ['primaryid']


In [3]:
#CELL 3 — Metrics helpers (per-label + micro/macro)
def prf_from_counts(tp: int, fp: int, fn: int) -> tuple[float, float, float]:
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    rec  = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1   = (2 * prec * rec / (prec + rec)) if (prec + rec) > 0 else 0.0
    return prec, rec, f1


def multilabel_micro_macro(Y_true: np.ndarray, Y_pred: np.ndarray) -> dict:
    # micro
    tp = int(((Y_true == 1) & (Y_pred == 1)).sum())
    fp = int(((Y_true == 0) & (Y_pred == 1)).sum())
    fn = int(((Y_true == 1) & (Y_pred == 0)).sum())
    micro_p, micro_r, micro_f1 = prf_from_counts(tp, fp, fn)

    # macro (avg of per-label f1)
    f1s = []
    ps = []
    rs = []
    for j in range(Y_true.shape[1]):
        y = Y_true[:, j]
        p = Y_pred[:, j]
        tpj = int(((y == 1) & (p == 1)).sum())
        fpj = int(((y == 0) & (p == 1)).sum())
        fnj = int(((y == 1) & (p == 0)).sum())
        pj_, rj_, f1j_ = prf_from_counts(tpj, fpj, fnj)
        ps.append(pj_)
        rs.append(rj_)
        f1s.append(f1j_)
    macro_p = float(np.mean(ps))
    macro_r = float(np.mean(rs))
    macro_f1 = float(np.mean(f1s))

    return {
        "micro_precision": micro_p,
        "micro_recall": micro_r,
        "micro_f1": micro_f1,
        "macro_precision": macro_p,
        "macro_recall": macro_r,
        "macro_f1": macro_f1,
    }


def per_label_report(Y_true: np.ndarray, Y_pred: np.ndarray, thresholds: np.ndarray) -> pd.DataFrame:
    rows = []
    for j, lab in enumerate(y_cols):
        y = Y_true[:, j]
        p = Y_pred[:, j]
        tp = int(((y == 1) & (p == 1)).sum())
        fp = int(((y == 0) & (p == 1)).sum())
        fn = int(((y == 1) & (p == 0)).sum())
        tn = int(((y == 0) & (p == 0)).sum())
        prec, rec, f1 = prf_from_counts(tp, fp, fn)
        rows.append({
            "label": lab,
            "support_pos": int(y.sum()),
            "support_neg": int((y == 0).sum()),
            "tp": tp, "fp": fp, "fn": fn, "tn": tn,
            "precision": prec, "recall": rec, "f1": f1,
            "threshold": float(thresholds[j]),
        })
    return pd.DataFrame(rows)

In [4]:
#CELL 4 — Threshold tuning (VAL’da per-label)
def score_matrix(model, X) -> tuple[np.ndarray, str]:
    """
    Returns:
      scores: (n_samples, n_labels)
      mode: "proba" yoki "score"
    """
    if hasattr(model, "predict_proba"):
        P = model.predict_proba(X)
        return np.asarray(P), "proba"
    if hasattr(model, "decision_function"):
        S = model.decision_function(X)
        return np.asarray(S), "score"
    # fallback: predict -> 0/1
    Yp = model.predict(X)
    return np.asarray(Yp), "binary"


def tune_thresholds_per_label(Y_true: np.ndarray, scores: np.ndarray, mode: str, n_thr: int = 61) -> np.ndarray:
    """
    Har label uchun threshold: F1 max bo‘ladigan thr.
    mode:
      - "proba" => thresholds score>=thr, thr quantiles [0.01..0.99]
      - "score" => thresholds score>=thr, thr quantiles [0.01..0.99]
    """
    n_labels = Y_true.shape[1]
    thr_out = np.zeros(n_labels, dtype=np.float32)

    q = np.linspace(0.01, 0.99, n_thr)

    for j in range(n_labels):
        y = Y_true[:, j].astype(np.int8)
        s = scores[:, j].astype(np.float32)

        # Agar label umuman positive bo‘lmasa: doim 0 predict (thr juda katta)
        if int(y.sum()) == 0:
            thr_out[j] = 1.0 if mode == "proba" else float(np.max(s) + 1.0)
            continue

        # Threshold grid: score quantile’lari
        thr_grid = np.unique(np.quantile(s, q))
        if thr_grid.size < 10:
            # fallback
            mn, mx = float(np.min(s)), float(np.max(s))
            if mn == mx:
                thr_grid = np.array([mn], dtype=np.float32)
            else:
                thr_grid = np.linspace(mn, mx, num=min(61, max(11, thr_grid.size)), dtype=np.float32)

        best_f1 = -1.0
        best_thr = float(thr_grid[len(thr_grid)//2])

        for thr in thr_grid:
            pred = (s >= thr).astype(np.int8)
            tp = int(((y == 1) & (pred == 1)).sum())
            fp = int(((y == 0) & (pred == 1)).sum())
            fn = int(((y == 1) & (pred == 0)).sum())
            _, _, f1 = prf_from_counts(tp, fp, fn)
            if f1 > best_f1:
                best_f1 = f1
                best_thr = float(thr)

        thr_out[j] = best_thr

    return thr_out


def apply_thresholds(scores: np.ndarray, thresholds: np.ndarray) -> np.ndarray:
    return (scores >= thresholds.reshape(1, -1)).astype(np.int8)

In [5]:
# CELL 5A — model registry (hammasi shu yerda)

# CELL 5B — qaysi modelni train qilishni tanlash

# CELL 5C — bitta modelni train → VAL threshold tuning → save

# ✅ CELL 5A — Model registry (loop yo‘q)
# =========================================================
# CELL 5A — Model registry (manual train uchun)
# =========================================================
models = {}

# 1) OVR Logistic Regression (2 variant)
models["ovr_logreg_bal_C1"] = OneVsRestClassifier(
    LogisticRegression(
        solver="liblinear",
        max_iter=3000,
        C=1.0,
        class_weight="balanced",
    ),
    n_jobs=N_JOBS,
)

models["ovr_logreg_bal_C2"] = OneVsRestClassifier(
    LogisticRegression(
        solver="liblinear",
        max_iter=3000,
        C=2.0,
        class_weight="balanced",
    ),
    n_jobs=N_JOBS,
)

# 2) OVR LinearSVC
models["ovr_linearsvc_C1"] = OneVsRestClassifier(
    LinearSVC(C=1.0, random_state=RANDOM_STATE),
    n_jobs=N_JOBS,
)

# 3) OVR SGD (log_loss)
models["ovr_sgd_logloss"] = OneVsRestClassifier(
    SGDClassifier(
        loss="log_loss",
        alpha=1e-5,
        max_iter=2000,
        tol=1e-3,
        random_state=RANDOM_STATE,
    ),
    n_jobs=N_JOBS,
)

# 4) OVR SGD (hinge)
models["ovr_sgd_hinge"] = OneVsRestClassifier(
    SGDClassifier(
        loss="hinge",
        alpha=1e-5,
        max_iter=2000,
        tol=1e-3,
        random_state=RANDOM_STATE,
    ),
    n_jobs=N_JOBS,
)

# 5) OVR ComplementNB
models["ovr_complementnb_a05"] = OneVsRestClassifier(
    ComplementNB(alpha=0.5),
    n_jobs=N_JOBS,
)

print("Available models:")
for k in models.keys():
    print(" -", k)

Available models:
 - ovr_logreg_bal_C1
 - ovr_logreg_bal_C2
 - ovr_linearsvc_C1
 - ovr_sgd_logloss
 - ovr_sgd_hinge
 - ovr_complementnb_a05


# 1)OVR Logistic Regression(2ta variant)

In [6]:
#CELL 5B — Model tanlash (siz faqat shu qatordagi nomni o‘zgartirasiz)
# =========================================================
# CELL 5B — Choose ONE model to train (manual)
# =========================================================
MODEL_NAME = "ovr_logreg_bal_C1"   # <-- faqat shuni almashtirib bosamiz
clf = models[MODEL_NAME]
print("Selected:", MODEL_NAME, "->", clf)

Selected: ovr_logreg_bal_C1 -> OneVsRestClassifier(estimator=LogisticRegression(class_weight='balanced',
                                                 max_iter=3000,
                                                 solver='liblinear'),
                    n_jobs=1)


In [None]:
# CELL 5C — Train ONE model + VAL thresholds + SAVE (loop yo‘q)
# =========================================================
# CELL 5C — Train ONE model, tune thresholds on VAL, save
# =========================================================
print("\n" + "="*90)
print("TRAIN:", MODEL_NAME)
print("="*90)

# --- fit ---
clf.fit(X_train, Y_train)

# --- scores on VAL ---
S_val, mode = score_matrix(clf, X_val)
print("Score mode:", mode, "| shape:", S_val.shape)

# --- tune thresholds on VAL ---
thr = tune_thresholds_per_label(Y_val, S_val, mode=mode, n_thr=N_THR)
Y_val_pred = apply_thresholds(S_val, thr)

# --- metrics on VAL ---
overall = multilabel_micro_macro(Y_val, Y_val_pred)
print("VAL micro_f1:", overall["micro_f1"], "| macro_f1:", overall["macro_f1"])

per_label = per_label_report(Y_val, Y_val_pred, thr)

# --- save artifacts ---
model_path = ART_MODELS_DIR / f"{MODEL_NAME}.joblib"
thr_path   = ART_MODELS_DIR / f"{MODEL_NAME}_thresholds.json"
val_metrics_path = RES_DIR / f"{MODEL_NAME}_val_per_label_metrics.csv"
val_summary_path = RES_DIR / f"{MODEL_NAME}_val_summary.json"

joblib.dump(clf, model_path)

thr_dict = {lab.replace("y_", "", 1): float(t) for lab, t in zip(y_cols, thr)}
with open(thr_path, "w", encoding="utf-8") as f:
    json.dump(thr_dict, f, ensure_ascii=False, indent=2)

per_label.to_csv(val_metrics_path, index=False)
with open(val_summary_path, "w", encoding="utf-8") as f:
    json.dump({"model": MODEL_NAME, "mode": mode, **overall}, f, ensure_ascii=False, indent=2)

print("Saved model:", model_path.resolve())
print("Saved thresholds:", thr_path.resolve())
print("Saved val per-label:", val_metrics_path.resolve())
print("Saved val summary:", val_summary_path.resolve())

# ko‘rsatib qo‘yamiz
per_label.sort_values("f1").head(10)

# 5minut


TRAIN: ovr_logreg_bal_C1
Score mode: proba | shape: (24410, 21)
VAL micro_f1: 0.9784034554471285 | macro_f1: 0.9441853209671194
Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\artifacts\models\improvement\impr_20260225_211312\ovr_logreg_bal_C1.joblib
Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\artifacts\models\improvement\impr_20260225_211312\ovr_logreg_bal_C1_thresholds.json
Saved val per-label: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement\impr_20260225_211312\ovr_logreg_bal_C1_val_per_label_metrics.csv
Saved val summary: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement\impr_20260225_211312\ovr_logreg_bal_C1_val_summary.json


Unnamed: 0,label,support_pos,support_neg,tp,fp,fn,tn,precision,recall,f1,threshold
16,y_pregnancy_reproductive,347,24063,95,0,252,24063,1.0,0.273775,0.429864,0.999785
6,y_hepatic,844,23566,844,198,0,23368,0.809981,1.0,0.895016,0.176502
7,y_hypersensitivity_allergy,771,23639,643,0,128,23639,1.0,0.833982,0.909477,0.988019
11,y_metabolic_endocrine,1256,23154,1256,185,0,22969,0.871617,1.0,0.931405,0.18554
2,y_edema_swelling,1600,22810,1441,0,159,22810,1.0,0.900625,0.947715,0.979509
18,y_renal,618,23792,608,35,10,23757,0.945568,0.983819,0.964314,0.903791
10,y_injury_accident,1718,22692,1718,121,0,22571,0.934203,1.0,0.965983,0.184867
14,y_ocular_visual,1106,23304,1041,1,65,23303,0.99904,0.94123,0.969274,0.913351
20,y_urinary,616,23794,616,27,0,23767,0.958009,1.0,0.978554,0.458375
0,y_cardiovascular,2528,21882,2528,109,0,21773,0.958665,1.0,0.978896,0.377064


# 2chi variant

In [9]:
#CELL 5B — Model tanlash (siz faqat shu qatordagi nomni o‘zgartirasiz)
# =========================================================
# CELL 5B — Choose ONE model to train (manual)
# =========================================================
MODEL_NAME = "ovr_logreg_bal_C2"   # <-- faqat shuni almashtirib bosamiz
clf = models[MODEL_NAME]
print("Selected:", MODEL_NAME, "->", clf)

Selected: ovr_logreg_bal_C2 -> OneVsRestClassifier(estimator=LogisticRegression(C=2.0, class_weight='balanced',
                                                 max_iter=3000,
                                                 solver='liblinear'),
                    n_jobs=1)


In [10]:
# CELL 5C — Train ONE model + VAL thresholds + SAVE (loop yo‘q)
# =========================================================
# CELL 5C — Train ONE model, tune thresholds on VAL, save
# =========================================================
print("\n" + "="*90)
print("TRAIN:", MODEL_NAME)
print("="*90)

# --- fit ---
clf.fit(X_train, Y_train)

# --- scores on VAL ---
S_val, mode = score_matrix(clf, X_val)
print("Score mode:", mode, "| shape:", S_val.shape)

# --- tune thresholds on VAL ---
thr = tune_thresholds_per_label(Y_val, S_val, mode=mode, n_thr=N_THR)
Y_val_pred = apply_thresholds(S_val, thr)

# --- metrics on VAL ---
overall = multilabel_micro_macro(Y_val, Y_val_pred)
print("VAL micro_f1:", overall["micro_f1"], "| macro_f1:", overall["macro_f1"])

per_label = per_label_report(Y_val, Y_val_pred, thr)

# --- save artifacts ---
model_path = ART_MODELS_DIR / f"{MODEL_NAME}.joblib"
thr_path   = ART_MODELS_DIR / f"{MODEL_NAME}_thresholds.json"
val_metrics_path = RES_DIR / f"{MODEL_NAME}_val_per_label_metrics.csv"
val_summary_path = RES_DIR / f"{MODEL_NAME}_val_summary.json"

joblib.dump(clf, model_path)

thr_dict = {lab.replace("y_", "", 1): float(t) for lab, t in zip(y_cols, thr)}
with open(thr_path, "w", encoding="utf-8") as f:
    json.dump(thr_dict, f, ensure_ascii=False, indent=2)

per_label.to_csv(val_metrics_path, index=False)
with open(val_summary_path, "w", encoding="utf-8") as f:
    json.dump({"model": MODEL_NAME, "mode": mode, **overall}, f, ensure_ascii=False, indent=2)

print("Saved model:", model_path.resolve())
print("Saved thresholds:", thr_path.resolve())
print("Saved val per-label:", val_metrics_path.resolve())
print("Saved val summary:", val_summary_path.resolve())

# ko‘rsatib qo‘yamiz
per_label.sort_values("f1").head(10)


TRAIN: ovr_logreg_bal_C2
Score mode: proba | shape: (24410, 21)
VAL micro_f1: 0.9803763087160016 | macro_f1: 0.9658361670431072
Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\artifacts\models\improvement\impr_20260225_211312\ovr_logreg_bal_C2.joblib
Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\artifacts\models\improvement\impr_20260225_211312\ovr_logreg_bal_C2_thresholds.json
Saved val per-label: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement\impr_20260225_211312\ovr_logreg_bal_C2_val_per_label_metrics.csv
Saved val summary: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement\impr_20260225_211312\ovr_logreg_bal_C2_val_summary.json


Unnamed: 0,label,support_pos,support_neg,tp,fp,fn,tn,precision,recall,f1,threshold
16,y_pregnancy_reproductive,347,24063,258,0,89,24063,1.0,0.743516,0.852893,0.999936
6,y_hepatic,844,23566,844,198,0,23368,0.809981,1.0,0.895016,0.113557
7,y_hypersensitivity_allergy,771,23639,643,0,128,23639,1.0,0.833982,0.909477,0.992646
11,y_metabolic_endocrine,1256,23154,1256,185,0,22969,0.871617,1.0,0.931405,0.126919
2,y_edema_swelling,1600,22810,1441,0,159,22810,1.0,0.900625,0.947715,0.98695
10,y_injury_accident,1718,22692,1718,121,0,22571,0.934203,1.0,0.965983,0.131319
14,y_ocular_visual,1106,23304,1042,0,64,23304,1.0,0.942134,0.970205,0.934361
20,y_urinary,616,23794,616,27,0,23767,0.958009,1.0,0.978554,0.363738
0,y_cardiovascular,2528,21882,2528,109,0,21773,0.958665,1.0,0.978896,0.283991
19,y_respiratory,2540,21870,2540,97,0,21773,0.963216,1.0,0.981263,0.242834


# 2)OVR LinearSVC

In [11]:
# =========================================================
MODEL_NAME = "ovr_linearsvc_C1"   # <-- faqat shuni almashtirib bosamiz
clf = models[MODEL_NAME]
print("Selected:", MODEL_NAME, "->", clf)

Selected: ovr_linearsvc_C1 -> OneVsRestClassifier(estimator=LinearSVC(random_state=42), n_jobs=1)


In [12]:
# CELL 5C — Train ONE model + VAL thresholds + SAVE (loop yo‘q)
# =========================================================
# CELL 5C — Train ONE model, tune thresholds on VAL, save
# =========================================================
print("\n" + "="*90)
print("TRAIN:", MODEL_NAME)
print("="*90)

# --- fit ---
clf.fit(X_train, Y_train)

# --- scores on VAL ---
S_val, mode = score_matrix(clf, X_val)
print("Score mode:", mode, "| shape:", S_val.shape)

# --- tune thresholds on VAL ---
thr = tune_thresholds_per_label(Y_val, S_val, mode=mode, n_thr=N_THR)
Y_val_pred = apply_thresholds(S_val, thr)

# --- metrics on VAL ---
overall = multilabel_micro_macro(Y_val, Y_val_pred)
print("VAL micro_f1:", overall["micro_f1"], "| macro_f1:", overall["macro_f1"])

per_label = per_label_report(Y_val, Y_val_pred, thr)

# --- save artifacts ---
model_path = ART_MODELS_DIR / f"{MODEL_NAME}.joblib"
thr_path   = ART_MODELS_DIR / f"{MODEL_NAME}_thresholds.json"
val_metrics_path = RES_DIR / f"{MODEL_NAME}_val_per_label_metrics.csv"
val_summary_path = RES_DIR / f"{MODEL_NAME}_val_summary.json"

joblib.dump(clf, model_path)

thr_dict = {lab.replace("y_", "", 1): float(t) for lab, t in zip(y_cols, thr)}
with open(thr_path, "w", encoding="utf-8") as f:
    json.dump(thr_dict, f, ensure_ascii=False, indent=2)

per_label.to_csv(val_metrics_path, index=False)
with open(val_summary_path, "w", encoding="utf-8") as f:
    json.dump({"model": MODEL_NAME, "mode": mode, **overall}, f, ensure_ascii=False, indent=2)

print("Saved model:", model_path.resolve())
print("Saved thresholds:", thr_path.resolve())
print("Saved val per-label:", val_metrics_path.resolve())
print("Saved val summary:", val_summary_path.resolve())

# ko‘rsatib qo‘yamiz
per_label.sort_values("f1").head(10)


TRAIN: ovr_linearsvc_C1
Score mode: score | shape: (24410, 21)
VAL micro_f1: 0.9801824827806541 | macro_f1: 0.9644168101133983
Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\artifacts\models\improvement\impr_20260225_211312\ovr_linearsvc_C1.joblib
Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\artifacts\models\improvement\impr_20260225_211312\ovr_linearsvc_C1_thresholds.json
Saved val per-label: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement\impr_20260225_211312\ovr_linearsvc_C1_val_per_label_metrics.csv
Saved val summary: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement\impr_20260225_211312\ovr_linearsvc_C1_val_summary.json


Unnamed: 0,label,support_pos,support_neg,tp,fp,fn,tn,precision,recall,f1,threshold
16,y_pregnancy_reproductive,347,24063,249,0,98,24063,1.0,0.717579,0.83557,2.386066
6,y_hepatic,844,23566,844,198,0,23368,0.809981,1.0,0.895016,-0.965957
7,y_hypersensitivity_allergy,771,23639,643,0,128,23639,1.0,0.833982,0.909477,1.157807
11,y_metabolic_endocrine,1256,23154,1256,185,0,22969,0.871617,1.0,0.931405,-0.958605
2,y_edema_swelling,1600,22810,1441,0,159,22810,1.0,0.900625,0.947715,1.105263
10,y_injury_accident,1718,22692,1718,121,0,22571,0.934203,1.0,0.965983,-0.885546
14,y_ocular_visual,1106,23304,1042,0,64,23304,1.0,0.942134,0.970205,0.602956
20,y_urinary,616,23794,616,27,0,23767,0.958009,1.0,0.978554,-0.666633
0,y_cardiovascular,2528,21882,2528,109,0,21773,0.958665,1.0,0.978896,-0.751632
18,y_renal,618,23792,618,25,0,23767,0.96112,1.0,0.980174,-0.668777



# 3) OVR SGD (log_loss)


In [13]:
# =========================================================
MODEL_NAME = "ovr_sgd_logloss"   # <-- faqat shuni almashtirib bosamiz
clf = models[MODEL_NAME]
print("Selected:", MODEL_NAME, "->", clf)

Selected: ovr_sgd_logloss -> OneVsRestClassifier(estimator=SGDClassifier(alpha=1e-05, loss='log_loss',
                                            max_iter=2000, random_state=42),
                    n_jobs=1)


In [14]:
# CELL 5C — Train ONE model + VAL thresholds + SAVE (loop yo‘q)
# =========================================================
# CELL 5C — Train ONE model, tune thresholds on VAL, save
# =========================================================
print("\n" + "="*90)
print("TRAIN:", MODEL_NAME)
print("="*90)

# --- fit ---
clf.fit(X_train, Y_train)

# --- scores on VAL ---
S_val, mode = score_matrix(clf, X_val)
print("Score mode:", mode, "| shape:", S_val.shape)

# --- tune thresholds on VAL ---
thr = tune_thresholds_per_label(Y_val, S_val, mode=mode, n_thr=N_THR)
Y_val_pred = apply_thresholds(S_val, thr)

# --- metrics on VAL ---
overall = multilabel_micro_macro(Y_val, Y_val_pred)
print("VAL micro_f1:", overall["micro_f1"], "| macro_f1:", overall["macro_f1"])

per_label = per_label_report(Y_val, Y_val_pred, thr)

# --- save artifacts ---
model_path = ART_MODELS_DIR / f"{MODEL_NAME}.joblib"
thr_path   = ART_MODELS_DIR / f"{MODEL_NAME}_thresholds.json"
val_metrics_path = RES_DIR / f"{MODEL_NAME}_val_per_label_metrics.csv"
val_summary_path = RES_DIR / f"{MODEL_NAME}_val_summary.json"

joblib.dump(clf, model_path)

thr_dict = {lab.replace("y_", "", 1): float(t) for lab, t in zip(y_cols, thr)}
with open(thr_path, "w", encoding="utf-8") as f:
    json.dump(thr_dict, f, ensure_ascii=False, indent=2)

per_label.to_csv(val_metrics_path, index=False)
with open(val_summary_path, "w", encoding="utf-8") as f:
    json.dump({"model": MODEL_NAME, "mode": mode, **overall}, f, ensure_ascii=False, indent=2)

print("Saved model:", model_path.resolve())
print("Saved thresholds:", thr_path.resolve())
print("Saved val per-label:", val_metrics_path.resolve())
print("Saved val summary:", val_summary_path.resolve())

# ko‘rsatib qo‘yamiz
per_label.sort_values("f1").head(10)


TRAIN: ovr_sgd_logloss
Score mode: proba | shape: (24410, 21)
VAL micro_f1: 0.97403320653559 | macro_f1: 0.9529507712434356
Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\artifacts\models\improvement\impr_20260225_211312\ovr_sgd_logloss.joblib
Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\artifacts\models\improvement\impr_20260225_211312\ovr_sgd_logloss_thresholds.json
Saved val per-label: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement\impr_20260225_211312\ovr_sgd_logloss_val_per_label_metrics.csv
Saved val summary: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement\impr_20260225_211312\ovr_sgd_logloss_val_summary.json


Unnamed: 0,label,support_pos,support_neg,tp,fp,fn,tn,precision,recall,f1,threshold
6,y_hepatic,844,23566,618,25,226,23541,0.96112,0.732227,0.831204,0.079379
16,y_pregnancy_reproductive,347,24063,255,0,92,24063,1.0,0.73487,0.847176,0.99431
7,y_hypersensitivity_allergy,771,23639,643,0,128,23639,1.0,0.833982,0.909477,0.554209
14,y_ocular_visual,1106,23304,984,58,122,23246,0.944338,0.889693,0.916201,0.043821
11,y_metabolic_endocrine,1256,23154,1255,186,1,22968,0.870923,0.999204,0.930664,0.290675
18,y_renal,618,23792,590,53,28,23739,0.917574,0.954693,0.935765,0.266437
2,y_edema_swelling,1600,22810,1440,1,160,22809,0.999306,0.9,0.947057,0.975486
20,y_urinary,616,23794,602,41,14,23753,0.936236,0.977273,0.956315,0.676255
17,y_psychiatric,2316,22094,2195,43,121,22051,0.980786,0.947755,0.963988,0.855404
12,y_musculoskeletal,1879,22531,1794,45,85,22486,0.97553,0.954763,0.965035,0.901553


# 4)OVR SGD (hinge)


In [15]:
# =========================================================
MODEL_NAME = "ovr_sgd_hinge"   # <-- faqat shuni almashtirib bosamiz
clf = models[MODEL_NAME]
print("Selected:", MODEL_NAME, "->", clf)

Selected: ovr_sgd_hinge -> OneVsRestClassifier(estimator=SGDClassifier(alpha=1e-05, max_iter=2000,
                                            random_state=42),
                    n_jobs=1)


In [16]:
# CELL 5C — Train ONE model + VAL thresholds + SAVE (loop yo‘q)
# =========================================================
# CELL 5C — Train ONE model, tune thresholds on VAL, save
# =========================================================
print("\n" + "="*90)
print("TRAIN:", MODEL_NAME)
print("="*90)

# --- fit ---
clf.fit(X_train, Y_train)

# --- scores on VAL ---
S_val, mode = score_matrix(clf, X_val)
print("Score mode:", mode, "| shape:", S_val.shape)

# --- tune thresholds on VAL ---
thr = tune_thresholds_per_label(Y_val, S_val, mode=mode, n_thr=N_THR)
Y_val_pred = apply_thresholds(S_val, thr)

# --- metrics on VAL ---
overall = multilabel_micro_macro(Y_val, Y_val_pred)
print("VAL micro_f1:", overall["micro_f1"], "| macro_f1:", overall["macro_f1"])

per_label = per_label_report(Y_val, Y_val_pred, thr)

# --- save artifacts ---
model_path = ART_MODELS_DIR / f"{MODEL_NAME}.joblib"
thr_path   = ART_MODELS_DIR / f"{MODEL_NAME}_thresholds.json"
val_metrics_path = RES_DIR / f"{MODEL_NAME}_val_per_label_metrics.csv"
val_summary_path = RES_DIR / f"{MODEL_NAME}_val_summary.json"

joblib.dump(clf, model_path)

thr_dict = {lab.replace("y_", "", 1): float(t) for lab, t in zip(y_cols, thr)}
with open(thr_path, "w", encoding="utf-8") as f:
    json.dump(thr_dict, f, ensure_ascii=False, indent=2)

per_label.to_csv(val_metrics_path, index=False)
with open(val_summary_path, "w", encoding="utf-8") as f:
    json.dump({"model": MODEL_NAME, "mode": mode, **overall}, f, ensure_ascii=False, indent=2)

print("Saved model:", model_path.resolve())
print("Saved thresholds:", thr_path.resolve())
print("Saved val per-label:", val_metrics_path.resolve())
print("Saved val summary:", val_summary_path.resolve())

# ko‘rsatib qo‘yamiz
per_label.sort_values("f1").head(10)


TRAIN: ovr_sgd_hinge
Score mode: score | shape: (24410, 21)
VAL micro_f1: 0.9575350204425551 | macro_f1: 0.9369053317968185
Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\artifacts\models\improvement\impr_20260225_211312\ovr_sgd_hinge.joblib
Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\artifacts\models\improvement\impr_20260225_211312\ovr_sgd_hinge_thresholds.json
Saved val per-label: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement\impr_20260225_211312\ovr_sgd_hinge_val_per_label_metrics.csv
Saved val summary: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement\impr_20260225_211312\ovr_sgd_hinge_val_summary.json


Unnamed: 0,label,support_pos,support_neg,tp,fp,fn,tn,precision,recall,f1,threshold
19,y_respiratory,2540,21870,1975,259,565,21611,0.884064,0.777559,0.827398,-1.621564
16,y_pregnancy_reproductive,347,24063,257,0,90,24063,1.0,0.740634,0.850993,7.214348
6,y_hepatic,844,23566,633,10,211,23556,0.984448,0.75,0.851379,1.137942
11,y_metabolic_endocrine,1256,23154,1003,59,253,23095,0.944444,0.798567,0.865401,-1.661809
14,y_ocular_visual,1106,23304,960,82,146,23222,0.921305,0.867993,0.893855,-3.195131
7,y_hypersensitivity_allergy,771,23639,643,0,128,23639,1.0,0.833982,0.909477,3.412648
10,y_injury_accident,1718,22692,1640,199,78,22493,0.891789,0.954598,0.922125,-1.416286
3,y_gastrointestinal,4912,19498,4434,196,478,19302,0.957667,0.902687,0.929365,-1.768378
18,y_renal,618,23792,587,56,31,23736,0.912908,0.949838,0.931007,-1.03957
0,y_cardiovascular,2528,21882,2419,218,109,21664,0.91733,0.956883,0.936689,-4.306855


# 5)OVR ComplementNB


In [17]:
# =========================================================
MODEL_NAME = "ovr_complementnb_a05"   # <-- faqat shuni almashtirib bosamiz
clf = models[MODEL_NAME]
print("Selected:", MODEL_NAME, "->", clf)

Selected: ovr_complementnb_a05 -> OneVsRestClassifier(estimator=ComplementNB(alpha=0.5), n_jobs=1)


In [18]:
# CELL 5C — Train ONE model + VAL thresholds + SAVE (loop yo‘q)
# =========================================================
# CELL 5C — Train ONE model, tune thresholds on VAL, save
# =========================================================
print("\n" + "="*90)
print("TRAIN:", MODEL_NAME)
print("="*90)

# --- fit ---
clf.fit(X_train, Y_train)

# --- scores on VAL ---
S_val, mode = score_matrix(clf, X_val)
print("Score mode:", mode, "| shape:", S_val.shape)

# --- tune thresholds on VAL ---
thr = tune_thresholds_per_label(Y_val, S_val, mode=mode, n_thr=N_THR)
Y_val_pred = apply_thresholds(S_val, thr)

# --- metrics on VAL ---
overall = multilabel_micro_macro(Y_val, Y_val_pred)
print("VAL micro_f1:", overall["micro_f1"], "| macro_f1:", overall["macro_f1"])

per_label = per_label_report(Y_val, Y_val_pred, thr)

# --- save artifacts ---
model_path = ART_MODELS_DIR / f"{MODEL_NAME}.joblib"
thr_path   = ART_MODELS_DIR / f"{MODEL_NAME}_thresholds.json"
val_metrics_path = RES_DIR / f"{MODEL_NAME}_val_per_label_metrics.csv"
val_summary_path = RES_DIR / f"{MODEL_NAME}_val_summary.json"

joblib.dump(clf, model_path)

thr_dict = {lab.replace("y_", "", 1): float(t) for lab, t in zip(y_cols, thr)}
with open(thr_path, "w", encoding="utf-8") as f:
    json.dump(thr_dict, f, ensure_ascii=False, indent=2)

per_label.to_csv(val_metrics_path, index=False)
with open(val_summary_path, "w", encoding="utf-8") as f:
    json.dump({"model": MODEL_NAME, "mode": mode, **overall}, f, ensure_ascii=False, indent=2)

print("Saved model:", model_path.resolve())
print("Saved thresholds:", thr_path.resolve())
print("Saved val per-label:", val_metrics_path.resolve())
print("Saved val summary:", val_summary_path.resolve())

# ko‘rsatib qo‘yamiz
per_label.sort_values("f1").head(10)


TRAIN: ovr_complementnb_a05
Score mode: proba | shape: (24410, 21)
VAL micro_f1: 0.9133595451588268 | macro_f1: 0.8821145921301132
Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\artifacts\models\improvement\impr_20260225_211312\ovr_complementnb_a05.joblib
Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\artifacts\models\improvement\impr_20260225_211312\ovr_complementnb_a05_thresholds.json
Saved val per-label: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement\impr_20260225_211312\ovr_complementnb_a05_val_per_label_metrics.csv
Saved val summary: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement\impr_20260225_211312\ovr_complementnb_a05_val_summary.json


Unnamed: 0,label,support_pos,support_neg,tp,fp,fn,tn,precision,recall,f1,threshold
18,y_renal,618,23792,453,190,165,23602,0.70451,0.73301,0.718477,0.999821
20,y_urinary,616,23794,496,147,120,23647,0.771384,0.805195,0.787927,0.999477
6,y_hepatic,844,23566,610,34,234,23532,0.947205,0.722749,0.819892,0.999999
11,y_metabolic_endocrine,1256,23154,1107,334,149,22820,0.768217,0.881369,0.820912,0.997997
7,y_hypersensitivity_allergy,771,23639,599,44,172,23595,0.931571,0.776913,0.847242,0.999955
14,y_ocular_visual,1106,23304,923,119,183,23185,0.885797,0.834539,0.859404,0.996494
0,y_cardiovascular,2528,21882,2263,374,265,21508,0.858172,0.895174,0.876283,0.993311
17,y_psychiatric,2316,22094,2006,232,310,21862,0.896336,0.866149,0.880984,0.999352
2,y_edema_swelling,1600,22810,1516,323,84,22487,0.824361,0.9475,0.881652,0.982578
5,y_hematologic,2295,22115,2015,223,280,21892,0.900357,0.877996,0.889036,0.996025


In [21]:
from pathlib import Path
import joblib

CWD = Path.cwd()

PROJECT_ROOT = None
checked = []

# Haqiqiy project root mezoni:
# Data/Raw_data yoki Data/Processed ichida real papkalar bor bo‘lsin
for p in [CWD] + list(CWD.parents):
    data = p / "Data"
    raw = data / "Raw_data"
    processed = data / "Processed"

    checked.append(p)

    ok = False
    if raw.exists() and any(raw.iterdir()):
        ok = True
    if processed.exists() and any(processed.iterdir()):
        ok = True

    if ok:
        PROJECT_ROOT = p
        break

if PROJECT_ROOT is None:
    raise FileNotFoundError(
        "PROJECT_ROOT topilmadi.\n"
        f"CWD: {CWD.resolve()}\n"
        "Ye chim: project ichida Data/Raw_data yoki Data/Processed papkasi bo‘lishi kerak."
    )

SAVE_DIR = PROJECT_ROOT / "Models" / "improvement_models" / "Improvement_Models"
SAVE_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT.resolve())
print("SAVE_DIR:", SAVE_DIR.resolve())

PROJECT_ROOT: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract
SAVE_DIR: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models


In [22]:
import json
import numpy as np
import joblib

MODEL_NAME = "ovr_logreg_bal_C1"   # <-- HAR SAFAR shu nomni o‘zgartiramiz

# 1) modelni saqlash
model_path = SAVE_DIR / f"{MODEL_NAME}.joblib"
joblib.dump(clf, model_path)
print("✅ Saved model:", model_path.resolve())

# 2) threshold bo‘lsa, saqlash (ixtiyoriy)
# Sizda thr_dict bo‘lsa (label->thr dict), saqlaydi.
if "thr_dict" in globals() and isinstance(thr_dict, dict) and len(thr_dict) > 0:
    thr_path = SAVE_DIR / f"{MODEL_NAME}_thresholds.json"
    with open(thr_path, "w", encoding="utf-8") as f:
        json.dump(thr_dict, f, ensure_ascii=False, indent=2)
    print("✅ Saved thresholds:", thr_path.resolve())
else:
    print("⚠️ thr_dict topilmadi (threshold saqlanmadi). Agar thr_dict bor bo‘lsa shu celldan oldin yarating.")

✅ Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\ovr_logreg_bal_C1.joblib
✅ Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\ovr_logreg_bal_C1_thresholds.json


In [23]:
import json
import numpy as np
import joblib

MODEL_NAME = "ovr_logreg_bal_C2"   # <-- HAR SAFAR shu nomni o‘zgartiramiz

# 1) modelni saqlash
model_path = SAVE_DIR / f"{MODEL_NAME}.joblib"
joblib.dump(clf, model_path)
print("✅ Saved model:", model_path.resolve())

# 2) threshold bo‘lsa, saqlash (ixtiyoriy)
# Sizda thr_dict bo‘lsa (label->thr dict), saqlaydi.
if "thr_dict" in globals() and isinstance(thr_dict, dict) and len(thr_dict) > 0:
    thr_path = SAVE_DIR / f"{MODEL_NAME}_thresholds.json"
    with open(thr_path, "w", encoding="utf-8") as f:
        json.dump(thr_dict, f, ensure_ascii=False, indent=2)
    print("✅ Saved thresholds:", thr_path.resolve())
else:
    print("⚠️ thr_dict topilmadi (threshold saqlanmadi). Agar thr_dict bor bo‘lsa shu celldan oldin yarating.")

✅ Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\ovr_logreg_bal_C2.joblib
✅ Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\ovr_logreg_bal_C2_thresholds.json


In [24]:
import json
import numpy as np
import joblib

MODEL_NAME = "ovr_sgd_logloss"   # <-- HAR SAFAR shu nomni o‘zgartiramiz

# 1) modelni saqlash
model_path = SAVE_DIR / f"{MODEL_NAME}.joblib"
joblib.dump(clf, model_path)
print("✅ Saved model:", model_path.resolve())

# 2) threshold bo‘lsa, saqlash (ixtiyoriy)
# Sizda thr_dict bo‘lsa (label->thr dict), saqlaydi.
if "thr_dict" in globals() and isinstance(thr_dict, dict) and len(thr_dict) > 0:
    thr_path = SAVE_DIR / f"{MODEL_NAME}_thresholds.json"
    with open(thr_path, "w", encoding="utf-8") as f:
        json.dump(thr_dict, f, ensure_ascii=False, indent=2)
    print("✅ Saved thresholds:", thr_path.resolve())
else:
    print("⚠️ thr_dict topilmadi (threshold saqlanmadi). Agar thr_dict bor bo‘lsa shu celldan oldin yarating.")

✅ Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\ovr_sgd_logloss.joblib
✅ Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\ovr_sgd_logloss_thresholds.json


In [25]:
import json
import numpy as np
import joblib

MODEL_NAME = "ovr_sgd_hinge"   # <-- HAR SAFAR shu nomni o‘zgartiring

# 1) modelni saqlash
model_path = SAVE_DIR / f"{MODEL_NAME}.joblib"
joblib.dump(clf, model_path)
print("✅ Saved model:", model_path.resolve())

# 2) threshold bo‘lsa, saqlash (ixtiyoriy)
# Sizda thr_dict bo‘lsa (label->thr dict), saqlaydi.
if "thr_dict" in globals() and isinstance(thr_dict, dict) and len(thr_dict) > 0:
    thr_path = SAVE_DIR / f"{MODEL_NAME}_thresholds.json"
    with open(thr_path, "w", encoding="utf-8") as f:
        json.dump(thr_dict, f, ensure_ascii=False, indent=2)
    print("✅ Saved thresholds:", thr_path.resolve())
else:
    print("⚠️ thr_dict topilmadi (threshold saqlanmadi). Agar thr_dict bor bo‘lsa shu celldan oldin yarating.")

✅ Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\ovr_sgd_hinge.joblib
✅ Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\ovr_sgd_hinge_thresholds.json


In [26]:
import json
import numpy as np
import joblib

MODEL_NAME = "ovr_complementnb_a05"   # <-- HAR SAFAR shu nomni o‘zgartiring

# 1) modelni saqlash
model_path = SAVE_DIR / f"{MODEL_NAME}.joblib"
joblib.dump(clf, model_path)
print("✅ Saved model:", model_path.resolve())

# 2) threshold bo‘lsa, saqlash (ixtiyoriy)
# Sizda thr_dict bo‘lsa (label->thr dict), saqlaydi.
if "thr_dict" in globals() and isinstance(thr_dict, dict) and len(thr_dict) > 0:
    thr_path = SAVE_DIR / f"{MODEL_NAME}_thresholds.json"
    with open(thr_path, "w", encoding="utf-8") as f:
        json.dump(thr_dict, f, ensure_ascii=False, indent=2)
    print("✅ Saved thresholds:", thr_path.resolve())
else:
    print("⚠️ thr_dict topilmadi (threshold saqlanmadi). Agar thr_dict bor bo‘lsa shu celldan oldin yarating.")

✅ Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\ovr_complementnb_a05.joblib
✅ Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\ovr_complementnb_a05_thresholds.json


In [None]:
# CELL B — Hozirgi 5ta modelni saqlash (manual)

# Bu cell sizda models dict bor deb hisoblaydi (MODEL_NAME→clf).
# Agar siz clfni alohida train qilgan bo‘lsangiz ham, pastda “single model” varianti ham beraman.

# # models dict ichidagi hamma train bo‘lganlarini saqlab qo'yamiz
# # (joblib dump + thresholds bo'lsa json)

# saved = []

# for name, clf in models.items():
#     # Model file
#     model_path = SAVE_DIR / f"{name}.joblib"
#     joblib.dump(clf, model_path)

#     # Threshold bo‘lsa (sizda thr_dict yoki thresholds dict bo‘lishi mumkin)
#     # 1) agar siz global dict yig'gan bo'lsangiz: thresholds_by_model[name]
#     # 2) yoki oxirgi train qilinganda thr_dict shu nom bilan saqlangan bo‘lsa
#     if "thresholds_by_model" in globals() and isinstance(thresholds_by_model, dict) and name in thresholds_by_model:
#         thr_path = SAVE_DIR / f"{name}_thresholds.json"
#         with open(thr_path, "w", encoding="utf-8") as f:
#             json.dump(thresholds_by_model[name], f, ensure_ascii=False, indent=2)

#     saved.append(str(model_path))

# print("✅ Saved models:")
# for p in saved:
#     print(" -", p)

In [None]:
# CELL 5 — Model zoo + Train/Eval/Save loop
# # =========================
# # Model configs (improvement)
# # =========================
# models = {}

# # 1) OVR Logistic Regression (2 variant)
# models["ovr_logreg_bal_C1"] = OneVsRestClassifier(
#     LogisticRegression(
#         solver="liblinear",
#         max_iter=3000,
#         C=1.0,
#         class_weight="balanced",
#     ),
#     n_jobs=N_JOBS,
# )

# models["ovr_logreg_bal_C2"] = OneVsRestClassifier(
#     LogisticRegression(
#         solver="liblinear",
#         max_iter=3000,
#         C=2.0,
#         class_weight="balanced",
#     ),
#     n_jobs=N_JOBS,
# )

# # 2) OVR LinearSVC
# models["ovr_linearsvc_C1"] = OneVsRestClassifier(
#     LinearSVC(C=1.0, random_state=RANDOM_STATE),
#     n_jobs=N_JOBS,
# )

# # 3) OVR SGD (log_loss)
# models["ovr_sgd_logloss"] = OneVsRestClassifier(
#     SGDClassifier(
#         loss="log_loss",
#         alpha=1e-5,
#         max_iter=2000,
#         tol=1e-3,
#         random_state=RANDOM_STATE,
#     ),
#     n_jobs=N_JOBS,
# )

# # 4) OVR SGD (hinge)
# models["ovr_sgd_hinge"] = OneVsRestClassifier(
#     SGDClassifier(
#         loss="hinge",
#         alpha=1e-5,
#         max_iter=2000,
#         tol=1e-3,
#         random_state=RANDOM_STATE,
#     ),
#     n_jobs=N_JOBS,
# )

# # 5) OVR ComplementNB (high-recall candidate)
# models["ovr_complementnb_a05"] = OneVsRestClassifier(
#     ComplementNB(alpha=0.5),
#     n_jobs=N_JOBS,
# )

# # =========================
# # Train loop
# # =========================
# summary_rows = []

# for name, clf in models.items():
#     print("\n" + "="*90)
#     print("TRAIN:", name)
#     print("="*90)

#     # --- fit ---
#     clf.fit(X_train, Y_train)

#     # --- scores on VAL ---
#     S_val, mode = score_matrix(clf, X_val)
#     print("Score mode:", mode, "| shape:", S_val.shape)

#     # --- tune thresholds on VAL ---
#     thr = tune_thresholds_per_label(Y_val, S_val, mode=mode, n_thr=N_THR)
#     Y_val_pred = apply_thresholds(S_val, thr)

#     # --- metrics on VAL ---
#     overall = multilabel_micro_macro(Y_val, Y_val_pred)
#     print("VAL micro_f1:", overall["micro_f1"], "| macro_f1:", overall["macro_f1"])

#     per_label = per_label_report(Y_val, Y_val_pred, thr)

#     # --- save artifacts ---
#     model_path = ART_MODELS_DIR / f"{name}.joblib"
#     thr_path   = ART_MODELS_DIR / f"{name}_thresholds.json"
#     val_metrics_path = RES_DIR / f"{name}_val_per_label_metrics.csv"
#     val_summary_path = RES_DIR / f"{name}_val_summary.json"

#     joblib.dump(clf, model_path)

#     thr_dict = {lab.replace("y_", "", 1): float(t) for lab, t in zip(y_cols, thr)}
#     with open(thr_path, "w", encoding="utf-8") as f:
#         json.dump(thr_dict, f, ensure_ascii=False, indent=2)

#     per_label.to_csv(val_metrics_path, index=False)
#     with open(val_summary_path, "w", encoding="utf-8") as f:
#         json.dump({"model": name, "mode": mode, **overall}, f, ensure_ascii=False, indent=2)

#     print("Saved model:", model_path.resolve())
#     print("Saved thresholds:", thr_path.resolve())
#     print("Saved val per-label:", val_metrics_path.resolve())

#     summary_rows.append({
#         "model": name,
#         "mode": mode,
#         **overall,
#         "model_path": str(model_path),
#         "thresholds_path": str(thr_path),
#     })

# # summary table
# summary_df = pd.DataFrame(summary_rows).sort_values("micro_f1", ascending=False)
# summary_csv = RES_DIR / "val_models_summary.csv"
# summary_df.to_csv(summary_csv, index=False)
# print("\nSaved summary:", summary_csv.resolve())
# summary_df

In [None]:
# CELL 6 — Best model’ni TEST’da baholash + save
# # Best model by VAL micro_f1
# best_name = summary_df.iloc[0]["model"]
# best_mode = summary_df.iloc[0]["mode"]
# print("BEST:", best_name, "| mode:", best_mode)

# best_model = joblib.load(Path(summary_df.iloc[0]["model_path"]))
# with open(Path(summary_df.iloc[0]["thresholds_path"]), "r", encoding="utf-8") as f:
#     thr_dict = json.load(f)

# # thresholds array back (y_cols tartibida!)
# thr_best = np.array([thr_dict[lab.replace("y_", "", 1)] for lab in y_cols], dtype=np.float32)

# # TEST scoring
# S_test, mode_test = score_matrix(best_model, X_test)
# Y_test_pred = apply_thresholds(S_test, thr_best)

# test_overall = multilabel_micro_macro(Y_test, Y_test_pred)
# print("TEST micro_f1:", test_overall["micro_f1"], "| macro_f1:", test_overall["macro_f1"])

# test_per_label = per_label_report(Y_test, Y_test_pred, thr_best)

# # save
# test_metrics_path = RES_DIR / f"{best_name}_test_per_label_metrics.csv"
# test_summary_path = RES_DIR / f"{best_name}_test_summary.json"

# test_per_label.to_csv(test_metrics_path, index=False)
# with open(test_summary_path, "w", encoding="utf-8") as f:
#     json.dump({"model": best_name, **test_overall}, f, ensure_ascii=False, indent=2)

# print("Saved test per-label:", test_metrics_path.resolve())
# print("Saved test summary:", test_summary_path.resolve())

# # save predictions (optional)
# pred_dir = RES_DIR / "predictions"
# pred_dir.mkdir(parents=True, exist_ok=True)

# np.save(pred_dir / f"{best_name}_Y_test_pred.npy", Y_test_pred)
# np.save(pred_dir / f"{best_name}_Y_test_true.npy", Y_test)

# if ids_test is not None:
#     ids_test.to_csv(pred_dir / "ids_test.csv", index=False)
# print("Saved predictions in:", pred_dir.resolve())