In [1]:
# bitta-bitta bosib train qilasiz, va model doim:

# Models/improvement_models/with_SMOTE_improvement_train/

# ga saqlanadi. Root topishda ham Notebooks tuzog‘iga tushmaydi (Data fayllar bor joydan topadi).

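# ⚠️ Note (important): running SMOTE directly on sparse TF-IDF features can exhaust RAM. The intended remedy is a TruncatedSVD (dimensionality reduction) → SMOTE → classifier pipeline, which is the most practical way to use SMOTE here. NOTE: in the current `make_smote_ovr` the SVD step is commented out, so the pipeline that actually runs is SMOTE → classifier on the full feature matrix.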

# CELL 1 — Imports
from __future__ import annotations

from pathlib import Path
from datetime import datetime
import json

import numpy as np
import pandas as pd
from scipy import sparse
import joblib

# models
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC

# dim reduction
from sklearn.decomposition import TruncatedSVD

# SMOTE
try:
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline
except Exception as e:
    raise ImportError(
        "imblearn topilmadi. O‘rnating:\n"
        "pip install imbalanced-learn\n"
        f"Original error: {e}"
    )

In [2]:
# CELL 2 — Config + PROJECT_ROOT discovery + load X/Y (Feature_Selected or Engineered_data)
# =========================
# CONFIG
# =========================
# Dataset version to load. Use this value when a Feature_Selected dataset exists (recommended).
DATA_VERSION = "fe_v1_fs_chi2_v1"
# DATA_VERSION = "fe_v1"  # alternative: Engineered_data version when there is no Feature_Selected one

# Look in Data/Feature_Selected first.
PREFER_FEATURE_SELECTED = True

N_JOBS = 1
RANDOM_STATE = 42

# Number of SVD components intended for the SMOTE pipeline (e.g. 200 / 300 / 500).
# In this notebook it is only referenced by the commented-out SVD code in make_smote_ovr.
SVD_N_COMPONENTS = 300

# Number of quantile points used for threshold tuning.
N_THR = 61

# Output locations, relative to PROJECT_ROOT.
SAVE_DIR_REL = Path("Models", "improvement_models", "with_SMOTE_improvement_train")
RESULTS_DIR_REL = Path("results", "improvement_smote")

def find_project_root_by_dataset(version: str, prefer_fs: bool = True, start: Path | None = None):
    """
    PROJECT_ROOT ni Notebooks deb olmaydi.
    Mezoni: Data/Feature_Selected/<ver>/X_train.npz yoki Data/Engineered_data/<ver>/X_train.npz bor joy.
    """
    start = start or Path.cwd()
    checked = []
    for p in [start] + list(start.parents):
        fs_dir = p / "Data" / "Feature_Selected" / version
        en_dir = p / "Data" / "Engineered_data" / version

        checked.append((fs_dir, en_dir))

        if prefer_fs and (fs_dir / "X_train.npz").exists() and (fs_dir / "Y_train.npy").exists() and (fs_dir / "engineered_meta.json").exists():
            return p, fs_dir, "Feature_Selected"
        if (en_dir / "X_train.npz").exists() and (en_dir / "Y_train.npy").exists() and (en_dir / "engineered_meta.json").exists():
            return p, en_dir, "Engineered_data"
        if (fs_dir / "X_train.npz").exists() and (fs_dir / "Y_train.npy").exists() and (fs_dir / "engineered_meta.json").exists():
            return p, fs_dir, "Feature_Selected"

    # helpful message
    last = checked[-5:] if checked else []
    raise FileNotFoundError(
        f"Dataset topilmadi. version={version}\n"
        f"Start: {start.resolve()}\n"
        "Oxirgi tekshirilgan joylar (FS, ENG):\n"
        + "\n".join([f"- {a} | {b}" for a,b in last]) +
        "\n\nYECHIM:\n"
        "Avval 09_feature_engineering va (bo‘lsa) 09b_feature_selection ni run qiling."
    )

PROJECT_ROOT, DATA_DIR, DATA_SOURCE = find_project_root_by_dataset(DATA_VERSION, PREFER_FEATURE_SELECTED)
print("PROJECT_ROOT:", PROJECT_ROOT.resolve())
print("DATA_SOURCE:", DATA_SOURCE)
print("DATA_DIR:", DATA_DIR.resolve())

# Trained models always go to the same folder under the project root.
SAVE_DIR = PROJECT_ROOT / SAVE_DIR_REL
SAVE_DIR.mkdir(parents=True, exist_ok=True)
print("SAVE_DIR:", SAVE_DIR.resolve())

# Metrics go to a fresh, timestamped folder each time this cell runs.
RESULTS_DIR = PROJECT_ROOT / RESULTS_DIR_REL / datetime.now().strftime("smote_%Y%m%d_%H%M%S")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
print("RESULTS_DIR:", RESULTS_DIR.resolve())

# ---- load meta ----
with open(DATA_DIR / "engineered_meta.json", "r", encoding="utf-8") as f:
    meta = json.load(f)
y_cols = meta["y_cols"]
print("Num labels:", len(y_cols))

# ---- load X/Y ----
X_train = sparse.load_npz(DATA_DIR / "X_train.npz").tocsr()
X_val   = sparse.load_npz(DATA_DIR / "X_val.npz").tocsr()
X_test  = sparse.load_npz(DATA_DIR / "X_test.npz").tocsr()

Y_train = np.load(DATA_DIR / "Y_train.npy")
Y_val   = np.load(DATA_DIR / "Y_val.npy")
Y_test  = np.load(DATA_DIR / "Y_test.npy")

print("X shapes:", X_train.shape, X_val.shape, X_test.shape)
print("Y shapes:", Y_train.shape, Y_val.shape, Y_test.shape)


# ---- sanity checks ----
def _check_split(split_name, X_split, Y_split, n_features, n_labels):
    """Fail early when a split's X/Y do not line up with each other or with the meta file."""
    if X_split.shape[0] != Y_split.shape[0]:
        raise ValueError(
            f"{split_name}: X has {X_split.shape[0]} rows but Y has {Y_split.shape[0]}"
        )
    if X_split.shape[1] != n_features:
        raise ValueError(
            f"{split_name}: X has {X_split.shape[1]} features, expected {n_features} (same as train)"
        )
    if Y_split.ndim != 2 or Y_split.shape[1] != n_labels:
        raise ValueError(
            f"{split_name}: Y has shape {Y_split.shape}, expected (n_samples, {n_labels}) to match y_cols"
        )


# Downstream per-label metrics pair column j of Y with y_cols[j], so a mismatch
# here would silently attach metrics to the wrong label names.
_check_split("train", X_train, Y_train, X_train.shape[1], len(y_cols))
_check_split("val", X_val, Y_val, X_train.shape[1], len(y_cols))
_check_split("test", X_test, Y_test, X_train.shape[1], len(y_cols))

PROJECT_ROOT: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract
DATA_SOURCE: Feature_Selected
DATA_DIR: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Feature_Selected\fe_v1_fs_chi2_v1
SAVE_DIR: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\with_SMOTE_improvement_train
RESULTS_DIR: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement_smote\smote_20260225_233337
Num labels: 21
X shapes: (201176, 33671) (24410, 33671) (24164, 33671)
Y shapes: (201176, 21) (24410, 21) (24164, 21)


In [3]:
#CELL 3 — Helpers (metrics + threshold tuning) ✅ (NameError bo‘lmasin)
def prf_from_counts(tp: int, fp: int, fn: int):
    """
    Compute precision, recall and F1 from confusion counts.

    Each metric is 0.0 whenever its denominator is not positive.

    Returns
    -------
    (precision, recall, f1)
    """
    prec = rec = f1 = 0.0

    predicted_pos = tp + fp
    if predicted_pos > 0:
        prec = tp / predicted_pos

    actual_pos = tp + fn
    if actual_pos > 0:
        rec = tp / actual_pos

    pr_sum = prec + rec
    if pr_sum > 0:
        f1 = 2 * prec * rec / pr_sum

    return prec, rec, f1

def multilabel_micro_macro(Y_true: np.ndarray, Y_pred: np.ndarray) -> dict:
    """
    Micro- and macro-averaged precision/recall/F1 for binary indicator matrices.

    Micro metrics pool TP/FP/FN over every cell; macro metrics are the plain mean
    of the per-column (per-label) metrics.

    Returns
    -------
    dict with keys micro_precision, micro_recall, micro_f1,
    macro_precision, macro_recall, macro_f1 (all Python floats).
    """
    truth_pos, truth_neg = (Y_true == 1), (Y_true == 0)
    guess_pos, guess_neg = (Y_pred == 1), (Y_pred == 0)

    # Per-label confusion counts (one entry per column).
    tp_cols = (truth_pos & guess_pos).sum(axis=0)
    fp_cols = (truth_neg & guess_pos).sum(axis=0)
    fn_cols = (truth_pos & guess_neg).sum(axis=0)

    micro_p, micro_r, micro_f1 = prf_from_counts(
        int(tp_cols.sum()), int(fp_cols.sum()), int(fn_cols.sum())
    )

    per_label = [
        prf_from_counts(int(tp_j), int(fp_j), int(fn_j))
        for tp_j, fp_j, fn_j in zip(tp_cols, fp_cols, fn_cols)
    ]
    precisions = [m[0] for m in per_label]
    recalls = [m[1] for m in per_label]
    f1_scores = [m[2] for m in per_label]

    return {
        "micro_precision": float(micro_p),
        "micro_recall": float(micro_r),
        "micro_f1": float(micro_f1),
        "macro_precision": float(np.mean(precisions)),
        "macro_recall": float(np.mean(recalls)),
        "macro_f1": float(np.mean(f1_scores)),
    }

def score_matrix(model, X):
    """
    Get a score array from ``model`` for ``X`` together with a tag naming its kind.

    Preference order: ``predict_proba`` ("proba"), then ``decision_function``
    ("score"), otherwise hard predictions from ``predict`` ("binary").

    Returns
    -------
    (np.ndarray, str)
    """
    scorers = (
        ("predict_proba", "proba"),
        ("decision_function", "score"),
    )
    for method_name, kind in scorers:
        if hasattr(model, method_name):
            raw = getattr(model, method_name)(X)
            return np.asarray(raw), kind

    # No continuous scores available: fall back to hard 0/1 predictions.
    return np.asarray(model.predict(X)), "binary"

def tune_thresholds_per_label(Y_true: np.ndarray, scores: np.ndarray, mode: str, n_thr: int = 61) -> np.ndarray:
    """
    Pick, for every label, the decision threshold that maximises F1.

    Candidate thresholds are the unique quantiles of the label's scores
    (``n_thr`` points between the 1st and 99th percentile). When fewer than 10
    unique candidates result, a 31-point linear grid between the minimum and
    maximum score is used instead. On ties the lowest threshold wins.

    Thresholds are searched and returned in the same floating precision as
    ``scores`` (float64 for non-float input). Previously scores were down-cast
    to float32 for tuning and the threshold was stored as float32, while
    ``apply_thresholds`` compared it against the original scores; for scores
    that differ only beyond float32 precision (e.g. saturated probabilities
    close to 1.0) the applied predictions could differ from the ones that were
    evaluated during tuning.

    Parameters
    ----------
    Y_true : (n_samples, n_labels) binary indicator matrix.
    scores : (n_samples, n_labels) scores, higher means more likely positive.
    mode : kind of score; only "proba" is treated specially (see below).
    n_thr : number of quantile points in the candidate grid.

    Returns
    -------
    np.ndarray of shape (n_labels,) with one threshold per label, to be used as
    ``scores >= threshold``.

    Notes
    -----
    A label without any positive in ``Y_true`` gets threshold 1.0 in "proba"
    mode, otherwise ``max(score) + 1``.
    """
    Y_true = np.asarray(Y_true)
    scores = np.asarray(scores)

    # Work in the scores' own precision so that the returned threshold, applied
    # to the very same scores, reproduces exactly the predictions scored here.
    work_dtype = scores.dtype if np.issubdtype(scores.dtype, np.floating) else np.float64

    n_labels = Y_true.shape[1]
    thr_out = np.zeros(n_labels, dtype=work_dtype)
    q = np.linspace(0.01, 0.99, n_thr)

    for j in range(n_labels):
        y = Y_true[:, j].astype(np.int8)
        s = scores[:, j].astype(work_dtype)

        if int(y.sum()) == 0:
            # No positives to tune on: choose a threshold at/above the top of the range.
            thr_out[j] = 1.0 if mode == "proba" else float(np.max(s) + 1.0)
            continue

        thr_grid = np.unique(np.quantile(s, q).astype(work_dtype))
        if thr_grid.size < 10:
            # Too few distinct quantiles (heavily tied scores): use a linear grid.
            mn, mx = float(np.min(s)), float(np.max(s))
            if mn == mx:
                thr_grid = np.array([mn], dtype=work_dtype)
            else:
                thr_grid = np.linspace(mn, mx, num=31, dtype=work_dtype)

        # Label masks do not depend on the threshold; compute them once.
        is_pos = (y == 1)
        is_neg = (y == 0)

        best_f1 = -1.0
        best_thr = float(thr_grid[len(thr_grid)//2])

        for thr in thr_grid:
            fires = (s >= thr)
            tp = int((is_pos & fires).sum())
            fp = int((is_neg & fires).sum())
            fn = int((is_pos & ~fires).sum())
            _, _, f1 = prf_from_counts(tp, fp, fn)
            if f1 > best_f1:  # strict: keep the lowest threshold among ties
                best_f1 = f1
                best_thr = float(thr)
        thr_out[j] = best_thr

    return thr_out

def apply_thresholds(scores: np.ndarray, thresholds: np.ndarray) -> np.ndarray:
    """
    Binarise ``scores`` column-wise: entry (i, j) is 1 when
    ``scores[i, j] >= thresholds[j]`` and 0 otherwise (returned as int8).
    """
    per_column = thresholds.reshape(1, -1)  # row vector, broadcast over samples
    fired = np.greater_equal(scores, per_column)
    return fired.astype(np.int8)

def per_label_metrics_df(Y_true: np.ndarray, Y_pred: np.ndarray, thresholds: np.ndarray, labels: "list[str] | None" = None) -> pd.DataFrame:
    """
    Build a per-label table of confusion counts and precision/recall/F1.

    Parameters
    ----------
    Y_true, Y_pred : (n_samples, n_labels) binary indicator matrices.
    thresholds : one threshold per label, recorded in the table as-is.
    labels : label names, one per column. Defaults to the notebook-level
        ``y_cols`` so existing three-argument calls keep working.

    Returns
    -------
    pd.DataFrame with columns label, support_pos, tp, fp, fn, tn, precision,
    recall, f1, threshold (one row per label, in column order).

    Raises
    ------
    ValueError
        If the number of labels or thresholds differs from the number of
        columns in ``Y_true``. (Previously a short label list silently dropped
        trailing columns and a long one raised IndexError.)
    """
    if labels is None:
        labels = y_cols
    labels = list(labels)

    n_cols = Y_true.shape[1]
    if len(labels) != n_cols:
        raise ValueError(f"got {len(labels)} label names for {n_cols} label columns")
    if len(thresholds) != n_cols:
        raise ValueError(f"got {len(thresholds)} thresholds for {n_cols} label columns")

    rows = []
    for j, lab in enumerate(labels):
        y = Y_true[:, j]
        p = Y_pred[:, j]
        y_pos, y_neg = (y == 1), (y == 0)
        p_pos, p_neg = (p == 1), (p == 0)

        tp = int((y_pos & p_pos).sum())
        fp = int((y_neg & p_pos).sum())
        fn = int((y_pos & p_neg).sum())
        tn = int((y_neg & p_neg).sum())
        prec, rec, f1 = prf_from_counts(tp, fp, fn)

        rows.append({
            "label": lab,
            "support_pos": int(y.sum()),
            "tp": tp, "fp": fp, "fn": fn, "tn": tn,
            "precision": prec, "recall": rec, "f1": f1,
            "threshold": float(thresholds[j]),
        })
    return pd.DataFrame(rows)

In [4]:
#CELL 4 — Model registry (SMOTE pipeline) + siz tanlaysiz (manual)
def make_smote_ovr(model_name: str):
    """
    Build a one-vs-rest classifier whose per-label estimator is SMOTE → classifier.

    The pipeline is wrapped in ``OneVsRestClassifier``, which fits it separately
    for every label, so SMOTE resamples each label's binary problem on its own.

    The TruncatedSVD step is currently disabled: SMOTE runs directly on the
    features passed to ``fit``. To re-enable dimensionality reduction, prepend
    ``("svd", TruncatedSVD(n_components=SVD_N_COMPONENTS, random_state=RANDOM_STATE))``
    to the pipeline steps.

    Parameters
    ----------
    model_name : one of "ovr_logreg_smote_C1", "ovr_logreg_smote_C2",
        "ovr_linearsvc_smote_C1", "ovr_sgd_logloss_smote", "ovr_sgd_hinge_smote".

    Returns
    -------
    An unfitted ``OneVsRestClassifier``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the names above.
    """
    smote = SMOTE(
        sampling_strategy="auto",
        k_neighbors=3,
        random_state=RANDOM_STATE
    )

    # Every stochastic component is seeded with RANDOM_STATE. The logistic
    # regression variants previously omitted it, unlike LinearSVC/SGD/SMOTE.
    if model_name == "ovr_logreg_smote_C1":
        base = LogisticRegression(solver="liblinear", max_iter=3000, C=1.0, class_weight=None,
                                  random_state=RANDOM_STATE)
    elif model_name == "ovr_logreg_smote_C2":
        base = LogisticRegression(solver="liblinear", max_iter=3000, C=2.0, class_weight=None,
                                  random_state=RANDOM_STATE)
    elif model_name == "ovr_linearsvc_smote_C1":
        base = LinearSVC(C=1.0, random_state=RANDOM_STATE)
    elif model_name == "ovr_sgd_logloss_smote":
        base = SGDClassifier(loss="log_loss", alpha=1e-5, max_iter=2000, tol=1e-3, random_state=RANDOM_STATE)
    elif model_name == "ovr_sgd_hinge_smote":
        base = SGDClassifier(loss="hinge", alpha=1e-5, max_iter=2000, tol=1e-3, random_state=RANDOM_STATE)
    else:
        raise ValueError("Unknown model_name")

    pipe = ImbPipeline([
        ("smote", smote),
        ("clf", base),
    ])

    return OneVsRestClassifier(pipe, n_jobs=N_JOBS)

# Names accepted by make_smote_ovr; set MODEL_NAME to one of these.
AVAILABLE_MODELS = [
    "ovr_logreg_smote_C1", "ovr_logreg_smote_C2",
    "ovr_linearsvc_smote_C1",
    "ovr_sgd_logloss_smote", "ovr_sgd_hinge_smote",
]

# Header line followed by one " - <name>" line per model.
print("Available SMOTE models:", *(f" - {name}" for name in AVAILABLE_MODELS), sep="\n")



Available SMOTE models:
 - ovr_logreg_smote_C1
 - ovr_logreg_smote_C2
 - ovr_linearsvc_smote_C1
 - ovr_sgd_logloss_smote
 - ovr_sgd_hinge_smote


# ovr_logreg_smote_C1

In [5]:
# Change this name and re-run the cell to switch models; the training cell below uses `clf`.
MODEL_NAME = "ovr_logreg_smote_C1"
clf = make_smote_ovr(MODEL_NAME)
print(f"Selected: {MODEL_NAME}")

Selected: ovr_logreg_smote_C1


In [None]:
# CELL 5 — Train (manual) + threshold tuning on VAL + SAVE (run once per model)
print("\n" + "="*90)
print("TRAIN (SMOTE):", MODEL_NAME)
print("="*90)

# Fit the one-vs-rest model on the training split.
clf.fit(X_train, Y_train)

# Scores on the validation split.
S_val, mode = score_matrix(clf, X_val)
print("Score mode:", mode, "| S_val shape:", S_val.shape)

# Per-label thresholds are tuned on VAL, so the VAL metrics below are measured
# on the same data the thresholds were fitted to.
thr = tune_thresholds_per_label(Y_val, S_val, mode=mode, n_thr=N_THR)
Y_val_pred = apply_thresholds(S_val, thr)

val_overall = multilabel_micro_macro(Y_val, Y_val_pred)
print("VAL micro_f1:", val_overall["micro_f1"], "| macro_f1:", val_overall["macro_f1"])

val_per_label = per_label_metrics_df(Y_val, Y_val_pred, thr)

# ---- SAVE model + thresholds ----
model_path = SAVE_DIR / f"{MODEL_NAME}.joblib"
thr_path   = SAVE_DIR / f"{MODEL_NAME}_thresholds.json"

joblib.dump(clf, model_path)

# Strip only a leading "y_". The previous str.replace("y_", "", 1) removed the
# first "y_" anywhere in the name, which would mangle a label that lacks the
# prefix but contains "y_" further in (e.g. "injury_accident").
thr_dict = {
    (lab[len("y_"):] if lab.startswith("y_") else lab): float(t)
    for lab, t in zip(y_cols, thr)
}
with open(thr_path, "w", encoding="utf-8") as f:
    json.dump(thr_dict, f, ensure_ascii=False, indent=2)

print("✅ Saved model:", model_path.resolve())
print("✅ Saved thresholds:", thr_path.resolve())

# ---- SAVE metrics ----
val_per_label.to_csv(RESULTS_DIR / f"{MODEL_NAME}_val_per_label_metrics.csv", index=False)
with open(RESULTS_DIR / f"{MODEL_NAME}_val_summary.json", "w", encoding="utf-8") as f:
    json.dump({"model": MODEL_NAME, "mode": mode, **val_overall}, f, ensure_ascii=False, indent=2)

print("✅ Saved VAL metrics:", (RESULTS_DIR / f"{MODEL_NAME}_val_per_label_metrics.csv").resolve())

# Show the ten labels with the lowest validation F1.
val_per_label.sort_values("f1").head(10)

# Took 66 minutes.


TRAIN (SMOTE): ovr_logreg_smote_C1
Score mode: proba | S_val shape: (24410, 21)
VAL micro_f1: 0.9784687194157947 | macro_f1: 0.9440208168219181
✅ Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\with_SMOTE_improvement_train\ovr_logreg_smote_C1.joblib
✅ Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\with_SMOTE_improvement_train\ovr_logreg_smote_C1_thresholds.json
✅ Saved VAL metrics: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\improvement_smote\smote_20260225_233337\ovr_logreg_smote_C1_val_per_label_metrics.csv


Unnamed: 0,label,support_pos,tp,fp,fn,tn,precision,recall,f1,threshold
16,y_pregnancy_reproductive,347,92,0,255,24063,1.0,0.26513,0.419134,0.99992
6,y_hepatic,844,844,198,0,23368,0.809981,1.0,0.895016,0.110211
7,y_hypersensitivity_allergy,771,643,0,128,23639,1.0,0.833982,0.909477,0.991319
11,y_metabolic_endocrine,1256,1256,185,0,22969,0.871617,1.0,0.931405,0.125102
2,y_edema_swelling,1600,1441,0,159,22810,1.0,0.900625,0.947715,0.981808
10,y_injury_accident,1718,1718,121,0,22571,0.934203,1.0,0.965983,0.124604
14,y_ocular_visual,1106,1042,0,64,23304,1.0,0.942134,0.970205,0.919523
18,y_renal,618,612,31,6,23761,0.951788,0.990291,0.970658,0.872241
20,y_urinary,616,616,27,0,23767,0.958009,1.0,0.978554,0.387112
0,y_cardiovascular,2528,2528,109,0,21773,0.958665,1.0,0.978896,0.28025


In [None]:
# CELL 6 — (optional) evaluate on TEST with the thresholds tuned on VAL, then save
S_test, _ = score_matrix(clf, X_test)
Y_test_pred = apply_thresholds(S_test, thr)

test_overall = multilabel_micro_macro(Y_test, Y_test_pred)
print("TEST micro_f1:", test_overall["micro_f1"], "| macro_f1:", test_overall["macro_f1"])

# Per-label table goes to CSV.
test_metrics_csv = RESULTS_DIR / f"{MODEL_NAME}_test_per_label_metrics.csv"
test_per_label = per_label_metrics_df(Y_test, Y_test_pred, thr)
test_per_label.to_csv(test_metrics_csv, index=False)

# Overall micro/macro numbers go to a small JSON summary.
test_summary = {"model": MODEL_NAME}
test_summary.update(test_overall)
test_summary_json = RESULTS_DIR / f"{MODEL_NAME}_test_summary.json"
with open(test_summary_json, "w", encoding="utf-8") as f:
    json.dump(test_summary, f, ensure_ascii=False, indent=2)

print("✅ Saved TEST metrics:", test_metrics_csv.resolve())

# Sizning workflow (bitta-bitta bosib)

# CELL 1–3: bir marta run

# Har model uchun:

# CELL 4’da MODEL_NAME ni o‘zgartirasiz

# CELL 5 ni bosasiz (train + save)

# xohlasangiz CELL 6 (test)

# Model saqlanadigan joy doim:
# Models/improvement_models/with_SMOTE_improvement_train/

# Root ham endi adashmaydi, chunki u Data/Feature_Selected yoki Data/Engineered_data bor joydan topyapti.

# ovr_logreg_smote_C2

# ovr_linearsvc_smote_C1

# ovr_sgd_logloss_smote

# ovr_sgd_hinge_smote