In [1]:
# 4 ta algoritm (LogReg, LinearSVC, SGD log_loss, SGD hinge) uchun Optuna tuning qiladi va:

# best paramsni topadi

# best modelni qayta train qiladi

# VAL’da per-label threshold tuning qiladi

# hammasini saqlaydi:

# ✅ Model: Models/improvement_models/Improvement_Models/Optuna_Tuned_<RUN_ID>/...
# ✅ Threshold: ..._thresholds.json
# ✅ Best params: ..._best_params.json
# ✅ Trial results: results/optuna_tuning/<RUN_ID>/*.csv
# ✅ Test summary: ..._test_summary.json





#CELL 1 — Load X/Y (Feature_Selected dan) + paths (root adashmaydi)
from __future__ import annotations

from pathlib import Path
from datetime import datetime
import json

import numpy as np
import pandas as pd
from scipy import sparse
import joblib

import optuna

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC

# =========================
# CONFIG
# =========================
# Dataset version folder looked up under Data/Feature_Selected (or Data/Engineered_data).
VERSION = "fe_v1_fs_chi2_v1"
PREFER_FEATURE_SELECTED = True

# Number of quantile points used when tuning per-label thresholds.
N_THR = 31
# Parallelism handed to OneVsRestClassifier.
N_JOBS = 1
# Shared seed for samplers, subsampling and estimators.
RANDOM_STATE = 42

# Optuna budget per study: start small for a quick run, raise to 50/100 later.
N_TRIALS = 15
# Wall-clock limit per study, in seconds (30 minutes).
TIMEOUT_SEC = 30 * 60

# Timestamped identifier that keeps the outputs of each run in separate folders.
RUN_ID = datetime.now().strftime("optuna_%Y%m%d_%H%M%S")

def find_project_root(start: Path | None = None) -> Path:
    """Locate the project root by walking upwards from ``start``.

    A directory counts as the project root when it contains a non-empty
    ``Data/Raw_data`` or a non-empty ``Data/Processed`` folder.

    Args:
        start: Directory to start from; defaults to the current working
            directory.

    Returns:
        The first matching directory among ``start`` and its parents, or
        ``start`` itself when no ancestor matches.
    """
    start = start or Path.cwd()

    def _is_populated_dir(folder: Path) -> bool:
        # is_dir() instead of exists(): a regular file that happens to be named
        # like a marker folder must be ignored, not passed to iterdir() (which
        # would raise NotADirectoryError and abort the whole search).
        return folder.is_dir() and any(folder.iterdir())

    for candidate in (start, *start.parents):
        data = candidate / "Data"
        if _is_populated_dir(data / "Raw_data") or _is_populated_dir(data / "Processed"):
            return candidate
    return start

def find_data_dir(project_root: Path, version: str, prefer_fs: bool = True) -> tuple[Path, str]:
    """Choose the dataset folder that holds ``X_train.npz`` for ``version``.

    Lookup order: ``Data/Feature_Selected/<version>`` first when ``prefer_fs``
    is true, then ``Data/Engineered_data/<version>``, then the feature-selected
    folder as a last resort.

    Returns:
        ``(folder, source_name)`` where ``source_name`` is ``"Feature_Selected"``
        or ``"Engineered_data"``.

    Raises:
        FileNotFoundError: if neither folder contains ``X_train.npz``.
    """
    data_root = project_root / "Data"
    fs = data_root / "Feature_Selected" / version
    eng = data_root / "Engineered_data" / version

    lookup_order = [(eng, "Engineered_data"), (fs, "Feature_Selected")]
    if prefer_fs:
        lookup_order.insert(0, (fs, "Feature_Selected"))

    for folder, source_name in lookup_order:
        if (folder / "X_train.npz").exists():
            return folder, source_name

    raise FileNotFoundError(f"X_train.npz topilmadi: {fs} yoki {eng}")

# Resolve the project root and the dataset folder that feeds this run.
PROJECT_ROOT = find_project_root()
DATA_DIR, DATA_SOURCE = find_data_dir(
    project_root=PROJECT_ROOT,
    version=VERSION,
    prefer_fs=PREFER_FEATURE_SELECTED,
)

print("PROJECT_ROOT:", PROJECT_ROOT.resolve())
print("DATA_SOURCE:", DATA_SOURCE)
print("DATA_DIR:", DATA_DIR.resolve())

# Output folders: one per RUN_ID, so reruns never overwrite earlier results.
MODEL_SAVE_DIR = PROJECT_ROOT.joinpath(
    "Models", "improvement_models", "Improvement_Models", f"Optuna_Tuned_{RUN_ID}"
)
RESULTS_DIR = PROJECT_ROOT.joinpath("results", "optuna_tuning", RUN_ID)
MODEL_SAVE_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

print("MODEL_SAVE_DIR:", MODEL_SAVE_DIR.resolve())
print("RESULTS_DIR:", RESULTS_DIR.resolve())

# Label column names are stored in the metadata file next to the matrices.
meta = json.loads((DATA_DIR / "engineered_meta.json").read_text(encoding="utf-8"))
y_cols = meta["y_cols"]

# Sparse feature matrices, converted to CSR after loading.
X_train, X_val, X_test = (
    sparse.load_npz(DATA_DIR / file_name).tocsr()
    for file_name in ("X_train.npz", "X_val.npz", "X_test.npz")
)

# Label matrices for the same three splits.
Y_train, Y_val, Y_test = (
    np.load(DATA_DIR / file_name)
    for file_name in ("Y_train.npy", "Y_val.npy", "Y_test.npy")
)

print("X:", X_train.shape, X_val.shape, X_test.shape)
print("Y:", Y_train.shape, Y_val.shape, Y_test.shape)
print("labels:", len(y_cols))

  from .autonotebook import tqdm as notebook_tqdm


PROJECT_ROOT: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract
DATA_SOURCE: Feature_Selected
DATA_DIR: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Feature_Selected\fe_v1_fs_chi2_v1
MODEL_SAVE_DIR: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\Optuna_Tuned_optuna_20260226_005315
RESULTS_DIR: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\optuna_tuning\optuna_20260226_005315
X: (201176, 33671) (24410, 33671) (24164, 33671)
Y: (201176, 21) (24410, 21) (24164, 21)
labels: 21


In [2]:
# CELL 2 — Helper: score/threshold/metric (Optuna objective shu bilan)

import numpy as np

def prf_from_counts(tp: int, fp: int, fn: int):
    """Return ``(precision, recall, f1)`` computed from confusion counts.

    Any ratio whose denominator is zero is reported as ``0.0`` instead of
    raising ``ZeroDivisionError``.
    """
    predicted_positive = tp + fp
    actual_positive = tp + fn

    prec = 0.0
    if predicted_positive > 0:
        prec = tp / predicted_positive

    rec = 0.0
    if actual_positive > 0:
        rec = tp / actual_positive

    pr_sum = prec + rec
    if pr_sum > 0:
        f1 = 2 * prec * rec / pr_sum
    else:
        f1 = 0.0
    return prec, rec, f1

def multilabel_micro_macro(Y_true: np.ndarray, Y_pred: np.ndarray) -> dict:
    """Compute micro- and macro-averaged F1 for 0/1 multilabel matrices.

    Micro F1 pools the confusion counts of every cell of the matrices; macro F1
    is the plain mean of the per-column (per-label) F1 values.

    Returns:
        ``{"micro_f1": float, "macro_f1": float}``
    """

    def _f1(truth, guess) -> float:
        # Confusion counts for whatever slice is passed in (whole matrix or one column).
        hits = int(((truth == 1) & (guess == 1)).sum())
        false_alarms = int(((truth == 0) & (guess == 1)).sum())
        misses = int(((truth == 1) & (guess == 0)).sum())
        return prf_from_counts(hits, false_alarms, misses)[2]

    micro_f1 = _f1(Y_true, Y_pred)
    per_label_f1 = [
        _f1(Y_true[:, col], Y_pred[:, col])
        for col in range(Y_true.shape[1])
    ]

    return {"micro_f1": float(micro_f1), "macro_f1": float(np.mean(per_label_f1))}

def score_matrix(model, X):
    """Return ``(scores, kind)`` using the richest scoring method ``model`` offers.

    Preference order: ``predict_proba`` (kind ``"proba"``), then
    ``decision_function`` (kind ``"score"``), and finally hard ``predict``
    output (kind ``"binary"``). The scores are always wrapped in a NumPy array.
    """
    preferred = (("predict_proba", "proba"), ("decision_function", "score"))
    for method_name, kind in preferred:
        if hasattr(model, method_name):
            scorer = getattr(model, method_name)
            return np.asarray(scorer(X)), kind
    return np.asarray(model.predict(X)), "binary"

def tune_thresholds_per_label(Y_true: np.ndarray, scores: np.ndarray, n_thr: int = 61) -> np.ndarray:
    """Pick, for every label column, the score cut-off that maximises F1.

    Candidate cut-offs are the unique values of ``n_thr`` quantiles (1%..99%)
    of that label's scores. When fewer than 10 distinct candidates remain, an
    evenly spaced grid of 31 values between the minimum and maximum score is
    used instead (or the single score value when all scores are equal).

    A label without any positive in ``Y_true`` gets a cut-off above its highest
    score, so it is never predicted. Candidates are scanned in ascending order
    and only a strictly better F1 replaces the current choice, so ties go to
    the lowest cut-off.

    Returns:
        float32 array with one threshold per label column.
    """
    n_labels = Y_true.shape[1]
    cutoffs = np.zeros(n_labels, dtype=np.float32)
    quantile_levels = np.linspace(0.01, 0.99, n_thr)

    for col in range(n_labels):
        truth = Y_true[:, col].astype(np.int8)
        col_scores = scores[:, col].astype(np.float32)

        # No positives to find: park the threshold above every score.
        if int(truth.sum()) == 0:
            cutoffs[col] = float(np.max(col_scores) + 1.0)
            continue

        candidates = np.unique(np.quantile(col_scores, quantile_levels))
        if candidates.size < 10:
            # Too few distinct quantiles; fall back to a linear grid.
            lowest, highest = float(np.min(col_scores)), float(np.max(col_scores))
            if lowest == highest:
                candidates = np.array([lowest], dtype=np.float32)
            else:
                candidates = np.linspace(lowest, highest, num=31, dtype=np.float32)

        is_positive = truth == 1
        is_negative = truth == 0

        chosen = float(candidates[len(candidates) // 2])
        chosen_f1 = -1.0
        for cut in candidates:
            fired = col_scores >= cut
            tp = int((is_positive & fired).sum())
            fp = int((is_negative & fired).sum())
            fn = int((is_positive & ~fired).sum())
            f1 = prf_from_counts(tp, fp, fn)[2]
            if f1 > chosen_f1:
                chosen_f1, chosen = f1, float(cut)

        cutoffs[col] = chosen

    return cutoffs

def apply_thresholds(scores: np.ndarray, thr: np.ndarray) -> np.ndarray:
    return (scores >= thr.reshape(1, -1)).astype(np.int8)

In [3]:
# CELL 3 — Save helpers
def save_best(model, model_tag: str, best_params: dict, thr: np.ndarray, val_metrics: dict, test_metrics: dict):
    """Persist a tuned model together with its thresholds, parameters and metrics.

    Files written under ``MODEL_SAVE_DIR``:
        ``<model_tag>.joblib``, ``<model_tag>_thresholds.json``,
        ``<model_tag>_best_params.json``
    Files written under ``RESULTS_DIR``:
        ``<model_tag>_val_summary.json``, ``<model_tag>_test_summary.json``

    Args:
        model: Fitted estimator to dump with joblib.
        model_tag: File-name stem shared by every artefact.
        best_params: Hyper-parameters to store as JSON.
        thr: One threshold per entry of ``y_cols``, in the same order.
        val_metrics: Validation metrics to store as JSON.
        test_metrics: Test metrics to store as JSON.

    Raises:
        ValueError: if ``thr`` does not hold exactly one value per label;
            raised before anything is written so no partial output is left.
    """
    # zip() would silently drop the surplus labels/thresholds and write an
    # incomplete (or misaligned) threshold file, so check up front.
    if len(thr) != len(y_cols):
        raise ValueError(
            f"Expected {len(y_cols)} thresholds (one per label), got {len(thr)}"
        )

    def _write_json(path, payload) -> None:
        # Shared JSON writer: UTF-8, non-ASCII kept readable, 2-space indent.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)

    def _label_name(column: str) -> str:
        # Strip only a leading "y_"; str.replace would also cut a "y_" that
        # appears in the middle of a label name lacking the prefix.
        return column[len("y_"):] if column.startswith("y_") else column

    # save model
    model_path = MODEL_SAVE_DIR / f"{model_tag}.joblib"
    joblib.dump(model, model_path)

    # thresholds dict: label name -> threshold
    thr_dict = {_label_name(lab): float(t) for lab, t in zip(y_cols, thr)}
    thr_path = MODEL_SAVE_DIR / f"{model_tag}_thresholds.json"
    _write_json(thr_path, thr_dict)

    # params
    params_path = MODEL_SAVE_DIR / f"{model_tag}_best_params.json"
    _write_json(params_path, best_params)

    # summaries
    _write_json(RESULTS_DIR / f"{model_tag}_val_summary.json", val_metrics)
    _write_json(RESULTS_DIR / f"{model_tag}_test_summary.json", test_metrics)

    print("✅ Saved model:", model_path.resolve())
    print("✅ Saved thresholds:", thr_path.resolve())
    print("✅ Saved params:", params_path.resolve())

In [None]:
#Bu har algoritm uchun 1 trialni real vaqt bilan o‘lchaydi, keyin siz N_TRIALSga ko‘paytirib olamiz.

# import time
# import optuna

# def time_one_trial(objective_fn, name: str):
#     study = optuna.create_study(direction="maximize")
#     t0 = time.time()
#     study.optimize(objective_fn, n_trials=1)
#     dt = time.time() - t0
#     print(f"{name}: 1 trial time = {dt:.2f} sec | best={study.best_value:.6f}")
#     return dt

# t_logreg = time_one_trial(objective_logreg, "logreg")
# t_svc    = time_one_trial(objective_linearsvc, "linearsvc")
# t_sgdll  = time_one_trial(objective_sgd_logloss, "sgd_logloss")
# t_sgdh   = time_one_trial(objective_sgd_hinge, "sgd_hinge")

# print("\nRough totals (sec):")
# print("logreg total ~", t_logreg * N_TRIALS)
# print("linearsvc total ~", t_svc * N_TRIALS)
# print("sgd_logloss total ~", t_sgdll * N_TRIALS)
# print("sgd_hinge total ~", t_sgdh * N_TRIALS)
# print("ALL 4 total ~", (t_logreg + t_svc + t_sgdll + t_sgdh) * N_TRIALS)

#Shundan keyin siz o‘zingizning PCda aniq vaqtni bilasiz.

In [None]:
# # CELL 4 — Optuna tuning: LogReg (bosib yuborasiz)
# def objective_logreg(trial: optuna.Trial) -> float:
#     C = trial.suggest_float("C", 0.25, 8.0, log=True)
#     cw = trial.suggest_categorical("class_weight", [None, "balanced"])

#     base = LogisticRegression(
#         solver="liblinear",
#         max_iter=4000,
#         C=C,
#         class_weight=cw,
#     )
#     clf = OneVsRestClassifier(base, n_jobs=N_JOBS)
#     clf.fit(X_train, Y_train)

#     S_val, _ = score_matrix(clf, X_val)
#     thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR)
#     Y_val_pred = apply_thresholds(S_val, thr)
#     m = multilabel_micro_macro(Y_val, Y_val_pred)

#     # trial log
#     trial.set_user_attr("macro_f1", m["macro_f1"])
#     return m["micro_f1"]

# study = optuna.create_study(direction="maximize", study_name=f"logreg_{RUN_ID}")
# study.optimize(objective_logreg, n_trials=N_TRIALS, timeout=TIMEOUT_SEC)

# print("BEST logreg:", study.best_value, study.best_params)

# # Train best again + save
# best_params = study.best_params
# base = LogisticRegression(solver="liblinear", max_iter=5000, C=best_params["C"], class_weight=best_params["class_weight"])
# best_model = OneVsRestClassifier(base, n_jobs=N_JOBS)
# best_model.fit(X_train, Y_train)

# S_val, _ = score_matrix(best_model, X_val)
# thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR)
# Y_val_pred = apply_thresholds(S_val, thr)
# val_m = multilabel_micro_macro(Y_val, Y_val_pred)

# S_test, _ = score_matrix(best_model, X_test)
# Y_test_pred = apply_thresholds(S_test, thr)
# test_m = multilabel_micro_macro(Y_test, Y_test_pred)

# # save trials
# trials_df = study.trials_dataframe()
# trials_df.to_csv(RESULTS_DIR / "logreg_trials.csv", index=False)

# save_best(best_model, "optuna_logreg_best", best_params, thr, val_m, test_m)
# val_m, test_m










# CELL 4 — Optuna tuning: LogReg (SUBSAMPLE + PRUNER)

import optuna
import numpy as np

# -------- SPEED KNOBS (adjust these to trade accuracy for speed) --------
TRAIN_SUBSAMPLE = 60000   # rows used per trial (30k/60k/100k); None => full train (slow)
# Threshold-grid size used inside trials only (e.g. 21 or 31). It has its own
# name, matching the other tuning cells, so that lowering it for speed does not
# overwrite the project-wide N_THR that the final refit (and later cells) use.
N_THR_FAST = 31
SEED = RANDOM_STATE

# -------- 1) Build the training subsample shared by all trials --------
if TRAIN_SUBSAMPLE is not None and X_train.shape[0] > TRAIN_SUBSAMPLE:
    rng = np.random.RandomState(SEED)
    idx = rng.choice(X_train.shape[0], size=TRAIN_SUBSAMPLE, replace=False)
    X_tr = X_train[idx]
    Y_tr = Y_train[idx]
    print(f"Using TRAIN_SUBSAMPLE={TRAIN_SUBSAMPLE} for trials")
else:
    X_tr = X_train
    Y_tr = Y_train
    print("Using FULL train for trials")

def objective_logreg(trial: optuna.Trial) -> float:
    """Optuna objective for one-vs-rest logistic regression.

    Samples ``C`` (log scale, 0.25-8.0) and ``class_weight``, fits on the trial
    training set (``X_tr``/``Y_tr``), tunes per-label thresholds on the
    validation split and scores the thresholded validation predictions.

    Returns:
        Validation micro-F1 (the value Optuna maximises). The matching macro-F1
        is stored on the trial as the ``"macro_f1"`` user attribute.
    """
    C = trial.suggest_float("C", 0.25, 8.0, log=True)
    cw = trial.suggest_categorical("class_weight", [None, "balanced"])

    # Fewer iterations than the final refit to keep trials fast.
    base = LogisticRegression(
        solver="liblinear",
        max_iter=2500,
        C=C,
        class_weight=cw,
    )
    clf = OneVsRestClassifier(base, n_jobs=N_JOBS)

    # Trials train on the (possibly subsampled) X_tr/Y_tr, not the full train set.
    clf.fit(X_tr, Y_tr)

    S_val, _ = score_matrix(clf, X_val)

    # Trials use the fast threshold grid.
    thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR_FAST)

    Y_val_pred = apply_thresholds(S_val, thr)
    m = multilabel_micro_macro(Y_val, Y_val_pred)

    trial.set_user_attr("macro_f1", m["macro_f1"])
    return m["micro_f1"]

# -------- 2) Sampler + pruner --------
sampler = optuna.samplers.TPESampler(seed=SEED)
# NOTE(review): objective_logreg never calls trial.report()/trial.should_prune(),
# so this pruner has no intermediate values to act on and no trial is pruned.
pruner  = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=0)

study = optuna.create_study(
    direction="maximize",
    study_name=f"logreg_{RUN_ID}",
    sampler=sampler,
    pruner=pruner,
)

study.optimize(objective_logreg, n_trials=N_TRIALS, timeout=TIMEOUT_SEC)

print("BEST logreg:", study.best_value, study.best_params)

# -------- 3) Final: refit on the FULL train set + thresholds on the N_THR grid --------
best_params = study.best_params
base = LogisticRegression(
    solver="liblinear",
    max_iter=5000,  # larger iteration budget for the final model
    C=best_params["C"],
    class_weight=best_params["class_weight"],
)
best_model = OneVsRestClassifier(base, n_jobs=N_JOBS)
best_model.fit(X_train, Y_train)

S_val, _ = score_matrix(best_model, X_val)

# Final thresholds use the configured N_THR grid.
thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR)

Y_val_pred = apply_thresholds(S_val, thr)
val_m = multilabel_micro_macro(Y_val, Y_val_pred)

# Test metrics reuse the thresholds chosen on the validation split.
S_test, _ = score_matrix(best_model, X_test)
Y_test_pred = apply_thresholds(S_test, thr)
test_m = multilabel_micro_macro(Y_test, Y_test_pred)

# save trials
study.trials_dataframe().to_csv(RESULTS_DIR / "logreg_trials.csv", index=False)

save_best(best_model, "optuna_logreg_best", best_params, thr, val_m, test_m)
val_m, test_m

# Tezlikni yana oshirish uchun (2 ta knob)

# TRAIN_SUBSAMPLE = 30000

# N_THR_FAST = 21

# 31 minut ketdi

[32m[I 2026-02-26 09:29:40,294][0m A new study created in memory with name: logreg_optuna_20260226_005315[0m


Using TRAIN_SUBSAMPLE=60000 for trials


[32m[I 2026-02-26 09:30:45,000][0m Trial 0 finished with value: 0.9625195978972609 and parameters: {'C': 0.9155436618548748, 'class_weight': None}. Best is trial 0 with value: 0.9625195978972609.[0m
[32m[I 2026-02-26 09:31:55,998][0m Trial 1 finished with value: 0.9628884994927603 and parameters: {'C': 1.990722903930924, 'class_weight': None}. Best is trial 1 with value: 0.9628884994927603.[0m
[32m[I 2026-02-26 09:33:11,691][0m Trial 2 finished with value: 0.9587408905760758 and parameters: {'C': 0.3057486557273976, 'class_weight': None}. Best is trial 1 with value: 0.9628884994927603.[0m
[32m[I 2026-02-26 09:34:49,856][0m Trial 3 finished with value: 0.9629062603755487 and parameters: {'C': 2.908676577972516, 'class_weight': 'balanced'}. Best is trial 3 with value: 0.9629062603755487.[0m
[32m[I 2026-02-26 09:36:21,150][0m Trial 4 finished with value: 0.9613153519305376 and parameters: {'C': 4.476009827847025, 'class_weight': None}. Best is trial 3 with value: 0.962906260

BEST logreg: 0.9629151410626504 {'C': 2.9811067985213984, 'class_weight': 'balanced'}
✅ Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\Optuna_Tuned_optuna_20260226_005315\optuna_logreg_best.joblib
✅ Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\Optuna_Tuned_optuna_20260226_005315\optuna_logreg_best_thresholds.json
✅ Saved params: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\Optuna_Tuned_optuna_20260226_005315\optuna_logreg_best_best_params.json


({'micro_f1': 0.9629411005146556, 'macro_f1': 0.9309690257979467},
 {'micro_f1': 0.9574156205367211, 'macro_f1': 0.916767073048451})

In [None]:
# # CELL 5 — Optuna tuning: LinearSVC
# def objective_linearsvc(trial: optuna.Trial) -> float:
#     C = trial.suggest_float("C", 0.25, 8.0, log=True)
#     cw = trial.suggest_categorical("class_weight", [None, "balanced"])

#     base = LinearSVC(C=C, class_weight=cw, random_state=RANDOM_STATE)
#     clf = OneVsRestClassifier(base, n_jobs=N_JOBS)
#     clf.fit(X_train, Y_train)

#     S_val, _ = score_matrix(clf, X_val)
#     thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR)
#     Y_val_pred = apply_thresholds(S_val, thr)
#     m = multilabel_micro_macro(Y_val, Y_val_pred)

#     trial.set_user_attr("macro_f1", m["macro_f1"])
#     return m["micro_f1"]

# study = optuna.create_study(direction="maximize", study_name=f"linearsvc_{RUN_ID}")
# study.optimize(objective_linearsvc, n_trials=N_TRIALS, timeout=TIMEOUT_SEC)

# print("BEST linearsvc:", study.best_value, study.best_params)

# best_params = study.best_params
# base = LinearSVC(C=best_params["C"], class_weight=best_params["class_weight"], random_state=RANDOM_STATE)
# best_model = OneVsRestClassifier(base, n_jobs=N_JOBS)
# best_model.fit(X_train, Y_train)

# S_val, _ = score_matrix(best_model, X_val)
# thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR)
# Y_val_pred = apply_thresholds(S_val, thr)
# val_m = multilabel_micro_macro(Y_val, Y_val_pred)

# S_test, _ = score_matrix(best_model, X_test)
# Y_test_pred = apply_thresholds(S_test, thr)
# test_m = multilabel_micro_macro(Y_test, Y_test_pred)

# (pd.DataFrame(study.trials_dataframe())).to_csv(RESULTS_DIR / "linearsvc_trials.csv", index=False)
# save_best(best_model, "optuna_linearsvc_best", best_params, thr, val_m, test_m)
# val_m, test_m








# CELL 5 — Optuna tuning: LinearSVC (SUBSAMPLE + PRUNER)

import optuna
import numpy as np
import pandas as pd

# -------- SPEED KNOBS --------
# Rows used per trial (30k/60k/100k); None => fit trials on the full train set.
TRAIN_SUBSAMPLE = 60000
# Threshold-grid size inside trials (21/31); the final refit uses N_THR.
N_THR_FAST = 31
SEED = RANDOM_STATE

# -------- 1) Training data used by the trials --------
if TRAIN_SUBSAMPLE is None or X_train.shape[0] <= TRAIN_SUBSAMPLE:
    # Nothing to cut: trials see the whole training split.
    X_tr, Y_tr = X_train, Y_train
    print("Using FULL train for trials")
else:
    # Seeded draw without replacement, so every run picks the same rows.
    rng = np.random.RandomState(SEED)
    idx = rng.choice(X_train.shape[0], size=TRAIN_SUBSAMPLE, replace=False)
    X_tr, Y_tr = X_train[idx], Y_train[idx]
    print(f"Using TRAIN_SUBSAMPLE={TRAIN_SUBSAMPLE} for trials")

def objective_linearsvc(trial: optuna.Trial) -> float:
    """Optuna objective for a one-vs-rest LinearSVC.

    Samples ``C`` (log scale, 0.25-8.0) and ``class_weight``, fits on the trial
    training set, tunes per-label thresholds on the validation split with the
    fast grid and returns the resulting validation micro-F1. The macro-F1 is
    recorded on the trial as the ``"macro_f1"`` user attribute.
    """
    svc_params = {
        "C": trial.suggest_float("C", 0.25, 8.0, log=True),
        "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"]),
    }
    ovr = OneVsRestClassifier(
        LinearSVC(random_state=RANDOM_STATE, **svc_params),
        n_jobs=N_JOBS,
    )

    # Trials fit on X_tr/Y_tr (the subsample when one is configured).
    ovr.fit(X_tr, Y_tr)

    val_scores = score_matrix(ovr, X_val)[0]

    # Fast threshold grid while searching.
    cutoffs = tune_thresholds_per_label(Y_val, val_scores, n_thr=N_THR_FAST)
    val_metrics = multilabel_micro_macro(Y_val, apply_thresholds(val_scores, cutoffs))

    trial.set_user_attr("macro_f1", val_metrics["macro_f1"])
    return val_metrics["micro_f1"]

# -------- 2) Sampler + pruner --------
# Seeded TPE sampler so the sequence of suggested parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=SEED)
# NOTE(review): objective_linearsvc reports no intermediate values, so the
# median pruner has nothing to compare and does not stop any trial.
pruner = optuna.pruners.MedianPruner(n_warmup_steps=0, n_startup_trials=5)

study = optuna.create_study(
    study_name=f"linearsvc_{RUN_ID}",
    direction="maximize",
    pruner=pruner,
    sampler=sampler,
)
study.optimize(objective_linearsvc, timeout=TIMEOUT_SEC, n_trials=N_TRIALS)

print("BEST linearsvc:", study.best_value, study.best_params)

# -------- 3) Final: refit on the FULL train set + thresholds on the N_THR grid --------
best_params = study.best_params
base = LinearSVC(
    random_state=RANDOM_STATE,
    class_weight=best_params["class_weight"],
    C=best_params["C"],
)
best_model = OneVsRestClassifier(base, n_jobs=N_JOBS)
best_model.fit(X_train, Y_train)

# Thresholds are chosen on validation scores...
S_val = score_matrix(best_model, X_val)[0]
thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR)
Y_val_pred = apply_thresholds(S_val, thr)
val_m = multilabel_micro_macro(Y_val, Y_val_pred)

# ...and applied unchanged to the test scores.
S_test = score_matrix(best_model, X_test)[0]
Y_test_pred = apply_thresholds(S_test, thr)
test_m = multilabel_micro_macro(Y_test, Y_test_pred)

# Persist the trial table, then the model with its thresholds and metrics.
study.trials_dataframe().to_csv(RESULTS_DIR / "linearsvc_trials.csv", index=False)
save_best(best_model, "optuna_linearsvc_best", best_params, thr, val_m, test_m)
val_m, test_m

# 15 minut ketdi

[32m[I 2026-02-26 10:04:04,149][0m A new study created in memory with name: linearsvc_optuna_20260226_005315[0m


Using TRAIN_SUBSAMPLE=60000 for trials


[32m[I 2026-02-26 10:04:43,984][0m Trial 0 finished with value: 0.9612748267898383 and parameters: {'C': 0.9155436618548748, 'class_weight': None}. Best is trial 0 with value: 0.9612748267898383.[0m
[32m[I 2026-02-26 10:05:22,900][0m Trial 1 finished with value: 0.9612844223148055 and parameters: {'C': 1.990722903930924, 'class_weight': None}. Best is trial 1 with value: 0.9612844223148055.[0m
[32m[I 2026-02-26 10:05:59,794][0m Trial 2 finished with value: 0.9629042076331371 and parameters: {'C': 0.3057486557273976, 'class_weight': None}. Best is trial 2 with value: 0.9629042076331371.[0m
[32m[I 2026-02-26 10:07:41,625][0m Trial 3 finished with value: 0.9628843918906455 and parameters: {'C': 2.908676577972516, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.9629042076331371.[0m
[32m[I 2026-02-26 10:08:24,182][0m Trial 4 finished with value: 0.9612844223148055 and parameters: {'C': 4.476009827847025, 'class_weight': None}. Best is trial 2 with value: 0.962904207

BEST linearsvc: 0.9629322191785876 {'C': 0.383885657767193, 'class_weight': None}
✅ Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\Optuna_Tuned_optuna_20260226_005315\optuna_linearsvc_best.joblib
✅ Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\Optuna_Tuned_optuna_20260226_005315\optuna_linearsvc_best_thresholds.json
✅ Saved params: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\Optuna_Tuned_optuna_20260226_005315\optuna_linearsvc_best_best_params.json


({'micro_f1': 0.9612652310875851, 'macro_f1': 0.9089976056718859},
 {'micro_f1': 0.957016126000827, 'macro_f1': 0.9123174799904918})

In [None]:
# #CELL 6 — Optuna tuning: SGD log_loss
# def objective_sgd_logloss(trial: optuna.Trial) -> float:
#     alpha = trial.suggest_float("alpha", 1e-6, 1e-4, log=True)
#     penalty = trial.suggest_categorical("penalty", ["l2", "elasticnet"])
#     l1_ratio = None
#     if penalty == "elasticnet":
#         l1_ratio = trial.suggest_float("l1_ratio", 0.1, 0.9)

#     base = SGDClassifier(
#         loss="log_loss",
#         penalty=penalty,
#         alpha=alpha,
#         l1_ratio=l1_ratio,
#         max_iter=3000,
#         tol=1e-3,
#         random_state=RANDOM_STATE,
#     )
#     clf = OneVsRestClassifier(base, n_jobs=N_JOBS)
#     clf.fit(X_train, Y_train)

#     S_val, _ = score_matrix(clf, X_val)
#     thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR)
#     Y_val_pred = apply_thresholds(S_val, thr)
#     m = multilabel_micro_macro(Y_val, Y_val_pred)

#     trial.set_user_attr("macro_f1", m["macro_f1"])
#     return m["micro_f1"]

# study = optuna.create_study(direction="maximize", study_name=f"sgd_logloss_{RUN_ID}")
# study.optimize(objective_sgd_logloss, n_trials=N_TRIALS, timeout=TIMEOUT_SEC)

# print("BEST sgd_logloss:", study.best_value, study.best_params)

# best_params = study.best_params
# base = SGDClassifier(
#     loss="log_loss",
#     penalty=best_params["penalty"],
#     alpha=best_params["alpha"],
#     l1_ratio=best_params.get("l1_ratio", None),
#     max_iter=4000,
#     tol=1e-3,
#     random_state=RANDOM_STATE,
# )
# best_model = OneVsRestClassifier(base, n_jobs=N_JOBS)
# best_model.fit(X_train, Y_train)

# S_val, _ = score_matrix(best_model, X_val)
# thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR)
# Y_val_pred = apply_thresholds(S_val, thr)
# val_m = multilabel_micro_macro(Y_val, Y_val_pred)

# S_test, _ = score_matrix(best_model, X_test)
# Y_test_pred = apply_thresholds(S_test, thr)
# test_m = multilabel_micro_macro(Y_test, Y_test_pred)

# (pd.DataFrame(study.trials_dataframe())).to_csv(RESULTS_DIR / "sgd_logloss_trials.csv", index=False)
# save_best(best_model, "optuna_sgd_logloss_best", best_params, thr, val_m, test_m)
# val_m, test_m











# CELL 6 — Optuna tuning: SGD log_loss (SUBSAMPLE + PRUNER)

import optuna
import numpy as np
import pandas as pd

# -------- SPEED KNOBS --------
TRAIN_SUBSAMPLE = 60000   # rows used per trial (30k/60k/100k); None => full train (slow)
N_THR_FAST = 31           # threshold-grid size inside trials (21/31); the final refit uses N_THR
SEED = RANDOM_STATE

# -------- 1) Build the training subsample shared by all trials --------
if TRAIN_SUBSAMPLE is not None and X_train.shape[0] > TRAIN_SUBSAMPLE:
    rng = np.random.RandomState(SEED)
    idx = rng.choice(X_train.shape[0], size=TRAIN_SUBSAMPLE, replace=False)
    X_tr = X_train[idx]
    Y_tr = Y_train[idx]
    print(f"Using TRAIN_SUBSAMPLE={TRAIN_SUBSAMPLE} for trials")
else:
    X_tr = X_train
    Y_tr = Y_train
    print("Using FULL train for trials")

def objective_sgd_logloss(trial: optuna.Trial) -> float:
    """Optuna objective for a one-vs-rest SGD classifier with log loss.

    Samples ``alpha`` (log scale, 1e-6 to 1e-4) and ``penalty``; ``l1_ratio``
    (0.1-0.9) is sampled only for the elastic-net penalty. The model is fitted
    on the trial training set, per-label thresholds are tuned on the validation
    split with the fast grid, and the thresholded predictions are scored.

    Returns:
        Validation micro-F1 (the value Optuna maximises). The matching macro-F1
        is stored on the trial as the ``"macro_f1"`` user attribute.
    """
    alpha = trial.suggest_float("alpha", 1e-6, 1e-4, log=True)
    penalty = trial.suggest_categorical("penalty", ["l2", "elasticnet"])

    sgd_kwargs = dict(
        loss="log_loss",
        penalty=penalty,
        alpha=alpha,
        max_iter=2000,      # fewer iterations than the final refit (faster trials)
        tol=1e-3,
        random_state=RANDOM_STATE,
    )
    # l1_ratio only matters for elastic net. It is passed only in that case
    # rather than as None for "l2": scikit-learn releases that validate
    # l1_ratio as a float in [0, 1] reject None.
    if penalty == "elasticnet":
        sgd_kwargs["l1_ratio"] = trial.suggest_float("l1_ratio", 0.1, 0.9)

    base = SGDClassifier(**sgd_kwargs)
    clf = OneVsRestClassifier(base, n_jobs=N_JOBS)

    # Trials train on the (possibly subsampled) X_tr/Y_tr.
    clf.fit(X_tr, Y_tr)

    S_val, _ = score_matrix(clf, X_val)

    # Trials use the fast threshold grid.
    thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR_FAST)

    Y_val_pred = apply_thresholds(S_val, thr)
    m = multilabel_micro_macro(Y_val, Y_val_pred)

    trial.set_user_attr("macro_f1", m["macro_f1"])
    return m["micro_f1"]

# -------- 2) Sampler + pruner --------
sampler = optuna.samplers.TPESampler(seed=SEED)
# NOTE(review): objective_sgd_logloss never calls trial.report()/should_prune(),
# so this pruner has no intermediate values to act on and no trial is pruned.
pruner  = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=0)

study = optuna.create_study(
    direction="maximize",
    study_name=f"sgd_logloss_{RUN_ID}",
    sampler=sampler,
    pruner=pruner,
)

study.optimize(objective_sgd_logloss, n_trials=N_TRIALS, timeout=TIMEOUT_SEC)

print("BEST sgd_logloss:", study.best_value, study.best_params)

# -------- 3) Final: refit on the FULL train set + thresholds on the N_THR grid --------
best_params = study.best_params
final_sgd_kwargs = dict(
    loss="log_loss",
    penalty=best_params["penalty"],
    alpha=best_params["alpha"],
    max_iter=4000,   # larger iteration budget for the final model
    tol=1e-3,
    random_state=RANDOM_STATE,
)
# "l1_ratio" is present in best_params only when the winning trial used
# elastic net; otherwise the estimator keeps its own default, which the
# l2 penalty does not use.
if "l1_ratio" in best_params:
    final_sgd_kwargs["l1_ratio"] = best_params["l1_ratio"]
base = SGDClassifier(**final_sgd_kwargs)

best_model = OneVsRestClassifier(base, n_jobs=N_JOBS)
best_model.fit(X_train, Y_train)

S_val, _ = score_matrix(best_model, X_val)
thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR)   # final thresholds use the N_THR grid
Y_val_pred = apply_thresholds(S_val, thr)
val_m = multilabel_micro_macro(Y_val, Y_val_pred)

# Test metrics reuse the thresholds chosen on the validation split.
S_test, _ = score_matrix(best_model, X_test)
Y_test_pred = apply_thresholds(S_test, thr)
test_m = multilabel_micro_macro(Y_test, Y_test_pred)

# trials_dataframe() already returns a DataFrame, so it is written directly.
study.trials_dataframe().to_csv(RESULTS_DIR / "sgd_logloss_trials.csv", index=False)
save_best(best_model, "optuna_sgd_logloss_best", best_params, thr, val_m, test_m)
val_m, test_m

# 40 minut ketdi

[32m[I 2026-02-26 10:20:01,035][0m A new study created in memory with name: sgd_logloss_optuna_20260226_005315[0m


Using TRAIN_SUBSAMPLE=60000 for trials


[32m[I 2026-02-26 10:21:19,765][0m Trial 0 finished with value: 0.9459118311981914 and parameters: {'alpha': 5.611516415334504e-06, 'penalty': 'l2'}. Best is trial 0 with value: 0.9459118311981914.[0m
[32m[I 2026-02-26 10:22:28,271][0m Trial 1 finished with value: 0.9279632104835167 and parameters: {'alpha': 1.575132049977973e-05, 'penalty': 'l2'}. Best is trial 0 with value: 0.9459118311981914.[0m
[32m[I 2026-02-26 10:23:43,861][0m Trial 2 finished with value: 0.9467317073170732 and parameters: {'alpha': 1.306673923805328e-06, 'penalty': 'l2'}. Best is trial 2 with value: 0.9467317073170732.[0m
[32m[I 2026-02-26 10:26:02,841][0m Trial 3 finished with value: 0.9473213180619425 and parameters: {'alpha': 2.6070247583707675e-05, 'penalty': 'elasticnet', 'l1_ratio': 0.7659541126403374}. Best is trial 3 with value: 0.9473213180619425.[0m
[32m[I 2026-02-26 10:28:47,842][0m Trial 4 finished with value: 0.93512661307589 and parameters: {'alpha': 2.6587543983272713e-06, 'penalty':

BEST sgd_logloss: 0.9513461072034927 {'alpha': 4.552621487933352e-06, 'penalty': 'elasticnet', 'l1_ratio': 0.6577247186720756}
✅ Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\Optuna_Tuned_optuna_20260226_005315\optuna_sgd_logloss_best.joblib
✅ Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\Optuna_Tuned_optuna_20260226_005315\optuna_sgd_logloss_best_thresholds.json
✅ Saved params: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\Optuna_Tuned_optuna_20260226_005315\optuna_sgd_logloss_best_best_params.json


({'micro_f1': 0.9509630339727475, 'macro_f1': 0.9163810555597952},
 {'micro_f1': 0.9434148304658417, 'macro_f1': 0.9039855387001187})

In [10]:
# #CELL 7 — Optuna tuning: SGD hinge
# def objective_sgd_hinge(trial: optuna.Trial) -> float:
#     alpha = trial.suggest_float("alpha", 1e-6, 1e-4, log=True)
#     cw = trial.suggest_categorical("class_weight", [None, "balanced"])

#     base = SGDClassifier(
#         loss="hinge",
#         penalty="l2",
#         alpha=alpha,
#         class_weight=cw,
#         max_iter=3000,
#         tol=1e-3,
#         random_state=RANDOM_STATE,
#     )
#     clf = OneVsRestClassifier(base, n_jobs=N_JOBS)
#     clf.fit(X_train, Y_train)

#     S_val, _ = score_matrix(clf, X_val)
#     thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR)
#     Y_val_pred = apply_thresholds(S_val, thr)
#     m = multilabel_micro_macro(Y_val, Y_val_pred)

#     trial.set_user_attr("macro_f1", m["macro_f1"])
#     return m["micro_f1"]

# study = optuna.create_study(direction="maximize", study_name=f"sgd_hinge_{RUN_ID}")
# study.optimize(objective_sgd_hinge, n_trials=N_TRIALS, timeout=TIMEOUT_SEC)

# print("BEST sgd_hinge:", study.best_value, study.best_params)

# best_params = study.best_params
# base = SGDClassifier(
#     loss="hinge",
#     penalty="l2",
#     alpha=best_params["alpha"],
#     class_weight=best_params["class_weight"],
#     max_iter=4000,
#     tol=1e-3,
#     random_state=RANDOM_STATE,
# )
# best_model = OneVsRestClassifier(base, n_jobs=N_JOBS)
# best_model.fit(X_train, Y_train)

# S_val, _ = score_matrix(best_model, X_val)
# thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR)
# Y_val_pred = apply_thresholds(S_val, thr)
# val_m = multilabel_micro_macro(Y_val, Y_val_pred)

# S_test, _ = score_matrix(best_model, X_test)
# Y_test_pred = apply_thresholds(S_test, thr)
# test_m = multilabel_micro_macro(Y_test, Y_test_pred)

# (pd.DataFrame(study.trials_dataframe())).to_csv(RESULTS_DIR / "sgd_hinge_trials.csv", index=False)
# save_best(best_model, "optuna_sgd_hinge_best", best_params, thr, val_m, test_m)
# val_m, test_m






# CELL 7 — Optuna tuning: SGD hinge (SUBSAMPLE + PRUNER)

import optuna
import numpy as np
import pandas as pd

# -------- Speed knobs --------
TRAIN_SUBSAMPLE = 60000   # rows used per trial (e.g. 30k/60k/100k); None => full train
N_THR_FAST = 31           # n_thr used inside trials (e.g. 21/31); the final fit uses N_THR
SEED = RANDOM_STATE

# -------- 1) Training data used during trials --------
n_train_rows = X_train.shape[0]
if TRAIN_SUBSAMPLE is None or n_train_rows <= TRAIN_SUBSAMPLE:
    # Nothing to cut: trials see the whole training set.
    X_tr, Y_tr = X_train, Y_train
    print("Using FULL train for trials")
else:
    # Seeded draw without replacement so every run tunes on the same rows.
    rng = np.random.RandomState(SEED)
    idx = rng.choice(n_train_rows, size=TRAIN_SUBSAMPLE, replace=False)
    X_tr, Y_tr = X_train[idx], Y_train[idx]
    print(f"Using TRAIN_SUBSAMPLE={TRAIN_SUBSAMPLE} for trials")

def objective_sgd_hinge(trial: optuna.Trial) -> float:
    """Optuna objective for a one-vs-rest hinge-loss SGD classifier.

    Samples ``alpha`` (log scale, 1e-6..1e-4) and ``class_weight``
    (``None`` or ``"balanced"``), fits on the trial data ``X_tr``/``Y_tr``,
    tunes per-label thresholds on the validation scores with
    ``n_thr=N_THR_FAST`` and scores the thresholded validation predictions.

    Returns:
        The ``"micro_f1"`` entry of the validation metrics. The
        ``"macro_f1"`` entry is attached to the trial as a user attribute.
    """
    # Keep the suggestion order (alpha first) so a seeded sampler replays identically.
    sampled_alpha = trial.suggest_float("alpha", 1e-6, 1e-4, log=True)
    sampled_cw = trial.suggest_categorical("class_weight", [None, "balanced"])

    sgd_kwargs = {
        "loss": "hinge",
        "penalty": "l2",
        "alpha": sampled_alpha,
        "class_weight": sampled_cw,
        "max_iter": 2000,  # lower than the final fit to keep trials quick
        "tol": 1e-3,
        "random_state": RANDOM_STATE,
    }
    ovr = OneVsRestClassifier(SGDClassifier(**sgd_kwargs), n_jobs=N_JOBS)

    # Trials train on the (possibly subsampled) X_tr/Y_tr, not on X_train.
    ovr.fit(X_tr, Y_tr)

    val_scores, _ = score_matrix(ovr, X_val)

    # Threshold search inside a trial uses N_THR_FAST.
    label_thr = tune_thresholds_per_label(Y_val, val_scores, n_thr=N_THR_FAST)
    val_pred = apply_thresholds(val_scores, label_thr)
    metrics = multilabel_micro_macro(Y_val, val_pred)

    trial.set_user_attr("macro_f1", metrics["macro_f1"])
    return metrics["micro_f1"]

# -------- 2) Sampler + pruner --------
# NOTE(review): objective_sgd_hinge does not call trial.report() or
# trial.should_prune(), so this MedianPruner presumably never stops a trial —
# confirm against the Optuna pruner documentation.
sampler = optuna.samplers.TPESampler(seed=SEED)
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=0)

study = optuna.create_study(
    study_name=f"sgd_hinge_{RUN_ID}",
    direction="maximize",
    sampler=sampler,
    pruner=pruner,
)
study.optimize(objective_sgd_hinge, n_trials=N_TRIALS, timeout=TIMEOUT_SEC)

best_params = study.best_params
print("BEST sgd_hinge:", study.best_value, best_params)

# -------- 3) Final model: full training set + N_THR thresholds --------
final_sgd_kwargs = dict(
    loss="hinge",
    penalty="l2",
    alpha=best_params["alpha"],
    class_weight=best_params["class_weight"],
    max_iter=4000,  # larger iteration budget than the 2000 used inside trials
    tol=1e-3,
    random_state=RANDOM_STATE,
)
base = SGDClassifier(**final_sgd_kwargs)
best_model = OneVsRestClassifier(base, n_jobs=N_JOBS)
best_model.fit(X_train, Y_train)

# Thresholds are tuned on validation scores and then reused unchanged on test.
S_val, _ = score_matrix(best_model, X_val)
thr = tune_thresholds_per_label(Y_val, S_val, n_thr=N_THR)
Y_val_pred = apply_thresholds(S_val, thr)
val_m = multilabel_micro_macro(Y_val, Y_val_pred)

S_test, _ = score_matrix(best_model, X_test)
Y_test_pred = apply_thresholds(S_test, thr)
test_m = multilabel_micro_macro(Y_test, Y_test_pred)

# Persist the trial log and the best model artefacts.
study.trials_dataframe().to_csv(RESULTS_DIR / "sgd_hinge_trials.csv", index=False)
save_best(best_model, "optuna_sgd_hinge_best", best_params, thr, val_m, test_m)
val_m, test_m

[32m[I 2026-02-26 11:01:20,888][0m A new study created in memory with name: sgd_hinge_optuna_20260226_005315[0m


Using TRAIN_SUBSAMPLE=60000 for trials


[32m[I 2026-02-26 11:02:21,243][0m Trial 0 finished with value: 0.940609772035623 and parameters: {'alpha': 5.611516415334504e-06, 'class_weight': None}. Best is trial 0 with value: 0.940609772035623.[0m
[32m[I 2026-02-26 11:03:17,580][0m Trial 1 finished with value: 0.925443563608909 and parameters: {'alpha': 1.575132049977973e-05, 'class_weight': None}. Best is trial 0 with value: 0.940609772035623.[0m
[32m[I 2026-02-26 11:04:10,604][0m Trial 2 finished with value: 0.9439522573373857 and parameters: {'alpha': 1.306673923805328e-06, 'class_weight': None}. Best is trial 2 with value: 0.9439522573373857.[0m
[32m[I 2026-02-26 11:04:59,382][0m Trial 3 finished with value: 0.9386591634253768 and parameters: {'alpha': 2.6070247583707675e-05, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.9439522573373857.[0m
[32m[I 2026-02-26 11:05:47,277][0m Trial 4 finished with value: 0.9378877302811456 and parameters: {'alpha': 4.6225890010208326e-05, 'class_weight': None}. Bes

BEST sgd_hinge: 0.9469473247223767 {'alpha': 1.0677482709481361e-05, 'class_weight': None}
✅ Saved model: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\Optuna_Tuned_optuna_20260226_005315\optuna_sgd_hinge_best.joblib
✅ Saved thresholds: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\Optuna_Tuned_optuna_20260226_005315\optuna_sgd_hinge_best_thresholds.json
✅ Saved params: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\improvement_models\Improvement_Models\Optuna_Tuned_optuna_20260226_005315\optuna_sgd_hinge_best_best_params.json


({'micro_f1': 0.9480212803102002, 'macro_f1': 0.8888894025923383},
 {'micro_f1': 0.9380873513593986, 'macro_f1': 0.8953980073993205})

# “FINAL TRAIN” (LogReg) — bitta cell (train+val bilan)

In [13]:
# Training LogisticRegression (...)

# Train time: ... min

# formatlari qo‘shilgan (fit va threshold calibration ham vaqt bilan chiqadi).
# Va siz aytgan path’larda saqlaydi:

# ✅ Models/best_model/<MODEL_NAME>/...

# ✅ results/tables/best_model_results/<MODEL_NAME>/...

#  bu cell ishlashi uchun sizda helper funksiyalar bor bo‘lsin:
# score_matrix, tune_thresholds_per_label, apply_thresholds

from pathlib import Path
import json
import time
import numpy as np
import pandas as pd
from scipy import sparse
import joblib

from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# =========================
# CONFIG
# =========================
# Dataset version folder looked up under Data/Feature_Selected or Data/Engineered_data.
VERSION = "fe_v1_fs_chi2_v1"
# Passed to find_data_dir as prefer_fs: try the Feature_Selected copy first.
PREFER_FEATURE_SELECTED = True

MODEL_NAME = "optuna_logreg_best"   # also used as the output folder name and file prefix

# Best LogisticRegression hyperparameters from the Optuna LogReg tuning (hard-coded here).
C_best = 2.98109399813007
cw_best = "balanced"

# =========================
# Helpers: pretty timing + titles
# =========================
def fmt_minutes(seconds: float) -> str:
    """Render a duration in seconds as a short human-readable string.

    Durations below one minute are shown in seconds with one decimal
    (e.g. ``"30.0 sec"``); everything else is shown in minutes with two
    decimals (e.g. ``"5.69 min"``).
    """
    minutes = seconds / 60.0
    return f"{seconds:.1f} sec" if minutes < 1 else f"{minutes:.2f} min"

def model_title(name: str, params: dict) -> str:
    """Build a display title of the form ``"<name> (k1=v1, k2=v2)"``.

    Parameters are rendered in the dict's iteration order; an empty dict
    gives ``"<name> ()"``.
    """
    rendered_pairs = []
    for key, value in params.items():
        rendered_pairs.append("{}={}".format(key, value))
    return "{} ({})".format(name, ", ".join(rendered_pairs))

# =========================
# Helpers: project root + data dir
# =========================
def find_project_root(start=None):
    """Locate the project root by walking upwards from ``start``.

    A directory counts as the project root when it contains a ``Data``
    directory with at least one non-empty sub-directory among
    ``Feature_Selected``, ``Engineered_data``, ``Processed`` and ``Raw_data``.

    Args:
        start: Directory to start from (``Path`` or ``str``). Defaults to the
            current working directory.

    Returns:
        The first matching directory among ``start`` and its parents, or
        ``start`` itself when none matches.
    """
    start = Path(start) if start else Path.cwd()
    data_subdirs = ("Feature_Selected", "Engineered_data", "Processed", "Raw_data")
    for candidate in [start, *start.parents]:
        data_dir = candidate / "Data"
        if not data_dir.is_dir():
            continue
        for sub in data_subdirs:
            sub_dir = data_dir / sub
            # is_dir() guards iterdir(): a plain file carrying one of these
            # names is skipped instead of raising NotADirectoryError.
            if sub_dir.is_dir() and any(sub_dir.iterdir()):
                return candidate
    return start

def find_data_dir(project_root: Path, version: str, prefer_fs: bool = True):
    """Return the data folder for ``version`` that contains ``X_train.npz``.

    Looks at ``Data/Feature_Selected/<version>`` and
    ``Data/Engineered_data/<version>`` under ``project_root``. With
    ``prefer_fs`` the Feature_Selected folder is checked first, otherwise
    Engineered_data is checked first; the other folder is the fallback.

    Raises:
        FileNotFoundError: if neither folder contains ``X_train.npz``.
    """
    data_root = project_root / "Data"
    selected_dir = data_root / "Feature_Selected" / version
    engineered_dir = data_root / "Engineered_data" / version

    if prefer_fs:
        search_order = (selected_dir, engineered_dir)
    else:
        search_order = (engineered_dir, selected_dir)

    for candidate in search_order:
        if (candidate / "X_train.npz").exists():
            return candidate
    raise FileNotFoundError("X_train.npz topilmadi (Feature_Selected/Engineered_data).")

def per_label_metrics_df(y_cols, Y_true, Y_pred, thr):
    """Build a per-label precision/recall/F1 table.

    Args:
        y_cols: Label names; entry ``j`` describes column ``j`` of the matrices.
        Y_true: 2-D array-like of ground-truth labels (entries compared to 0/1).
        Y_pred: 2-D array-like of predicted labels, same layout as ``Y_true``.
        thr: Per-label thresholds, indexable by label position.

    Returns:
        ``pd.DataFrame`` with one row per label and the columns ``label``,
        ``support_pos``, ``precision``, ``recall``, ``f1`` and ``threshold``.
        Precision, recall and F1 are 0.0 whenever their denominator is zero.
        An empty ``y_cols`` yields an empty frame that still has these columns.
    """
    columns = ["label", "support_pos", "precision", "recall", "f1", "threshold"]
    # Accept plain nested lists as well as numpy arrays.
    Y_true = np.asarray(Y_true)
    Y_pred = np.asarray(Y_pred)

    rows = []
    for j, label in enumerate(y_cols):
        truth = Y_true[:, j]
        pred = Y_pred[:, j]
        tp = int(((truth == 1) & (pred == 1)).sum())
        fp = int(((truth == 0) & (pred == 1)).sum())
        fn = int(((truth == 1) & (pred == 0)).sum())
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        if (precision + recall) > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0
        rows.append({
            "label": label,
            "support_pos": int(truth.sum()),
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "threshold": float(thr[j]),
        })
    # Explicit columns keep the schema stable even when there are no labels.
    return pd.DataFrame(rows, columns=columns)

# =========================
# Load data
# =========================
# Resolve the project root and the versioned data folder.
PROJECT_ROOT = find_project_root()
DATA_DIR = find_data_dir(PROJECT_ROOT, VERSION, PREFER_FEATURE_SELECTED)

# Label names come from the metadata file's "y_cols" entry.
with open(DATA_DIR / "engineered_meta.json", "r", encoding="utf-8") as f:
    meta = json.load(f)
y_cols = meta["y_cols"]

# Sparse feature matrices, converted to CSR so they can be row-indexed later.
X_train = sparse.load_npz(DATA_DIR / "X_train.npz").tocsr()
X_val   = sparse.load_npz(DATA_DIR / "X_val.npz").tocsr()
X_test  = sparse.load_npz(DATA_DIR / "X_test.npz").tocsr()

Y_train = np.load(DATA_DIR / "Y_train.npy")
Y_val   = np.load(DATA_DIR / "Y_val.npy")
Y_test  = np.load(DATA_DIR / "Y_test.npy")

# Fail early on inconsistent artefacts: later steps index Y by y_cols position
# and zip y_cols with thresholds, which would otherwise truncate silently.
for _split, _X, _Y in (("train", X_train, Y_train), ("val", X_val, Y_val), ("test", X_test, Y_test)):
    if _X.shape[0] != _Y.shape[0]:
        raise ValueError(f"{_split}: X has {_X.shape[0]} rows but Y has {_Y.shape[0]}")
    if _Y.ndim != 2 or _Y.shape[1] != len(y_cols):
        raise ValueError(f"{_split}: Y shape {_Y.shape} does not match {len(y_cols)} labels in y_cols")
    if _X.shape[1] != X_train.shape[1]:
        raise ValueError(f"{_split}: X has {_X.shape[1]} features but train has {X_train.shape[1]}")

# =========================
# FINAL TRAIN: train+val -> fit/cal split
# =========================
# Named knobs instead of inline magic numbers.
FINAL_SEED = 42        # seeds both the fit/calibration split and the solver
CAL_FRACTION = 0.10    # share of train+val held out for threshold calibration
N_THR_FINAL = 61       # n_thr passed to tune_thresholds_per_label for the final model

# Merge train and validation, then split off a calibration part for thresholds.
X_tv = sparse.vstack([X_train, X_val]).tocsr()
Y_tv = np.vstack([Y_train, Y_val]).astype(np.int8)

idx_all = np.arange(X_tv.shape[0])
idx_fit, idx_cal = train_test_split(idx_all, test_size=CAL_FRACTION, random_state=FINAL_SEED)

X_fit, Y_fit = X_tv[idx_fit], Y_tv[idx_fit]
X_cal, Y_cal = X_tv[idx_cal], Y_tv[idx_cal]

# Single source of truth: the estimator is built from the same dict that is
# printed and stored with the timing info, so the two cannot drift apart.
MODEL_DISPLAY_NAME = "LogisticRegression"
MODEL_PARAMS = {"solver": "liblinear", "C": C_best, "class_weight": cw_best, "max_iter": 6000}

# random_state is documented by scikit-learn as used by the liblinear solver;
# pinning it keeps the final fit reproducible.
base = LogisticRegression(**MODEL_PARAMS, random_state=FINAL_SEED)
final_model = OneVsRestClassifier(base, n_jobs=1)

print("\n" + "="*90)
print("Training", model_title(MODEL_DISPLAY_NAME, MODEL_PARAMS))
print("="*90)

t0 = time.perf_counter()
final_model.fit(X_fit, Y_fit)
t1 = time.perf_counter()
print(f"Train time: {fmt_minutes(t1 - t0)}")

# Per-label thresholds are tuned on the calibration split only.
t2 = time.perf_counter()
S_cal, _ = score_matrix(final_model, X_cal)
thr_final = tune_thresholds_per_label(Y_cal, S_cal, n_thr=N_THR_FINAL)
t3 = time.perf_counter()
print(f"Threshold calib time: {fmt_minutes(t3 - t2)}")
print(f"Total time: {fmt_minutes((t1 - t0) + (t3 - t2))}")

# =========================
# EVAL: VAL + TEST
# =========================
# NOTE: final_model was fitted and threshold-calibrated on train+val (X_tv is
# built from X_train and X_val), so every X_val row was seen during fitting or
# calibration. The VAL numbers below are therefore in-sample; X_test is not
# part of X_tv. Both summaries carry an "in_sample" flag to make this explicit.
S_val, _ = score_matrix(final_model, X_val)
Y_val_pred = apply_thresholds(S_val, thr_final)

S_test, _ = score_matrix(final_model, X_test)
Y_test_pred = apply_thresholds(S_test, thr_final)

val_micro = f1_score(Y_val, Y_val_pred, average="micro", zero_division=0)
val_macro = f1_score(Y_val, Y_val_pred, average="macro", zero_division=0)
test_micro = f1_score(Y_test, Y_test_pred, average="micro", zero_division=0)
test_macro = f1_score(Y_test, Y_test_pred, average="macro", zero_division=0)

val_summary = {
    "model": MODEL_NAME,
    "micro_f1": float(val_micro),
    "macro_f1": float(val_macro),
    "in_sample": True,
}
test_summary = {
    "model": MODEL_NAME,
    "micro_f1": float(test_micro),
    "macro_f1": float(test_macro),
    "in_sample": False,
}

val_per_label = per_label_metrics_df(y_cols, Y_val, Y_val_pred, thr_final)
test_per_label = per_label_metrics_df(y_cols, Y_test, Y_test_pred, thr_final)

# Durations measured around the fit and the threshold calibration.
fit_seconds = t1 - t0
calib_seconds = t3 - t2
total_seconds = fit_seconds + calib_seconds

timing = {
    "model": MODEL_NAME,
    "display": model_title(MODEL_DISPLAY_NAME, MODEL_PARAMS),
    "train_fit_seconds": float(fit_seconds),
    "threshold_calib_seconds": float(calib_seconds),
    "total_seconds": float(total_seconds),
    "train_fit_human": fmt_minutes(fit_seconds),
    "threshold_calib_human": fmt_minutes(calib_seconds),
    "total_human": fmt_minutes(total_seconds),
    "X_fit_shape": [int(X_fit.shape[0]), int(X_fit.shape[1])],
    "X_cal_shape": [int(X_cal.shape[0]), int(X_cal.shape[1])],
}

print("\nVAL:", val_summary)
print("TEST:", test_summary)

# =========================
# SAVE (siz aytgandek)
# =========================
def _dump_json(path, payload):
    """Write ``payload`` to ``path`` as UTF-8 JSON (indent=2, non-ASCII kept)."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


# Output folders: model artefacts and result tables, both keyed by MODEL_NAME.
MODEL_DIR = PROJECT_ROOT / "Models" / "best_model" / MODEL_NAME
RES_DIR = PROJECT_ROOT / "results" / "tables" / "best_model_results" / MODEL_NAME
MODEL_DIR.mkdir(parents=True, exist_ok=True)
RES_DIR.mkdir(parents=True, exist_ok=True)

# model + thresholds + params
joblib.dump(final_model, MODEL_DIR / f"{MODEL_NAME}.joblib")

# zip() would silently drop entries on a length mismatch, so check first.
if len(y_cols) != len(thr_final):
    raise ValueError(f"{len(y_cols)} labels in y_cols but {len(thr_final)} thresholds")

# Strip only a leading "y_"; str.replace("y_", "", 1) would also cut a "y_"
# found in the middle of a label that has no such prefix.
thr_dict = {
    (c[len("y_"):] if c.startswith("y_") else c): float(t)
    for c, t in zip(y_cols, thr_final)
}
_dump_json(MODEL_DIR / f"{MODEL_NAME}_thresholds.json", thr_dict)
_dump_json(MODEL_DIR / f"{MODEL_NAME}_params.json", {"C": C_best, "class_weight": cw_best})

# results
val_per_label.to_csv(RES_DIR / f"{MODEL_NAME}_val_per_label_metrics.csv", index=False)
test_per_label.to_csv(RES_DIR / f"{MODEL_NAME}_test_per_label_metrics.csv", index=False)

_dump_json(RES_DIR / f"{MODEL_NAME}_val_summary.json", val_summary)
_dump_json(RES_DIR / f"{MODEL_NAME}_test_summary.json", test_summary)

timing_path = RES_DIR / f"{MODEL_NAME}_timing.json"
_dump_json(timing_path, timing)

print("\n✅ Saved model dir:", MODEL_DIR.resolve())
print("✅ Saved results dir:", RES_DIR.resolve())
print("✅ Timing saved:", timing_path.resolve())


Training LogisticRegression (solver=liblinear, C=2.98109399813007, class_weight=balanced, max_iter=6000)
Train time: 5.69 min
Threshold calib time: 0.5 sec
Total time: 5.70 min

VAL: {'model': 'optuna_logreg_best', 'micro_f1': 0.9848308051341891, 'macro_f1': 0.9737848794746311}
TEST: {'model': 'optuna_logreg_best', 'micro_f1': 0.9781993259508907, 'macro_f1': 0.9660216617683088}

✅ Saved model dir: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\best_model\optuna_logreg_best
✅ Saved results dir: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\tables\best_model_results\optuna_logreg_best
✅ Timing saved: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\tables\best_model_results\optuna_logreg_best\optuna_logreg_best_timing.json
