# 16) Compare Results — Baseline vs Improvement vs Tuning vs Best Model
Bu notebook:
- 06_train_baseline.ipynb
- 10_train_improvement.ipynb
- 12_tuning_NO_OVERSAMPLING.ipynb
notebooklaridan natijalarni **avtomatik parse qilib**, bosqichma-bosqich (tagma-tag) ko‘rsatadi.

**Ranglar:** har bir bosqich jadvalida har bir metrikaning **eng yuqori** qiymati yashil, **eng past** qiymati qizil.


In [1]:
from __future__ import annotations

import json, re
from pathlib import Path
import pandas as pd
import numpy as np

try:
    from tabulate import tabulate
except Exception:
    tabulate = None

from IPython.display import display

# ==========================
# 0) Fayllarni topish
# ==========================
HERE = Path.cwd()


def find_file(filename: str, root: Path = HERE) -> Path:
    """Locate *filename* somewhere below *root*, falling back to ``/mnt/data``.

    The search is recursive. When several files share the name, the match
    closest to the search root wins (ties broken lexicographically), so the
    result no longer depends on filesystem enumeration order.

    Args:
        filename: Exact file name (or glob pattern) to look for.
        root: Directory to search first; defaults to the working directory
            captured when this cell was executed.

    Returns:
        Path of the selected match.

    Raises:
        FileNotFoundError: If neither search location contains the file.
    """

    def _closest(candidates: list[Path]) -> Path:
        # Shallowest path first, then alphabetical, for a reproducible pick.
        return min(candidates, key=lambda p: (len(p.parts), p.as_posix()))

    hits = list(root.rglob(filename))
    if hits:
        return _closest(hits)

    # Also helps when running from sandboxed locations such as /mnt/data.
    alt_root = Path("/mnt/data")
    if alt_root.exists():
        hits = list(alt_root.rglob(filename))
        if hits:
            return _closest(hits)

    raise FileNotFoundError(f"Topilmadi: {filename}. Compare notebook joylashgan papkada yoki uning ichida boâ€˜lishi kerak.")

# Stage key -> file name of the notebook whose saved outputs are parsed.
FILES = {
    "baseline": "06_train_baseline.ipynb",
    "improvement": "10_train_improvement.ipynb",
    "tuning": "12_tuning_NO_OVERSAMPLING.ipynb",
}

# Resolve every notebook to a concrete path; find_file raises
# FileNotFoundError if one of them cannot be located.
paths = {k: find_file(v) for k, v in FILES.items()}
# Bare expression so the notebook displays the resolved paths.
paths


{'baseline': WindowsPath('c:/Users/xolmu/OneDrive/Desktop/Modul Program oyi/Modul_Program3/6_project_dori_tasiri_extract/Notebooks/06_train_baseline.ipynb'),
 'improvement': WindowsPath('c:/Users/xolmu/OneDrive/Desktop/Modul Program oyi/Modul_Program3/6_project_dori_tasiri_extract/Notebooks/10_train_improvement.ipynb'),
 'tuning': WindowsPath('c:/Users/xolmu/OneDrive/Desktop/Modul Program oyi/Modul_Program3/6_project_dori_tasiri_extract/Notebooks/12_tuning_NO_OVERSAMPLING.ipynb')}

In [2]:
# ==========================
# 1) Notebook outputlarini o‘qish
# ==========================
def load_ipynb(path: Path) -> dict:
    """Read the notebook at *path* (UTF-8 JSON) and return the decoded object."""
    raw_json = Path(path).read_text(encoding="utf-8")
    return json.loads(raw_json)

def iter_cell_text_outputs(nb: dict):
    """Yield ``(cell_index, text)`` for each textual output of the code cells.

    Two output families are reported: ``stream`` outputs (their ``text``
    field) and ``execute_result`` / ``display_data`` outputs that carry a
    ``text/plain`` entry. Text stored as a list of fragments is concatenated
    into one string. Non-code cells and other output types are skipped.
    """
    rich_kinds = ("execute_result", "display_data")
    for cell_no, cell in enumerate(nb.get("cells", [])):
        if cell.get("cell_type") != "code":
            continue
        for output in cell.get("outputs", []):
            kind = output.get("output_type")
            if kind == "stream":
                payload = output.get("text", "")
            elif kind in rich_kinds:
                payload = output.get("data", {}).get("text/plain")
                if payload is None:
                    # Rich output without a plain-text representation.
                    continue
            else:
                continue
            yield cell_no, ("".join(payload) if isinstance(payload, list) else payload)

def get_cell_source(nb: dict, idx: int) -> str:
    """Return the source of cell *idx* as one string (empty if it has none)."""
    source = nb["cells"][idx].get("source", "")
    if isinstance(source, list):
        # Source may be stored as a list of line fragments.
        return "".join(source)
    return str(source)

def to_float(x):
    """Convert *x* to ``float``; return NaN when the conversion fails."""
    try:
        value = float(x)
    except Exception:
        value = np.nan
    return value


In [3]:
# ==========================
# 2) Parsers
# ==========================
# Matches a "TEST metrics:" header followed, on consecutive lines, by
# micro_f1, macro_f1, micro_P and micro_R values.
# Named groups: micro_f1, macro_f1, micro_p, micro_r.
# NOTE: the pattern has no ^/$ anchors, so re.MULTILINE has no effect here.
RE_TEST_METRICS = re.compile(
    r"TEST metrics:\s*\n\s*micro_f1:\s*(?P<micro_f1>[0-9eE.+-]+)\s*\n\s*macro_f1:\s*(?P<macro_f1>[0-9eE.+-]+)\s*\n\s*micro_P\s*:\s*(?P<micro_p>[0-9eE.+-]+)\s*\n\s*micro_R\s*:\s*(?P<micro_r>[0-9eE.+-]+)",
    re.MULTILINE
)

# Matches a "TRAIN: <model>" header, a line of '=' characters, then (lazily,
# across lines thanks to DOTALL) the next "VAL micro_f1: x | macro_f1: y".
# Named groups: model, micro_f1, macro_f1.
RE_VAL_METRICS_IMPR = re.compile(
    r"TRAIN:\s*(?P<model>\S+)\s*\n=+\s*\n.*?VAL micro_f1:\s*(?P<micro_f1>[0-9eE.+-]+)\s*\|\s*macro_f1:\s*(?P<macro_f1>[0-9eE.+-]+)",
    re.DOTALL
)

# Matches the text of a 2-tuple of dicts:
#   ({'micro_f1': a, 'macro_f1': b}, {'micro_f1': c, 'macro_f1': d})
# The first dict is captured as val_micro/val_macro, the second as
# test_micro/test_macro.
RE_TUPLE_VAL_TEST = re.compile(
    r"\(\{'micro_f1':\s*(?P<val_micro>[0-9eE.+-]+),\s*'macro_f1':\s*(?P<val_macro>[0-9eE.+-]+)\}\s*,\s*\{'micro_f1':\s*(?P<test_micro>[0-9eE.+-]+),\s*'macro_f1':\s*(?P<test_macro>[0-9eE.+-]+)\}\)",
    re.DOTALL
)

# Match "VAL: {...}" / "TEST: {...}" dict text. 'micro_f1' must be followed
# directly by 'macro_f1', and 'macro_f1' must be the last key before '}'.
RE_FINAL_VAL = re.compile(r"VAL:\s*\{[^}]*'micro_f1':\s*(?P<val_micro>[0-9eE.+-]+),\s*'macro_f1':\s*(?P<val_macro>[0-9eE.+-]+)\}")
RE_FINAL_TEST = re.compile(r"TEST:\s*\{[^}]*'micro_f1':\s*(?P<test_micro>[0-9eE.+-]+),\s*'macro_f1':\s*(?P<test_macro>[0-9eE.+-]+)\}")

def parse_baseline(nb: dict) -> pd.DataFrame:
    """Extract TEST metrics of every baseline model from a parsed notebook.

    Each text output matching ``RE_TEST_METRICS`` becomes one row. The model
    name is the ``run_name = "..."`` assignment found in the producing cell's
    source. Without one, cell 13 is labelled ``baseline_ovr_logreg`` (a
    layout-specific heuristic) and any other cell ``baseline_cell_<index>``.

    Args:
        nb: Notebook JSON as returned by ``load_ipynb``.

    Returns:
        One row per distinct model (first occurrence kept), sorted by
        ``test_micro_f1`` then ``test_macro_f1``, best first. When nothing
        matches, an empty frame with the expected columns is returned.
    """
    columns = [
        "model",
        "test_micro_f1",
        "test_macro_f1",
        "test_micro_precision",
        "test_micro_recall",
    ]
    rows = []
    for ci, txt in iter_cell_text_outputs(nb):
        m = RE_TEST_METRICS.search(txt)
        if not m:
            continue

        # Model name: prefer run_name from the cell source, else a heuristic
        # based on the cell index.
        src = get_cell_source(nb, ci)
        m_rn = re.search(r"run_name\s*=\s*['\"](?P<rn>[^'\"]+)['\"]", src)
        if m_rn:
            rn = m_rn.group("rn")
        else:
            # Cell 13 is the LogReg baseline evaluation cell.
            rn = "baseline_ovr_logreg" if ci == 13 else f"baseline_cell_{ci}"

        rows.append({
            "model": rn,
            "test_micro_f1": to_float(m.group("micro_f1")),
            "test_macro_f1": to_float(m.group("macro_f1")),
            "test_micro_precision": to_float(m.group("micro_p")),
            "test_micro_recall": to_float(m.group("micro_r")),
        })

    if not rows:
        # drop_duplicates(subset=["model"]) raises KeyError on a column-less
        # frame, so return a typed empty frame instead.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows).drop_duplicates(subset=["model"], keep="first")
    return df.sort_values(["test_micro_f1", "test_macro_f1"], ascending=False).reset_index(drop=True)

def parse_improvement(nb: dict) -> pd.DataFrame:
    """Extract VAL metrics of every model trained in the improvement notebook.

    Every ``TRAIN: <model>`` / ``VAL micro_f1: ... | macro_f1: ...`` pair
    matched by ``RE_VAL_METRICS_IMPR`` becomes one row. All pairs inside one
    output are collected, not only the first.

    Args:
        nb: Notebook JSON as returned by ``load_ipynb``.

    Returns:
        One row per distinct model (first occurrence kept), sorted by
        ``val_micro_f1`` then ``val_macro_f1``, best first. When nothing
        matches, an empty frame with the expected columns is returned.
    """
    columns = ["model", "val_micro_f1", "val_macro_f1"]
    rows = []
    for _, txt in iter_cell_text_outputs(nb):
        # Cheap substring pre-filter before running the regex.
        if "TRAIN:" not in txt or "VAL micro_f1" not in txt:
            continue
        for m in RE_VAL_METRICS_IMPR.finditer(txt):
            rows.append({
                "model": m.group("model").strip(),
                "val_micro_f1": to_float(m.group("micro_f1")),
                "val_macro_f1": to_float(m.group("macro_f1")),
            })

    if not rows:
        # drop_duplicates(subset=["model"]) raises KeyError on a column-less
        # frame, so return a typed empty frame instead.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows).drop_duplicates(subset=["model"], keep="first")
    return df.sort_values(["val_micro_f1", "val_macro_f1"], ascending=False).reset_index(drop=True)

def parse_tuning(nb: dict) -> pd.DataFrame:
    """Extract VAL/TEST metrics of each tuned model from the tuning notebook.

    A row is produced for every text output containing the
    ``({val dict}, {test dict})`` tuple matched by ``RE_TUPLE_VAL_TEST``.

    The model name comes from a study-name hint. It is looked up first in
    the matching output, then in the other text outputs of the same cell,
    and finally in the cell source. If no hint is found, or a hint found
    only via the wider lookup is already used by an earlier row, the row is
    named ``tune_cell_<index>``.

    Args:
        nb: Notebook JSON as returned by ``load_ipynb``.

    Returns:
        One row per distinct model (first occurrence kept), sorted by
        ``test_micro_f1`` then ``test_macro_f1``, best first. When nothing
        matches, an empty frame with the expected columns is returned.
    """
    columns = ["model", "val_micro_f1", "val_macro_f1", "test_micro_f1", "test_macro_f1"]
    algo_by_hint = {
        "logreg_optuna": "tune_optuna_logreg",
        "linearsvc_optuna": "tune_optuna_linearsvc",
        "sgd_logloss_optuna": "tune_optuna_sgd_logloss",
        "sgd_hinge_optuna": "tune_optuna_sgd_hinge",
    }

    def _name_from(text: str):
        """Return the model name for the first hint present in *text*."""
        for hint, name in algo_by_hint.items():
            if hint in text:
                return name
        return None

    outputs = [(ci, txt) for ci, txt in iter_cell_text_outputs(nb) if isinstance(txt, str)]

    # Group all text of a cell. Presumably the study name is logged to a
    # stream output while the metrics tuple is a separate result output of
    # the same cell (TODO confirm against the tuning notebook).
    texts_by_cell: dict[int, list[str]] = {}
    for ci, txt in outputs:
        texts_by_cell.setdefault(ci, []).append(txt)

    rows = []
    used_names = set()
    for ci, txt in outputs:
        m = RE_TUPLE_VAL_TEST.search(txt)
        if not m:
            continue

        model = _name_from(txt)
        if model is None:
            candidate = _name_from("\n".join(texts_by_cell[ci])) or _name_from(get_cell_source(nb, ci))
            # An ambiguous wider match must not make drop_duplicates discard
            # a row, so fall back to the cell label when the name is taken.
            if candidate is not None and candidate not in used_names:
                model = candidate
            else:
                model = f"tune_cell_{ci}"
        used_names.add(model)

        rows.append({
            "model": model,
            "val_micro_f1": to_float(m.group("val_micro")),
            "val_macro_f1": to_float(m.group("val_macro")),
            "test_micro_f1": to_float(m.group("test_micro")),
            "test_macro_f1": to_float(m.group("test_macro")),
        })

    if not rows:
        # drop_duplicates(subset=["model"]) raises KeyError on a column-less
        # frame, so return a typed empty frame instead.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows).drop_duplicates(subset=["model"], keep="first")
    # The tuning table is ordered by TEST performance.
    return df.sort_values(["test_micro_f1", "test_macro_f1"], ascending=False).reset_index(drop=True)

def parse_best_model(nb: dict) -> pd.DataFrame:
    """Read the final-train VAL/TEST metrics from the tuning notebook.

    The first text output that mentions "Training LogisticRegression" and
    holds both a ``VAL: {...}`` and a ``TEST: {...}`` dict is used.

    Returns:
        A single-row frame named ``optuna_logreg_best_final``, or an empty
        frame with the same columns when no such output exists.
    """
    columns = ["model", "val_micro_f1", "val_macro_f1", "test_micro_f1", "test_macro_f1"]
    markers = ("Training LogisticRegression", "VAL:", "TEST:")

    for _, text in iter_cell_text_outputs(nb):
        if not all(marker in text for marker in markers):
            continue
        val_match = RE_FINAL_VAL.search(text)
        test_match = RE_FINAL_TEST.search(text)
        if val_match is None or test_match is None:
            continue
        record = {
            "model": "optuna_logreg_best_final",
            "val_micro_f1": to_float(val_match.group("val_micro")),
            "val_macro_f1": to_float(val_match.group("val_macro")),
            "test_micro_f1": to_float(test_match.group("test_micro")),
            "test_macro_f1": to_float(test_match.group("test_macro")),
        }
        return pd.DataFrame([record])

    return pd.DataFrame(columns=columns)


In [4]:
# ==========================
# 3) Parse + DataFrames
# ==========================
# Load the three source notebooks as parsed JSON.
baseline_nb = load_ipynb(paths["baseline"])
impr_nb = load_ipynb(paths["improvement"])
tune_nb = load_ipynb(paths["tuning"])

# One metrics table per stage; the best-model table is read from the same
# tuning notebook as the tuning table.
df_baseline = parse_baseline(baseline_nb)
df_impr = parse_improvement(impr_nb)
df_tune = parse_tuning(tune_nb)
df_best = parse_best_model(tune_nb)

# Bare expression so the notebook displays all four frames.
df_baseline, df_impr, df_tune, df_best


(                               model  test_micro_f1  test_macro_f1  \
 0             baseline_ovr_linearsvc         0.9995         0.9988   
 1  baseline_ovr_calibrated_linearsvc         0.9992         0.9984   
 2                baseline_ovr_logreg         0.9973         0.9958   
 3           baseline_ovr_sgd_logloss         0.9950         0.9920   
 
    test_micro_precision  test_micro_recall  
 0                0.9992             0.9998  
 1                0.9989             0.9995  
 2                0.9960             0.9986  
 3                0.9934             0.9966  ,
                   model  val_micro_f1  val_macro_f1
 0     ovr_logreg_bal_C2      0.980376      0.965836
 1      ovr_linearsvc_C1      0.980182      0.964417
 2     ovr_logreg_bal_C1      0.978403      0.944185
 3       ovr_sgd_logloss      0.974033      0.952951
 4         ovr_sgd_hinge      0.957535      0.936905
 5  ovr_complementnb_a05      0.913360      0.882115,
          model  val_micro_f1  val_macro

In [5]:
# ==========================
# 4) Tabulate + 🟩/🟥 (max/min) — jinja2 kerak EMAS
# ==========================

def add_max_min_marks(df: pd.DataFrame, cols):
    """Return a copy of *df* whose metric columns are formatted and tagged.

    Each column in *cols* that exists and holds at least one numeric value is
    converted to strings with six decimals. The column maximum gets the "max"
    marker appended and the minimum the "min" marker; a value that is both
    gets both. Missing values are rendered as a dash placeholder. Other
    columns, and the input frame itself, are left untouched.
    """
    marked = df.copy()
    for name in cols:
        if name not in marked.columns:
            continue
        values = pd.to_numeric(marked[name], errors="coerce")
        if values.isna().all():
            # Nothing numeric to rank in this column.
            continue
        top, bottom = values.max(), values.min()

        def render(value, top=top, bottom=bottom):
            if pd.isna(value):
                return "â€”"
            number = float(value)
            suffix = ("ðŸŸ©" if number == top else "") + ("ðŸŸ¥" if number == bottom else "")
            return f"{number:.6f}" + suffix

        marked[name] = values.map(render)
    return marked

def show_stage(stage_name: str, df: pd.DataFrame, mark_cols):
    """Print a banner for *stage_name* followed by the marked metrics table.

    An empty frame prints a "nothing parsed" notice instead. The table is
    rendered with ``tabulate`` (GitHub format) when it was importable, and
    with ``DataFrame.to_string`` otherwise.
    """
    rule = "=" * 90
    print("\n" + rule, stage_name, rule, sep="\n")

    if df.empty:
        print("(Bu bosqich uchun parse qilingan natija topilmadi.)")
        return

    marked = add_max_min_marks(df, mark_cols)
    if tabulate is None:
        rendered = marked.to_string(index=False)
    else:
        rendered = tabulate(marked, headers="keys", tablefmt="github", showindex=False)
    print(rendered)


## Baseline results

In [6]:
# Baseline stage: TEST metrics only, with max/min marks per column.
show_stage('BASELINE (from 06_train_baseline.ipynb) â€” TEST metrics', df_baseline,
          ['test_micro_f1','test_macro_f1','test_micro_precision','test_micro_recall'])


BASELINE (from 06_train_baseline.ipynb) â€” TEST metrics
| model                             | test_micro_f1   | test_macro_f1   | test_micro_precision   | test_micro_recall   |
|-----------------------------------|-----------------|-----------------|------------------------|---------------------|
| baseline_ovr_linearsvc            | 0.999500ðŸŸ©      | 0.998800ðŸŸ©      | 0.999200ðŸŸ©             | 0.999800ðŸŸ©          |
| baseline_ovr_calibrated_linearsvc | 0.999200        | 0.998400        | 0.998900               | 0.999500            |
| baseline_ovr_logreg               | 0.997300        | 0.995800        | 0.996000               | 0.998600            |
| baseline_ovr_sgd_logloss          | 0.995000ðŸŸ¥      | 0.992000ðŸŸ¥      | 0.993400ðŸŸ¥             | 0.996600ðŸŸ¥          |


## Improvement results

In [7]:
# Improvement stage: only VAL metrics are parsed for this notebook.
show_stage('IMPROVEMENT (from 10_train_improvement.ipynb) â€” VAL metrics', df_impr,
          ['val_micro_f1','val_macro_f1'])


IMPROVEMENT (from 10_train_improvement.ipynb) â€” VAL metrics
| model                | val_micro_f1   | val_macro_f1   |
|----------------------|----------------|----------------|
| ovr_logreg_bal_C2    | 0.980376ðŸŸ©     | 0.965836ðŸŸ©     |
| ovr_linearsvc_C1     | 0.980182       | 0.964417       |
| ovr_logreg_bal_C1    | 0.978403       | 0.944185       |
| ovr_sgd_logloss      | 0.974033       | 0.952951       |
| ovr_sgd_hinge        | 0.957535       | 0.936905       |
| ovr_complementnb_a05 | 0.913360ðŸŸ¥     | 0.882115ðŸŸ¥     |


## Tuning results (Optuna, NO_OVERSAMPLING)

In [8]:
# Tuning stage: both VAL and TEST metrics per tuned model.
show_stage('TUNING (from 12_tuning_NO_OVERSAMPLING.ipynb) â€” VAL/TEST metrics', df_tune,
          ['val_micro_f1','val_macro_f1','test_micro_f1','test_macro_f1'])


TUNING (from 12_tuning_NO_OVERSAMPLING.ipynb) â€” VAL/TEST metrics
| model       | val_micro_f1   | val_macro_f1   | test_micro_f1   | test_macro_f1   |
|-------------|----------------|----------------|-----------------|-----------------|
| tune_cell_4 | 0.962941ðŸŸ©     | 0.930969ðŸŸ©     | 0.957416ðŸŸ©      | 0.916767ðŸŸ©      |
| tune_cell_5 | 0.961265       | 0.908998       | 0.957016        | 0.912317        |
| tune_cell_6 | 0.950963       | 0.916381       | 0.943415        | 0.903986        |
| tune_cell_7 | 0.948021ðŸŸ¥     | 0.888889ðŸŸ¥     | 0.938087ðŸŸ¥      | 0.895398ðŸŸ¥      |


## Best model (Final Train)

In [9]:
# Final-train model: a single row, so every value carries both marks.
show_stage('BEST MODEL (FINAL TRAIN in 12_tuning_NO_OVERSAMPLING.ipynb) â€” VAL/TEST metrics', df_best,
          ['val_micro_f1','val_macro_f1','test_micro_f1','test_macro_f1'])


BEST MODEL (FINAL TRAIN in 12_tuning_NO_OVERSAMPLING.ipynb) â€” VAL/TEST metrics
| model                    | val_micro_f1   | val_macro_f1   | test_micro_f1   | test_macro_f1   |
|--------------------------|----------------|----------------|-----------------|-----------------|
| optuna_logreg_best_final | 0.984831ðŸŸ©ðŸŸ¥   | 0.973785ðŸŸ©ðŸŸ¥   | 0.978199ðŸŸ©ðŸŸ¥    | 0.966022ðŸŸ©ðŸŸ¥    |


## Summary — best-of-stage comparison

In [10]:
# ==========================
# 5) Best-of-stage summary
# ==========================
def pick_best(df: pd.DataFrame, prefer_test: bool = True):
    """Return the top-scoring row of *df* as a dict, or ``None``.

    The ranking column is ``test_micro_f1`` when *prefer_test* is true and
    the column exists, otherwise ``val_micro_f1`` if present, otherwise the
    first numeric column. ``None`` is returned for an empty frame or when no
    ranking column can be determined.
    """
    if df.empty:
        return None

    available = df.columns
    if prefer_test and "test_micro_f1" in available:
        rank_by = "test_micro_f1"
    elif "val_micro_f1" in available:
        rank_by = "val_micro_f1"
    else:
        # Fallback: first numeric column, if there is one.
        rank_by = next(
            (name for name in available if pd.api.types.is_numeric_dtype(df[name])),
            None,
        )

    if rank_by is None:
        return None
    ranked = df.sort_values(rank_by, ascending=False)
    return ranked.iloc[0].to_dict()

# Collect the winning row of each stage, tagging it with its stage name.
best_rows = []

b = pick_best(df_baseline, prefer_test=True)
if b:
    b["stage"] = "baseline"
    best_rows.append(b)

# The improvement stage only has VAL metrics, so rank on those.
i = pick_best(df_impr, prefer_test=False)
if i:
    i["stage"] = "improvement"
    best_rows.append(i)

t = pick_best(df_tune, prefer_test=True)
if t:
    t["stage"] = "tuning"
    best_rows.append(t)

bm = pick_best(df_best, prefer_test=True)
if bm:
    bm["stage"] = "best_model"
    best_rows.append(bm)

summary = pd.DataFrame(best_rows)

# Preferred column order first, then any remaining columns as they appear.
wanted = [
    "stage", "model",
    "val_micro_f1", "val_macro_f1",
    "test_micro_f1", "test_macro_f1",
    "test_micro_precision", "test_micro_recall",
]
cols = [c for c in wanted if c in summary.columns]
cols = cols + [c for c in summary.columns if c not in wanted]
summary = summary[cols]

show_stage(
    "SUMMARY (Best row from each stage)",
    summary,
    [c for c in summary.columns if c not in ("stage", "model")],
)



SUMMARY (Best row from each stage)
| stage       | model                    | val_micro_f1   | val_macro_f1   | test_micro_f1   | test_macro_f1   | test_micro_precision   | test_micro_recall   |
|-------------|--------------------------|----------------|----------------|-----------------|-----------------|------------------------|---------------------|
| baseline    | baseline_ovr_linearsvc   | â€”              | â€”              | 0.999500ðŸŸ©      | 0.998800ðŸŸ©      | 0.999200ðŸŸ©ðŸŸ¥           | 0.999800ðŸŸ©ðŸŸ¥        |
| improvement | ovr_logreg_bal_C2        | 0.980376       | 0.965836       | â€”               | â€”               | â€”                      | â€”                   |
| tuning      | tune_cell_4              | 0.962941ðŸŸ¥     | 0.930969ðŸŸ¥     | 0.957416ðŸŸ¥      | 0.916767ðŸŸ¥      | â€”                      | â€”                   |
| best_model  | optuna_logreg_best_final | 0.984831ðŸŸ©     | 0.973785ðŸŸ©     | 0.978199        | 0.966022        | â€”        

In [11]:
from pathlib import Path

# Search the current directory tree for saved preprocessing artifacts.
root = Path(".")  # assumes the working directory is the project root
patterns = ["tfidf_vectorizer.joblib", "feature_selector.joblib", "*vectorizer*.joblib", "*selector*.joblib"]

for pat in patterns:
    hits = list(root.rglob(pat))
    # Report the number of matches, then list at most the first 20.
    print(pat, "->", len(hits))
    for h in hits[:20]:
        print("  ", h)

tfidf_vectorizer.joblib -> 0
feature_selector.joblib -> 0
*vectorizer*.joblib -> 0
*selector*.joblib -> 0
