# LogisticRegression(OvR)

In [1]:
#CELL 1 — Imports + Paths

from pathlib import Path
import json
import numpy as np
import pandas as pd

from scipy import sparse
import joblib

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

# Project root
CWD = Path.cwd()
if (CWD / "Data").exists():
    PROJECT_ROOT = CWD
elif (CWD.parent / "Data").exists():
    PROJECT_ROOT = CWD.parent
else:
    PROJECT_ROOT = CWD

# 05 preprocess output
PP_DIR = PROJECT_ROOT / "Data" / "Preprocessed_data" / "baseline"

# outputs
MODEL_DIR = PROJECT_ROOT / "Models" / "baseline_models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

RES_TABLES = PROJECT_ROOT / "results" / "tables" / "baseline_train"
RES_TABLES.mkdir(parents=True, exist_ok=True)

RES_REPORTS = PROJECT_ROOT / "results" / "reports" / "baseline_train"
RES_REPORTS.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("PP_DIR exists:", PP_DIR.exists(), PP_DIR)
print("MODEL_DIR:", MODEL_DIR)
print("RES_TABLES:", RES_TABLES)
print("RES_REPORTS:", RES_REPORTS)

PROJECT_ROOT: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract
PP_DIR exists: True c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Preprocessed_data\baseline
MODEL_DIR: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\baseline_models
RES_TABLES: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\tables\baseline_train
RES_REPORTS: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\results\reports\baseline_train


In [2]:
#CELL 2 — Load X/Y + bundle (labels)
Xtr = sparse.load_npz(PP_DIR / "X_train.npz")
Xva = sparse.load_npz(PP_DIR / "X_val.npz")
Xte = sparse.load_npz(PP_DIR / "X_test.npz")

Ytr = np.load(PP_DIR / "Y_train.npy")
Yva = np.load(PP_DIR / "Y_val.npy")
Yte = np.load(PP_DIR / "Y_test.npy")

bundle = joblib.load(PP_DIR / "tfidf_bundle.joblib")
y_cols = bundle["y_cols"]  # list[str]

print("X shapes:", Xtr.shape, Xva.shape, Xte.shape)
print("Y shapes:", Ytr.shape, Yva.shape, Yte.shape)
print("Num labels:", len(y_cols))

X shapes: (201176, 58921) (24410, 58921) (24164, 58921)
Y shapes: (201176, 21) (24410, 21) (24164, 21)
Num labels: 21


In [3]:
#X’larni writeable qilib olamiz (bitta cell)
# X'larni aniq CSR va writeable qilish
from scipy import sparse

Xtr = sparse.csr_matrix(Xtr).copy()
Xva = sparse.csr_matrix(Xva).copy()
Xte = sparse.csr_matrix(Xte).copy()

# (ixtiyoriy) dtype float32 bo'lsa yaxshi
# Xtr.data = Xtr.data.astype("float32", copy=False)
# Xva.data = Xva.data.astype("float32", copy=False)
# Xte.data = Xte.data.astype("float32", copy=False)

print("Xtr csr:", sparse.isspmatrix_csr(Xtr), "writeable:", Xtr.data.flags.writeable)

Xtr csr: True writeable: True


In [4]:
# CELL 3 — Train baseline model (OvR Logistic Regression)

import time
base_lr = LogisticRegression(
    solver="liblinear",       # sparse uchun yaxshi
    max_iter=200,
    class_weight="balanced",  # imbalance uchun baseline fix
)

log_model = OneVsRestClassifier(base_lr, n_jobs=1)

t0 = time.time()
print("[A] Training LogisticRegression (n_jobs=1)...")
log_model.fit(Xtr, Ytr)
t1 = time.time()
print(f"[A] Done. Train time: {(t1-t0)/60:.2f} min")




# print("Training...")
# model.fit(Xtr, Ytr)
# print("Done.")

[A] Training LogisticRegression (n_jobs=1)...
[A] Done. Train time: 2.70 min


In [5]:
#CELL 4 — Predict scores on val/test (probabilities)
# OvR LogisticRegression -> predict_proba bor
Pva = log_model.predict_proba(Xva)  # (n_val, n_labels)
Pte = log_model.predict_proba(Xte)

print("P shapes:", Pva.shape, Pte.shape)
print("Prob range (val):", float(Pva.min()), float(Pva.max()))

# P shapes: (23626, 21) (23934, 21)
# Prob range (val): 1.0415268621111799e-05 1.0

# P shapes: (24310, 21) (24182, 21)
# Prob range (val): 1.2919881547259855e-05 1.0

P shapes: (24410, 21) (24164, 21)
Prob range (val): 1.9157989858011397e-05 1.0


In [6]:
#CELL 5 — Threshold tuning (per-label) on VAL

#Har label uchun thresholdni alohida topamiz: F1’ni maksimal qiladigan threshold.

def tune_thresholds_f1(Y_true, P, thresholds=None):
    if thresholds is None:
        thresholds = np.linspace(0.05, 0.95, 19)

    n_labels = Y_true.shape[1]
    best_t = np.zeros(n_labels, dtype=float)
    best_f = np.zeros(n_labels, dtype=float)

    for j in range(n_labels):
        yj = Y_true[:, j]
        pj = P[:, j]

        # agar label valda umuman yo‘q bo‘lsa (teoriya), default 0.5 qoldiramiz
        if yj.sum() == 0:
            best_t[j] = 0.5
            best_f[j] = 0.0
            continue

        best_score = -1.0
        best_thr = 0.5
        for t in thresholds:
            pred = (pj >= t).astype(int)
            f = f1_score(yj, pred, zero_division=0)
            if f > best_score:
                best_score = f
                best_thr = float(t)
        best_t[j] = best_thr
        best_f[j] = float(best_score)

    return best_t, best_f

thr_grid = np.linspace(0.05, 0.95, 19)
best_thr, best_f1 = tune_thresholds_f1(Yva, Pva, thresholds=thr_grid)

thr_df = pd.DataFrame({
    "label": [c.replace("y_", "", 1) for c in y_cols],
    "best_threshold": best_thr.round(3),
    "val_f1_at_best": best_f1.round(4),
})
thr_df = thr_df.sort_values("val_f1_at_best", ascending=False).reset_index(drop=True)

display(thr_df.head(25))

thr_df.to_csv(RES_TABLES / "thresholds_per_label_val.csv", index=False, encoding="utf-8-sig")
print("Saved thresholds table.")

Unnamed: 0,label,best_threshold,val_f1_at_best
0,pregnancy_reproductive,0.65,1.0
1,pain_general,0.6,0.9999
2,edema_swelling,0.6,0.9997
3,infections,0.7,0.9995
4,hypersensitivity_allergy,0.8,0.9994
5,injury_accident,0.7,0.9994
6,injection_site,0.75,0.9993
7,general_systemic,0.65,0.9993
8,dermatologic,0.55,0.999
9,psychiatric,0.8,0.9981


Saved thresholds table.


## Natijani haqiqatdan shundaymi yo'qmi shuni tekshirish

In [7]:
# “Doim 0” baseline (yomon chiqishi kerak)

# Agar model F1 0.99 bo‘lsa, doim 0 baseline F1 juda past bo‘ladi.

from sklearn.metrics import f1_score
Z = np.zeros_like(Yva)
print("VAL micro_f1 always-0:", f1_score(Yva, Z, average="micro", zero_division=0))

VAL micro_f1 always-0: 0.0


In [9]:
# 06 uchun: Xtr_text/Xva_text ni qayta yuklash (NO-LEAK splitdan)
from pathlib import Path
import pandas as pd
import re

# SPLIT papkangiz (aniq nom)
SPLIT_DIR = PROJECT_ROOT / "Data" / "Processed" / "splits_multilabel_noleakage"

TEXT_COL = "REAC_pt_symptom_v2"

df_tr_txt = pd.read_csv(SPLIT_DIR / "train.csv", usecols=[TEXT_COL], low_memory=False)
df_va_txt = pd.read_csv(SPLIT_DIR / "val.csv",   usecols=[TEXT_COL], low_memory=False)

_ws = re.compile(r"\s+")
_punct = re.compile(r"[,\t\r\n]+")

def normalize_text(s: str) -> str:
    s = "" if s is None else str(s)
    s = s.lower().strip()
    s = s.replace(";", " ")
    s = _punct.sub(" ", s)
    s = _ws.sub(" ", s)
    return s

Xtr_text = df_tr_txt[TEXT_COL].map(normalize_text).values
Xva_text = df_va_txt[TEXT_COL].map(normalize_text).values

print("Loaded texts:", Xtr_text.shape, Xva_text.shape)
print("Example:", Xtr_text[0])

Loaded texts: (201176,) (24410,)
Example: injection site reaction general physical health deterioration chest discomfort sensitivity to weather change fatigue dysphonia wheezing pain influenza productive cough nasopharyngitis weight decreased nasal congestion hypoventilation illness blood pressure decreased forced expiratory volume decreased hypersensitivity dyspnoea body temperature decreased


In [10]:
# B) Text’ni random shuffle qilsak (F1 qulab tushishi kerak)

# Agar data leakage bo‘lsa, shuffle qilsak ham yuqori qolib ketadi. Normal holatda esa qulaydi:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

rng = np.random.default_rng(42)

# TEXT'ni aralashtiramiz (Y o'sha qoladi)
Xtr_shuf = np.array(Xtr_text, dtype=object).copy()
rng.shuffle(Xtr_shuf)

tv = TfidfVectorizer(min_df=5, ngram_range=(1,2))
Xtr_tmp = tv.fit_transform(Xtr_shuf)
Xva_tmp = tv.transform(Xva_text)

m = OneVsRestClassifier(LogisticRegression(solver="liblinear", max_iter=200), n_jobs=1)
m.fit(Xtr_tmp, Ytr)

P = m.predict_proba(Xva_tmp)
pred = (P >= 0.5).astype(int)

print("VAL micro_f1 after shuffling TRAIN texts:", f1_score(Yva, pred, average="micro", zero_division=0))
print("P shapes:", Pva.shape, Pte.shape)
print("Prob range (val):", float(Pva.min()), float(Pva.max()))

# Agar bu keskin tushsa — demak bizning yuqori F1 haqiqiy signal (deterministic mapping) ekan.

VAL micro_f1 after shuffling TRAIN texts: 7.541335947663129e-05
P shapes: (24410, 21) (24164, 21)
Prob range (val): 1.9157989858011397e-05 1.0


In [None]:
#---------------------------------------------------------------------------------------------------------

In [11]:
#CELL 6 — Evaluate on TEST (micro/macro + per-label)
def apply_thresholds(P, thr):
    return (P >= thr.reshape(1, -1)).astype(int)

Yte_pred = apply_thresholds(Pte, best_thr)

micro_f1 = f1_score(Yte, Yte_pred, average="micro", zero_division=0)
macro_f1 = f1_score(Yte, Yte_pred, average="macro", zero_division=0)
micro_p  = precision_score(Yte, Yte_pred, average="micro", zero_division=0)
micro_r  = recall_score(Yte, Yte_pred, average="micro", zero_division=0)

print("TEST metrics:")
print(" micro_f1:", round(float(micro_f1), 4))
print(" macro_f1:", round(float(macro_f1), 4))
print(" micro_P :", round(float(micro_p),  4))
print(" micro_R :", round(float(micro_r),  4))

# Per-label report (table)
per_label = []
for j, lab in enumerate(y_cols):
    yj = Yte[:, j]
    pj = Yte_pred[:, j]
    per_label.append({
        "label": lab.replace("y_", "",1),
        "support": int(yj.sum()),
        "precision": float(precision_score(yj, pj, zero_division=0)),
        "recall": float(recall_score(yj, pj, zero_division=0)),
        "f1": float(f1_score(yj, pj, zero_division=0)),
        "threshold": float(best_thr[j]),
    })

per_label_df = pd.DataFrame(per_label).sort_values("f1", ascending=False).reset_index(drop=True)
display(per_label_df.head(25))

per_label_df.to_csv(RES_TABLES / "test_per_label_metrics.csv", index=False, encoding="utf-8-sig")

TEST metrics:
 micro_f1: 0.9973
 macro_f1: 0.9958
 micro_P : 0.996
 micro_R : 0.9986


Unnamed: 0,label,support,precision,recall,f1,threshold
0,edema_swelling,1790,0.999442,1.0,0.999721,0.6
1,pain_general,4196,0.99881,0.999762,0.999285,0.6
2,general_systemic,4077,0.998775,0.999509,0.999142,0.65
3,injury_accident,1875,0.998933,0.998933,0.998933,0.7
4,infections,3204,0.99844,0.999064,0.998752,0.7
5,psychiatric,1735,1.0,0.997118,0.998557,0.8
6,hypersensitivity_allergy,1964,0.997965,0.998982,0.998473,0.8
7,injection_site,1737,0.996556,0.999424,0.997988,0.75
8,dermatologic,5373,0.997212,0.998697,0.997954,0.55
9,neurological,4084,0.996581,0.999265,0.997922,0.6


In [12]:
# CELL 7 — Save model + thresholds + summary report
# Save model
model_path = MODEL_DIR / "baseline_ovr_logreg.joblib"
joblib.dump(log_model, model_path)

# Save thresholds (np)
thr_path = MODEL_DIR / "thresholds.npy"
np.save(thr_path, best_thr)

# Save summary json
summary = {
    "text_col": bundle.get("text_col"),
    "num_labels": len(y_cols),
    "labels": [c.replace("y_", "",1) for c in y_cols],
    "metrics_test": {
        "micro_f1": float(micro_f1),
        "macro_f1": float(macro_f1),
        "micro_precision": float(micro_p),
        "micro_recall": float(micro_r),
    },
    "artifacts": {
        "model": str(model_path),
        "thresholds": str(thr_path),
        "tfidf_bundle": str(PP_DIR / "tfidf_bundle.joblib"),
    }
}

(RES_REPORTS / "baseline_summary.json").write_text(json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8")
print("Saved model + thresholds + summary.")
print("Model:", model_path)

Saved model + thresholds + summary.
Model: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\baseline_models\baseline_ovr_logreg.joblib


In [13]:
# CELL 8 — (ixtiyoriy, lekin foydali) Top features per label (word TF-IDF tarafidan)

# Bu userga “nega shunday chiqdi?” deganda juda foydali bo‘ladi.

tfidf_word = bundle["tfidf_word"]
feat_word = tfidf_word.get_feature_names_out()

# combined X = [char, word], shuning uchun word koeff offseti:
n_char = len(bundle["tfidf_char"].get_feature_names_out())
coef = log_model.estimators_[0].coef_  # shape (1, n_features_total) for label0
print("n_char:", n_char, "total_features:", log_model.estimators_[0].coef_.shape[1])

def top_word_features_for_label(label_idx, top_k=15):
    est = log_model.estimators_[label_idx]
    w = est.coef_.ravel()
    w_word = w[n_char:]  # word part
    top = np.argsort(w_word)[::-1][:top_k]
    return pd.DataFrame({"term": feat_word[top], "coef": w_word[top].round(4)})

# misol: eng birinchi 3 label uchun
for j in range(min(3, len(y_cols))):
    print("\nLABEL:", y_cols[j].replace("y_", "",1))
    display(top_word_features_for_label(j, top_k=15))

n_char: 28926 total_features: 58921

LABEL: cardiovascular


Unnamed: 0,term,coef
0,hypertension,15.4809
1,thrombosis,13.2819
2,hypotension,11.9673
3,pressure increased,11.2289
4,cardiac disorder,10.9131
5,pulmonary embolism,10.8345
6,blood pressure,10.7718
7,palpitations,9.7944
8,cardiac failure,9.1217
9,pressure decreased,8.8572



LABEL: dermatologic


Unnamed: 0,term,coef
0,rash,22.1557
1,pruritus,13.8452
2,erythema,12.4661
3,dry skin,12.1093
4,dermatitis,11.767
5,acne,11.5835
6,psoriasis,11.1036
7,alopecia,10.9814
8,blister,10.3435
9,eczema,9.8778



LABEL: edema_swelling


Unnamed: 0,term,coef
0,oedema,19.8626
1,swelling,19.3845
2,fluid retention,8.7183
3,fluid,6.786
4,angioedema,5.754
5,site swelling,4.6104
6,retention,4.5057
7,peripheral swelling,3.8687
8,pulmonary oedema,3.751
9,joint swelling,3.5998


# OvR Calibrated LinearSVC

In [14]:
# B1) LinearSVC OvR (tez benchmark, pseudo-proba)

import time
import numpy as np
from scipy.special import expit  # sigmoid
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd
import joblib





# ===== Train =====
run_name = "baseline_ovr_linearsvc"
base_svc = LinearSVC(C=1.0, class_weight="balanced")

svc_model = OneVsRestClassifier(base_svc, n_jobs=1)  # Windows stable

t0 = time.time()
print("[B1] Training OneVsRest LinearSVC (n_jobs=1)...")
svc_model.fit(Xtr, Ytr)
print(f"[B1] Done. Train time: {(time.time()-t0)/60:.2f} min")






# ===== Scores -> Pseudo probabilities =====
Dva = svc_model.decision_function(Xva)   # (n_val, n_labels)
Dte = svc_model.decision_function(Xte)

Pva = expit(Dva)  # 0..1
Pte = expit(Dte)

print("P shapes:", Pva.shape, Pte.shape)
print("Pseudo-prob range (val):", float(Pva.min()), float(Pva.max()))




# ===== Threshold tuning (VAL) =====
thr_grid = np.linspace(0.05, 0.95, 19)
best_thr, best_f1 = tune_thresholds_f1(Yva, Pva, thresholds=thr_grid)

thr_df = pd.DataFrame({
    "label": [c.replace("y_", "", 1) for c in y_cols],
    "best_threshold": best_thr.round(3),
    "val_f1_at_best": best_f1.round(4),
}).sort_values("val_f1_at_best", ascending=False).reset_index(drop=True)

display(thr_df.head(25))
thr_df.to_csv(RES_TABLES / f"{run_name}_thresholds_val.csv", index=False, encoding="utf-8-sig")






# ===== TEST eval =====
Yte_pred = apply_thresholds(Pte, best_thr)

micro_f1 = f1_score(Yte, Yte_pred, average="micro", zero_division=0)
macro_f1 = f1_score(Yte, Yte_pred, average="macro", zero_division=0)
micro_p  = precision_score(Yte, Yte_pred, average="micro", zero_division=0)
micro_r  = recall_score(Yte, Yte_pred, average="micro", zero_division=0)

print("TEST metrics:")
print(" micro_f1:", round(float(micro_f1), 4))
print(" macro_f1:", round(float(macro_f1), 4))
print(" micro_P :", round(float(micro_p),  4))
print(" micro_R :", round(float(micro_r),  4))

per_label = []
for j, lab in enumerate(y_cols):
    yj = Yte[:, j]
    pj = Yte_pred[:, j]
    per_label.append({
        "label": lab.replace("y_", "", 1),
        "support": int(yj.sum()),
        "precision": float(precision_score(yj, pj, zero_division=0)),
        "recall": float(recall_score(yj, pj, zero_division=0)),
        "f1": float(f1_score(yj, pj, zero_division=0)),
        "threshold": float(best_thr[j]),
    })

per_label_df = pd.DataFrame(per_label).sort_values("f1", ascending=False).reset_index(drop=True)
display(per_label_df.head(25))
per_label_df.to_csv(RES_TABLES / f"{run_name}_per_label_test.csv", index=False, encoding="utf-8-sig")








# ===== Save =====
model_path = MODEL_DIR / f"{run_name}.joblib"
joblib.dump(svc_model, model_path)

thr_path = MODEL_DIR / f"{run_name}_thresholds.npy"
np.save(thr_path, best_thr)

summary = {
    "run": run_name,
    "model_type": "OneVsRest(LinearSVC)",
    "note": "Pseudo-probabilities used: sigmoid(decision_function).",
    "text_col": bundle.get("text_col"),
    "num_labels": len(y_cols),
    "labels": [c.replace("y_", "", 1) for c in y_cols],
    "metrics_test": {
        "micro_f1": float(micro_f1),
        "macro_f1": float(macro_f1),
        "micro_precision": float(micro_p),
        "micro_recall": float(micro_r),
    },
    "artifacts": {
        "model": str(model_path),
        "thresholds": str(thr_path),
        "tfidf_bundle": str(PP_DIR / "tfidf_bundle.joblib"),
        "thresholds_table": str(RES_TABLES / f"{run_name}_thresholds_val.csv"),
        "per_label_table": str(RES_TABLES / f"{run_name}_per_label_test.csv"),
    }
}

(RES_REPORTS / f"{run_name}_summary.json").write_text(
    json.dumps(summary, indent=2, ensure_ascii=False),
    encoding="utf-8"
)

print("Saved model + thresholds + summary:", model_path)

# [B1] Training OneVsRest LinearSVC (n_jobs=1)...
# [B1] Done. Train time: 4.54 min
# P shapes: (23626, 21) (23934, 21)
# Pseudo-prob range (val): 0.011812062362718957 0.9999999991832265

[B1] Training OneVsRest LinearSVC (n_jobs=1)...
[B1] Done. Train time: 3.72 min
P shapes: (24410, 21) (24164, 21)
Pseudo-prob range (val): 0.019009880053959493 0.999999999219225


Unnamed: 0,label,best_threshold,val_f1_at_best
0,edema_swelling,0.5,1.0
1,general_systemic,0.55,1.0
2,pain_general,0.5,1.0
3,pregnancy_reproductive,0.45,1.0
4,hypersensitivity_allergy,0.5,1.0
5,dermatologic,0.5,0.9999
6,neurological,0.5,0.9999
7,infections,0.55,0.9998
8,gastrointestinal,0.5,0.9997
9,injury_accident,0.45,0.9997


TEST metrics:
 micro_f1: 0.9995
 macro_f1: 0.9988
 micro_P : 0.9992
 micro_R : 0.9998


Unnamed: 0,label,support,precision,recall,f1,threshold
0,edema_swelling,1790,1.0,1.0,1.0,0.5
1,general_systemic,4077,1.0,1.0,1.0,0.55
2,musculoskeletal,1993,1.0,1.0,1.0,0.55
3,hypersensitivity_allergy,1964,1.0,1.0,1.0,0.5
4,infections,3204,1.0,1.0,1.0,0.55
5,renal,982,1.0,1.0,1.0,0.5
6,pain_general,4196,1.0,1.0,1.0,0.5
7,injury_accident,1875,1.0,1.0,1.0,0.45
8,dermatologic,5373,0.999628,0.999814,0.999721,0.5
9,psychiatric,1735,1.0,0.999424,0.999712,0.55


Saved model + thresholds + summary: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\baseline_models\baseline_ovr_linearsvc.joblib


# OvR Calibrated LinearSVC (haqiqiy predict_proba)

In [15]:
import time
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd
import joblib





run_name = "baseline_ovr_calibrated_linearsvc"

base_svc = LinearSVC(C=1.0, class_weight="balanced")

# cv=3 tezroq; method="sigmoid" odatda yetarli
cal_svc = CalibratedClassifierCV(base_svc, method="sigmoid", cv=3)

cal_svc_model = OneVsRestClassifier(cal_svc, n_jobs=1)

t0 = time.time()
print("[B2] Training OneVsRest Calibrated LinearSVC (n_jobs=1)...")
cal_svc_model.fit(Xtr, Ytr)
print(f"[B2] Done. Train time: {(time.time()-t0)/60:.2f} min")





Pva = cal_svc_model.predict_proba(Xva)
Pte = cal_svc_model.predict_proba(Xte)

print("P shapes:", Pva.shape, Pte.shape)
print("Prob range (val):", float(Pva.min()), float(Pva.max()))





thr_grid = np.linspace(0.05, 0.95, 19)
best_thr, best_f1 = tune_thresholds_f1(Yva, Pva, thresholds=thr_grid)

thr_df = pd.DataFrame({
    "label": [c.replace("y_", "", 1) for c in y_cols],
    "best_threshold": best_thr.round(3),
    "val_f1_at_best": best_f1.round(4),
}).sort_values("val_f1_at_best", ascending=False).reset_index(drop=True)

display(thr_df.head(25))
thr_df.to_csv(RES_TABLES / f"{run_name}_thresholds_val.csv", index=False, encoding="utf-8-sig")





Yte_pred = apply_thresholds(Pte, best_thr)

micro_f1 = f1_score(Yte, Yte_pred, average="micro", zero_division=0)
macro_f1 = f1_score(Yte, Yte_pred, average="macro", zero_division=0)
micro_p  = precision_score(Yte, Yte_pred, average="micro", zero_division=0)
micro_r  = recall_score(Yte, Yte_pred, average="micro", zero_division=0)

print("TEST metrics:")
print(" micro_f1:", round(float(micro_f1), 4))
print(" macro_f1:", round(float(macro_f1), 4))
print(" micro_P :", round(float(micro_p),  4))
print(" micro_R :", round(float(micro_r),  4))

per_label = []
for j, lab in enumerate(y_cols):
    yj = Yte[:, j]
    pj = Yte_pred[:, j]
    per_label.append({
        "label": lab.replace("y_", "", 1),
        "support": int(yj.sum()),
        "precision": float(precision_score(yj, pj, zero_division=0)),
        "recall": float(recall_score(yj, pj, zero_division=0)),
        "f1": float(f1_score(yj, pj, zero_division=0)),
        "threshold": float(best_thr[j]),
    })

per_label_df = pd.DataFrame(per_label).sort_values("f1", ascending=False).reset_index(drop=True)
display(per_label_df.head(25))
per_label_df.to_csv(RES_TABLES / f"{run_name}_per_label_test.csv", index=False, encoding="utf-8-sig")






model_path = MODEL_DIR / f"{run_name}.joblib"
joblib.dump(cal_svc_model, model_path)

thr_path = MODEL_DIR / f"{run_name}_thresholds.npy"
np.save(thr_path, best_thr)

summary = {
    "run": run_name,
    "model_type": "OneVsRest(CalibratedClassifierCV(LinearSVC))",
    "text_col": bundle.get("text_col"),
    "num_labels": len(y_cols),
    "labels": [c.replace("y_", "", 1) for c in y_cols],
    "metrics_test": {
        "micro_f1": float(micro_f1),
        "macro_f1": float(macro_f1),
        "micro_precision": float(micro_p),
        "micro_recall": float(micro_r),
    },
    "artifacts": {
        "model": str(model_path),
        "thresholds": str(thr_path),
        "tfidf_bundle": str(PP_DIR / "tfidf_bundle.joblib"),
        "thresholds_table": str(RES_TABLES / f"{run_name}_thresholds_val.csv"),
        "per_label_table": str(RES_TABLES / f"{run_name}_per_label_test.csv"),
    }
}

(RES_REPORTS / f"{run_name}_summary.json").write_text(
    json.dumps(summary, indent=2, ensure_ascii=False),
    encoding="utf-8"
)

print("Saved model + thresholds + summary:", model_path)

# [B2] Training OneVsRest Calibrated LinearSVC (n_jobs=1)...
# [B2] Done. Train time: 9.45 min
# P shapes: (23626, 21) (23934, 21)
# Prob range (val): 6.333557935411573e-13 1.0

[B2] Training OneVsRest Calibrated LinearSVC (n_jobs=1)...
[B2] Done. Train time: 8.82 min
P shapes: (24410, 21) (24164, 21)
Prob range (val): 3.9453748129733045e-11 1.0


Unnamed: 0,label,best_threshold,val_f1_at_best
0,edema_swelling,0.25,1.0
1,general_systemic,0.5,1.0
2,pain_general,0.25,1.0
3,injury_accident,0.25,1.0
4,hypersensitivity_allergy,0.4,1.0
5,pregnancy_reproductive,0.25,1.0
6,neurological,0.5,0.9999
7,dermatologic,0.35,0.9998
8,gastrointestinal,0.6,0.9998
9,infections,0.65,0.9998


TEST metrics:
 micro_f1: 0.9992
 macro_f1: 0.9984
 micro_P : 0.9989
 micro_R : 0.9995


Unnamed: 0,label,support,precision,recall,f1,threshold
0,edema_swelling,1790,1.0,1.0,1.0,0.25
1,renal,982,1.0,1.0,1.0,0.55
2,hypersensitivity_allergy,1964,1.0,1.0,1.0,0.4
3,pain_general,4196,0.999762,1.0,0.999881,0.25
4,infections,3204,0.999688,1.0,0.999844,0.65
5,general_systemic,4077,0.99951,1.0,0.999755,0.5
6,musculoskeletal,1993,1.0,0.999498,0.999749,0.7
7,injury_accident,1875,0.999467,1.0,0.999733,0.25
8,psychiatric,1735,1.0,0.999424,0.999712,0.5
9,dermatologic,5373,0.999442,0.999814,0.999628,0.35


Saved model + thresholds + summary: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\baseline_models\baseline_ovr_calibrated_linearsvc.joblib


# OvR SGD (log_loss) + (proba bo‘lmasa sigmoid fallback)

In [16]:
import time
import numpy as np
from scipy.special import expit
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
import pandas as pd
import joblib





run_name = "baseline_ovr_sgd_logloss"

base_sgd = SGDClassifier(
    loss="log_loss",
    alpha=1e-5,
    max_iter=2000,
    tol=1e-3,
    class_weight="balanced",
)

sgd_model = OneVsRestClassifier(base_sgd, n_jobs=1)

t0 = time.time()
print("[C] Training OneVsRest SGD (log_loss) (n_jobs=1)...")
sgd_model.fit(Xtr, Ytr)
print(f"[C] Done. Train time: {(time.time()-t0)/60:.2f} min")





# predict_proba bo'lsa ishlatamiz, bo'lmasa decision_function->sigmoid
if hasattr(sgd_model, "predict_proba"):
    try:
        Pva = sgd_model.predict_proba(Xva)
        Pte = sgd_model.predict_proba(Xte)
    except Exception:
        Dva = sgd_model.decision_function(Xva)
        Dte = sgd_model.decision_function(Xte)
        Pva = expit(Dva)
        Pte = expit(Dte)
else:
    Dva = sgd_model.decision_function(Xva)
    Dte = sgd_model.decision_function(Xte)
    Pva = expit(Dva)
    Pte = expit(Dte)

print("P shapes:", Pva.shape, Pte.shape)
print("Prob range (val):", float(Pva.min()), float(Pva.max()))






thr_grid = np.linspace(0.05, 0.95, 19)
best_thr, best_f1 = tune_thresholds_f1(Yva, Pva, thresholds=thr_grid)

thr_df = pd.DataFrame({
    "label": [c.replace("y_", "", 1) for c in y_cols],
    "best_threshold": best_thr.round(3),
    "val_f1_at_best": best_f1.round(4),
}).sort_values("val_f1_at_best", ascending=False).reset_index(drop=True)

display(thr_df.head(25))
thr_df.to_csv(RES_TABLES / f"{run_name}_thresholds_val.csv", index=False, encoding="utf-8-sig")






Yte_pred = apply_thresholds(Pte, best_thr)

micro_f1 = f1_score(Yte, Yte_pred, average="micro", zero_division=0)
macro_f1 = f1_score(Yte, Yte_pred, average="macro", zero_division=0)
micro_p  = precision_score(Yte, Yte_pred, average="micro", zero_division=0)
micro_r  = recall_score(Yte, Yte_pred, average="micro", zero_division=0)

print("TEST metrics:")
print(" micro_f1:", round(float(micro_f1), 4))
print(" macro_f1:", round(float(macro_f1), 4))
print(" micro_P :", round(float(micro_p),  4))
print(" micro_R :", round(float(micro_r),  4))

per_label = []
for j, lab in enumerate(y_cols):
    yj = Yte[:, j]
    pj = Yte_pred[:, j]
    per_label.append({
        "label": lab.replace("y_", "", 1),
        "support": int(yj.sum()),
        "precision": float(precision_score(yj, pj, zero_division=0)),
        "recall": float(recall_score(yj, pj, zero_division=0)),
        "f1": float(f1_score(yj, pj, zero_division=0)),
        "threshold": float(best_thr[j]),
    })

per_label_df = pd.DataFrame(per_label).sort_values("f1", ascending=False).reset_index(drop=True)
display(per_label_df.head(25))
per_label_df.to_csv(RES_TABLES / f"{run_name}_per_label_test.csv", index=False, encoding="utf-8-sig")







model_path = MODEL_DIR / f"{run_name}.joblib"
joblib.dump(sgd_model, model_path)

thr_path = MODEL_DIR / f"{run_name}_thresholds.npy"
np.save(thr_path, best_thr)

summary = {
    "run": run_name,
    "model_type": "OneVsRest(SGDClassifier log_loss)",
    "note": "If predict_proba not available, sigmoid(decision_function) used.",
    "text_col": bundle.get("text_col"),
    "num_labels": len(y_cols),
    "labels": [c.replace("y_", "", 1) for c in y_cols],
    "metrics_test": {
        "micro_f1": float(micro_f1),
        "macro_f1": float(macro_f1),
        "micro_precision": float(micro_p),
        "micro_recall": float(micro_r),
    },
    "artifacts": {
        "model": str(model_path),
        "thresholds": str(thr_path),
        "tfidf_bundle": str(PP_DIR / "tfidf_bundle.joblib"),
        "thresholds_table": str(RES_TABLES / f"{run_name}_thresholds_val.csv"),
        "per_label_table": str(RES_TABLES / f"{run_name}_per_label_test.csv"),
    }
}

(RES_REPORTS / f"{run_name}_summary.json").write_text(
    json.dumps(summary, indent=2, ensure_ascii=False),
    encoding="utf-8"
)

print("Saved model + thresholds + summary:", model_path)

# 56.1 sekund

[C] Training OneVsRest SGD (log_loss) (n_jobs=1)...
[C] Done. Train time: 0.69 min
P shapes: (24410, 21) (24164, 21)
Prob range (val): 1.901327531390059e-09 1.0


Unnamed: 0,label,best_threshold,val_f1_at_best
0,pain_general,0.7,0.9997
1,infections,0.7,0.9991
2,edema_swelling,0.75,0.9991
3,injection_site,0.7,0.999
4,general_systemic,0.6,0.9984
5,dermatologic,0.5,0.9982
6,injury_accident,0.65,0.998
7,gastrointestinal,0.6,0.9972
8,neurological,0.6,0.9963
9,hypersensitivity_allergy,0.8,0.9955


TEST metrics:
 micro_f1: 0.995
 macro_f1: 0.992
 micro_P : 0.9934
 micro_R : 0.9966


Unnamed: 0,label,support,precision,recall,f1,threshold
0,pain_general,4196,0.999047,0.999047,0.999047,0.7
1,edema_swelling,1790,0.998324,0.998324,0.998324,0.75
2,general_systemic,4077,0.997064,0.999509,0.998285,0.6
3,injury_accident,1875,0.996807,0.998933,0.997869,0.65
4,infections,3204,0.997815,0.997815,0.997815,0.7
5,injection_site,1737,0.994276,1.0,0.99713,0.7
6,neurological,4084,0.995361,0.998286,0.996822,0.6
7,dermatologic,5373,0.994809,0.998697,0.996749,0.5
8,hypersensitivity_allergy,1964,0.995929,0.996436,0.996182,0.8
9,gastrointestinal,4912,0.995727,0.996336,0.996031,0.6


Saved model + thresholds + summary: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\baseline_models\baseline_ovr_sgd_logloss.joblib
