In [1]:
from pathlib import Path
import os

# Current location: .../Notebooks
# Root lookup: searches upward for the folder that contains Data/ and Models/
def find_project_root(start: Path | None = None) -> Path:
    start = start or Path.cwd()
    for p in [start] + list(start.parents):
        if (p / "Data").exists() and (p / "Models").exists():
            return p
    # fallback: Data bor joy
    for p in [start] + list(start.parents):
        if (p / "Data").exists():
            return p
    return start

# Resolve the project root once and make it the working directory, so that
# relative paths such as Path("Data") / ... resolve from the project root.
PROJECT_ROOT = find_project_root()
os.chdir(PROJECT_ROOT)

# Echo both locations; after the chdir above they should be the same directory.
print("✅ PROJECT_ROOT =", PROJECT_ROOT.resolve())
print("✅ CWD          =", Path.cwd().resolve())

✅ PROJECT_ROOT = C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract
✅ CWD          = C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract


In [3]:
# 8 ta MIN new_object (offline predict uchun to‘g‘ri va yetarli)


import pandas as pd

# Eight minimal example cases. Each dict carries a primaryid and its
# semicolon-separated reaction text (REAC_pt_symptom_v2); a one-row DataFrame
# is built from every dict right after it.
new_object_01_MIN = {
  "primaryid": 260486321,
  "REAC_pt_symptom_v2": "Unintended pregnancy; Pregnancy with implant contraceptive"
}
new_object_01_MIN_df = pd.DataFrame([new_object_01_MIN])

new_object_02_MIN = {
  "primaryid": 259770711,
  "REAC_pt_symptom_v2": "Erythema; Swelling face"
}
new_object_02_MIN_df = pd.DataFrame([new_object_02_MIN])

new_object_03_MIN = {
  "primaryid": 260447931,
  "REAC_pt_symptom_v2": "Sinus tachycardia; Generalised oedema; Cardiac arrest; Hypotension; Pulse abnormal; Anaphylactic reaction; Pulseless electrical activity; Piloerection; PCO2 decreased"
}
new_object_03_MIN_df = pd.DataFrame([new_object_03_MIN])

new_object_04_MIN = {
  "primaryid": 254757433,
  "REAC_pt_symptom_v2": "Platelet count decreased; Renal impairment; Hyponatraemia; Pulmonary toxicity; Neutrophil count decreased; Sepsis"
}
new_object_04_MIN_df = pd.DataFrame([new_object_04_MIN])

new_object_05_MIN = {
  "primaryid": 258951304,
  "REAC_pt_symptom_v2": "Urticaria; Chest discomfort; Hyperhidrosis; Dyspnoea; Abdominal pain; Uterine contractions during pregnancy; Erythema"
}
new_object_05_MIN_df = pd.DataFrame([new_object_05_MIN])

new_object_06_MIN = {
  "primaryid": 253086112,
  "REAC_pt_symptom_v2": "Feeling abnormal; Dizziness; Blood pressure increased; Nervousness; Diarrhoea; Cough"
}
new_object_06_MIN_df = pd.DataFrame([new_object_06_MIN])

new_object_07_MIN = {
  "primaryid": 258673121,
  "REAC_pt_symptom_v2": "Anxiety; Psoriatic arthropathy; Injection site vesicles; Illness; Hypoacusis; Eczema; Injection site pain; Psoriasis; Headache"
}
new_object_07_MIN_df = pd.DataFrame([new_object_07_MIN])

new_object_08_MIN = {
  "primaryid": 251222438,
  "REAC_pt_symptom_v2": "Illness; Anxiety; Peripheral swelling; Joint swelling; Pruritus; Pain; Chest pain; Headache; Decreased appetite; Pyrexia; Infection; Coagulopathy; Swelling; Chest discomfort"
}
new_object_08_MIN_df = pd.DataFrame([new_object_08_MIN])

In [2]:
# We follow the “new_object (unseen) → predict → predict_proba/confidence → model reliability” flow, adapted because FAERS is a multilabel problem:

# pred = several labels (out of 21)

# predict_proba = a (1, n_labels) score/probability matrix

# confidence = reduced to a single number: max(score) or top-k.


import json
import numpy as np
import pandas as pd
from pathlib import Path
from tabulate import tabulate
import joblib
from scipy import sparse
from sklearn.metrics import brier_score_loss

# =========================================================
# 0) CONFIG (the paths described by the author)
# =========================================================
PROJECT_ROOT = Path.cwd()  # the notebook is expected to run with the project root as CWD

FE_VERSION = "fe_v1"                 # tfidf_vectorizer.joblib lives in Data/Engineered_data/<FE_VERSION>
FS_VERSION = "fe_v1_fs_chi2_v1"      # Data/Feature_Selected/<FS_VERSION> holds feature_selector.joblib and possibly X_test/Y_test

TEXT_COL = "REAC_pt_symptom"         # name of the text column; adjust to your data

VECT_PATH = PROJECT_ROOT / "Data" / "Engineered_data" / FE_VERSION / "tfidf_vectorizer.joblib"
SEL_PATH  = PROJECT_ROOT / "Data" / "Feature_Selected" / FS_VERSION / "feature_selector.joblib"

MODEL_DIR = PROJECT_ROOT / "Models" / "best_model" / "optuna_logreg_best"
MODEL_PATH = MODEL_DIR / "optuna_logreg_best.joblib"
THR_PATH   = MODEL_DIR / "optuna_logreg_best_thresholds.json"

# Metadata that supplies the label order (the first candidate that exists is read)
META_CANDIDATES = [
    PROJECT_ROOT / "Data" / "Feature_Selected" / FS_VERSION / "engineered_meta.json",
    PROJECT_ROOT / "Data" / "Engineered_data" / FE_VERSION / "meta.json",
]

# Test-set artifacts for the reliability section (used only if present)
X_TEST_PATH = PROJECT_ROOT / "Data" / "Feature_Selected" / FS_VERSION / "X_test.npz"
Y_TEST_PATH = PROJECT_ROOT / "Data" / "Feature_Selected" / FS_VERSION / "Y_test.npy"

# =========================================================
# 1) LOAD ARTEFACTS
# =========================================================
# Fail early, naming the missing file, before any artifact is loaded.
assert VECT_PATH.exists(), f"Topilmadi: {VECT_PATH}"
assert SEL_PATH.exists(),  f"Topilmadi: {SEL_PATH}"
assert MODEL_PATH.exists(), f"Topilmadi: {MODEL_PATH}"
assert THR_PATH.exists(),   f"Topilmadi: {THR_PATH}"




# =========================================================
# FIX: joblib load uchun custom functionlar (meta_to_sparse) ni avval e'lon qilamiz
# =========================================================
import re
import numpy as np
from scipy import sparse
from sklearn.preprocessing import FunctionTransformer

# Splits a reaction string on semicolons, swallowing whitespace around them.
_term_split = re.compile(r"\s*;\s*")

def meta_features(texts: list[str]) -> np.ndarray:
    """Compute three numeric meta-features for each reaction text.

    For every input the columns are, in order:
      0. ``log1p`` of the stripped text's character length,
      1. the number of non-empty semicolon-separated terms,
      2. the number of distinct terms (compared case-insensitively).

    Falsy inputs (``None`` or ``""``) are treated as empty text and yield an
    all-zero row.

    Args:
        texts: Reaction strings, one per case.

    Returns:
        A float32 array of shape ``(len(texts), 3)``.
    """
    per_text = []
    for raw in texts:
        cleaned = (raw or "").strip()
        if cleaned:
            terms = [piece.strip().lower() for piece in _term_split.split(cleaned) if piece.strip()]
            per_text.append((len(cleaned), len(terms), len(set(terms))))
        else:
            per_text.append((len(cleaned), 0, 0))

    # reshape keeps the (0, 3) shape for an empty input list.
    feats = np.asarray(per_text, dtype=np.float32).reshape(-1, 3)
    # Only the length column is log-scaled; the counts stay raw.
    feats[:, 0] = np.log1p(feats[:, 0])
    return feats

def meta_to_sparse(texts):
    """Return the meta-features of ``texts`` as a SciPy CSR matrix.

    ``texts`` may be any iterable of strings; it is materialised into a list
    before being handed to ``meta_features``.
    """
    dense_feats = meta_features([*texts])
    as_csr = sparse.csr_matrix(dense_feats)
    return as_csr

# NOTE: offline prediction does not strictly need to reuse meta_transformer,
# but meta_to_sparse must exist in this module for joblib.load to succeed.
meta_transformer = FunctionTransformer(meta_to_sparse, validate=False)




# NOTE(review): joblib.load unpickles its input; only load artifacts you produced or trust.
featurizer = joblib.load(VECT_PATH)          # per the author's note, a FeatureUnion featurizer
selector_payload = joblib.load(SEL_PATH)     # author-created payload (mask/selected_idx)
model = joblib.load(MODEL_PATH)

with open(THR_PATH, "r", encoding="utf-8") as f:
    thr_dict = json.load(f)

# Feature selector: boolean mask plus selected indices (derived from the mask when absent)
if isinstance(selector_payload, dict) and "mask" in selector_payload:
    mask = np.array(selector_payload["mask"], dtype=bool)
    selected_idx = np.array(selector_payload.get("selected_idx", np.where(mask)[0]), dtype=int)
else:
    raise ValueError("feature_selector.joblib ichida 'mask' yo‘q. Siz payloadni mask bilan saqlagan bo‘lishingiz kerak.")

# Label order: read the first metadata candidate that exists
meta = None
for mp in META_CANDIDATES:
    if mp.exists():
        with open(mp, "r", encoding="utf-8") as f:
            meta = json.load(f)
        break
assert meta is not None, f"Meta topilmadi. Kandidatlar: {META_CANDIDATES}"

y_cols = meta["y_cols"]
# Strip the first "y_" occurrence from each target column to get the label name.
label_names = [c.replace("y_", "", 1) for c in y_cols]
# Per-label thresholds aligned to label_names; a label missing from the JSON silently gets 0.5.
thr = np.array([float(thr_dict.get(name, 0.5)) for name in label_names], dtype=float)

print("Loaded:")
print(" - featurizer:", VECT_PATH)
print(" - selector  :", SEL_PATH)
print(" - model     :", MODEL_PATH)
print(" - thresholds:", THR_PATH)
print("labels:", len(label_names))

# =========================================================
# 2) OFFLINE UNSEEN OBJECT (SIZ AYTGAN USLUB)
# =========================================================
# One-row frame holding the case to score: its primaryid and the raw reaction text.
new_object = pd.DataFrame({
    "primaryid": [260447931],
    TEXT_COL: ["Sinus tachycardia; Generalised oedema; Cardiac arrest; Hypotension; Pulse abnormal; Anaphylactic reaction; Pulseless electrical activity; Piloerection; PCO2 decreased;"]   # <<< put your own unseen text here
})

# abdominal pain; nausea; vomiting; headache

# --- Transform: text -> X_full -> select features -> X_fs ---
def safe_transform(featurizer, series_or_df):
    """Run ``featurizer.transform`` on the input, retrying with a flat array.

    Some pipelines accept a Series while others want a DataFrame or array, so
    the input is tried as given first. If that raises any ``Exception``, the
    input is converted to a 1-D NumPy array and the transform is tried again;
    an error from the second attempt propagates to the caller.
    """
    try:
        transformed = featurizer.transform(series_or_df)
    except Exception:
        # Fallback: flatten to a 1-D numpy array and try once more.
        flattened = np.asarray(series_or_df).reshape(-1,)
        transformed = featurizer.transform(flattened)
    return transformed

# Featurize the text column, then keep only the selected feature columns.
X_full = safe_transform(featurizer, new_object[TEXT_COL])
X_fs = X_full[:, mask]  # or X_full[:, selected_idx]

# --- Predict proba/score ---
if hasattr(model, "predict_proba"):
    scores = model.predict_proba(X_fs)
    scores = np.asarray(scores)
else:
    # If predict_proba is unavailable, fall back to decision_function.
    # NOTE(review): thr is compared against these scores below; confirm the
    # thresholds are on the same scale as decision_function output.
    scores = np.asarray(model.decision_function(X_fs))

scores = scores.reshape(1, -1)  # (1, n_labels) — assumes one score per label; TODO confirm for this model type

# --- Thresholds -> predicted labels ---
pred = (scores >= thr.reshape(1, -1)).astype(int)[0]
pred_labels = [label_names[i] for i,v in enumerate(pred) if v == 1]

# --- "Confidence" analogues ---
overall_confidence = float(scores.max())  # analogue of max(proba)
topk_idx = np.argsort(scores[0])[::-1][:10]  # indices of the 10 highest scores, descending
top10 = [(label_names[i], float(scores[0, i]), float(thr[i])) for i in topk_idx]

print("\n--- Offline (Unseen) FAERS Case ---")
print("primaryid:", new_object["primaryid"][0])
print("Predicted labels:", "; ".join(pred_labels) if pred_labels else "(none)")
print(f"Overall confidence (max score): {overall_confidence:.3f}")

print("\nTop-10 label scores (score vs threshold):")
print(tabulate(top10, headers=["label", "score", "thr"], tablefmt="github", floatfmt=".4f"))

# =========================================================
# 3) MODEL RELIABILITY (TEST set) — sizdagi test_accuracy analogi
# =========================================================
# Held-out evaluation; runs only when both saved test artifacts exist on disk.
if X_TEST_PATH.exists() and Y_TEST_PATH.exists():
    X_test = sparse.load_npz(X_TEST_PATH).tocsr()
    Y_test = np.load(Y_TEST_PATH)
    # NOTE(review): assumes Y_test is a 2-D 0/1 array whose columns follow label_names order — confirm.

    # Score every test row, preferring probabilities when the model offers them.
    if hasattr(model, "predict_proba"):
        s_test = np.asarray(model.predict_proba(X_test))
    else:
        s_test = np.asarray(model.decision_function(X_test))
    s_test = s_test.reshape(X_test.shape[0], -1)

    # Apply the per-label thresholds to get 0/1 predictions.
    Y_pred_test = (s_test >= thr.reshape(1, -1)).astype(int)

    # Micro-averaged precision/recall/F1: counts pooled over all labels (minimal implementation)
    tp = int(((Y_test == 1) & (Y_pred_test == 1)).sum())
    fp = int(((Y_test == 0) & (Y_pred_test == 1)).sum())
    fn = int(((Y_test == 1) & (Y_pred_test == 0)).sum())

    # Each ratio falls back to 0.0 when its denominator is zero.
    micro_prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    micro_rec  = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    micro_f1   = (2*micro_prec*micro_rec/(micro_prec+micro_rec)) if (micro_prec+micro_rec) > 0 else 0.0

    # Macro F1: F1 computed per label column, then averaged with equal weight.
    f1s = []
    for j in range(Y_test.shape[1]):
        y = Y_test[:, j]
        p = Y_pred_test[:, j]
        tpj = int(((y == 1) & (p == 1)).sum())
        fpj = int(((y == 0) & (p == 1)).sum())
        fnj = int(((y == 1) & (p == 0)).sum())
        precj = tpj / (tpj + fpj) if (tpj + fpj) > 0 else 0.0
        recj  = tpj / (tpj + fnj) if (tpj + fnj) > 0 else 0.0
        f1j   = (2*precj*recj/(precj+recj)) if (precj+recj) > 0 else 0.0
        f1s.append(f1j)
    macro_f1 = float(np.mean(f1s))

    # Brier score per label, averaged over labels.
    # NOTE(review): if the decision_function branch above was taken, s_test may
    # not hold probabilities and this score may not be meaningful — confirm.
    briers = []
    for j in range(Y_test.shape[1]):
        briers.append(brier_score_loss(Y_test[:, j], s_test[:, j]))
    mean_brier = float(np.mean(briers))

    print("\n--- Model TEST Reliability ---")
    print(f"micro_precision: {micro_prec:.4f}")
    print(f"micro_recall   : {micro_rec:.4f}")
    print(f"micro_f1       : {micro_f1:.4f}")
    print(f"macro_f1       : {macro_f1:.4f}")
    print(f"mean_brier     : {mean_brier:.4f}   (past bo‘lsa yaxshi)")

else:
    # Report which artifact paths were checked so the skip is easy to diagnose.
    print("\n(TEST reliability skipped) X_test.npz yoki Y_test.npy topilmadi:")
    print(" -", X_TEST_PATH)
    print(" -", Y_TEST_PATH)

Loaded:
 - featurizer: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Engineered_data\fe_v1\tfidf_vectorizer.joblib
 - selector  : c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Feature_Selected\fe_v1_fs_chi2_v1\feature_selector.joblib
 - model     : c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\best_model\optuna_logreg_best\optuna_logreg_best.joblib
 - thresholds: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Models\best_model\optuna_logreg_best\optuna_logreg_best_thresholds.json
labels: 21

--- Offline (Unseen) FAERS Case ---
primaryid: 260447931
Predicted labels: cardiovascular; edema_swelling; hypersensitivity_allergy
Overall confidence (max score): 1.000

Top-10 label scores (score vs threshold):
| label                    |   score |    thr |
|--------------------------|--

In [3]:
import pandas as pd
from pathlib import Path

# Relative path: resolved against the current working directory.
CSV_PATH = Path("Data") / "Raw_data" / "faers_25Q4_targets_multilabel_v2.csv"
assert CSV_PATH.exists(), f"CSV topilmadi: {CSV_PATH.resolve()}"

TEXT_COL = "REAC_pt_symptom_v2"

# Read only the columns that are needed (keeps the frame light)
df_raw = pd.read_csv(CSV_PATH, usecols=["primaryid", TEXT_COL, "y_labels"])

# Normalise primaryid: unparseable values become missing, stored as nullable Int64
df_raw["primaryid"] = pd.to_numeric(df_raw["primaryid"], errors="coerce").astype("Int64")

print("Loaded:", CSV_PATH.resolve())
print("df_raw:", df_raw.shape)
df_raw.head(3)

Loaded: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\faers_25Q4_targets_multilabel_v2.csv
df_raw: (308959, 3)


Unnamed: 0,primaryid,REAC_pt_symptom_v2,y_labels
0,100324053,Meningitis pneumococcal,
1,1012809821,Injection site reaction; General physical heal...,cardiovascular; respiratory; general_systemic;...
2,101406268,Internal haemorrhage; Injury; Pain; Depression...,hematologic; pain_general; infections; psychia...


In [5]:
# Index the raw table by primaryid for direct lookups. The guard keeps this
# cell re-runnable: once "primaryid" has become the index it is no longer a
# column, so an unconditional set_index would raise KeyError on a second run.
if df_raw.index.name != "primaryid":
    df_raw = df_raw.set_index("primaryid")

# Look up a single case and show its ground-truth labels and reaction text.
pid = 260447931
if pid in df_raw.index:
    row = df_raw.loc[pid]
    print("TRUE y_labels:", row["y_labels"])
    print("TEXT:", row["REAC_pt_symptom_v2"])
else:
    print("Topilmadi:", pid)

TRUE y_labels: cardiovascular; edema_swelling; hypersensitivity_allergy
TEXT: Sinus tachycardia; Generalised oedema; Cardiac arrest; Hypotension; Pulse abnormal; Anaphylactic reaction; Pulseless electrical activity; Piloerection; PCO2 decreased


In [None]:
# Run an offline prediction for this pid's text and print it next to the true labels

pid = 260447931
text = df_raw.loc[pid, "REAC_pt_symptom_v2"]
true = df_raw.loc[pid, "y_labels"]

# --- OFFLINE PREDICT ---
# Featurize the single text, keep the selected feature columns, then score.
X_full = featurizer.transform(pd.Series([text]))
X_fs = X_full[:, mask]

scores = np.asarray(model.predict_proba(X_fs)).reshape(1, -1)
# A label is predicted when its score reaches that label's threshold.
pred = (scores >= thr.reshape(1, -1)).astype(int)[0]
pred_labels = [label_names[i] for i,v in enumerate(pred) if v == 1]

print("PID:", pid)
print("TEXT:", text)
print("\nTRUE y_labels:", true)
print("PRED labels   :", "; ".join(pred_labels) if pred_labels else "(none)")


# Shunda siz aniq ko‘rishimiz mumkin:
# TRUE: cardiovascular; edema_swelling; hypersensitivity_allergy
# PRED: model nima chiqarganini

PID: 260447931
TEXT: Sinus tachycardia; Generalised oedema; Cardiac arrest; Hypotension; Pulse abnormal; Anaphylactic reaction; Pulseless electrical activity; Piloerection; PCO2 decreased

TRUE y_labels: cardiovascular; edema_swelling; hypersensitivity_allergy
PRED labels   : cardiovascular; edema_swelling; hypersensitivity_allergy


In [9]:
# Same single-case prediction as before, extended with a top-10 score table.
pid = 260447931

text = df_raw.loc[pid, "REAC_pt_symptom_v2"]
true = df_raw.loc[pid, "y_labels"]

# --- OFFLINE PREDICT ---
X_full = featurizer.transform(pd.Series([text]))
X_fs = X_full[:, mask]

scores = np.asarray(model.predict_proba(X_fs)).reshape(1, -1)

# A label is predicted when its score reaches that label's threshold.
pred = (scores >= thr.reshape(1, -1)).astype(int)[0]
pred_labels = [label_names[i] for i, v in enumerate(pred) if v == 1]

# top-k: indices of the 10 highest scores, in descending order
topk = np.argsort(scores[0])[::-1][:10]
top10 = [(label_names[i], float(scores[0, i]), float(thr[i])) for i in topk]

print("PID:", pid)
print("TEXT:", text)
print("\nTRUE y_labels:", true)
print("PRED labels   :", "; ".join(pred_labels) if pred_labels else "(none)")

from tabulate import tabulate
print("\nTop-10 scores (score vs thr):")
print(tabulate(top10, headers=["label","score","thr"], tablefmt="github", floatfmt=".4f"))

PID: 260447931
TEXT: Sinus tachycardia; Generalised oedema; Cardiac arrest; Hypotension; Pulse abnormal; Anaphylactic reaction; Pulseless electrical activity; Piloerection; PCO2 decreased

TRUE y_labels: cardiovascular; edema_swelling; hypersensitivity_allergy
PRED labels   : cardiovascular; edema_swelling; hypersensitivity_allergy

Top-10 scores (score vs thr):
| label                    |   score |    thr |
|--------------------------|---------|--------|
| cardiovascular           |  1.0000 | 0.9551 |
| edema_swelling           |  0.9963 | 0.9412 |
| hypersensitivity_allergy |  0.9954 | 0.9560 |
| general_systemic         |  0.0621 | 0.4194 |
| neurological             |  0.0499 | 0.9505 |
| respiratory              |  0.0341 | 0.9635 |
| infections               |  0.0277 | 0.1207 |
| dermatologic             |  0.0238 | 0.1496 |
| psychiatric              |  0.0236 | 0.9707 |
| gastrointestinal         |  0.0177 | 0.1527 |


In [None]:
# One more possibility: primaryid may be duplicated

# If a primaryid occurs in several rows, df_raw.loc[pid] can return a DataFrame after df_raw.set_index("primaryid").

# row["y_labels"] worked earlier, so it is probably coming back as a Series (i.e. the id is unique).

# To make sure, check the type:

pid = 260447931
hit = df_raw.loc[pid]
print(type(hit))

# pandas.core.series.Series → unique ✅

# pandas.core.frame.DataFrame → duplicated; in that case pick one row with hit.iloc[0]

<class 'pandas.Series'>


In [None]:
# bu 8 ta primaryid ro‘yxatingiz bo‘yicha loop qilib, har biriga TRUE/PRED/top5 chiqaradi

# =========================================================
# LOOP: 8 ta primaryid uchun TRUE vs PRED + Top5 scores
# (shart: df_raw index=primaryid bo‘lsin, va featurizer/mask/model/thr/label_names loaded bo‘lsin)
# =========================================================
import numpy as np
import pandas as pd
from tabulate import tabulate

# Column holding the reaction text in df_raw.
TEXT_COL = "REAC_pt_symptom_v2"

# The eight primaryids to report on.
pids = [
    260486321,
    259770711,
    260447931,
    254757433,
    258951304,
    253086112,
    258673121,
    251222438,
]

def split_labels(s):
    """Parse a ``;``-separated label string into a set of label names.

    ``None`` and float NaN yield an empty set. Any other value is converted
    with ``str``; each piece is stripped and empty pieces are dropped.
    """
    is_missing = s is None or (isinstance(s, float) and np.isnan(s))
    if is_missing:
        return set()
    pieces = (part.strip() for part in str(s).split(";"))
    return {piece for piece in pieces if piece}

# Build one summary record per primaryid: true vs. predicted label sets and the top-5 scores.
rows = []
for pid in pids:
    # Ids absent from df_raw get a placeholder record so the report still lists them.
    if pid not in df_raw.index:
        rows.append({
            "primaryid": pid,
            "status": "NOT_FOUND",
            "true_n": None,
            "pred_n": None,
            "overlap_n": None,
            "missing_true": None,
            "extra_pred": None,
            "top5": None,
        })
        continue

    text = df_raw.loc[pid, TEXT_COL]
    true_str = df_raw.loc[pid, "y_labels"] if "y_labels" in df_raw.columns else None
    true_set = split_labels(true_str)

    # --- predict ---
    X_full = featurizer.transform(pd.Series([text]))
    X_fs = X_full[:, mask]
    scores = np.asarray(model.predict_proba(X_fs)).reshape(1, -1)

    # A label is predicted when its score reaches that label's threshold.
    pred = (scores >= thr.reshape(1, -1)).astype(int)[0]
    pred_labels = [label_names[i] for i, v in enumerate(pred) if v == 1]
    pred_set = set(pred_labels)

    # Set differences: labels found by both, true labels the model missed, predictions not in the truth.
    overlap = true_set & pred_set
    missing = true_set - pred_set
    extra = pred_set - true_set

    # top5: the five highest-scoring labels, formatted as name:score(thr=...)
    topk = np.argsort(scores[0])[::-1][:5]
    top5 = "; ".join([f"{label_names[i]}:{scores[0,i]:.3f}(thr={thr[i]:.3f})" for i in topk])

    rows.append({
        "primaryid": pid,
        "status": "OK",
        "true_n": len(true_set),
        "pred_n": len(pred_set),
        "overlap_n": len(overlap),
        "missing_true": "; ".join(sorted(list(missing))) if missing else "",
        "extra_pred": "; ".join(sorted(list(extra))) if extra else "",
        "top5": top5,
    })

df_report = pd.DataFrame(rows)

print(tabulate(df_report, headers="keys", tablefmt="github", showindex=False))

|   primaryid | status   |   true_n |   pred_n |   overlap_n | missing_true   | extra_pred   | top5                                                                                                                                                                          |
|-------------|----------|----------|----------|-------------|----------------|--------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|   260486321 | OK       |        1 |        1 |           1 |                |              | pregnancy_reproductive:1.000(thr=0.994); gastrointestinal:0.003(thr=0.153); injury_accident:0.002(thr=0.082); hematologic:0.002(thr=0.099); neurological:0.002(thr=0.950)     |
|   259770711 | OK       |        2 |        2 |           2 |                |              | edema_swelling:1.000(thr=0.941); dermatologic:1.000(thr=0.150); general_systemic:0.000(thr=0

In [11]:
# primaryid uchun alohida qilib:

# TEXT

# TRUE y_labels

# PRED labels

# Top10 scores
# ni ham “tagma-tag” chiqaradigan (ko‘proq detallangan) varianti


# =========================================================
# DETAILED PRINT: 8 ta primaryid uchun tagma-tag:
# - TEXT
# - TRUE y_labels
# - PRED labels
# - Missing/Extra
# - Top10 scores (score vs thr)
# (shart: df_raw index=primaryid bo‘lsin, va featurizer/mask/model/thr/label_names loaded bo‘lsin)
# =========================================================
import numpy as np
import pandas as pd
from tabulate import tabulate

# Column holding the reaction text in df_raw.
TEXT_COL = "REAC_pt_symptom_v2"

# The eight primaryids to print in detail.
pids = [
    260486321,
    259770711,
    260447931,
    254757433,
    258951304,
    253086112,
    258673121,
    251222438,
]

def split_labels(s):
    """Turn a semicolon-delimited label string into a set of names.

    Missing input (``None`` or a float NaN) gives an empty set; other values
    are stringified, split on ``;``, stripped, and blank entries discarded.
    """
    if s is None:
        return set()
    if isinstance(s, float) and np.isnan(s):
        return set()
    trimmed = map(str.strip, str(s).split(";"))
    return {name for name in trimmed if name}

def predict_one(text: str):
    """Score one reaction text and return its predicted labels and top-10 table.

    Uses the notebook-level ``featurizer``, ``mask``, ``model``, ``thr`` and
    ``label_names``.

    Args:
        text: Reaction text for a single case.

    Returns:
        ``(pred_labels, top10)`` where ``pred_labels`` lists the labels whose
        score reaches their threshold (in ``label_names`` order) and ``top10``
        is a list of ``(label, score, threshold)`` tuples for the ten highest
        scores, highest first.
    """
    selected = featurizer.transform(pd.Series([text]))[:, mask]
    label_scores = np.asarray(model.predict_proba(selected)).reshape(1, -1)[0]

    reaches_thr = label_scores >= thr
    pred_labels = [label_names[k] for k in np.flatnonzero(reaches_thr)]

    ranked = np.argsort(label_scores)[::-1]
    top10 = [
        (label_names[k], float(label_scores[k]), float(thr[k]))
        for k in ranked[:10]
    ]
    return pred_labels, top10

# Print a detailed section per primaryid: text, true labels, predictions, diff, top-10 scores.
for pid in pids:
    print("\n" + "=" * 110)
    print(f"PRIMARYID: {pid}")
    print("=" * 110)

    # Skip ids that are not present in the raw table.
    if pid not in df_raw.index:
        print("❌ NOT FOUND in df_raw")
        continue

    text = str(df_raw.loc[pid, TEXT_COL])
    true_str = df_raw.loc[pid, "y_labels"] if "y_labels" in df_raw.columns else None
    true_set = split_labels(true_str)

    pred_labels, top10 = predict_one(text)
    pred_set = set(pred_labels)

    # True labels the model missed, and predicted labels that are not in the truth.
    missing = sorted(list(true_set - pred_set))
    extra = sorted(list(pred_set - true_set))

    print("\nTEXT:")
    print(text)

    print("\nTRUE y_labels:")
    print(true_str if true_str is not None else "(no y_labels column)")

    print("\nPRED labels:")
    print("; ".join(pred_labels) if pred_labels else "(none)")

    print("\nDIFF:")
    print(" - missing_true:", "; ".join(missing) if missing else "(none)")
    print(" - extra_pred  :", "; ".join(extra) if extra else "(none)")

    print("\nTop-10 scores (score vs thr):")
    print(tabulate(top10, headers=["label", "score", "thr"], tablefmt="github", floatfmt=".4f"))


PRIMARYID: 260486321

TEXT:
Unintended pregnancy; Pregnancy with implant contraceptive

TRUE y_labels:
pregnancy_reproductive

PRED labels:
pregnancy_reproductive

DIFF:
 - missing_true: (none)
 - extra_pred  : (none)

Top-10 scores (score vs thr):
| label                  |   score |    thr |
|------------------------|---------|--------|
| pregnancy_reproductive |  1.0000 | 0.9939 |
| gastrointestinal       |  0.0029 | 0.1527 |
| injury_accident        |  0.0024 | 0.0816 |
| hematologic            |  0.0024 | 0.0985 |
| neurological           |  0.0022 | 0.9505 |
| general_systemic       |  0.0022 | 0.4194 |
| pain_general           |  0.0020 | 0.9467 |
| infections             |  0.0011 | 0.1207 |
| dermatologic           |  0.0011 | 0.1496 |
| psychiatric            |  0.0011 | 0.9707 |

PRIMARYID: 259770711

TEXT:
Erythema; Swelling face

TRUE y_labels:
dermatologic; edema_swelling

PRED labels:
dermatologic; edema_swelling

DIFF:
 - missing_true: (none)
 - extra_pred  : (none)

T

In [12]:
# u CSV’dan **“eng qiyin 30 ta case”**ni topadi (score threshold’ga eng yaqin bo‘lganlar), keyin shu 30 tadan 8 ta tanlab beradi:

# primaryid (to‘g‘ri, CSV ichidan)

# TEXT (REAC_pt_symptom_v2)

# TRUE y_labels

# PRED labels

# overall_margin / pos_margin / neg_margin

# top5 scores (score vs thr)

# va oxirida 8 ta new_object_MIN ni tayyor qilib chiqaradi.

# Shart: sizda oldindan featurizer, mask, model, thr, label_names yuklangan bo‘lsin (sizning offline testing cell’ingizdagi artefaktlar).

# =========================================================
# HARD CASE MINER (CSV -> top30 hardest -> pick 8 -> show + new_objects)
# =========================================================
import numpy as np
import pandas as pd
from pathlib import Path
from tabulate import tabulate
import heapq

# --- REQUIREMENTS CHECK ---
# These names must already exist in the notebook namespace (set by the artifact-loading cell).
need = ["featurizer", "mask", "model", "thr", "label_names"]
missing = [v for v in need if v not in globals()]
assert not missing, f"Avval bularni load qiling: {missing}"

CSV_PATH = Path("Data") / "Raw_data" / "faers_25Q4_targets_multilabel_v2.csv"
assert CSV_PATH.exists(), f"CSV topilmadi: {CSV_PATH.resolve()}"

TEXT_COL = "REAC_pt_symptom_v2"
Y_COL = "y_labels"
PID_COL = "primaryid"

# label name -> index
label2idx = {name: i for i, name in enumerate(label_names)}

KEEP_TOP = 30     # how many hardest cases to retain
CHUNKSIZE = 2000  # rows per CSV chunk; change to e.g. 1000/5000 if needed
MAX_CHUNKS = None # None = scan the whole CSV; set e.g. 50 to stop early

def split_labels(s):
    """Parse a ``;``-separated label string into an ordered list of names.

    ``None`` and float NaN give an empty list. Otherwise the value is
    stringified and split on ``;``; pieces are stripped and blanks dropped.
    Order and duplicates are preserved.
    """
    if s is None:
        return []
    if isinstance(s, float) and np.isnan(s):
        return []
    trimmed = map(str.strip, str(s).split(";"))
    return [name for name in trimmed if name]

import itertools

# Bounded heap of the hardest (smallest overall_margin) records seen so far.
# heapq is a min-heap, so the margin is stored negated: heap[0] is then the
# record with the LARGEST margin, i.e. the first one to evict.
# item: ((-overall_margin, insertion_seq), record_dict)
hard_heap = []
# Monotonic tie-breaker. Without it, two records with an equal margin would
# make heapq fall through to comparing the record dicts and raise TypeError.
_hard_seq = itertools.count()

def push_hard(rec, keep_top=None):
    """Offer ``rec`` to ``hard_heap``, keeping only the hardest records.

    Args:
        rec: Record dict; must contain a numeric ``"overall_margin"``.
        keep_top: Maximum number of records to retain. Defaults to the
            module-level ``KEEP_TOP``.

    While the heap has fewer than ``keep_top`` items the record is always
    added. Once full, the record replaces the current largest-margin entry
    only if its own margin is strictly smaller (i.e. it is harder).
    """
    limit = KEEP_TOP if keep_top is None else keep_top
    if limit <= 0:
        return

    neg_margin = -float(rec["overall_margin"])
    entry = ((neg_margin, next(_hard_seq)), rec)

    if len(hard_heap) < limit:
        heapq.heappush(hard_heap, entry)
    elif neg_margin > hard_heap[0][0][0]:
        # heap[0] holds the smallest key, i.e. the largest margin kept so far.
        heapq.heapreplace(hard_heap, entry)

# Read a single row to verify that the required columns exist before the full scan
head = pd.read_csv(CSV_PATH, nrows=1)
assert PID_COL in head.columns, f"CSVda {PID_COL} yo‘q"
assert TEXT_COL in head.columns, f"CSVda {TEXT_COL} yo‘q (kolonkani tekshirib nomini moslang)"
assert Y_COL in head.columns, f"CSVda {Y_COL} yo‘q (TRUE bilan solishtirish bo‘lmaydi)"

# Only these columns are loaded during the chunked scan.
usecols = [PID_COL, TEXT_COL, Y_COL]

# Stream the CSV in chunks, score every row, and offer each labelled row to the
# hard-case heap together with its margins.
reader = pd.read_csv(CSV_PATH, usecols=usecols, chunksize=CHUNKSIZE, low_memory=False)
for ci, chunk in enumerate(reader, start=1):
    if MAX_CHUNKS is not None and ci > MAX_CHUNKS:
        break

    # Types: coerce ids to numbers and drop rows lacking an id or a text.
    chunk[PID_COL] = pd.to_numeric(chunk[PID_COL], errors="coerce")
    chunk = chunk.dropna(subset=[PID_COL, TEXT_COL]).copy()
    if len(chunk) == 0:
        continue
    chunk[PID_COL] = chunk[PID_COL].astype(np.int64)
    texts = chunk[TEXT_COL].astype(str)

    # Batch featurize -> select features -> predict.
    X_full = featurizer.transform(texts)
    X_fs = X_full[:, mask]
    scores = np.asarray(model.predict_proba(X_fs))
    if scores.ndim == 1:
        scores = scores.reshape(-1, len(label_names))

    # deltas = score - thr (positive means the label is predicted)
    deltas = scores - thr.reshape(1, -1)

    # Pull the columns out once per chunk; indexing chunk.iloc[i][col] inside
    # the row loop would build a fresh row Series on every access.
    pid_values = chunk[PID_COL].tolist()
    text_values = texts.tolist()
    y_values = chunk[Y_COL].tolist()

    # Per-row hardness metric.
    for i, (pid, text, y_true_str) in enumerate(zip(pid_values, text_values, y_values)):
        pid = int(pid)
        text = str(text)
        true_names = split_labels(y_true_str)

        true_idx = [label2idx[n] for n in true_names if n in label2idx]
        if len(true_idx) == 0:
            # No usable TRUE label (y_labels is NaN or unknown to the mapping): skip.
            continue

        d = deltas[i]
        # Positive margin: the weakest TRUE label (how far above its threshold).
        pos_margin = float(np.min(d[true_idx]))

        # Negative margin: how far the strongest FALSE label stays below its
        # threshold (TRUE label indices are masked out first).
        mask_false = np.ones(d.shape[0], dtype=bool)
        mask_false[true_idx] = False
        max_false_delta = float(np.max(d[mask_false])) if mask_false.any() else -np.inf
        neg_margin = float(-max_false_delta)  # > 0 is good; close to 0 is risky

        overall_margin = float(min(pos_margin, neg_margin))

        # Predicted labels.
        pred_idx = np.where(scores[i] >= thr)[0]
        pred_names = [label_names[j] for j in pred_idx]

        # Top-5 scores, formatted as name:score(thr=...).
        top5_idx = np.argsort(scores[i])[::-1][:5]
        top5 = "; ".join([f"{label_names[j]}:{scores[i,j]:.3f}(thr={thr[j]:.3f})" for j in top5_idx])

        rec = {
            "primaryid": pid,
            "overall_margin": overall_margin,
            "pos_margin": pos_margin,
            "neg_margin": neg_margin,
            "true_labels": "; ".join(true_names),
            "pred_labels": "; ".join(pred_names),
            "top5": top5,
            "text": text,
            "text_len": len(text),
            "true_n": len(true_idx),
            "pred_n": len(pred_names),
        }
        push_hard(rec)

# --- top30 hardest sorted ---
# Unpack the records from the heap items, then order them by ascending margin.
hard30 = [r for _, r in sorted(hard_heap, key=lambda x: x[0], reverse=True)]  # reverse=True => smallest margin first
df_hard30 = pd.DataFrame(hard30).sort_values("overall_margin", ascending=True).reset_index(drop=True)

print("\n" + "="*110)
print("TOP 30 HARDEST (margin eng kichik)")
print("="*110)
# Columns shown in the table; the full text is left out, only its length is shown.
show_cols = ["primaryid","overall_margin","pos_margin","neg_margin","true_n","pred_n","true_labels","pred_labels","top5","text_len"]
print(tabulate(df_hard30[show_cols].head(30), headers="keys", tablefmt="github", showindex=False, floatfmt=".6f"))

# --- pick 8 from hard30 (diverse true_n) ---
# For each target label count, take the hardest case (first row) with that
# many true labels, so the picks vary in label count.
picked = []
used = set()
for target_n in [1,2,3,4,5,6,7,8,9,10,12]:
    sub = df_hard30[df_hard30["true_n"] == target_n]
    if len(sub) == 0:
        continue
    pid = int(sub.iloc[0]["primaryid"])
    if pid in used:
        continue
    used.add(pid)
    picked.append(pid)
    if len(picked) >= 8:
        break

# Fill up in hardness order if fewer than 8 were picked
if len(picked) < 8:
    for pid in df_hard30["primaryid"].tolist():
        pid = int(pid)
        if pid in used:
            continue
        used.add(pid)
        picked.append(pid)
        if len(picked) >= 8:
            break

print("\n" + "="*110)
print("PICKED 8 HARDEST PRIMARYIDs")
print("="*110)
print(picked)

# --- detailed print for picked 8 + new_object_MIN ---
print("\n" + "="*110)
print("DETAILED (8 ta) + new_object_MIN")
print("="*110)

new_objects_min = []
for pid in picked:
    # First matching row in df_hard30 for this primaryid.
    row = df_hard30[df_hard30["primaryid"] == pid].iloc[0]
    print("\n" + "-"*110)
    print("PID:", pid)
    print("overall_margin:", row["overall_margin"], "| pos_margin:", row["pos_margin"], "| neg_margin:", row["neg_margin"])
    print("TEXT:", row["text"])
    print("TRUE:", row["true_labels"])
    print("PRED:", row["pred_labels"])
    print("TOP5:", row["top5"])

    # Minimal object: just the id and the reaction text.
    new_obj = {"primaryid": int(pid), TEXT_COL: row["text"]}
    new_objects_min.append(new_obj)

# Print the objects as Python assignments that can be copy-pasted into a cell.
print("\n" + "="*110)
print("8 ta new_object_MIN (copy-paste)")
print("="*110)
for i, obj in enumerate(new_objects_min, start=1):
    print(f"new_object_{i:02d}_MIN = {obj}")


TOP 30 HARDEST (margin eng kichik)
|   primaryid |   overall_margin |   pos_margin |   neg_margin |   true_n |   pred_n | true_labels                                                                                                                                    | pred_labels                                                                                                                                                                                       | top5                                                                                                                                                                        |   text_len |
|-------------|------------------|--------------|--------------|----------|----------|------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------

In [14]:
from pathlib import Path
from tabulate import tabulate
import textwrap

# 1) Write the TOP-30 table to a file as GitHub-style markdown.
hard30_md = tabulate(
    df_hard30[show_cols],
    headers="keys",
    tablefmt="github",
    showindex=False,
    floatfmt=".6f",
)
Path("hard30_table.md").write_text(hard30_md, encoding="utf-8")

# 2) Write the detail of every picked primaryid to a second file.
lines = []
for pid in picked:
    # First row of df_hard30 whose primaryid equals this pid.
    rec = df_hard30.loc[df_hard30["primaryid"] == pid].iloc[0]
    margins = "overall_margin: {:.6f} | pos_margin: {:.6f} | neg_margin: {:.6f}".format(
        rec["overall_margin"], rec["pos_margin"], rec["neg_margin"]
    )
    lines.extend(
        [
            "=" * 110,
            f"PID: {pid}",
            margins,
            "TEXT:",
            str(rec["text"]),
            "TRUE:",
            str(rec["true_labels"]),
            "PRED:",
            str(rec["pred_labels"]),
            "TOP5:",
            str(rec["top5"]),
            "",  # blank separator line after each record
        ]
    )

Path("hard8_detail.md").write_text("\n".join(lines), encoding="utf-8")

print("✅ Saved: hard30_table.md, hard8_detail.md")

✅ Saved: hard30_table.md, hard8_detail.md


In [None]:
# shu “picked 8” uchun TRUE vs PREDni avvalgi formatda (Top10 scores jadval bilan) ham avtomatik chiqaradigan variant

# =========================================================
# DETAILED PRINT (Hard-picked 8):
# - TEXT
# - TRUE y_labels (CSV)
# - PRED labels (model)
# - missing/extra
# - Top10 scores (score vs thr)
#
# Shart:
# - oldingi cell ishlagan bo‘lsin va `picked` list mavjud bo‘lsin
#   (yoki pastda pids_8 ni qo‘lda berishingiz mumkin)
# - df_raw index=primaryid bo‘lsin (df_raw = df_raw.set_index("primaryid"))
# - featurizer/mask/model/thr/label_names loaded bo‘lsin
# =========================================================
import numpy as np
import pandas as pd
from tabulate import tabulate

# Name of the df_raw column whose text is passed to the model below.
TEXT_COL = "REAC_pt_symptom_v2"

# Reuse `picked` from the previous cell when it exists and is non-empty;
# otherwise fall back to an empty list (replace it with 8 primaryids by hand).
pids_8 = globals().get("picked") or []

# Fail fast unless exactly eight primaryids are available.
assert len(pids_8) == 8, f"pids_8 8 ta bo‘lishi kerak. Hozir: {len(pids_8)}"

def split_labels(s) -> set:
    """Split a ';'-separated label string into a set of stripped label names.

    Parameters
    ----------
    s : Any
        Usually a string such as ``"respiratory; general_systemic"``. Missing
        values (``None``, float NaN, ``pd.NA``, ``pd.NaT``, NumPy NaN scalars)
        are accepted and treated as "no labels".

    Returns
    -------
    set
        Unique, whitespace-stripped, non-empty tokens. Empty for missing input.
    """
    # Treat every scalar missing-value marker as "no labels". Without this,
    # pd.NA or np.float32 NaN would be stringified into bogus labels such as
    # "<NA>" or "nan". The is_scalar guard keeps pd.isna from returning an
    # array (whose truth value is ambiguous) for list-like input.
    if s is None or (pd.api.types.is_scalar(s) and pd.isna(s)):
        return set()
    return {token.strip() for token in str(s).split(";") if token.strip()}

def predict_one(text: str):
    """Score a single text and return its predicted labels plus the ten best scores.

    The text is wrapped in a one-element Series, transformed by the notebook's
    `featurizer`, restricted to the columns selected by `mask`, and scored with
    `model.predict_proba`. A label is predicted when its score is >= `thr`.

    Returns
    -------
    (pred_labels, top10)
        pred_labels : list of label names whose score reaches the threshold.
        top10       : up to ten ``(label, score, thr)`` tuples, highest score first.

    Relies on the notebook-level names `featurizer`, `mask`, `model`, `thr`
    and `label_names`; assumes `thr` can be indexed per label — TODO confirm.
    """
    features = featurizer.transform(pd.Series([text]))
    selected = features[:, mask]
    # Flatten whatever predict_proba returns into the first (only) row of scores.
    scores = np.asarray(model.predict_proba(selected)).reshape(1, -1)[0]

    is_positive = scores >= thr
    pred_labels = [
        label_names[pos] for pos in range(len(is_positive)) if is_positive[pos]
    ]

    # Ascending argsort reversed -> indices ordered by descending score.
    ranking = np.argsort(scores)[::-1]
    top10 = []
    for pos in ranking[:10]:
        top10.append((label_names[pos], float(scores[pos]), float(thr[pos])))
    return pred_labels, top10

def _join_or_none(items):
    """Join items with "; ", or return "(none)" when there is nothing to join."""
    return "; ".join(items) if items else "(none)"


# For every selected primaryid: show its text, true vs. predicted labels,
# the difference between them, and the ten highest scores next to their thresholds.
for pid in pids_8:
    banner = "=" * 110
    print("\n" + banner)
    print(f"PRIMARYID: {pid}")
    print(banner)

    # Skip ids that are absent from the df_raw index.
    if pid not in df_raw.index:
        print("❌ NOT FOUND in df_raw (index)")
        continue

    text = str(df_raw.loc[pid, TEXT_COL])
    if "y_labels" in df_raw.columns:
        true_str = df_raw.loc[pid, "y_labels"]
    else:
        true_str = None
    true_set = split_labels(true_str)

    pred_labels, top10 = predict_one(text)
    pred_set = set(pred_labels)

    # Labels present only in the ground truth / only in the prediction.
    missing = sorted(true_set.difference(pred_set))
    extra = sorted(pred_set.difference(true_set))

    print("\nTEXT:")
    print(text)

    print("\nTRUE y_labels:")
    if true_str is not None:
        print(true_str)
    else:
        print("(no y_labels column)")

    print("\nPRED labels:")
    print(_join_or_none(pred_labels))

    print("\nDIFF:")
    print(" - missing_true:", _join_or_none(missing))
    print(" - extra_pred  :", _join_or_none(extra))

    print("\nTop-10 scores (score vs thr):")
    print(
        tabulate(
            top10,
            headers=["label", "score", "thr"],
            tablefmt="github",
            floatfmt=".4f",
        )
    )


PRIMARYID: 260908951

TEXT:
Psoriasis; Platelet count

TRUE y_labels:
dermatologic

PRED labels:
dermatologic; hematologic

DIFF:
 - missing_true: (none)
 - extra_pred  : hematologic

Top-10 scores (score vs thr):
| label            |   score |    thr |
|------------------|---------|--------|
| dermatologic     |  1.0000 | 0.1496 |
| hematologic      |  0.9875 | 0.0985 |
| infections       |  0.0004 | 0.1207 |
| pain_general     |  0.0003 | 0.9467 |
| respiratory      |  0.0003 | 0.9635 |
| psychiatric      |  0.0002 | 0.9707 |
| musculoskeletal  |  0.0002 | 0.9303 |
| neurological     |  0.0002 | 0.9505 |
| gastrointestinal |  0.0002 | 0.1527 |
| injury_accident  |  0.0002 | 0.0816 |

PRIMARYID: 258264534

TEXT:
Dyspnoea; Hypophagia; Haemoglobin abnormal; White blood cell count abnormal; Platelet count abnormal; Fatigue

TRUE y_labels:
respiratory; general_systemic

PRED labels:
general_systemic; hematologic; respiratory

DIFF:
 - missing_true: (none)
 - extra_pred  : hematologic

To