In [1]:
# =========================
# 09_feature_engineering.py (also works when run as notebook cells)
# FAERS25Q4 multilabel — feature engineering + featurizer artifact
# =========================

# =========================================================
# CELL 1 — Imports
# =========================================================
from __future__ import annotations

from pathlib import Path
import re
import json

import numpy as np
import pandas as pd

from scipy import sparse
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer

In [2]:
# =========================================================
# CELL 2 — Config + PROJECT_ROOT + SPLIT_DIR (fix matching the screenshot)
# =========================================================
from pathlib import Path

# Split folder names to look for under Data/Processed, tried in this order
SPLIT_DIR_CANDIDATES = [
    "splits_multilabel_noleakage",   # <-- primary
    "splits_multilabel_noleakage2",
    "splits_multilabel_noleakage3",
]

# Column whose text is cleaned and fed to the featurizer
TEXT_COL = "REAC_pt_symptom_v2"
# Column names that are never treated as labels, even when they match the "y_" prefix
EXCLUDE_Y_COLS = {"y_labels", "y_sum", "label_sum"}

# Sub-folder name used under Data/Engineered_data for everything this notebook saves
FE_VERSION = "fe_v1"
# When True, the artifact-saving cell (CELL 7) also writes X_train/X_val/X_test as .npz
SAVE_MATRICES_NPZ = False
# When True, CELL 8 trains a quick OneVsRest logistic regression and scores it on VAL
RUN_QUICK_CHECK = True
# Maximum number of train rows sampled for that quick check
MAX_TRAIN_SAMPLES_FOR_QUICK = 50000


def find_project_root_and_split(
    start: Path | None = None,
    candidates: list[str] | None = None,
):
    """
    Locate the project root and the split directory by walking up from ``start``.

    To avoid the "Notebooks trap", the root is NOT identified merely by a
    ``Data`` folder existing: a directory only qualifies when
    ``Data/Processed/<split>/`` contains ``train.csv``, ``val.csv`` and
    ``test.csv`` for one of the candidate split names.

    Parameters
    ----------
    start : Path | str | None
        Directory to start from; defaults to the current working directory.
        It is resolved to an absolute path first, because a relative path has
        (almost) no ``.parents`` and the upward search would stop too early.
    candidates : list[str] | None
        Split folder names to try, in priority order. Defaults to the
        module-level ``SPLIT_DIR_CANDIDATES``.

    Returns
    -------
    tuple[Path, Path]
        ``(project_root, split_dir)`` for the first match, searching the
        nearest directory first and candidates in the given order.

    Raises
    ------
    FileNotFoundError
        If no ancestor contains a complete split folder. The message lists
        the last 10 split folders that were checked.
    """
    start = (Path(start) if start is not None else Path.cwd()).resolve()
    names = SPLIT_DIR_CANDIDATES if candidates is None else list(candidates)
    required_files = ("train.csv", "val.csv", "test.csv")
    checked = []

    for p in [start, *start.parents]:
        processed = p / "Data" / "Processed"
        if not processed.exists():
            continue

        for name in names:
            d = processed / name
            checked.append(d)
            # A split folder only counts when all three CSVs are present
            if all((d / fname).exists() for fname in required_files):
                return p, d

    raise FileNotFoundError(
        "Split papka topilmadi (parents bo‘ylab qidirildi).\n"
        f"CWD: {start.resolve()}\n"
        "Tekshirilgan processed/split papkalar (oxirgi 10 ta):\n"
        + "\n".join(str(x) for x in checked[-10:])
    )


# Resolve the project root and the split folder; later cells build their
# input/output paths from these two names.
PROJECT_ROOT, SPLIT_DIR = find_project_root_and_split()
print("PROJECT_ROOT:", PROJECT_ROOT.resolve())
print("SPLIT_DIR:", SPLIT_DIR.resolve())

PROJECT_ROOT: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract
SPLIT_DIR: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Processed\splits_multilabel_noleakage


In [3]:
# =========================================================
# CELL 3 — Load split CSVs
# =========================================================
train_path = SPLIT_DIR / "train.csv"
val_path = SPLIT_DIR / "val.csv"
test_path = SPLIT_DIR / "test.csv"

train_df = pd.read_csv(train_path, low_memory=False)
val_df = pd.read_csv(val_path, low_memory=False)
test_df = pd.read_csv(test_path, low_memory=False)

print("train:", train_df.shape, "| val:", val_df.shape, "| test:", test_df.shape)

# TEXT_COL is read from all three splits in the cleanup loop below, so it must
# exist in every split (not only in train) — fail early with a clear message.
splits_without_text = [
    split_name
    for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df))
    if TEXT_COL not in split_df.columns
]
if splits_without_text:
    raise ValueError(
        f"TEXT_COL topilmadi: {TEXT_COL} (split: {splits_without_text}). "
        f"Mavjud: {list(train_df.columns)[:30]} ..."
    )

# pick the label columns (y_sum/label_sum must not be included!)
y_cols = sorted([c for c in train_df.columns if c.startswith("y_") and c not in EXCLUDE_Y_COLS])
if not y_cols:
    raise ValueError("y_* ustunlari topilmadi (train.csv).")

# val/test must carry exactly the same y_cols
missing_val = [c for c in y_cols if c not in val_df.columns]
missing_test = [c for c in y_cols if c not in test_df.columns]
if missing_val or missing_test:
    raise ValueError(f"y_cols mismatch. missing_val={missing_val[:10]} missing_test={missing_test[:10]}")

print("Num labels:", len(y_cols))
print("First labels:", y_cols[:10])

# basic cleanup: missing text becomes "", everything is a stripped str
for d in (train_df, val_df, test_df):
    d[TEXT_COL] = d[TEXT_COL].fillna("").astype(str).str.strip()

print("Empty text rows (train/val/test):",
      int((train_df[TEXT_COL] == "").sum()),
      int((val_df[TEXT_COL] == "").sum()),
      int((test_df[TEXT_COL] == "").sum()))

train: (201176, 24) | val: (24410, 24) | test: (24164, 24)
Num labels: 21
First labels: ['y_cardiovascular', 'y_dermatologic', 'y_edema_swelling', 'y_gastrointestinal', 'y_general_systemic', 'y_hematologic', 'y_hepatic', 'y_hypersensitivity_allergy', 'y_infections', 'y_injection_site']
Empty text rows (train/val/test): 0 0 0


In [4]:
# =========================================================
# CELL 4 — Meta features (length, term counts)
# =========================================================
_term_split = re.compile(r"\s*;\s*")

def meta_features(texts: list[str]) -> np.ndarray:
    """
    Build three numeric meta features per text.

    Each text is stripped (``None``/empty counts as ""), then split on ``;``
    into terms; blank terms are dropped and the rest are lower-cased.

    Returns a float32 array of shape ``(len(texts), 3)`` whose columns are:
    ``log1p(character length)``, number of terms, number of distinct terms.
    """
    char_counts = []
    term_counts = []
    distinct_counts = []

    for raw in texts:
        cleaned = (raw or "").strip()
        # Splitting "" yields a single blank piece, which the filter removes,
        # so empty texts naturally get zero terms.
        pieces = [piece.strip() for piece in _term_split.split(cleaned)]
        terms = [piece.lower() for piece in pieces if piece]

        char_counts.append(len(cleaned))
        term_counts.append(len(terms))
        distinct_counts.append(len(set(terms)))

    # log1p is applied in float32, the same precision as the stored features
    log_len = np.log1p(np.asarray(char_counts, dtype=np.float32))
    return np.column_stack(
        [
            log_len,
            np.asarray(term_counts, dtype=np.float32),
            np.asarray(distinct_counts, dtype=np.float32),
        ]
    )

def meta_to_sparse(texts):
    """Compute the meta features for ``texts`` and return them as a SciPy CSR matrix."""
    return sparse.csr_matrix(meta_features(list(texts)))

# Wrap meta_to_sparse as a transformer so it can be one branch of the FeatureUnion;
# validate=False hands the raw input to the function without sklearn's input checks.
# NOTE(review): pickle/joblib store plain functions by reference, so loading the saved
# featurizer presumably requires `meta_to_sparse` to be importable under the same
# module name (here the notebook's __main__) — confirm before reusing the artifact.
meta_transformer = FunctionTransformer(meta_to_sparse, validate=False)

In [5]:
# =========================================================
# CELL 5 — Featurizer: word TF-IDF + char_wb TF-IDF + meta
# =========================================================
# Word-level TF-IDF over unigrams and bigrams
word_tfidf = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
    norm="l2",
)

# Character-level TF-IDF ("char_wb" analyzer) over 3- to 5-character n-grams
char_tfidf = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 5),
    min_df=3,
    max_df=0.98,
    sublinear_tf=True,
    norm="l2",
)

# Combine the two TF-IDF branches with the meta features into one featurizer
featurizer = FeatureUnion(
    transformer_list=[
        ("word_tfidf", word_tfidf),
        ("char_tfidf", char_tfidf),
        ("meta", meta_transformer),
    ],
    n_jobs=1,  # 1 is recommended for Windows/OneDrive
)

print(featurizer)

FeatureUnion(n_jobs=1,
             transformer_list=[('word_tfidf',
                                TfidfVectorizer(max_df=0.95, min_df=2,
                                                ngram_range=(1, 2),
                                                sublinear_tf=True)),
                               ('char_tfidf',
                                TfidfVectorizer(analyzer='char_wb', max_df=0.98,
                                                min_df=3, ngram_range=(3, 5),
                                                sublinear_tf=True)),
                               ('meta',
                                FunctionTransformer(func=<function meta_to_sparse at 0x000002844A03D580>))])


In [6]:
# =========================================================
# CELL 6 — Fit on train, transform val/test
# =========================================================
# Plain Python lists of the cleaned text, one per split
X_train_text = train_df[TEXT_COL].tolist()
X_val_text = val_df[TEXT_COL].tolist()
X_test_text = test_df[TEXT_COL].tolist()

# The featurizer is fitted on train only; val/test are only transformed with it
print("Fitting featurizer on train...")
X_train = featurizer.fit_transform(X_train_text)
X_val = featurizer.transform(X_val_text)
X_test = featurizer.transform(X_test_text)

# Report shape and number of stored non-zero entries for each matrix
print("X_train:", X_train.shape, "nnz:", X_train.nnz)
print("X_val  :", X_val.shape, "nnz:", X_val.nnz)
print("X_test :", X_test.shape, "nnz:", X_test.nnz)

Fitting featurizer on train...
X_train: (201176, 104479) nnz: 29288234
X_val  : (24410, 104479) nnz: 3331217
X_test : (24164, 104479) nnz: 3587450


In [11]:
from pathlib import Path
import os

def find_project_root(start: Path | None = None) -> Path:
    """
    Find the project root by walking up from ``start``.

    The search runs in two passes over ``start`` and its ancestors:

    1. Prefer the nearest directory that contains both ``Data`` and ``Models``.
    2. Otherwise fall back to the nearest directory that contains ``Data``.

    Doing this in two passes matters: checking the fallback inside the same
    loop iteration makes the ``Data`` + ``Models`` preference ineffective and
    lets a nested folder that merely has its own ``Data`` directory win over
    the real root further up.

    Parameters
    ----------
    start : Path | str | None
        Directory to start from; defaults to the current working directory.
        It is resolved first so that a relative path still has ancestors.

    Returns
    -------
    Path
        The matched directory, or the resolved ``start`` if nothing matches.
    """
    start = (Path(start) if start is not None else Path.cwd()).resolve()
    ancestors = [start, *start.parents]

    # Pass 1: strict match — both Data and Models are present
    for p in ancestors:
        if (p / "Data").exists() and (p / "Models").exists():
            return p

    # Pass 2 (fallback): accept a directory that only has Data
    for p in ancestors:
        if (p / "Data").exists():
            return p

    return start

# NOTE(review): this rebinds PROJECT_ROOT (first set in CELL 2) using the weaker
# "a Data folder exists" rule that CELL 2's docstring warns about — confirm both
# cells resolve to the same directory.
PROJECT_ROOT = find_project_root()

# ✅ switch the current working directory to the project root
os.chdir(PROJECT_ROOT)

print("PROJECT_ROOT =", PROJECT_ROOT.resolve())
print("CWD          =", Path.cwd().resolve())

PROJECT_ROOT = C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract
CWD          = C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract


In [13]:
from pathlib import Path

# PROJECT_ROOT must already be defined by an earlier cell
%cd {PROJECT_ROOT}
print("CWD =", Path.cwd().resolve())

c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract
CWD = C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract


In [15]:
# =========================================================
# CELL 7 — Save artifacts (featurizer + optional matrices)
# =========================================================
# Output folder for this feature-engineering version; created if missing
ART_DIR = PROJECT_ROOT /"Data"/ "Engineered_data" / FE_VERSION
ART_DIR.mkdir(parents=True, exist_ok=True)

# Persist the fitted featurizer with joblib
featurizer_path = ART_DIR / "tfidf_vectorizer.joblib"
joblib.dump(featurizer, featurizer_path)
print("Saved featurizer:", featurizer_path.resolve())

# NOTE: despite the file name "tfidf_vectorizer.joblib", the object stored is the
# whole FeatureUnion `featurizer` (word TF-IDF + char TF-IDF + meta branch).

# Record the configuration that produced the featurizer next to it as JSON
meta_path = ART_DIR / "meta.json"
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(
        {
            "TEXT_COL": TEXT_COL,
            "y_cols_count": len(y_cols),
            "y_cols": y_cols,
            "split_dir": str(SPLIT_DIR),
            "fe_version": FE_VERSION,
            "word_tfidf": {
                "ngram_range": word_tfidf.ngram_range,
                "min_df": word_tfidf.min_df,
                "max_df": word_tfidf.max_df,
                "sublinear_tf": word_tfidf.sublinear_tf,
                "norm": word_tfidf.norm,
            },
            "char_tfidf": {
                "analyzer": char_tfidf.analyzer,
                "ngram_range": char_tfidf.ngram_range,
                "min_df": char_tfidf.min_df,
                "max_df": char_tfidf.max_df,
                "sublinear_tf": char_tfidf.sublinear_tf,
                "norm": char_tfidf.norm,
            },
            # Column order of the meta branch, matching meta_features()
            "meta_features": ["log1p_len", "n_terms", "n_uniq_terms"],
        },
        f,
        ensure_ascii=False,
        indent=2,
    )
print("Saved meta:", meta_path.resolve())

# Optionally store the transformed matrices as SciPy sparse .npz files
if SAVE_MATRICES_NPZ:
    sparse.save_npz(ART_DIR / "X_train.npz", X_train)
    sparse.save_npz(ART_DIR / "X_val.npz", X_val)
    sparse.save_npz(ART_DIR / "X_test.npz", X_test)
    print("Saved matrices (.npz) into:", ART_DIR.resolve())
else:
    print("SAVE_MATRICES_NPZ=False (matritsa saqlanmadi).")

Saved featurizer: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Engineered_data\fe_v1\tfidf_vectorizer.joblib
Saved meta: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Engineered_data\fe_v1\meta.json
SAVE_MATRICES_NPZ=False (matritsa saqlanmadi).


In [10]:
# =========================================================
# CELL 8 — Quick sanity-check (optional): OneVsRest(LogReg) on VAL
# =========================================================
# Optional smoke test: fit a one-vs-rest logistic regression on (a sample of)
# train and report micro/macro F1 on the validation split.
if RUN_QUICK_CHECK:
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import f1_score

    # Label matrices, one column per entry of y_cols
    Y_train = train_df[y_cols].astype(int).to_numpy()
    Y_val = val_df[y_cols].astype(int).to_numpy()

    # subsample (to keep it fast); fixed seed 42 makes the sample repeatable
    if MAX_TRAIN_SAMPLES_FOR_QUICK and len(train_df) > MAX_TRAIN_SAMPLES_FOR_QUICK:
        idx = np.random.RandomState(42).choice(len(train_df), size=MAX_TRAIN_SAMPLES_FOR_QUICK, replace=False)
        X_tr = X_train[idx]
        Y_tr = Y_train[idx]
    else:
        X_tr = X_train
        Y_tr = Y_train

    # NOTE(review): sklearn documents n_jobs as ignored for solver="liblinear" — confirm
    # for the installed version.
    base = LogisticRegression(
        solver="liblinear",
        max_iter=2000,
        n_jobs=1,
        class_weight="balanced",
    )
    clf = OneVsRestClassifier(base, n_jobs=1)

    print("Training quick OVR LogReg...")
    clf.fit(X_tr, Y_tr)

    # use decision_function when predict_proba is not available
    if hasattr(clf, "predict_proba"):
        P = clf.predict_proba(X_val)
        Y_pred = (P >= 0.5).astype(int)  # threshold probabilities at 0.5
    else:
        S = clf.decision_function(X_val)
        Y_pred = (S >= 0).astype(int)  # threshold decision scores at 0

    micro = f1_score(Y_val, Y_pred, average="micro", zero_division=0)
    macro = f1_score(Y_val, Y_pred, average="macro", zero_division=0)
    print(f"VAL F1 micro={micro:.4f} | macro={macro:.4f}")
else:
    print("RUN_QUICK_CHECK=False")

Training quick OVR LogReg...




VAL F1 micro=0.9873 | macro=0.9787


In [12]:
# This cell:

# creates the Data/Engineered_data/<FE_VERSION>/ folder

# saves X_train / X_val / X_test as .npz

# saves Y_train / Y_val / Y_test as .npy

# saves the y_cols + TEXT_COL metadata as JSON

# (if present) also saves ID columns such as primaryid/caseid

# =========================================================
# CELL 9 — Save Engineered Data into Data/Engineered_data
# =========================================================
from pathlib import Path
import json
import numpy as np
from scipy import sparse

# Output folder for this feature-engineering version; created if missing
ENGINEERED_DIR = PROJECT_ROOT / "Data" / "Engineered_data" / FE_VERSION
ENGINEERED_DIR.mkdir(parents=True, exist_ok=True)
print("ENGINEERED_DIR:", ENGINEERED_DIR.resolve())

# --- 1) Save X matrices (sparse) ---
# NOTE: these are written unconditionally, i.e. regardless of SAVE_MATRICES_NPZ.
sparse.save_npz(ENGINEERED_DIR / "X_train.npz", X_train)
sparse.save_npz(ENGINEERED_DIR / "X_val.npz", X_val)
sparse.save_npz(ENGINEERED_DIR / "X_test.npz", X_test)
print("Saved X_*.npz")

# --- 2) Save Y arrays ---
# Columns follow the order of y_cols; stored as int8
Y_train = train_df[y_cols].astype(np.int8).to_numpy()
Y_val   = val_df[y_cols].astype(np.int8).to_numpy()
Y_test  = test_df[y_cols].astype(np.int8).to_numpy()

np.save(ENGINEERED_DIR / "Y_train.npy", Y_train)
np.save(ENGINEERED_DIR / "Y_val.npy", Y_val)
np.save(ENGINEERED_DIR / "Y_test.npy", Y_test)
print("Saved Y_*.npy")

# --- 3) Save metadata (y_cols, text_col, shapes) ---
meta = {
    "FE_VERSION": FE_VERSION,
    "TEXT_COL": TEXT_COL,
    "y_cols": y_cols,
    "split_dir": str(SPLIT_DIR),
    "X_shapes": {
        "train": [int(X_train.shape[0]), int(X_train.shape[1])],
        "val":   [int(X_val.shape[0]), int(X_val.shape[1])],
        "test":  [int(X_test.shape[0]), int(X_test.shape[1])],
    },
    "Y_shapes": {
        "train": [int(Y_train.shape[0]), int(Y_train.shape[1])],
        "val":   [int(Y_val.shape[0]), int(Y_val.shape[1])],
        "test":  [int(Y_test.shape[0]), int(Y_test.shape[1])],
    },
}
with open(ENGINEERED_DIR / "engineered_meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)
print("Saved engineered_meta.json")

# --- 4) (Optional) Save ID columns to align predictions later ---
# We do not know which ID column this dataset has, but these names are the usual ones:
ID_CANDIDATES = ["primaryid", "caseid", "CASEID", "PRIMARYID", "safetyreportid"]
# Keep only ID columns present in every split: they are selected from val/test
# as well, so a column that exists only in train would raise a KeyError below.
id_cols = [
    c for c in ID_CANDIDATES
    if all(c in split_df.columns for split_df in (train_df, val_df, test_df))
]

if id_cols:
    train_df[id_cols].to_csv(ENGINEERED_DIR / "ids_train.csv", index=False)
    val_df[id_cols].to_csv(ENGINEERED_DIR / "ids_val.csv", index=False)
    test_df[id_cols].to_csv(ENGINEERED_DIR / "ids_test.csv", index=False)
    print("Saved ids_*.csv:", id_cols)
else:
    # At the very least, keep the DataFrame row index of each split
    np.save(ENGINEERED_DIR / "idx_train.npy", train_df.index.to_numpy())
    np.save(ENGINEERED_DIR / "idx_val.npy", val_df.index.to_numpy())
    np.save(ENGINEERED_DIR / "idx_test.npy", test_df.index.to_numpy())
    print("No ID cols found — saved idx_*.npy")
    
# Important note:

# Data/Engineered_data/fe_v1/X_train.npz — the X matrix used for training

# Data/Engineered_data/fe_v1/Y_train.npy — the label matrix

# engineered_meta.json — preserves the order of y_cols (very important!)

ENGINEERED_DIR: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Engineered_data\fe_v1
Saved X_*.npz
Saved Y_*.npy
Saved engineered_meta.json
Saved ids_*.csv: ['primaryid']
