In [1]:
# Quyida Data/Engineered_data/fe_v1/ (X/Y) dan o‘qib, Chi-square (chi2) bo‘yicha top-k per label qilib feature selection qiladigan kodlar.

# 09B_feature_selection (Notebook cell’lari)
# CELL 1 — Imports
from __future__ import annotations

from pathlib import Path
import json
import numpy as np
from scipy import sparse

from sklearn.feature_selection import chi2
import joblib

In [2]:
#CELL 2 — Config + PROJECT_ROOT + Load Engineered Data (fe_v1)
FE_VERSION_IN = "fe_v1"              # input version, produced by 09_feature_engineering
FS_VERSION_OUT = "fe_v1_fs_chi2_v1"  # version name of the new feature-selected dataset

# Number of features to take for each label
TOPK_PER_LABEL = 3000

# Limit on the final feature count (in case the union grows too large)
MAX_TOTAL_FEATURES = 250000

# The meta features (the last 3 columns) are always kept
KEEP_META_LAST_N = 3

# Name of the folder under <project root>/Data that holds the engineered datasets
ENGINEERED_DIR_NAME = "Engineered_data"

def find_project_root_by_engineered(
    start: Path | None = None,
    engineered_dir_name: str | None = None,
    fe_version: str | None = None,
):
    """Walk upwards from ``start`` until a complete engineered-data folder is found.

    A directory ``p`` qualifies as the project root when
    ``p / "Data" / <engineered_dir_name> / <fe_version>`` contains all of
    ``X_train.npz``, ``Y_train.npy`` and ``engineered_meta.json``.

    Parameters
    ----------
    start:
        Directory to start from; defaults to the current working directory.
        Relative paths are resolved first.
    engineered_dir_name:
        Name of the folder under ``Data``; defaults to ``ENGINEERED_DIR_NAME``.
    fe_version:
        Version sub-folder to look for; defaults to ``FE_VERSION_IN``.

    Returns
    -------
    tuple[Path, Path]
        ``(project_root, engineered_dir)``.

    Raises
    ------
    FileNotFoundError
        If neither ``start`` nor any of its parents contains the folder. The
        message lists the last (up to 10) candidate paths that were checked.
    """
    if engineered_dir_name is None:
        engineered_dir_name = ENGINEERED_DIR_NAME
    if fe_version is None:
        fe_version = FE_VERSION_IN

    # Resolve before walking: a relative path such as Path(".") has no parents,
    # so the upward search would otherwise stop after a single candidate.
    start = (start or Path.cwd()).resolve()
    required_files = ("X_train.npz", "Y_train.npy", "engineered_meta.json")

    checked = []
    for p in [start, *start.parents]:
        ed = p / "Data" / engineered_dir_name / fe_version
        checked.append(ed)
        if all((ed / fn).exists() for fn in required_files):
            return p, ed
    raise FileNotFoundError(
        "Engineered data topilmadi.\n"
        f"Start: {start}\n"
        "Oxirgi 10 yo‘l:\n" + "\n".join(str(x) for x in checked[-10:]) +
        "\nYe chim: avval 09_feature_engineering ni run qiling."
    )

# Locate the project root and the engineered-data folder of the input version.
PROJECT_ROOT, ENGINEERED_IN = find_project_root_by_engineered()
print("PROJECT_ROOT:", PROJECT_ROOT.resolve())
print("ENGINEERED_IN:", ENGINEERED_IN.resolve())

# Metadata written next to the matrices; "y_cols" holds the label column names.
meta = json.loads((ENGINEERED_IN / "engineered_meta.json").read_text(encoding="utf-8"))

y_cols = meta["y_cols"]
print("Num labels:", len(y_cols))

# Feature matrices of the three splits, converted to CSR.
X_train, X_val, X_test = (
    sparse.load_npz(ENGINEERED_IN / f"X_{split}.npz").tocsr()
    for split in ("train", "val", "test")
)

# Label matrices of the three splits.
Y_train, Y_val, Y_test = (
    np.load(ENGINEERED_IN / f"Y_{split}.npy")
    for split in ("train", "val", "test")
)

print("X shapes:", X_train.shape, X_val.shape, X_test.shape)
print("Y shapes:", Y_train.shape, Y_val.shape, Y_test.shape)

PROJECT_ROOT: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract
ENGINEERED_IN: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Engineered_data\fe_v1
Num labels: 21
X shapes: (201176, 104479) (24410, 104479) (24164, 104479)
Y shapes: (201176, 21) (24410, 21) (24164, 21)


In [3]:
import re
import numpy as np
from scipy import sparse

_term_split = re.compile(r"\s*;\s*")


def meta_to_sparse(texts):
    """Turn an iterable of ';'-separated term strings into a 3-column CSR matrix.

    Per input text the columns are:
      0. log1p of the length of the stripped text,
      1. number of non-empty terms after splitting on ';',
      2. number of distinct terms (compared stripped and lower-cased).

    Falsy entries (``None``, ``""``) are treated as empty text and give an
    all-zero row. The result is float32.

    NOTE: per the original author's comment this has to stay identical to the
    meta features used in notebook 09; it is defined here so that the pickled
    featurizer, which refers to ``__main__.meta_to_sparse``, can be loaded.
    """
    char_counts, term_counts, unique_counts = [], [], []

    for raw in texts:
        cleaned = (raw or "").strip()
        if cleaned:
            tokens = [
                piece.strip().lower()
                for piece in _term_split.split(cleaned)
                if piece.strip()
            ]
        else:
            tokens = []
        char_counts.append(len(cleaned))
        term_counts.append(len(tokens))
        unique_counts.append(len(set(tokens)))

    def _as_column(values):
        # (n,) python list -> (n, 1) float32 column
        return np.array(values, dtype=np.float32).reshape(-1, 1)

    columns = [
        np.log1p(_as_column(char_counts)),
        _as_column(term_counts),
        _as_column(unique_counts),
    ]
    dense = np.hstack(columns).astype(np.float32)
    return sparse.csr_matrix(dense)

# Shu bilan joblib.load() o‘tadi.

# Nega ishlaydi?
# Pickle ichida __main__.meta_to_sparse deb yozilgan. Siz uni notebook’da aynan shu nom bilan yaratib qo‘ysangiz, loader topadi.

In [4]:
# CELL 3 — (Optional) Recover feature names (when the featurizer artifact exists)

# If artifacts/feature_engineering/<FE_VERSION_IN>/featurizer...joblib exists, the feature names
# are collected as well. If it does not, feature selection still works without them.

FEATURIZER_PATH = PROJECT_ROOT / "artifacts" / "feature_engineering" / FE_VERSION_IN / "featurizer_union_word_char_meta.joblib"

feature_names = None
if FEATURIZER_PATH.exists():
    # NOTE(review): joblib.load unpickles arbitrary objects — only load artifacts
    # that were produced by this project.
    featurizer = joblib.load(FEATURIZER_PATH)
    n_columns = X_train.shape[1]

    # One list of names per transformer, in transformer_list order. None marks a
    # transformer that cannot report its own names/width.
    name_blocks = []
    unnamed = []  # (position in name_blocks, transformer name)
    for name, tr in featurizer.transformer_list:
        if hasattr(tr, "get_feature_names_out"):
            name_blocks.append([f"{name}__{x}" for x in tr.get_feature_names_out()])
        elif name == "meta":
            # the meta transformer emits 3 columns; give them fixed names
            name_blocks.append(["meta__log1p_len", "meta__n_terms", "meta__n_uniq_terms"])
        else:
            unnamed.append((len(name_blocks), name))
            name_blocks.append(None)

    # X_train is already the *output* of the featurizer, so it cannot be fed back
    # into a transformer to measure its width. When exactly one transformer is
    # unnamed, its width is whatever is left of X_train's columns.
    if len(unnamed) == 1:
        pos, unnamed_name = unnamed[0]
        width = n_columns - sum(len(block) for block in name_blocks if block is not None)
        if width >= 0:
            name_blocks[pos] = [f"{unnamed_name}__feat{i}" for i in range(width)]

    if any(block is None for block in name_blocks):
        print(
            "Could not determine the output width of transformer(s)",
            [item[1] for item in unnamed],
            "— feature_names will not be saved.",
        )
    else:
        names = [x for block in name_blocks for x in block]
        if len(names) == n_columns:
            feature_names = np.array(names, dtype=object)
            print("feature_names:", feature_names.shape)
        else:
            # Names that do not line up with the matrix columns would mislabel features.
            print(
                f"Collected {len(names)} feature names but X_train has {n_columns} columns"
                " — feature_names will not be saved."
            )
else:
    print("Featurizer artifact topilmadi (ok) — feature_names saqlanmaydi.")

feature_names: (104479,)


In [5]:
#CELL 4 — Chi2 feature selection (top-k per label, union)
# For every label column the chi2 score of each feature is computed on the training split and
# the k best features are added to one shared set (union over labels). The last
# KEEP_META_LAST_N columns are always kept. The union never grows beyond MAX_TOTAL_FEATURES.
# Results: `selected` (sorted int32 column indices) and `mask` (bool, length n_features).
n_samples, n_features = X_train.shape
n_labels = Y_train.shape[1]

selected = set()

# Force-keep the meta features (the last N columns); clamp so no negative index is produced.
if KEEP_META_LAST_N and KEEP_META_LAST_N > 0:
    meta_idx = list(range(max(0, n_features - KEEP_META_LAST_N), n_features))
    selected.update(meta_idx)
    print("Forced meta idx:", meta_idx)

# Clamp k to [0, n_features]. k == 0 must skip the loop entirely: the slice
# `[-0:]` below would otherwise return *every* feature.
k = max(0, min(TOPK_PER_LABEL, n_features))

# chi2 top-k for each label
for j in range(n_labels if k > 0 else 0):
    y = Y_train[:, j].astype(np.int8)

    # skip labels that have no positive sample at all
    if int(y.sum()) == 0:
        continue

    scores, _ = chi2(X_train, y)  # X must be non-negative (TF-IDF is fine)
    scores = np.nan_to_num(scores, nan=0.0, posinf=0.0, neginf=0.0)

    top_idx = np.argpartition(scores, -k)[-k:]
    # Best score first, so that when the cap is hit mid-label the strongest
    # features of this label are the ones that make it in.
    top_idx = top_idx[np.argsort(scores[top_idx], kind="stable")[::-1]]
    for idx in top_idx:
        if len(selected) >= MAX_TOTAL_FEATURES:
            break
        selected.add(int(idx))

    # the union is full: stop here instead of overshooting the cap
    if len(selected) >= MAX_TOTAL_FEATURES:
        print("Reached MAX_TOTAL_FEATURES, stopping early at label:", j)
        break

selected = np.array(sorted(selected), dtype=np.int32)
print("Selected features:", selected.size, "/", n_features)

mask = np.zeros(n_features, dtype=bool)
mask[selected] = True

Forced meta idx: [104476, 104477, 104478]
Selected features: 33671 / 104479


In [6]:
#CELL 5 — Apply selection: keep only the masked columns of X_train/val/test
X_train_fs, X_val_fs, X_test_fs = (
    matrix[:, mask] for matrix in (X_train, X_val, X_test)
)

# Report the reduced shapes; row counts are untouched, only columns are dropped.
for label, matrix in (
    ("X_train_fs:", X_train_fs),
    ("X_val_fs  :", X_val_fs),
    ("X_test_fs :", X_test_fs),
):
    print(label, matrix.shape)

X_train_fs: (201176, 33671)
X_val_fs  : (24410, 33671)
X_test_fs : (24164, 33671)


In [7]:
#CELL 6 — Save to Data/Feature_Selected/<FS_VERSION_OUT>/
# =========================================================
# CELL 6 — Persist the feature-selected data under Data/Feature_Selected
# =========================================================
from pathlib import Path
import json
import numpy as np
from scipy import sparse

FEATURE_SELECTED_DIRNAME = "Feature_Selected"

OUT_DIR = PROJECT_ROOT / "Data" / FEATURE_SELECTED_DIRNAME / FS_VERSION_OUT
OUT_DIR.mkdir(parents=True, exist_ok=True)
print("OUT_DIR:", OUT_DIR.resolve())

# --- X: the column-reduced matrices, one .npz per split ---
for split_name, matrix in (("train", X_train_fs), ("val", X_val_fs), ("test", X_test_fs)):
    sparse.save_npz(OUT_DIR / f"X_{split_name}.npz", matrix)
print("Saved X_*.npz")

# --- Y: same labels as the input, stored as int8 ---
for split_name, labels in (("train", Y_train), ("val", Y_val), ("test", Y_test)):
    np.save(OUT_DIR / f"Y_{split_name}.npy", labels.astype(np.int8))
print("Saved Y_*.npy")

# --- The selection itself: boolean mask and sorted column indices ---
for file_name, array in (("feature_mask.npy", mask), ("selected_idx.npy", selected)):
    np.save(OUT_DIR / file_name, array)
print("Saved feature_mask.npy + selected_idx.npy")

import joblib

# Everything needed to re-apply the same column selection later, bundled in one artifact.
selector_payload = dict(
    mask=mask,                              # bool array over the original columns
    selected_idx=selected,                  # int indices of the kept columns
    fs_version_out=FS_VERSION_OUT,
    method="chi2_topk_per_label_union",
    topk_per_label=TOPK_PER_LABEL,
    max_total_features=MAX_TOTAL_FEATURES,
    keep_meta_last_n=KEEP_META_LAST_N,
)

# Attach the names of the kept features, but only when names exist and line up
# one-to-one with the original columns.
if (
    "feature_names" in globals()
    and feature_names is not None
    and len(feature_names) == n_features
):
    selector_payload["selected_feature_names"] = feature_names[mask]

joblib.dump(selector_payload, OUT_DIR / "feature_selector.joblib")
print("Saved feature_selector.joblib")

# --- Copy IDs/idx from engineered input (if exists) ---
# Each file is optional and is skipped when the input folder does not have it.
# shutil.copyfile streams the content instead of loading the whole file into memory,
# and is_file() keeps a directory of the same name from being treated as a source.
import shutil

for fn in ["ids_train.csv", "ids_val.csv", "ids_test.csv", "idx_train.npy", "idx_val.npy", "idx_test.npy"]:
    src = ENGINEERED_IN / fn
    if src.is_file():
        shutil.copyfile(src, OUT_DIR / fn)

# --- Save meta ---
# Start from the input metadata and add/override the keys that describe this selection run.
out_meta = {
    **meta,
    "fe_version_in": FE_VERSION_IN,
    "fs_version_out": FS_VERSION_OUT,
    "method": "chi2_topk_per_label_union",
    "topk_per_label": TOPK_PER_LABEL,
    "max_total_features": MAX_TOTAL_FEATURES,
    "keep_meta_last_n": KEEP_META_LAST_N,
    "selected_features": int(selected.size),
    "original_features": int(n_features),
    "output_dir": str(OUT_DIR),
    # shapes as plain [rows, cols] int lists so they are JSON-serialisable
    "X_shapes": {
        split_name: [int(matrix.shape[0]), int(matrix.shape[1])]
        for split_name, matrix in (("train", X_train_fs), ("val", X_val_fs), ("test", X_test_fs))
    },
    "Y_shapes": {
        split_name: [int(labels.shape[0]), int(labels.shape[1])]
        for split_name, labels in (("train", Y_train), ("val", Y_val), ("test", Y_test))
    },
}

(OUT_DIR / "engineered_meta.json").write_text(
    json.dumps(out_meta, ensure_ascii=False, indent=2),
    encoding="utf-8",
)
print("Saved engineered_meta.json")

# --- Save feature names (if available) ---
# Only done when names exist and match the original column count one-to-one.
if "feature_names" in globals() and feature_names is not None and len(feature_names) == n_features:
    sel_names = feature_names[mask]
    # Save as a fixed-width unicode array: np.save would pickle an object-dtype array,
    # and such a file can only be read back with np.load(..., allow_pickle=True).
    np.save(OUT_DIR / "selected_feature_names.npy", sel_names.astype(str))
    # Plain-text copy, one feature name per line.
    with open(OUT_DIR / "selected_feature_names.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{x}\n" for x in sel_names.tolist())
    print("Saved selected_feature_names.npy/.txt")

print("✅ Saved Feature_Selected data:", OUT_DIR.resolve())

OUT_DIR: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Feature_Selected\fe_v1_fs_chi2_v1
Saved X_*.npz
Saved Y_*.npy
Saved feature_mask.npy + selected_idx.npy
Saved feature_selector.joblib
Saved engineered_meta.json
Saved selected_feature_names.npy/.txt
✅ Saved Feature_Selected data: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Feature_Selected\fe_v1_fs_chi2_v1


In [None]:
# 10_train_improvement’da qanday o‘qiladi?

# Endi 10’da Engineered_data emas, Feature_Selected dan o‘qitasiz:

# FS_VERSION_OUT = "fe_v1_fs_chi2_v1"
# FEATURE_SELECTED_DIR = PROJECT_ROOT / "Data" / "Feature_Selected" / FS_VERSION_OUT
# X_train = sparse.load_npz(FEATURE_SELECTED_DIR / "X_train.npz")
# Y_train = np.load(FEATURE_SELECTED_DIR / "Y_train.npy")
...

In [None]:
# 10_train_improvement uchun nima o‘zgaradi?

# Endi 10_train_improvement’da faqat shu 2 ta config o‘zgaradi:

# FE_VERSION = "fe_v1_fs_chi2_v1"
# ENGINEERED_DIR = PROJECT_ROOT / "Data" / "Feature_Selected" / FE_VERSION  # Engineered_data emas

# Shunda model kamroq feature bilan tezroq train bo‘ladi va ko‘pincha generalization yaxshilanadi.