In [1]:
# CELL 1 — Imports + Paths
from pathlib import Path
import json
import re
import numpy as np
import pandas as pd

from scipy import sparse
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# --- Locate the project root ---
# The notebook may be started either from the project root itself or from a
# sub-folder one level below it; the first candidate that contains a "Data"
# directory wins, and the current working directory is the fallback.
CWD = Path.cwd()
PROJECT_ROOT = next(
    (base for base in (CWD, CWD.parent) if (base / "Data").exists()),
    CWD,
)

# --- Folder holding the train/val/test split CSVs (the name must match exactly) ---
SPLIT_DIR = PROJECT_ROOT.joinpath("Data", "Processed", "splits_multilabel_noleakage")

# --- Where the preprocessing artifacts are written (created if missing) ---
OUT_DIR = PROJECT_ROOT.joinpath("Data", "Preprocessed_data", "baseline")
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("SPLIT_DIR exists:", SPLIT_DIR.exists(), SPLIT_DIR)
print("OUT_DIR:", OUT_DIR)

PROJECT_ROOT: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract
SPLIT_DIR exists: True c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Processed\splits_multilabel_noleakage
OUT_DIR: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Preprocessed_data\baseline


In [2]:
# CELL 2 — Load train/val/test + Columns
TRAIN_CSV = SPLIT_DIR / "train.csv"
VAL_CSV   = SPLIT_DIR / "val.csv"
TEST_CSV  = SPLIT_DIR / "test.csv"

# Read the three splits in train -> val -> test order.
df_train, df_val, df_test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (TRAIN_CSV, VAL_CSV, TEST_CSV)
)

# Column holding the free-text symptom description; it must exist in train.
TEXT_COL = "REAC_pt_symptom_v2"
if TEXT_COL not in df_train.columns:
    raise ValueError(f"TEXT_COL topilmadi: {TEXT_COL}")

# Label columns are every "y_*" column except the "y_labels" column,
# kept in sorted order so the label matrix has a stable column layout.
y_cols = sorted(
    col for col in df_train.columns
    if col.startswith("y_") and col != "y_labels"
)
if not y_cols:
    raise ValueError("y_ label ustunlari topilmadi.")

print("Shapes:", df_train.shape, df_val.shape, df_test.shape)
print("Num labels:", len(y_cols))
print("First labels:", y_cols[:8])

# Sanity check: per split, count rows with blank text and rows with no positive label.
for split_name, split_df in (("train", df_train), ("val", df_val), ("test", df_test)):
    blank_text_rows = split_df[TEXT_COL].fillna("").astype(str).str.strip().eq("").sum()
    unlabeled_rows  = split_df[y_cols].sum(axis=1).eq(0).sum()
    print(f"{split_name}: empty_text={int(blank_text_rows)} | zero_label={int(unlabeled_rows)}")

Shapes: (201176, 24) (24410, 24) (24164, 24)
Num labels: 21
First labels: ['y_cardiovascular', 'y_dermatologic', 'y_edema_swelling', 'y_gastrointestinal', 'y_general_systemic', 'y_hematologic', 'y_hepatic', 'y_hypersensitivity_allergy']
train: empty_text=0 | zero_label=0
val: empty_text=0 | zero_label=0
test: empty_text=0 | zero_label=0


In [3]:
# Text normalization + X/Y preparation

# This is sufficient for the baseline and brings the FAERS text closer to
# what a user would type freely.

_ws = re.compile(r"\s+")
_punct = re.compile(r"[,\t\r\n]+")

def normalize_text(s: object) -> str:
    """Normalize one symptom string for TF-IDF vectorization.

    The text is lower-cased, ";" and "," separators (plus tabs/newlines) are
    turned into spaces, runs of whitespace are collapsed to a single space,
    and leading/trailing whitespace is removed.

    Parameters
    ----------
    s : object
        The raw value. Non-string values are converted with ``str``. Missing
        values (``None``, float NaN, ``pd.NA``) are treated as empty text.

    Returns
    -------
    str
        The normalized text; ``""`` for missing input.
    """
    # pandas hands missing cells to Series.map as NaN / pd.NA, not None.
    # Without this guard they would become the literal tokens "nan" / "<na>".
    if s is None or s is pd.NA or (isinstance(s, (float, np.floating)) and s != s):
        return ""
    s = str(s).lower()
    # FAERS preferred terms arrive separated by ";" — turn them into spaces
    # so the text looks like free-form user input.
    s = s.replace(";", " ")
    s = _punct.sub(" ", s)
    s = _ws.sub(" ", s)
    # Strip last: replacing a leading/trailing separator leaves a space behind.
    return s.strip()

def _normalized_texts(frame):
    """Return the normalized TEXT_COL values of one split as a NumPy array."""
    return frame[TEXT_COL].map(normalize_text).values


def _label_matrix(frame):
    """Return the y_cols of one split as an int8 array; unparseable cells become 0."""
    as_numbers = frame[y_cols].apply(pd.to_numeric, errors="coerce")
    return as_numbers.fillna(0).astype(np.int8).values


# Normalized text per split.
Xtr_text, Xva_text, Xte_text = map(_normalized_texts, (df_train, df_val, df_test))

# Y (multi-label indicator matrix) per split.
Ytr, Yva, Yte = map(_label_matrix, (df_train, df_val, df_test))

print("Example text:", Xtr_text[0])
print("Y shapes:", Ytr.shape, Yva.shape, Yte.shape)

Example text: injection site reaction general physical health deterioration chest discomfort sensitivity to weather change fatigue dysphonia wheezing pain influenza productive cough nasopharyngitis weight decreased nasal congestion hypoventilation illness blood pressure decreased forced expiratory volume decreased hypersensitivity dyspnoea body temperature decreased
Y shapes: (201176, 21) (24410, 21) (24164, 21)


In [4]:
# CELL 4 — TF-IDF (char + word) → combined feature matrix (best baseline)

# Settings common to both vectorizers.
_common_tfidf_kwargs = dict(max_df=0.95, sublinear_tf=True, dtype=np.float32)

# Character n-gram TF-IDF: intended to be tolerant of typos and spelling variants.
tfidf_char = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 5),
    min_df=3,
    max_features=180_000,
    **_common_tfidf_kwargs,
)

# Word n-gram TF-IDF: easier to interpret (which words carry the signal).
tfidf_word = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    min_df=5,
    max_features=90_000,
    token_pattern=r"(?u)\b[a-z0-9][a-z0-9\-\_]+\b",
    **_common_tfidf_kwargs,
)

# Each vectorizer is fitted on the training text only and then applied to val/test.
print("[1/2] Fit/transform CHAR TF-IDF ...")
Xtr_c = tfidf_char.fit_transform(Xtr_text)
Xva_c, Xte_c = (tfidf_char.transform(texts) for texts in (Xva_text, Xte_text))

print("[2/2] Fit/transform WORD TF-IDF ...")
Xtr_w = tfidf_word.fit_transform(Xtr_text)
Xva_w, Xte_w = (tfidf_word.transform(texts) for texts in (Xva_text, Xte_text))

# Combine: char features first, word features second, stored as CSR.
Xtr, Xva, Xte = (
    hstack([char_part, word_part], format="csr")
    for char_part, word_part in ((Xtr_c, Xtr_w), (Xva_c, Xva_w), (Xte_c, Xte_w))
)

print("X shapes:", Xtr.shape, Xva.shape, Xte.shape)
print("nnz train:", Xtr.nnz)

[1/2] Fit/transform CHAR TF-IDF ...
[2/2] Fit/transform WORD TF-IDF ...
X shapes: (201176, 58921) (24410, 58921) (24164, 58921)
nnz train: 27079723


In [5]:
# Save (sparse X + Y + vectorizer bundle + metadata)

def _vectorizer_params(vec, extra_keys=()):
    """Describe a vectorizer's constructor settings in JSON-serializable form.

    The values are read from the vectorizer object itself rather than being
    re-typed by hand, so the saved metadata cannot drift from the settings
    that were actually used.

    Parameters
    ----------
    vec : the vectorizer whose settings are recorded.
    extra_keys : iterable of str
        Additional attribute names to copy verbatim (e.g. "token_pattern").

    Returns
    -------
    dict
        Setting name -> plain Python value.
    """
    params = {
        "analyzer": vec.analyzer,
        "ngram_range": list(vec.ngram_range),  # tuple -> list for JSON
        "min_df": vec.min_df,
        "max_df": vec.max_df,
        "sublinear_tf": vec.sublinear_tf,
        "max_features": vec.max_features,
        "dtype": np.dtype(vec.dtype).name,     # e.g. np.float32 -> "float32"
    }
    for key in extra_keys:
        params[key] = getattr(vec, key)
    return params


# --- Save sparse feature matrices ---
sparse.save_npz(OUT_DIR / "X_train.npz", Xtr)
sparse.save_npz(OUT_DIR / "X_val.npz",   Xva)
sparse.save_npz(OUT_DIR / "X_test.npz",  Xte)

# --- Save labels ---
np.save(OUT_DIR / "Y_train.npy", Ytr)
np.save(OUT_DIR / "Y_val.npy",   Yva)
np.save(OUT_DIR / "Y_test.npy",  Yte)

# --- Save row ids (useful for tracing rows back to the source) ---
# The column must be present in every split; checking train alone could
# raise a KeyError on val/test after some id files were already written.
_id_frames = {"train": df_train, "val": df_val, "test": df_test}
_has_id = {split: "primaryid" in frame.columns for split, frame in _id_frames.items()}
if all(_has_id.values()):
    for split, frame in _id_frames.items():
        np.save(OUT_DIR / f"id_{split}.npy", frame["primaryid"].values)
elif any(_has_id.values()):
    print("WARNING: 'primaryid' is missing in some splits; id arrays not saved:", _has_id)

# --- Save vectorizer bundle ---
bundle = {
    "text_col": TEXT_COL,
    "y_cols": y_cols,
    "tfidf_char": tfidf_char,
    "tfidf_word": tfidf_word,
    "version": "baseline_preprocess_v3_noleakage",
}
joblib.dump(bundle, OUT_DIR / "tfidf_bundle.joblib")

# --- Meta ---
meta = {
    "text_col": TEXT_COL,
    "num_labels": int(len(y_cols)),
    "num_features_char": int(len(tfidf_char.get_feature_names_out())),
    "num_features_word": int(len(tfidf_word.get_feature_names_out())),
    "num_features_total": int(Xtr.shape[1]),
    "rows_train": int(Xtr.shape[0]),
    "rows_val": int(Xva.shape[0]),
    "rows_test": int(Xte.shape[0]),
    "char_params": _vectorizer_params(tfidf_char),
    # The word vectorizer uses a custom token_pattern; record it so the
    # metadata fully describes how the features were built.
    "word_params": _vectorizer_params(tfidf_word, extra_keys=("token_pattern",)),
}
(OUT_DIR / "preprocess_meta.json").write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")

print("SAVED =>", OUT_DIR)
print(meta)



# Resulting files inside OUT_DIR (Data/Preprocessed_data/baseline/):

# X_train.npz / X_val.npz / X_test.npz

# Y_train.npy / Y_val.npy / Y_test.npy

# id_train.npy / id_val.npy / id_test.npy (only if every split has primaryid)

# tfidf_bundle.joblib

# preprocess_meta.json

SAVED => c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Preprocessed_data\baseline
{'text_col': 'REAC_pt_symptom_v2', 'num_labels': 21, 'num_features_char': 28926, 'num_features_word': 29995, 'num_features_total': 58921, 'rows_train': 201176, 'rows_val': 24410, 'rows_test': 24164, 'char_params': {'analyzer': 'char_wb', 'ngram_range': [3, 5], 'min_df': 3, 'max_df': 0.95, 'sublinear_tf': True, 'max_features': 180000, 'dtype': 'float32'}, 'word_params': {'analyzer': 'word', 'ngram_range': [1, 2], 'min_df': 5, 'max_df': 0.95, 'sublinear_tf': True, 'max_features': 90000, 'dtype': 'float32'}}


In [6]:
# CELL 6 — Quick "everything is in order" check
# 1) Show the distinct values present in Y (expected to be only 0 and 1).
vals = {label_value for label_value in np.unique(Ytr)}
print("Unique Y values:", vals)

# 2) Confirm the stored (non-zero) entries of the sparse X contain no NaN.
print("Xtr has NaN:", np.any(np.isnan(Xtr.data)))

# 3) Top 20 word features by mean TF-IDF over the training rows (for interpretation).
feat_word = tfidf_word.get_feature_names_out()
mean_w = np.asarray(Xtr_w.mean(axis=0)).ravel()
top_idx = np.argsort(mean_w)[::-1][:20]

pd.DataFrame({
    "term": feat_word[top_idx],
    "mean_tfidf": np.round(mean_w[top_idx], 6),
})

Unique Y values: {np.int8(0), np.int8(1)}
Xtr has NaN: False


Unnamed: 0,term,mean_tfidf
0,pain,0.025193
1,rash,0.019115
2,site,0.016978
3,infection,0.015799
4,diarrhoea,0.015617
5,injection,0.015534
6,injection site,0.015447
7,increased,0.014365
8,decreased,0.013348
9,nausea,0.013016
