# txt datalarni oqish

In [4]:
from pathlib import Path
import pandas as pd

ASCII_DIR = Path(r"C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\txt_datalar")  # folder with the raw FAERS .txt files; change this to wherever they live on your machine

def read_faers_txt(path: Path, usecols=None, dtype=str):
    """
    Read a "$"-delimited FAERS ASCII file into a DataFrame.

    The file is decoded as UTF-8 first; if that raises UnicodeDecodeError
    the read is retried with latin-1.

    Parameters:
        path: file to read.
        usecols: optional column subset, forwarded to ``pd.read_csv``.
        dtype: dtype forwarded to ``pd.read_csv`` (defaults to ``str`` so
            IDs keep their exact text).

    Returns:
        The parsed DataFrame without any "Unnamed..." columns (these appear
        when the header line ends with a trailing delimiter).

    Raises:
        UnicodeDecodeError: if every attempted encoding fails.
    """
    last_error = None
    for enc in ("utf-8", "latin-1"):
        try:
            df = pd.read_csv(
                path,
                sep="$",
                dtype=dtype,
                encoding=enc,
                low_memory=False,
                usecols=usecols,
                # NOTE: malformed rows are dropped silently; remove this
                # option if every row must be accounted for.
                on_bad_lines="skip",
            )
        except UnicodeDecodeError as err:
            # Remember the failure so it can be re-raised if no encoding works.
            last_error = err
            continue
        # Drop the empty trailing column(s) pandas names "Unnamed: N".
        keep = ~df.columns.astype(str).str.contains("^Unnamed", regex=True)
        return df.loc[:, keep]
    # The original bare `raise` here had no active exception to re-raise.
    raise last_error

def find_file(stem: str) -> Path:
    """
    Locate the file in ASCII_DIR whose name starts with ``stem``
    (e.g. "DEMO25Q4" matches both DEMO25Q4.txt and DEMO25Q4).

    When several entries match, one with a ".txt" suffix wins; remaining
    ties are broken by file name.

    Raises:
        FileNotFoundError: if nothing in ASCII_DIR starts with ``stem``.
    """
    candidates = list(ASCII_DIR.glob(f"{stem}*"))
    if len(candidates) == 0:
        raise FileNotFoundError(f"{stem} topilmadi. {ASCII_DIR} ichini tekshiring.")

    def rank(candidate):
        # False sorts before True, so ".txt" files come first.
        return (candidate.suffix != ".txt", candidate.name)

    return min(candidates, key=rank)

# Names of the eight tables to load
names = ["DEMO25Q4", "DRUG25Q4", "REAC25Q4", "INDI25Q4", "OUTC25Q4", "THER25Q4", "RPSR25Q4", "DELETE25Q4"]

paths = {n: find_file(n) for n in names}
print(paths)

# The files are large, so preview only the column names first.
# nrows=0 parses just the header line; previously every file was read in
# full here and then read in full a second time below.
# latin-1 is used because it can decode any byte sequence.
for n, p in paths.items():
    df_head = pd.read_csv(p, sep="$", dtype=str, encoding="latin-1", nrows=0)
    # Same cleanup as read_faers_txt: drop "Unnamed..." trailing columns.
    df_head = df_head.loc[:, ~df_head.columns.astype(str).str.contains("^Unnamed", regex=True)]
    print(n, "columns:", list(df_head.columns))

# If the headers look right, load every table in full:
demo  = read_faers_txt(paths["DEMO25Q4"])
drug  = read_faers_txt(paths["DRUG25Q4"])
reac  = read_faers_txt(paths["REAC25Q4"])
indi  = read_faers_txt(paths["INDI25Q4"])
outc  = read_faers_txt(paths["OUTC25Q4"])
ther  = read_faers_txt(paths["THER25Q4"])
rpsr  = read_faers_txt(paths["RPSR25Q4"])
dele  = read_faers_txt(paths["DELETE25Q4"])

print("Shapes:")
print("DEMO", demo.shape)
print("DRUG", drug.shape)
print("REAC", reac.shape)
print("INDI", indi.shape)
print("OUTC", outc.shape)
print("THER", ther.shape)
print("RPSR", rpsr.shape)
print("DELETE", dele.shape)

{'DEMO25Q4': WindowsPath('C:/Users/xolmu/OneDrive/Desktop/Modul Program oyi/Modul_Program3/6_project_dori_tasiri_extract/Data/Raw_data/txt_datalar/DEMO25Q4.txt'), 'DRUG25Q4': WindowsPath('C:/Users/xolmu/OneDrive/Desktop/Modul Program oyi/Modul_Program3/6_project_dori_tasiri_extract/Data/Raw_data/txt_datalar/DRUG25Q4.txt'), 'REAC25Q4': WindowsPath('C:/Users/xolmu/OneDrive/Desktop/Modul Program oyi/Modul_Program3/6_project_dori_tasiri_extract/Data/Raw_data/txt_datalar/REAC25Q4.txt'), 'INDI25Q4': WindowsPath('C:/Users/xolmu/OneDrive/Desktop/Modul Program oyi/Modul_Program3/6_project_dori_tasiri_extract/Data/Raw_data/txt_datalar/INDI25Q4.txt'), 'OUTC25Q4': WindowsPath('C:/Users/xolmu/OneDrive/Desktop/Modul Program oyi/Modul_Program3/6_project_dori_tasiri_extract/Data/Raw_data/txt_datalar/OUTC25Q4.txt'), 'THER25Q4': WindowsPath('C:/Users/xolmu/OneDrive/Desktop/Modul Program oyi/Modul_Program3/6_project_dori_tasiri_extract/Data/Raw_data/txt_datalar/THER25Q4.txt'), 'RPSR25Q4': WindowsPath('C:

In [7]:
# DELETE’ni to‘g‘ri o‘qish (header yo‘q muammosi bilan)

# =========================
# BOX 1 — READ DELETE (header yo'q)
# =========================
from pathlib import Path
import pandas as pd

# 1) Folder holding the raw .txt files (actual location on this machine)
TXT_DIR = Path(r"C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\txt_datalar")

# 2) Quarter suffix only; it is appended to each table name (e.g. DELETE + Q)
Q = "25Q4"

# ---------- Helper functions ----------
def resolve_file(stem: str) -> Path:
    """
    Find the file for ``stem`` inside TXT_DIR.

    Exact names are tried first: ``stem``, ``stem.txt``, ``stem.TXT``.
    If none exists, any regular file whose name starts with ``stem`` is
    accepted; ".txt"/".TXT" files are preferred and ties are broken by name,
    so the result does not depend on directory listing order.

    Raises:
        FileNotFoundError: if no matching file exists.
    """
    for name in (stem, f"{stem}.txt", f"{stem}.TXT"):
        candidate = TXT_DIR / name
        if candidate.exists():
            return candidate

    # Prefix fallback. glob() order is not guaranteed and can include
    # directories, so keep regular files only and sort deterministically.
    hits = sorted(
        (p for p in TXT_DIR.glob(f"{stem}*") if p.is_file()),
        key=lambda p: (p.suffix.lower() != ".txt", p.name),
    )
    if hits:
        return hits[0]

    raise FileNotFoundError(f"Topilmadi: {stem} (TXT_DIR={TXT_DIR})")

def sniff_sep(path: Path) -> str:
    """
    Guess the field separator of a text file.

    Only the first 50 lines are read (the file is streamed, not loaded
    whole). The first non-blank line among them decides: the candidate
    character occurring most often in that line is returned, with ties
    going to the earlier candidate. If no such line exists, or the line
    contains no candidate at all (e.g. a single-column file), "|" is
    returned.
    """
    cands = ("|", "\t", ",", ";", "$", "^")
    with path.open(encoding="latin1", errors="ignore") as fh:
        # zip(range(50), ...) stops after 50 lines without reading further.
        for _, line in zip(range(50), fh):
            line = line.strip()
            if not line:
                continue
            # max() returns the first candidate on ties, so "|" wins at 0.
            return max(cands, key=line.count)
    return "|"

# ---------- Read DELETE ----------
delete_path = resolve_file(f"DELETE{Q}")
sep = sniff_sep(delete_path)

# The DELETE file has no header row, so header=None is required; otherwise
# the first ID would be consumed as a column name.
# Columns that are entirely empty are dropped right away.
delete_df = pd.read_csv(
    delete_path,
    sep=sep,
    header=None,
    dtype=str,
    encoding="latin1",
    low_memory=False,
).dropna(axis=1, how="all")

if delete_df.shape[1] != 1:
    # More than one column: keep the first two and treat them as
    # (primaryid, caseid).
    delete_df = delete_df.iloc[:, :2]
    delete_df.columns = ["primaryid", "caseid"]
else:
    # Single column: a plain list of IDs.
    delete_df.columns = ["primaryid"]

# Trim surrounding whitespace from the IDs
delete_df["primaryid"] = delete_df["primaryid"].astype(str).str.strip()

print(f"DELETE path: {delete_path}")
print(f"DELETE sep : {sep!r}")
print(f"DELETE fixed shape: {delete_df.shape}")
print(f"DELETE columns: {list(delete_df.columns)}")
print(delete_df.head(10))

DELETE path: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\txt_datalar\DELETE25Q4.txt
DELETE sep : '|'
DELETE fixed shape: (4497, 1)
DELETE columns: ['primaryid']
  primaryid
0  10088013
1  11913421
2  13015075
3  13049382
4  13347877
5  13526206
6  14230628
7  14401801
8  14539298
9  14885874


In [10]:
# DELETE qaysi ID ekanini aniqlash → DEMO’ni tozalash → .txt saqlash

from pathlib import Path
import pandas as pd

# === PATHS ===
# Folder with the raw .txt files, and the quarter suffix appended to table names
TXT_DIR = Path(r"C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\txt_datalar")
Q = "25Q4"

# ---------- Helpers ----------
def resolve_file(stem: str) -> Path:
    """
    Find the file for ``stem`` inside TXT_DIR.

    Exact names are tried first: ``stem``, ``stem.txt``, ``stem.TXT``.
    If none exists, any regular file whose name starts with ``stem`` is
    accepted; ".txt"/".TXT" files are preferred and ties are broken by name,
    so the result does not depend on directory listing order.

    Raises:
        FileNotFoundError: if no matching file exists.
    """
    for name in (stem, f"{stem}.txt", f"{stem}.TXT"):
        candidate = TXT_DIR / name
        if candidate.exists():
            return candidate
    # Prefix fallback. glob() order is not guaranteed and can include
    # directories, so keep regular files only and sort deterministically.
    hits = sorted(
        (p for p in TXT_DIR.glob(f"{stem}*") if p.is_file()),
        key=lambda p: (p.suffix.lower() != ".txt", p.name),
    )
    if hits:
        return hits[0]
    raise FileNotFoundError(f"Topilmadi: {stem} (TXT_DIR={TXT_DIR})")

def sniff_sep(path: Path) -> str:
    """
    Guess the field separator of a text file.

    Only the first 50 lines are read (the file is streamed, not loaded
    whole). The first non-blank line among them decides: the candidate
    character occurring most often in that line is returned, with ties
    going to the earlier candidate. If no such line exists, or the line
    contains no candidate at all, "|" is returned.
    """
    cands = ("|", "\t", ",", ";", "$", "^")
    with path.open(encoding="latin1", errors="ignore") as fh:
        # zip(range(50), ...) stops after 50 lines without reading further.
        for _, line in zip(range(50), fh):
            line = line.strip()
            if not line:
                continue
            # max() returns the first candidate on ties, so "|" wins at 0.
            return max(cands, key=line.count)
    return "|"

def read_faers_with_header(path: Path) -> pd.DataFrame:
    """
    Read a delimited text file that has a header row.

    The separator is guessed with sniff_sep, every column is read as text,
    "Unnamed..." columns are dropped, and the remaining column names are
    stripped and lower-cased.
    """
    frame = pd.read_csv(
        path,
        sep=sniff_sep(path),
        dtype=str,
        low_memory=False,
        encoding="latin1",
    )
    named = ~frame.columns.str.contains("^Unnamed")
    frame = frame.loc[:, named]
    frame.columns = [label.strip().lower() for label in frame.columns]
    return frame

def read_delete_no_header(path: Path) -> pd.DataFrame:
    """
    Read a header-less ID list and return it as a one-column DataFrame.

    The separator is guessed with sniff_sep, all-empty columns are dropped,
    only the first remaining column is kept, and it is named "id_raw" and
    cast to str.
    """
    raw = pd.read_csv(
        path,
        sep=sniff_sep(path),
        header=None,
        dtype=str,
        low_memory=False,
        encoding="latin1",
    )
    ids = raw.dropna(axis=1, how="all").iloc[:, :1]  # a single column is expected here
    ids.columns = ["id_raw"]
    ids["id_raw"] = ids["id_raw"].astype(str)
    return ids

def norm_id(s: pd.Series) -> pd.Series:
    """
    Normalize IDs to digit-only strings.

    Each value is cast to str, stripped, and every run of non-digit
    characters is removed (this also clears stray spaces and hidden
    characters).
    """
    trimmed = s.astype(str).str.strip()
    digits_only = trimmed.str.replace(r"\D+", "", regex=True)
    return digits_only

# ---------- Load DEMO + DELETE ----------
demo_path = resolve_file(f"DEMO{Q}")
delete_path = resolve_file(f"DELETE{Q}")

demo = read_faers_with_header(demo_path)
delete_df = read_delete_no_header(delete_path)

if "primaryid" not in demo.columns:
    raise ValueError(f"DEMO ichida primaryid yo'q. demo.columns={list(demo.columns)}")

# Normalize IDs on both sides so they compare as digit-only strings
demo["primaryid_n"] = norm_id(demo["primaryid"])
delete_df["id_n"] = norm_id(delete_df["id_raw"])

# Cleanup: drop empty IDs; the set removes duplicates
delete_df = delete_df[delete_df["id_n"].notna() & (delete_df["id_n"] != "")]
delete_ids = set(delete_df["id_n"].drop_duplicates())

# Normalize caseid as well when DEMO has it
has_caseid = "caseid" in demo.columns
if has_caseid:
    demo["caseid_n"] = norm_id(demo["caseid"])

# ---------- Decide: do the DELETE IDs refer to primaryid or caseid? ----------
overlap_primary = demo["primaryid_n"].isin(delete_ids).sum()
overlap_case = demo["caseid_n"].isin(delete_ids).sum() if has_caseid else 0

print("DEMO shape:", demo.shape)
print("DELETE rows:", len(delete_ids))
print("Overlap with DEMO.primaryid:", int(overlap_primary))
print("Overlap with DEMO.caseid  :", int(overlap_case))

# Whichever column overlaps more is treated as the DELETE key
if overlap_primary == 0 and overlap_case == 0:
    print("\n⚠️ Ikkalasida ham overlap 0. Demak DELETE boshqa kvartalniki yoki ID format boshqacha.")
    # Nothing to remove: demo_clean is just a copy of demo
    demo_clean = demo.copy()
else:
    if overlap_case > overlap_primary:
        key_col = "caseid_n"
        print("\n✅ DELETE bu yerda CASEIDga mos kelmoqda.")
    else:
        key_col = "primaryid_n"
        print("\n✅ DELETE bu yerda PRIMARYIDga mos kelmoqda.")

    before = len(demo)
    demo_clean = demo[~demo[key_col].isin(delete_ids)].copy()
    after = len(demo_clean)

    print(f"DEMO before: {before:,}")
    print(f"DEMO after : {after:,}")
    print(f"Removed    : {before - after:,}")

# ---------- Save demo_clean as .txt into txt_datalar ----------
# The normalized columns were helpers only; drop them before saving
drop_cols = [c for c in ["primaryid_n", "caseid_n"] if c in demo_clean.columns]
demo_clean_out = demo_clean.drop(columns=drop_cols, errors="ignore")

out_path = TXT_DIR / f"DEMO{Q}_clean.txt"
demo_clean_out.to_csv(out_path, sep="|", index=False, encoding="latin1")
print("Saved:", out_path)

# OUT_DIR was previously only defined in a later cell, so running the
# notebook top to bottom raised NameError here. Define it locally
# (Data/Processed, next to Data/Raw_data) and make sure it exists.
OUT_DIR = TXT_DIR.parent.parent / "Processed"
OUT_DIR.mkdir(parents=True, exist_ok=True)

delete_out = OUT_DIR / f"DELETE{Q}_clean_sanitized_.txt"
delete_df.to_csv(delete_out, sep="|", index=False, encoding="latin1")
print("Saved delete_df to:", delete_out)

DEMO shape: (385288, 27)
DELETE rows: 4497
Overlap with DEMO.primaryid: 0
Overlap with DEMO.caseid  : 0

⚠️ Ikkalasida ham overlap 0. Demak DELETE boshqa kvartalniki yoki ID format boshqacha.
Saved: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\txt_datalar\DEMO25Q4_clean.txt
Saved delete_df to: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Processed\DELETE25Q4_clean_sanitized_.txt


In [11]:
# Quick check: compare the ID length distributions of DEMO and DELETE
demo_primary_len = (
    demo["primaryid"]
    .astype(str)
    .str.strip()
    .str.len()
    .value_counts()
    .iloc[:10]
)
delete_len = (
    delete_df["id_raw"]
    .astype(str)
    .str.strip()
    .str.replace(r"\D+","",regex=True)
    .str.len()
    .value_counts()
    .iloc[:10]
)

print(f"DEMO primaryid length counts:\n {demo_primary_len}")
print(f"\nDELETE id length counts:\n {delete_len}")

# A few raw samples from each side for eyeballing the formats
print("\nSample DEMO primaryid:", demo["primaryid"].astype(str).str.strip().iloc[:5].tolist())
if "caseid" in demo.columns:
    print("Sample DEMO caseid   :", demo["caseid"].astype(str).str.strip().iloc[:5].tolist())
print("Sample DELETE ids     :", delete_df["id_raw"].astype(str).str.strip().iloc[:5].tolist())

DEMO primaryid length counts:
 primaryid
9     380230
10      4996
8         59
11         3
Name: count, dtype: int64

DELETE id length counts:
 id_raw
8    4494
7       3
Name: count, dtype: int64

Sample DEMO primaryid: ['100324053', '1012809821', '101406268', '101515934', '1016133068']
Sample DEMO caseid   : ['10032405', '10128098', '10140626', '10151593', '10161330']
Sample DELETE ids     : ['10088013', '11913421', '13015075', '13049382', '13347877']


In [None]:
# hozir qilgan ishimizni aniq “hisobot” qilib aytaman — nima qilindi, nega qilindi, qilinmasa nima bo‘ladi, natija nima bo‘ldi.

# 1) Hozir nima ish qilindi?

# FAERS ASCII DELETE25Q4.txt faylini topdik.

# Uni to‘g‘ri o‘qishga keltirdik:

# DELETE faylida header yo‘q ekan.

# Avval pandas 1-qatorni header deb olib, ustun nomi 10088013 bo‘lib qolgan edi.

# Biz header=None qilib o‘qib, ustunini primaryid (yoki id_raw) deb qo‘ydik.

# DELETEdagi ID’larni sanitize qildik (tozaladik):

# strip() (bo‘sh joylarni qirqish)

# bo‘sh qatorlarni chiqarish

# faqat raqamlarni qoldirish (\Dlarni olib tashlash)

# dublikatlarni olib tashlash

# DEMO25Q4.txt faylini o‘qidik.

# DELETEdagi ID’lar DEMOdagi primaryid yoki caseid bilan mos tushadimi, yo‘qmi — diagnostika qildik:

# Overlap with DEMO.primaryid

# Overlap with DEMO.caseid

# ID uzunliklarini ham tekshirdik (format mosligini tushunish uchun):

# DEMO.primaryid ko‘p hollarda 9 raqam,

# DEMO.caseid 8 raqam,

# DELETE ID’lar ham 8 raqam.

# 2) Bu ish nega kerak?

# FAERS’da DELETE fayli — “bekor qilingan / chiqarib tashlangan reportlar ro‘yxati”.
# Agar bu ID’lar DEMO va boshqa jadvallarda mavjud bo‘lsa:

# ularni datasetdan chiqarib tashlash kerak,

# aks holda ML/analizga “bekor qilingan” (yaroqsiz) reportlar aralashadi.

# Shuning uchun biz DELETEni merge’dan oldin tekshirdik — bu data cleaningning standart qismi.

# 3) Nega aynan hozir qilindi?

# Merge’dan oldin qilishning foydasi:

# keraksiz satrlar merge jarayoniga kirib ketmaydi,

# dataset kichikroq bo‘ladi,

# keyinchalik debug qilish oson bo‘ladi.

# Ya’ni bu bosqich — “merge’dan oldingi sanitariya”.

# 4) Qilinmasa nima bo‘lardi?

# Sizning hozirgi holatda hech narsa bo‘lmasdi, chunki DELETE ID’lari DEMO’da topilmadi (overlap 0).
# Demak:

# DELETEni umuman ishlatmasangiz ham, 25Q4 DEMO o‘zgarmaydi.

# Ammo umumiy qoida sifatida:

# agar overlap bo‘lganida, qilinmasa “deleted” reportlar datasetga kirib, natijalarni shovqinli qilardi.

# 5) Natija nima bo‘ldi?

# Natija: DELETE25Q4 bu DEMO25Q4ga ta’sir qilmadi.

# Siz olgan aniq faktlar:

# DELETE rows: 4497

# Overlap with DEMO.primaryid: 0

# Overlap with DEMO.caseid: 0

# shuning uchun Removed: 0

# Bu nimani anglatadi?

# DELETE25Q4 ichidagi ID’lar Siz o‘qiyotgan DEMO25Q4 faylida yo‘q.

# Demak 25Q4 paketiga kelganda, o‘sha “delete bo‘lishi kerak bo‘lgan” case’lar allaqachon paketdan chiqarib yuborilgan (yoki boshqa paket/kvartalga tegishli).

# Qisqa xulosa

# Qilingan ish: DELETE faylini to‘g‘ri o‘qish + DEMO bilan mosligini tekshirish.

# Nega: deleted reportlar datasetga kirib ketmasligi uchun.

# Qilinmasa: Sizning hozirgi 25Q4’da farq bo‘lmaydi (chunki overlap 0), lekin umumiy datasetlarda bu muhim.

# Natija: DEMO o‘zgarmadi (Removed = 0). Endi merge bosqichiga DEMO ni o‘z holicha (yoki nomi demo_clean bo‘lsa ham u demo bilan bir xil) olib kiramiz.

# Merge

In [1]:
# Setup + helper funksiyalar

from pathlib import Path
import re
import pandas as pd

# ====== Folder with the raw .txt files, and the quarter suffix ======
TXT_DIR = Path(r"C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\txt_datalar")
Q = "25Q4"

# ====== Output ======
# Two levels above TXT_DIR, then "Processed"; created if it does not exist
OUT_DIR = TXT_DIR.parent.parent / "Processed"   # Data/Processed
OUT_DIR.mkdir(parents=True, exist_ok=True)

def sniff_sep(path: Path) -> str:
    """
    Guess the field separator of a text file.

    Only the first 50 lines are read (the file is streamed, not loaded
    whole). The first non-blank line among them decides: the candidate
    character occurring most often in that line is returned, with ties
    going to the earlier candidate. If no such line exists, or the line
    contains no candidate at all, "|" is returned.
    """
    cands = ("|", "\t", ",", ";", "$", "^")
    with path.open(encoding="latin1", errors="ignore") as fh:
        # zip(range(50), ...) stops after 50 lines without reading further.
        for _, line in zip(range(50), fh):
            line = line.strip()
            if not line:
                continue
            # max() returns the first candidate on ties, so "|" wins at 0.
            return max(cands, key=line.count)
    return "|"

def read_faers(path: Path) -> pd.DataFrame:
    """
    Read a delimited text file that has a header row.

    The separator is guessed with sniff_sep, every column is read as text,
    "Unnamed..." columns are dropped, and the remaining column names are
    stripped and lower-cased.
    """
    table = pd.read_csv(
        path,
        sep=sniff_sep(path),
        dtype=str,
        low_memory=False,
        encoding="latin1",
    )
    named = ~table.columns.str.contains("^Unnamed")
    table = table.loc[:, named]
    table.columns = [label.strip().lower() for label in table.columns]
    return table

def uniq_join(s: pd.Series, max_items: int = 50) -> str:
    """
    Join the distinct non-empty values of ``s`` into one "; "-separated string.

    Missing values are skipped, each value is cast to str and stripped, and
    first-seen order is preserved. At most ``max_items`` values are kept;
    note that at least one value is always emitted when any exists, even
    for ``max_items <= 0``.
    """
    stripped = (text.strip() for text in s.dropna().astype(str))
    # dict.fromkeys de-duplicates while keeping insertion order.
    ordered_unique = list(dict.fromkeys(text for text in stripped if text))
    limit = max(max_items, 1)
    return "; ".join(ordered_unique[:limit])

def agg_table(df: pd.DataFrame, key: str, cols: list[str], prefix: str, max_items: int = 50) -> pd.DataFrame:
    """
    Collapse ``df`` to one row per ``key``.

    The result has ``key``, a ``{prefix}_n_rows`` column with the row count
    of each group, and one ``{prefix}_{col}`` column per requested column
    holding its distinct values joined by uniq_join. Requested columns that
    are missing from ``df`` are skipped. Groups appear in first-seen order.

    Raises:
        ValueError: if ``key`` is not a column of ``df``.
    """
    if key not in df.columns:
        raise ValueError(f"{prefix}: '{key}' ustuni topilmadi. columns={list(df.columns)}")
    present = [name for name in cols if name in df.columns]

    work = df[[key] + present].copy()
    work[key] = work[key].astype(str).str.strip()

    grouped = work.groupby(key, sort=False)
    sizes = grouped.size()
    out = pd.DataFrame({key: sizes.index, f"{prefix}_n_rows": sizes.values})

    for name in present:
        # Each column is de-duplicated on its own, so values in different
        # output columns are not aligned to the same source row.
        joined = grouped[name].apply(lambda values: uniq_join(values, max_items=max_items))
        out[f"{prefix}_{name}"] = joined.values

    return out

In [2]:
# DEMO’ni o‘qish + qolgan fayllarni avtomatik topish (DEMO/DELETE dan tashqari)

# ====== DEMO ======
demo_path = TXT_DIR / f"DEMO{Q}.txt"
if not demo_path.exists():
    # The extension may be hidden or the name slightly different, so fall
    # back to the first entry whose name starts with DEMO<Q>.
    demo_path = next(TXT_DIR.glob(f"DEMO{Q}*"), None)
    if demo_path is None:
        raise FileNotFoundError(f"DEMO{Q} topilmadi: {TXT_DIR}")

demo = read_faers(demo_path)

if "primaryid" not in demo.columns:
    raise ValueError(f"DEMO ichida primaryid yo'q. DEMO columns={list(demo.columns)}")

demo["primaryid"] = demo["primaryid"].astype(str).str.strip()
valid_ids = set(demo["primaryid"].dropna())

print(f"DEMO: {demo.shape} | valid PRIMARYIDs: {len(valid_ids)}")
print(f"DEMO path: {demo_path}")

# ====== Remaining .txt tables for this quarter (everything but DEMO/DELETE) ======
all_txt = sorted(TXT_DIR.glob(f"*{Q}*.txt"))
other_txt = [
    p for p in all_txt
    if not p.name.upper().startswith(("DEMO", "DELETE"))
]

print(f"\nOther txt files found: {len(other_txt)}")
for p in other_txt:
    print(f" - {p.name}")

DEMO: (385288, 25) | valid PRIMARYIDs: 385288
DEMO path: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\txt_datalar\DEMO25Q4.txt

Other txt files found: 6
 - DRUG25Q4.txt
 - INDI25Q4.txt
 - OUTC25Q4.txt
 - REAC25Q4.txt
 - RPSR25Q4.txt
 - THER25Q4.txt


In [3]:
# Aggregate each remaining .txt table (the files in other_txt) to one row per
# primaryid, left-merge it into DEMO, then save the result as a single CSV

# Which columns to take from which table (the most useful ones).
# Columns that do not exist in a table are skipped by agg_table.
PICK = {
    "DRUG": ["drugname", "prod_ai", "role_cod", "route", "dose_amt", "dose_unit", "dose_freq"],
    "REAC": ["pt"],
    "INDI": ["indi_pt"],
    "OUTC": ["outc_cod"],
    "RPSR": ["rpsr_cod"],
    "THER": ["start_dt", "end_dt", "dur", "dur_cod", "dsg_drug_seq", "drug_seq"],
    # any other table that shows up is handled by the generic branch below
}

merged = demo.copy()

for path in other_txt:
    fname = path.stem.upper()     # e.g. DRUG25Q4
    # Extract the table prefix (DRUG / REAC / ...) by removing the quarter suffix
    m = re.match(r"^([A-Z]+)" + re.escape(Q) + r"$", fname)
    prefix = m.group(1) if m else re.sub(re.escape(Q) + r"$", "", fname)

    df = read_faers(path)

    if "primaryid" not in df.columns:
        print(f"\nSKIP {path.name}: primaryid yo'q")
        continue

    # Speed-up: keep only rows whose primaryid exists in DEMO
    df["primaryid"] = df["primaryid"].astype(str).str.strip()
    df = df[df["primaryid"].isin(valid_ids)]

    cols = PICK.get(prefix, None)
    if cols is None:
        # Unknown table: take only the first 3 columns other than primaryid
        other_cols = [c for c in df.columns if c != "primaryid"][:3]
        cols = other_cols

    print(f"\nProcessing {prefix}: file={path.name} | rows(after filter)={len(df):,} | cols={cols}")

    # One row per primaryid; each column keeps at most 50 distinct values
    agg = agg_table(df, key="primaryid", cols=cols, prefix=prefix, max_items=50)

    # LEFT MERGE: every DEMO row is kept; reports absent from this table get NaN
    merged = merged.merge(agg, on="primaryid", how="left")

print("\nFINAL merged shape:", merged.shape)

# ====== SAVE ======
out_csv = OUT_DIR / f"faers_{Q}_merged.csv"
merged.to_csv(out_csv, index=False, encoding="utf-8-sig")
print("Saved CSV:", out_csv)


Processing DRUG: file=DRUG25Q4.txt | rows(after filter)=1,815,349 | cols=['drugname', 'prod_ai', 'role_cod', 'route', 'dose_amt', 'dose_unit', 'dose_freq']

Processing INDI: file=INDI25Q4.txt | rows(after filter)=1,168,789 | cols=['indi_pt']

Processing OUTC: file=OUTC25Q4.txt | rows(after filter)=289,721 | cols=['outc_cod']

Processing REAC: file=REAC25Q4.txt | rows(after filter)=1,349,105 | cols=['pt']

Processing RPSR: file=RPSR25Q4.txt | rows(after filter)=10,694 | cols=['rpsr_cod']

Processing THER: file=THER25Q4.txt | rows(after filter)=454,746 | cols=['start_dt', 'end_dt', 'dur', 'dur_cod', 'dsg_drug_seq', 'drug_seq']

FINAL merged shape: (385288, 47)
Saved CSV: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Processed\faers_25Q4_merged.csv


In [4]:
import pandas as pd
# Reload the merged CSV. low_memory=False makes pandas infer each column's
# dtype from the whole file rather than chunk by chunk (which can yield
# mixed-type columns and a DtypeWarning); this matches the later reload
# of the same file.
df = pd.read_csv(r"C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Processed\faers_25Q4_merged.csv", low_memory=False)

  df=pd.read_csv(r"C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Processed\faers_25Q4_merged.csv")


In [5]:
# Print the dtype and non-null count of every column in the reloaded CSV
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 385288 entries, 0 to 385287
Data columns (total 47 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   primaryid          385288 non-null  int64  
 1   caseid             385288 non-null  int64  
 2   caseversion        385288 non-null  int64  
 3   i_f_code           385288 non-null  str    
 4   event_dt           172235 non-null  float64
 5   mfr_dt             385288 non-null  int64  
 6   init_fda_dt        385288 non-null  int64  
 7   fda_dt             385288 non-null  int64  
 8   rept_cod           385288 non-null  str    
 9   auth_num           36225 non-null   str    
 10  mfr_num            372910 non-null  str    
 11  mfr_sndr           385288 non-null  str    
 12  lit_ref            32915 non-null   str    
 13  age                237492 non-null  float64
 14  age_cod            237504 non-null  str    
 15  age_grp            140486 non-null  str    
 16  sex          

In [6]:
# Show the column names of the reloaded CSV
df.columns

Index(['primaryid', 'caseid', 'caseversion', 'i_f_code', 'event_dt', 'mfr_dt',
       'init_fda_dt', 'fda_dt', 'rept_cod', 'auth_num', 'mfr_num', 'mfr_sndr',
       'lit_ref', 'age', 'age_cod', 'age_grp', 'sex', 'e_sub', 'wt', 'wt_cod',
       'rept_dt', 'to_mfr', 'occp_cod', 'reporter_country', 'occr_country',
       'DRUG_n_rows', 'DRUG_drugname', 'DRUG_prod_ai', 'DRUG_role_cod',
       'DRUG_route', 'DRUG_dose_amt', 'DRUG_dose_unit', 'DRUG_dose_freq',
       'INDI_n_rows', 'INDI_indi_pt', 'OUTC_n_rows', 'OUTC_outc_cod',
       'REAC_n_rows', 'REAC_pt', 'RPSR_n_rows', 'RPSR_rpsr_cod', 'THER_n_rows',
       'THER_start_dt', 'THER_end_dt', 'THER_dur', 'THER_dur_cod',
       'THER_dsg_drug_seq'],
      dtype='str')

In [7]:
# 1) Is primaryid unique, i.e. did the merges keep one row per report?
print("primaryid unique?", merged["primaryid"].is_unique)

# 2) Coverage of each table: the share of reports with a non-null *_n_rows value
for c in ["DRUG_n_rows","REAC_n_rows","INDI_n_rows","OUTC_n_rows","THER_n_rows","RPSR_n_rows"]:
    print(c, "non-null:", merged[c].notna().mean().round(4))

# 3) Diagnostics: the 10 reports with the most DRUG rows
print(merged[["primaryid","DRUG_n_rows","REAC_n_rows","INDI_n_rows"]].sort_values("DRUG_n_rows", ascending=False).head(10))

primaryid unique? True
DRUG_n_rows non-null: 1.0
REAC_n_rows non-null: 1.0
INDI_n_rows non-null: 0.9411
OUTC_n_rows non-null: 0.5667
THER_n_rows non-null: 0.4724
RPSR_n_rows non-null: 0.0274
          primaryid  DRUG_n_rows  REAC_n_rows  INDI_n_rows
14897    2371497717         1860           19         82.0
10766    2281381215         1644          172        255.0
8523     2189747722         1525           12       1502.0
30445     250873148         1487          210       1463.0
11557     230460179         1405           39        464.0
271080    260903201         1282           23       1282.0
52575     256343916         1219           58        303.0
8065    21684576150         1159           54        252.0
666      1463805734         1158          199        404.0
8705     2197481911         1097           22        388.0


In [9]:
# Drill into the report with the most DRUG rows to see which drug names repeat
pid_int = 2371497717

drug_path = TXT_DIR / f"DRUG{Q}.txt"
drug = read_faers(drug_path)

# primaryid is text in the raw table, so compare against the string form
x = drug.loc[drug["primaryid"].astype(str).str.strip().eq(str(pid_int))]
print(f"rows: {len(x)}")
print("unique drugname:", x["drugname"].nunique())
print(x["drugname"].value_counts().iloc[:10])

rows: 1860
unique drugname: 10
drugname
RIFAXIMIN              454
VEDOLIZUMAB            273
REMICADE               258
ENTYVIO                234
METHOTREXATE SODIUM    166
ADALIMUMAB             146
INFLIXIMAB             138
METHOTREXATE           130
HYRIMOZ                 45
USTEKINUMAB             16
Name: count, dtype: int64


# Dataga multiclass yasash

In [2]:
import pandas as pd
from pathlib import Path

# If `merged` is not in memory (e.g. a fresh kernel), reload it from the saved CSV:

merged_path = Path(r"C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Processed\faers_25Q4_merged.csv")
merged = pd.read_csv(merged_path, low_memory=False)

In [3]:
# REAC_pt ni PT-level datasetga aylantirish (explode)

# merged["REAC_pt"] sizda "; " bilan birlashtirilgan string. Uni ajratamiz.

import pandas as pd

# Guard: the REAC_pt column must be present
if "REAC_pt" not in merged.columns:
    raise ValueError("merged ichida 'REAC_pt' topilmadi.")

# PT level: one row per (report, preferred term).
# Missing REAC_pt becomes "", the string is split on ";", exploded, and
# every resulting term is stripped.
pt_df = (
    merged[["primaryid", "REAC_pt"]]
    .assign(REAC_pt=lambda d: d["REAC_pt"].fillna("").astype(str))
    .assign(pt=lambda d: d["REAC_pt"].str.split(";"))
    .explode("pt", ignore_index=True)
    .assign(pt=lambda d: d["pt"].astype(str).str.strip())
)

# Drop the empty terms
pt_df = pt_df.loc[pt_df["pt"] != ""].copy()

print("PT-level rows:", len(pt_df))
print("Unique PT:", pt_df["pt"].nunique())
pt_df.head()

PT-level rows: 1306516
Unique PT: 12696


Unnamed: 0,primaryid,REAC_pt,pt
0,100324053,Drug ineffective; Drug resistance; Meningitis ...,Drug ineffective
1,100324053,Drug ineffective; Drug resistance; Meningitis ...,Drug resistance
2,100324053,Drug ineffective; Drug resistance; Meningitis ...,Meningitis pneumococcal
3,1012809821,Injection site reaction; General physical heal...,Injection site reaction
4,1012809821,Injection site reaction; General physical heal...,General physical health deterioration


In [4]:
# 2 xil target: (A) 9 organ class, (B) hybrid (other’ni bo‘lib tashlaydi)

# Eslatma: bu heuristic (keyword/regex) mapping. Maqsad — balansni ko‘rish. Keyin top PT’lar bo‘yicha mappingni kuchaytirasiz.

import re

# -------------------------------
# A) 9-class (organ + other)
# -------------------------------
# Keyword regexes for eight organ classes; a term matching none of them
# becomes "other" (the ninth class) in pt_to_organ9.
# Patterns are case-insensitive, and dict order is the matching priority:
# pt_to_organ9 returns the first class whose pattern matches.
ORG_PATTERNS = {
  "gastrointestinal": re.compile(r"\b(diarrh\w*|nause\w*|vomit\w*|abdom\w*|constipat\w*|colitis|gastr\w*|pancreat\w*|ileus|dyspeps\w*|gastro\w*|stomatit\w*|oesophag\w*|esophag\w*)\b", re.I),
  "cardiovascular":   re.compile(r"\b(cardiac\w*|heart\w*|myocard\w*|infarct\w*|arrhyth\w*|tachycard\w*|bradycard\w*|hypertension|hypotension|atrial\w*|ventric\w*|thromboembol\w*|embol\w*|stroke)\b", re.I),
  "neurological":     re.compile(r"\b(headache\w*|dizz\w*|seiz\w*|convuls\w*|neurop\w*|syncope\w*|migraine\w*|tremor\w*|ataxia\w*|vertigo\w*|paresthes\w*)\b", re.I),
  "dermatologic":     re.compile(r"\b(rash\w*|prurit\w*|urticar\w*|dermatit\w*|eczema\w*|psoriasis\w*|alopec\w*|erythema\w*|skin)\b", re.I),
  "hematologic":      re.compile(r"\b(anaemi\w*|thrombocyt\w*|neutrop\w*|leukopen\w*|lymphopen\w*|pancytopen\w*|coagul\w*|haemorrhag\w*|hemorrhag\w*)\b", re.I),
  "hepatic":          re.compile(r"\b(hepatic\w*|liver\w*|hepatit\w*|bilirubin\w*|jaundice\w*|cholestat\w*|transamin\w*|alt\b|ast\b)\b", re.I),
  "renal":            re.compile(r"\b(renal\w*|kidney\w*|nephr\w*|creatinin\w*|acute kidney\w*|glomerul\w*)\b", re.I),
  "respiratory":      re.compile(r"\b(dyspn\w*|cough\w*|pneumon\w*|asthma\w*|bronch\w*|wheez\w*|pulmon\w*|rhinitis\w*|nasopharyng\w*|respiratory)\b", re.I),
}

def pt_to_organ9(pt: str) -> str:
    """
    Map a preferred term to an organ class.

    Returns the first key of ORG_PATTERNS (in dict order) whose pattern is
    found in the stripped text, or "other" when none matches.
    """
    text = str(pt).strip()
    matching = (label for label, rx in ORG_PATTERNS.items() if rx.search(text))
    return next(matching, "other")


# -------------------------------
# B) HYBRID (splits up the "other" class)
# -------------------------------
# "product/medication error" and "lack of effect/off label" terms are very
# common, so they get their own labels.
HYB_PATTERNS = dict(ORG_PATTERNS)  # copy of the organ patterns

# Dict order is the matching priority used by pt_to_hybrid (first match wins).
# "musculoskeletal_pain" must come before "general_systemic": the latter
# contains the bare word "pain", which previously captured "back pain",
# "joint pain" and "bone pain" before the musculoskeletal pattern was tried.
# NOTE(review): "pain" in general_systemic still takes organ-specific terms
# such as "abdominal pain", because the extra labels are tested before the
# organ classes. Confirm that this is intended.
HYB_EXTRA = {
  "product_use_or_medication_error": re.compile(
      r"\b(product\w*|dose omission|wrong dose|incorrect dose|medication error|device\w*|malfunction|accidental exposure|overdose|underdose|administration|dispensing)\b",
      re.I
  ),
  "lack_of_effect_or_off_label": re.compile(
      r"\b(drug ineffective|lack of efficacy|off label|therapeutic response decreased|treatment failure)\b",
      re.I
  ),
  "musculoskeletal_pain": re.compile(
      r"\b(arthralgia|myalgia|back pain|musculoskeletal|joint pain|bone pain)\b",
      re.I
  ),
  "general_systemic": re.compile(
      r"\b(fatigue|malaise|asthenia|pyrexia|fever|chills|pain|condition aggravated|discomfort)\b",
      re.I
  ),
  "infections": re.compile(
      r"\b(infection|sepsis|pneumonia|covid|influenza|cellulitis)\b",
      re.I
  ),
}

# Priority: the extra labels first, then the organ classes
HYB_ORDER = list(HYB_EXTRA.keys()) + list(ORG_PATTERNS.keys())

def pt_to_hybrid(pt: str) -> str:
    """
    Map a preferred term to a hybrid label.

    The HYB_EXTRA patterns are tried first, then the ORG_PATTERNS ones, each
    in dict order; the first match wins. Returns "other_rare" when nothing
    matches.
    """
    text = str(pt).strip()
    # Extra labels take priority over the organ classes.
    for table in (HYB_EXTRA, ORG_PATTERNS):
        for label, rx in table.items():
            if rx.search(text):
                return label
    return "other_rare"

In [6]:
# Bu xato shuni bildiradi: pt_df ichida cat_organ9 ustuni hali yaratilmagan (yoki siz uni yaratadigan cell’ni run qilmagansiz / pt_df qayta overwrite bo‘lgan).

# ✅ Tez yechim: avval cat_organ9 va cat_hybrid ustunlarini yana bir marta aniq yaratib oling, keyin report_presenceni chaqiring.

# Quyidagi bitta to‘liq cellni ishlating (shu cell hammasini o‘zi qiladi):

# =========================
# FIX: create cat_organ9/cat_hybrid then report_presence
# =========================
import pandas as pd

# 0) Check that pt_df exists; fail with a clearer message if it does not
try:
    pt_df
except NameError:
    raise NameError("pt_df topilmadi. Avval REAC_pt explode qiladigan cell'ni run qiling.")

# 1) Create (or overwrite) the category columns, one label per term
pt_df["cat_organ9"] = pt_df["pt"].apply(pt_to_organ9)
pt_df["cat_hybrid"] = pt_df["pt"].apply(pt_to_hybrid)

print("pt_df columns now contains:", [c for c in ["cat_organ9","cat_hybrid"] if c in pt_df.columns])

# 2) Report-level presence function
def report_presence(df: pd.DataFrame, cat_col: str, title: str):
    """
    Print, for every label in ``cat_col``, how many reports carry it.

    Rows are grouped by "primaryid"; a report counts for a label when at
    least one of its rows has that label, so one report can count for
    several labels. The printed table lists the count and the percentage
    of all reports, sorted by count in descending order. Returns nothing.

    Raises:
        KeyError: if ``cat_col`` is not a column of ``df``.
    """
    if cat_col not in df.columns:
        raise KeyError(f"'{cat_col}' ustuni topilmadi. Hozirgi ustunlar: {list(df.columns)[:20]} ...")

    # One row per report, holding the set of labels seen in that report.
    per_report = (
        df.groupby("primaryid", sort=False)[cat_col]
          .apply(lambda labels: set(labels.dropna().astype(str)))
          .reset_index(name="cats")
    )
    n_reports = per_report.shape[0]

    every_label = sorted(set().union(*per_report["cats"]))
    counts = {
        label: per_report["cats"].apply(lambda cats: label in cats).sum()
        for label in every_label
    }
    presence = pd.Series(counts).sort_values(ascending=False)

    pct = (presence / n_reports * 100).round(2)
    out = pd.DataFrame({"reports_with_label": presence, "percent_of_reports": pct})

    rule = "=" * 70
    print("\n" + rule)
    print(title)
    print(rule)
    print("Total reports:", n_reports)
    print(out)

# 3) Run it for both labelings
for _cat_col, _title in (
    ("cat_organ9", "Report-level (multi-label presence) — ORGAN 9-class"),
    ("cat_hybrid", "Report-level (multi-label presence) — HYBRID (other split)"),
):
    report_presence(pt_df, _cat_col, _title)

# Nega shunday bo‘ldi?

# Siz cat_organ9ni yaratadigan cell’ni run qilmasdan turib report_presenceni chaqirgansiz yoki

# pt_dfni qaytadan yaratib yuborgansiz (unda cat_organ9 hali yo‘q).

pt_df columns now contains: ['cat_organ9', 'cat_hybrid']

Report-level (multi-label presence) — ORGAN 9-class
Total reports: 385288
                  reports_with_label  percent_of_reports
other                         340717               88.43
gastrointestinal               53066               13.77
dermatologic                   52343               13.59
respiratory                    38964               10.11
neurological                   29732                7.72
cardiovascular                 25230                6.55
hematologic                    21516                5.58
renal                          13101                3.40
hepatic                        11933                3.10

Report-level (multi-label presence) — HYBRID (other split)
Total reports: 385288
                                 reports_with_label  percent_of_reports
other_rare                                   261730               67.93
product_use_or_medication_error               93503               24.27


In [7]:
# (ixtiyoriy, juda foydali) “Other” ichidagi top PT’lar

# Bu sizga “other”ni nima kattalashtiryapti?” degan savolga aniq javob beradi.

# The 30 most frequent PTs that ended up in "other" under the ORGAN9 labeling
other_top = pt_df.loc[pt_df["cat_organ9"] == "other", "pt"].value_counts().head(30)
print(other_top)

pt
Off label use                                       26700
Drug ineffective                                    22134
Product dose omission issue                         17322
Fatigue                                             15519
Death                                               13234
Pain                                                10199
Arthralgia                                           9835
Condition aggravated                                 9826
Inappropriate schedule of product administration     9014
Injection site pain                                  8492
Product use in unapproved indication                 7720
Malaise                                              7591
Incorrect dose administered                          7538
Pyrexia                                              6867
Asthenia                                             6827
Fall                                                 6455
Weight decreased                                     5693
Illness    

In [9]:
# Symptom-only filter + mapping + distribution

import re
import pandas as pd

# --------------------------
# 1) Filter out PTs that are not symptoms (drop list)
# --------------------------
# Case-insensitive, whole-word match on product / medication-error /
# off-label / lack-of-effect wording.
# The alternation uses a NON-capturing group on purpose: this pattern is fed
# to `Series.str.contains`, which emits a UserWarning ("has match groups")
# for any pattern containing a capturing group.  What matches is unchanged.
NON_SYMPTOM_PAT = re.compile(
    r"\b(?:"
    r"off label|drug ineffective|lack of efficacy|treatment failure|therapeutic response decreased|"
    r"product|device|malfunction|dose omission|wrong dose|incorrect dose|medication error|"
    r"inappropriate schedule|administration|dispensing|wrong technique|accidental exposure|"
    r"product use issue|product use in unapproved indication|product dose omission issue"
    r")\b",
    re.I
)

# Make sure pt_df already carries the exploded PT column
if "pt" not in pt_df.columns:
    raise ValueError("pt_df ichida 'pt' ustuni yo'q. Avval explode qilgan bo'ling.")

# Work on a copy so pt_df itself stays untouched
pt_df2 = pt_df.copy()
pt_df2["pt_clean"] = pt_df2["pt"].astype(str).str.strip()

# Filter: rows whose PT matches NON_SYMPTOM_PAT are removed
mask_non_symptom = pt_df2["pt_clean"].str.contains(NON_SYMPTOM_PAT, na=False)
pt_sym = pt_df2.loc[~mask_non_symptom].copy()

print(f"Original PT rows: {len(pt_df2)}")
print(f"After symptom-only filter PT rows: {len(pt_sym)}")
print(f"Dropped as non-symptom: {mask_non_symptom.sum()}")

# --------------------------
# 2) Symptom-only label set (for the multi-label target)
# --------------------------
# Maps each label name to a compiled, case-insensitive regex.  Every pattern
# is a whole-word alternation; a `\w*` suffix lets a stem match its
# inflections (e.g. `diarrh\w*` matches "diarrhoea" and "diarrhea").
# NOTE: the patterns overlap (e.g. `pneumon\w*` under "respiratory" and
# "pneumonia" under "infections"), so when a single label per PT is needed
# the result depends on the order in which the patterns are tried.
SYM_PATTERNS = {
  # organ/system
  "gastrointestinal": re.compile(r"\b(diarrh\w*|nause\w*|vomit\w*|abdom\w*|constipat\w*|colitis|gastr\w*|pancreat\w*|ileus|dyspeps\w*|stomatit\w*|oesophag\w*|esophag\w*)\b", re.I),
  "cardiovascular":   re.compile(r"\b(cardiac\w*|heart\w*|myocard\w*|infarct\w*|arrhyth\w*|tachycard\w*|bradycard\w*|hypertension|hypotension|atrial\w*|ventric\w*|thromboembol\w*|embol\w*|stroke)\b", re.I),
  "neurological":     re.compile(r"\b(headache\w*|dizz\w*|seiz\w*|convuls\w*|neurop\w*|syncope\w*|migraine\w*|tremor\w*|ataxia\w*|vertigo\w*|paresthes\w*|somnolence)\b", re.I),
  "dermatologic":     re.compile(r"\b(rash\w*|prurit\w*|urticar\w*|dermatit\w*|eczema\w*|psoriasis\w*|alopec\w*|erythema\w*|skin)\b", re.I),
  "hematologic":      re.compile(r"\b(anaemi\w*|thrombocyt\w*|neutrop\w*|leukopen\w*|lymphopen\w*|pancytopen\w*|coagul\w*|haemorrhag\w*|hemorrhag\w*)\b", re.I),
  # The explicit `\b` after `alt` / `ast` is redundant with the closing `\b`
  # of the group; both abbreviations only match as standalone words.
  "hepatic":          re.compile(r"\b(hepatic\w*|liver\w*|hepatit\w*|bilirubin\w*|jaundice\w*|cholestat\w*|transamin\w*|alt\b|ast\b)\b", re.I),
  "renal":            re.compile(r"\b(renal\w*|kidney\w*|nephr\w*|creatinin\w*|acute kidney\w*|glomerul\w*)\b", re.I),
  "respiratory":      re.compile(r"\b(dyspn\w*|cough\w*|pneumon\w*|asthma\w*|bronch\w*|wheez\w*|pulmon\w*|rhinitis\w*|nasopharyng\w*|respiratory)\b", re.I),

  # extra symptom classes (data-driven)
  "general_systemic": re.compile(r"\b(fatigue|malaise|asthenia|pyrexia|fever|chills|illness)\b", re.I),
  "musculoskeletal_pain": re.compile(r"\b(arthralgia|myalgia|back pain|pain in extremity|musculoskeletal|joint pain|bone pain)\b", re.I),
  "infections": re.compile(r"\b(infection|sepsis|pneumonia|influenza|covid|cellulitis)\b", re.I),
  "psychiatric": re.compile(r"\b(anxiety|depression|insomnia|agitation|panic)\b", re.I),
  "metabolic_weight": re.compile(r"\b(weight increased|weight decreased|decreased appetite|increased appetite)\b", re.I),
  "injury_fall": re.compile(r"\b(fall|injury|fracture|contusion)\b", re.I),
  "injection_site": re.compile(r"\b(injection site|injection-site)\b", re.I),
}

# First-match priority used to give each PT exactly ONE label (this step is
# single-label, not multi-label): the organ/system classes are tried first,
# the extra symptom classes afterwards.  If a PT matches several patterns,
# the first class in this list wins.
SYM_ORDER = [
    # organ / system classes
    "gastrointestinal",
    "cardiovascular",
    "neurological",
    "dermatologic",
    "hematologic",
    "hepatic",
    "renal",
    "respiratory",
] + [
    # extra symptom classes
    "general_systemic",
    "musculoskeletal_pain",
    "infections",
    "psychiatric",
    "metabolic_weight",
    "injury_fall",
    "injection_site",
]

def pt_to_symptom_label(pt: str) -> str:
    """Return the single symptom label for one preferred term.

    The term is stringified and stripped, then the classes in ``SYM_ORDER``
    are tried in order; the first class whose ``SYM_PATTERNS`` regex matches
    is returned.  Terms matching no class get ``"other_symptom"``.
    """
    text = str(pt).strip()
    return next(
        (cat for cat in SYM_ORDER if SYM_PATTERNS[cat].search(text)),
        "other_symptom",
    )

# Label every PT row that survived the symptom-only filter.
pt_sym["symptom_label"] = pt_sym["pt_clean"].apply(pt_to_symptom_label)

# --------------------------
# 3) Distribution funksiyalari
# --------------------------
def show_dist(series: pd.Series, title: str, topn: int = 50):
    """Print the value distribution of *series* under a banner.

    Shows the ``topn`` most frequent values with their count and their share
    of all counted values in percent (rounded to 2 decimals).
    """
    counts = series.value_counts()
    share = (counts / counts.sum() * 100).round(2)
    table = pd.DataFrame({"count": counts, "percent": share})

    rule = "=" * 70
    print(f"\n{rule}")
    print(title)
    print(rule)
    print(table.head(topn))

def report_presence(df: pd.DataFrame, label_col: str, title: str):
    """Print, per label, how many reports contain it at least once.

    Rows are grouped by ``primaryid``; each report contributes the set of
    distinct non-null values of ``label_col`` (as strings).  The printed
    table lists, for every label, the number of reports whose set contains
    it and that number as a percentage of all reports (rounded to 2
    decimals), sorted by count in descending order.

    Raises:
        KeyError: if ``label_col`` is not a column of ``df``.  The check is
            explicit so the error names the missing column and shows the
            available ones.
    """
    if label_col not in df.columns:
        raise KeyError(f"'{label_col}' ustuni topilmadi. Hozirgi ustunlar: {list(df.columns)[:20]} ...")

    rep = (
        df.groupby("primaryid", sort=False)[label_col]
          .apply(lambda s: set(s.dropna().astype(str)))
          .reset_index(name="labels")
    )

    # Count reports per label in one pass over the per-report sets instead of
    # re-scanning every report once per label.
    counts = {}
    for label_set in rep["labels"]:
        for lab in label_set:
            counts[lab] = counts.get(lab, 0) + 1

    # Keys are inserted in sorted order before sorting by count.
    presence = pd.Series({lab: counts[lab] for lab in sorted(counts)}).sort_values(ascending=False)
    pct = (presence / rep.shape[0] * 100).round(2)
    out = pd.DataFrame({"reports_with_label": presence, "percent_of_reports": pct})

    print("\n" + "="*70)
    print(title)
    print("="*70)
    print("Total reports:", rep.shape[0])
    print(out)

# --------------------------
# 4) PT-level + report-level results
# --------------------------
show_dist(pt_sym["symptom_label"], "PT-level distribution — SYMPTOM-ONLY labels")
report_presence(pt_sym, "symptom_label", "Report-level (multi-label presence) — SYMPTOM-ONLY labels")

# --------------------------
# 5) Most frequent PTs inside other_symptom (input for extending the mapping)
# --------------------------
other_top = (
    pt_sym.loc[pt_sym["symptom_label"] == "other_symptom", "pt_clean"]
    .value_counts()
    .head(40)
)
print("\nTop PT inside other_symptom (for mapping expansion):")
print(other_top)

# Bu cell sizga nima beradi?

# symptom-only filtrdan keyin qancha PT qoldi (qanchasi non-symptom edi)

# yangi label set bo‘yicha balans:

# PT-level (har bir PT label)

# report-level presence (har bir reportda qaysi label bor)

# other_symptom ichidagi top PT’lar — mappingni kuchaytirish uchun eng kerakli ro‘yxat

  mask_non_symptom = pt_df2["pt_clean"].str.contains(NON_SYMPTOM_PAT, na=False)


Original PT rows: 1306516
After symptom-only filter PT rows: 1123683
Dropped as non-symptom: 182833

PT-level distribution — SYMPTOM-ONLY labels
                       count  percent
symptom_label                        
other_symptom         594540    52.91
gastrointestinal       86573     7.70
dermatologic           82123     7.31
respiratory            58862     5.24
general_systemic       45975     4.09
neurological           39186     3.49
infections             32901     2.93
cardiovascular         31467     2.80
hematologic            26472     2.36
musculoskeletal_pain   25875     2.30
injury_fall            21221     1.89
injection_site         17921     1.59
metabolic_weight       15927     1.42
renal                  15599     1.39
psychiatric            14684     1.31
hepatic                14357     1.28

Report-level (multi-label presence) — SYMPTOM-ONLY labels
Total reports: 346447
                      reports_with_label  percent_of_reports
other_symptom                

In [10]:
# Full top-200 PT list inside other_symptom, as a table so nothing is truncated
other_top200 = (
    pt_sym.loc[pt_sym["symptom_label"] == "other_symptom", "pt_clean"]
    .value_counts()
    .head(200)
    .rename_axis("pt")
    .reset_index(name="count")
)
other_top200

Unnamed: 0,pt,count
0,Death,13234
1,Pain,10199
2,Condition aggravated,9826
3,Peripheral swelling,3617
4,Hospitalisation,3617
...,...,...
195,White blood cell count increased,617
196,Dysuria,617
197,Infusion site pain,615
198,Nervousness,609


In [11]:
# Recompute the report-level presence table and store it as CSV
# (the printed output can get long).
rep = (
    pt_sym.groupby("primaryid", sort=False)["symptom_label"]
    .apply(lambda grp: set(grp.dropna().astype(str)))
    .reset_index(name="labels")
)

# Every label that occurs in at least one report, in sorted order
all_labels = sorted(set().union(*rep["labels"]))
presence = pd.Series(
    {lab: sum(lab in found for found in rep["labels"]) for lab in all_labels}
).sort_values(ascending=False)
pct = (presence / rep.shape[0] * 100).round(2)

report_dist = pd.DataFrame({"reports_with_label": presence, "percent_of_reports": pct})

# Save (relative path, so the file lands in the current working directory)
out_path = Path("symptom_only_report_presence.csv")
report_dist.to_csv(out_path, index=True, encoding="utf-8-sig")
print("Saved:", out_path.resolve())

Saved: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Notebooks\symptom_only_report_presence.csv


In [12]:
# “DROP + qayta distribution” kodi

# Bu kod pt_sym (symptom-only filtrdan keyingi dataframe) bilan ishlaydi.
# U:

# “nonsymptom” PT’larni drop qiladi,

# keyin qayta PT-level va report-level presence distribution chiqaradi,

# drop qilingan top PT’larni ham ko‘rsatadi.

import re
import pandas as pd

# Make sure pt_sym exists: evaluating the bare name raises NameError when it
# has not been defined yet, which is re-raised here with a hint to run the
# symptom-only filter cell first.
try:
    pt_sym
except NameError:
    raise NameError("pt_sym topilmadi. Avval symptom-only filter cell'dan pt_sym hosil qiling.")

# 1) PT patterns to drop (terms that are noise for a user-facing symptom model).
#    Case-insensitive, whole-word match; the non-capturing group keeps
#    `Series.str.contains` from warning about match groups.
DROP_PAT = re.compile(
    r"\b(?:"
    # product / medication error / use issues
    r"product|device|malfunction|dose omission|wrong dose|incorrect dose|medication error|"
    r"inappropriate schedule|administration|dispensing|wrong technique|accidental exposure|"
    r"product use issue|product use in unapproved indication|product dose omission issue|"
    # lack of effect / off-label
    r"off label|drug ineffective|lack of efficacy|treatment failure|therapeutic response decreased|"
    # outcomes / severity (not symptoms); `[sz]` covers both spellings in one
    # branch instead of a duplicated "ation|ation" alternative plus a
    # separate "hospitalization" branch
    r"death|hospitali[sz]ation|life[- ]?threatening|disability|"
    # abnormal lab values (not symptoms)
    r"white blood cell count (?:increased|decreased)|wbc (?:increased|decreased)|"
    r"platelet count (?:increased|decreased)|neutrophil count (?:increased|decreased)|"
    r"alanine aminotransferase (?:increased|decreased)|aspartate aminotransferase (?:increased|decreased)|"
    r"creatinine (?:increased|decreased)|test abnormal|laboratory test abnormal|"
    # very generic status
    r"condition aggravated"
    r")\b",
    re.I
)

# Work on a copy of the symptom-only rows with a normalised PT column
df = pt_sym.copy()
df["pt_clean"] = df["pt_clean"].astype(str).str.strip()

mask_drop = df["pt_clean"].str.contains(DROP_PAT, na=False)
df_keep = df.loc[~mask_drop].copy()

print(f"Before drop PT rows: {len(df)}")
print(f"Dropped rows: {int(mask_drop.sum())}")
print(f"After drop PT rows: {len(df_keep)}")

# 2) Which label column is in use?  Prefer symptom_label_v2 when present,
#    otherwise fall back to symptom_label.
if "symptom_label_v2" in df_keep.columns:
    label_col = "symptom_label_v2"
else:
    label_col = "symptom_label"
if label_col not in df_keep.columns:
    raise ValueError("Label ustuni topilmadi. symptom_label yoki symptom_label_v2 mavjud bo‘lishi kerak.")

# 3) PT-level distribution
pt_dist = df_keep[label_col].value_counts()
pt_pct = (pt_dist / pt_dist.sum() * 100).round(2)
print("\nPT-level distribution (after drop):")
print(pd.DataFrame({"count": pt_dist, "percent": pt_pct}))

# 4) Report-level presence distribution (multi-label): one set of labels per report
rep = (
    df_keep.groupby("primaryid", sort=False)[label_col]
    .apply(lambda grp: set(grp.dropna().astype(str)))
    .reset_index(name="labels")
)
all_labels = sorted(set().union(*rep["labels"]))
presence = pd.Series(
    {lab: sum(lab in found for found in rep["labels"]) for lab in all_labels}
).sort_values(ascending=False)
presence_pct = (presence / rep.shape[0] * 100).round(2)

print("\nReport-level presence (after drop):")
print(pd.DataFrame({"reports_with_label": presence, "percent_of_reports": presence_pct}))

# 5) Which PTs were dropped? (top 30, as a sanity check)
print("\nTop dropped PT (sanity check):")
print(df.loc[mask_drop, "pt_clean"].value_counts().head(30))

Before drop PT rows: 1123683
Dropped rows: 39025
After drop PT rows: 1084658

PT-level distribution (after drop):
                       count  percent
symptom_label                        
other_symptom         558489    51.49
gastrointestinal       86573     7.98
dermatologic           82123     7.57
respiratory            58822     5.42
general_systemic       45975     4.24
neurological           39185     3.61
infections             32901     3.03
cardiovascular         31414     2.90
musculoskeletal_pain   25875     2.39
hematologic            25168     2.32
injury_fall            21221     1.96
injection_site         17921     1.65
metabolic_weight       15927     1.47
psychiatric            14684     1.35
renal                  14278     1.32
hepatic                14102     1.30

Report-level presence (after drop):
                      reports_with_label  percent_of_reports
other_symptom                     238960               72.04
gastrointestinal                   53066   

In [13]:
# CELL 1 — Merged CSV’ni o‘qish
from pathlib import Path
import pandas as pd

# Location of the merged FAERS 25Q4 table
CSV_PATH = Path(r"C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Processed\faers_25Q4_merged.csv")

df = pd.read_csv(CSV_PATH, low_memory=False)
print(f"Loaded: {df.shape}")
print(f"Columns contain REAC_pt? {'REAC_pt' in df.columns}")
df.loc[:, ["primaryid", "REAC_pt"]].head()

Loaded: (385288, 47)
Columns contain REAC_pt? True


Unnamed: 0,primaryid,REAC_pt
0,100324053,Drug ineffective; Drug resistance; Meningitis ...
1,1012809821,Injection site reaction; General physical heal...
2,101406268,Ectopic pregnancy with contraceptive device; D...
3,101515934,Drug resistance; Cytomegalovirus infection
4,1016133068,Asthenia; Infusion related reaction; Blood pre...


In [14]:
# CELL 2 — REAC_pt ichidagi “other”ni ko‘rish uchun TOP ro‘yxat (review)

# Bu yerda biz REAC_ptni explode qilib, top PT’larni chiqaramiz. (Hozircha mapping shart emas — avval “nima ko‘p?”ni ko‘ramiz.)

# Explode REAC_pt into a PT-level table: one row per (report, preferred term).
# Missing REAC_pt cells become empty strings, the ";"-separated terms are
# split and stripped, and empty terms are discarded.
pt_df = (
    df[["primaryid", "REAC_pt"]]
    .assign(REAC_pt=lambda d: d["REAC_pt"].fillna("").astype(str))
    .assign(pt=lambda d: d["REAC_pt"].str.split(";"))
    .explode("pt", ignore_index=True)
    .assign(pt=lambda d: d["pt"].astype(str).str.strip())
    .loc[lambda d: d["pt"].ne("")]
    .copy()
)

print(f"PT-level rows: {len(pt_df)}")
print(f"Unique PT: {pt_df['pt'].nunique()}")

# TOP 300 PTs overall, with their share of all PT rows
top300 = (
    pt_df["pt"]
    .value_counts()
    .head(300)
    .rename_axis("pt")
    .reset_index(name="count")
)
top300["percent_of_all_pt"] = (top300["count"] / len(pt_df) * 100).round(2)

# Also save the table so it can be inspected in Excel
out_dir = Path.cwd() / "analysis_outputs"
out_dir.mkdir(parents=True, exist_ok=True)
top_path = out_dir / "REAC_PT_top300.csv"
top300.to_csv(top_path, index=False, encoding="utf-8-sig")
print("Saved:", top_path.resolve())

PT-level rows: 1306516
Unique PT: 12696
Saved: C:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Notebooks\analysis_outputs\REAC_PT_top300.csv


# Multi-Label target yasash

In [1]:
# faers_25Q4_merged.csv ni o‘qiydi

# REAC_pt ichidan symptom emas (product/error/off-label/ineffective…) PT’larni drop qiladi

# Qolganidan multi-label target (y_*) yasaydi (organ + qolgan symptom-classlar)

# classlar nechta ekanini va prevalenceni chiqaradi

# Natijani Data/Raw_data/ ichiga CSV qilib saqlaydi


from pathlib import Path
import re
import pandas as pd

# Find the project root relative to where the notebook runs: the first of
# (cwd, parent of cwd) that contains a "Data" directory, else cwd itself.
CWD = Path.cwd()
PROJECT_ROOT = next(
    (base for base in (CWD, CWD.parent) if (base / "Data").exists()),
    CWD,  # fallback
)

_data_dir = PROJECT_ROOT / "Data"
IN_CSV = _data_dir / "Processed" / "faers_25Q4_merged.csv"
OUT_CSV = _data_dir / "Raw_data" / "faers_25Q4_targets_multilabel.csv"
# Create the output folder up front so the later save cannot fail on it
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("IN_CSV exists:", IN_CSV.exists(), IN_CSV)
print("OUT_CSV:", OUT_CSV)

PROJECT_ROOT: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract
IN_CSV exists: True c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Processed\faers_25Q4_merged.csv
OUT_CSV: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\faers_25Q4_targets_multilabel.csv


In [2]:
# DROP qoidalari + helper funksiyalar
# (B) Product/medication error + (C) Off-label/ineffective + non-event noise -> DROP

# PT names that are dropped on an exact (case-sensitive) match.
DROP_EXACT = {
    "Off label use", "Drug ineffective",
    "No adverse event", "Adverse event", "Adverse drug reaction",
}

# Words / phrases that mark a PT as a non-symptom.  They are joined into one
# case-insensitive, whole-word alternation below; order is preserved.
_DROP_TERMS = (
    # product / device problems
    "product", "device", "malfunction", "breakage", "leakage", "defective",
    "delivery system", "needle issue",
    # dosing and medication errors
    "dose omission", "incorrect dose", "wrong dose", "extra dose", "underdose",
    "overdose", "medication error",
    # administration / handling
    "inappropriate schedule", "administration", "dispensing", "wrong technique",
    "misuse",
    # product use / quality issues
    "product use issue", "product use in unapproved indication",
    "product storage error",
    "product quality issue", "product availability issue", "product complaint",
    # off-label use / lack of effect
    "off label", "drug ineffective", "lack of efficacy", "treatment failure",
    "therapeutic response",
    "therapy non-responder", "partial responder", "effect less than expected",
    # compliance / therapy changes
    "treatment noncompliance", "therapy interrupted", "therapy cessation",
)

DROP_REGEX = re.compile(r"\b(?:" + "|".join(_DROP_TERMS) + r")\b", re.I)

def split_pts(s: str) -> list[str]:
    """Split a ";"-separated PT string into unique, stripped terms.

    Empty pieces are discarded and duplicates are removed while keeping the
    order of first occurrence.

    Args:
        s: The raw PT cell.  Non-string values are stringified first.
            ``None`` and float NaN are treated as "no terms".

    Returns:
        The list of distinct terms, possibly empty.
    """
    # A missing cell carries no terms; without this guard str() would turn it
    # into the bogus term "None" / "nan".
    if s is None or (isinstance(s, float) and s != s):
        return []
    stripped = (piece.strip() for piece in str(s).split(";"))
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(term for term in stripped if term))

def drop_non_symptom(pts: list[str]) -> tuple[list[str], list[str]]:
    """Partition PT names into (kept, dropped), preserving input order.

    A term is dropped when it is listed in ``DROP_EXACT`` or when
    ``DROP_REGEX`` matches anywhere inside it; every other term is kept.
    """
    kept: list[str] = []
    dropped: list[str] = []
    for term in pts:
        is_noise = term in DROP_EXACT or DROP_REGEX.search(term) is not None
        (dropped if is_noise else kept).append(term)
    return kept, dropped

In [3]:
# Read the CSV and build REAC_pt_symptom (dropped terms go to their own column)
df = pd.read_csv(IN_CSV, low_memory=False)

if "REAC_pt" not in df.columns:
    raise ValueError("REAC_pt ustuni topilmadi. Merge CSV noto‘g‘ri bo‘lishi mumkin.")

# REAC_pt -> list of unique terms per report
pts_all = df["REAC_pt"].fillna("").astype(str).apply(split_pts)

# Split every report's terms into the kept and the dropped part
_partitions = [drop_non_symptom(terms) for terms in pts_all]
kept_list = [kept for kept, _ in _partitions]
dropped_list = [dropped for _, dropped in _partitions]

df["REAC_pt_symptom"] = ["; ".join(terms) for terms in kept_list]
df["REAC_pt_dropped"] = ["; ".join(terms) for terms in dropped_list]
# Number of non-empty terms left in REAC_pt_symptom (0 for an empty string)
df["REAC_n_rows_symptom"] = df["REAC_pt_symptom"].apply(
    lambda text: sum(1 for piece in str(text).split(";") if piece.strip())
)

print("Rows:", df.shape)
print("Avg REAC_n_rows_symptom:", df["REAC_n_rows_symptom"].mean().round(3))
print("Empty after drop:", (df["REAC_n_rows_symptom"] == 0).sum())
df[["primaryid", "REAC_n_rows", "REAC_n_rows_symptom", "REAC_pt_symptom", "REAC_pt_dropped"]].head(3)

Rows: (385288, 50)
Avg REAC_n_rows_symptom: 2.833
Empty after drop: 50191


Unnamed: 0,primaryid,REAC_n_rows,REAC_n_rows_symptom,REAC_pt_symptom,REAC_pt_dropped
0,100324053,3,2,Drug resistance; Meningitis pneumococcal,Drug ineffective
1,1012809821,22,20,Injection site reaction; General physical heal...,Incorrect dose administered; Product dose omis...
2,101406268,10,6,Internal haemorrhage; Injury; Pain; Depression...,Ectopic pregnancy with contraceptive device; D...


In [4]:
# Multi-label classlar (organ + qolgan symptom-classlar) + y_* targetlar

# Bu yerda classlar soni ham shu yerning o‘zida chiqadi.

# Maps each target class to a compiled, case-insensitive, whole-word regex
# (non-capturing groups throughout).  Dict order defines the class order.
# NOTE: classes are not mutually exclusive — e.g. "gastrointestinal
# haemorrhage" is listed under gastrointestinal and also matches
# `haemorrhag\w*` under hematologic; "urinary tract infection" is listed
# under urinary and also matches `infection\b` under infections.
LABEL_PATTERNS = {
    # ---- Organ/system ----
    "gastrointestinal": re.compile(r"\b(?:diarrh\w*|nausea|vomit\w*|abdominal|constipat\w*|dyspeps\w*|reflux|flatulence|colitis|pancreat\w*|haematochezia|gastrointestinal haemorrhage|rectal haemorrhage|stomatitis|dysphagia)\b", re.I),
    "cardiovascular":   re.compile(r"\b(?:hypertension|hypotension|palpitation\w*|tachycard\w*|atrial fibrillation|myocardial infarction|thrombosis|pulmonary embolism|blood pressure increased|blood pressure decreased|heart rate increased|cardiac failure)\b", re.I),
    "neurological":     re.compile(r"\b(?:headache|dizz\w*|seizure|tremor|confusional state|memory impairment|paraesthesia|hypoaesthesia|migraine|syncope|loss of consciousness|vertigo|balance disorder|amnesia|brain fog)\b", re.I),
    "dermatologic":     re.compile(r"\b(?:rash\w*|prurit\w*|erythema|urticaria|eczema|psoriasis|alopecia|blister|acne|dry skin|skin burning sensation|skin exfoliation)\b", re.I),
    "hematologic":      re.compile(r"\b(?:anaemi\w*|neutropen\w*|leukopen\w*|thrombocytopen\w*|pancytopen\w*|myelosuppression|febrile neutropenia|haemorrhag\w*|hemorrhag\w*)\b", re.I),
    "hepatic":          re.compile(r"\b(?:hepatit\w*|jaundice|liver injury|drug-induced liver injury|hepatic function abnormal|liver disorder|hepatic enzyme increased)\b", re.I),
    "renal":            re.compile(r"\b(?:acute kidney injury|renal impairment|renal failure|renal disorder|nephrolithiasis)\b", re.I),
    "respiratory":      re.compile(r"\b(?:dyspnoea|dyspnea|cough\w*|asthma|wheez\w*|bronchit\w*|sinusitis|rhinitis|nasopharyngitis|upper respiratory tract infection|lower respiratory tract infection|respiratory tract infection|interstitial lung disease|pleural effusion|oxygen saturation decreased|epistaxis)\b", re.I),

    # ---- Extra symptom classes (things an end user might type) ----
    "general_systemic": re.compile(r"\b(?:fatigue|malaise|asthenia|pyrexia|fever|chills|feeling abnormal|illness|influenza like illness|feeling hot|hot flush|flushing)\b", re.I),
    # The standalone `\bpain\b` branch already matches any text containing
    # the word "pain", so the longer "... pain" branches are redundant.
    "pain_general":     re.compile(r"\b(?:\bpain\b|chest pain|chest discomfort|back pain|bone pain|pain in extremity|discomfort)\b", re.I),
    "musculoskeletal":  re.compile(r"\b(?:arthralgia|myalgia|muscle spasms|muscular weakness|arthritis\b|osteoarthritis|joint swelling|musculoskeletal stiffness|arthropathy|fibromyalgia)\b", re.I),
    "edema_swelling":   re.compile(r"\b(?:peripheral swelling|oedema\b|edema\b|swelling\b|swelling face|fluid retention)\b", re.I),
    "infections":       re.compile(r"\b(?:pneumonia|covid-19|influenza\b|infection\b|sepsis|cellulitis|viral infection|herpes zoster|septic shock)\b", re.I),
    "psychiatric":      re.compile(r"\b(?:anxiety|insomnia|depression|stress|agitation|hallucination|suicidal ideation|suicide attempt|irritability|nervousness|emotional distress|depressed mood)\b", re.I),
    "urinary":          re.compile(r"\b(?:urinary tract infection|dysuria|cystitis|haematuria|hematuria|urinary retention)\b", re.I),
    "injection_site":   re.compile(r"\b(?:injection site|infusion site)\b", re.I),
}

# ---- number of classes ----
labels = list(LABEL_PATTERNS)
print("CLASS COUNT:", len(labels))
print("CLASSES:", labels)

# Multi-label target matrix: y_<label> is 1 when the label's pattern matches
# anywhere in the report's joined symptom text, else 0.
s = df["REAC_pt_symptom"].fillna("").astype(str)
for lab in labels:
    pat = LABEL_PATTERNS[lab]
    df["y_" + lab] = s.str.contains(pat, na=False).astype(int)

# (optional) human-readable label list for debugging (comment out if it is slow)
y_cols = ["y_" + lab for lab in labels]
def row_labels_fast(row) -> str:
    """Join the names of all labels whose ``y_<label>`` flag is 1 in *row*.

    Labels appear in the order of ``labels`` and are separated by "; ".
    """
    return "; ".join(lab for lab in labels if row[f"y_{lab}"] == 1)

df["y_labels"] = df[y_cols].apply(row_labels_fast, axis=1)

df[["primaryid", "REAC_pt_symptom", "y_labels"]].head(5)

CLASS COUNT: 16
CLASSES: ['gastrointestinal', 'cardiovascular', 'neurological', 'dermatologic', 'hematologic', 'hepatic', 'renal', 'respiratory', 'general_systemic', 'pain_general', 'musculoskeletal', 'edema_swelling', 'infections', 'psychiatric', 'urinary', 'injection_site']


Unnamed: 0,primaryid,REAC_pt_symptom,y_labels
0,100324053,Drug resistance; Meningitis pneumococcal,
1,1012809821,Injection site reaction; General physical heal...,cardiovascular; respiratory; general_systemic;...
2,101406268,Internal haemorrhage; Injury; Pain; Depression...,hematologic; pain_general; infections; psychia...
3,101515934,Drug resistance; Cytomegalovirus infection,infections
4,1016133068,Asthenia; Infusion related reaction; Blood pre...,gastrointestinal; cardiovascular; respiratory;...


In [5]:
# Show the prevalence and save the result (into Raw_data).
# Reports with no symptom term left after the drop step are excluded.
before = len(df)
df_out = df.loc[df["REAC_n_rows_symptom"] > 0].copy()
after = len(df_out)

print(f"Reports before: {before:,}")
print(f"Reports after (REAC_symptom non-empty): {after:,}")
print(f"Dropped reports (empty after drop): {before-after:,}")

# Prevalence (percent of the remaining reports carrying each label)
print("\nLabel prevalence (% of reports):")
for lab in labels:
    pct = df_out["y_" + lab].mean() * 100
    print(f" - {lab:16s}: {pct:6.2f}%")

# Save
df_out.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print("\nSAVED:", OUT_CSV)

Reports before: 385,288
Reports after (REAC_symptom non-empty): 335,097
Dropped reports (empty after drop): 50,191

Label prevalence (% of reports):
 - gastrointestinal:  14.93%
 - cardiovascular  :   7.42%
 - neurological    :  10.81%
 - dermatologic    :  12.51%
 - hematologic     :   7.02%
 - hepatic         :   2.09%
 - renal           :   2.53%
 - respiratory     :   9.15%
 - general_systemic:  11.90%
 - pain_general    :  14.44%
 - musculoskeletal :   6.78%
 - edema_swelling  :   5.31%
 - infections      :  11.04%
 - psychiatric     :   5.85%
 - urinary         :   1.88%
 - injection_site  :   5.56%

SAVED: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\faers_25Q4_targets_multilabel.csv


# Qayta multi label target yasash

In [None]:
# # 02_target_refine_v3.ipynb — FINAL (V2 input + V3 drop + FULL patch)

# from pathlib import Path
# import re
# import numpy as np
# import pandas as pd

# # ---------- PATHS ----------
# CWD = Path.cwd()
# if (CWD / "Data").exists():
#     PROJECT_ROOT = CWD
# elif (CWD.parent / "Data").exists():
#     PROJECT_ROOT = CWD.parent
# else:
#     PROJECT_ROOT = CWD

# IN_CSV  = PROJECT_ROOT / "Data" / "Raw_data" / "faers_25Q4_targets_multilabel_v2.csv"
# OUT_CSV = PROJECT_ROOT / "Data" / "Raw_data" / "faers_25Q4_targets_multilabel_v3.csv"

# TEXT_COL = "REAC_pt_symptom_v2"  # v2 text input

# print("IN_CSV:", IN_CSV, "exists:", IN_CSV.exists())
# print("OUT_CSV:", OUT_CSV)

# df = pd.read_csv(IN_CSV, low_memory=False)
# print("Loaded:", df.shape)

# # ---------- 1) OLD y_* LARNI O'CHIRIB TASHLAYMIZ ----------
# old_y = [c for c in df.columns if c.startswith("y_")]
# df = df.drop(columns=old_y, errors="ignore")
# if "y_labels" in df.columns:
#     df = df.drop(columns=["y_labels"], errors="ignore")

# # ---------- 2) DROP RULES (symptom emas / admin / exposure / outcome / progression ...) ----------
# DROP_EXACT = {
#     "Death",
#     "Hospitalisation",
#     "Hospitalization",
#     "Drug interaction",
#     "Surgery",
#     "Disease progression",
#     "Malignant neoplasm progression",
#     "Neoplasm malignant",
#     "Condition aggravated",
#     "Toxicity to various agents",
#     "Drug effective for unapproved indication",
#     "Exposure during pregnancy",
#     "Maternal exposure during pregnancy",
#     "Foetal exposure during pregnancy",
#     "Exposure via skin contact",

#     # admin / misuse / no-symptom
#     "Drug abuse",
#     "Drug diversion",
#     "Drug dependence",
#     "Drug intolerance",
#     "Ill-defined disorder",
#     "Insurance issue",
#     "Drug resistance",
#     "Disease recurrence",

#     # diagnosis-heavy (symptom model uchun shovqin)
#     "Autism spectrum disorder",
#     "Breast cancer",
#     "Meningioma",
#     "Plasma cell myeloma",
#     "Systemic lupus erythematosus",
# }

# DROP_REGEX = re.compile(
#     r"\b(?:"
#     r"exposure\b|"
#     r"drug interaction|"
#     r"disease progression|"
#     r"neoplasm progression|"
#     r"neoplasm malignant|"
#     r"surgery\b|"
#     r"hospitalis(?:ation|ation)|"
#     r"death\b|"
#     r"condition aggravated|"
#     r"toxicity to various agents|"
#     r"drug effective for unapproved indication|"
#     r"drug abuse|drug diversion|drug dependence|drug intolerance|ill-defined disorder|"
#     r"insurance issue|drug resistance|disease recurrence|"
#     r"autism spectrum disorder|breast cancer|meningioma|myeloma|plasma cell myeloma|systemic lupus erythematosus"
#     r")\b",
#     re.I
# )

# def split_pts(s: str) -> list[str]:
#     return [p.strip() for p in str(s).split(";") if p.strip()]

# def drop_non_symptom(pts: list[str]) -> tuple[list[str], list[str]]:
#     kept, dropped = [], []
#     for p in pts:
#         if p in DROP_EXACT or DROP_REGEX.search(p):
#             dropped.append(p)
#         else:
#             kept.append(p)
#     return kept, dropped

# # ---------- 3) CLEAN REAC_pt_symptom (V3 text) ----------
# df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str)

# kept_texts = []
# dropped_texts = []

# for t in df[TEXT_COL].tolist():
#     pts = split_pts(t)
#     kept, dropped = drop_non_symptom(pts)
#     kept_texts.append("; ".join(kept))
#     dropped_texts.append("; ".join(dropped))

# df["REAC_pt_symptom_v3"] = kept_texts
# df["REAC_pt_dropped_v3"] = dropped_texts
# df["REAC_n_rows_symptom_v3"] = df["REAC_pt_symptom_v3"].apply(
#     lambda x: 0 if not str(x).strip() else len([p for p in str(x).split(";") if p.strip()])
# )

# # Empty bo'lib qolganlarni tushiramiz
# before = len(df)
# df = df[df["REAC_n_rows_symptom_v3"] > 0].copy()
# after = len(df)
# print(f"[V3] Rows before: {before:,} | after non-empty symptom: {after:,} | dropped empty: {before-after:,}")

# # ---------- 4) LABEL PATTERNS (V3 = V2 + FULL patch) ----------
# LABEL_PATTERNS = {
#     "gastrointestinal": re.compile(
#         r"\b(?:diarrh\w*|nausea|vomit\w*|abdominal|constipat\w*|dyspeps\w*|reflux|flatulence|colitis|pancreat\w*|"
#         r"haematochezia|hem(?:at|a)ochezia|gastrointestinal haemorrhage|rectal haemorrhage|stomatitis|dysphagia|"
#         r"crohn|ulcerative colitis|gastrointestinal disorder|intestinal obstruction)\b", re.I),

#     # PATCHED (FULL)
#     "cardiovascular": re.compile(
#         r"\b(?:"
#         r"hypertension|hypotension|palpitation\w*|tachycard\w*|bradycard\w*|atrial fibrillation|myocardial infarction|"
#         r"thrombosis|pulmonary embolism|cardiac failure|arrhythmia|"
#         r"cardiac arrest|cardiac fibrillation|"
#         r"blood pressure increased|blood pressure decreased|blood pressure fluctuation|"
#         r"heart rate increased|heart rate decreased|heart rate abnormal|"
#         r"intracardiac pressure increased"
#         r")\b", re.I),

#     "neurological": re.compile(
#         r"\b(?:headache|dizz\w*|seizure|tremor|confusional state|memory impairment|paraesthesia|hypoaesthesia|migraine|"
#         r"syncope|loss of consciousness|vertigo|balance disorder|amnesia|brain fog|somnolence|sedation|neuropathy|dementia|"
#         r"cerebrovascular accident|stroke)\b", re.I),

#     "dermatologic": re.compile(
#         r"\b(?:rash\w*|prurit\w*|erythema|urticaria|eczema|psoriasis|alopecia|blister|acne|dry skin|skin burning sensation|"
#         r"skin exfoliation|dermatitis|atopic|hidradenitis)\b", re.I),

#     "hematologic": re.compile(
#         r"\b(?:anaemi\w*|neutropen\w*|leukopen\w*|thrombocytopen\w*|pancytopen\w*|myelosuppression|febrile neutropenia|"
#         r"haemorrhag\w*|hemorrhag\w*|white blood cell count decreased|platelet count decreased)\b", re.I),

#     # PATCHED (FULL)
#     "hepatic": re.compile(
#         r"\b(?:"
#         r"hepatit\w*|jaundice|liver injury|drug-induced liver injury|hepatic function abnormal|liver disorder|"
#         r"hepatic enzyme increased|hepatic enzyme abnormal|hepatic infection|hepatic cytolysis|"
#         r"hepatic haemorrhage|hepatic hemorrhage|hepatic vein thrombosis|hepatic cyst|hepatic necrosis|"
#         r"alanine aminotransferase increased|aspartate aminotransferase increased|transaminases increased|liver enzymes increased|"
#         r"bilirubin increased|bilirubin conjugated increased|liver function test increased"
#         r")\b", re.I),

#     "renal": re.compile(
#         r"\b(?:acute kidney injury|renal impairment|renal failure|renal disorder|nephrolithiasis)\b", re.I),

#     # PATCHED (FULL)
#     "respiratory": re.compile(
#         r"\b(?:"
#         r"dyspnoea|dyspnea|cough\w*|asthma|wheez\w*|bronchit\w*|sinusitis|rhinitis|nasopharyngitis|"
#         r"upper respiratory tract infection|lower respiratory tract infection|respiratory tract infection|"
#         r"interstitial lung disease|pleural effusion|"
#         r"oxygen saturation decreased|oxygen saturation abnormal|"
#         r"lung disorder|increased bronchial secretion|epistaxis"
#         r")\b", re.I),

#     "general_systemic": re.compile(
#         r"\b(?:fatigue|malaise|asthenia|pyrexia|fever|chills|feeling abnormal|illness|influenza like illness|"
#         r"feeling hot|hot flush|flushing|general physical health deterioration)\b", re.I),

#     "pain_general": re.compile(
#         r"\b(?:\bpain\b|chest pain|chest discomfort|back pain|bone pain|pain in extremity|discomfort)\b", re.I),

#     "musculoskeletal": re.compile(
#         r"\b(?:arthralgia|myalgia|muscle spasms|muscular weakness|arthritis\b|osteoarthritis|joint swelling|"
#         r"musculoskeletal stiffness|arthropathy|fibromyalgia|rhabdomyolysis)\b", re.I),

#     "edema_swelling": re.compile(
#         r"\b(?:peripheral swelling|oedema\b|edema\b|swelling\b|swelling face|fluid retention|angioedema)\b", re.I),

#     # PATCHED (infective qo'shildi)
#     "infections": re.compile(
#         r"\b(?:"
#         r"pneumonia|covid-19|influenza\b|infection\b|infective|"
#         r"sepsis|septic shock|cellulitis|viral infection|herpes zoster"
#         r")\b", re.I),

#     "psychiatric": re.compile(
#         r"\b(?:anxiety|insomnia|depression|stress|agitation|hallucination|suicidal ideation|suicide attempt|"
#         r"irritability|nervousness|emotional distress|depressed mood)\b", re.I),

#     # PATCHED (FULL)
#     "urinary": re.compile(
#         r"\b(?:"
#         r"urinary tract infection|dysuria|cystitis|haematuria|hematuria|urinary retention|"
#         r"urinary tract disorder|lower urinary tract symptoms|pollakiuria|"
#         r"urinary frequency|nocturia|micturition urgency|"
#         r"proteinuria|urinary occult blood|urinary hesitation|"
#         r"urinary tract obstruction|bladder disorder|"
#         r"urinary bladder haemorrhage|haemorrhage urinary tract"
#         r")\b", re.I),

#     "injection_site": re.compile(
#         r"\b(?:injection site|infusion site|infusion related reaction|infusion-related reaction)\b", re.I),

#     # PATCHED (FULL – eye infection / haemorrhage / oedema + eye\b)
#     "ocular_visual": re.compile(
#         r"\b(?:"
#         r"eye\b|ocular|"
#         r"eye infection|eye haemorrhage|eye hemorrhage|"
#         r"eye oedema|eye edema|"
#         r"vision blurred|visual impairment|photophobia|"
#         r"conjunctivitis|dry eye|eye pain|cataract|"
#         r"eyelid oedema|eyelid edema"
#         r")\b", re.I),

#     "metabolic_endocrine": re.compile(
#         r"\b(?:weight increased|weight decreased|blood glucose increased|blood glucose decreased|hyperglyc\w*|hypoglyc\w*|"
#         r"diabetes mellitus|hyponatraemia|hyponatremia)\b", re.I),

#     "hypersensitivity_allergy": re.compile(
#         r"\b(?:hypersensitivity|drug hypersensitivity|anaphylactic reaction|anaphylaxis|allergic reaction|anaphylactic shock|"
#         r"cytokine release syndrome|infusion related reaction|"
#         r"drug reaction with eosinophilia and systemic symptoms)\b", re.I),

#     "injury_accident": re.compile(
#         r"\b(?:fall\b|fracture|injury|contusion|wound)\b", re.I),

#     # PATCHED (FULL)
#     "pregnancy_reproductive": re.compile(
#         r"\b(?:"
#         r"pregnancy|abortion spontaneous|abortion threatened|miscarriage|"
#         r"premature delivery|premature labour|premature labor|premature baby|"
#         r"menstrual disorder|dysmenorrhoea|dysmenorrhea|"
#         r"intermenstrual bleeding|heavy menstrual bleeding|pelvic pain"
#         r")\b", re.I),
# }

# labels = list(LABEL_PATTERNS.keys())
# print("[V3] label count:", len(labels))
# print(labels)

# # ---------- 5) BUILD y_* ----------
# s = df["REAC_pt_symptom_v3"].fillna("").astype(str)

# for lab, pat in LABEL_PATTERNS.items():
#     df[f"y_{lab}"] = s.str.contains(pat, na=False).astype(int)

# y_cols = [f"y_{lab}" for lab in labels]
# row_sum = df[y_cols].sum(axis=1)

# zero_label_rows = int((row_sum == 0).sum())
# print(f"[V3] 0-label rows (will be dropped in 03_load_data): {zero_label_rows:,} / {len(df):,} ({zero_label_rows/len(df)*100:.2f}%)")

# # y_labels (debug)
# lab_names = labels
# Y = df[y_cols].to_numpy(dtype=np.int8)
# idxs = [np.flatnonzero(r) for r in Y]
# df["y_labels"] = ["; ".join([lab_names[i] for i in ix]) for ix in idxs]

# # ---------- 6) PREVALENCE ----------
# print("\n[V3] Label prevalence (% of rows):")
# for lab in labels:
#     pct = df[f"y_{lab}"].mean() * 100
#     print(f" - {lab:24s}: {pct:6.2f}%")

# # ---------- 7) SAVE ----------
# df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
# print("\n[SAVED]", OUT_CSV)

IN_CSV: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\faers_25Q4_targets_multilabel_v2.csv exists: True
OUT_CSV: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\faers_25Q4_targets_multilabel_v3.csv
Loaded: (308959, 75)
[V3] Rows before: 308,959 | after non-empty symptom: 308,959 | dropped empty: 0
[V3] label count: 21
['gastrointestinal', 'cardiovascular', 'neurological', 'dermatologic', 'hematologic', 'hepatic', 'renal', 'respiratory', 'general_systemic', 'pain_general', 'musculoskeletal', 'edema_swelling', 'infections', 'psychiatric', 'urinary', 'injection_site', 'ocular_visual', 'metabolic_endocrine', 'hypersensitivity_allergy', 'injury_accident', 'pregnancy_reproductive']
[V3] 0-label rows (will be dropped in 03_load_data): 59,280 / 308,959 (19.19%)

[V3] Label prevalence (% of rows):
 - gastrointestinal        :  17.40%
 - cardiovascular          :   8.78%
 -

In [None]:
# #02_target_refine_v3.ipynb — FINAL (V2 code + patches added)

# from pathlib import Path
# import re
# import numpy as np
# import pandas as pd

# # ---------- PATHS ----------
# CWD = Path.cwd()
# if (CWD / "Data").exists():
#     PROJECT_ROOT = CWD
# elif (CWD.parent / "Data").exists():
#     PROJECT_ROOT = CWD.parent
# else:
#     PROJECT_ROOT = CWD

# IN_CSV  = PROJECT_ROOT / "Data" / "Raw_data" / "faers_25Q4_targets_multilabel_v2.csv"        # v2 input
# OUT_CSV = PROJECT_ROOT / "Data" / "Raw_data" / "faers_25Q4_targets_multilabel_v3.csv"     # v3 output

# TEXT_COL = "REAC_pt_symptom_v2"

# print("IN_CSV:", IN_CSV, "exists:", IN_CSV.exists())
# print("OUT_CSV:", OUT_CSV)

# df = pd.read_csv(IN_CSV, low_memory=False)
# print("Loaded:", df.shape)

# # ---------- 1) REMOVE THE OLD y_* COLUMNS ----------
# old_y = [c for c in df.columns if c.startswith("y_")]
# df = df.drop(columns=old_y, errors="ignore")
# if "y_labels" in df.columns:
#     df = df.drop(columns=["y_labels"], errors="ignore")

# # ---------- 2) DROP RULES (symptom emas / admin / exposure / outcome / progression ...) ----------
# DROP_EXACT = {
#     "Death",
#     "Hospitalisation",
#     "Hospitalization",
#     "Drug interaction",
#     "Surgery",
#     "Disease progression",
#     "Malignant neoplasm progression",
#     "Neoplasm malignant",
#     "Condition aggravated",
#     "Toxicity to various agents",
#     "Drug effective for unapproved indication",
#     "Exposure during pregnancy",
#     "Maternal exposure during pregnancy",
#     "Foetal exposure during pregnancy",
#     "Exposure via skin contact",

#     # admin / misuse / no-symptom
#     "Drug abuse",
#     "Drug diversion",
#     "Drug dependence",
#     "Drug intolerance",
#     "Ill-defined disorder",
#     "Insurance issue",
#     "Drug resistance",
#     "Disease recurrence",

#     # diagnosis-heavy (symptom model uchun shovqin)
#     "Autism spectrum disorder",
#     "Breast cancer",
#     "Meningioma",
#     "Plasma cell myeloma",
#     "Systemic lupus erythematosus",
# }

# DROP_REGEX = re.compile(
#     r"\b(?:"
#     r"exposure\b|"
#     r"drug interaction|"
#     r"disease progression|"
#     r"neoplasm progression|"
#     r"neoplasm malignant|"
#     r"surgery\b|"
#     r"hospitalis(?:ation|ation)|"
#     r"death\b|"
#     r"condition aggravated|"
#     r"toxicity to various agents|"
#     r"drug effective for unapproved indication|"
#     r"drug abuse|drug diversion|drug dependence|drug intolerance|ill-defined disorder|"
#     r"insurance issue|drug resistance|disease recurrence|"
#     r"autism spectrum disorder|breast cancer|meningioma|myeloma|plasma cell myeloma|systemic lupus erythematosus"
#     r")\b",
#     re.I
# )

# def split_pts(s: str) -> list[str]:
#     return [p.strip() for p in str(s).split(";") if p.strip()]

# def drop_non_symptom(pts: list[str]) -> tuple[list[str], list[str]]:
#     kept, dropped = [], []
#     for p in pts:
#         if p in DROP_EXACT or DROP_REGEX.search(p):
#             dropped.append(p)
#         else:
#             kept.append(p)
#     return kept, dropped

# # ---------- 3) CLEAN REAC_pt_symptom (V3 text) ----------
# df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str)

# kept_texts = []
# dropped_texts = []

# for t in df[TEXT_COL].tolist():
#     pts = split_pts(t)
#     kept, dropped = drop_non_symptom(pts)
#     kept_texts.append("; ".join(kept))
#     dropped_texts.append("; ".join(dropped))

# df["REAC_pt_symptom_v3"] = kept_texts
# df["REAC_pt_dropped_v3"] = dropped_texts
# df["REAC_n_rows_symptom_v3"] = df["REAC_pt_symptom_v3"].apply(
#     lambda x: 0 if not str(x).strip() else len([p for p in str(x).split(";") if p.strip()])
# )

# # Empty bo'lib qolganlarni tushiramiz
# before = len(df)
# df = df[df["REAC_n_rows_symptom_v3"] > 0].copy()
# after = len(df)
# print(f"[V3] Rows before: {before:,} | after non-empty symptom: {after:,} | dropped empty: {before-after:,}")

# # ---------- 4) LABEL PATTERNS (V3 = V2 + patchlar) ----------
# LABEL_PATTERNS = {
#     "gastrointestinal": re.compile(
#         r"\b(?:diarrh\w*|nausea|vomit\w*|abdominal|constipat\w*|dyspeps\w*|reflux|flatulence|colitis|pancreat\w*|"
#         r"haematochezia|hem(?:at|a)ochezia|gastrointestinal haemorrhage|rectal haemorrhage|stomatitis|dysphagia|"
#         r"crohn|ulcerative colitis|gastrointestinal disorder|intestinal obstruction)\b", re.I),  # ✅ group warning fixed

#     # ✅ PATCHED
#     "cardiovascular": re.compile(
#         r"\b(?:"
#         r"hypertension|hypotension|palpitation\w*|tachycard\w*|bradycard\w*|atrial fibrillation|myocardial infarction|"
#         r"thrombosis|pulmonary embolism|cardiac failure|arrhythmia|"
#         r"cardiac arrest|cardiac fibrillation|"
#         r"blood pressure increased|blood pressure decreased|blood pressure fluctuation|"
#         r"heart rate increased|heart rate decreased|heart rate abnormal|"
#         r"intracardiac pressure increased"
#         r")\b", re.I),

#     "neurological": re.compile(
#         r"\b(?:headache|dizz\w*|seizure|tremor|confusional state|memory impairment|paraesthesia|hypoaesthesia|migraine|"
#         r"syncope|loss of consciousness|vertigo|balance disorder|amnesia|brain fog|somnolence|sedation|neuropathy|dementia|"
#         r"cerebrovascular accident|stroke)\b", re.I),

#     "dermatologic": re.compile(
#         r"\b(?:rash\w*|prurit\w*|erythema|urticaria|eczema|psoriasis|alopecia|blister|acne|dry skin|skin burning sensation|"
#         r"skin exfoliation|dermatitis|atopic|hidradenitis)\b", re.I),

#     "hematologic": re.compile(
#         r"\b(?:anaemi\w*|neutropen\w*|leukopen\w*|thrombocytopen\w*|pancytopen\w*|myelosuppression|febrile neutropenia|"
#         r"haemorrhag\w*|hemorrhag\w*|white blood cell count decreased|platelet count decreased)\b", re.I),

#     # ✅ PATCHED
#     "hepatic": re.compile(
#         r"\b(?:"
#         r"hepatit\w*|jaundice|liver injury|drug-induced liver injury|hepatic function abnormal|liver disorder|"
#         r"hepatic enzyme increased|hepatic enzyme abnormal|hepatic infection|"
#         r"hepatic cytolysis|"
#         r"alanine aminotransferase increased|aspartate aminotransferase increased|transaminases increased|liver enzymes increased|"
#         r"bilirubin increased|bilirubin conjugated increased"
#         r")\b", re.I),

#     "renal": re.compile(r"\b(?:acute kidney injury|renal impairment|renal failure|renal disorder|nephrolithiasis)\b", re.I),

#     # ✅ PATCHED
#     "respiratory": re.compile(
#         r"\b(?:"
#         r"dyspnoea|dyspnea|cough\w*|asthma|wheez\w*|bronchit\w*|sinusitis|rhinitis|nasopharyngitis|"
#         r"upper respiratory tract infection|lower respiratory tract infection|respiratory tract infection|"
#         r"interstitial lung disease|pleural effusion|"
#         r"oxygen saturation decreased|oxygen saturation abnormal|"
#         r"lung disorder|increased bronchial secretion|epistaxis"
#         r")\b", re.I),

#     "general_systemic": re.compile(
#         r"\b(?:fatigue|malaise|asthenia|pyrexia|fever|chills|feeling abnormal|illness|influenza like illness|"
#         r"feeling hot|hot flush|flushing|general physical health deterioration)\b", re.I),

#     "pain_general": re.compile(r"\b(?:\bpain\b|chest pain|chest discomfort|back pain|bone pain|pain in extremity|discomfort)\b", re.I),

#     "musculoskeletal": re.compile(
#         r"\b(?:arthralgia|myalgia|muscle spasms|muscular weakness|arthritis\b|osteoarthritis|joint swelling|"
#         r"musculoskeletal stiffness|arthropathy|fibromyalgia|rhabdomyolysis)\b", re.I),

#     "edema_swelling": re.compile(
#         r"\b(?:peripheral swelling|oedema\b|edema\b|swelling\b|swelling face|fluid retention|angioedema)\b", re.I),

#     "infections": re.compile(
#         r"\b(?:pneumonia|covid-19|influenza\b|infection\b|sepsis|cellulitis|viral infection|herpes zoster|septic shock)\b", re.I),

#     "psychiatric": re.compile(
#         r"\b(?:anxiety|insomnia|depression|stress|agitation|hallucination|suicidal ideation|suicide attempt|"
#         r"irritability|nervousness|emotional distress|depressed mood)\b", re.I),

#     # ✅ PATCHED
#     "urinary": re.compile(
#         r"\b(?:"
#         r"urinary tract infection|dysuria|cystitis|haematuria|hematuria|urinary retention|"
#         r"urinary tract disorder|urinary tract pain|lower urinary tract symptoms|pollakiuria|"
#         r"urinary frequency|nocturia|micturition urgency|bladder pain|chromaturia|urine abnormal"
#         r")\b", re.I),

#     "injection_site": re.compile(
#         r"\b(?:injection site|infusion site|infusion related reaction|infusion-related reaction)\b", re.I),

#     # ✅ PATCHED
#     "ocular_visual": re.compile(
#         r"\b(?:"
#         r"eye irritation|conjunctivitis|vision blurred|visual impairment|blindness|photophobia|ocular|eye pain|dry eye|cataract|"
#         r"eye swelling|eye disorder|eyelid oedema|eyelid edema"
#         r")\b", re.I),

#     "metabolic_endocrine": re.compile(
#         r"\b(?:weight increased|weight decreased|blood glucose increased|blood glucose decreased|hyperglyc\w*|hypoglyc\w*|"
#         r"diabetes mellitus|hyponatraemia|hyponatremia)\b", re.I),

#     "hypersensitivity_allergy": re.compile(
#         r"\b(?:hypersensitivity|drug hypersensitivity|anaphylactic reaction|anaphylaxis|allergic reaction|anaphylactic shock|"
#         r"cytokine release syndrome|infusion related reaction|"
#         r"drug reaction with eosinophilia and systemic symptoms)\b", re.I),

#     "injury_accident": re.compile(r"\b(?:fall\b|fracture|injury|contusion|wound)\b", re.I),

#     "pregnancy_reproductive": re.compile(
#         r"\b(?:abortion spontaneous|miscarriage|heavy menstrual bleeding|intermenstrual bleeding|premature baby|pregnancy)\b", re.I),
# }

# labels = list(LABEL_PATTERNS.keys())
# print("[V3] label count:", len(labels))
# print(labels)

# # ---------- 5) BUILD y_* ----------
# s = df["REAC_pt_symptom_v3"].fillna("").astype(str)

# for lab, pat in LABEL_PATTERNS.items():
#     df[f"y_{lab}"] = s.str.contains(pat, na=False).astype(int)

# y_cols = [f"y_{lab}" for lab in labels]
# row_sum = df[y_cols].sum(axis=1)

# zero_label_rows = int((row_sum == 0).sum())
# print(f"[V3] 0-label rows (will be dropped in 03_load_data): {zero_label_rows:,} / {len(df):,} ({zero_label_rows/len(df)*100:.2f}%)")

# # y_labels (debug)
# lab_names = labels
# Y = df[y_cols].to_numpy(dtype=np.int8)
# idxs = [np.flatnonzero(r) for r in Y]
# df["y_labels"] = ["; ".join([lab_names[i] for i in ix]) for ix in idxs]

# # ---------- 6) PREVALENCE ----------
# print("\n[V3] Label prevalence (% of rows):")
# for lab in labels:
#     pct = df[f"y_{lab}"].mean() * 100
#     print(f" - {lab:24s}: {pct:6.2f}%")

# # ---------- 7) SAVE ----------
# df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
# print("\n[SAVED]", OUT_CSV)

IN_CSV: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\faers_25Q4_targets_multilabel_v2.csv exists: True
OUT_CSV: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\faers_25Q4_targets_multilabel_v3.csv
Loaded: (308959, 75)
[V3] Rows before: 308,959 | after non-empty symptom: 308,959 | dropped empty: 0
[V3] label count: 21
['gastrointestinal', 'cardiovascular', 'neurological', 'dermatologic', 'hematologic', 'hepatic', 'renal', 'respiratory', 'general_systemic', 'pain_general', 'musculoskeletal', 'edema_swelling', 'infections', 'psychiatric', 'urinary', 'injection_site', 'ocular_visual', 'metabolic_endocrine', 'hypersensitivity_allergy', 'injury_accident', 'pregnancy_reproductive']
[V3] 0-label rows (will be dropped in 03_load_data): 59,568 / 308,959 (19.28%)

[V3] Label prevalence (% of rows):
 - gastrointestinal        :  17.40%
 - cardiovascular          :   8.78%
 -

In [None]:
# # 02_target_refine_v2.ipynb

# from pathlib import Path
# import re
# import numpy as np
# import pandas as pd

# # ---------- PATHS ----------
# CWD = Path.cwd()
# if (CWD / "Data").exists():
#     PROJECT_ROOT = CWD
# elif (CWD.parent / "Data").exists():
#     PROJECT_ROOT = CWD.parent
# else:
#     PROJECT_ROOT = CWD

# IN_CSV  = PROJECT_ROOT / "Data" / "Raw_data" / "faers_25Q4_targets_multilabel.csv"      # sizdagi hozirgi v1
# OUT_CSV = PROJECT_ROOT / "Data" / "Raw_data" / "faers_25Q4_targets_multilabel_v2.csv"   # yangi v2

# TEXT_COL = "REAC_pt_symptom"

# print("IN_CSV:", IN_CSV, "exists:", IN_CSV.exists())
# print("OUT_CSV:", OUT_CSV)

# df = pd.read_csv(IN_CSV, low_memory=False)
# print("Loaded:", df.shape)

# # ---------- 1) OLD y_* LARNI O'CHIRIB TASHLAYMIZ ----------
# old_y = [c for c in df.columns if c.startswith("y_")]
# df = df.drop(columns=old_y, errors="ignore")

# # ---------- 2) DROP RULES (symptom emas / admin / exposure / outcome / progression ...) ----------
# DROP_EXACT = {
#     "Death",
#     "Hospitalisation",
#     "Hospitalization",
#     "Drug interaction",
#     "Surgery",
#     "Disease progression",
#     "Malignant neoplasm progression",
#     "Neoplasm malignant",
#     "Condition aggravated",
#     "Toxicity to various agents",
#     "Drug effective for unapproved indication",
#     "Exposure during pregnancy",
#     "Maternal exposure during pregnancy",
#     "Foetal exposure during pregnancy",
#     "Exposure via skin contact",

#     # admin / misuse / no-symptom
#     "Drug abuse",
#     "Drug diversion",
#     "Drug dependence",
#     "Drug intolerance",
#     "Ill-defined disorder",
#     "Insurance issue",
#     "Drug resistance",
#     "Disease recurrence",

#     # diagnosis-heavy (symptom model uchun shovqin)
#     "Autism spectrum disorder",
#     "Breast cancer",
#     "Meningioma",
#     "Plasma cell myeloma",
#     "Systemic lupus erythematosus",
# }

# DROP_REGEX = re.compile(
#     r"\b(?:"
#     r"exposure\b|"
#     r"drug interaction|"
#     r"disease progression|"
#     r"neoplasm progression|"
#     r"neoplasm malignant|"
#     r"surgery\b|"
#     r"hospitalis(?:ation|ation)|"
#     r"death\b|"
#     r"condition aggravated|"
#     r"toxicity to various agents|"
#     r"drug effective for unapproved indication|"
#     r"drug abuse|drug diversion|drug dependence|drug intolerance|ill-defined disorder|"
#     r"insurance issue|drug resistance|disease recurrence|"
#     r"autism spectrum disorder|breast cancer|meningioma|myeloma|plasma cell myeloma|systemic lupus erythematosus"
#     r")\b",
#     re.I
# )

# def split_pts(s: str) -> list[str]:
#     return [p.strip() for p in str(s).split(";") if p.strip()]

# def drop_non_symptom(pts: list[str]) -> tuple[list[str], list[str]]:
#     kept, dropped = [], []
#     for p in pts:
#         if p in DROP_EXACT or DROP_REGEX.search(p):
#             dropped.append(p)
#         else:
#             kept.append(p)
#     return kept, dropped

# # ---------- 3) CLEAN REAC_pt_symptom (V2) ----------
# df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str)

# kept_texts = []
# dropped_texts = []

# for t in df[TEXT_COL].tolist():
#     pts = split_pts(t)
#     kept, dropped = drop_non_symptom(pts)
#     kept_texts.append("; ".join(kept))
#     dropped_texts.append("; ".join(dropped))

# df["REAC_pt_symptom_v2"] = kept_texts
# df["REAC_pt_dropped_v2"] = dropped_texts
# df["REAC_n_rows_symptom_v2"] = df["REAC_pt_symptom_v2"].apply(
#     lambda x: 0 if not str(x).strip() else len([p for p in str(x).split(";") if p.strip()])
# )

# # Drop the rows that ended up empty (a row with no symptom left is of no use to us)
# before = len(df)
# df = df[df["REAC_n_rows_symptom_v2"] > 0].copy()
# after = len(df)
# print(f"[V2] Rows before: {before:,} | after non-empty symptom: {after:,} | dropped empty: {before-after:,}")

# # ---------- 4) LABEL PATTERNS (KENGAYTIRILGAN V2) ----------
# LABEL_PATTERNS = {
#     "gastrointestinal": re.compile(
#         r"\b(?:diarrh\w*|nausea|vomit\w*|abdominal|constipat\w*|dyspeps\w*|reflux|flatulence|colitis|pancreat\w*|"
#         r"haematochezia|hem(at|a)ochezia|gastrointestinal haemorrhage|rectal haemorrhage|stomatitis|dysphagia|"
#         r"crohn|ulcerative colitis|gastrointestinal disorder|intestinal obstruction)\b", re.I),

#     "cardiovascular": re.compile(
#         r"\b(?:hypertension|hypotension|palpitation\w*|tachycard\w*|bradycard\w*|atrial fibrillation|myocardial infarction|"
#         r"thrombosis|pulmonary embolism|cardiac failure|arrhythmia|cardiac arrest|cardiac disorder|"
#         r"blood pressure increased|blood pressure decreased|heart rate increased|heart rate decreased)\b", re.I),

#     "neurological": re.compile(
#         r"\b(?:headache|dizz\w*|seizure|tremor|confusional state|memory impairment|paraesthesia|hypoaesthesia|migraine|"
#         r"syncope|loss of consciousness|vertigo|balance disorder|amnesia|brain fog|somnolence|sedation|neuropathy|dementia|"
#         r"cerebrovascular accident|stroke)\b", re.I),

#     "dermatologic": re.compile(
#         r"\b(?:rash\w*|prurit\w*|erythema|urticaria|eczema|psoriasis|alopecia|blister|acne|dry skin|skin burning sensation|"
#         r"skin exfoliation|dermatitis|atopic|hidradenitis)\b", re.I),

#     "hematologic": re.compile(
#         r"\b(?:anaemi\w*|neutropen\w*|leukopen\w*|thrombocytopen\w*|pancytopen\w*|myelosuppression|febrile neutropenia|"
#         r"haemorrhag\w*|hemorrhag\w*|white blood cell count decreased|platelet count decreased)\b", re.I),

#     "hepatic": re.compile(
#         r"\b(?:hepatit\w*|jaundice|liver injury|drug-induced liver injury|hepatic function abnormal|liver disorder|"
#         r"hepatic enzyme increased|hepatic cytolysis|alanine aminotransferase increased|aspartate aminotransferase increased|"
#         r"transaminases increased|liver enzymes increased)\b", re.I),

#     "renal": re.compile(r"\b(?:acute kidney injury|renal impairment|renal failure|renal disorder|nephrolithiasis)\b", re.I),

#     "respiratory": re.compile(
#         r"\b(?:dyspnoea|dyspnea|cough\w*|asthma|wheez\w*|bronchit\w*|sinusitis|rhinitis|nasopharyngitis|"
#         r"upper respiratory tract infection|lower respiratory tract infection|respiratory tract infection|"
#         r"interstitial lung disease|pleural effusion|oxygen saturation decreased|epistaxis)\b", re.I),

#     "general_systemic": re.compile(
#         r"\b(?:fatigue|malaise|asthenia|pyrexia|fever|chills|feeling abnormal|illness|influenza like illness|"
#         r"feeling hot|hot flush|flushing|general physical health deterioration)\b", re.I),

#     "pain_general": re.compile(r"\b(?:\bpain\b|chest pain|chest discomfort|back pain|bone pain|pain in extremity|discomfort)\b", re.I),

#     "musculoskeletal": re.compile(
#         r"\b(?:arthralgia|myalgia|muscle spasms|muscular weakness|arthritis\b|osteoarthritis|joint swelling|"
#         r"musculoskeletal stiffness|arthropathy|fibromyalgia|rhabdomyolysis)\b", re.I),

#     "edema_swelling": re.compile(
#         r"\b(?:peripheral swelling|oedema\b|edema\b|swelling\b|swelling face|fluid retention|angioedema)\b", re.I),

#     "infections": re.compile(
#         r"\b(?:pneumonia|covid-19|influenza\b|infection\b|sepsis|cellulitis|viral infection|herpes zoster|septic shock)\b", re.I),

#     "psychiatric": re.compile(
#         r"\b(?:anxiety|insomnia|depression|stress|agitation|hallucination|suicidal ideation|suicide attempt|"
#         r"irritability|nervousness|emotional distress|depressed mood)\b", re.I),

#     "urinary": re.compile(r"\b(?:urinary tract infection|dysuria|cystitis|haematuria|hematuria|urinary retention)\b", re.I),

#     "injection_site": re.compile(r"\b(?:injection site|infusion site|infusion related reaction|infusion-related reaction)\b", re.I),

#     # NEW
#     "ocular_visual": re.compile(r"\b(?:eye irritation|conjunctivitis|vision blurred|visual impairment|blindness|photophobia|ocular|eye pain|dry eye|cataract)\b", re.I),

#     "metabolic_endocrine": re.compile(r"\b(?:weight increased|weight decreased|blood glucose increased|blood glucose decreased|hyperglyc\w*|hypoglyc\w*|diabetes mellitus|hyponatraemia|hyponatremia)\b", re.I),

#     "hypersensitivity_allergy": re.compile(
#         r"\b(?:hypersensitivity|drug hypersensitivity|anaphylactic reaction|anaphylaxis|allergic reaction|anaphylactic shock|"
#         r"cytokine release syndrome|infusion related reaction|"
#         r"drug reaction with eosinophilia and systemic symptoms)\b", re.I),

#     "injury_accident": re.compile(r"\b(?:fall\b|fracture|injury|contusion|wound)\b", re.I),

#     "pregnancy_reproductive": re.compile(r"\b(?:abortion spontaneous|miscarriage|heavy menstrual bleeding|intermenstrual bleeding|premature baby|pregnancy)\b", re.I),
# }

# labels = list(LABEL_PATTERNS.keys())
# print("[V2] label count:", len(labels))
# print(labels)

# # ---------- 5) BUILD y_* ----------
# s = df["REAC_pt_symptom_v2"].fillna("").astype(str)

# for lab, pat in LABEL_PATTERNS.items():
#     df[f"y_{lab}"] = s.str.contains(pat, na=False).astype(int)

# y_cols = [f"y_{lab}" for lab in labels]
# row_sum = df[y_cols].sum(axis=1)

# zero_label_rows = int((row_sum == 0).sum())
# print(f"[V2] 0-label rows (will be dropped in 03_load_data): {zero_label_rows:,} / {len(df):,} ({zero_label_rows/len(df)*100:.2f}%)")

# # y_labels (debug)
# lab_names = labels
# Y = df[y_cols].to_numpy(dtype=np.int8)

# idxs = [np.flatnonzero(r) for r in Y]
# df["y_labels"] = ["; ".join([lab_names[i] for i in ix]) for ix in idxs]

# # ---------- 6) PREVALENCE ----------
# print("\n[V2] Label prevalence (% of rows):")
# for lab in labels:
#     pct = df[f"y_{lab}"].mean() * 100
#     print(f" - {lab:24s}: {pct:6.2f}%")

# # ---------- 7) SAVE ----------
# df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
# print("\n[SAVED]", OUT_CSV)

IN_CSV: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\faers_25Q4_targets_multilabel.csv exists: True
OUT_CSV: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\faers_25Q4_targets_multilabel_v2.csv
Loaded: (335097, 67)
[V2] Rows before: 335,097 | after non-empty symptom: 308,959 | dropped empty: 26,138
[V2] label count: 21
['gastrointestinal', 'cardiovascular', 'neurological', 'dermatologic', 'hematologic', 'hepatic', 'renal', 'respiratory', 'general_systemic', 'pain_general', 'musculoskeletal', 'edema_swelling', 'infections', 'psychiatric', 'urinary', 'injection_site', 'ocular_visual', 'metabolic_endocrine', 'hypersensitivity_allergy', 'injury_accident', 'pregnancy_reproductive']


  df[f"y_{lab}"] = s.str.contains(pat, na=False).astype(int)


[V2] 0-label rows (will be dropped in 03_load_data): 59,952 / 308,959 (19.40%)

[V2] Label prevalence (% of rows):
 - gastrointestinal        :  17.40%
 - cardiovascular          :   9.01%
 - neurological            :  14.28%
 - dermatologic            :  15.18%
 - hematologic             :   8.45%
 - hepatic                 :   2.94%
 - renal                   :   2.75%
 - respiratory             :   9.93%
 - general_systemic        :  13.66%
 - pain_general            :  15.66%
 - musculoskeletal         :   7.60%
 - edema_swelling          :   6.12%
 - infections              :  11.97%
 - psychiatric             :   6.35%
 - urinary                 :   2.04%
 - injection_site          :   6.62%
 - ocular_visual           :   4.41%
 - metabolic_endocrine     :   5.30%
 - hypersensitivity_allergy:   3.78%
 - injury_accident         :   6.88%
 - pregnancy_reproductive  :   0.87%

[SAVED] c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data

In [2]:
# 02_target_refine_v2.ipynb

from pathlib import Path
import re
import numpy as np
import pandas as pd

# ---------- PATHS ----------
# The project root is the first of (working directory, its parent) that
# contains a "Data" folder; if neither does, the working directory is used.
CWD = Path.cwd()
PROJECT_ROOT = next(
    (candidate for candidate in (CWD, CWD.parent) if (candidate / "Data").exists()),
    CWD,
)

RAW_DATA_DIR = PROJECT_ROOT / "Data" / "Raw_data"
IN_CSV = RAW_DATA_DIR / "faers_25Q4_targets_multilabel.csv"      # current v1 file (input)
OUT_CSV = RAW_DATA_DIR / "faers_25Q4_targets_multilabel_v2.csv"  # new v2 file (output)

# Column holding the ';'-separated reaction terms that get cleaned and labelled.
TEXT_COL = "REAC_pt_symptom"

print("IN_CSV:", IN_CSV, "exists:", IN_CSV.exists())
print("OUT_CSV:", OUT_CSV)

df = pd.read_csv(IN_CSV, low_memory=False)
print("Loaded:", df.shape)

# ---------- 1) REMOVE THE OLD y_* COLUMNS ----------
# Every target column is rebuilt later in this cell, so stale ones are discarded.
old_y = [column for column in df.columns if column.startswith("y_")]
df = df.drop(columns=old_y, errors="ignore")

# ---------- 2) DROP RULES (not a symptom / admin / exposure / outcome / progression ...) ----------
# Terms that are removed from the reaction text before labelling.
# DROP_EXACT is compared case-sensitively against a whole (stripped) term.
DROP_EXACT = {
    "Death",
    "Hospitalisation",
    "Hospitalization",
    "Drug interaction",
    "Surgery",
    "Disease progression",
    "Malignant neoplasm progression",
    "Neoplasm malignant",
    "Condition aggravated",
    "Toxicity to various agents",
    "Drug effective for unapproved indication",
    "Exposure during pregnancy",
    "Maternal exposure during pregnancy",
    "Foetal exposure during pregnancy",
    "Exposure via skin contact",

    # admin / misuse / no-symptom
    "Drug abuse",
    "Drug diversion",
    "Drug dependence",
    "Drug intolerance",
    "Ill-defined disorder",
    "Insurance issue",
    "Drug resistance",
    "Disease recurrence",

    # diagnosis-heavy (noise for a symptom model)
    "Autism spectrum disorder",
    "Breast cancer",
    "Meningioma",
    "Plasma cell myeloma",
    "Systemic lupus erythematosus",
}

# DROP_REGEX is searched (case-insensitively) anywhere inside a term, so it
# also catches compound or differently-cased variants of the phrases above.
DROP_REGEX = re.compile(
    r"\b(?:"
    r"exposure\b|"
    r"drug interaction|"
    r"disease progression|"
    r"neoplasm progression|"
    r"neoplasm malignant|"
    r"surgery\b|"
    # Both spellings: the previous alternation listed "ation" twice and so
    # never matched the "z" spelling.
    r"hospitali[sz]ation|"
    r"death\b|"
    r"condition aggravated|"
    r"toxicity to various agents|"
    r"drug effective for unapproved indication|"
    r"drug abuse|drug diversion|drug dependence|drug intolerance|ill-defined disorder|"
    r"insurance issue|drug resistance|disease recurrence|"
    r"autism spectrum disorder|breast cancer|meningioma|myeloma|plasma cell myeloma|systemic lupus erythematosus"
    r")\b",
    re.I
)

def split_pts(s: str) -> list[str]:
    """Split a ';'-separated string into its terms.

    The argument is passed through ``str()`` first. Each piece is stripped of
    surrounding whitespace and pieces that end up empty are discarded.
    """
    stripped_pieces = (piece.strip() for piece in str(s).split(";"))
    return [piece for piece in stripped_pieces if piece]

def drop_non_symptom(pts: list[str]) -> tuple[list[str], list[str]]:
    """Partition terms into ``(kept, dropped)``, preserving input order.

    A term is dropped when it is a member of ``DROP_EXACT`` or when
    ``DROP_REGEX`` finds a match anywhere inside it; every other term is kept.
    """
    kept: list[str] = []
    dropped: list[str] = []
    for term in pts:
        is_non_symptom = term in DROP_EXACT or DROP_REGEX.search(term) is not None
        target = dropped if is_non_symptom else kept
        target.append(term)
    return kept, dropped

# ---------- 3) CLEAN REAC_pt_symptom (V2) ----------
df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str)

# Split each row's text into terms and partition them once; the kept and
# dropped terms are then re-joined with "; " into two parallel lists.
partitions = [drop_non_symptom(split_pts(text)) for text in df[TEXT_COL].tolist()]
kept_texts = ["; ".join(kept_terms) for kept_terms, _ in partitions]
dropped_texts = ["; ".join(dropped_terms) for _, dropped_terms in partitions]

df["REAC_pt_symptom_v2"] = kept_texts
df["REAC_pt_dropped_v2"] = dropped_texts
# Number of non-blank ';'-separated terms left in the cleaned text (0 when empty).
df["REAC_n_rows_symptom_v2"] = df["REAC_pt_symptom_v2"].apply(
    lambda text: sum(1 for piece in str(text).split(";") if piece.strip())
)

# Rows left with no symptom term at all are not needed, so they are removed.
before = len(df)
has_symptom = df["REAC_n_rows_symptom_v2"] > 0
df = df.loc[has_symptom].copy()
after = len(df)
print(f"[V2] Rows before: {before:,} | after non-empty symptom: {after:,} | dropped empty: {before-after:,}")

# ---------- 4) LABEL PATTERNS (EXTENDED V2) ----------
# Maps each label name to a case-insensitive regex. A row receives the label
# when the regex matches anywhere in its cleaned reaction text.
# Every group must be non-capturing "(?:...)": pandas' str.contains emits a
# "pattern ... has match groups" UserWarning for patterns with capture groups.
LABEL_PATTERNS = {
    "gastrointestinal": re.compile(
        r"\b(?:diarrh\w*|nausea|vomit\w*|abdominal|constipat\w*|dyspeps\w*|reflux|flatulence|colitis|pancreat\w*|"
        r"haematochezia|hem(?:at|a)ochezia|gastrointestinal haemorrhage|rectal haemorrhage|stomatitis|dysphagia|"
        r"crohn|ulcerative colitis|gastrointestinal disorder|intestinal obstruction)\b",
        re.I),

    "cardiovascular": re.compile(
        r"\b(?:"
        # --- original terms ---
        r"hypertension|hypotension|palpitation\w*|tachycard\w*|bradycard\w*|atrial fibrillation|myocardial infarction|"
        r"thrombosis|pulmonary embolism|cardiac failure|arrhythmia|cardiac arrest|cardiac disorder|"
        r"blood pressure increased|blood pressure decreased|heart rate increased|heart rate decreased|"
        # --- extension ---
        r"cardiac fibrillation|"
        r"blood pressure fluctuation|heart rate abnormal|intracardiac pressure increased"
        r")\b",
        re.I),

    "neurological": re.compile(
        r"\b(?:headache|dizz\w*|seizure|tremor|confusional state|memory impairment|paraesthesia|hypoaesthesia|migraine|"
        r"syncope|loss of consciousness|vertigo|balance disorder|amnesia|brain fog|somnolence|sedation|neuropathy|dementia|"
        r"cerebrovascular accident|stroke)\b",
        re.I),

    "dermatologic": re.compile(
        r"\b(?:rash\w*|prurit\w*|erythema|urticaria|eczema|psoriasis|alopecia|blister|acne|dry skin|skin burning sensation|"
        r"skin exfoliation|dermatitis|atopic|hidradenitis)\b",
        re.I),

    "hematologic": re.compile(
        r"\b(?:anaemi\w*|neutropen\w*|leukopen\w*|thrombocytopen\w*|pancytopen\w*|myelosuppression|febrile neutropenia|"
        r"haemorrhag\w*|hemorrhag\w*|white blood cell count decreased|platelet count decreased)\b",
        re.I),

    "hepatic": re.compile(
        r"\b(?:"
        # --- original terms ---
        r"hepatit\w*|jaundice|liver injury|drug-induced liver injury|hepatic function abnormal|liver disorder|"
        r"hepatic enzyme increased|hepatic cytolysis|"
        r"alanine aminotransferase increased|aspartate aminotransferase increased|"
        r"transaminases increased|liver enzymes increased|"
        # --- extension ---
        r"hepatic enzyme abnormal|hepatic infection|"
        r"bilirubin increased|bilirubin conjugated increased"
        r")\b",
        re.I),

    "renal": re.compile(
        r"\b(?:acute kidney injury|renal impairment|renal failure|renal disorder|nephrolithiasis)\b",
        re.I),

    "respiratory": re.compile(
        r"\b(?:"
        # --- original terms ---
        r"dyspnoea|dyspnea|cough\w*|asthma|wheez\w*|bronchit\w*|sinusitis|rhinitis|nasopharyngitis|"
        r"upper respiratory tract infection|lower respiratory tract infection|respiratory tract infection|"
        r"interstitial lung disease|pleural effusion|oxygen saturation decreased|epistaxis|"
        # --- extension ---
        r"oxygen saturation abnormal|lung disorder|increased bronchial secretion"
        r")\b",
        re.I),

    "general_systemic": re.compile(
        r"\b(?:fatigue|malaise|asthenia|pyrexia|fever|chills|feeling abnormal|illness|influenza like illness|"
        r"feeling hot|hot flush|flushing|general physical health deterioration)\b",
        re.I),

    "pain_general": re.compile(
        r"\b(?:\bpain\b|chest pain|chest discomfort|back pain|bone pain|pain in extremity|discomfort)\b",
        re.I),

    "musculoskeletal": re.compile(
        r"\b(?:arthralgia|myalgia|muscle spasms|muscular weakness|arthritis\b|osteoarthritis|joint swelling|"
        r"musculoskeletal stiffness|arthropathy|fibromyalgia|rhabdomyolysis)\b",
        re.I),

    "edema_swelling": re.compile(
        r"\b(?:peripheral swelling|oedema\b|edema\b|swelling\b|swelling face|fluid retention|angioedema)\b",
        re.I),

    "infections": re.compile(
        r"\b(?:pneumonia|covid-19|influenza\b|infection\b|sepsis|cellulitis|viral infection|herpes zoster|septic shock)\b",
        re.I),

    "psychiatric": re.compile(
        r"\b(?:anxiety|insomnia|depression|stress|agitation|hallucination|suicidal ideation|suicide attempt|"
        r"irritability|nervousness|emotional distress|depressed mood)\b",
        re.I),

    "urinary": re.compile(
        r"\b(?:"
        # --- original terms ---
        r"urinary tract infection|dysuria|cystitis|haematuria|hematuria|urinary retention|"
        # --- extension ---
        r"urinary tract disorder|urinary tract pain|lower urinary tract symptoms|pollakiuria|"
        r"urinary frequency|nocturia|micturition urgency|bladder pain|chromaturia|urine abnormal"
        r")\b",
        re.I),

    "injection_site": re.compile(
        r"\b(?:injection site|infusion site|infusion related reaction|infusion-related reaction)\b",
        re.I),

    # NEW
    "ocular_visual": re.compile(
        r"\b(?:"
        # --- original terms ---
        r"eye irritation|conjunctivitis|vision blurred|visual impairment|blindness|photophobia|"
        r"ocular|eye pain|dry eye|cataract|"
        # --- extension ---
        r"eye swelling|eye disorder|eyelid oedema|eyelid edema"
        r")\b",
        re.I),

    "metabolic_endocrine": re.compile(
        r"\b(?:weight increased|weight decreased|blood glucose increased|blood glucose decreased|hyperglyc\w*|hypoglyc\w*|diabetes mellitus|hyponatraemia|hyponatremia)\b",
        re.I),

    "hypersensitivity_allergy": re.compile(
        r"\b(?:hypersensitivity|drug hypersensitivity|anaphylactic reaction|anaphylaxis|allergic reaction|anaphylactic shock|"
        r"cytokine release syndrome|infusion related reaction|"
        r"drug reaction with eosinophilia and systemic symptoms)\b",
        re.I),

    "injury_accident": re.compile(
        r"\b(?:fall\b|fracture|injury|contusion|wound)\b",
        re.I),

    "pregnancy_reproductive": re.compile(
        r"\b(?:abortion spontaneous|miscarriage|heavy menstrual bleeding|intermenstrual bleeding|premature baby|pregnancy)\b",
        re.I),
}

# Label names, in the insertion order of LABEL_PATTERNS.
labels = list(LABEL_PATTERNS)
print("[V2] label count:", len(labels))
print(labels)

# ---------- 5) BUILD y_* ----------
s = df["REAC_pt_symptom_v2"].fillna("").astype(str)

# One 0/1 column per label: 1 when the label's regex matches the cleaned text.
y_cols = []
for lab, pat in LABEL_PATTERNS.items():
    y_col = f"y_{lab}"
    df[y_col] = s.str.contains(pat, na=False).astype(int)
    y_cols.append(y_col)

# Number of labels assigned to each row; rows with none are only reported here.
row_sum = df[y_cols].sum(axis=1)
zero_label_rows = int(row_sum.eq(0).sum())
print(f"[V2] 0-label rows (will be dropped in 03_load_data): {zero_label_rows:,} / {len(df):,} ({zero_label_rows/len(df)*100:.2f}%)")

# y_labels (debug): the names of the labels set on each row, joined with "; ".
lab_names = labels
Y = df[y_cols].to_numpy(dtype=np.int8)

idxs = [np.flatnonzero(row) for row in Y]
df["y_labels"] = ["; ".join(lab_names[i] for i in hit_positions) for hit_positions in idxs]

# ---------- 6) PREVALENCE ----------
# Share of rows (in percent) on which each label is set.
print("\n[V2] Label prevalence (% of rows):")
for lab in labels:
    flags = df[f"y_{lab}"]
    pct = flags.mean() * 100
    print(f" - {lab:24s}: {pct:6.2f}%")

# ---------- 7) SAVE ----------
# Written as UTF-8 with a BOM ("utf-8-sig") and without the index column.
df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print("\n[SAVED]", OUT_CSV)

IN_CSV: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\faers_25Q4_targets_multilabel.csv exists: True
OUT_CSV: c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data\Raw_data\faers_25Q4_targets_multilabel_v2.csv
Loaded: (335097, 67)
[V2] Rows before: 335,097 | after non-empty symptom: 308,959 | dropped empty: 26,138
[V2] label count: 21
['gastrointestinal', 'cardiovascular', 'neurological', 'dermatologic', 'hematologic', 'hepatic', 'renal', 'respiratory', 'general_systemic', 'pain_general', 'musculoskeletal', 'edema_swelling', 'infections', 'psychiatric', 'urinary', 'injection_site', 'ocular_visual', 'metabolic_endocrine', 'hypersensitivity_allergy', 'injury_accident', 'pregnancy_reproductive']


  df[f"y_{lab}"] = s.str.contains(pat, na=False).astype(int)


[V2] 0-label rows (will be dropped in 03_load_data): 59,209 / 308,959 (19.16%)

[V2] Label prevalence (% of rows):
 - gastrointestinal        :  17.40%
 - cardiovascular          :   9.10%
 - neurological            :  14.28%
 - dermatologic            :  15.18%
 - hematologic             :   8.45%
 - hepatic                 :   3.04%
 - renal                   :   2.75%
 - respiratory             :  10.10%
 - general_systemic        :  13.66%
 - pain_general            :  15.66%
 - musculoskeletal         :   7.60%
 - edema_swelling          :   6.12%
 - infections              :  11.97%
 - psychiatric             :   6.35%
 - urinary                 :   2.47%
 - injection_site          :   6.62%
 - ocular_visual           :   4.74%
 - metabolic_endocrine     :   5.30%
 - hypersensitivity_allergy:   3.78%
 - injury_accident         :   6.88%
 - pregnancy_reproductive  :   0.87%

[SAVED] c:\Users\xolmu\OneDrive\Desktop\Modul Program oyi\Modul_Program3\6_project_dori_tasiri_extract\Data