In [None]:
# ------------------------------------------------------------
# TaxoClass-inspired baseline (adapted to your file structure)
# - No labeled data used
# - Uses: class names + taxonomy + class-related keywords + unlabeled corpus
#
# Pipeline:
#   1) Build doc-class similarity (TF-IDF cosine) using class text = name + keywords
#   2) Core class mining:
#        - top-down candidate expansion on taxonomy
#        - confidence = sim(c) - max(sim(parent), max sim(siblings))
#        - keep cores if conf>0 and conf >= median_conf_for_class
#   3) Silver labels: core + ancestors => enforce 2~3 labels
#   4) Train multi-label classifier (One-vs-Rest Logistic Regression)
#   5) Self-training (optional): refine pseudo labels using model probabilities
#   6) Predict test, write submission.csv
# ------------------------------------------------------------

import csv
import os
import random
import re
from collections import deque

import numpy as np
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm

# ------------------------
# Reproducibility
# ------------------------
# One seed drives both the NumPy and the stdlib global generators.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# ------------------------
# Paths (fixed by your file structure)
# ------------------------
BASE_DIR = "Amazon_products"


def _data_path(*parts):
    """Join *parts* onto BASE_DIR."""
    return os.path.join(BASE_DIR, *parts)


# Taxonomy resources live directly under BASE_DIR.
CLASSES_PATH = _data_path("classes.txt")
HIERARCHY_PATH = _data_path("class_hierarchy.txt")
KEYWORDS_PATH = _data_path("class_related_keywords.txt")

# Corpora are split into train/ and test/ sub-folders; each line is: pid \t text
TRAIN_CORPUS_PATH = _data_path("train", "train_corpus.txt")
TEST_CORPUS_PATH = _data_path("test", "test_corpus.txt")

# Output file, written relative to the working directory.
SUBMISSION_PATH = "submission.csv"

# ------------------------
# Constants
# ------------------------
NUM_CLASSES = 531  # class ids run 0..530
MIN_LABELS, MAX_LABELS = 2, 3  # every document gets between 2 and 3 labels

# ------------------------
# I/O helpers
# ------------------------
def load_corpus(path):
    """Read a `pid\\ttext` corpus file and return (pid_list, text_list).

    Each line is split on its first tab only, so the text part may itself
    contain tabs. Lines without any tab are skipped.
    """
    records = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            pid, tab, text = raw.rstrip("\n").partition("\t")
            if tab:  # empty separator means the line had no tab at all
                records.append((pid, text))
    pids = [pid for pid, _ in records]
    texts = [text for _, text in records]
    return pids, texts

def load_classes(path, expected_num=NUM_CLASSES):
    """
    Read classes.txt into an id -> name dict.

    A line of the form "<digits><tab or comma><name>" is stored under that
    numeric id. Any other non-blank line is stored whole, keyed by its
    0-based position in the file (blank lines still advance the position).
    Ids in range(expected_num) that the file did not provide are filled
    with the placeholder "class_<id>".
    """
    id2name = {}
    with open(path, "r", encoding="utf-8") as handle:
        for line_no, raw in enumerate(handle):
            entry = raw.strip()
            if not entry:
                continue
            fields = re.split(r"[\t,]", entry, maxsplit=1)
            head = fields[0].strip()
            if len(fields) == 2 and head.isdigit():
                id2name[int(head)] = fields[1].strip()
            else:
                id2name[line_no] = entry

    # Defensive padding so every expected id has some name.
    for cid in range(expected_num):
        id2name.setdefault(cid, f"class_{cid}")
    return id2name

def load_keywords(path, id2name):
    """
    Robust loader for class_related_keywords.txt.

    Each non-blank line is "<key><sep><keywords>", where <key> is either a
    numeric class id or a class name found in id2name. Formats handled:
      - "cid\\tkw1,kw2,kw3"
      - "cid\\tkw1\\tkw2\\tkw3"
      - "cid,kw1,kw2,..."
      - "name\\tkw1,kw2" / "name:kw1,kw2" / "cid:kw1,kw2"

    Keyword fields are further split on , ; / and |, stripped, and
    de-duplicated with first-seen order preserved. Lines whose key cannot be
    resolved to an id present in id2name are skipped. If a class appears on
    several lines, the last line wins.

    Returns: kw[cid] = [kw...] with an (possibly empty) entry for every id
    in id2name.
    """
    kw = {cid: [] for cid in id2name}
    # Reverse lookup so name-keyed lines resolve the same way load_hierarchy
    # resolves name-based edges.
    name2id = {name: cid for cid, name in id2name.items()}
    keyword_sep = re.compile(r"[,;/|]")

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            s = line.strip()
            if not s:
                continue

            # Tabs take priority as the field separator; fall back to commas.
            parts = s.split("\t")
            if len(parts) == 1:
                parts = s.split(",")
                # "key:kw1,kw2" - peel the key off the first comma field.
                head, colon, tail = parts[0].partition(":")
                if colon:
                    parts = [head, tail, *parts[1:]]

            if len(parts) < 2:
                continue

            key = parts[0].strip()
            cid = int(key) if key.isdigit() else name2id.get(key)
            if cid is None or cid not in kw:
                continue

            expanded = []
            for field in parts[1:]:
                expanded.extend(
                    token.strip()
                    for token in keyword_sep.split(field)
                    if token.strip()
                )

            # dict.fromkeys de-duplicates while keeping first-seen order.
            kw[cid] = list(dict.fromkeys(expanded))

    return kw

def load_hierarchy(path, id2name):
    """
    Load taxonomy edges from class_hierarchy.txt.

    Expected per line: parent<sep>child  (sep = tab or comma). Each endpoint
    is either a numeric class id or a class name that appears in id2name.
    Only the first two fields of a line are used.

    Skipped silently: blank lines, lines with fewer than two fields,
    self-loops, and edges where either endpoint does not resolve to an id
    present in id2name (previously a numeric id outside id2name raised
    KeyError).

    Returns:
      roots: [cid...] classes with no parent, in id2name iteration order
      parents_of: dict child -> sorted list of parents (can be multiple)
      children_of: dict parent -> sorted list of children
    """
    name2id = {name: cid for cid, name in id2name.items()}
    parents_of = {cid: set() for cid in id2name}
    children_of = {cid: set() for cid in id2name}

    def to_id(token):
        """Resolve a raw field to a known class id, or None."""
        token = token.strip()
        cid = int(token) if token.isdigit() else name2id.get(token)
        return cid if cid in id2name else None

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            s = line.strip()
            if not s:
                continue
            parts = re.split(r"[\t,]", s)
            if len(parts) < 2:
                continue
            p = to_id(parts[0])
            c = to_id(parts[1])
            if p is None or c is None or p == c:
                continue
            parents_of[c].add(p)
            children_of[p].add(c)

    roots = [cid for cid in id2name if not parents_of[cid]]
    parents_of = {cid: sorted(ps) for cid, ps in parents_of.items()}
    children_of = {cid: sorted(cs) for cid, cs in children_of.items()}
    return roots, parents_of, children_of

# ------------------------
# Taxonomy utilities
# ------------------------
def get_ancestors(cid, parents_of, max_steps=10):
    """Walk upward from cid and return the ancestors met, nearest first.

    At every step the first entry of parents_of[node] is followed, which is
    the smallest parent id when the lists are sorted. The walk ends at a
    node with no recorded parents or after max_steps hops, whichever comes
    first.
    """
    chain = []
    node = cid
    while len(chain) < max_steps:
        options = parents_of.get(node, [])
        if not options:
            break
        node = options[0]
        chain.append(node)
    return chain

# ------------------------
# Build class texts (name + keywords)
# ------------------------
def build_class_texts(id2name, kw):
    """Return one pseudo-document per class id in range(NUM_CLASSES).

    Each text is the class name, a single space, then up to the first 30
    keywords joined by spaces (the cap limits keyword noise). Ids missing
    from id2name fall back to "class_<id>"; ids missing from kw contribute
    no keywords.
    """
    return [
        " ".join([
            id2name.get(cid, f"class_{cid}"),
            " ".join(kw.get(cid, [])[:30]),
        ])
        for cid in range(NUM_CLASSES)
    ]

# ------------------------
# Core mining (TaxoClass-inspired)
# ------------------------
def _expand_candidates(sim, roots, children_of, top_k, max_nodes):
    """Breadth-first, top-down candidate search for one document.

    Starts from every root; from each visited node only its top_k most
    similar children are queued. Returns the visited nodes in visit order,
    at most max_nodes of them.
    """
    visited = set()
    ordered = []
    frontier = deque(roots)
    while frontier and len(ordered) < max_nodes:
        node = frontier.popleft()
        if node in visited:
            continue
        visited.add(node)
        ordered.append(node)

        children = children_of.get(node, [])
        if children:
            best = sorted(children, key=lambda c: sim[c], reverse=True)[:top_k]
            frontier.extend(c for c in best if c not in visited)
    return ordered


def _root_rivals(sim, root_ids):
    """Return (best_root, best_sim, runner_up_sim) over the top-level classes.

    Entries are None when there is no such root (no roots at all, or only
    one root so there is no runner-up).
    """
    if root_ids.size == 0:
        return None, None, None
    root_sims = sim[root_ids]
    k = int(np.argmax(root_sims))
    runner_up = float(np.delete(root_sims, k).max()) if root_ids.size > 1 else None
    return int(root_ids[k]), float(root_sims[k]), runner_up


def _confidence(sim, c, parents_of, children_of, root_rivals):
    """Confidence of class c for one document: sim(c) minus its strongest rival.

    With a parent, rivals are the (first) parent and the other children of
    that parent. Without a parent, rivals are the other top-level classes;
    if there are none the baseline is 0 so confidence equals sim(c).
    """
    plist = parents_of.get(c, [])
    if plist:
        p = plist[0]
        baseline = float(sim[p])
        for s in children_of.get(p, []):
            if s != c and sim[s] > baseline:
                baseline = float(sim[s])
    else:
        best_root, best_sim, runner_up = root_rivals
        # The best root competes against the runner-up, everyone else
        # against the best root.
        baseline = runner_up if c == best_root else best_sim
        if baseline is None:
            baseline = 0.0
    return float(sim[c]) - baseline


def mine_core_classes(
    X_docs, X_class, roots, parents_of, children_of,
    top_k=3, max_nodes_per_doc=60, batch_size=512, max_cores=2
):
    """
    Mine up to max_cores "core" classes per document (TaxoClass-inspired).

    X_docs: (N,V) sparse document matrix
    X_class: (C,V) sparse class matrix in the same feature space
    roots: ids of classes without a parent
    parents_of / children_of: taxonomy adjacency (id -> list of ids)
    top_k: children kept per visited node during candidate expansion
    max_nodes_per_doc: cap on candidates visited per document
    batch_size: documents per similarity matrix product
    max_cores: maximum number of core classes returned per document

    Uses:
      sim(d,c) = dot(X_docs[d], X_class[c])
      candidates: top-down expansion from roots
      confidence(d,c) = sim(d,c) - max(sim(parent), max sim(siblings))
        Top-level classes have no parent, so they are scored against the
        other top-level classes instead. (Using a huge negative sentinel as
        their baseline gave every root a confidence near +1e9, which made
        roots win every core slot.)
      keep if conf>0 and conf >= median of the positive confidences
        collected for that class over all documents

    Returns:
      doc_cores: list[list[cid]], highest confidence first
    """
    n_docs = X_docs.shape[0]
    n_classes = X_class.shape[0]
    root_list = list(roots)
    root_ids = np.asarray(root_list, dtype=np.intp)

    # Positive confidences per class, used for the per-class median.
    conf_lists = [[] for _ in range(n_classes)]
    doc_candidate_confs = [None] * n_docs

    X_class_T = X_class.T  # (V,C) for fast dot: X_docs @ X_class_T

    # pass 1: collect confidences
    for start in tqdm(range(0, n_docs, batch_size), desc="Core mining pass1 (collect conf)"):
        end = min(start + batch_size, n_docs)
        sims = (X_docs[start:end] @ X_class_T).toarray().astype(np.float32)  # (b,C)

        for i in range(end - start):
            sim = sims[i]
            candidates = _expand_candidates(
                sim, root_list, children_of, top_k, max_nodes_per_doc
            )
            rivals = _root_rivals(sim, root_ids)

            cand_confs = []
            for c in candidates:
                conf = _confidence(sim, c, parents_of, children_of, rivals)
                cand_confs.append((c, conf))
                if conf > 0:
                    conf_lists[c].append(conf)

            doc_candidate_confs[start + i] = cand_confs

    # per-class median confidence (-inf where a class never scored > 0)
    med = np.full(n_classes, -np.inf, dtype=np.float64)
    for c, confs in enumerate(conf_lists):
        if confs:
            med[c] = float(np.median(confs))

    # pass 2: select top cores per doc
    doc_cores = []
    for cand_confs in tqdm(doc_candidate_confs, desc="Core mining pass2 (select cores)"):
        if not cand_confs:
            doc_cores.append([])
            continue
        filtered = [(c, conf) for (c, conf) in cand_confs if conf > 0 and conf >= med[c]]
        filtered.sort(key=lambda x: x[1], reverse=True)
        doc_cores.append([c for (c, _) in filtered[:max_cores]])

    return doc_cores

# ------------------------
# Convert cores -> final 2~3 labels (silver / prediction formatting)
# ------------------------
def cores_to_labels(cores, parents_of):
    """Expand core classes into a sorted label set of MIN_LABELS..MAX_LABELS ids.

    Steps, in order:
      1. take the cores themselves (duplicates dropped, order kept);
      2. add ancestors of each core (up to 5 hops) until MAX_LABELS unique
         labels are reached;
      3. if still below MIN_LABELS, add further ancestors of the first core
         (up to 10 hops);
      4. if still below MIN_LABELS, pad with the lowest unused class ids
         (arbitrary but deterministic filler);
      5. clip to MAX_LABELS and sort ascending.

    Duplicates are filtered as labels are added, so a repeated ancestor no
    longer uses up one of the MAX_LABELS slots.
    """
    labels = []
    seen = set()

    def add(cid):
        if cid not in seen:
            seen.add(cid)
            labels.append(cid)

    for c in cores:
        add(c)

    # add ancestors until reaching MAX_LABELS unique labels
    for c in cores:
        if len(labels) >= MAX_LABELS:
            break
        for a in get_ancestors(c, parents_of, max_steps=5):
            if len(labels) >= MAX_LABELS:
                break
            add(a)

    # ensure at least MIN_LABELS using a longer walk from the first core
    if len(labels) < MIN_LABELS and cores:
        for a in get_ancestors(cores[0], parents_of, max_steps=10):
            add(a)
            if len(labels) >= MIN_LABELS:
                break

    # still short: deterministic pad with the lowest unused ids (rare)
    if len(labels) < MIN_LABELS:
        for k in range(NUM_CLASSES):
            add(k)
            if len(labels) >= MIN_LABELS:
                break

    # clip and sort for submission
    return sorted(labels[:MAX_LABELS])

# ------------------------
# Self-training (simple, stable)
# ------------------------
def self_training(model, X_all_docs, parents_of, rounds=2, min_prob=0.20):
    """
    Refine the model on its own predictions for a fixed number of rounds.

    Every round:
      - score all documents with model.predict_proba
      - the arg-max class becomes the core; the first of the next four
        ranked classes whose probability is at least 75% of the top one is
        added as a second core, but only when the top probability itself
        reaches min_prob
      - cores are expanded to a label set by cores_to_labels
      - the model is re-fitted on X_all_docs with those pseudo labels

    Returns the same model object after the last re-fit.
    """
    binarizer = MultiLabelBinarizer(classes=list(range(NUM_CLASSES)))

    for _ in range(rounds):
        probs = model.predict_proba(X_all_docs)  # (N,C)

        pseudo = []
        for i in range(probs.shape[0]):
            row = probs[i]
            best = int(np.argmax(row))
            ranked = np.argsort(-row)[:5].tolist()

            # First close-enough runner-up, or None when there is none.
            runner_up = next(
                (
                    c for c in ranked[1:]
                    if row[best] >= min_prob and row[c] >= (row[best] * 0.75)
                ),
                None,
            )
            cores = [best] if runner_up is None else [best, runner_up]
            pseudo.append(cores_to_labels(cores, parents_of))

        model.fit(X_all_docs, binarizer.fit_transform(pseudo))

    return model

# ------------------------
# Main
# ------------------------
def _select_test_labels(probs_test, parents_of):
    """Turn per-class probabilities into one sorted 2~3 label list per document."""
    all_labels = []
    for i in range(probs_test.shape[0]):
        p = probs_test[i]
        topk = np.argsort(-p)[:5].tolist()
        cores = [topk[0]]
        # optional 2nd core: first runner-up within 75% of the top score
        for c in topk[1:]:
            if p[c] >= (p[topk[0]] * 0.75):
                cores.append(c)
                break

        labels = cores_to_labels(cores, parents_of)

        # final enforcement (should already satisfy)
        if len(labels) < MIN_LABELS:
            for c in topk:
                if c not in labels:
                    labels.append(c)
                if len(labels) >= MIN_LABELS:
                    break
            labels = sorted(labels)
        all_labels.append(labels[:MAX_LABELS])
    return all_labels


def _write_submission(path, pids, all_labels):
    """Write the `pid,labels` CSV; labels are comma-joined inside one field."""
    with open(path, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["pid", "labels"])
        for pid, labels in zip(pids, all_labels):
            w.writerow([pid, ",".join(map(str, labels))])


def main():
    """Run the full pipeline: load data, mine silver labels, train, self-train,
    predict the test corpus and write SUBMISSION_PATH.

    Raises:
      FileNotFoundError: a required input file is missing.
      RuntimeError: core mining produced no silver-labeled document.
    """
    # sanity checks
    for p in [CLASSES_PATH, HIERARCHY_PATH, KEYWORDS_PATH, TRAIN_CORPUS_PATH, TEST_CORPUS_PATH]:
        if not os.path.exists(p):
            raise FileNotFoundError(f"Missing required file: {p}")

    # load resources
    id2name = load_classes(CLASSES_PATH, expected_num=NUM_CLASSES)
    kw = load_keywords(KEYWORDS_PATH, id2name)
    roots, parents_of, children_of = load_hierarchy(HIERARCHY_PATH, id2name)

    # load corpora
    train_pids, train_texts = load_corpus(TRAIN_CORPUS_PATH)
    test_pids, test_texts = load_corpus(TEST_CORPUS_PATH)

    # build class texts
    class_texts = build_class_texts(id2name, kw)

    # vectorize in a shared TF-IDF space (docs + class texts)
    # NOTE: we include test texts to align with the project setting (test corpus allowed for training).
    all_texts_for_vocab = train_texts + test_texts + class_texts

    vectorizer = TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 2),
        max_features=200000,
        min_df=2,
        stop_words="english",
    )
    X_all = vectorizer.fit_transform(all_texts_for_vocab)

    n_train = len(train_texts)
    n_test = len(test_texts)
    n_docs = n_train + n_test

    # Row layout of X_all: [train docs | test docs | class texts]
    X_train = X_all[:n_train]
    X_test = X_all[n_train:n_docs]
    X_class = X_all[n_docs:]  # (C,V) in same TF-IDF space

    # core mining on train corpus to create silver labels
    doc_cores = mine_core_classes(
        X_docs=X_train,
        X_class=X_class,
        roots=roots,
        parents_of=parents_of,
        children_of=children_of,
        top_k=3,
        max_nodes_per_doc=60,
        batch_size=512
    )

    # Documents without any core are left out of supervised training.
    silver_labels = []
    used_idx = []
    for i, cores in enumerate(doc_cores):
        if not cores:
            continue
        silver_labels.append(cores_to_labels(cores, parents_of))
        used_idx.append(i)

    if not used_idx:
        raise RuntimeError(
            "No silver-labeled documents found. "
            "Check that class_hierarchy / keywords parsing is correct."
        )

    X_silver = X_train[used_idx]
    mlb = MultiLabelBinarizer(classes=list(range(NUM_CLASSES)))
    Y_silver = mlb.fit_transform(silver_labels)

    # train multi-label classifier
    # Parallelism is handled by OneVsRestClassifier (one binary problem per
    # class), so the inner estimator does not request its own workers.
    base = LogisticRegression(
        solver="saga",
        max_iter=250,
        random_state=SEED
    )
    model = OneVsRestClassifier(base, n_jobs=-1)
    model.fit(X_silver, Y_silver)

    # self-training on train+test texts (allowed per project statement)
    # The train+test rows are already vectorized in X_all; slice them instead
    # of running the vectorizer over the whole corpus a second time.
    X_all_docs = X_all[:n_docs]
    model = self_training(model, X_all_docs, parents_of, rounds=2, min_prob=0.20)

    # inference on test
    probs_test = model.predict_proba(X_test)  # (Ntest,C)

    # format predictions: enforce 2~3 labels
    all_labels = _select_test_labels(probs_test, parents_of)

    # write submission
    _write_submission(SUBMISSION_PATH, test_pids, all_labels)

    print(f"Saved: {SUBMISSION_PATH}")
    print(f"Test samples: {n_test} | labels per sample: {MIN_LABELS}-{MAX_LABELS}")
    print(f"Silver-labeled train docs used: {len(used_idx)} / {n_train}")

if __name__ == "__main__":
    main()


Generating dummy predictions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19658/19658 [00:00<00:00, 190266.11it/s]

Dummy submission file saved to: submission.csv
Total samples: 19658, Classes per sample: 1-3



