In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("fill-mask", model="nazyrova/clinicalBERT")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("nazyrova/clinicalBERT")
model = AutoModelForMaskedLM.from_pretrained("nazyrova/clinicalBERT")

In [None]:
# --- Hugging Face Transformers ---
from transformers import pipeline

# ChemBERTa foi treinado para compreensão de SMILES (representação química)
# Modelos recomendados: "DeepChem/ChemBERTa-77M-MLM" ou "seyonec/ChemBERTa-zinc-base-v1"
pipe = pipeline("fill-mask", model="DeepChem/ChemBERTa-77M-MLM")

# Exemplo de uso com SMILES (molécula de cafeína)
output = pipe("CN1C=NC2=C1C(=O)N(C(=O)N2C)[MASK]")
print(output)


config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/13.7M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/13.7M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/420 [00:00<?, ?B/s]

Device set to use cpu


[{'score': 0.6545522212982178, 'token': 16, 'token_str': 'C', 'sequence': 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C'}, {'score': 0.14195111393928528, 'token': 23, 'token_str': 'N', 'sequence': 'CN1C=NC2=C1C(=O)N(C(=O)N2C)N'}, {'score': 0.0727911964058876, 'token': 54, 'token_str': 'B', 'sequence': 'CN1C=NC2=C1C(=O)N(C(=O)N2C)B'}, {'score': 0.04137810692191124, 'token': 19, 'token_str': 'O', 'sequence': 'CN1C=NC2=C1C(=O)N(C(=O)N2C)O'}, {'score': 0.03221575915813446, 'token': 48, 'token_str': 'I', 'sequence': 'CN1C=NC2=C1C(=O)N(C(=O)N2C)I'}]


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# ==========================================
# Shared feature space (HINT + CTOD joined)
# Leakage-safe; global block-wise SVD (effective cap)
# SEM (ClinicalBERT/ChemBERTa) + POS (TF-IDF/MultiHot/OneHot)
# Human-readable reduced names (top original feature per component)
# Tightened raw dimensionality (≈ ≤20–30k on 5k rows)
# ==========================================

import os, re, ast, json, warnings, time
import numpy as np
import pandas as pd
from pathlib import Path
from scipy import sparse as sp
from packaging import version

warnings.filterwarnings("ignore")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "true")
os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")

# ------------- CONFIG -------------
PHASE_TAGS = ["phase_I", "phase_II", "phase_III"]

# Paths
BASE_HINT_DIR = "/content/drive/MyDrive/MyModel(hybrid)/Data/HINT"
BASE_CTOD_DIR = "/content/drive/MyDrive/MyModel(hybrid)/Data/CTOD"
HINT_OUT_ROOT = Path("/content/drive/MyDrive/MyModel(hybrid)/hint_xgb_artifacts")
CTOD_JOIN_OUT_ROOT = Path("/content/drive/MyDrive/MyModel(hybrid)/ctod_xgb_artifacts")

# Global SVD
APPLY_SVD = True
GLOBAL_MAX_COMPONENTS = 1000
TARGET_EXPLAINED_VAR = 0.90
RANDOM_STATE = 42
MAX_K_PER_BLOCK = 400
MIN_PER_BLOCK = 1

# Tight vocabularies (keeps raw dims under control)
TFIDF_TEXT_MAX = 2000      # criteria/description each
TFIDF_TEXT_MIN_DF = 10
TFIDF_TEXT_NGRAM = (4,5)   # char_wb

TFIDF_SMILES_MAX = 2000    # inside drugs_fusion
TFIDF_SMILES_MIN_DF = 2

ICD_MIN_COUNT = 3          # prune very-rare ICDs

# Light compression ONLY for LLM blocks (keeps interpretability of TF-IDF/labels)
USE_LLM_PROJECTOR = True
LLM_PROJECTED_DIM = 256    # 768 -> 256

DEBUG = True
def _dbg(*a):
    if DEBUG: print(*a)

# ------------- Imports -------------
from sklearn import __version__ as skl_version
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import joblib

# ---- CTOD schema & mapping ----
CTOD_SCHEMA = [
    "nct_id","brief_title","overall_status","start_date","completion_date","phase",
    "enrollment","diseases","drugs","drug_type","description","smiles","criteria","label","icdcodes",
]
HINT_TO_CTOD = {
    "NCT Number":"nct_id","Brief Title":"brief_title","Study Status":"overall_status",
    "Start Date":"start_date","Completion Date":"completion_date","phase":"phase",
    "Enrollment":"enrollment","diseases":"diseases","drugs":"drugs","criteria":"criteria",
    "label":"label","smiles":"smiles","icdcodes":"icdcodes",
}
LEAK_DROP = {"overall_status","start_date","completion_date","phase"}

def to_ctod_schema_from_hint(df: pd.DataFrame) -> pd.DataFrame:
    d = df.rename(columns={c:HINT_TO_CTOD[c] for c in df.columns if c in HINT_TO_CTOD}).copy()
    for col in CTOD_SCHEMA:
        if col not in d.columns: d[col] = np.nan
    d = d[CTOD_SCHEMA].copy()
    d["enrollment"] = pd.to_numeric(d["enrollment"], errors="coerce")
    d["label"] = pd.to_numeric(d["label"], errors="coerce").astype("Int64").fillna(0).astype(int)
    for c in ["nct_id","brief_title","overall_status","start_date","completion_date","phase",
              "diseases","drugs","drug_type","description","smiles","criteria","icdcodes"]:
        d[c] = d[c].astype(object).where(pd.notna(d[c]), np.nan)
    return d

def ensure_ctod_schema(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    for col in CTOD_SCHEMA:
        if col not in d.columns: d[col] = np.nan
    d = d[CTOD_SCHEMA].copy()
    d["enrollment"] = pd.to_numeric(d["enrollment"], errors="coerce")
    d["label"] = pd.to_numeric(d["label"], errors="coerce").astype("Int64").fillna(0).astype(int)
    for c in ["nct_id","brief_title","overall_status","start_date","completion_date","phase",
              "diseases","drugs","drug_type","description","smiles","criteria","icdcodes"]:
        d[c] = d[c].astype(object).where(pd.notna(d[c]), np.nan)
    return d

def drop_leak_columns(df: pd.DataFrame) -> pd.DataFrame:
    return df[[c for c in df.columns if c not in LEAK_DROP]].copy()

def to_csr(M):
    return M if sp.issparse(M) else sp.csr_matrix(M)

def _has_nonempty(df, col, min_non_na=5):
    return (col in df.columns) and (df[col].notna().sum() >= min_non_na)

SEP = re.compile(r"[;,|/]+")

def split_delimited(x):
    if pd.isna(x): return []
    toks = [t.strip() for t in SEP.split(str(x)) if t.strip()]
    return list(dict.fromkeys([t.lower() for t in toks]))

def parse_icd_list(x):
    if pd.isna(x) or x == "" or x == "[]": return []
    try:
        val = ast.literal_eval(str(x))
        return [str(v) for v in val] if isinstance(val, list) else []
    except Exception:
        return re.findall(r"[A-Z]\d[\w\.\-]*", str(x))

# --------- selectors ---------
class SeriesAsArray(BaseEstimator, TransformerMixin):
    def __init__(self, col): self.col = col
    def fit(self, X, y=None): return self
    def transform(self, X): return X[self.col].fillna("").astype(str).values

# --------- Multi-hot (with ICD pruning) ---------
class ICDMultiHotPruned(BaseEstimator, TransformerMixin):
    def __init__(self, col="icdcodes", min_count=ICD_MIN_COUNT):
        self.col = col
        self.min_count = int(min_count)
        try: self.mlb = MultiLabelBinarizer(sparse_output=True)
        except TypeError: self.mlb = MultiLabelBinarizer()
        self.keep_idx_ = None
        self.keep_labels_ = None

    def fit(self, X, y=None):
        lists = [parse_icd_list(v) for v in X[self.col].tolist()]
        # First fit to get all classes
        self.mlb.fit(lists)
        classes = list(self.mlb.classes_)

        # Count per class
        from collections import Counter
        cnt = Counter()
        for row in lists:
            cnt.update(set(row))  # count presence per row

        self.keep_labels_ = [c for c in classes if cnt.get(c,0) >= self.min_count]
        # keep indices inside the original order
        idx_map = {c:i for i,c in enumerate(classes)}
        self.keep_idx_ = np.array([idx_map[c] for c in self.keep_labels_], dtype=int)
        return self

    def transform(self, X):
        lists = [parse_icd_list(v) for v in X[self.col].tolist()]
        M_full = self.mlb.transform(lists)
        if self.keep_idx_ is None or len(self.keep_idx_) == 0:
            return sp.csr_matrix((M_full.shape[0], 0))
        # slice columns to kept ICDs
        M = M_full[:, self.keep_idx_]
        return M if sp.issparse(M) else sp.csr_matrix(M)

# Plain multi-hot
class DelimitedMultiHot(BaseEstimator, TransformerMixin):
    def __init__(self, col):
        self.col = col
        try: self.mlb = MultiLabelBinarizer(sparse_output=True)
        except TypeError: self.mlb = MultiLabelBinarizer()
    def fit(self, X, y=None):
        self.mlb.fit([split_delimited(v) for v in X[self.col].tolist()])
        return self
    def transform(self, X):
        M = self.mlb.transform([split_delimited(v) for v in X[self.col].tolist()])
        return M if sp.issparse(M) else sp.csr_matrix(M)

# --------- TF-IDF helpers (tight) ---------
def _text_pipe_for(col, analyzer="char_wb", ngram=TFIDF_TEXT_NGRAM, min_df=TFIDF_TEXT_MIN_DF,
                   max_df=0.9, max_features=TFIDF_TEXT_MAX):
    return Pipeline([
        ("series", SeriesAsArray(col)),
        ("tfidf", TfidfVectorizer(
            lowercase=True,
            analyzer=analyzer,
            ngram_range=ngram,
            min_df=min_df, max_df=max_df,
            max_features=max_features,
            sublinear_tf=True,
            strip_accents="unicode"
        )),
    ])

# OneHot version-safe
from sklearn import __version__ as _skv
if version.parse(_skv) >= version.parse("1.4"):
    OHE = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
else:
    OHE = OneHotEncoder(handle_unknown="ignore", sparse=True)

# --------- LLM blocks ---------
import torch
from torch.utils.data import DataLoader

class ClinicalBERTTextEmbedder(BaseEstimator, TransformerMixin):
    def __init__(self, col, model_name="nazyrova/clinicalBERT",
                 max_length=384, batch_size=32, pooling="cls", fp16=True):
        self.col = col; self.model_name = model_name
        self.max_length = int(max_length); self.batch_size = int(batch_size)
        self.pooling = pooling; self.fp16 = bool(fp16)
        self.device = None; self.tokenizer = None; self.model = None; self.output_dim = None
    def _setup(self):
        if self.device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.tokenizer is None or self.model is None:
            from transformers import AutoTokenizer, AutoModel
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModel.from_pretrained(self.model_name)
            self.model.to(self.device); self.model.eval()
            self.output_dim = int(getattr(self.model.config, "hidden_size", 768))
    def fit(self, X, y=None): self._setup(); return self
    @torch.inference_mode()
    def transform(self, X):
        self._setup()
        texts = X[self.col].fillna("").astype(str).tolist()
        idxs = list(range(len(texts)))
        loader = DataLoader(idxs, batch_size=self.batch_size, shuffle=False)
        outs = []; use_autocast = self.fp16 and (self.device.type == "cuda")
        def _tok(batch_texts):
            return self.tokenizer(batch_texts, padding=True, truncation=True,
                                  max_length=self.max_length, return_tensors="pt")
        for batch_idx in loader:
            batch_texts = [texts[i] for i in batch_idx]
            enc = _tok(batch_texts); enc = {k:v.to(self.device) for k,v in enc.items()}
            with torch.cuda.amp.autocast(enabled=use_autocast):
                hidden = self.model(**enc).last_hidden_state
                if self.pooling == "mean":
                    mask = enc["attention_mask"].unsqueeze(-1)
                    pooled = (hidden*mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
                else:
                    pooled = hidden[:,0,:]
            outs.append(pooled.detach().cpu())
        Z = torch.cat(outs, dim=0).numpy()
        return to_csr(Z)

class ChemBERTaEmbedder(BaseEstimator, TransformerMixin):
    def __init__(self, col="smiles", model_name="DeepChem/ChemBERTa-77M-MLM",
                 max_length=96, batch_size=64, fp16=True):
        self.col = col; self.model_name = model_name
        self.max_length = int(max_length); self.batch_size = int(batch_size)
        self.fp16 = bool(fp16)
        self.device=None; self.tokenizer=None; self.model=None; self.output_dim=None
    def _setup(self):
        if self.device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.tokenizer is None or self.model is None:
            from transformers import AutoTokenizer, AutoModel
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModel.from_pretrained(self.model_name)
            self.model.to(self.device); self.model.eval()
            self.output_dim = int(getattr(self.model.config, "hidden_size", 768))
    def fit(self, X, y=None): self._setup(); return self
    @torch.inference_mode()
    def transform(self, X):
        self._setup()
        smiles = X[self.col].fillna("").astype(str).tolist()
        idxs = list(range(len(smiles)))
        loader = DataLoader(idxs, batch_size=self.batch_size, shuffle=False)
        outs = []; use_autocast = self.fp16 and (self.device.type == "cuda")
        for batch_idx in loader:
            batch_txt = [smiles[i] if smiles[i] != "" else " " for i in batch_idx]
            enc = self.tokenizer(batch_txt, padding=True, truncation=True,
                                 max_length=self.max_length, return_tensors="pt").to(self.device)
            with torch.cuda.amp.autocast(enabled=use_autocast):
                hidden = self.model(**enc).last_hidden_state
                mask = enc["attention_mask"].unsqueeze(-1)
                pooled = (hidden*mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            outs.append(pooled.detach().cpu())
        Z = torch.cat(outs, dim=0).numpy()
        return to_csr(Z)

# Optional: compress LLM embeddings only (keeps TF-IDF/labels intact for naming)
class LLMProjector(BaseEstimator, TransformerMixin):
    def __init__(self, inner, n_components=LLM_PROJECTED_DIM, random_state=RANDOM_STATE):
        self.inner = inner; self.n_components=n_components; self.random_state=random_state
        self.svd=None; self.output_dim=None
    def fit(self, X, y=None):
        Z = to_csr(self.inner.fit_transform(X))
        k = min(self.n_components, max(2, min(Z.shape)-1))
        self.svd = TruncatedSVD(n_components=k, random_state=self.random_state).fit(Z)
        self.output_dim = k
        return self
    def transform(self, X):
        Z = to_csr(self.inner.transform(X))
        Zr = self.svd.transform(Z)
        return to_csr(Zr)

# --------- Drugs fusion (raw; two namespaces) ---------
from sklearn.feature_extraction.text import TfidfVectorizer
class DrugsFusionRaw(BaseEstimator, TransformerMixin):
    def __init__(self, smiles_col="smiles", drugs_col="drugs",
                 max_smiles_features=TFIDF_SMILES_MAX, min_df=TFIDF_SMILES_MIN_DF):
        self.smiles_col = smiles_col; self.drugs_col = drugs_col
        self.max_smiles_features = int(max_smiles_features); self.min_df = int(min_df)
        self.mh_drugs = DelimitedMultiHot(self.drugs_col)
        self.vect_smiles = TfidfVectorizer(
            analyzer="char", ngram_range=(2,5), min_df=self.min_df, max_features=self.max_smiles_features
        )
        self.drug_classes_ = None; self.smiles_vocab_ = None
    def fit(self, X, y=None):
        _ = self.mh_drugs.fit_transform(X)
        _ = self.vect_smiles.fit_transform(X[self.smiles_col].fillna("").astype(str))
        self.drug_classes_ = list(getattr(self.mh_drugs.mlb, "classes_", []))
        self.smiles_vocab_  = list(self.vect_smiles.get_feature_names_out())
        return self
    def transform(self, X):
        Xd = self.mh_drugs.transform(X)
        Xs = self.vect_smiles.transform(X[self.smiles_col].fillna("").astype(str))
        return sp.hstack([Xd, Xs], format="csr")

# --------- Numeric / Categorical (no per-block SVD) ---------
def numeric_block():
    from sklearn.compose import ColumnTransformer
    return Pipeline([
        ("select", ColumnTransformer([("num", "passthrough", ["enrollment"])], remainder="drop")),
        ("impute", SimpleImputer(strategy="median")),
        ("log1p", FunctionTransformer(np.log1p, feature_names_out="one-to-one")),
        ("scale", StandardScaler(with_mean=False)),
    ])

def categorical_block():
    from sklearn.compose import ColumnTransformer
    return Pipeline([
        ("select", ColumnTransformer([("cat", "passthrough", ["drug_type"])], remainder="drop")),
        ("impute", SimpleImputer(strategy="constant", fill_value="__NA__")),
        ("onehot", OHE),
    ])

# --------- Build blocks (no per-block SVD) ---------
def build_and_fit_preprocessor(df_train: pd.DataFrame):
    blocks = []
    if "enrollment" in df_train.columns:
        blocks.append(("num", numeric_block()))
    if _has_nonempty(df_train, "drug_type"):
        blocks.append(("cat", categorical_block()))
    # POS text
    if _has_nonempty(df_train, "criteria"):
        blocks.append(("pos__criteria", _text_pipe_for("criteria")))
    if _has_nonempty(df_train, "description"):
        blocks.append(("pos__description", _text_pipe_for("description")))
    # SEM text
    emb_crit = ClinicalBERTTextEmbedder("criteria", max_length=384, batch_size=32, fp16=True)
    emb_desc = ClinicalBERTTextEmbedder("description", max_length=384, batch_size=32, fp16=True)
    emb_smi  = ChemBERTaEmbedder(col="smiles", max_length=96, batch_size=64, fp16=True)
    if _has_nonempty(df_train, "criteria"):
        blocks.append(("sem__criteria", Pipeline([
            ("emb", LLMProjector(emb_crit) if USE_LLM_PROJECTOR else emb_crit)
        ])))
    if _has_nonempty(df_train, "description"):
        blocks.append(("sem__description", Pipeline([
            ("emb", LLMProjector(emb_desc) if USE_LLM_PROJECTOR else emb_desc)
        ])))
    if _has_nonempty(df_train, "smiles"):
        blocks.append(("sem__smiles", Pipeline([
            ("emb", LLMProjector(emb_smi) if USE_LLM_PROJECTOR else emb_smi)
        ])))
    # ICD (pruned)
    if _has_nonempty(df_train, "icdcodes"):
        blocks.append(("pos__icd", Pipeline([("mh", ICDMultiHotPruned("icdcodes", min_count=ICD_MIN_COUNT))])))
    # Diseases dual (mean ClinicalBERT over terms) — left as a dense score vector (no SVD here)
    if _has_nonempty(df_train, "diseases"):
        blocks.append(("diseases_dual", Pipeline([("dual", ClinicalBERTTextEmbedder("diseases", max_length=32, batch_size=64, fp16=True))])))
    # Drugs fusion (names + SMILES TF-IDF)
    if _has_nonempty(df_train, "drugs") or _has_nonempty(df_train, "smiles"):
        blocks.append(("drugs_fusion", DrugsFusionRaw(smiles_col="smiles", drugs_col="drugs",
                                                      max_smiles_features=TFIDF_SMILES_MAX)))
    # Fit & validate
    validated = []
    for name, pipe in blocks:
        try:
            Xt = to_csr(pipe.fit_transform(df_train))
            if Xt.shape[1] == 0:
                print(f"Skipping block '{name}' (0 features)."); continue
            validated.append((name, pipe))
        except Exception as e:
            print(f"Skipping block '{name}' due to error: {e}")
    assert validated, "No usable feature blocks."
    return validated

# --------- Human-readable original feature names ---------
def _names_pos_tfidf(pipe, col):
    try:
        vec = pipe.named_steps["tfidf"]
        toks = vec.get_feature_names_out().tolist()
        return [f"pos__{col}__{t}" for t in toks]
    except Exception:
        return []

def _names_cat(pipe):
    try:
        ohe = pipe.named_steps["onehot"]
        base = ["drug_type"]
        try: cats = ohe.get_feature_names_out(base).tolist()
        except Exception: cats = ohe.get_feature_names_out().tolist()
        return [f"cat__{c}" for c in cats]
    except Exception:
        return []

def _names_icd_pruned(pipe):
    try:
        mh = pipe.named_steps.get("mh", None) or pipe
        labels = getattr(mh, "keep_labels_", None)
        return [f"pos__icd__{c}" for c in (labels or [])]
    except Exception:
        return []

def _names_llm_dim(prefix, dim):
    return [f"{prefix}__d{i+1}" for i in range(dim)]

def _names_drugs_fusion(pipe):
    est = pipe.steps[-1][1] if hasattr(pipe, "steps") and pipe.steps else pipe
    names = []
    if hasattr(est, "drug_classes_") and est.drug_classes_ is not None:
        names += [f"drugs__{c}" for c in est.drug_classes_]
    if hasattr(est, "smiles_vocab_") and est.smiles_vocab_ is not None:
        names += [f"smiles_tfidf__{t}" for t in est.smiles_vocab_]
    return names

def extract_feature_names_pre(blocks, df_like: pd.DataFrame):
    names = []
    for name, pipe in blocks:
        if name == "num":
            names.append("num__enrollment_log1p"); continue
        if name == "cat":
            names += _names_cat(pipe); continue
        if name == "pos__criteria":
            names += _names_pos_tfidf(pipe, "criteria"); continue
        if name == "pos__description":
            names += _names_pos_tfidf(pipe, "description"); continue
        if name == "pos__icd":
            names += _names_icd_pruned(pipe); continue
        if name == "sem__criteria":
            emb = pipe.named_steps["emb"]
            dim = int(getattr(emb, "output_dim", getattr(getattr(emb, "inner", None), "output_dim", 768) or LLM_PROJECTED_DIM))
            names += _names_llm_dim("sem__criteria", dim); continue
        if name == "sem__description":
            emb = pipe.named_steps["emb"]
            dim = int(getattr(emb, "output_dim", getattr(getattr(emb, "inner", None), "output_dim", 768) or LLM_PROJECTED_DIM))
            names += _names_llm_dim("sem__description", dim); continue
        if name == "sem__smiles":
            emb = pipe.named_steps["emb"]
            dim = int(getattr(emb, "output_dim", getattr(getattr(emb, "inner", None), "output_dim", 768) or LLM_PROJECTED_DIM))
            names += _names_llm_dim("sem__smiles", dim); continue
        if name == "diseases_dual":
            # this block produces a dense vector from text; label dims
            emb = pipe.named_steps.get("dual", None)
            dim = int(getattr(emb, "output_dim", 768))
            names += _names_llm_dim("diseases_dual", dim); continue
        if name == "drugs_fusion":
            names += _names_drugs_fusion(pipe); continue
        # fallback
        k = to_csr(pipe.transform(df_like.head(2))).shape[1]
        names += [f"{name}__d{i+1}" for i in range(k)]
    return names

# --------- Global block-wise SVD with FIXED grouping ---------
class IdentityProjector:
    def __init__(self, n_components): self.n_components = int(n_components)
    def fit(self, X, y=None): return self
    def transform(self, X): return X.toarray() if sp.issparse(X) else np.asarray(X)

def _origin_from_name(feat: str) -> str:
    if feat.startswith("pos__criteria__"):     return "pos__criteria"
    if feat.startswith("pos__description__"):  return "pos__description"
    if feat.startswith("pos__icd__"):          return "pos__icd"
    if feat.startswith("sem__criteria__"):     return "sem__criteria"
    if feat.startswith("sem__description__"):  return "sem__description"
    if feat.startswith("sem__smiles__"):       return "sem__smiles"
    if feat.startswith("drugs__") or feat.startswith("smiles_tfidf__"):
        return "drugs_fusion"
    if feat.startswith("num__"):               return "num"
    if feat.startswith("cat__"):               return "cat"
    if feat.startswith("diseases_dual__"):     return "diseases_dual"
    return feat.split("__", 1)[0]

def group_feature_indices_by_origin(feature_names):
    from collections import OrderedDict, defaultdict
    groups = defaultdict(list)
    for j, n in enumerate(feature_names):
        groups[_origin_from_name(n)].append(j)
    prefer = [
        "num","cat",
        "pos__criteria","pos__description","pos__icd",
        "drugs_fusion",
        "sem__criteria","sem__description","sem__smiles",
        "diseases_dual",
    ]
    out = OrderedDict()
    for k in prefer:
        if k in groups: out[k] = groups[k]
    for k in sorted(groups.keys()):
        if k not in out: out[k] = groups[k]
    return out

def fit_blockwise_svd_with_global_cap(
    X_tr, feature_names, target=TARGET_EXPLAINED_VAR,
    max_k_per_block=MAX_K_PER_BLOCK, global_k=GLOBAL_MAX_COMPONENTS,
    min_per_block=MIN_PER_BLOCK, random_state=RANDOM_STATE
):
    from collections import OrderedDict
    groups = group_feature_indices_by_origin(feature_names)
    spans = OrderedDict((k, np.asarray(v, dtype=int)) for k, v in groups.items())

    probe_info = {}
    total_capacity = 0
    for origin, idxs in spans.items():
        Xi = X_tr[:, idxs]
        if Xi.shape[1] < 2 or min(Xi.shape) < 2:
            k_max = Xi.shape[1]; k_target = k_max
        else:
            k_geom_cap = min(max_k_per_block, min(Xi.shape)-1)
            k_probe = max(2, k_geom_cap)
            svd_probe = TruncatedSVD(n_components=k_probe, random_state=random_state)
            _ = svd_probe.fit_transform(Xi)
            csum = np.cumsum(svd_probe.explained_variance_ratio_)
            k_target = int(np.searchsorted(csum, target)) + 1
            k_target = max(min_per_block, min(k_target, k_probe))
            k_max = k_geom_cap
        probe_info[origin] = dict(k_target=int(k_target), k_max=int(k_max))
        total_capacity += int(k_max)

    global_goal = int(min(global_k, total_capacity))
    proposed = {o:int(info["k_target"]) for o,info in probe_info.items()}
    total_proposed = sum(proposed.values())

    if total_proposed <= global_goal:
        final_alloc = proposed.copy()
    else:
        scale = global_goal / float(total_proposed)
        final_alloc = {o:max(min_per_block, int(np.floor(proposed[o]*scale))) for o in spans.keys()}

    remain = global_goal - sum(final_alloc.values())
    if remain > 0:
        prio = []
        for o in spans.keys():
            base = 1.0 + (0.2 if o.startswith("sem__") else 0.0)
            if "criteria" in o: base += 0.1
            prio.append((o, base))
        prio.sort(key=lambda x: x[1], reverse=True)
        caps = {o:probe_info[o]["k_max"] for o in spans.keys()}
        i = 0
        safety = 10*len(prio)+1000
        while remain>0 and safety>0:
            o,_ = prio[i%len(prio)]
            if final_alloc[o] < caps[o]:
                final_alloc[o]+=1; remain-=1
            i+=1; safety-=1

    # Fit projectors
    models, kept, evsum = {}, {}, {}
    for origin, idxs in spans.items():
        Xi = X_tr[:, idxs]
        k_final = int(final_alloc[origin])
        if Xi.shape[1] < 2 or min(Xi.shape) < 2:
            proj = IdentityProjector(n_components=Xi.shape[1]).fit(Xi)
            models[origin] = proj; kept[origin]=int(Xi.shape[1]); evsum[origin]=1.0
            continue
        k_final = max(min_per_block, min(k_final, min(Xi.shape)-1, max_k_per_block))
        svd = TruncatedSVD(n_components=k_final, random_state=random_state).fit(Xi)
        models[origin]=svd; kept[origin]=int(svd.n_components); evsum[origin]=float(np.sum(svd.explained_variance_ratio_))

    assert sum(kept.values()) == global_goal, f"Global cap mismatch: {sum(kept.values())} != {global_goal}"
    return models, spans, kept, evsum, final_alloc, global_goal

def make_reduced_names_with_top_original(models, spans, feature_names_pre):
    names = []
    for origin, idxs in spans.items():
        block_names = [feature_names_pre[j] for j in idxs]
        proj = models[origin]
        comps = getattr(proj, "components_", None)
        if comps is None or (hasattr(comps, "size") and comps.size == 0):
            k = getattr(proj, "n_components", len(block_names))
            for i in range(1, k+1): names.append(f"{origin}__svd_{i}")
            continue
        for i in range(comps.shape[0]):
            vec = comps[i]
            j_top = int(np.argmax(np.abs(vec)))
            top = block_names[j_top] if 0 <= j_top < len(block_names) else origin
            names.append(f"{origin}__{top}__svd_{i+1}")
    return names

def transform_blockwise_svd(X, models, spans, feature_names_pre=None, use_top_names=True):
    Zs, names = [], []
    for origin, proj in models.items():
        Xi = X[:, spans[origin]]
        Zi = proj.transform(Xi)
        Zs.append(to_csr(Zi))
        if not use_top_names or feature_names_pre is None:
            names += [f"{origin}__svd_component_{i+1}" for i in range(Zi.shape[1])]
    Z = sp.hstack(Zs).tocsr() if len(Zs)>1 else Zs[0]
    if use_top_names and feature_names_pre is not None:
        names = make_reduced_names_with_top_original(models, spans, feature_names_pre)
    return Z, names

# ---------------- RUN ----------------
if __name__ == "__main__":
    HINT_OUT_ROOT.mkdir(parents=True, exist_ok=True)
    CTOD_JOIN_OUT_ROOT.mkdir(parents=True, exist_ok=True)

    for phase_tag in PHASE_TAGS:
        print(f"\n=== Building shared feature space for {phase_tag} ===")

        p_train = f"{BASE_HINT_DIR}/{phase_tag}_train.csv"
        p_valid = f"{BASE_HINT_DIR}/{phase_tag}_valid.csv"
        p_test  = f"{BASE_HINT_DIR}/{phase_tag}_test.csv"

        df_tr_raw = pd.read_csv(p_train, low_memory=False)
        df_va_raw = pd.read_csv(p_valid, low_memory=False)
        df_te_raw = pd.read_csv(p_test,  low_memory=False)

        df_tr = to_ctod_schema_from_hint(df_tr_raw)
        df_va = to_ctod_schema_from_hint(df_va_raw)
        df_te = to_ctod_schema_from_hint(df_te_raw)

        y_tr = df_tr["label"].astype(int).values
        y_va = df_va["label"].astype(int).values
        y_te = df_te["label"].astype(int).values

        df_tr_feat = drop_leak_columns(df_tr)
        df_va_feat = drop_leak_columns(df_va)
        df_te_feat = drop_leak_columns(df_te)

        # Build & fit blocks on HINT train
        block_list = build_and_fit_preprocessor(df_tr_feat)

        def _apply_blocks(blocks, df):
            mats = [to_csr(pipe.transform(df)) for _, pipe in blocks]
            return sp.hstack(mats, format="csr")

        X_tr = _apply_blocks(block_list, df_tr_feat)
        X_va = _apply_blocks(block_list, df_va_feat)
        X_te = _apply_blocks(block_list, df_te_feat)

        out_dir = HINT_OUT_ROOT / phase_tag
        out_dir.mkdir(parents=True, exist_ok=True)

        sp.save_npz(out_dir / "X_train.npz", X_tr)
        sp.save_npz(out_dir / "X_valid.npz", X_va)
        sp.save_npz(out_dir / "X_test.npz",  X_te)
        np.save(out_dir / "y_train.npy", y_tr)
        np.save(out_dir / "y_valid.npy", y_va)
        np.save(out_dir / "y_test.npy",  y_te)

        print(f"{phase_tag} HINT shapes: Train {X_tr.shape} | Valid {X_va.shape} | Test {X_te.shape}")

        # Human-readable original names (pre-SVD)
        feature_names_pre = extract_feature_names_pre(block_list, df_tr_feat)
        (out_dir / "feature_names.json").write_text(json.dumps(feature_names_pre, indent=2))
        joblib.dump(block_list, out_dir / "shared_blocks_hint_train.joblib")

        # Global SVD with effective cap + human-readable reduced names
        if APPLY_SVD:
            print(f"Fitting BLOCK-WISE SVD (global cap={GLOBAL_MAX_COMPONENTS}) on HINT-train for {phase_tag} …")
            t0 = time.time()
            models_bw, spans_bw, kept_bw, evsum_bw, alloc_bw, global_goal_bw = fit_blockwise_svd_with_global_cap(
                X_tr, feature_names_pre,
                target=TARGET_EXPLAINED_VAR,
                max_k_per_block=MAX_K_PER_BLOCK,
                global_k=GLOBAL_MAX_COMPONENTS,
                min_per_block=MIN_PER_BLOCK,
                random_state=RANDOM_STATE
            )

            X_tr_r, reduced_feature_names = transform_blockwise_svd(
                X_tr, models_bw, spans_bw, feature_names_pre, use_top_names=True
            )
            X_va_r, _ = transform_blockwise_svd(
                X_va, models_bw, spans_bw, feature_names_pre, use_top_names=True
            )
            X_te_r, _ = transform_blockwise_svd(
                X_te, models_bw, spans_bw, feature_names_pre, use_top_names=True
            )

            print(f"Block-wise SVD done | train:{X_tr_r.shape} valid:{X_va_r.shape} test:{X_te_r.shape} | {time.time()-t0:.2f}s")
            assert X_tr_r.shape[1] == global_goal_bw
            assert X_va_r.shape[1] == global_goal_bw
            assert X_te_r.shape[1] == global_goal_bw

            joblib.dump({"models": models_bw, "spans": spans_bw}, out_dir / "svd_blockwise_hint_train.joblib")
            sp.save_npz(out_dir / "X_train_reduced.npz", X_tr_r)
            sp.save_npz(out_dir / "X_valid_reduced.npz", X_va_r)
            sp.save_npz(out_dir / "X_test_reduced.npz",  X_te_r)
            (out_dir / "feature_names_reduced_with_top_original.json").write_text(json.dumps(reduced_feature_names, indent=2))

            svd_meta = {
                "mode": "blockwise+global_cap",
                "n_original_features": int(X_tr.shape[1]),
                "global_cap_requested": int(GLOBAL_MAX_COMPONENTS),
                "global_cap_effective": int(global_goal_bw),
                "target_explained_var_per_block": float(TARGET_EXPLAINED_VAR),
                "kept_per_origin": kept_bw,
                "evsum_per_origin": evsum_bw,
                "max_k_per_block": int(MAX_K_PER_BLOCK),
                "final_allocation": {k:int(v) for k,v in kept_bw.items()},
            }
            (out_dir / "svd_meta.json").write_text(json.dumps(svd_meta, indent=2))

        # CTOD joined (train+valid) with same blocks & SVD
        print(f"Transforming CTOD (joined) for {phase_tag} with HINT-trained preprocessor/SVD …")
        ct_train_p = f"{BASE_CTOD_DIR}/{phase_tag}_train.csv"
        ct_valid_p = f"{BASE_CTOD_DIR}/{phase_tag}_valid.csv"
        df_ct_tr_raw = pd.read_csv(ct_train_p, low_memory=False)
        df_ct_va_raw = pd.read_csv(ct_valid_p, low_memory=False)
        df_ct_tr = ensure_ctod_schema(df_ct_tr_raw)
        df_ct_va = ensure_ctod_schema(df_ct_va_raw)
        df_ct_join = pd.concat([df_ct_tr, df_ct_va], ignore_index=True)
        y_ct = df_ct_join["label"].astype(int).values

        df_ct_feat = drop_leak_columns(df_ct_join)
        X_ct = _apply_blocks(block_list, df_ct_feat)

        out_dir_ct = CTOD_JOIN_OUT_ROOT / phase_tag
        out_dir_ct.mkdir(parents=True, exist_ok=True)
        sp.save_npz(out_dir_ct / "X_test_joined.npz", X_ct)
        np.save(out_dir_ct / "y_test_joined.npy", y_ct)
        (out_dir_ct / "feature_names_joined.json").write_text(json.dumps(feature_names_pre, indent=2))

        if APPLY_SVD:
            bw = joblib.load(out_dir / "svd_blockwise_hint_train.joblib")
            models_bw, spans_bw = bw["models"], bw["spans"]
            X_ct_r, reduced_feature_names_ct = transform_blockwise_svd(
                X_ct, models_bw, spans_bw, feature_names_pre, use_top_names=True
            )
            assert X_ct_r.shape[1] == svd_meta["global_cap_effective"]
            sp.save_npz(out_dir_ct / "X_test_joined_reduced.npz", X_ct_r)
            (out_dir_ct / "feature_names_joined_reduced_with_top_original.json").write_text(
                json.dumps(reduced_feature_names_ct, indent=2)
            )

    print("\nDone. Tightened raw feature space, fixed grouping, and human-readable reduced names.")



=== Building shared feature space for phase_I ===


tokenizer_config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at nazyrova/clinicalBERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/420 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/13.7M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/13.7M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at nazyrova/clinicalBERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


phase_I HINT shapes: Train (5417, 13615) | Valid (1159, 13615) | Test (1160, 13615)
Fitting BLOCK-WISE SVD (global cap=1000) on HINT-train for phase_I …
Block-wise SVD done | train:(5417, 1000) valid:(1159, 1000) test:(1160, 1000) | 65.11s
Transforming CTOD (joined) for phase_I with HINT-trained preprocessor/SVD …

=== Building shared feature space for phase_II ===


Some weights of BertModel were not initialized from the model checkpoint at nazyrova/clinicalBERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at nazyrova/clinicalBERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


phase_II HINT shapes: Train (6761, 13684) | Valid (1452, 13684) | Test (1449, 13684)
Fitting BLOCK-WISE SVD (global cap=1000) on HINT-train for phase_II …
Block-wise SVD done | train:(6761, 1000) valid:(1452, 1000) test:(1449, 1000) | 77.80s
Transforming CTOD (joined) for phase_II with HINT-trained preprocessor/SVD …

=== Building shared feature space for phase_III ===


Some weights of BertModel were not initialized from the model checkpoint at nazyrova/clinicalBERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at nazyrova/clinicalBERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


phase_III HINT shapes: Train (4165, 11502) | Valid (894, 11502) | Test (893, 11502)
Fitting BLOCK-WISE SVD (global cap=1000) on HINT-train for phase_III …
Block-wise SVD done | train:(4165, 1000) valid:(894, 1000) test:(893, 1000) | 48.18s
Transforming CTOD (joined) for phase_III with HINT-trained preprocessor/SVD …

Done. Tightened raw feature space, fixed grouping, and human-readable reduced names.


In [1]:
import pandas as pd
import numpy as np

# --- Paths: reuse the same ones you used in the main script ---
BASE_HINT_DIR = "/Users/antoniocortes/Tese/MyModel(hybrid)/Data/HINT"

BASE_CTOD_DIR = "/Users/antoniocortes/Tese/MyModel(hybrid)/Data/CTOD"

# --- Canonical CTOD schema and mapping (same as in your main script) ---
CTOD_SCHEMA = [
    "nct_id", "brief_title", "overall_status", "start_date", "completion_date", "phase",
    "enrollment", "diseases", "drugs", "drug_type", "description",
    "smiles", "criteria", "label", "icdcodes",
]

HINT_TO_CTOD = {
    "NCT Number":       "nct_id",
    "Brief Title":      "brief_title",
    "Study Status":     "overall_status",
    "Start Date":       "start_date",
    "Completion Date":  "completion_date",
    "phase":            "phase",
    "Enrollment":       "enrollment",
    "diseases":         "diseases",
    "drugs":            "drugs",
    "criteria":         "criteria",
    "label":            "label",
    "smiles":           "smiles",
    "icdcodes":         "icdcodes",
}

LEAK_DROP = {"overall_status", "start_date", "completion_date", "phase"}


def to_ctod_schema_from_hint(df: pd.DataFrame) -> pd.DataFrame:
    d = df.rename(columns={c: HINT_TO_CTOD[c] for c in df.columns if c in HINT_TO_CTOD}).copy()
    # create any missing columns
    for col in CTOD_SCHEMA:
        if col not in d.columns:
            d[col] = np.nan
    d = d[CTOD_SCHEMA].copy()  # enforce ordering
    d["enrollment"] = pd.to_numeric(d["enrollment"], errors="coerce")
    d["label"] = pd.to_numeric(d["label"], errors="coerce").astype("Int64").fillna(0).astype(int)
    for c in ["nct_id","brief_title","overall_status","start_date","completion_date","phase",
              "diseases","drugs","drug_type","description","smiles","criteria","icdcodes"]:
        d[c] = d[c].astype(object).where(pd.notna(d[c]), np.nan)
    return d


def ensure_ctod_schema(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    for col in CTOD_SCHEMA:
        if col not in d.columns:
            d[col] = np.nan
    d = d[CTOD_SCHEMA].copy()
    d["enrollment"] = pd.to_numeric(d["enrollment"], errors="coerce")
    d["label"] = pd.to_numeric(d["label"], errors="coerce").astype("Int64").fillna(0).astype(int)
    for c in ["nct_id","brief_title","overall_status","start_date","completion_date","phase",
              "diseases","drugs","drug_type","description","smiles","criteria","icdcodes"]:
        d[c] = d[c].astype(object).where(pd.notna(d[c]), np.nan)
    return d


def drop_leak_columns(df: pd.DataFrame) -> pd.DataFrame:
    return df[[c for c in df.columns if c not in LEAK_DROP]].copy()


# ============================================================
# 1) HINT → canonical CTOD schema
# ============================================================

hint_path = f"{BASE_HINT_DIR}/phase_I_train.csv"   # any split/phase works; Phase I is fine
df_hint_raw = pd.read_csv(hint_path, low_memory=False)

print("=== HINT (phase_I_train) – ORIGINAL COLUMNS ===")
print(df_hint_raw.columns.tolist())
print()

df_hint = to_ctod_schema_from_hint(df_hint_raw)

print("=== HINT after to_ctod_schema_from_hint – COLUMNS ===")
print(df_hint.columns.tolist())
print()

print("Do transformed HINT columns exactly match CTOD_SCHEMA?")
print(df_hint.columns.tolist() == CTOD_SCHEMA)
print()

# Show which columns were newly created and filled with NaN
created_cols = [c for c in CTOD_SCHEMA if c not in df_hint_raw.rename(columns=HINT_TO_CTOD).columns]
print("Columns created by to_ctod_schema_from_hint (not present in raw HINT):")
print(created_cols)
print()

for c in created_cols:
    print(f"Column '{c}' all NaN after harmonisation? ->", df_hint[c].isna().all())
print()

# Side-by-side example: NCT Number → nct_id, Enrollment → enrollment
print("=== Example value mapping (first 3 rows) ===")
cols_raw = [c for c in ["NCT Number", "Enrollment"] if c in df_hint_raw.columns]
print("Raw HINT:")
print(df_hint_raw[cols_raw].head(3))
print("\nAfter schema harmonisation:")
print(df_hint[["nct_id", "enrollment"]].head(3))
print()

# ============================================================
# 2) CTOD → ensure_ctod_schema
# ============================================================

ctod_path = f"{BASE_CTOD_DIR}/phase_I_train.csv"
df_ctod_raw = pd.read_csv(ctod_path, low_memory=False)

print("=== CTOD (phase_I_train) – ORIGINAL COLUMNS ===")
print(df_ctod_raw.columns.tolist())
print()

df_ctod = ensure_ctod_schema(df_ctod_raw)

print("=== CTOD after ensure_ctod_schema – COLUMNS ===")
print(df_ctod.columns.tolist())
print()

print("Do transformed CTOD columns exactly match CTOD_SCHEMA?")
print(df_ctod.columns.tolist() == CTOD_SCHEMA)
print()

created_ctod_cols = [c for c in CTOD_SCHEMA if c not in df_ctod_raw.columns]
print("Columns created by ensure_ctod_schema (if any):")
print(created_ctod_cols)
for c in created_ctod_cols:
    print(f"Column '{c}' all NaN after harmonisation? ->", df_ctod[c].isna().all())
print()

# ============================================================
# 3) Leak-field removal proof
# ============================================================

print("=== Columns AFTER dropping leakage fields on HINT ===")
df_hint_no_leak = drop_leak_columns(df_hint)
print(df_hint_no_leak.columns.tolist())
print()

print("Were leakage columns removed?")
for c in LEAK_DROP:
    print(f"{c} in final feature table? ->", c in df_hint_no_leak.columns)


=== HINT (phase_I_train) – ORIGINAL COLUMNS ===
['NCT Number', 'Brief Title', 'Study Status', 'Start Date', 'Completion Date', 'phase', 'Enrollment', 'diseases', 'drugs', 'criteria', 'label', 'smiles', 'icdcodes', 'icd_list', 'flat_icds', 'CCSR', 'CCSR_MAIN', 'fused_pred']

=== HINT after to_ctod_schema_from_hint – COLUMNS ===
['nct_id', 'brief_title', 'overall_status', 'start_date', 'completion_date', 'phase', 'enrollment', 'diseases', 'drugs', 'drug_type', 'description', 'smiles', 'criteria', 'label', 'icdcodes']

Do transformed HINT columns exactly match CTOD_SCHEMA?
True

Columns created by to_ctod_schema_from_hint (not present in raw HINT):
['drug_type', 'description']

Column 'drug_type' all NaN after harmonisation? -> True
Column 'description' all NaN after harmonisation? -> True

=== Example value mapping (first 3 rows) ===
Raw HINT:
    NCT Number  Enrollment
0  NCT02706899        19.0
1  NCT01422109        44.0
2  NCT02670941        31.0

After schema harmonisation:
        n

In [3]:
import pandas as pd
import numpy as np



# Load raw phase I
df_hint_raw = pd.read_csv(f"{BASE_HINT_DIR}/phase_I_train.csv", low_memory=False)
df_ctod_raw = pd.read_csv(f"{BASE_CTOD_DIR}/phase_I_train.csv", low_memory=False)

# Use your existing schema functions:
df_hint = to_ctod_schema_from_hint(df_hint_raw)
df_ctod = ensure_ctod_schema(df_ctod_raw)

# ---------------------------------------------------------
# 1) Missingness overview (proves heterogeneity)
# ---------------------------------------------------------
print("=== Missingness fraction per modality (HINT) ===")
print(df_hint[["enrollment","drug_type","smiles","diseases","drugs","icdcodes"]].isna().mean())
print()

print("=== Missingness fraction per modality (CTOD) ===")
print(df_ctod[["enrollment","drug_type","smiles","diseases","drugs","icdcodes"]].isna().mean())
print()

# ---------------------------------------------------------
# 2) Evidence of non-numeric or corrupted enrollment values
# ---------------------------------------------------------
enr_raw = df_hint_raw["Enrollment"]
non_numeric = enr_raw[pd.to_numeric(enr_raw, errors="coerce").isna() & enr_raw.notna()]
print("=== Examples of non-numeric enrollment entries in HINT ===")
print(non_numeric.head(5))
print()

# ---------------------------------------------------------
# 3) Free-text variability (short vs long)
# ---------------------------------------------------------
crit = df_hint["criteria"].fillna("")
lens = crit.str.len()

print("=== Criteria text length (HINT) ===")
print(f"Min length: {lens.min()}, Median: {lens.median()}, Max: {lens.max()}")
print("Example of very short criteria:")
print(crit[lens <= lens.quantile(0.05)].head(1).values[0])
print("Example of very long criteria (first 300 chars):")
print(crit[lens >= lens.quantile(0.95)].head(1).values[0][:300], "...")
print()

# ---------------------------------------------------------
# 4) Pronounced missingness in molecular/ontology features
# ---------------------------------------------------------
print("=== High-missingness modalities (HINT) ===")
for col in ["smiles", "icdcodes"]:
    rate = df_hint[col].isna().mean()
    print(f"{col}: {rate:.1%} missing")
print()


=== Missingness fraction per modality (HINT) ===
enrollment    0.012738
drug_type     1.000000
smiles        0.000000
diseases      0.000000
drugs         0.000000
icdcodes      0.000000
dtype: float64

=== Missingness fraction per modality (CTOD) ===
enrollment    0.011205
drug_type     1.000000
smiles        0.119522
diseases      0.294074
drugs         0.000000
icdcodes      0.000000
dtype: float64

=== Examples of non-numeric enrollment entries in HINT ===
Series([], Name: Enrollment, dtype: float64)

=== Criteria text length (HINT) ===
Min length: 32, Median: 1988.0, Max: 14818
Example of very short criteria:
Inclusion Criteria:

* Subjects 18 to 65 years of age with a positive skin prick test with ragweed allergen

Exclusion Criteria:

* No active respiratory tract infection
Example of very long criteria (first 300 chars):
Inclusion Criteria:

* Male or non-pregnant, non-lactating female 12 years or age or older.
* Signed informed consent form, which meets all criteria of current

In [5]:
from pathlib import Path

PHASE_TAGS = ["phase_I", "phase_II", "phase_III"]

HINT_OUT_ROOT = Path("/Users/antoniocortes/Tese/MyModel(hybrid)/hint_xgb_artifacts")
CTOD_OUT_ROOT = Path("/Users/antoniocortes/Tese/MyModel(hybrid)/ctod_xgb_artifacts")

RELEVANT_SUFFIXES = {".npz", ".npy", ".json", ".joblib", ".txt"}

def list_phase_artifacts(root: Path, label: str):
    print(f"\n=== {label} artifacts in {root} ===")
    if not root.exists():
        print(f"[WARN] Directory does not exist: {root}")
        return

    for phase in PHASE_TAGS:
        phase_dir = root / phase
        if not phase_dir.exists():
            print(f"\n  {phase}: [no directory found]")
            continue

        print(f"\n  {phase}:")
        files = sorted(
            [
                p for p in phase_dir.rglob("*")
                if p.is_file() and p.suffix in RELEVANT_SUFFIXES
            ],
            key=lambda p: str(p)
        )

        if not files:
            print("    [no relevant files found]")
            continue

        for p in files:
            rel = p.relative_to(root)
            size_kb = p.stat().st_size / 1024.0
            print(f"    {rel}  ({size_kb:7.1f} KB)")


if __name__ == "__main__":
    list_phase_artifacts(HINT_OUT_ROOT, "HINT (preprocessing + latent space)")
    list_phase_artifacts(CTOD_OUT_ROOT, "CTOD (OOD evaluation)")
    print("\nDone listing preprocessing package artifacts.")



=== HINT (preprocessing + latent space) artifacts in /Users/antoniocortes/Tese/MyModel(hybrid)/hint_xgb_artifacts ===

  phase_I:
    phase_I/X_test.npz  (15362.2 KB)
    phase_I/X_test_reduced.npz  ( 7344.5 KB)
    phase_I/X_train.npz  (69570.6 KB)
    phase_I/X_train_reduced.npz  (37559.5 KB)
    phase_I/X_valid.npz  ( 6456.1 KB)
    phase_I/X_valid_reduced.npz  ( 4413.3 KB)
    phase_I/feature_names.json  (  395.3 KB)
    phase_I/feature_names_reduced_with_top_original.json  (   46.3 KB)
    phase_I/shared_blocks_hint_train.joblib  (880553.4 KB)
    phase_I/svd_blockwise_hint_train.joblib  (36758.3 KB)
    phase_I/svd_meta.json  (    0.8 KB)
    phase_I/y_test.npy  (    9.2 KB)
    phase_I/y_train.npy  (   42.4 KB)
    phase_I/y_valid.npy  (    9.2 KB)

  phase_II:
    phase_II/X_test.npz  ( 8229.5 KB)
    phase_II/X_test_reduced.npz  ( 5547.7 KB)
    phase_II/X_train.npz  (86105.0 KB)
    phase_II/X_train_reduced.npz  (47354.8 KB)
    phase_II/X_valid.npz  (19671.4 KB)
    phase_I