# Feature Engineering & Semantic Embedding Pipeline

This notebook builds features for citation-impact prediction and saves two versions:
1. **Without embeddings** — numerical + one-hot features only
2. **With embeddings** — adds sentence-transformer embeddings of title + abstract

All saved feature parquets use **meaningful column names**, and each is accompanied by a JSON file listing those column names in order.

## Setup

In [1]:
import os

# Prevent PyTorch/libomp CPU segfaults on macOS: pin every numeric backend's
# thread-count variable to "1", and set the two Hugging Face related variables.
# All values are set in one call before the heavy libraries are imported.
os.environ.update(
    {
        "OMP_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        "OPENBLAS_NUM_THREADS": "1",
        "VECLIB_MAXIMUM_THREADS": "1",
        "NUMEXPR_NUM_THREADS": "1",
        "TOKENIZERS_PARALLELISM": "false",
        "TRANSFORMERS_VERBOSITY": "error",
    }
)

In [2]:
import re
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

import torch
from sentence_transformers import SentenceTransformer

# Global seed reused by every stochastic step in the notebook (numpy, torch, sklearn split).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Keep torch single-threaded, matching the *_NUM_THREADS environment variables set above.
torch.set_num_threads(1)
try:
    torch.set_num_interop_threads(1)
except RuntimeError:
    # PyTorch only accepts this setting once per process (and only before inter-op
    # work has started), so re-running this cell in a live kernel would otherwise
    # abort here. Keep the value already in effect and report it instead.
    print(f"Inter-op thread count left at {torch.get_num_interop_threads()} (already configured).")

# Deterministic cuDNN behaviour (only relevant if a CUDA device is ever used).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Load & Prepare Data

In [3]:
df = pd.read_csv("../data/eda_papers.csv")
print(f"Loaded {len(df)} papers.")

# Basic features
# utc=True guarantees a tz-aware column, so the subtraction from a tz-aware "now"
# below cannot fail on naive timestamps (and matches the parsing used in
# add_offline_paper_features).
df["published_at"] = pd.to_datetime(df["published_at"], utc=True)
# NOTE(review): age is measured against the wall clock at run time, so age_days,
# age_bin and the popularity labels shift with the day the notebook is executed.
# Consider pinning a reference timestamp if exact reproducibility is required.
df["age_days"] = (pd.Timestamp.now(tz="UTC") - df["published_at"]).dt.days
# NaN-safe lengths: a missing title/abstract counts as length 0 instead of raising.
df["title_len"] = df["title"].fillna("").astype(str).str.len()
df["abstract_len"] = df["abstract"].fillna("").astype(str).str.len()
df["num_authors"] = df["authors"].apply(lambda x: len(str(x).split("|")) if pd.notna(x) else 0)

# Filter very new papers (.copy() so the column assignments below write to an
# independent frame rather than a slice of the original)
df = df[df["age_days"] > 30].copy()

# Age binning: floor the age to whole weeks, expressed in days
df["age_bin"] = (df["age_days"] // 7) * 7
df["age_bin"] = pd.to_numeric(df["age_bin"], errors="coerce")
df["citation_count"] = pd.to_numeric(df["citation_count"], errors="coerce").fillna(0)
df = df.dropna(subset=["age_bin"]).copy()
df["age_bin"] = df["age_bin"].astype(int)

# Popularity bucket (target): citation thresholds grow as c * age_bin ** degree
degree = 1.08
c1 = 0.019  # mid threshold
c2 = 0.035  # high threshold

mid_threshold = c1 * (df["age_bin"] ** degree)
high_threshold = c2 * (df["age_bin"] ** degree)

conditions = [
    df["citation_count"] < mid_threshold,
    (df["citation_count"] >= mid_threshold) & (df["citation_count"] < high_threshold),
    df["citation_count"] >= high_threshold,
]
df["popularity_bucket"] = np.select(conditions, ["low", "mid", "high"], default="low")

print(df["popularity_bucket"].value_counts())

Loaded 102215 papers.
popularity_bucket
low     93538
mid      4438
high     3579
Name: count, dtype: int64


## Feature Engineering

In [4]:
# Pre-compiled patterns shared by the text features below.
_WORD_RE = re.compile(r"[A-Za-z0-9]+(?:'[A-Za-z0-9]+)?")  # alphanumeric token, optional 'suffix
_SENT_SPLIT_RE = re.compile(r"[.!?]+\s+")  # terminal punctuation followed by whitespace
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
_EMAIL_RE = re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w+\b")


def add_offline_paper_features(
    df: pd.DataFrame,
    *,
    title_col: str = "title",
    abstract_col: str = "abstract",
    authors_col: str = "authors",
    primary_cat_col: str = "primary_category",
    all_cats_col: str = "all_categories",
    published_at_col: str = "published_at",
    author_sep: str = "|",
) -> pd.DataFrame:
    """Add offline-only features for impact prediction.

    Works on a copy of ``df``; the input frame is not modified. Feature groups
    for datetime, categories and authors are only added when their source
    column is present. Text features are always added; a missing title or
    abstract column is treated as a column of empty strings.

    Parameters
    ----------
    df : input frame, one row per paper.
    title_col, abstract_col : names of the free-text columns.
    authors_col : column holding author names joined by ``author_sep``.
    primary_cat_col : column holding the primary category string.
    all_cats_col : column holding all categories joined by ``"|"``.
    published_at_col : column parseable by ``pd.to_datetime``.
    author_sep : separator used inside ``authors_col``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the engineered feature columns appended.
    """
    out = df.copy()

    def safe_str(s) -> str:
        return "" if pd.isna(s) else str(s)

    def text_column(col: str) -> pd.Series:
        # DataFrame.get(col, "") would hand back a plain str for a missing column,
        # which has no .apply/.str; fall back to an aligned Series of "" instead.
        if col in out.columns:
            return out[col].apply(safe_str)
        return pd.Series("", index=out.index, dtype=object)

    def words(s: str):
        return _WORD_RE.findall(s)

    def avg_word_len(s: str) -> float:
        # Tokenise once per string; NaN when there are no tokens.
        ws = words(s)
        return np.mean([len(w) for w in ws]) if ws else np.nan

    def sentence_count(s: str) -> int:
        s = s.strip()
        return 0 if not s else max(1, len(_SENT_SPLIT_RE.split(s)))

    def keyword_flags(text_lower: str, patterns: dict) -> dict:
        return {name: int(bool(re.search(pat, text_lower))) for name, pat in patterns.items()}

    # ── Datetime ──
    if published_at_col in out.columns:
        dt = pd.to_datetime(out[published_at_col], errors="coerce", utc=True)
        out["pub_hour_utc"] = dt.dt.hour
        out["pub_dow"] = dt.dt.dayofweek
        out["pub_month"] = dt.dt.month
        out["is_weekend"] = dt.dt.dayofweek.isin([5, 6]).astype("Int64")

    # ── Category features ──
    if all_cats_col in out.columns:
        cats = out[all_cats_col].fillna("").astype(str)
        out["num_categories"] = cats.apply(lambda x: 0 if x.strip() == "" else len([c for c in x.split("|") if c.strip()]))
        out["is_cross_listed"] = (out["num_categories"] > 1).astype("Int64")

        def starts_with(prefix: str):
            # 1 when any listed category begins with the given archive prefix.
            return cats.apply(lambda x: int(any(c.strip().startswith(prefix) for c in x.split("|") if c.strip())))

        out["has_cs"] = starts_with("cs.")
        out["has_stat"] = starts_with("stat.")
        out["has_math"] = starts_with("math.")
        out["has_eess"] = starts_with("eess.")
        out["has_qbio"] = starts_with("q-bio.")

    if primary_cat_col in out.columns:
        pc = out[primary_cat_col].fillna("").astype(str)
        out["primary_is_cs"] = pc.str.startswith("cs.").astype("Int64")
        out["primary_is_stat"] = pc.str.startswith("stat.").astype("Int64")

    # ── Author features ──
    if authors_col in out.columns:
        auth = out[authors_col].fillna("").astype(str)
        author_lists = auth.apply(lambda x: [a.strip() for a in x.split(author_sep) if a.strip()])
        out["num_authors_offline"] = author_lists.apply(len)
        out["first_author"] = author_lists.apply(lambda xs: xs[0] if len(xs) else "")
        out["last_author"] = author_lists.apply(lambda xs: xs[-1] if len(xs) else "")

        author_name_lens = author_lists.apply(lambda xs: [len(a) for a in xs] if xs else [])
        out["author_name_len_mean"] = author_name_lens.apply(lambda ls: float(np.mean(ls)) if ls else np.nan)
        out["author_name_len_max"] = author_name_lens.apply(lambda ls: float(np.max(ls)) if ls else np.nan)
        out["has_many_authors_ge5"] = (out["num_authors_offline"] >= 5).astype("Int64")
        out["has_many_authors_ge10"] = (out["num_authors_offline"] >= 10).astype("Int64")

    # ── Text features ──
    title = text_column(title_col)
    abstract = text_column(abstract_col)
    title_lower = title.str.lower()
    abs_lower = abstract.str.lower()

    out["title_char_len"] = title.str.len()
    out["abstract_char_len"] = abstract.str.len()
    out["title_word_count"] = title.apply(lambda s: len(words(s)))
    out["abstract_word_count"] = abstract.apply(lambda s: len(words(s)))
    out["title_avg_word_len"] = title.apply(avg_word_len)
    out["abstract_avg_word_len"] = abstract.apply(avg_word_len)
    out["abstract_sentence_count"] = abstract.apply(sentence_count)
    # A zero sentence count becomes NaN so empty abstracts yield NaN rather than inf.
    out["abstract_avg_words_per_sentence"] = (
        out["abstract_word_count"] / out["abstract_sentence_count"].replace(0, np.nan)
    )

    def ratio_of(pattern: str, s: str) -> float:
        # Share of characters in s matching pattern; 0.0 for an empty string.
        return len(re.findall(pattern, s)) / max(1, len(s)) if s else 0.0

    out["title_digit_ratio"] = title.apply(lambda s: ratio_of(r"\d", s))
    out["abstract_digit_ratio"] = abstract.apply(lambda s: ratio_of(r"\d", s))
    out["title_punct_ratio"] = title.apply(lambda s: ratio_of(r"[^\w\s]", s))
    out["abstract_punct_ratio"] = abstract.apply(lambda s: ratio_of(r"[^\w\s]", s))

    out["abstract_has_url"] = abstract.apply(lambda s: int(bool(_URL_RE.search(s)))).astype("Int64")
    out["abstract_has_email"] = abstract.apply(lambda s: int(bool(_EMAIL_RE.search(s)))).astype("Int64")
    # Plain substring checks (regex=False) on the lower-cased abstract.
    out["mentions_github"] = abs_lower.str.contains("github.com", regex=False).astype("Int64")
    out["mentions_code"] = abs_lower.str.contains("code", regex=False).astype("Int64")
    out["mentions_dataset"] = abs_lower.str.contains("dataset", regex=False).astype("Int64")
    out["mentions_benchmark"] = abs_lower.str.contains("benchmark", regex=False).astype("Int64")
    out["mentions_arxiv_id"] = abs_lower.str.contains("arxiv", regex=False).astype("Int64")
    out["mentions_doi"] = abs_lower.str.contains("doi", regex=False).astype("Int64")

    # Keyword patterns, matched against lower-cased "title abstract"
    kw_patterns = {
        "is_survey": r"\bsurvey\b|\breview\b",
        "is_benchmark_paper": r"\bbenchmark\b|\bleaderboard\b",
        "is_dataset_paper": r"\bdataset\b|\bcorpus\b",
        "is_system_paper": r"\bsystem\b|\bframework\b|\bplatform\b",
        "has_theory": r"\btheorem\b|\bproof\b|\bconvergence\b",
        "mentions_llm": r"\bllm\b|large language model|language model",
        "mentions_diffusion": r"\bdiffusion\b",
        "mentions_transformer": r"\btransformer\b",
        "mentions_agent": r"\bagent\b|\btool\b|\bplanning\b",
        "mentions_rl": r"\breinforcement learning\b|\brl\b",
        "mentions_multimodal": r"\bmultimodal\b|vision-language|vlm",
        "claims_sota": r"\bsota\b|state[- ]of[- ]the[- ]art",
        "claims_novel": r"\bnovel\b|\bnew\b|\bfirst\b|\bintroduce\b",
        "mentions_open_source": r"open[- ]source|we release|code is available",
        "mentions_experiments": r"\bexperiments?\b|\bwe evaluate\b|\bresults?\b",
    }
    combined_lower = (title_lower + " " + abs_lower).fillna("")
    kw_df = pd.DataFrame(
        combined_lower.apply(lambda s: keyword_flags(s, kw_patterns)).tolist(),
        index=out.index,
    )
    out = pd.concat([out, kw_df], axis=1)

    # Lexical diversity: type-token ratio over lower-cased abstract tokens
    def ttr(s: str) -> float:
        ws = [w.lower() for w in words(s)]
        return len(set(ws)) / len(ws) if ws else np.nan

    out["abstract_ttr"] = abstract.apply(ttr)
    out["log_abstract_word_count"] = np.log1p(out["abstract_word_count"])
    # Falls back to log1p(0) == 0.0 when no authors column was available.
    out["log_num_authors"] = np.log1p(out.get("num_authors_offline", 0))

    return out

## Build Model-Ready Data

In [5]:
# Apply feature engineering
feat_df = add_offline_paper_features(df)
print("feat_df shape:", feat_df.shape)

# Filter to papers with enough age for reliable citation signal
feat_df["age_days"] = pd.to_numeric(feat_df.get("age_days", np.nan), errors="coerce")
model_df = feat_df[feat_df["age_days"] >= 200].copy()
print("model_df:", model_df.shape)

# Encode target: low=0, mid=1, high=2
ordered_classes = ["low", "mid", "high"]
y = pd.Categorical(model_df["popularity_bucket"].astype(str), categories=ordered_classes, ordered=True).codes
if (y == -1).any():
    raise ValueError("Found unexpected popularity_bucket values outside low/mid/high.")

# Drop leak / ID columns
leak_cols = {"citation_count", "age_days", "age_bin", "popularity_bucket"}
id_cols = {"arxiv_id"}
drop_cols = leak_cols | id_cols
X = model_df.drop(columns=[c for c in drop_cols if c in model_df.columns]).copy()
print("X shape:", X.shape)

feat_df shape: (101555, 72)
model_df: (51455, 72)
X shape: (51455, 67)


## Train / Test Split

In [6]:
# Names of the raw text columns (excluded from numeric features, used for embeddings)
TITLE_COL, ABSTRACT_COL = "title", "abstract"

# Seeded 80/20 hold-out split, stratified on the encoded target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=SEED,
)
print("X_train:", X_train.shape, " X_test:", X_test.shape)

X_train: (41164, 67)  X_test: (10291, 67)


## Numerical Imputation & One-Hot Encoding

In [7]:
# Identify column groups
cat_cols = ["primary_category"] if "primary_category" in X_train.columns else []
num_cols = X_train.select_dtypes(include=["int64", "int32", "float64", "float32", "bool"]).columns.tolist()
num_cols = [c for c in num_cols if c not in {TITLE_COL, ABSTRACT_COL}]

# Numerical imputation
num_imputer = SimpleImputer(strategy="median")
X_train_num = num_imputer.fit_transform(X_train[num_cols]) if num_cols else np.zeros((len(X_train), 0), np.float32)
X_test_num  = num_imputer.transform(X_test[num_cols])      if num_cols else np.zeros((len(X_test),  0), np.float32)

# One-hot encoding
if cat_cols:
    cat_imputer = SimpleImputer(strategy="most_frequent")
    X_train_cat_raw = cat_imputer.fit_transform(X_train[cat_cols])
    X_test_cat_raw  = cat_imputer.transform(X_test[cat_cols])

    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    X_train_cat = ohe.fit_transform(X_train_cat_raw).astype(np.float32)
    X_test_cat  = ohe.transform(X_test_cat_raw).astype(np.float32)
    ohe_col_names = list(ohe.get_feature_names_out(cat_cols))
else:
    X_train_cat = np.zeros((len(X_train), 0), np.float32)
    X_test_cat  = np.zeros((len(X_test),  0), np.float32)
    ohe_col_names = []

# Build the authoritative column-name list
wo_emb_col_names = num_cols + ohe_col_names
print(f"num_cols: {len(num_cols)}, ohe_cols: {len(ohe_col_names)}, total: {len(wo_emb_col_names)}")

num_cols: 59, ohe_cols: 124, total: 183


---
## Section 1 — Save Without Embedding

In [8]:
SPLITS_DIR = "./splits"
os.makedirs(SPLITS_DIR, exist_ok=True)

# Feature frames: numeric block (cast to float32) followed by the one-hot block,
# labelled with the real column names
X_train_wo_emb_df = pd.DataFrame(
    np.concatenate([X_train_num.astype(np.float32), X_train_cat], axis=1),
    columns=wo_emb_col_names,
)
X_test_wo_emb_df = pd.DataFrame(
    np.concatenate([X_test_num.astype(np.float32), X_test_cat], axis=1),
    columns=wo_emb_col_names,
)

# Single-column label frames
y_train_df = pd.DataFrame(y_train, columns=["label"])
y_test_df = pd.DataFrame(y_test, columns=["label"])

# Write every parquet with the same engine/options, in a fixed order
wo_emb_outputs = {
    "X_train_wo_emb.parquet": X_train_wo_emb_df,
    "X_test_wo_emb.parquet": X_test_wo_emb_df,
    "y_train.parquet": y_train_df,
    "y_test.parquet": y_test_df,
}
for file_name, frame in wo_emb_outputs.items():
    frame.to_parquet(os.path.join(SPLITS_DIR, file_name), engine="fastparquet", index=False)

# Column names, in order, as a JSON list next to the parquets
with open(os.path.join(SPLITS_DIR, "column_names_wo_emb.json"), "w") as f:
    json.dump(wo_emb_col_names, f)

print(f"✅ Section 1 saved — X_train: {X_train_wo_emb_df.shape}, X_test: {X_test_wo_emb_df.shape}")
print(f"   Columns: {wo_emb_col_names[:5]} ... ({len(wo_emb_col_names)} total)")

✅ Section 1 saved — X_train: (41164, 183), X_test: (10291, 183)
   Columns: ['title_len', 'abstract_len', 'num_authors', 'pub_hour_utc', 'pub_dow'] ... (183 total)


---
## Section 2 — Generate Embeddings & Save

In [9]:
# Sentence-transformer checkpoint handed to encode_resumable as its model_name.
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Directory for the resumable embedding cache (.mmap / .json files); created if missing.
CACHE_DIR = "./emb_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

MAX_CHARS = 6000   # build_text truncates each "title [SEP] abstract" string to this many characters
CHUNK_SIZE = 2000  # rows encoded between two progress checkpoints in encode_resumable
BATCH_SIZE = 16    # batch_size passed to the model's encode() call for each chunk
DEVICE = "cpu"     # device passed to the SentenceTransformer constructor


def build_text(df_):
    """Concatenate title + abstract for embedding.

    Returns a string Series aligned with ``df_``: ``"<title> [SEP] <abstract>"``
    truncated to ``MAX_CHARS`` characters. Missing values, and a missing title
    or abstract column, are treated as empty strings.
    """

    def _text_column(col: str) -> pd.Series:
        # DataFrame.get(col, "") returns a bare str for a missing column, which has
        # no .fillna; use an aligned Series of empty strings as the fallback.
        if col in df_.columns:
            return df_[col].fillna("").astype(str)
        return pd.Series("", index=df_.index, dtype=object)

    combined = _text_column(TITLE_COL) + " [SEP] " + _text_column(ABSTRACT_COL)
    return combined.str.slice(0, MAX_CHARS)


def encode_resumable(texts: pd.Series, out_prefix: str, model_name: str):
    """
    Encode texts to embeddings with resume support via memmap.
    Writes: {out_prefix}.mmap (float32) and {out_prefix}.json (progress)

    The progress file records the row count, embedding dimension, model name and
    a SHA-256 fingerprint of the texts. A cached run is resumed only when all of
    these match the current call and the memmap file has the expected size;
    otherwise the cache is discarded and encoding restarts. Progress files
    written without these fields are therefore treated as stale once.

    Parameters
    ----------
    texts : strings to encode; missing values are encoded as "".
    out_prefix : path prefix for the two cache files.
    model_name : SentenceTransformer model identifier.

    Returns
    -------
    np.ndarray of shape (len(texts), dim), dtype float32, L2-normalised rows.
    """
    import hashlib  # local import: only needed for the cache fingerprint

    texts = texts.fillna("").astype(str)
    n = len(texts)

    prog_path = out_prefix + ".json"
    mmap_path = out_prefix + ".mmap"

    st = SentenceTransformer(model_name, device=DEVICE)

    # Determine embedding dim from a small probe batch (a dummy string if there is no input)
    probe_texts = texts.iloc[:4].tolist()
    probe = st.encode(probe_texts or [""], batch_size=4, convert_to_numpy=True, normalize_embeddings=True)
    dim = int(probe.shape[1])

    # Nothing to encode: a zero-row memmap cannot be created, so return early.
    if n == 0:
        return np.zeros((0, dim), dtype=np.float32)

    # Fingerprint of the exact inputs, so a cache built from different texts or a
    # different model is never resumed just because n and dim happen to match.
    digest = hashlib.sha256()
    for text in texts:
        digest.update(text.encode("utf-8", errors="replace"))
        digest.update(b"\x00")  # separator so row boundaries affect the hash
    meta = {"n": n, "dim": dim, "model": model_name, "texts_sha256": digest.hexdigest()}

    def save_progress(done_until: int) -> None:
        # Write to a temp file and rename, so an interrupted write cannot leave
        # a truncated progress file behind.
        tmp_path = prog_path + ".tmp"
        with open(tmp_path, "w") as f:
            json.dump({**meta, "done_until": int(done_until)}, f)
        os.replace(tmp_path, prog_path)

    # Load or init progress
    start_idx = 0
    if os.path.exists(prog_path) and os.path.exists(mmap_path):
        try:
            with open(prog_path, "r") as f:
                prog = json.load(f)
            done_until = int(prog.get("done_until", 0))
            expected_bytes = n * dim * np.dtype(np.float32).itemsize
            resumable = (
                all(prog.get(key) == value for key, value in meta.items())
                and 0 <= done_until <= n
                and os.path.getsize(mmap_path) == expected_bytes
            )
        except (OSError, ValueError, TypeError, AttributeError):
            # Unreadable or malformed progress file: treat as a mismatch.
            resumable = False

        if resumable:
            start_idx = done_until
        else:
            print("Progress file mismatch; restarting from scratch.")
            for stale_path in (prog_path, mmap_path):
                if os.path.exists(stale_path):
                    os.remove(stale_path)

    emb_mm = np.memmap(mmap_path, dtype="float32", mode="w+" if start_idx == 0 else "r+", shape=(n, dim))

    if start_idx == 0:
        # Reuse the probe embeddings for the first rows instead of re-encoding them.
        emb_mm[:probe.shape[0], :] = probe.astype(np.float32)
        start_idx = probe.shape[0]
        emb_mm.flush()
        save_progress(start_idx)

    for s in range(start_idx, n, CHUNK_SIZE):
        e = min(s + CHUNK_SIZE, n)
        print(f"Encoding {s}:{e} / {n} ...")
        chunk = texts.iloc[s:e].tolist()
        emb = st.encode(
            chunk, batch_size=BATCH_SIZE, show_progress_bar=True,
            convert_to_numpy=True, normalize_embeddings=True,
        ).astype(np.float32)
        emb_mm[s:e, :] = emb
        # Flush the data before recording progress, so the checkpoint never
        # points past rows that are actually on disk.
        emb_mm.flush()
        save_progress(e)

    # Copy into RAM and drop the memmap handle so the cache file is released.
    result = np.array(emb_mm)
    del emb_mm
    return result


# Build text & encode
train_texts = build_text(X_train)
test_texts  = build_text(X_test)

X_train_emb = encode_resumable(train_texts, os.path.join(CACHE_DIR, "train_emb"), EMB_MODEL)
X_test_emb  = encode_resumable(test_texts,  os.path.join(CACHE_DIR, "test_emb"),  EMB_MODEL)

print(f"X_train_emb: {X_train_emb.shape}")
print(f"X_test_emb : {X_test_emb.shape}")

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

Progress file mismatch; restarting from scratch.
Encoding 4:2004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 2004:4004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 4004:6004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 6004:8004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 8004:10004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 10004:12004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 12004:14004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 14004:16004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 16004:18004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 18004:20004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 20004:22004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 22004:24004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 24004:26004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 26004:28004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 28004:30004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 30004:32004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 32004:34004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 34004:36004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 36004:38004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 38004:40004 / 41164 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 40004:41164 / 41164 ...


Batches:   0%|          | 0/73 [00:00<?, ?it/s]



Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

Progress file mismatch; restarting from scratch.
Encoding 4:2004 / 10291 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 2004:4004 / 10291 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 4004:6004 / 10291 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 6004:8004 / 10291 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 8004:10004 / 10291 ...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Encoding 10004:10291 / 10291 ...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

X_train_emb: (41164, 384)
X_test_emb : (10291, 384)


In [11]:
# Build column names for the final (with-embedding) version:
# embedding dimensions first, then numeric features, then one-hot columns
emb_dim = X_train_emb.shape[1]
emb_col_names = [f"emb_{i}" for i in range(emb_dim)]
final_col_names = emb_col_names + num_cols + ohe_col_names

# Build DataFrames (blocks stacked in the same order as final_col_names)
X_train_final_df = pd.DataFrame(
    np.hstack([X_train_emb, X_train_num.astype(np.float32), X_train_cat]),
    columns=final_col_names,
)
X_test_final_df = pd.DataFrame(
    np.hstack([X_test_emb, X_test_num.astype(np.float32), X_test_cat]),
    columns=final_col_names,
)

# to_frame(name=...) always carries the values over. pd.DataFrame(series, columns=["text"])
# only relabels an unnamed Series; a Series with any other name would silently
# produce an all-NaN "text" column.
train_texts_df = train_texts.to_frame(name="text")
test_texts_df  = test_texts.to_frame(name="text")

# Save parquets
X_train_final_df.to_parquet(os.path.join(SPLITS_DIR, "X_train_final.parquet"), engine="fastparquet", index=False)
X_test_final_df.to_parquet(os.path.join(SPLITS_DIR, "X_test_final.parquet"),   engine="fastparquet", index=False)
train_texts_df.to_parquet(os.path.join(SPLITS_DIR, "train_texts.parquet"), engine="pyarrow", index=False)
test_texts_df.to_parquet(os.path.join(SPLITS_DIR, "test_texts.parquet"),   engine="pyarrow", index=False)

# Save column mapping (JSON list of column names, in order)
with open(os.path.join(SPLITS_DIR, "column_names_final.json"), "w") as f:
    json.dump(final_col_names, f)

print(f"✅ Section 2 saved — X_train_final: {X_train_final_df.shape}, X_test_final: {X_test_final_df.shape}")
print(f"   Columns: {final_col_names[:3]} ... {final_col_names[-3:]} ({len(final_col_names)} total)")

✅ Section 2 saved — X_train_final: (41164, 567), X_test_final: (10291, 567)
   Columns: ['emb_0', 'emb_1', 'emb_2'] ... ['primary_category_stat.ME', 'primary_category_stat.ML', 'primary_category_stat.OT'] (567 total)
