# 05 — Text Encoder (SentencePiece BPE + TransformerEncoder + chunk pooling)

Goal:
1) Train a text model **from scratch** (no pretrained weights) to predict `log_sold_price` from `description_text`.
2) Use SentencePiece BPE trained **only on TRAIN** text to avoid OOV inflation and preserve domain tokens (e.g., "$799k", "2br/1ba", "sqft").
3) Use **all text** via chunking: encode full description → split into fixed-length chunks → encode each chunk → pool chunk embeddings into a listing embedding.
4) Export per-listing features keyed by `listing_id`:
   - `txt_has`, `txt_pred_log`, and `txt_emb_###` columns
   These will be merged into tabular splits using `listing_id`.


In [None]:
!pip -q install sentencepiece tqdm

import os
import json
import math
import random
from pathlib import Path

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import sentencepiece as spm
from tqdm.auto import tqdm

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Device:", DEVICE)


Device: cuda


## Load processed splits + preprocessing summary

Load these files:
- `train_multimodal.csv`, `val_multimodal.csv`, `test_multimodal.csv`
- `multimodal_prep_summary.json` for the target column names:
  - `sold_price`
  - `log_sold_price`


In [None]:
from google.colab import drive
drive.mount("/content/drive")

# Project layout on the mounted Drive.
PROJECT_ROOT = Path("/content/drive/My Drive/SH")
DATA_DIR = PROJECT_ROOT / "data"
PROC_DIR = DATA_DIR / "processed"

# One processed CSV per split; paths and frames are built in train/val/test order.
train_path, val_path, test_path = (
    PROC_DIR / f"{split}_multimodal.csv" for split in ("train", "val", "test")
)
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)

# The preprocessing summary carries the names of the raw and log target columns.
summary_path = PROC_DIR / "multimodal_prep_summary.json"
with open(summary_path, "r") as summary_file:
    prep_summary = json.load(summary_file)

criteria = prep_summary["criteria"]
TARGET_RAW_COL = criteria["target_column"]      # "sold_price"
TARGET_LOG_COL = criteria["log_target_column"]  # "log_sold_price"

# Join key and text column used throughout this notebook.
ID_COL = "listing_id"
TEXT_COL = "description_text"

for label, value in (
    ("Target raw:", TARGET_RAW_COL),
    ("Target log:", TARGET_LOG_COL),
    ("ID col   :", ID_COL),
    ("Text col :", TEXT_COL),
):
    print(label, value)


Mounted at /content/drive
Train: (143643, 27) Val: (17955, 27) Test: (17956, 27)
Target raw: sold_price
Target log: log_sold_price
ID col   : listing_id
Text col : description_text


In [None]:
required = [ID_COL, TEXT_COL, TARGET_LOG_COL]
named_splits = (("train", train_df), ("val", val_df), ("test", test_df))

# Pass 1: fail fast if any split lacks a column this notebook depends on.
for name, df in named_splits:
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(f"[{name}] Missing required columns: {missing}")

# Pass 2: missing descriptions become empty strings so tokenisation never sees NaN.
for _, df in named_splits:
    df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str)

# Pass 3: drop rows without a target (in place, so train_df/val_df/test_df stay bound
# to the cleaned frames) and report how many were removed.
for name, df in named_splits:
    n_before = len(df)
    df.dropna(subset=[TARGET_LOG_COL], inplace=True)
    n_dropped = n_before - len(df)
    if n_dropped != 0:
        print(f"[{name}] Dropped {n_dropped} rows with missing {TARGET_LOG_COL}")

train_df.head(2)

Unnamed: 0,sold_price,log_sold_price,description_text,beds,full_baths,half_baths,sqft,year_built,days_on_mls,lot_sqft,...,parking_garage,new_construction,stories,county,sale_year,sale_date,property_id,listing_id,mls,mls_id
0,1475000.0,14.204169,Excellent investment and / or primary residenc...,6.0,4.0,,,1964.0,48.0,5227.0,...,2.0,False,2.0,Essex,2024,2024-07-25,9859620388,2968228851,GSNJ,3906230
1,385000.0,12.861001,UNDER CONTRACT CONTINUE TO SHOW. This charming...,3.0,1.0,,1108.0,1970.0,,7841.0,...,1.0,False,1.0,Ocean,2023,2023-07-20,5713444613,2956792349,MONJ,22316422


## Train SentencePiece BPE tokenizer (TRAIN only)

Train BPE on training descriptions only (leakage-safe). This typically handles:
- misspellings / rare words
- abbreviations ("2br/1ba", "sqft")
- punctuation patterns ("$799k")
- neighborhood names

Descriptions are **not** clipped: SentencePiece is trained on the full text of every training description.


In [None]:
OUT_DIR = PROC_DIR / "multimodal_features"
OUT_DIR.mkdir(parents=True, exist_ok=True)

SP_DIR = OUT_DIR / "sentencepiece_bpe"
SP_DIR.mkdir(parents=True, exist_ok=True)

# Write one training description per line (SentencePiece reads line-delimited text).
# Track the longest line in UTF-8 bytes: the trainer skips any line longer than
# `max_sentence_length` (4192 bytes by default), which would silently exclude the
# longest descriptions even though the intent is to train on the full text.
train_text_file = SP_DIR / "train_text.txt"
longest_line_bytes = 0
with open(train_text_file, "w", encoding="utf-8") as f:
    for t in train_df[TEXT_COL].tolist():
        line = t.replace("\n", " ").strip()
        longest_line_bytes = max(longest_line_bytes, len(line.encode("utf-8")))
        f.write(line + "\n")

VOCAB_SIZE = 16000 # Tried 32000 later, but doesn't improve the performance substantially.
PAD_ID = 0
BOS_ID = 1
EOS_ID = 2
UNK_ID = 3

MODEL_PREFIX = str(SP_DIR / f"sp_bpe_{VOCAB_SIZE}")

spm.SentencePieceTrainer.Train(
    input=str(train_text_file),
    model_prefix=MODEL_PREFIX,
    vocab_size=VOCAB_SIZE,
    model_type="bpe",
    character_coverage=1.0,
    pad_id=PAD_ID, bos_id=BOS_ID, eos_id=EOS_ID, unk_id=UNK_ID,
    input_sentence_size=200000,
    shuffle_input_sentence=True,
    # Never below the library default; raised only when a description needs it.
    max_sentence_length=max(4192, longest_line_bytes + 1),
)

sp = spm.SentencePieceProcessor()
sp.Load(MODEL_PREFIX + ".model")

# Quick look at how a typical listing phrase is segmented.
print("SP vocab size:", sp.GetPieceSize())
sample = "Bright 2BR/1BA near NYC! $799k, 1,050 sqft. W/D, A/C."
print("Example ids   :", sp.EncodeAsIds(sample)[:30])
print("Example pieces:", sp.EncodeAsPieces(sample)[:30])


SP vocab size: 16000
Example ids   : [2721, 102, 2077, 4445, 4419, 935, 937, 15920, 9810, 15940, 15940, 15898, 15891, 2857, 15907, 1596, 4387, 15895, 107, 15929, 15901, 15891, 63, 15929, 15900, 15895]
Example pieces: ['▁Bright', '▁2', 'BR', '/1', 'BA', '▁near', '▁NYC', '!', '▁$7', '9', '9', 'k', ',', '▁1,', '0', '50', '▁sqft', '.', '▁W', '/', 'D', ',', '▁A', '/', 'C', '.']


## Chunking strategy: use all text without global truncation

Transformers need bounded sequence length per forward pass. To use *all* text:
1) Encode full description into SentencePiece ids.
2) Split into chunks of length `MAX_LEN`.
3) Encode each chunk separately.
4) Pool chunk embeddings (mean pooling) to produce one listing embedding.

This ensures every token contributes (through its chunk) without dropping tail content.


In [None]:
MAX_LEN = 256     # tokens per chunk (not a global clip)
TXT_EMB_DIM = 128  # exported embedding size

def sp_encode(text: str):
    """Tokenise `text` with the trained SentencePiece model.

    `None` is treated as an empty string and any other non-string value is
    converted with `str()`. The returned list of ids is always wrapped as
    `[BOS_ID, ..., EOS_ID]`, so it is never empty.
    """
    if text is None:
        normalised = ""
    elif isinstance(text, str):
        normalised = text
    else:
        normalised = str(text)
    return [BOS_ID, *sp.EncodeAsIds(normalised), EOS_ID]

def chunk_ids(ids, max_len):
    """Split `ids` into consecutive chunks of exactly `max_len` tokens.

    The last chunk is right-padded with `PAD_ID`. An empty `ids` yields a
    single chunk made entirely of `PAD_ID`, so callers always get at least
    one chunk.
    """
    if len(ids) == 0:
        return [[PAD_ID] * max_len]
    windows = (ids[start:start + max_len] for start in range(0, len(ids), max_len))
    return [
        window if len(window) >= max_len else window + [PAD_ID] * (max_len - len(window))
        for window in windows
    ]

class ListingTextDataset(Dataset):
    """Per-listing dataset that tokenises and chunks the description on access.

    Each item is a dict with:
      - "listing_id": value of the `ID_COL` column
      - "y_log":      target from `TARGET_LOG_COL` as a Python float
      - "has_text":   1.0 if the description is a non-blank string, else 0.0
      - "chunks":     list of `MAX_LEN`-long token-id lists from `chunk_ids`
    """

    def __init__(self, df):
        # Keep a positionally indexed copy so integer idx maps to row position.
        self.df = df.reset_index(drop=True).copy()

        # Pull the three needed columns out once. Indexing `self.df.iloc[idx]` in
        # __getitem__ would build a full-width row Series on every fetch, which is
        # pure overhead repeated for every sample of every epoch.
        self._listing_ids = self.df[ID_COL].tolist()
        self._texts = self.df[TEXT_COL].tolist()
        self._targets = self.df[TARGET_LOG_COL].astype(float).tolist()

    def __len__(self):
        return len(self._listing_ids)

    def __getitem__(self, idx):
        text = self._texts[idx]

        # Tokenise the full description, then cut it into fixed-length chunks.
        chunks = chunk_ids(sp_encode(text), MAX_LEN)

        has_text = 1.0 if (isinstance(text, str) and len(text.strip()) > 0) else 0.0

        return {
            "listing_id": self._listing_ids[idx],
            "y_log": self._targets[idx],
            "has_text": has_text,
            "chunks": chunks
        }

def collate_batch(batch):
    """Flatten a list of dataset items into tensors for one forward pass.

    Chunks from all listings are stacked into a single [total_chunks, MAX_LEN]
    tensor; `chunk_to_listing[k]` is the position within the batch of the
    listing that chunk k belongs to. Targets and text flags are returned as
    float32 column vectors of shape [batch, 1].
    """
    def _column(key):
        # One float32 value per listing, shaped [batch, 1].
        return torch.tensor([item[key] for item in batch], dtype=torch.float32).unsqueeze(1)

    flat_chunks = [chunk for item in batch for chunk in item["chunks"]]
    owners = [pos for pos, item in enumerate(batch) for _ in item["chunks"]]

    return {
        "listing_ids": [item["listing_id"] for item in batch],
        "x": torch.tensor(flat_chunks, dtype=torch.long),              # [total_chunks, MAX_LEN]
        "chunk_to_listing": torch.tensor(owners, dtype=torch.long),
        "y_log": _column("y_log"),
        "has_text": _column("has_text")
    }


## Target normalization

Even though the target is log-transformed, it typically has a non-zero mean and non-unit variance.
Normalizing improves optimization stability (especially early epochs).

We:
- compute mean/std on TRAIN only
- train on normalized y
- invert predictions back to log space for reporting and export


In [None]:
# Normalisation statistics come from the TRAIN split only; the small epsilon
# keeps the later division by y_std safe.
train_target = train_df[TARGET_LOG_COL]
y_mean = float(train_target.mean())
y_std = float(train_target.std()) + 1e-8

for label, value in (("y_mean:", y_mean), ("y_std :", y_std)):
    print(label, value)


y_mean: 12.957889316832095
y_std : 0.685901248068598


## Model: TransformerEncoder + chunk pooling

Per chunk:
- token embedding
- positional embedding
- Transformer encoder layers
- masked mean pooling over tokens → chunk embedding
- projection → compact embedding

Per listing:
- mean pooling over chunk embeddings → listing embedding
- regression head → normalized target


In [None]:
class TextTransformerChunkPool(nn.Module):
    """Transformer text regressor that encodes fixed-length chunks and pools them per listing.

    Per chunk: token + learned positional embeddings -> TransformerEncoder ->
    masked mean over non-PAD tokens -> linear projection (ReLU, dropout).
    Per listing: mean of its chunk embeddings -> linear head.

    Args:
        vocab_size: size of the token-embedding table.
        max_len:    chunk length; `forward` requires inputs of exactly this width.
        d_model, n_heads, n_layers, ff_dim, dropout: encoder hyper-parameters.
        out_dim:    size of the projected chunk/listing embedding.
    """

    def __init__(self, vocab_size, max_len, d_model=192, n_heads=6, n_layers=3, ff_dim=512, dropout=0.15, out_dim=64):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.d_model = d_model

        self.tok_emb = nn.Embedding(vocab_size, d_model, padding_idx=PAD_ID)
        self.pos_emb = nn.Embedding(max_len, d_model)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)

        self.proj = nn.Sequential(
            nn.Linear(d_model, out_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )
        self.head = nn.Linear(out_dim, 1)

    @staticmethod
    def _mean_pool_chunks(chunk_emb, chunk_to_listing, n_listings):
        """Average chunk embeddings per listing, accumulating in float32.

        Under autocast the projection emits half-precision activations; summing
        many chunks in that dtype loses precision and leaves the exported
        embeddings quantised. Casting first also keeps `index_add_` operands in
        one dtype. Listings that receive no chunk come out as all zeros.
        """
        chunk_emb = chunk_emb.float()
        device = chunk_emb.device
        sums = torch.zeros((n_listings, chunk_emb.shape[1]), device=device, dtype=torch.float32)
        counts = torch.zeros((n_listings, 1), device=device, dtype=torch.float32)

        sums.index_add_(0, chunk_to_listing, chunk_emb)
        counts.index_add_(
            0, chunk_to_listing,
            torch.ones((chunk_emb.shape[0], 1), device=device, dtype=torch.float32),
        )
        return sums / counts.clamp(min=1.0)

    def forward(self, x, chunk_to_listing, n_listings):
        """Encode chunks and pool them into listing-level outputs.

        Args:
            x:                token ids, shape [total_chunks, max_len].
            chunk_to_listing: for each chunk, the row of the listing it belongs to.
            n_listings:       number of listings in the batch.

        Returns:
            (y_pred_norm, listing_emb): the head output of shape [n_listings, 1]
            and the pooled float32 listing embedding of shape [n_listings, out_dim].
        """
        total_chunks, T = x.shape
        assert T == self.max_len

        pos = torch.arange(T, device=x.device).unsqueeze(0).expand(total_chunks, T)
        h = self.tok_emb(x) + self.pos_emb(pos)                 # [C, T, D]

        pad_mask = (x == PAD_ID)                                # [C, T]

        # A chunk that is PAD everywhere would mask every attention key and can
        # turn the whole row into NaN. Un-mask such rows for attention only; they
        # still pool to a zero vector below because `keep` is built from pad_mask.
        fully_padded = pad_mask.all(dim=1, keepdim=True)        # [C, 1]
        attn_pad_mask = pad_mask & ~fully_padded
        h = self.encoder(h, src_key_padding_mask=attn_pad_mask) # [C, T, D]

        # Masked mean over real tokens; clamp avoids 0/0 for all-PAD chunks.
        keep = (~pad_mask).float()
        denom = keep.sum(dim=1, keepdim=True).clamp(min=1.0)    # [C, 1]
        chunk_raw = (h * keep.unsqueeze(-1)).sum(dim=1) / denom # [C, D]

        chunk_emb = self.proj(chunk_raw)                        # [C, out_dim]

        listing_emb = self._mean_pool_chunks(chunk_emb, chunk_to_listing, n_listings)

        y_pred_norm = self.head(listing_emb)                    # [B, 1]
        return y_pred_norm, listing_emb


## DataLoaders


In [None]:
BATCH_SIZE = 64

NUM_WORKERS = 0
PIN_MEMORY = (DEVICE == "cuda")

# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    collate_fn=collate_batch,
)

train_dl = DataLoader(ListingTextDataset(train_df), shuffle=True, **loader_kwargs)
val_dl = DataLoader(ListingTextDataset(val_df), shuffle=False, **loader_kwargs)
test_dl = DataLoader(ListingTextDataset(test_df), shuffle=False, **loader_kwargs)

print("Train batches:", len(train_dl), "Val batches:", len(val_dl), "Test batches:", len(test_dl))

Train batches: 2245 Val batches: 281 Test batches: 281


## Training + evaluation (AMP enabled on GPU)

Train on normalized targets using MSE loss.
Evaluate in log space (denormalized).

Also use:
- gradient clipping
- ReduceLROnPlateau scheduler
- early stopping by validation RMSE


In [None]:
def metrics(y_true, y_pred):
    """Return `(rmse, mae, r2)` for two arrays after flattening both to 1-D."""
    flat_true = y_true.reshape(-1)
    flat_pred = y_pred.reshape(-1)
    return (
        np.sqrt(mean_squared_error(flat_true, flat_pred)),
        mean_absolute_error(flat_true, flat_pred),
        r2_score(flat_true, flat_pred),
    )

# Gradient scaler from the torch.amp namespace (the older torch.cuda.amp spelling
# emits a deprecation warning). Scaling is switched on only when running on CUDA.
scaler = torch.amp.GradScaler(
    'cuda',
    enabled=(DEVICE == "cuda"),
)

@torch.no_grad()
def run_eval(dl, model):
    """Evaluate `model` over `dl` and return `(rmse, mae, r2)` in log-price space.

    The model predicts the normalised target; predictions are mapped back with
    `y_std` / `y_mean` before being compared with the raw `y_log` values.
    """
    model.eval()
    ys, ps = [], []

    for batch in dl:
        x = batch["x"].to(DEVICE)
        chunk_to_listing = batch["chunk_to_listing"].to(DEVICE)
        y_log = batch["y_log"].to(DEVICE)

        with torch.amp.autocast('cuda', enabled=(DEVICE == "cuda")):
            pred_norm, _ = model(x, chunk_to_listing, n_listings=y_log.shape[0])

        # Under autocast the head output is half precision. Denormalising in that
        # dtype would round log-prices near 13 to a coarse grid, so cast to
        # float32 before scaling and shifting.
        pred_log = pred_norm.float() * y_std + y_mean

        ys.append(y_log.cpu().numpy())
        ps.append(pred_log.cpu().numpy())

    y_all = np.vstack(ys)
    p_all = np.vstack(ps)
    return metrics(y_all, p_all)

def train_one_epoch(dl, model, optimizer, loss_fn):
    """Run one optimisation pass over `dl` and return the mean per-listing loss.

    Targets are normalised with `y_mean` / `y_std`. The forward pass runs under
    CUDA autocast when available; gradients are unscaled before being clipped
    to a norm of 1.0, then applied through the module-level `scaler`.
    """
    model.train()
    use_amp = (DEVICE == "cuda")
    loss_sum = 0.0
    seen = 0

    for batch in tqdm(dl, leave=False):
        tokens = batch["x"].to(DEVICE)
        owners = batch["chunk_to_listing"].to(DEVICE)
        target_norm = (batch["y_log"].to(DEVICE) - y_mean) / y_std
        batch_n = target_norm.shape[0]

        optimizer.zero_grad(set_to_none=True)

        with torch.amp.autocast('cuda', enabled=use_amp):
            pred_norm, _ = model(tokens, owners, n_listings=batch_n)
            loss = loss_fn(pred_norm, target_norm)

        # Order matters: unscale before clipping so the threshold applies to
        # true gradient magnitudes, then step and update the scale factor.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        # Weight the batch-mean loss by batch size so the epoch average is per listing.
        loss_sum += float(loss.item()) * batch_n
        seen += batch_n

    return loss_sum / max(seen, 1)


In [None]:
# Seed every RNG the run draws from before the model is built, so weight
# initialisation, dropout and batch shuffling start from a known state each time
# this cell is executed (GPU kernels may still be nondeterministic).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

vocab_size = sp.GetPieceSize()

model = TextTransformerChunkPool(
    vocab_size=vocab_size,
    max_len=MAX_LEN,
    d_model=192,
    n_heads=6,
    n_layers=3,
    ff_dim=512,
    dropout=0.20,
    out_dim=TXT_EMB_DIM
).to(DEVICE)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3, weight_decay=1e-4)
# Halve the learning rate once validation RMSE fails to improve for more than one epoch.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=1)
loss_fn = nn.MSELoss()

EPOCHS = 15
PATIENCE = 4  # epochs without a new best val RMSE before stopping

best_val_rmse = float("inf")
best_path = OUT_DIR / f"text_encoder_spbpe_best_vocab{VOCAB_SIZE}.pt"
bad_epochs = 0

for epoch in range(1, EPOCHS + 1):
    tr_loss = train_one_epoch(train_dl, model, optimizer, loss_fn)
    val_rmse, val_mae, val_r2 = run_eval(val_dl, model)

    # Log the learning rate that was used for this epoch (before the scheduler reacts).
    lr = optimizer.param_groups[0]["lr"]
    print(f"Epoch {epoch:02d} | lr={lr:.2e} | train_loss={tr_loss:.4f} | val_RMSE={val_rmse:.4f} | val_MAE={val_mae:.4f} | val_R2={val_r2:.4f}")

    scheduler.step(val_rmse)

    if val_rmse < best_val_rmse:
        # New best: checkpoint the weights and reset the early-stopping counter.
        best_val_rmse = val_rmse
        torch.save(model.state_dict(), best_path)
        bad_epochs = 0
    else:
        bad_epochs += 1
        if bad_epochs >= PATIENCE:
            print("Early stopping.")
            break

print("Best val RMSE:", best_val_rmse)
print("Saved:", best_path)




  0%|          | 0/2245 [00:00<?, ?it/s]

Epoch 01 | lr=2.00e-03 | train_loss=0.3520 | val_RMSE=0.3770 | val_MAE=0.2754 | val_R2=0.7024


  0%|          | 0/2245 [00:00<?, ?it/s]

Epoch 02 | lr=2.00e-03 | train_loss=0.2451 | val_RMSE=0.3535 | val_MAE=0.2622 | val_R2=0.7383


  0%|          | 0/2245 [00:00<?, ?it/s]

Epoch 03 | lr=2.00e-03 | train_loss=0.2197 | val_RMSE=0.3432 | val_MAE=0.2578 | val_R2=0.7534


  0%|          | 0/2245 [00:00<?, ?it/s]

Epoch 04 | lr=2.00e-03 | train_loss=0.1973 | val_RMSE=0.3556 | val_MAE=0.2697 | val_R2=0.7353


  0%|          | 0/2245 [00:00<?, ?it/s]

Epoch 05 | lr=2.00e-03 | train_loss=0.1793 | val_RMSE=0.3344 | val_MAE=0.2470 | val_R2=0.7659


  0%|          | 0/2245 [00:00<?, ?it/s]

Epoch 06 | lr=2.00e-03 | train_loss=0.1633 | val_RMSE=0.3320 | val_MAE=0.2462 | val_R2=0.7693


  0%|          | 0/2245 [00:00<?, ?it/s]

Epoch 07 | lr=2.00e-03 | train_loss=0.1478 | val_RMSE=0.3287 | val_MAE=0.2437 | val_R2=0.7738


  0%|          | 0/2245 [00:00<?, ?it/s]

Epoch 08 | lr=2.00e-03 | train_loss=0.1352 | val_RMSE=0.3389 | val_MAE=0.2518 | val_R2=0.7596


  0%|          | 0/2245 [00:00<?, ?it/s]

Epoch 09 | lr=2.00e-03 | train_loss=0.1219 | val_RMSE=0.3357 | val_MAE=0.2485 | val_R2=0.7641


  0%|          | 0/2245 [00:00<?, ?it/s]

Epoch 10 | lr=1.00e-03 | train_loss=0.0954 | val_RMSE=0.3320 | val_MAE=0.2453 | val_R2=0.7692


  0%|          | 0/2245 [00:00<?, ?it/s]

Epoch 11 | lr=1.00e-03 | train_loss=0.0819 | val_RMSE=0.3560 | val_MAE=0.2674 | val_R2=0.7346
Early stopping.
Best val RMSE: 0.3286656172228966
Saved: /content/drive/My Drive/SH/data/processed/multimodal_features/text_encoder_spbpe_best_vocab16000.pt


## Final evaluation (log space)


In [None]:
# Restore the best checkpoint. The file holds only a state_dict, so restrict
# unpickling to tensors and plain containers with weights_only=True.
model.load_state_dict(torch.load(best_path, map_location=DEVICE, weights_only=True))

train_rmse, train_mae, train_r2 = run_eval(train_dl, model)
val_rmse, val_mae, val_r2       = run_eval(val_dl, model)
test_rmse, test_mae, test_r2     = run_eval(test_dl, model)

# All three splits are reported at the same precision so they can be compared directly.
print("=== SP-BPE + Transformer performance (description_text only) ===")
print(f"Train RMSE: {train_rmse:.4f} | MAE: {train_mae:.4f} | R²: {train_r2:.4f}")
print(f"Val   RMSE: {val_rmse:.4f} | MAE: {val_mae:.4f} | R²: {val_r2:.4f}")
print(f"Test  RMSE: {test_rmse:.4f} | MAE: {test_mae:.4f} | R²: {test_r2:.4f}")


=== SP-BPE + Transformer performance (description_text only) ===
Train RMSE: 0.224 | MAE: 0.171 | R²: 0.894
Val   RMSE: 0.3287 | MAE: 0.2437 | R²: 0.7738
Test  RMSE: 0.3313 | MAE: 0.2470 | R²: 0.7701


## Export per-listing text features (for fusion)

- Run the trained SP-BPE + Transformer model over each split (train/val/test).
- For each listing, export:
  - `listing_id`
  - `txt_has`  (indicator for non-empty description)
  - `txt_pred_log`  (model prediction in log-price space)
  - `txt_emb_000 ... txt_emb_{TXT_EMB_DIM-1}`  (128-d text embedding)

Outputs are saved under `PROC_DIR / "multimodal_features"` and will be merged
into tabular splits using `listing_id` in the fusion notebook.


In [None]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm

# Seed value that is written into the exported metadata further down in this cell.
SEED = 42

# Output folder for the exported features; created if it does not exist yet.
OUT_DIR = PROC_DIR / "multimodal_features"
OUT_DIR.mkdir(parents=True, exist_ok=True)

@torch.no_grad()
def extract_split_features(dl, model, split_name: str):
    """
    Run model on an entire DataLoader and return a dataframe with:
      listing_id, txt_has, txt_pred_log, txt_emb_000..txt_emb_XXX
    One row per listing, in the order the loader yields them.

    `split_name` is only used to label the progress bar. An empty loader
    returns an empty frame with the three fixed columns.
    """
    model.eval()
    id_parts, has_parts, pred_parts, emb_parts = [], [], [], []

    for batch in tqdm(dl, desc=f"Extract {split_name}"):
        x = batch["x"].to(DEVICE, non_blocking=True)
        chunk_to_listing = batch["chunk_to_listing"].to(DEVICE, non_blocking=True)
        listing_ids = batch["listing_ids"]

        # Forward pass
        with torch.amp.autocast('cuda', enabled=(DEVICE == "cuda")):
            pred_norm, emb = model(x, chunk_to_listing, n_listings=len(listing_ids))

        # Denormalize in float32: under autocast pred_norm is half precision, and
        # scaling/shifting in that dtype would round the exported log-price.
        pred_log = pred_norm.float() * y_std + y_mean

        # Collect per-batch arrays as float64 (the dtype the exported columns use)
        # and assemble the frame once at the end instead of building one dict
        # entry per listing per embedding dimension.
        id_parts.extend(listing_ids)
        has_parts.append(batch["has_text"].numpy().reshape(-1).astype(np.float64))
        pred_parts.append(pred_log.cpu().numpy().reshape(-1).astype(np.float64))
        emb_parts.append(emb.float().cpu().numpy().astype(np.float64))

    if not emb_parts:
        return pd.DataFrame(columns=["listing_id", "txt_has", "txt_pred_log"])

    emb_all = np.concatenate(emb_parts, axis=0)
    base = pd.DataFrame({
        "listing_id": id_parts,
        "txt_has": np.concatenate(has_parts),
        "txt_pred_log": np.concatenate(pred_parts),
    })
    emb_frame = pd.DataFrame(
        emb_all,
        columns=[f"txt_emb_{j:03d}" for j in range(emb_all.shape[1])],
    )
    df_out = pd.concat([base, emb_frame], axis=1)

    # Safety checks
    assert df_out["listing_id"].isna().sum() == 0, "Found NaN listing_id in output"
    assert df_out["listing_id"].nunique() == len(df_out), "Duplicate listing_id rows in output"
    return df_out


# Extract features split by split (train, then val, then test).
txt_train = extract_split_features(train_dl, model, "train")
txt_val = extract_split_features(val_dl, model, "val")
txt_test = extract_split_features(test_dl, model, "test")

# Every artefact name carries the vocab size and embedding width.
feature_tag = f"spbpe_vocab{VOCAB_SIZE}_d{TXT_EMB_DIM}"
train_out = OUT_DIR / f"txt_features_train_{feature_tag}.csv"
val_out = OUT_DIR / f"txt_features_val_{feature_tag}.csv"
test_out = OUT_DIR / f"txt_features_test_{feature_tag}.csv"

for frame, destination in ((txt_train, train_out), (txt_val, val_out), (txt_test, test_out)):
    frame.to_csv(destination, index=False)

# Small JSON side-car describing how the features were generated.
tokenizer_meta = {
    "type": "sentencepiece_bpe_train_only",
    "vocab_size": VOCAB_SIZE,
    "model_file": str((OUT_DIR / "sentencepiece_bpe" / f"sp_bpe_{VOCAB_SIZE}.model")),
    "pad_id": PAD_ID,
    "bos_id": BOS_ID,
    "eos_id": EOS_ID,
    "unk_id": UNK_ID,
}
model_meta = {
    "arch": "transformer_chunk_pool",
    "max_len": MAX_LEN,
    "d_model": 192,
    "n_heads": 6,
    "n_layers": 3,
    "ff_dim": 512,
    "dropout": 0.20,
    "out_dim": TXT_EMB_DIM,
}
meta = {
    "id_col": ID_COL,
    "text_col": TEXT_COL,
    "target_log_col": TARGET_LOG_COL,
    "tokenizer": tokenizer_meta,
    "model": model_meta,
    "target_norm": {
        "y_mean": y_mean,
        "y_std": y_std,
    },
    "seed": SEED,
}

meta_path = OUT_DIR / f"txt_features_meta_{feature_tag}.json"
with open(meta_path, "w") as meta_file:
    json.dump(meta, meta_file, indent=2)

for label, location in (
    ("Saved train features to:", train_out),
    ("Saved val   features to:", val_out),
    ("Saved test  features to:", test_out),
    ("Saved meta  to:", meta_path),
):
    print(label, location)
print("Shapes -> train:", txt_train.shape, "val:", txt_val.shape, "test:", txt_test.shape)


Extract train:   0%|          | 0/2245 [00:00<?, ?it/s]

Extract val:   0%|          | 0/281 [00:00<?, ?it/s]

Extract test:   0%|          | 0/281 [00:00<?, ?it/s]

Saved train features to: /content/drive/My Drive/SH/data/processed/multimodal_features/txt_features_train_spbpe_vocab16000_d128.csv
Saved val   features to: /content/drive/My Drive/SH/data/processed/multimodal_features/txt_features_val_spbpe_vocab16000_d128.csv
Saved test  features to: /content/drive/My Drive/SH/data/processed/multimodal_features/txt_features_test_spbpe_vocab16000_d128.csv
Saved meta  to: /content/drive/My Drive/SH/data/processed/multimodal_features/txt_features_meta_spbpe_vocab16000_d128.json
Shapes -> train: (143643, 131) val: (17955, 131) test: (17956, 131)


## Sanity check: merge text features into training tabular split

Demonstrate merging the exported text features back into the training
data using `listing_id`. This is exactly the pattern I will use in the
fusion (TabNet + text) notebook.


In [None]:
# Reload train_df if needed (or reuse from earlier cell)
train_df = pd.read_csv(PROC_DIR / "train_multimodal.csv")

txt_train_feat = pd.read_csv(train_out)

# validate="many_to_one" makes pandas raise if the feature file ever contains a
# repeated listing_id, which would otherwise silently duplicate training rows.
merged = train_df.merge(txt_train_feat, on="listing_id", how="left", validate="many_to_one")
print("Train before merge:", train_df.shape)
print("Train after  merge:", merged.shape)

# Check embedding coverage + a couple of columns
emb_cols = [c for c in merged.columns if c.startswith("txt_emb_")]
print("Number of embedding dims:", len(emb_cols))
print("Fraction of rows with any missing txt_emb:", merged[emb_cols].isna().any(axis=1).mean())
print("txt_has mean:", merged["txt_has"].mean())

merged.head(3)


Train before merge: (143643, 27)
Train after  merge: (143643, 157)
Number of embedding dims: 128
Fraction of rows with any missing txt_emb: 0.0
txt_has mean: 1.0


Unnamed: 0,sold_price,log_sold_price,description_text,beds,full_baths,half_baths,sqft,year_built,days_on_mls,lot_sqft,...,txt_emb_118,txt_emb_119,txt_emb_120,txt_emb_121,txt_emb_122,txt_emb_123,txt_emb_124,txt_emb_125,txt_emb_126,txt_emb_127
0,1475000.0,14.204169,Excellent investment and / or primary residenc...,6.0,4.0,,,1964.0,48.0,5227.0,...,5.15625,0.0,0.0,4.6875,1.136719,0.0,0.0,0.0,0.0,0.0
1,385000.0,12.861001,UNDER CONTRACT CONTINUE TO SHOW. This charming...,3.0,1.0,,1108.0,1970.0,,7841.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.188477,0.0,0.0
2,245000.0,12.409018,GREAT VALUE on this Cherry and Bright Two Stor...,4.0,3.0,1.0,2408.0,2013.0,,9894.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.082031,0.0,0.0
