In [3]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, normalize

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [5]:
path = "../data/users_fingerprint_norm.csv"
df_user_fp = pd.read_csv(path)
df_user_fp.head()

Unnamed: 0.1,Unnamed: 0,user_id,max_history_seen,vocab_size,learning_speed,lexeme_0,lexeme_1,lexeme_2,lexeme_3,lexeme_4,...,lexeme_2805_seen,lexeme_2806_seen,lexeme_2807_seen,lexeme_2808_seen,lexeme_2809_seen,lexeme_2810_seen,lexeme_2811_seen,lexeme_2812_seen,lexeme_2813_seen,lexeme_2814_seen
0,0,u:0X2,-0.333881,-0.62328,1.458221,0.999994,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,u:0b,-0.405127,-0.580715,-0.690058,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,u:0xw,-0.215139,0.781374,-0.905419,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,u:1EH,-0.35763,-0.013178,-0.916872,0.999992,0.0,0.0,0.0,0.999987,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,u:1gx,-0.333881,0.256403,0.165261,0.999994,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
lex_cols = [c for c in df_user_fp.columns if c.startswith("lexeme_")]
X = df_user_fp[lex_cols]

n_rows, n_cols = X.shape
nnz = (X.to_numpy() != 0).sum()
total = n_rows * n_cols

density = nnz / total
sparsity = 1 - density

print("shape:", X.shape)
print("nnz:", nnz)
print("density:", density)
print("sparsity:", sparsity)

shape: (2709, 5630)
nnz: 286768
density: 0.01880239999947547
sparsity: 0.9811976000005246


In [35]:
df = df_user_fp.copy()

# ---- 1) choose columns ----
beh_cols = ["max_history_seen", "vocab_size", "learning_speed"]

seen_cols = [c for c in df.columns if c.startswith("lexeme_") and c.endswith("_seen")]
ability_cols = [c for c in df.columns if c.startswith("lexeme_") and (not c.endswith("_seen"))]

# ability_cols do not contain user_id/beh_cols
ability_cols = [c for c in ability_cols if c not in ["user_id"] + beh_cols]

# ---- 2) behavioral ----
B = df[beh_cols].astype(float).fillna(0)
B = B.clip(-5, 5).to_numpy()
w_beh = 0.5
B = w_beh * B

# ---- 3) ability SVD ----
X_ability = df[ability_cols].astype(float).fillna(0).to_numpy()
X_ability_csr = sparse.csr_matrix(X_ability)

svd_ability = TruncatedSVD(n_components=64, random_state=42)
E_ability = svd_ability.fit_transform(X_ability_csr)

# ---- 4) seen SVD (log1p) ----
X_seen = df[seen_cols].astype(float).fillna(0).to_numpy()
X_seen = np.log1p(X_seen) 
X_seen_csr = sparse.csr_matrix(X_seen)

svd_seen = TruncatedSVD(n_components=32, random_state=42)
E_seen = svd_seen.fit_transform(X_seen_csr)

w_seen = 0.5
E_seen = w_seen * E_seen

# ---- 5) concenation ----
E = np.hstack([E_ability, E_seen, B])
E = normalize(E, norm="l2")

out = pd.DataFrame(E, columns=[f"emb_{i}" for i in range(E.shape[1])])
out.insert(0, "user_id", df["user_id"].values)
out.to_csv("user_embedding_svd_ability_seen_beh.csv", index=False)

print("ability explained variance sum:", svd_ability.explained_variance_ratio_.sum())
print("seen explained variance sum:", svd_seen.explained_variance_ratio_.sum())
print("embedding shape:", E.shape)

ability explained variance sum: 0.7925901896845668
seen explained variance sum: 0.7763381618817947
embedding shape: (2709, 99)


In [36]:
# SVD(512) -> small AutoEncoder(128) on lexeme blocks
# then late-fusion with behavioral cols (max_history_seen, vocab_size, learning_speed)
# and export CSV: user_id + embedding columns

# -----------------------------
# Config
# -----------------------------
RANDOM_STATE = 42

ABILITY_SVD_DIM = 384
SEEN_SVD_DIM = 128            # 384+128 = 512
AE_HIDDEN_DIM = 256
AE_EMB_DIM = 128

W_SEEN = 0.5                  # down-weight exposure channel
W_BEH = 0.5                   # down-weight behavioral features in final embedding

BATCH_SIZE = 128
LR = 1e-3
WEIGHT_DECAY = 1e-4
DROPOUT = 0.1
PATIENCE = 15
MAX_EPOCHS = 200

BEH_CLIP = 5.0                # clip behavioral z-scores to [-5, 5]

# If you're NOT sure the behavioral columns are already standardized,
# set this to True to StandardScale them.
STANDARDIZE_BEH = False


# -----------------------------
# Helpers
# -----------------------------
class AE(nn.Module):
    def __init__(self, in_dim=512, hid=256, emb=128, dropout=0.1):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(in_dim, hid),
            nn.LayerNorm(hid),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hid, emb),
        )
        self.decoder = nn.Sequential(
            nn.Linear(emb, hid),
            nn.LayerNorm(hid),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hid, in_dim),
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat, z


def make_lexeme_cols(df: pd.DataFrame, beh_cols, user_col="user_id"):
    seen_cols = [c for c in df.columns if c.startswith("lexeme_") and c.endswith("_seen")]
    ability_cols = [c for c in df.columns if c.startswith("lexeme_") and (not c.endswith("_seen"))]
    ability_cols = [c for c in ability_cols if c not in ([user_col] + beh_cols)]
    return ability_cols, seen_cols


# -----------------------------
# Main pipeline
# -----------------------------
def build_embeddings_svd_ae_late_fusion(df_user_fp: pd.DataFrame) -> pd.DataFrame:
    df = df_user_fp.copy()

    # Basic checks
    beh_cols = ["max_history_seen", "vocab_size", "learning_speed"]
    required = ["user_id"] + beh_cols
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    ability_cols, seen_cols = make_lexeme_cols(df, beh_cols)

    if len(ability_cols) == 0 or len(seen_cols) == 0:
        raise ValueError(
            f"Could not find lexeme columns. ability_cols={len(ability_cols)}, seen_cols={len(seen_cols)}. "
            "Expected columns like lexeme_0 and lexeme_0_seen."
        )

    # -----------------------
    # 1) Build 512-d input Z
    # -----------------------
    # ability block -> SVD
    X_ability = df[ability_cols].fillna(0).astype(np.float32).to_numpy()
    X_ability_csr = sparse.csr_matrix(X_ability)
    svd_a = TruncatedSVD(n_components=ABILITY_SVD_DIM, random_state=RANDOM_STATE)
    Za = svd_a.fit_transform(X_ability_csr).astype(np.float32)

    # seen block -> log1p -> SVD
    X_seen = df[seen_cols].fillna(0).astype(np.float32).to_numpy()
    X_seen = np.log1p(X_seen)
    X_seen_csr = sparse.csr_matrix(X_seen)
    svd_s = TruncatedSVD(n_components=SEEN_SVD_DIM, random_state=RANDOM_STATE)
    Zs = svd_s.fit_transform(X_seen_csr).astype(np.float32)

    Z = np.hstack([Za, W_SEEN * Zs]).astype(np.float32)  # (n_users, 512)

    # Standardize Z for easier AE training
    Z = StandardScaler().fit_transform(Z).astype(np.float32)

    # -----------------------
    # 2) Train AE on Z
    # -----------------------
    X_train, X_val = train_test_split(Z, test_size=0.2, random_state=RANDOM_STATE)

    train_loader = DataLoader(
        TensorDataset(torch.from_numpy(X_train)),
        batch_size=BATCH_SIZE,
        shuffle=True,
        drop_last=False,
    )
    val_loader = DataLoader(
        TensorDataset(torch.from_numpy(X_val)),
        batch_size=max(256, BATCH_SIZE),
        shuffle=False,
        drop_last=False,
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AE(in_dim=ABILITY_SVD_DIM + SEEN_SVD_DIM, hid=AE_HIDDEN_DIM, emb=AE_EMB_DIM, dropout=DROPOUT).to(device)

    opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    crit = nn.MSELoss()

    best_val = float("inf")
    best_state = None
    bad = 0

    for epoch in range(1, MAX_EPOCHS + 1):
        model.train()
        tr_loss = 0.0
        for (xb,) in train_loader:
            xb = xb.to(device)
            x_hat, _ = model(xb)
            loss = crit(x_hat, xb)
            opt.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            tr_loss += loss.item() * xb.size(0)
        tr_loss /= len(train_loader.dataset)

        model.eval()
        va_loss = 0.0
        with torch.no_grad():
            for (xb,) in val_loader:
                xb = xb.to(device)
                x_hat, _ = model(xb)
                loss = crit(x_hat, xb)
                va_loss += loss.item() * xb.size(0)
        va_loss /= len(val_loader.dataset)

        if va_loss < best_val - 1e-5:
            best_val = va_loss
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= PATIENCE:
                break

    if best_state is None:
        raise RuntimeError("Training failed to produce a best model state.")
    model.load_state_dict(best_state)
    model.eval()

    with torch.no_grad():
        Z_tensor = torch.from_numpy(Z).to(device)
        _, emb_ae = model(Z_tensor)
        emb_ae = emb_ae.detach().cpu().numpy().astype(np.float32)  # (n_users, 128)

    # -----------------------
    # 3) Late-fusion with behavioral cols
    # -----------------------
    B = df[beh_cols].astype(np.float32).fillna(0)

    if STANDARDIZE_BEH:
        B = StandardScaler().fit_transform(B.to_numpy()).astype(np.float32)
    else:
        B = B.to_numpy()

    # clip and weight
    B = np.clip(B, -BEH_CLIP, BEH_CLIP)
    B = (W_BEH * B).astype(np.float32)

    # concat + L2 normalize (good for cosine similarity / clustering)
    emb_final = np.hstack([emb_ae, B]).astype(np.float32)  # (n_users, 131)
    emb_final = normalize(emb_final, norm="l2").astype(np.float32)

    # -----------------------
    # 4) Output DataFrame
    # -----------------------
    out = pd.DataFrame(emb_final, columns=[f"emb_{i}" for i in range(emb_final.shape[1])])
    out.insert(0, "user_id", df["user_id"].values)

    # optional: print some diagnostics
    print("Users:", len(df))
    print("Ability cols:", len(ability_cols), "Seen cols:", len(seen_cols))
    print("SVD explained var sum (ability):", float(svd_a.explained_variance_ratio_.sum()))
    print("SVD explained var sum (seen):", float(svd_s.explained_variance_ratio_.sum()))
    print("AE best val loss:", float(best_val))
    print("Final embedding shape:", out.shape)

    return out


# -----------------------------
# Run + save
# -----------------------------
# Make sure df_user_fp exists in your environment
emb_df = build_embeddings_svd_ae_late_fusion(df_user_fp)
emb_df.to_csv("user_embedding_svd512_ae128_plus_beh.csv", index=False)

Users: 2709
Ability cols: 2815 Seen cols: 2815
SVD explained var sum (ability): 0.9848873615264893
SVD explained var sum (seen): 0.9230934977531433
AE best val loss: 0.8869494644038352
Final embedding shape: (2709, 132)
