In [1]:
# Downgrade NumPy so it stays compatible with captum & SageMaker
!pip install --upgrade --force-reinstall numpy==1.26.4

# Force-reinstall the nightly 2.6.x CUDA-12.1 builds for torch, torchvision, torchaudio
!pip install --upgrade --pre --force-reinstall --no-cache-dir \
    torch torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/nightly/cu121

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.1.2
    Uninstalling numpy-2.1.2:
      Successfully uninstalled numpy-2.1.2
Successfully installed numpy-1.26.4
Looking in indexes: https://download.pytorch.org/whl/nightly/cu121
Collecting torch
  Downloading https://download.pytorch.org/whl/nightly/cu121/torch-2.6.0.dev20241112%2Bcu121-cp310-cp310-linux_x86_64.whl (767.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m767.9/767.9 MB[0m [31m449.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/nightly/cu121/torchvision-0.20.0.dev20241112%2Bcu121-cp310-cp310-linux_x86_64.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [3]:
!pip install sentence-transformers transformers vec2text safetensors

Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.5.1
    Uninstalling fsspec-2025.5.1:
      Successfully uninstalled fsspec-2025.5.1
Successfully installed fsspec-2025.3.0


In [2]:
import numpy
import torch
import torchvision
import torchaudio

# Sanity check of what the kernel actually imports after the install cells:
# one line per package, with the CUDA build and GPU availability appended to
# the torch line.
version_report = [
    ("numpy:     ", (numpy.__version__,)),
    ("torch:     ", (torch.__version__,
                     "| CUDA:", torch.version.cuda,
                     "| available:", torch.cuda.is_available())),
    ("torchvision:", (torchvision.__version__,)),
    ("torchaudio: ", (torchaudio.__version__,)),
]
for label, fields in version_report:
    print(label, *fields)

numpy:      2.1.2
torch:      2.6.0.dev20241112+cu121 | CUDA: 12.1 | available: True
torchvision: 0.20.0.dev20241112+cu121
torchaudio:  2.5.0.dev20241112+cu121


In [1]:
# 1) Patch out the CVE safety check
# SECURITY NOTE(review): replacing check_torch_load_is_safe with a no-op turns off
# the guard transformers runs before loading checkpoints. Only load checkpoints you
# trust, and remove this patch once the installed torch build passes the check.
import transformers.utils.import_utils    as _imp_utils
import transformers.modeling_utils         as _imp_modeling

_imp_utils.check_torch_load_is_safe   = lambda: None
_imp_modeling.check_torch_load_is_safe = lambda: None

# 2) Imports
from vec2text import analyze_utils, invert_embeddings
from sentence_transformers import SentenceTransformer
import torch

# 3) Setup embedder
device   = torch.device("cuda")
embedder = SentenceTransformer("sentence-transformers/gtr-t5-base").to(device)

# 4) Load the inversion trainer
exp, trainer = analyze_utils.load_experiment_and_trainer_from_pretrained(
    "jxm/gtr__nq__32__correct"
)

# 5) Encode + perturb
input_text = "Deconstructivism"
embedding  = embedder.encode([input_text], convert_to_tensor=True).to(device)
modified   = embedding.clone()   # work on a copy so `embedding` keeps the unperturbed vector
modified[0, 0] += 0.01           # nudge only the first coordinate of the single row

# 6a) Single shot: invert with the library defaults (no num_steps / beam width given).
# Kept under its own name so the result stays available for comparison with 6b.
decoded_single = invert_embeddings(
    embeddings = modified,
    corrector  = trainer
)
print("Decoded output single shot:", decoded_single)

# 6b) Invert embeddings again — num_steps and sequence_beam_width passed as keyword args
decoded_output = invert_embeddings(
    embeddings           = modified,
    corrector            = trainer,
    num_steps            = 100,   # run 100 inversion iterations
    sequence_beam_width  = 5      # beam size
)

print("Decoded output (beam/steps):", decoded_output)


ModuleNotFoundError: No module named 'transformers'

In [1]:
# ─── 0) Replace the CVE safety check with a no-op in both modules that expose it ──
import transformers.utils.import_utils as _imp_utils
import transformers.modeling_utils as _imp_modeling

for _patched_module in (_imp_utils, _imp_modeling):
    setattr(_patched_module, "check_torch_load_is_safe", lambda: None)

# ─── 1) Libraries ───────────────────────────────────────────────────────
import torch
from sentence_transformers import SentenceTransformer
from vec2text import analyze_utils, invert_embeddings

# ─── 2) Embedding model on the GPU ──────────────────────────────────────
device   = torch.device("cuda")
embedder = SentenceTransformer("sentence-transformers/gtr-t5-base")
embedder = embedder.to(device)

# ─── 3) Pretrained inversion trainer ────────────────────────────────────
exp, trainer = analyze_utils.load_experiment_and_trainer_from_pretrained(
    "jxm/gtr__nq__32__correct"
)

# ─── 4) Embed the two concepts (one row per text) ───────────────────────
texts  = ["Deconstructivism", "Brutalism"]
embeds = embedder.encode(texts, convert_to_tensor=True)
embeds = embeds.to(device)

# ─── 5) Midpoint: element-wise average of the two rows ──────────────────
first_vec, second_vec = embeds[0], embeds[1]
mid = (first_vec + second_vec) / 2

# ─── 6) Invert the midpoint (100 steps, beam width 5) ───────────────────
# unsqueeze(0) adds the leading batch axis before the vector is handed over.
decoded_mid = invert_embeddings(
    embeddings=mid.unsqueeze(0),
    corrector=trainer,
    num_steps=100,
    sequence_beam_width=5,
)
print("Midpoint inversion:", decoded_mid)

Set num workers to 1
Experiment output_dir = saves/jxm__gtr__nq__32__correct
Set num workers to 1
Experiment output_dir = saves/jxm__gtr__nq__32
Loading datasets with TOKENIZERS_PARALLELISM = False
loading train dataset from path: /home/ec2-user/.cache/inversion/dd0d97ad14fd6897b0d31cecc2e14d13.arrow
loaded dict of val datasets from /home/ec2-user/.cache/inversion/8a11157c2dba245e22bfdea7946e149e.arrow


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Midpoint inversion: ['etymology at the .hudalinwood. This code points out a new etymology—']


In [1]:
# ─── 0) Disable the torch.load CVE safety check ───────────────────────────────
# SECURITY NOTE(review): replacing check_torch_load_is_safe with a no-op turns off
# the guard transformers runs before loading checkpoints. Only load checkpoints you
# trust, and remove this patch once the installed torch build passes the check.
import transformers.utils.import_utils    as _imp_utils
import transformers.modeling_utils         as _imp_modeling

_imp_utils.check_torch_load_is_safe   = lambda: None
_imp_modeling.check_torch_load_is_safe = lambda: None

# ─── 1) Core imports & setup ─────────────────────────────────────────────────
# All remaining imports live here, once each (pandas used to be imported twice).
from itertools import product

import numpy as np
import pandas as pd
import torch
from sklearn.decomposition import PCA
from vec2text import analyze_utils, invert_embeddings
from sentence_transformers import SentenceTransformer

device   = torch.device("cuda")
embedder = SentenceTransformer("sentence-transformers/gtr-t5-base").to(device)

# ─── 2) Load your inversion trainer (this will stream in the shards) ─────────
exp, trainer = analyze_utils.load_experiment_and_trainer_from_pretrained(
    "jxm/gtr__nq__32__correct"
)

# ─── 3) The PCA‐grid experiment ───────────────────────────────────────────────

# 3.1) Choose your cluster of styles
styles = ["Gothic", "Renaissance", "Baroque", "Brutalism", "Deconstructivism"]
embs   = embedder.encode(styles, convert_to_tensor=True).cpu().numpy()  # (5,768)

# 3.2) Fit PCA → 2 components
pca    = PCA(n_components=2)
coords = pca.fit_transform(embs)  # (5,2)

# 3.3) Build a grid_steps × grid_steps grid around the centroid
centroid   = coords.mean(axis=0)  # (2,)
span       = 1.0                  # half-width of the grid in PCA coordinates
grid_steps = 3                    # points per axis → 3×3 = 9 grid points
offsets    = np.linspace(-span, span, grid_steps)  # computed once, shared by both axes
# product() walks dx in the outer position and dy in the inner one, i.e. the same
# ordering as a nested `for dx: for dy:` loop.
grid_pts = centroid + np.array(list(product(offsets, offsets)))  # (9,2)

# 3.4) Project back to 768-D and cast to float32
grid_embs = pca.inverse_transform(grid_pts)      # numpy float64 (9×768)
grid_embs = torch.from_numpy(grid_embs).to(device).float()

# 3.5) Invert each point
results = []
for (px, py), ge in zip(grid_pts, grid_embs):
    decoded = invert_embeddings(
        ge.unsqueeze(0),      # [1,768] float32
        trainer,
        num_steps=50,
        sequence_beam_width=3
    )
    results.append({
        "px": float(px),
        "py": float(py),
        "decoded": decoded[0]   # a single embedding was passed, so take the first entry
    })

# 3.6) Show as DataFrame
df = pd.DataFrame(results)
df

Set num workers to 1
Experiment output_dir = saves/jxm__gtr__nq__32__correct
Set num workers to 1
Experiment output_dir = saves/jxm__gtr__nq__32
Loading datasets with TOKENIZERS_PARALLELISM = False
loading train dataset from path: /home/ec2-user/.cache/inversion/dd0d97ad14fd6897b0d31cecc2e14d13.arrow
loaded dict of val datasets from /home/ec2-user/.cache/inversion/8a11157c2dba245e22bfdea7946e149e.arrow


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,px,py,decoded
0,-1.0,-1.0,relationship to Sarah Kiams song scri...
1,-1.0,-8.195639e-09,chapter plan to be special in macave...
2,-1.0,1.0,50th party mafia bisection: create and service...
3,1.192093e-08,-1.0,(with information on this
4,1.192093e-08,-8.195639e-09,a .k.a.k.a. ELDwoods at the mason (author 13) ...
5,1.192093e-08,1.0,20th generation culling cube. br>Solve the cul...
6,1.0,-1.0,"– a combination of the events at EDIN and Thu""..."
7,1.0,-8.195639e-09,– evinced at the TLD@Calmwood. The second floo...
8,1.0,1.0,sixth year at the “CastleBoyle” (16–17). Goode...
