In [3]:
%pip install transformers wandb requests_cache datasets tqdm python-dotenv

Collecting python-dotenv
  Obtaining dependency information for python-dotenv from https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl.metadata
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
import wandb


# huggingface_hub.login(token=userdata.get('huggigface_token'))

try:
    from dotenv import load_dotenv
    # Load environment variables from .env file
    load_dotenv()

except ImportError as e:
    print(f"Error importing dotenv: {e}")


# Check if running in Colab
try:
    from google.colab import userdata
    # If running in Colab, use userdata.get to retrieve the token
    wandb.login(key=userdata.get('wandb_token'))

except ImportError:
    # If not in Colab, load the token from the environment variable
    wandb_token = os.getenv('WANDB_TOKEN')
    if wandb_token:
        wandb.login(key=wandb_token)
    else:
        print("W&B token not found in environment variable. Please set WANDB_TOKEN in your environment.")


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maxiom[0m ([33mchihuahuas[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\20mah/.netrc


## Download Data

In [3]:
import io
import os
import sys
import zipfile

import requests
import requests_cache
from tqdm import tqdm


zip_link = "http://www.mattmahoney.net/dc/enwik8.zip"
data_folder = "dataset"
cache_file = "download_cache"

# Ensure the data folder exists
if not os.path.exists(data_folder):
    os.makedirs(data_folder)

# Initialize requests_cache
requests_cache.install_cache(os.path.join(data_folder, cache_file))

# Download the ZIP file with progress bar
response = requests.get(zip_link, stream=True)
response.raise_for_status()

# Get the total file size for the progress bar
total_size = int(response.headers.get("content-length", 0))

# Open the ZIP file from the content
with open(os.path.join(data_folder, "enwik8.zip"), "wb") as file:
    with tqdm(
        total=total_size, unit="B", unit_scale=True, desc="Downloading"
    ) as pbar:
        for data in response.iter_content(chunk_size=1024):
            file.write(data)
            pbar.update(len(data))

# Open the cached file
with open(os.path.join(data_folder, "enwik8.zip"), "rb") as file:
    # Open the ZIP file from the content
    with zipfile.ZipFile(io.BytesIO(file.read())) as zip_file:
        # Extract all contents to the data folder
        zip_file.extractall(data_folder)

print("File downloaded and decompressed successfully.", file=sys.stderr)


Downloading: 100%|██████████| 36.4M/36.4M [00:00<00:00, 256MB/s]
File downloaded and decompressed successfully.


## Model

In [3]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

MODEL_ID = "google-t5/t5-small"
COMP_EMBED_DIM = 32
NUM_TOKENS = 16

model = T5ForConditionalGeneration.from_pretrained(MODEL_ID).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [30]:
import torch.nn as nn

pooling_layer = nn.Linear(model.encoder.config.d_model, COMP_EMBED_DIM * NUM_TOKENS).to(device)
unpooling_layer = nn.Linear(COMP_EMBED_DIM, model.encoder.config.d_model).to(device)

## Data

In [31]:
from datasets import load_dataset

dataset = load_dataset("text", data_files=["dataset/enwik8"])
dataset = dataset["train"]

In [37]:
LR = 1e-3

parameters = nn.ModuleDict({
    "model": model,
    "pooling_layer": pooling_layer,
    "unpooling_layer": unpooling_layer,
})

optimizer = torch.optim.Adam(parameters.parameters(), lr=LR)

In [40]:
wandb.init(
    name = "Vector Training",
    project="DETHCOD",
    config={
        "model_config": model.config.to_dict(),
    },
)

In [41]:
batch_size = 8
train_dataset = dataset.select(range(10000))
data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
import tqdm.auto as tqdm
from transformers import modeling_outputs

loss = 0

with tqdm.tqdm(data_loader) as pbar:
    for batch in pbar:
        input_ids = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
        encoder_output = model.encoder(input_ids=input_ids)
        hiddens = encoder_output.last_hidden_state
        pooled = pooling_layer(hiddens).mean(dim=-2)

        encoder_hidden_states = pooled.unflatten(-1, (NUM_TOKENS, COMP_EMBED_DIM))
        last_hidden_state = unpooling_layer(encoder_hidden_states)

        model_output = model.forward(
            # decoder_input_ids=input_ids,
            labels=input_ids,
            encoder_outputs=modeling_outputs.BaseModelOutput(
                last_hidden_state=last_hidden_state,
            ),
        )

        loss += model_output.loss * batch_size

        loss_item = model_output.loss.item()

        # TODO: Add oneshot_chance to logs
        wandb.log({
            "loss": loss_item,
        })

        pbar.set_description(f"loss={loss_item:.2f}")

        optimizer.zero_grad()
        model_output.loss.backward()
        optimizer.step()


  0%|          | 0/1250 [00:00<?, ?it/s]

In [None]:
wandb.finish()
loss / len(data_loader)

VBox(children=(Label(value='0.001 MB of 0.013 MB uploaded\r'), FloatProgress(value=0.0913915951723135, max=1.0…

0,1
loss,█▃▅▂▅▄▃▇▃▃▃▃▃▂▃▂▂▂▄▃▃▂▄▄▄▂▂▂▅▂▂▃▁▁▂▂▃▂▃▂

0,1
loss,0.50388


In [None]:
import random
sample = random.choice(dataset)

In [None]:
sample["text"]

In [None]:
input_ids = tokenizer(sample["text"], return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
encoder_output = model.encoder(input_ids=input_ids)
hiddens = encoder_output.last_hidden_state
pooled = pooling_layer(hiddens).mean(dim=-2)

In [None]:
assert model.decoder.config.d_model == COMP_EMBED_DIM, \
    "Giving the embeddings directly to the decoder"

In [None]:
encoder_hidden_states = pooled.unsqueeze(-2)

model_output = model.forward(
    # decoder_input_ids=input_ids,
    labels=input_ids,
    encoder_outputs=modeling_outputs.BaseModelOutput(
        last_hidden_state=encoder_hidden_states,
    ),
)

In [None]:
model_output.loss

tensor(11.0820, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
from transformers import modeling_outputs

In [None]:
encoder_hidden_states

tensor([[[-0.0570, -0.0515, -0.0880,  ...,  0.1152, -0.1156, -0.0326]],

        [[-0.0440, -0.0277, -0.0743,  ...,  0.0953, -0.0909, -0.0365]],

        [[-0.0681, -0.0219, -0.0568,  ...,  0.0959, -0.1128, -0.0434]],

        ...,

        [[ 0.0035, -0.0132, -0.0860,  ...,  0.1027, -0.0506,  0.0271]],

        [[-0.0552, -0.0189, -0.0668,  ...,  0.0937, -0.1011, -0.0230]],

        [[-0.0510, -0.0240, -0.0621,  ...,  0.1045, -0.0991, -0.0145]]],
       device='cuda:0', grad_fn=<UnsqueezeBackward0>)

In [None]:
model.generate(
    encoder_outputs=modeling_outputs.BaseModelOutput(
        last_hidden_state=encoder_hidden_states,
    ),
)



tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       device='cuda:0')