In [1]:
# Install the third-party packages imported by the cells below
# (transformers, wandb, requests_cache, datasets, tqdm) into the kernel's environment.
%pip install transformers wandb requests_cache datasets tqdm



In [2]:
import wandb
from google.colab import userdata

# Hugging Face Hub login is currently disabled (kept for reference).
# NOTE(review): the secret name below looks misspelled ("huggigface") — confirm
# against the Colab secrets panel before re-enabling.
# huggingface_hub.login(token=userdata.get('huggigface_token'))

# Fetch the Weights & Biases API key from the Colab secret store, then authenticate.
# The login call stays the last expression so its return value is shown as cell output.
wandb_api_key = userdata.get('wandb_token')
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: Currently logged in as: [33maxiom[0m ([33mchihuahuas[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Download Data

In [3]:
import torch
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
import io
import os
import sys
import zipfile

import requests
import requests_cache
from tqdm import tqdm


zip_link = "http://www.mattmahoney.net/dc/enwik8.zip"
data_folder = "dataset"
cache_file = "download_cache"
zip_path = os.path.join(data_folder, "enwik8.zip")

# Ensure the data folder exists (no-op when it is already there).
os.makedirs(data_folder, exist_ok=True)

# Only hit the network when the archive is missing, so re-running the cell is cheap.
if not os.path.exists(zip_path):
    # Stream into a temporary file first: an interrupted download must not leave a
    # truncated "enwik8.zip" behind that a later run would mistake for a complete one.
    partial_path = zip_path + ".part"

    # Use a *scoped* cached session instead of requests_cache.install_cache():
    # install_cache() monkey-patches `requests` globally, which would also cache the
    # HTTP traffic of every other library in this kernel that uses `requests`.
    with requests_cache.CachedSession(os.path.join(data_folder, cache_file)) as session:
        with session.get(zip_link, stream=True) as response:
            response.raise_for_status()

            # Total size for the progress bar; None lets tqdm render an open-ended
            # bar when the server does not send Content-Length.
            total_size = int(response.headers.get("content-length", 0)) or None

            with open(partial_path, "wb") as file, tqdm(
                total=total_size, unit="B", unit_scale=True, desc="Downloading"
            ) as pbar:
                # 1 MiB chunks: far fewer Python-level iterations than 1 KiB chunks.
                for data in response.iter_content(chunk_size=1024 * 1024):
                    file.write(data)
                    pbar.update(len(data))

    # Atomically publish the finished download under its final name.
    os.replace(partial_path, zip_path)

# Extract straight from the file on disk; zipfile can seek in it directly, so there
# is no need to load the whole archive into memory first.
with zipfile.ZipFile(zip_path) as zip_file:
    zip_file.extractall(data_folder)

print("File downloaded and decompressed successfully.", file=sys.stderr)


Downloading: 100%|██████████| 36.4M/36.4M [00:00<00:00, 241MB/s]
File downloaded and decompressed successfully.


## Model

In [5]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

# Hub checkpoint used for both the tokenizer and the seq2seq model.
MODEL_ID = "google-t5/t5-small"
# Width of the compressed embedding produced by the pooling layer.
COMP_EMBED_DIM = 512

# Load the tokenizer and the pretrained weights, then move the model to the chosen device.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = T5ForConditionalGeneration.from_pretrained(MODEL_ID)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
import torch.nn as nn

# Linear projection from the encoder's hidden size to the compressed embedding size,
# placed on the same device as the model.
pooling_layer = nn.Linear(
    in_features=model.encoder.config.d_model,
    out_features=COMP_EMBED_DIM,
)
pooling_layer = pooling_layer.to(device)

## Data

In [7]:
from datasets import load_dataset

# Load the extracted enwik8 file with the "text" builder and keep its "train" split.
raw_splits = load_dataset("text", data_files=["dataset/enwik8"])
dataset = raw_splits["train"]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# Learning rate shared by every trainable parameter.
LR = 1e-3

# Optimize the T5 weights AND the pooling projection. `pooling_layer` is a separate
# nn.Module, so `model.parameters()` alone does not include it; leaving it out would
# keep the projection frozen at its random initialization for the whole run.
trainable_params = list(model.parameters()) + list(pooling_layer.parameters())
optimizer = torch.optim.Adam(trainable_params, lr=LR)

In [9]:
# Record the full model configuration with the run.
run_config = {"model_config": model.config.to_dict()}

# Start the Weights & Biases run; kept as the last expression so the run is displayed.
wandb.init(project="LoRA on T5", name="DETHCOD_test", config=run_config)

In [10]:
# Number of text lines per training step.
batch_size = 8

# Shuffled mini-batch iterator over the dataset.
data_loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [11]:
import tqdm.auto as tqdm

# Configuration invariant, checked once instead of on every step: the pooled vector
# is handed to the decoder as-is, so its width must equal the decoder's d_model.
assert model.decoder.config.d_model == COMP_EMBED_DIM, \
    "Giving the embeddings directly to the decoder"

# from_pretrained() returns the model in evaluation mode; switch to training mode
# so dropout is active while fitting.
model.train()

# Running sum of (mean batch loss * number of samples in the batch), kept as a plain
# float so no autograd graph is retained across iterations.
loss = 0.0

for batch in tqdm.tqdm(data_loader):
    encoded = tokenizer(
        batch["text"], return_tensors="pt", padding=True, truncation=True
    ).to(device)
    input_ids = encoded.input_ids
    attention_mask = encoded.attention_mask

    # Pass the attention mask so the encoder does not attend to padding positions.
    encoder_output = model.encoder(input_ids=input_ids, attention_mask=attention_mask)
    hiddens = encoder_output.last_hidden_state

    # Masked mean over the sequence axis: padding positions must not dilute the
    # pooled embedding of shorter lines in the batch.
    token_mask = attention_mask.unsqueeze(-1).to(hiddens.dtype)
    projected = pooling_layer(hiddens) * token_mask
    pooled = projected.sum(dim=-2) / token_mask.sum(dim=-2).clamp(min=1)

    # The decoder cross-attends to a "sequence" of exactly one vector per sample.
    encoder_hidden_states = pooled.unsqueeze(-2)

    # Label value -100 is ignored by the loss, so padding is not scored.
    labels = input_ids.masked_fill(attention_mask == 0, -100)

    # Only `labels` is passed: T5 then builds decoder_input_ids by shifting the labels
    # one step to the right. Feeding decoder_input_ids=input_ids together with
    # labels=input_ids would show the decoder the very token it is asked to predict.
    # Calling the module (not .forward) keeps registered hooks working.
    model_output = model(
        labels=labels,
        encoder_outputs=(encoder_hidden_states,),
    )
    batch_loss = model_output.loss

    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()

    batch_loss_value = batch_loss.item()
    # Weight by the actual batch size (the last batch may be smaller).
    loss += batch_loss_value * input_ids.size(0)

    # TODO: Add oneshot_chance to logs
    wandb.log({
        "loss": batch_loss_value,
    })


  0%|          | 0/141003 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Close the Weights & Biases run.
wandb.finish()

# `loss` is accumulated by the training loop as (mean batch loss * batch size), i.e. a
# sum over samples, so the per-sample average divides by the number of samples rather
# than by the number of batches. float() also turns a tensor accumulator into a plain
# number for display. Only meaningful after a full pass over the data loader.
float(loss) / len(data_loader.dataset)