In [1]:
%pip install transformers wandb requests_cache datasets tqdm

Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests_cache
  Downloading requests_cache-1.2.0-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.45.0-py2.py3-none-any.whl (267 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [2]:
import wandb
from google.colab import userdata

# huggingface_hub.login(token=userdata.get('huggigface_token'))

wandb.login(key=userdata.get('wandb_token'))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Download Data

In [3]:
import io
import os
import sys
import zipfile

import requests
import requests_cache
from tqdm import tqdm


zip_link = "http://www.mattmahoney.net/dc/enwik8.zip"
data_folder = "dataset"
cache_file = "download_cache"

# Ensure the data folder exists
if not os.path.exists(data_folder):
    os.makedirs(data_folder)

# Initialize requests_cache
requests_cache.install_cache(os.path.join(data_folder, cache_file))

# Download the ZIP file with progress bar
response = requests.get(zip_link, stream=True)
response.raise_for_status()

# Get the total file size for the progress bar
total_size = int(response.headers.get("content-length", 0))

# Open the ZIP file from the content
with open(os.path.join(data_folder, "enwik8.zip"), "wb") as file:
    with tqdm(
        total=total_size, unit="B", unit_scale=True, desc="Downloading"
    ) as pbar:
        for data in response.iter_content(chunk_size=1024):
            file.write(data)
            pbar.update(len(data))

# Open the cached file
with open(os.path.join(data_folder, "enwik8.zip"), "rb") as file:
    # Open the ZIP file from the content
    with zipfile.ZipFile(io.BytesIO(file.read())) as zip_file:
        # Extract all contents to the data folder
        zip_file.extractall(data_folder)

print("File downloaded and decompressed successfully.", file=sys.stderr)


Downloading: 100%|██████████| 36.4M/36.4M [00:00<00:00, 457MB/s]
File downloaded and decompressed successfully.


## Model

In [63]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [64]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

MODEL_ID = "google-t5/t5-small"
COMP_EMBED_DIM = 512

model = T5ForConditionalGeneration.from_pretrained(MODEL_ID).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google-t5--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size

In [65]:
import torch.nn as nn

pooling_layer = nn.Linear(model.encoder.config.d_model, COMP_EMBED_DIM).to(device)

## Data

In [66]:
from datasets import load_dataset

dataset = load_dataset("text", data_files=["dataset/enwik8"])
dataset = dataset["train"]

In [67]:
LR = 1e-4

optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [68]:
wandb.init(
    name = "DETHCOD_test",
    project="LoRA on T5",
    config={
        "model_config": model.config.to_dict(),
    },
)

In [69]:
batch_size = 8
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [71]:
import tqdm.auto as tqdm
from transformers import modeling_outputs

loss = 0

with tqdm.tqdm(data_loader) as pbar:
    for batch in pbar:
        input_ids = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
        encoder_output = model.encoder(input_ids=input_ids)
        hiddens = encoder_output.last_hidden_state
        pooled = pooling_layer(hiddens).mean(dim=-2)

        assert model.decoder.config.d_model == COMP_EMBED_DIM, \
            "Giving the embeddings directly to the decoder"

        encoder_hidden_states = pooled.unsqueeze(-2)

        model_output = model.forward(
            # decoder_input_ids=input_ids,
            labels=input_ids,
            encoder_outputs=modeling_outputs.BaseModelOutput(
                last_hidden_state=encoder_hidden_states,
            ),
        )

        loss += model_output.loss * batch_size

        loss_item = model_output.loss.item()

        # TODO: Add oneshot_chance to logs
        wandb.log({
            "loss": loss_item,
        })

        pbar.set_description(f"loss={loss_item}")

        optimizer.zero_grad()
        model_output.loss.backward()
        optimizer.step()


  0%|          | 0/141003 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [32]:
wandb.finish()
loss / len(data_loader)

VBox(children=(Label(value='0.001 MB of 0.013 MB uploaded\r'), FloatProgress(value=0.0913388117227838, max=1.0…

0,1
loss,█▅▂▂▂▂▁▂▂▁▁▂▁▂▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
loss,0.67998


tensor(12.3442, device='cuda:0', grad_fn=<DivBackward0>)

In [57]:
import random
sample = random.choice(dataset)

In [58]:
sample["text"]

'[[de:Godzilla]]'

In [59]:
input_ids = tokenizer(sample["text"], return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
encoder_output = model.encoder(input_ids=input_ids)
hiddens = encoder_output.last_hidden_state
pooled = pooling_layer(hiddens).mean(dim=-2)

In [60]:
assert model.decoder.config.d_model == COMP_EMBED_DIM, \
    "Giving the embeddings directly to the decoder"

In [61]:
encoder_hidden_states = pooled.unsqueeze(-2)

model_output = model.forward(
    # decoder_input_ids=input_ids,
    labels=input_ids,
    encoder_outputs=modeling_outputs.BaseModelOutput(
        last_hidden_state=encoder_hidden_states,
    ),
)

In [62]:
model_output.loss

tensor(11.0820, device='cuda:0', grad_fn=<NllLossBackward0>)

In [47]:
from transformers import modeling_outputs

In [50]:
encoder_hidden_states

tensor([[[-0.0570, -0.0515, -0.0880,  ...,  0.1152, -0.1156, -0.0326]],

        [[-0.0440, -0.0277, -0.0743,  ...,  0.0953, -0.0909, -0.0365]],

        [[-0.0681, -0.0219, -0.0568,  ...,  0.0959, -0.1128, -0.0434]],

        ...,

        [[ 0.0035, -0.0132, -0.0860,  ...,  0.1027, -0.0506,  0.0271]],

        [[-0.0552, -0.0189, -0.0668,  ...,  0.0937, -0.1011, -0.0230]],

        [[-0.0510, -0.0240, -0.0621,  ...,  0.1045, -0.0991, -0.0145]]],
       device='cuda:0', grad_fn=<UnsqueezeBackward0>)

In [49]:
model.generate(
    encoder_outputs=modeling_outputs.BaseModelOutput(
        last_hidden_state=encoder_hidden_states,
    ),
)



tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       device='cuda:0')