In [1]:
%pip install transformers wandb requests_cache datasets tqdm python-dotenv

Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests_cache
  Downloading requests_cache-1.2.0-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.45.0-py2.py

In [2]:
import os
import wandb


# huggingface_hub.login(token=userdata.get('huggigface_token'))

try:
    from dotenv import load_dotenv
    # Load environment variables from .env file
    load_dotenv()

except ImportError as e:
    print(f"Error importing dotenv: {e}")


# Check if running in Colab
try:
    from google.colab import userdata
    # If running in Colab, use userdata.get to retrieve the token
    wandb.login(key=userdata.get('wandb_token'))

except ImportError:
    # If not in Colab, load the token from the environment variable
    wandb_token = os.getenv('WANDB_TOKEN')
    if wandb_token:
        wandb.login(key=wandb_token)
    else:
        print("W&B token not found in environment variable. Please set WANDB_TOKEN in your environment.")


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Download Data

In [3]:
import io
import os
import sys
import zipfile

import requests
import requests_cache
from tqdm import tqdm


zip_link = "http://www.mattmahoney.net/dc/enwik8.zip"
data_folder = "dataset"
cache_file = "download_cache"

# Ensure the data folder exists
if not os.path.exists(data_folder):
    os.makedirs(data_folder)

# Initialize requests_cache
requests_cache.install_cache(os.path.join(data_folder, cache_file))

# Download the ZIP file with progress bar
response = requests.get(zip_link, stream=True)
response.raise_for_status()

# Get the total file size for the progress bar
total_size = int(response.headers.get("content-length", 0))

# Open the ZIP file from the content
with open(os.path.join(data_folder, "enwik8.zip"), "wb") as file:
    with tqdm(
        total=total_size, unit="B", unit_scale=True, desc="Downloading"
    ) as pbar:
        for data in response.iter_content(chunk_size=1024):
            file.write(data)
            pbar.update(len(data))

# Open the cached file
with open(os.path.join(data_folder, "enwik8.zip"), "rb") as file:
    # Open the ZIP file from the content
    with zipfile.ZipFile(io.BytesIO(file.read())) as zip_file:
        # Extract all contents to the data folder
        zip_file.extractall(data_folder)

print("File downloaded and decompressed successfully.", file=sys.stderr)


Downloading: 100%|██████████| 36.4M/36.4M [00:00<00:00, 387MB/s]
File downloaded and decompressed successfully.


## Model

In [4]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

MODEL_ID = "google-t5/t5-small"
COMP_EMBED_DIM = 16
NUM_TOKENS = 16

model = T5ForConditionalGeneration.from_pretrained(MODEL_ID).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [6]:
import torch.nn as nn

pooling_layer = nn.Linear(model.encoder.config.d_model, COMP_EMBED_DIM * NUM_TOKENS).to(device)
unpooling_layer = nn.Linear(COMP_EMBED_DIM, model.encoder.config.d_model).to(device)

## Data

In [7]:
from datasets import load_dataset

dataset = load_dataset("text", data_files=["dataset/enwik8"])
dataset = dataset["train"]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# removing large samples
def filter_samples(dataset):
    filtered_dataset = []
    for sample in dataset:
        tokens = tokenizer(sample['text'])['input_ids']

        if len(tokens) <= 128:
            filtered_dataset.append(sample)

    return filtered_dataset

dataset = filter_samples(dataset)


Token indices sequence length is longer than the specified maximum sequence length for this model (596 > 512). Running this sequence through the model will result in indexing errors


In [9]:
import pandas as pd
df_test = pd.DataFrame(dataset)

In [10]:
from datasets import Dataset

# removing empty strings
mask = df_test['text'].str.strip() != ''
df_test = df_test.loc[mask]
dataset = Dataset.from_pandas(df_test).remove_columns(['__index_level_0__'])

In [11]:
dataset

Dataset({
    features: ['text'],
    num_rows: 855090
})

In [12]:
LR = 1e-3

parameters = nn.ModuleDict({
    "model": model,
    "pooling_layer": pooling_layer,
    "unpooling_layer": unpooling_layer,
})

optimizer = torch.optim.Adam(parameters.parameters(), lr=LR)

In [30]:
wandb.init(
    name = "Vector Training",
    project="DETHCOD",
    config={
        "model_config": model.config.to_dict(),
    },
)

[34m[1mwandb[0m: Currently logged in as: [33maxiom[0m ([33mchihuahuas[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
batch_size = 8
# train_dataset = dataset.select(range(10000))
train_dataset = dataset
data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [31]:
import torch.nn.functional as F
import tqdm.auto as tqdm
from transformers import modeling_outputs

loss = 0

with tqdm.tqdm(data_loader) as pbar:
    for batch in pbar:
        input_ids = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
        encoder_output = model.encoder(input_ids=input_ids)
        hiddens = encoder_output.last_hidden_state
        pooled = pooling_layer(hiddens).mean(dim=-2)

        encoder_hidden_states = pooled.unflatten(-1, (NUM_TOKENS, COMP_EMBED_DIM))
        last_hidden_state = unpooling_layer(encoder_hidden_states)

        model_output = model.forward(
            # decoder_input_ids=input_ids,
            labels=input_ids,
            encoder_outputs=modeling_outputs.BaseModelOutput(
                last_hidden_state=last_hidden_state,
            ),
        )

        num_ids = model_output.logits.size(-1)
        losses = -F.cross_entropy(
            model_output.logits.view(-1, num_ids),
            target=input_ids.view(-1),
            ignore_index=0,
            reduction='none',
        )
        losses = losses.view(input_ids.shape).sum(dim=-1)
        acc = losses.exp().mean()
        loss += model_output.loss * batch_size

        loss_item = model_output.loss.item()

        wandb.log({
            "loss": loss_item,
            "accuracy": acc,
        })

        pbar.set_description(f"loss={loss_item:.2f}, acc={acc:.2f}")

        optimizer.zero_grad()
        model_output.loss.backward()
        optimizer.step()


  0%|          | 0/106887 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [32]:
wandb.finish()
loss / (len(data_loader) * batch_size)

VBox(children=(Label(value='0.247 MB of 0.247 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁▁▂▂▂▄▂▄▁▄▃▂▁▂▁▁▆▂▂▂▃▃▄▃▁▅▁▃▄▃▄▅█▅▃▄▃▇▂▃
loss,▇▆▇▄█▄▅▅▃▃▄▅▄▄▆▄▄▅▅▃▃▃▃▃▄▃▅▅▄▄▃▂▁▃▃▂▅▂▄▃

0,1
accuracy,0.29097
loss,0.55851


tensor(0.0720, device='cuda:0', grad_fn=<DivBackward0>)

In [90]:
import random
sample = random.choice(dataset)
print(repr(sample["text"]))

'* [[Combustion]], in which a substance reacts with an oxidizing element, such as [[oxygen]] gas.'


In [91]:
from transformers import GenerationConfig

input_ids = tokenizer(sample["text"], return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
encoder_output = model.encoder(input_ids=input_ids)
hiddens = encoder_output.last_hidden_state
pooled = pooling_layer(hiddens).mean(dim=-2)

encoder_hidden_states = pooled.unflatten(-1, (NUM_TOKENS, COMP_EMBED_DIM))
last_hidden_state = unpooling_layer(encoder_hidden_states)

encoder_outputs = modeling_outputs.BaseModelOutput(
    last_hidden_state=last_hidden_state,
)

model_output = model.forward(
    # decoder_input_ids=input_ids,
    labels=input_ids,
    encoder_outputs=encoder_outputs,
)

generation_config = GenerationConfig(
    do_sample=True,
    num_return_sequences=16,
    max_new_tokens=100,
)

print("Input Sequence: ")
print(repr(tokenizer.decode(input_ids[0])))
print()
print("loss:", model_output.loss.item())
print()
print("Generated Sequences: ")
generation_output = model.generate(
    encoder_outputs=encoder_outputs,
    generation_config=generation_config,
)

for seq in generation_output.tolist():
    seq = [token for token in seq if token != 0]
    print(repr(tokenizer.decode(seq)))

Input Sequence: 
'* [[Combustion]], in which a substance reacts with an oxidizing element, such as [[oxygen]] gas.</s>'

loss: 1.5137027502059937

Generated Sequences: 
'* [[Combussion methylation, oxygen]], oxygenes, which offer a reaction, to</s>'
'* Interactions in [[gas]], which have fueling fuel through oxygen, and fuel to fuel, the substance of that substance in a combustions.</s>'
'* In exchange reaction reaction to elements, [[oxidation]] oxygenes, containing a substance, which gases in oxidation.</s>'
'* Besides a composting substance, oxygening compounds to aceas, airing (reaction), and a [[combustion]]</s>'
'* <unk>reactions in the compound-sicing process, like hydrogen, [[oxygen]], [[olinex]], in which anoxygens matter in which</s>'
"* The inoxygenation reaction comes into a component of Oxgenitine, such as ''oxygenoideline in air, such as fuel</s>"
'* In more gases, oxygens in oxidation like a reaction containing [[cinemit], in a substance, in substance in a form</s>'
'* i