In [7]:
import os

# os.environ['HF_DATASETS_OFFLINE'] = "1"
# os.environ['HF_HUB_OFFLINE'] = "1"
# os.environ['HF_HOME'] = "/lustre/scratch5/jsmidt/.cache/huggingface"

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import (
    LlamaTokenizerFast,
    GPT2TokenizerFast,
    GPT2LMHeadModel,
    AutoConfig,
    PretrainedConfig,
    AutoModel,
    pipeline,
    Trainer,
    TrainingArguments,
    PreTrainedModel,
    PretrainedConfig,
    LogitsProcessorList,
    GPT2Config,
)
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
from transformers.integrations import MLflowCallback


import transformers

transformers.logging.set_verbosity_info()

In [8]:
#
# Get data
#   
N = 30000
raw_dataset = load_dataset("Skylion007/openwebtext", split="train")
#raw_dataset = load_dataset("Skylion007/openwebtext", split="train", streaming=True)
    
# Break into train and test datasets
seed = 42
ds_train = raw_dataset.shuffle(seed).select(range(N))
ds_valid = raw_dataset.shuffle(seed).select(range(N))

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

In [9]:
#
# Get tokenizer
#
coursen = 2
context_length = 1024 // coursen
#tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token_id = tokenizer.eos_token_id
vocab_size = tokenizer.vocab_size

# Get tokinizer
tokenizer.pad_token = tokenizer.eos_token
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        #padding=True,
        return_overflowing_tokens=True,
        return_length=True,
        max_length=context_length,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

tokenized_datasets = ds_train.map(
    tokenize, batched=True, remove_columns=ds_train.column_names
)

loading file vocab.json from cache at /Users/jsmidt/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/vocab.json
loading file merges.txt from cache at /Users/jsmidt/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/merges.txt
loading file tokenizer.json from cache at /Users/jsmidt/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /Users/jsmidt/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/tokenizer_config.json
loading configuration file config.json from cache at /Users/jsmidt/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "archi

In [10]:
#
# Define Models
#
class EmbeddingConfig(PretrainedConfig):
    model_type = "embedding"

    def __init__(self, vocab_size=50257, embedding_dim=768, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim


class EmbeddingModel(PreTrainedModel):
    config_class = EmbeddingConfig
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config, loss_fct=nn.CrossEntropyLoss()):
        super().__init__(config)
        self.loss_fct = loss_fct
        self.wte = nn.Embedding(config.vocab_size, config.embedding_dim)
        self.lm_head = nn.Linear(config.embedding_dim, config.vocab_size, bias=False)

        # Tie weights between embedding and lm_head
        self.tie_weights()

    def tie_weights(self):
        self.wte.weight = self.lm_head.weight

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        # only last token for inputs_ids if past is defined in kwargs
        if past:
            input_ids = input_ids[:, -1].unsqueeze(-1)

        attention_mask = kwargs.get("attention_mask", None)
        position_ids = kwargs.get("position_ids", None)

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            position_ids = None
        return {
            "input_ids": input_ids,
            "past_key_values": past,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
        }
    

    def forward(self, input_ids, labels=None, attention_mask=None, **kwargs):
        _, T = input_ids.shape
        token_embeddings = self.wte(input_ids)
        # position_embeddings = self.position_embedding_table(self.position[:T])
        # x = self.drop_init(token_embeddings + position_embeddings)
        x = token_embeddings
        # x = self.blocks(x)
        logits = self.lm_head(x)  # (B, T) -> (B, T, C)

        loss = None
        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
            )

        return CausalLMOutputWithCrossAttentions(loss=loss, logits=logits)

In [11]:
#run_name = "embedding_only_layer1_0"
run_name = "embedding_all"
output_dir = f"test_train/{run_name}"

config = EmbeddingConfig(vocab_size=tokenizer.vocab_size, embedding_dim=768 // coursen)
model = EmbeddingModel(config)

config = GPT2Config()#n_positions = context_length, n_embd = 768//coursen, n_layer = 12 // coursen, n_head = 12//coursen)
model = GPT2LMHeadModel(config)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

args = TrainingArguments(
    output_dir=output_dir,
    # disable_tqdm=True,
    logging_steps=20,
    do_eval=False,
    save_steps=1000,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=5e-4,
    warmup_ratio=0.1,
    logging_first_step=True,
    # bf16=True,
    max_steps=1000,
)


tokenized_datasets = tokenized_datasets.with_format("torch")
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
    callbacks=[MLflowCallback()],
)

Generate config GenerationConfig {}

Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256
}

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
You are adding a <class 'transformers.integrations.integration_utils.MLflowCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
MLflowCallback
TensorBoardCallback
max_steps is given, it will override any value given in num_train_epochs


In [12]:
trainer.train()


***** Running training *****
  Num examples = 49,980
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1,000
  Number of trainable parameters = 124,439,808


  0%|          | 0/1000 [00:00<?, ?it/s]

{'loss': 11.0111, 'grad_norm': 14.257255554199219, 'learning_rate': 5e-06, 'epoch': 0.0}
{'loss': 9.8131, 'grad_norm': 6205.8896484375, 'learning_rate': 0.0001, 'epoch': 0.0}
{'loss': 8.568, 'grad_norm': 1.3102881908416748, 'learning_rate': 0.0002, 'epoch': 0.01}
{'loss': 7.5126, 'grad_norm': 0.8828144669532776, 'learning_rate': 0.0003, 'epoch': 0.01}
{'loss': 7.4518, 'grad_norm': 1.4611374139785767, 'learning_rate': 0.0004, 'epoch': 0.01}
{'loss': 7.9359, 'grad_norm': 1.4346916675567627, 'learning_rate': 0.0005, 'epoch': 0.02}
{'loss': 7.5344, 'grad_norm': 1.294459581375122, 'learning_rate': 0.0004888888888888889, 'epoch': 0.02}
{'loss': 8.0384, 'grad_norm': 1.9420287609100342, 'learning_rate': 0.0004777777777777778, 'epoch': 0.02}
{'loss': 7.7835, 'grad_norm': 33049.59375, 'learning_rate': 0.00046666666666666666, 'epoch': 0.03}
{'loss': 7.5029, 'grad_norm': 1.4422011375427246, 'learning_rate': 0.00045555555555555556, 'epoch': 0.03}
{'loss': 7.478, 'grad_norm': 2593.20751953125, 'lear

Saving model checkpoint to test_train/embedding_all/checkpoint-1000
Configuration saved in test_train/embedding_all/checkpoint-1000/config.json
Configuration saved in test_train/embedding_all/checkpoint-1000/generation_config.json


{'loss': 6.5943, 'grad_norm': 0.6764558553695679, 'learning_rate': 0.0, 'epoch': 0.16}


Model weights saved in test_train/embedding_all/checkpoint-1000/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 1274.9018, 'train_samples_per_second': 6.275, 'train_steps_per_second': 0.784, 'train_loss': 7.185533978462219, 'epoch': 0.16}


TrainOutput(global_step=1000, training_loss=7.185533978462219, metrics={'train_runtime': 1274.9018, 'train_samples_per_second': 6.275, 'train_steps_per_second': 0.784, 'total_flos': 2090336256000000.0, 'train_loss': 7.185533978462219, 'epoch': 0.16005121638924455})

In [13]:
# Save the model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Configuration saved in test_train/embedding_all/config.json
Configuration saved in test_train/embedding_all/generation_config.json
Model weights saved in test_train/embedding_all/model.safetensors
tokenizer config file saved in test_train/embedding_all/tokenizer_config.json
Special tokens file saved in test_train/embedding_all/special_tokens_map.json


('test_train/embedding_all/tokenizer_config.json',
 'test_train/embedding_all/special_tokens_map.json',
 'test_train/embedding_all/vocab.json',
 'test_train/embedding_all/merges.txt',
 'test_train/embedding_all/added_tokens.json',
 'test_train/embedding_all/tokenizer.json')

In [14]:
# Reload the model and tokenizer
loaded_tokenizer = GPT2TokenizerFast.from_pretrained(output_dir)
loaded_model = EmbeddingModel.from_pretrained(output_dir)

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file test_train/embedding_all/config.json
You are using a model of type gpt2 to instantiate a model of type embedding. This is not supported for all configurations of models and can yield errors.
Model config EmbeddingConfig {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "embedding_dim": 768,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "embedding",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0

In [18]:
# Prepare mock data
texts = ["Hello, how are you?", "I am fine, thank you!"]
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
input_ids = inputs["input_ids"]

# Run the reloaded model on mock data
loaded_model.eval()
with torch.no_grad():
    reloaded_logits = loaded_model(input_ids)
    print("Reloaded logits shape:", reloaded_logits.logits.shape)

# Run text generation
generator = pipeline('text-generation', model=loaded_model, tokenizer=loaded_tokenizer)
output = generator("Hello, how are you?", max_length=50)
output = generator("Hello, my name is John. Afer I say go, say the word sheep. go", max_length=50)
print(output)

The model 'EmbeddingModel' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'M

Reloaded logits shape: torch.Size([2, 7, 50257])
[{'generated_text': 'Hello, my name is John. Afer I say go, say the word sheep. go go go go go go go go go go go go go go go go go go go go go go go go go go go go go go go go'}]


In [37]:
core_word = 'man'
core_id = tokenizer(core_word)['input_ids'][0]

token_indcies = torch.arange(vocab_size, dtype=torch.long).to("mps")

token_embeddings = [model.transformer.wte(token_index) for token_index in token_indcies]

similarity_scores = [
    torch.cosine_similarity(
        token_embeddings[core_id].view(1, 768), token_embedding.view(1, 768)
    ).detach()
    for token_embedding in token_embeddings
]

top_10_indices = torch.topk(torch.stack(similarity_scores, dim=1), k=10)[1]

top_tokens = tokenizer.convert_ids_to_tokens(top_10_indices.tolist()[0])
core_token = tokenizer.convert_ids_to_tokens([core_id])

for ii in top_10_indices.tolist()[0]:
    tt = tokenizer.convert_ids_to_tokens(ii)
    score = similarity_scores[ii].item()
    print (f'token:  {tt:>12s},       score: {score:.2f}')

token:           man,       score: 1.00
token:             i,       score: 0.17
token:             O,       score: 0.16
token:            us,       score: 0.16
token:            as,       score: 0.16
token:            is,       score: 0.15
token:            or,       score: 0.15
token:             8,       score: 0.14
token:             n,       score: 0.14
token:             c,       score: 0.14


token:           man,       score: 1.00
token:             i,       score: 0.17
token:             O,       score: 0.16
token:            us,       score: 0.16
token:            as,       score: 0.16
token:            is,       score: 0.15
token:            or,       score: 0.15
token:             8,       score: 0.14
token:             n,       score: 0.14
token:             c,       score: 0.14
