In [1]:
import os
# Restrict CUDA to device index 4 for this kernel. This is done in the very
# first cell, before the model/accelerate imports in the next cell.
# NOTE(review): the GPU index is hard-coded; adjust it for the machine at hand.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [2]:
import glob
import time

from accelerate import init_empty_weights
from accelerate import load_checkpoint_and_dispatch
from transformers.generation.streamers import BaseStreamer

from model.block_transformer import BlockTransformer
from model.utils import load_block_transformer_from_config, load_vanilla_model_from_config
from paths import PROJECT_ROOT
from util.config import load_config
from util.tokenizer import TOKENIZERS

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Root directory under which get_checkpoint_path() looks for
# `<run name>/checkpoint-*` folders.
CHECKPOINT_ROOT = os.path.join(PROJECT_ROOT, "results")

In [4]:
def get_config_path(name):
    """Build the path of a trainer config file under ``PROJECT_ROOT/conf/trainer``.

    Args:
        name: Config name, with or without a ``.yaml`` extension.

    Returns:
        ``PROJECT_ROOT/conf/trainer/<name>``, with ``.yaml`` appended when the
        given name does not already contain ``.yaml``.
    """
    # Note: this is a substring test, not a suffix test -- a name containing
    # ".yaml" anywhere is used unchanged.
    filename = name if ".yaml" in name else name + ".yaml"
    return os.path.join(PROJECT_ROOT, "conf", "trainer", filename)


def get_checkpoint_path(name):
    """Return the ``model.safetensors`` path of the latest checkpoint of run ``name``.

    Searches ``CHECKPOINT_ROOT/<name>`` for entries matching ``checkpoint-*``,
    reads the integer step that follows the first dash, and selects the entry
    with the highest step.

    Args:
        name: Run directory name under ``CHECKPOINT_ROOT``.

    Returns:
        ``<latest checkpoint entry>/model.safetensors``. The existence of that
        file is not verified here.

    Raises:
        ValueError: If the run directory does not exist, or if it contains no
            ``checkpoint-<step>`` entry with an integer step.
    """
    root = os.path.join(CHECKPOINT_ROOT, name)
    if not os.path.exists(root):
        raise ValueError(f"Checkpoint directory does not exist: {root}")

    def get_step(checkpoint_path):
        # "checkpoint-1234" -> 1234. Entries whose suffix is not an integer
        # (e.g. "checkpoint-best") yield None so they can be skipped instead
        # of aborting the whole lookup.
        bs = os.path.basename(checkpoint_path)
        try:
            return int(bs.split("-")[1])
        except ValueError:
            return None

    pattern = os.path.join(root, "checkpoint-*")
    candidates = []
    for cp in glob.glob(pattern):
        step = get_step(cp)
        if step is not None:
            candidates.append((step, cp))

    # Without this guard an empty run directory surfaced as a bare IndexError.
    if not candidates:
        raise ValueError(f"No checkpoint-<step> entries found in: {root}")

    # Tuples compare by step first, so max() picks the latest checkpoint.
    checkpoint_path = os.path.join(max(candidates)[1], "model.safetensors")
    print(f"Retrieving latest checkpoint path {checkpoint_path}")
    return checkpoint_path

In [5]:
def load_model(name, block=True):
    """Build a model from its trainer config and load its latest checkpoint.

    Args:
        name: Run name; used both for the config lookup (get_config_path) and
            for the checkpoint lookup (get_checkpoint_path).
        block: If True, build via ``load_block_transformer_from_config`` and
            return ``(model, tokenizer)``. Otherwise build via
            ``load_vanilla_model_from_config`` and return only the model.

    Returns:
        ``(model, tokenizer)`` when ``block`` is true, else ``model``.
    """
    config = load_config(get_config_path(name))

    # The model is constructed inside accelerate's init_empty_weights context;
    # the weights are then filled in by load_checkpoint_and_dispatch below.
    tokenizer = None
    with init_empty_weights():
        if block:
            model, tokenizer = load_block_transformer_from_config(config)
        else:
            model = load_vanilla_model_from_config(config)

    model = load_checkpoint_and_dispatch(
        model,
        checkpoint=get_checkpoint_path(name),
        # set to auto to use multiple GPUs + pipelining (not tested)
        device_map="sequential",
    )
    return (model, tokenizer) if block else model

In [6]:
def set_temperature(model, temperature):
    """Enable sampling at the given temperature on the model's generation config.

    For a ``BlockTransformer`` the generation config of its ``token_decoder``
    is updated; for any other model, the model's own ``generation_config``.

    Args:
        model: A ``BlockTransformer`` or another model exposing ``generation_config``.
        temperature: Sampling temperature to set (``do_sample`` is set to True).
    """
    target = model.token_decoder if isinstance(model, BlockTransformer) else model
    target.generation_config.update(do_sample=True, temperature=temperature)

In [7]:
class FirstSampleStreamer(BaseStreamer):
    """Streamer that prints decoded tokens to stdout as ``put`` is called.

    The first ``put`` call after construction (or after ``end``) is dropped,
    since it is treated as the prompt. Every later call decodes ``value[-1]``
    with the tokenizer given to the constructor and prints it without a
    trailing newline.

    Args:
        tokenizer: Object whose ``decode`` method turns token ids into text.
        escape_newline: If True, each newline in the decoded text is printed
            as a backslash followed by ``n`` instead of a real line break.
    """

    def __init__(self, tokenizer, escape_newline=False):
        self.tokenizer = tokenizer
        # True until the first put() of a generation run has been seen.
        self.first = True
        self.escape_newline = escape_newline

    def put(self, value):
        """Print the text for ``value[-1]``, skipping the first call of a run."""
        # ignore prompt
        if self.first:
            self.first = False
            return
        # Decode with the tokenizer bound to this instance. Previously this
        # read the notebook-level `tokenizer` global, which ignored the
        # constructor argument and broke when that global was rebound or absent.
        # NOTE(review): value[-1] selects the last element of `value`; presumably
        # this equals the first sample only for batch size 1 -- confirm.
        token = self.tokenizer.decode(value[-1])
        if self.escape_newline:
            token = token.replace("\n", "\\n")
        print(token, end="", flush=True)

    def end(self):
        """Reset state so the next ``put`` call is again treated as the prompt."""
        self.first = True

## Prompt

In [8]:
# Generation prefix: the opening paragraphs of Dickens' "A Tale of Two Cities".
# The string ends with a blank line, so the prompt finishes on a paragraph break.
# NOTE(review): "En- glish" below looks like a line-break hyphenation artifact;
# it is left untouched because editing it would change the tokenized prompt.
prompt = """It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.

There were a king with a large jaw and a queen with a plain face, on the throne of England; there were a king with a large jaw and a queen with a fair face, on the throne of France. In both countries it was clearer than crystal to the lords of the State preserves of loaves and fishes, that things in general were settled for ever.

It was the year of Our Lord one thousand seven hundred and seventy-five. Spiritual revelations were conceded to England at that favoured period, as at this. Mrs. Southcott had recently attained her five-and-twentieth blessed birthday, of whom a prophetic private in the Life Guards had heralded the sublime appearance by announcing that arrangements were made for the swallowing up of London and Westminster. Even the Cock-lane ghost had been laid only a round dozen of years, after rapping out its messages, as the spirits of this very year last past (supernaturally deficient in originality) rapped out theirs. Mere messages in the earthly order of events had lately come to the En- glish Crown and People, from a congress of British subjects in America: which, strange to relate, have proved more important to the human race than any communications yet received through any of the chickens of the Cock-lane brood.

"""

In [9]:
# Look up the "pythia" tokenizer and report how many token ids the prompt
# encodes to. Note that the notebook-level `tokenizer` bound here is rebound
# later by the Block Transformer loading cell (`block, tokenizer = load_model(...)`).
tokenizer = TOKENIZERS["pythia"]
prompt_length = len(tokenizer(prompt)["input_ids"])
print("Prompt length:", prompt_length)

Prompt length: 417


## Block Transformer

In [10]:
# Load the Block Transformer run "block_main_b4_1.2b" (config + latest checkpoint).
# With block=True, load_model also returns a tokenizer, which rebinds the
# notebook-level `tokenizer` used by the generation cells below.
block, tokenizer = load_model("block_main_b4_1.2b", block=True)



------------------------------ Preprocess Config -------------------------------
Automatically determining batch size based on `total_batch_size`
total_batch_size              : 512 (given)
torch.cuda.device_count()     : 1
per_device_train_batch_size   : 16 (given)
gradient_accumulation_steps   : 32 (computed)
actual total batch size       : 512
Setting wandb_run_name    : block_main_b4_1.2b
Setting output_dir        : block_main_b4_1.2b
Using deepspeed config    : /home/itsnamgyu/block-transformer/ds_configs/default_linear_warmup.config
[token_decoder] Setting num_attention_heads to hidden_size // 128
[token_decoder] Setting intermediate_size to hidden_size * 4
[block_decoder] Setting num_attention_heads to hidden_size // 128
[block_decoder] Setting intermediate_size to hidden_size * 4
--------------------------------------------------------------------------------
Initializing block decoder from scratch...
Using custom config for block decoder...
   num_hidden_layers: 12
   hidden_s

In [21]:
# Sample a continuation of the prompt from the Block Transformer, printing
# tokens to stdout through the streamer as they are produced.
set_temperature(block, temperature=1)
streamer = FirstSampleStreamer(tokenizer)
# Tokenize the prompt and move every returned tensor to the GPU in one step.
inputs = {
    key: tensor.cuda()
    for key, tensor in tokenizer(prompt, return_tensors="pt").items()
}
output_ids = block.generate(**inputs, max_length=2048, streamer=streamer)


A few weeks before the close of our year, a party of persons were travelling from Liverpool and Yorkshire to Exeter. The number that got away must have been a respectable one, for there was plenty of luggage and everything of the kind in readiness to give ease at great resorts. On these occasions the traveller’s guide of choice often was the newspaper advertisement of the railway companies, which contains a general view of the country in a variety of well-known places, and offers many opportunities for information.

Among the notices are advertisements describing the beauties which are to be seen on each side of our journey, and describing all the places for excursion. There is an advertisement for the Exeter Railway, and another for that of the London and North Eastern Railway, to serve as the latter’s link with the great northern line between Birmingham and London. “We give this as a general view,” the advertisement says, “but we must let some of our readers see that such of our lit

## Vanilla Transformer

In [12]:
# Load the vanilla baseline run "vanilla_410". With block=False, load_model
# returns only the model, so the generation cell below reuses the existing
# notebook-level `tokenizer`.
vanilla = load_model("vanilla_410", block=False)

------------------------------ Preprocess Config -------------------------------
Automatically determining batch size based on `total_batch_size`
total_batch_size              : 256 (given)
torch.cuda.device_count()     : 1
per_device_train_batch_size   : 8 (given)
gradient_accumulation_steps   : 32 (computed)
actual total batch size       : 256
Setting wandb_run_name    : vanilla_410
Setting output_dir        : vanilla_410
Using deepspeed config    : /home/itsnamgyu/block-transformer/ds_configs/default_linear_warmup.config
--------------------------------------------------------------------------------
Initializing model from scratch...




Retrieving latest checkpoint path /home/itsnamgyu/block-transformer/results/vanilla_410/checkpoint-570000/model.safetensors


In [24]:
# Sample a continuation of the prompt from the vanilla model, printing tokens
# to stdout through the streamer, and report how long generation took.
set_temperature(vanilla, temperature=1)
streamer = FirstSampleStreamer(tokenizer)
inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: t.cuda() for k, t in inputs.items()}
print(" Start of generation ".center(80, "-"))
start = time.time()
output_ids = vanilla.generate(**inputs, max_length=2048, streamer=streamer)
# `start` was previously recorded but never read; report the wall-clock time.
elapsed = time.time() - start
# The streamer prints without a trailing newline, so start a fresh line first.
print()
print(f"Generation took {elapsed:.2f} s")

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


----------------------------- Start of generation ------------------------------
 The year had just been celebrated by an exhibition in the West End of the City, to-be terminated by a public dinner. (Mr. Rack-ford and Mrs. Rack-ford were going to the theatre to-day with their cousin, Mr. Wainwright, to watch the play of Molière; Mr. Wainwright was just now going from one of the houses of the East End to another, into the open air.)

That very curious old woman had recently fallen on her knees before the altar of St. Patrick's, with a litany of prayers, and received the sacred sacrament. A great deal of the parish seemed now ready for an altar, in the form of a chapel or a pulpit. A large square room, as long and deep, as high and sharp, and so commodious in its shape, was just at hand. A small fountain, of a certain colour, in front of the altar, had, under a fine gilt inscription, been used to mark the hour of service. The new chapel, in the mean time, had been a house of worship: and