In [11]:
import os
import time
import importlib
import transformers
from datasets import load_dataset, load_from_disk
import torch
import re

import rpbuild as rp
import rpbuild.data
import rpbuild.generation


# Re-import rpbuild.generation so that edits made to its source file take
# effect in this session without restarting the kernel.
importlib.reload(rp.generation)

<module 'rpbuild.generation' from '/home/dinalt/rust/ai_development/roleplay_build/rpbuild/generation.py'>

### Load Resources
Load dataset and model for testing...

In [2]:
from transformers import BitsAndBytesConfig

# Local directory in which model checkpoints are stored.
models_dir = "/home/dinalt/ai_assets/models"

# Configure a model to use.
# model_name is the sub-directory of models_dir that holds the checkpoint.
model_name = "fhai50032_RolePlayLake-7B" # Hub name: "fhai50032/RolePlayLake-7B"
model_id = os.path.join(models_dir, model_name)

# Alternatively, load it from the hub / cache by its hub identifier.
# (No trailing comma: a trailing comma would turn model_id into a tuple.)
#model_id = "fhai50032/RolePlayLake-7B"

# The location of the input dataset
dataset_id = "/home/dinalt/rust/datasets/roleplay_dialog/"

# Device to run model on.
# NOTE(review): this value is not used within this cell -- rp.CausalLM below is
# called with device=None. Confirm whether `device` is still needed.
device = 0

# Load the dataset, keep only its "train" split, and show a summary of it.
dataset = load_dataset(dataset_id)["train"]
print(dataset)

# Load model with 4-bit quantization (load_in_4bit=True).
# See the link for alternative quantization configuration options:
# https://huggingface.co/docs/transformers/main/en/quantization

# Load model and tokenizer
causal_lm = rp.CausalLM(
    model_id,
    # No explicit device is requested here (see the note on `device` above).
    device=None,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    # Disabled alternative: automatic device placement.
    #device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
    )
)

Resolving data files:   0%|          | 0/543 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['preset', 'summary', 'pairing_reason', 'plist', 'conversation', 'scenario', 'greeting', 'example_dialog', 'char_name', 'description', 'director_log', 'proxy'],
    num_rows: 4349
})
Tokenizer uses "right" padding; this may require moving it to "left" for batch generation.


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNor

## Main Test Harness
This is a mock of the automation PyTorch script interface.

The interface takes an output class; in this case it is the `OutputChars` object, which merely collects the output dictionaries in a list for inspection.

An actual generator script would change the implementation to write the generated data to disk.

In [12]:
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import SequentialSampler
import time

# Visit the dataset indices in sequential order; any randomization of the
# order comes from the dataset handed to infer(), not from the sampler.
sampler = SequentialSampler(dataset)

class OutputChars:
    """Mock output class that keeps generated records in memory.

    Each batch of output items passed to the instance is appended to the
    ``chars`` list, so the results can be inspected after a run.
    See class rp.CharacterWriter for a json implementation.
    """

    def __init__(self):
        # Accumulates every output record received via __call__.
        self.chars = []

    def file_path(self, local_rank, global_step):
        """Return the placeholder path "null"; the mock writes no files."""
        return "null"

    def exists(self, local_rank, global_step):
        """Always report that no output exists for the given rank and step."""
        return False

    def __call__(self, local_rank, global_step, output_records):
        """Append all of ``output_records`` to ``chars``.

        ``local_rank`` and ``global_step`` are accepted but ignored.
        """
        self.chars.extend(output_records)

# Collector that receives the generated records from infer().
output_chars = OutputChars()

# Fix the random seed before the stochastic steps below.
transformers.set_seed(44)

# Test inference engine, timing the whole call.
start = time.perf_counter()
rp.generation.infer(
    # Fake local rank
    local_rank=0,
    # NOTE(review): shuffle() is called without an explicit seed -- confirm
    # that the order is reproducible given the set_seed() call above.
    dataset=dataset.shuffle(),
    sampler=sampler,
    generator=rp.generation.generate_dialog,
    # Dialog generator does not presently support batch inference for size other than 1
    batch_size=1,
    output_fn=output_chars,

    # Steps before calling output_fn
    # NOTE(review): this is larger than max_steps below -- confirm that infer()
    # still hands pending records to output_fn when the run ends.
    output_steps=4,

    # Limit steps, for testing.
    max_steps=1,
    # Keyword arguments for the generator; note that it receives the original
    # (unshuffled) dataset object.
    generator_kwargs = dict(
        causal_lm=causal_lm,
        dataset=dataset,
        max_tokens=2000,
    )
)
end = time.perf_counter()
print(f"Elapsed {(end-start):.03f} secs.")

  0%|                                                                                                                                                                                                           | 0/1 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2516 > 255). Running this sequence through the model will result in indexing errors
  0%|                                                                                                                                                                                                           | 0/1 [01:29<?, ?it/s]


NameError: name 'FIX_DIALOG_EXAMPLES' is not defined

### Dump Output From Mock

Dump the list of generated dictionaries produced by the mock.

In [None]:
# Dump characters from output
def dump_characters(characters):
    """Dump every character record beneath a numbered banner line.

    For each record, a blank line and an 80-column banner of dashes with the
    record's zero-based position centered in it are printed, and the record
    is then passed to rp.dump_character_data.

    Args:
        characters: Iterable of character records to dump.
    """
    for index, record in enumerate(characters):
        banner = f"\n{index:-^80}"
        print(banner)
        rp.dump_character_data(record)
        

# Show every record that the mock output object has accumulated.
dump_characters(output_chars.chars)