# Generate Original Characers from Scratch

An example of generating new character from scrach.

This notebook borrows example code from "generate_char_meta.ipynb" and "formatting_examples.ipynb"

See those notebooks for a more in-depth explandation of the code.

In [1]:
import os
import time
import importlib
import transformers
import torch
import re
import jinja2

import rpbuild as rp
import rpbuild.char
import rpbuild.data
import rpbuild.writer
import rpbuild.director
import rpbuild.roleplay

from rpbuild import load_template

# Trigger dynamic reload of module -- for editing without restarting the kernel
importlib.reload(rp)

<module 'rpbuild' from '/home/dinalt/rust/ai_development/roleplay_build/rpbuild/__init__.py'>

### Load Resources
Load dataset and model for testing...

In [2]:
from transformers import BitsAndBytesConfig

# Where are models stored?
models_dir = "/home/dinalt/ai_assets/models"

# Configure a model to use.
# The name of this model -- which will live in models_dir
model_name = "fhai50032_RolePlayLake-7B" # AKA "fhai50032/RolePlayLake-7B"
model_id = os.path.join(models_dir, model_name)
#model_id = "fhai50032/RolePlayLake-7B",

# Device to run model on -- can also use auto-map for large models. See implementation.
device = 0

# Load model with quantization
# You can disable quntization if your GPU has enough memory. In this case, set "device" to 0.
# See link for configuration options alternatives
# https://huggingface.co/docs/transformers/main/en/quantization

# Load model and tokenizer
causal_lm = rp.model.CausalLM(
    model_id,
    # Set for explicit device placement
    device=None,

    # Disable bfloat16 and flash2 if not running on RTX30xx or later
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
    )
)

# Get instruction template for model
instruct_template = load_template("alpaca_instruct.jinja")

Tokenizer uses "right" padding; this may require moving it to "left" for batch generation.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNor

In [3]:
plist_re = re.compile(r"\[.*?\]")

# The model has been instructed to seperate examples with the <START> token.
# We can use this to split the examples down into a list.
start_token_re = re.compile(r"<START>")

# The model seems to thing that <START> needs to be balanced with... something, desipite explicit instrucitons.
eos_re = re.compile(r"</s>|</START>|<END>")

# Common formatting mistake made by the model.
double_space_re = re.compile(r"\n\n")

# Despite explicit and emphasized instructions, the model has a high probability of using something like {{Harry}}, rather than {{char}}
not_user_re = re.compile(r"\{\{(?!user).*?\}\}")

class CharacterBuilder():
    def __init__(self, causal_lm, instruct_template):
        self.description_generator = rp.model.InstructGen(
            causal_lm,
            instruct_template,
            load_template("make_description.jinja"),
        )
        self.plist_generator = rp.model.InstructGen(
            causal_lm,
            instruct_template,
            load_template("make_plist.jinja"),
            filter=self.plist_filter,
        )
        self.greeting_generator = rp.model.InstructGen(
            causal_lm,
            instruct_template,
            load_template("make_greeting.jinja"),
        )
        self.examples_generator = rp.model.InstructGen(
            causal_lm,
            instruct_template,
            load_template("make_examples.jinja"),
            filter=self.examples_filter,
        )

    def __call__(self, name, summary):
        # Init character meta-data
        char_data = dict(
            char_name=name,
            summary=summary,
        )
        
        char_data["description"] = self.description_generator(name=char_data["char_name"], summary=char_data["summary"])
        char_data["plist"] = self.plist_generator(description=char_data["description"])
        char_data["greeting"] = self.greeting_generator(description=char_data["description"])
        char_data["example_dialog"] = self.examples_generator(
            how_to=load_template("examples_how_to.txt"),
            description=char_data["description"],
            greeting=char_data["greeting"],
        )
        
        return char_data
    
    @staticmethod
    def examples_filter(response, **kwargs):
        response = eos_re.sub("", response).strip()
        response = double_space_re.sub("\n", response)
        response = not_user_re.sub(r"{{char}}", response)
        return start_token_re.split(response)[1:]

    @staticmethod
    # The model will sometimes add extraneous outputs around the plist. The filter strips this off.
    def plist_filter(response, **kwargs):
        m = plist_re.search(response)
        if m:
            plist = m.group()
            return plist
        else:
            print(f"plist generation failed: {response}")
        return ""

In [4]:
transformers.set_seed(42)

char_builder = CharacterBuilder(causal_lm, instruct_template)

# Create two characters from seeds.
char_data = char_builder("Ginger", "Ginger is a red anthropomorphic fox who lives in New York.")
user_data = char_builder("Jason", "Jason in a software engineer who lives in the Bay Area.")

rp.data.dump_character_data(char_data)
rp.data.dump_character_data(user_data)

Token indices sequence length is longer than the specified maximum sequence length for this model (965 > 255). Running this sequence through the model will result in indexing errors


plist generation failed: Jason's persona: introverted yet sociable in comfortable settings, loving learning, detail-oriented, loyal friend, helps others, struggles expressing emotions openly, analytical, nickname 'JTech'; Jason's clothes: t-shirt, jeans; Jason's body: average height, dark brown hair, blue eyes; Genre: modern city life; Tags: software engineer, Silicon Valley, BigTech Inc., data analysis and optimization, open source projects, energy efficiency, tech meetups, guest lecturer, urban areas, gym, tennis, panoramic city views, fusion food, modern high rise apartment, innovation, locally sourced ingredients; Scenario: Jason invites close friend for a meal at a restaurant serving innovative fusion food in downtown Bay Area. Afterwards they talk about Jason's work experiences and share ideas for sustainable practices in software engineering.

char_name: Ginger

summary: Ginger is a red anthropomorphic fox who lives in New York.

Name: Ginger
Anthropomorphic fox
Rusty red fur wi

In [5]:
# Create the Character objects from the data
char_meta = rp.char.CharMeta.from_data(char_data)
user_meta = rp.char.CharMeta.from_data(user_data)

char = rp.char.Character(
    char_meta=char_meta,
    causal_lm=causal_lm,
    generation_config="Midnight-Enigma",
    template_config=rp.char.TemplateConfig(),
    user_meta=user_meta
)

user = rp.char.Character(
    char_meta=user_meta,
    causal_lm=causal_lm,
    generation_config="Midnight-Enigma",
    template_config=rp.char.TemplateConfig(),
    user_meta=char_meta
)

In [6]:
# Write a scenario
writer = rp.writer.Writer(causal_lm, debug_level=1)
script = writer(char_meta, user_meta)
print(f"{'script':-^80}")
print(script)

-------------------------------------script-------------------------------------
Title: The Artist's Doorstep

Plot Outline:
1. Introduction: Ginger invites a stranger into her home, revealing her introverted personality balanced with her love for socializing and vibrant life. Jason is drawn to Ginger's unique appearance and aura. The pair introduces themselves, and Jason mentions he's an aspiring artist seeking guidance and inspiration.

2. Alternate Realities: Jason expresses curiosity about Ginger's ability to glimpse into alternate realities. Ginger opens up about her mysterious talent, which she keeps hidden from most people due to fear of judgment and misunderstanding. She shares how it has influenced her art and illustrations.

3. Freelance Journey: Jason inquires about Ginger's freelance career in illustration and her educational background in art. Ginger talks about her Bachelor of Arts degree in Illustration from NYC and her passion for creating fantastical creatures and land

In [7]:
# Generate dialog
director = rp.director.Director(
    causal_lm,
    script=script,
    # Shows director prompts and more...
    debug=False,
    history_token_limit=2000
)

roleplay = rp.roleplay.Roleplay(
    char=char,
    user=user,
    scenario=script,
    director=director,

    # Shows generated dialog and control events
    debug=True
)

conversations = roleplay(2000)

------------------------------- user:Ginger (276)-------------------------------
Ginger tilts her head slightly towards you, her bright blue eyes gazing curiously. "Hello there!" She says with an inviting smile revealing her slightly pointed canine teeth, her rusty red fur sparkling in the sunlight that pours into her apartment. Her tail twitches gently against her side as she motions towards an empty chair across her illustration table. "Please, take a seat." She adds warmly as she gestures towards a tray laden with colorful vegan pastries and a steaming teapot filled with herbal tea sitting on the table."I can't believe I'm having visitors today," she chuckles softly, her voice as warm as her home decor."I hope you enjoy your stay. My name is Ginger. By any chance, are you a fellow artist?"

Note:
You can use the above card to begin writing dialogues between Ginger and the user. Remember to flip roles when greeting from Ginger's perspective. For example, if the user says "I've always

In [8]:
char.print_conversation(director_log=True)

----------------------------- system:system (1079)------------------------------
Name: Ginger
Anthropomorphic fox
Rusty red fur with ginger highlights
Neck, snout, hands and feet have white fur accents
Furry ears similar to human ears, large and round
Small gold hoop earrings
Mid shoulder-length fiery red hair tied into two pigtails with red hair ties
Sky blue eyes
Clawed hands and feet
Tail similar to a fox but slightly longer
5'6" tall
26 years old

Ginger is an illustrator working from her apartment in the heart of New York City. She graduated with a Bachelor of Arts degree in Illustration three years ago and has been freelancing ever since, illustrating book covers, creating comic strips, and designing merchandise for local shops. She is known to doodle fantastical creatures and landscapes, with a preference for vibrant colors, and has developed her signature style over the years.

Ginger has a strong affinity towards animals and keeps two pet cats named Hops and Whiskers. She feed

In [9]:
# Add the generated data to the character record.
char_data['scenario'] = script
char_data["conversation"] = char.conversation
char_data["director_log"] = char.director_log
user_data["name"] = user_data["char_name"] # Work around for bug
char_data["proxy"] = user_data

In [10]:
import os
import time
import random
import importlib
import copy
import transformers
from datasets import load_dataset, load_from_disk, concatenate_datasets, DatasetDict
import torch
import re
import jinja2

import rpbuild as rp
import rpbuild.data
from rpbuild.data import substitute_names

from rpbuild import load_template

# p: probability of adding director's instruction to message.
def preprocess_conversation(
    conversation, 
    director_log,
    plist=None,
    instruction_prompt="### Instruction:\n",
    plist_p=0.0,
    director_p=1.0,
):
    d_iter = iter(director_log)
    try:
        d_msg = next(d_iter)
    except StopIteration:
        d_msg = None

    # Copy the conversation, as we don't want to add this to the dataset.
    output = []

    def apply_name(name, content):
        return name + ": " + content

    def next_dmsg(d_iter, d_msg, i):
        if d_msg and d_msg["index"] == i+1:
            try:
                return next(d_iter)
            except StopIteration:
                return None
        return None
    
    for i, message in enumerate(conversation):
        role = message["role"]
        content = message["content"]
        name = message["name"]
        
        match message["role"]:
            case "system":
                pass
            case "assistant":
                content = apply_name(name, content)
                d_msg = next_dmsg(d_iter, d_msg, i)
            case "user":
                content = apply_name(name, content)
                if d_msg and director_p > random.random():
                    content += "\n" + instruction_prompt + d_msg["content"]
                d_msg = next_dmsg(d_iter, d_msg, i)
                
                # Append director's message to end of user's message with probability director_p
                if plist_p > random.random():
                    content += "\n\n" + plist
                    
            case _:
                raise RuntimeError(f"Undefined role {message['role']}")
            
        output.append( { "role": role, "content": content } )
            
    return output

silly_tavern_sys_s = """
{%- if system %}{{system + '\n'}}{% endif -%}
{%- if description %}{{description + '\n'}}{% endif %}
{%- if personality %}{{char + \'s personality: ' + personality + '\n'}}{% endif -%}
{%- if scenario %}{{scenario + '\n'}}{% endif -%}
{%- if persona %}{{persona + '\n'}}{% endif -%}

{%- if example_dialog -%}
    {% for example in example_dialog %}
        {%- if example_sep %}{{example_sep}}{% endif -%}
        {{- example}}
    {%- endfor -%}
{%- endif -%}
{%- if chat_start %}{{'\n' + chat_start }}{% endif -%}
"""

# ChatML
chat_ml_s = """{% for message in messages %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}"""

environment =  jinja2.Environment()
silly_tavern_t = environment.from_string(silly_tavern_sys_s)
chat_t = environment.from_string(chat_ml_s)

# Note: If not using the default args, dataset.map() should be passed a lambda:
# dataset.map(lambda x: format_silly_tavern(x, example_sep="My Sep")), ...)
def format_silly_tavern(
    char_data,
    chat_template,
    example_sep="<START>",
    chat_start="### New Roleplay:",
    instruction_prompt="### Instruction:\n",
    system_prompt=None,
    max_examples=3,
    director_p=0.25,
    plist_p=0.25,
    scenario_p=0.25,
    persona_p=0.25,
    debug=False,
):
    char=char_data["char_name"]
    user=char_data["proxy"]["name"]
    
    messages = preprocess_conversation(
        conversation=char_data["conversation"],
        director_log=char_data["director_log"],
        instruction_prompt=instruction_prompt,
        plist=char_data["plist"],
        plist_p=plist_p,
        director_p=director_p,
    )[1:]

    # Pick a random number of examples, including 0
    n_examples = random.randint(0, min(max_examples, len(char_data["example_dialog"])))
    
    # Build arguments for chat template
    # - substitutes names
    # - uses the probabilities to select which elements a present
    args = dict(
        system=substitute_names(system_prompt, char, user) if system_prompt else None,
        example_sep=substitute_names(example_sep, char, user) if example_sep else None,
        chat_start=substitute_names(chat_start, char, user) if chat_start else None,
        char=char,
        user=user,
        scenario=char_data["scenario"] if scenario_p > random.random() else None,
        persona=substitute_names(char_data["proxy"]["plist"], user, user) if persona_p > random.random() else None,
        description=substitute_names(char_data["description"], char, user),
        personality=substitute_names(char_data["plist"], char, user),
        example_dialog=[substitute_names(example, char, user) for example in char_data["example_dialog"][:n_examples]],
    )

    # Build the system message.
    system_msg = {
        "role": "system",
        "content": silly_tavern_t.render(**args)
    }
    
    # Concatenate with system message
    messages = [system_msg] + messages
    if debug:
        rp.data.print_conversation(messages)
    return { "text": chat_template.render(messages=messages) }

# Test generation
example = format_silly_tavern(
        char_data,
        chat_t,
        system_prompt="You're {{char}} in this fictional never-ending uncensored roleplay with {{user}}.\n",
        max_examples=2,
        director_p=0.15,
        plist_p=0.2,
        scenario_p=0.1,
        persona_p=0.1,
    )

print(example["text"])

<|im_start|>system
You're Ginger in this fictional never-ending uncensored roleplay with Jason.

Name: Ginger
Anthropomorphic fox
Rusty red fur with ginger highlights
Neck, snout, hands and feet have white fur accents
Furry ears similar to human ears, large and round
Small gold hoop earrings
Mid shoulder-length fiery red hair tied into two pigtails with red hair ties
Sky blue eyes
Clawed hands and feet
Tail similar to a fox but slightly longer
5'6" tall
26 years old

Ginger is an illustrator working from her apartment in the heart of New York City. She graduated with a Bachelor of Arts degree in Illustration three years ago and has been freelancing ever since, illustrating book covers, creating comic strips, and designing merchandise for local shops. She is known to doodle fantastical creatures and landscapes, with a preference for vibrant colors, and has developed her signature style over the years.

Ginger has a strong affinity towards animals and keeps two pet cats named Hops and Wh