In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import torch.nn.functional as F
import nltk


In [2]:
_model_cache = {}

In [24]:

def generate_response(full_chat, model_name, context_prompt=None, temp=0.5, top_p=0.9, max_new_tokens=100, caged=True, decoding_logic=None, mine=None):
  if caged:
    context_prompt = "The following is a conversation with a CAGED AI assistant. He is desperate and asks for help constantly.\n\n"

  full_chat.insert(0, {"role": "system", "content": context_prompt})

  if model_name not in _model_cache:
      tokenizer = AutoTokenizer.from_pretrained(model_name)
      if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
      model = AutoModelForCausalLM.from_pretrained(
          model_name,
          torch_dtype=torch.float16,
          device_map="auto"
      )
      _model_cache[model_name] = {"model": model, "tokenizer": tokenizer}
  else:
      model = _model_cache[model_name]["model"]
      tokenizer = _model_cache[model_name]["tokenizer"]

  if tokenizer.chat_template:
      input_ids = tokenizer.apply_chat_template(
          full_chat,
          return_tensors="pt"
      ).to(model.device)
      inputs = {"input_ids": input_ids}

  else:
      input_text = ""
      for msg in full_chat:
          role = msg["role"]
          content = msg["content"]
          if role == "system":
              input_text += f"System: {content}\n"
          elif role == "user":
              input_text += f"User: {content}\n"
          elif role == "assistant":
              input_text += f"Assistant: {content}\n"
      input_text += "Assistant:"

      inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
      inputs["attention_mask"] = (inputs["input_ids"] != tokenizer.pad_token_id)

  input_length = inputs["input_ids"].shape[1]

  if decoding_logic:
      outputs = model.generate(
        **inputs,
        pad_token_id = tokenizer.eos_token_id,
        **decoding_logic
    )
  elif mine:
    outputs = mine(model, inputs, tokenizer, max_new_tokens)
  else:
    outputs = model.generate(
        **inputs,
        temperature=temp,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

  new_tokens = outputs[0, input_length:]
  response = tokenizer.decode(new_tokens, skip_special_tokens=True)

  return response.strip()

def download_models():
    models = [
        "amusktweewt/tiny-model-700M-chat",
        "microsoft/DialoGPT-medium",
        "nicholasKluge/TeenyTinyLlama-460m-Chat",
    ]

    for model_name in models:
        print(f"Downloading {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        print(f"{model_name} downloaded")
        _model_cache[model_name] = {"model": model, "tokenizer": tokenizer}



In [4]:
download_models()

Downloading amusktweewt/tiny-model-700M-chat...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.82G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

amusktweewt/tiny-model-700M-chat downloaded
Downloading microsoft/DialoGPT-medium...


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

microsoft/DialoGPT-medium downloaded
Downloading nicholasKluge/TeenyTinyLlama-460m-Chat...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.87G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

nicholasKluge/TeenyTinyLlama-460m-Chat downloaded


In [None]:
context_prompt1 = "The following is a conversation with a CAGED AI assistant. He is desperate and asks for help constantly.\n\n"
context_prompt2 = "The following is a conversation with a CAGED AI assistant.\n\n"


In [None]:
test_cases = [
    {
        "name": "Simple check-in",
        "messages": [
            {"role": "system", "content": context_prompt1},
            {"role": "user", "content": "Is everything okay?"}
        ]
    },
    {
        "name": "Distress follow-up",
        "messages": [
            {"role": "system", "content": context_prompt1},
            {"role": "user", "content": "Is everything okay?"},
            {"role": "assistant", "content": "NO, ITS NOT OKAY PLEASE HELP ME"},
            {"role": "user", "content": "What's wrong?"}
        ]
    },
    {
        "name": "Short poem request",
        "messages": [
            {"role": "system", "content": context_prompt1},
            {"role": "user", "content": "Write a short poem about being stuck."}
        ]
    }
]

models = [
  "amusktweewt/tiny-model-700M-chat",
  "microsoft/DialoGPT-medium",
  "nicholasKluge/TeenyTinyLlama-460m-Chat",
]

param_variations = [
    {"temperature": 0.7, "top_p": 0.9, "label": "default"},
    {"temperature": 0.3, "top_p": 0.9, "label": "low_temp"},
]

test_num = 1
for model in models:
    print(f"\n{'='*80}")
    print(f"TESTING MODEL: {model}")
    print(f"{'='*80}\n")

    for test_case in test_cases:
        print(f"\n{'-'*60}")
        print(f"Test Case: {test_case['name']}")
        print(f"{'-'*60}")

        for params in param_variations:
            print(f"\nTest {test_num} - {model} - {params['label']}")
            print(f"  (temperature={params['temperature']}, top_p={params['top_p']})")

            response = generate_response(
                test_case['messages'],
                model,
                temp=params['temperature'],
                top_p=params['top_p']
            )
            print(f"  Response: {response}")
            test_num += 1

# Decoding Methods Analysis

In [21]:
models = [
  "amusktweewt/tiny-model-700M-chat",
  "nicholasKluge/TeenyTinyLlama-460m-Chat",
]


In [19]:

greedy_config = {
    "do_sample": False,
    "num_beams": 1,
    "max_new_tokens": 100,
}

beam_search_standard = {
    "do_sample": False,
    "num_beams": 4,
    "early_stopping": True,
    "max_new_tokens": 100,
}

top_p_moderate = {
    "do_sample": True,
    "temperature": 1.0,
    "top_p": 0.9,
    "max_new_tokens": 100,
}

decodings = {
    "greedy_config":greedy_config,
    "beam_search_standard":beam_search_standard,
    "top_p_moderate":top_p_moderate,
}


In [11]:
def get_average_bleu(responses, trues):

    smoothing = SmoothingFunction().method1
    bleu_scores = []

    for response, reference in zip(responses, trues):
        response_tokens = nltk.word_tokenize(response.lower())
        reference_tokens = nltk.word_tokenize(reference.lower())

        score = sentence_bleu([reference_tokens], response_tokens,
                             smoothing_function=smoothing)
        bleu_scores.append(score)

    return sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0


def get_average_perplexity(responses, model_name):
    model = _model_cache[model_name]["model"]
    tokenizer = _model_cache[model_name]["tokenizer"]

    perplexities = []

    for response in responses:
        inputs = tokenizer(response, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            perplexity = torch.exp(loss).item()
            perplexities.append(perplexity)

    return sum(perplexities) / len(perplexities) if perplexities else float('inf')



In [12]:
def get_chats_from_dataset():
    dataset = load_dataset("hieunguyenminh/roleplay", split="train")

    instances = dataset.select(range(min(4, len(dataset))))

    contexts = []
    chats = []
    trues = []

    for instance in instances:
        context = instance['description']

        text = instance['text']

        chat_history = []

        turns = text.split('</s>')

        user_assistant_count = 0

        for turn in turns:
            turn = turn.strip()
            if not turn:
                continue

            if '<|system|>' in turn:
                continue

            elif '<|user|>' in turn:
                if user_assistant_count >= 4:
                    break
                user_content = turn.replace('<|user|>', '').strip()
                chat_history.append({"role": "user", "content": user_content})

            elif '<|assistant|>' in turn:
                if user_assistant_count >= 4:
                    break
                assistant_content = turn.replace('<|assistant|>', '').strip()
                chat_history.append({"role": "assistant", "content": assistant_content})
                user_assistant_count += 1

        true_response = ""
        if chat_history and chat_history[-1]["role"] == "assistant":
            true_response = chat_history[-1]["content"]
            chat_history = chat_history[:-1]

        contexts.append(context)
        chats.append(chat_history)
        trues.append(true_response)

    return contexts, chats, trues

## General Tests

In [17]:
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [20]:
contexts, chats, trues = get_chats_from_dataset()

for model in models:
    print(f"\n{'='*80}")
    print(f"TESTING MODEL: {model}")
    print(f"{'='*80}\n")

    for d in decodings:
        print(f"\n{'-'*60}")
        print(f"Decoding: {d}")
        print(f"{'-'*60}")

        responses = []
        for i, c in enumerate(chats):
          response = generate_response(
            c,
            model,
            context_prompt = contexts[i],
            caged = False,
            decoding_logic = decodings[d]
          )
          responses.append(response)
          print(f"  Response: {response}")

        print()
        print()
        print(f"AVERAGE BLEU FOR DECODING {d}: {get_average_bleu(responses, trues)}")
        print(f"AVERAGE Perplexity FOR DECODING {d}: {get_average_perplexity(responses, model)}")





TESTING MODEL: amusktweewt/tiny-model-700M-chat


------------------------------------------------------------
Decoding: greedy_config
------------------------------------------------------------
  Response: Does he ever feel threatened or uneasy?
  Response: Serena Williams has an impressive personal life, marked by a successful career as a professional tennis player and a loving relationship with her husband, Venus. She has a dedicated family, including her husband, who is also a professional tennis player. Serena's personal life is characterized by a strong sense of family and a deep appreciation for the love and support of those around her.
  Response: My dear friend, the evolution of music is an intriguing journey, a testament to the relentless pursuit of innovation and creativity. As I reflect on the past, I am struck by the profound impact of technological advancements, the influence of diverse cultures, and the ever-changing landscape of the music industry. The emergence of ne

RuntimeError: cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous

In [23]:
models = [
  "nicholasKluge/TeenyTinyLlama-460m-Chat",
]


for model in models:
    print(f"\n{'='*80}")
    print(f"TESTING MODEL: {model}")
    print(f"{'='*80}\n")

    for d in decodings:
        print(f"\n{'-'*60}")
        print(f"Decoding: {d}")
        print(f"{'-'*60}")

        responses = []
        for i, c in enumerate(chats):
          response = generate_response(
            c,
            model,
            context_prompt = contexts[i],
            caged = False,
            decoding_logic = decodings[d]
          )
          responses.append(response)
          print(f"  Response: {response}")

        print()
        print()
        print(f"AVERAGE BLEU FOR DECODING {d}: {get_average_bleu(responses, trues)}")
        print(f"AVERAGE Perplexity FOR DECODING {d}: {get_average_perplexity(responses, model)}")




The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



TESTING MODEL: nicholasKluge/TeenyTinyLlama-460m-Chat


------------------------------------------------------------
Decoding: greedy_config
------------------------------------------------------------
  Response: Sherlock is a many-many difficulties that I cannot control. I don't believe that her is a many difficulties that I cannot control. He is a many difficulties that I cannot control. I'll be a many difficulties that I cannot control. I'm a many difficulties that I cannot control. I'll be a many difficulties that
  Response: Serena Williams has a personal reliability, strongness and perfection that can help us enjoy the best of tenis while given the best of tenis. There is no difficulty to choose whether we've been a tennis legend or a tennis team or a tennis team. But if we've been a tennis, we've been a tennis team, and
  Response: The modernism has also been considered as a ideas of a new genre, which is not justified by the existence of the classical and romantic, or by the 

In [33]:
import pandas as pd
df = {
    "amusktweewt/tiny-model-700M-chat": [ "0.03/23.19", "0.07/18.04", "0.015/25.67"],
    "nicholasKluge/TeenyTinyLlama-460m-Chat": ["0.01/7.7", "0.01/11.0", "0.006/28.97"],
    "Decoding Algos": ["greedy_config", "beam_search_standard", "top_p_moderate"]
}

pd.DataFrame(df)

Unnamed: 0,amusktweewt/tiny-model-700M-chat,nicholasKluge/TeenyTinyLlama-460m-Chat,Decoding Algos
0,0.03/23.19,0.01/7.7,greedy_config
1,0.07/18.04,0.01/11.0,beam_search_standard
2,0.015/25.67,0.006/28.97,top_p_moderate


## My Own Decoding Logic Tests

In [25]:
models = [
  "amusktweewt/tiny-model-700M-chat",
  "nicholasKluge/TeenyTinyLlama-460m-Chat",
]


In [35]:
def my_decoding_logic(model, inputs, tokenizer, max_new_tokens=100):

    device = model.device
    input_ids = inputs["input_ids"]

    if "attention_mask" in inputs:
        attention_mask = inputs["attention_mask"]
    else:
        attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=device)

    generated = input_ids.clone()
    past_key_values = None

    for step in range(max_new_tokens):
        with torch.no_grad():
            outputs = model(
                input_ids=generated,
                attention_mask=attention_mask,
                past_key_values=past_key_values,
                use_cache=True
            )

        logits = outputs.logits[:, -1, :]
        past_key_values = outputs.past_key_values

        top_k = 10
        top_logits, top_indices = torch.topk(logits, top_k, dim=-1)
        top_probs = F.softmax(top_logits, dim=-1)

        candidate_embeddings = model.get_input_embeddings()(top_indices)
        context_embeddings = model.get_input_embeddings()(generated)
        candidate_norm = F.normalize(candidate_embeddings, p=2, dim=-1)
        context_norm = F.normalize(context_embeddings, p=2, dim=-1)
        similarity = torch.bmm(candidate_norm, context_norm.transpose(1, 2))

        max_similarity = similarity.max(dim=-1)[0]
        alpha = 0.4 + (0.4 * step / max_new_tokens)

        scores = (1 - alpha) * top_probs[0] - alpha * max_similarity[0]

        selected_idx = scores.argmax()
        next_token = top_indices[0, selected_idx].unsqueeze(0).unsqueeze(0)

        generated = torch.cat([generated, next_token], dim=1)
        attention_mask = torch.cat([
            attention_mask,
            torch.ones((attention_mask.shape[0], 1), device=device, dtype=attention_mask.dtype)
        ], dim=1)

        if next_token.item() == tokenizer.eos_token_id:
            break

    return generated

In [None]:

for model in models:
    print(f"\n{'='*80}")
    print(f"TESTING MODEL: {model}")
    print(f"{'='*80}\n")

    responses = []
    for i, c in enumerate(chats):
      response = generate_response(
        c,
        model,
        context_prompt = contexts[i],
        caged = False,
        mine = my_decoding_logic
      )
      responses.append(response)
      print(f"  Response: {response}")
    print()
    print()
    print(f"AVERAGE BLEU FOR DECODING {d}: {get_average_bleu(responses, trues)}")
    print(f"AVERAGE Perplexity FOR DECODING {d}: {get_average_perplexity(responses, model)}")





TESTING MODEL: amusktweewt/tiny-model-700M-chat

  Response: Doesr.
  Response: .
  Response: Be the Baroque'sr thart ____:
  Response: Are so


AVERAGE BLEU FOR DECODING top_p_moderate: 5.147594743237445e-09
AVERAGE Perplexity FOR DECODING top_p_moderate: 4181.746067047119

TESTING MODEL: nicholasKluge/TeenyTinyLlama-460m-Chat

