# Understanding the HuggingFace API and Simple Generation

We will explore HuggingFace models API to generate text, specificalyy with Phi-3.5-mini

<!--- @wandbcode{llmapps-intro} -->

### Setup

In [57]:
import os
import tiktoken
import wandb
import torch
from pprint import pprint
from getpass import getpass
from wandb.integration.openai import autolog
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


In [58]:
torch.cuda.is_available()

True

In [59]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype='float16',
    bnb_4bit_use_double_quant=False,
)
# chat_checkpoint = "microsoft/phi-2"
chat_checkpoint = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(chat_checkpoint)

chat_model = AutoModelForCausalLM.from_pretrained(
        chat_checkpoint,
        device_map="cuda",
        torch_dtype="auto",
        quantization_config=bnb_config, 
        use_flash_attention_2=False,
        trust_remote_code=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
chat_model.to(device)

Loading checkpoint shards: 100%|██████████| 2/2 [00:14<00:00,  7.20s/it]
You shouldn't move a model when it is dispatched on multiple devices.


Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_fe

Let's enable W&B autologging to track our experiments.

In [60]:
# start logging to W&B
autolog({"project":"llmapps", "job_type": "introduction"})
wandb.config.update({
    "model_name": "phi-3.5-mini-instruct"
})

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Tokenization

In [61]:
# encoding = tiktoken.encoding_for_model("text-davinci-003")
text = "Weights & Biases is awesome!"
encoded_text = tokenizer.encode(text)
print(encoded_text)
decoded_text = tokenizer.decode(encoded_text)
print(decoded_text)

[1334, 5861, 669, 3457, 2129, 338, 29663, 29991]
Weights & Biases is awesome!


we can decode the tokens one by one

In [62]:
for token_id in encoded_text:
    print(f"{token_id}\t{tokenizer.decode([token_id])}")

1334	We
5861	ights
669	&
3457	Bi
2129	ases
338	is
29663	awesome
29991	!


> Note how the leading tokens contain spacing.

# Sampling

Let's sample some text from the model. For this, let's create a wrapper function around the temperature parameters.
Higher temperature will result in more random samples.

In [63]:
def generate_with_temperature(temp):
  "Generate text with a given temperature, higher temperature means more randomness"
  response = openai.Completion.create(
    model="gpt-3.5-turbo-instruct",
    prompt="Say something about Weights & Biases",
    max_tokens=50,
    temperature=temp,
  )
  return response.choices[0].text

In [64]:
def generate_with_temperature_phi(chat_model, tokenizer, temp):
  "Generate text with a given temperature, higher temperature means more randomness"
  prompt = "Say something about Weights & Biases."
  inputs = tokenizer(prompt, return_tensors="pt").to(device)
  outputs = chat_model.generate(
    inputs["input_ids"],
    attention_mask = inputs['attention_mask'],
    max_length=100,
    min_length=30,
    do_sample = True,
    temperature=temp,
    num_return_sequences=1,
    eos_token_id = tokenizer.eos_token_id,
  )
  generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  generated_text = generated_text[len(prompt):].strip()
  wandb.log({ "prompt": prompt, 
              "generated_text": generated_text, 
              "temperature": temp,
              "top_p":topp,
              "top_k":topk,
              "tokens_used": len(outputs[0]) })
  return generated_text

In [65]:
for temp in [0.0001, 0.01, 0.1, 0.3, 0.5, 1]:
  pprint(f'TEMP: {temp}, GENERATION: {generate_with_temperature_phi(chat_model, tokenizer, temp)}')

('TEMP: 0.0001, GENERATION: Assistant: Weights & Biases (W&B) is an '
 'open-source platform designed to help machine learning practitioners and '
 'researchers monitor and visualize the training process of their models. It '
 'provides a range of tools to track experiments, log metrics, and analyze '
 'model performance in real-time. Here are some key features and benefits of '
 'using Weights & Biases:\n'
 '\n'
 '1. Real-time Monitoring:')
('TEMP: 0.01, GENERATION: Assistant: Weights & Biases (wandb) is a platform '
 'for machine learning experiment tracking and visualization. It allows '
 'researchers and data scientists to log, track, and visualize metrics during '
 'the training of machine learning models. Here are some key features and '
 'benefits of using Weights & Biases:\n'
 '\n'
 "1. **Real-time Monitoring**: You can monitor your model's training progress "
 'in real-')
('TEMP: 0.1, GENERATION: Assistant: Weights & Biases (W&B) is an open-source '
 'platform for machine lear

Tried a bunch of stuff here, it didn't work out well.

You can also use the [`top_p` parameter](https://platform.openai.com/docs/api-reference/completions/create#completions/create-top_p) to control the diversity of the generated text. This parameter controls the cumulative probability of the next token. For example, if `top_p=0.9`, the model will pick the next token from the top 90% most likely tokens. The higher the `top_p` the more likely the model will pick a token that it hasn't seen before. You should only use one of `temperature` or `top_p` at a given time.

In [66]:
def generate_with_topp(topp):
  "Generate text with a given top-p, higher top-p means more randomness"
  response = openai.Completion.create(
    model="gpt-3.5-turbo-instruct",
    prompt="Say something about Weights & Biases",
    max_tokens=50,
    top_p=topp,
    )
  return response.choices[0].text

In [67]:
def generate_with_topp_phi(chat_model, tokenizer, topp):
  "Generate text with a given temperature, higher temperature means more randomness"
  prompt = "Say something about Weights & Biases."
  inputs = tokenizer(prompt, return_tensors="pt").to(device)
  outputs = chat_model.generate(
    inputs["input_ids"],
    attention_mask = inputs['attention_mask'],
    max_length=100,
    min_length=30,
    do_sample = True,
    top_p=topp,
    num_return_sequences=1,
    eos_token_id = tokenizer.eos_token_id,
  )
  generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  generated_text = generated_text[len(prompt):].strip()
  wandb.log({ "prompt": prompt, 
              "generated_text": generated_text, 
              "temperature": temp,
              "top_p":topp,
              "top_k":topk,
              "tokens_used": len(outputs[0]) })
  return generated_text

In [68]:
for topp in [0.7, 0.8, 0.9, 1]:
  pprint(f'TOP_P: {topp}, GENERATION: {generate_with_topp_phi(chat_model, tokenizer, topp)}')

('TOP_P: 0.7, GENERATION: Assistant: Weights & Biases (W&B) is a platform for '
 'machine learning experimentation that is particularly useful for tracking '
 'and visualizing the training process of neural networks. It offers several '
 'features that are beneficial for researchers and data scientists:\n'
 '\n'
 '1. **Visualization**: W&B allows you to visualize metrics such as loss and '
 'accuracy in real-time as your model trains, which can help in')
('TOP_P: 0.8, GENERATION: Assistant: Weights & Biases (wandb) is a '
 'comprehensive platform for tracking experiments, visualizing metrics, and '
 'managing datasets during the machine learning model training process. It is '
 'particularly useful for:\n'
 '\n'
 '1. **Experiment Tracking**: It allows you to log every aspect of your '
 'experiment, from hyperparameters to model performance, to easily compare '
 'different runs.\n'
 '\n'
 '2. **Visualization**: With wand')
('TOP_P: 0.9, GENERATION: Chatbot: Certainly! Weights & Biases (

Now also try TOP_K sampling

In [69]:
def generate_with_topk_phi(chat_model, tokenizer, topk):
  "Generate text with a given temperature, higher temperature means more randomness"
  prompt = "Say something about Weights & Biases."
  inputs = tokenizer(prompt, return_tensors="pt").to(device)
  outputs = chat_model.generate(
    inputs["input_ids"],
    attention_mask = inputs['attention_mask'],
    max_length=100,
    min_length=30,
    do_sample = True,
    top_k=topk,
    num_return_sequences=1,
    eos_token_id = tokenizer.eos_token_id,
  )
  generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  generated_text = generated_text[len(prompt):].strip()
  wandb.log({ "prompt": prompt, 
              "generated_text": generated_text, 
              "temperature": temp,
              "top_p":topp,
              "top_k":topk,
              "tokens_used": len(outputs[0]) })
  return generated_text

In [70]:
for topk in [10, 20, 30, 50, 70, 100]:
  pprint(f'TOP_K: {topk}, GENERATION: {generate_with_topp_phi(chat_model, tokenizer, topk)}')

("TOP_K: 10, GENERATION: Hey there! I'd be happy to chat about Weights & "
 'Biases and its features.\n'
 '\n'
 'Weights & Biases (W&B) is a powerful tool for AI developers and researchers. '
 "It's an open-source platform that integrates deep learning experiment "
 'management, data logging, and the tracking of metrics during model training. '
 'Here are a few key features:\n'
 '\n'
 '1. **Visualization**: W')
("TOP_K: 20, GENERATION: But I've got another part coming your way - we're "
 'delving deep into the heart of a project where a critical functionality '
 "isn't quite up to par. Here's where the W&B's playtime shines, acting like a "
 'traffic reporter for our code, spotting speedbumps and guiding us through '
 'the optimization maze.\n'
 '\n'
 "So, let's roll up our sleeves, dive")
('TOP_K: 30, GENERATION: Assistant: Weights & Biases (wandb) is a powerful '
 'tool for experiment tracking and visualization used in machine learning and '
 'data science projects. It provides a pla

Try all 3

In [71]:
def generate_with_all_samp_phi(chat_model, tokenizer, temp, topp, topk):
  "Generate text with a given temperature, higher temperature means more randomness"
  prompt = "Say something about Weights & Biases."
  inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
  outputs = chat_model.generate(
    inputs["input_ids"],
    attention_mask = inputs['attention_mask'],
    max_length=100,
    min_length=30,
    do_sample = True,
    temperature=temp,
    top_p=topp,
    top_k=topk,
    num_return_sequences=1,
    eos_token_id = tokenizer.eos_token_id,
  )
  generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  generated_text = generated_text[len(prompt):].strip()

  wandb.log({ "prompt": prompt, 
              "generated_text": generated_text, 
              "temperature": temp,
              "top_p":topp,
              "top_k":topk,
              "tokens_used": len(outputs[0]) })

  return generated_text

In [72]:
temp = 0.3
topp = 0.9
topk = 70
print(f'TOP_P: {topp}, GENERATION: {generate_with_all_samp_phi(chat_model, tokenizer, temp, topp, topk)}')

TOP_P: 0.9, GENERATION: Assistant: Weights & Biases (W&B) is a scalable experiment-tracking platform for machine learning research and development. It's designed to help researchers and data scientists monitor and visualize the training process of their models in real-time. Here are some key features and benefits of using Weights & Biases:

1. **Real-time Monitoring**: W&B allows you


# Chat API

Let's switch to chat mode and see how the model responds to our messages. We have some control over the model's response by passing a `system-role`, here we can steer to model to adhere to a certain behaviour.

> We are using `gpt-3.5-turbo`, this model is faster and cheaper than `davinci-003`

In [73]:
from transformers import AutoModelForCausalLM, AutoTokenizer

conversation_history = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Say something about Weights & Biases"}
]

def format_conversation(history):
    formatted_history = ""
    for message in history:
        if message["role"] == "system":
            formatted_history += f"System: {message['content']}\n"
        elif message["role"] == "user":
            formatted_history += f"User: {message['content']}\n"
        elif message["role"] == "assistant":
            formatted_history += f"Assistant: {message['content']}\n"
    return formatted_history

formatted_history = format_conversation(conversation_history)

In [75]:
inputs = tokenizer(formatted_history, return_tensors="pt", padding=True, truncation=True).to(device)
outputs = chat_model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=100
)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
assistant_response = generated_text.split("Assistant:")[-1].strip()
wandb.log({ "generated_text": generated_text, 
            "assistant_response": assistant_response,
            "temperature": temp,
            "top_p":topp,
            "top_k":topk,
            "tokens_used": len(outputs[0]) })
print(f"Assistant: {assistant_response}")

Assistant: Weights & Biases (W&B) is an open-source platform for machine learning experimentation and collaboration. It provides a user-friendly interface to manage and visualize machine learning models, datasets, and experiments. W&B allows users to track and compare different model versions, monitor training progress, and analyze model performance. It also supports hyperparameter tuning, model


As you can see above, the response is a JSON object with relevant information about the request.

In [76]:
wandb.finish()

0,1
temperature,▁▁▂▃▄███████████▃▃
tokens_used,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
top_k,▆▆▆▆▆▆▆▆▆▆▁▂▃▄▆█▆▆
top_p,▁▁▁▁▁▁▁▁▁▁▂▂▃▄▆█▁▁

0,1
assistant_response,Weights & Biases (W&...
generated_text,System: You are a he...
prompt,Say something about ...
temperature,0.3
tokens_used,100
top_k,70
top_p,0.9
