In [1]:
!pip install transformers peft trl accelerate bitsandbytes datasets

Collecting peft
  Using cached peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Using cached trl-0.10.1-py3-none-any.whl.metadata (12 kB)
Collecting tyro>=0.5.11 (from trl)
  Using cached tyro-0.8.10-py3-none-any.whl.metadata (8.4 kB)
Collecting docstring-parser>=0.16 (from tyro>=0.5.11->trl)
  Using cached docstring_parser-0.16-py3-none-any.whl.metadata (3.0 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Using cached shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Using cached peft-0.12.0-py3-none-any.whl (296 kB)
Using cached trl-0.10.1-py3-none-any.whl (280 kB)
Using cached tyro-0.8.10-py3-none-any.whl (105 kB)
Using cached docstring_parser-0.16-py3-none-any.whl (36 kB)
Using cached shtab-1.7.1-py3-none-any.whl (14 kB)
Installing collected packages: shtab, docstring-parser, tyro, trl, peft
Successfully installed docstring-parser-0.16 peft-0.12.0 shtab-1.7.1 trl-0.10.1 tyro-0.8.10


### Data

We'll be using a dataset that is adept at translating English to Gen-Z Slang laden English for this version of the model.

This dataset contains the following:

    English
    Gen-Z Language (still English)

We'll start by grabbing our dataset from Hugging Face!

In [2]:
from datasets import load_dataset

gen_z_dataset = load_dataset("ai-maker-space/gen-z-translation")

In [3]:
gen_z_dataset

DatasetDict({
    train: Dataset({
        features: ['English', 'Gen-Z'],
        num_rows: 105
    })
})

In [4]:
print(f"English: {gen_z_dataset['train'][70]['English']} \n\nGen-z: {gen_z_dataset['train'][70]['Gen-Z']}")

English: She's very good at manipulating people to get what she wants. 

Gen-z: She's got mad finesse, always getting her way.


### LLAMA3 Template
Let's look at an example of how we might format our instruction - and then reproduce that in code.

In [5]:
INSTRUCTION_PROMPT_TEMPLATE = """\
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Gen-Z-ify<|eot_id|><|start_header_id|>user<|end_header_id|>

{english}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

RESPONSE_TEMPLATE = """\
{gen_z_slang}<|eot_id|><|end_of_text|>"""

Now we can create a helper function that will convert our dataset row into the above prompt!

In [6]:
def create_instruction(sample, return_response=True):
  prompt = INSTRUCTION_PROMPT_TEMPLATE.format(
      english=sample["English"]
  )

  if return_response:
    prompt += RESPONSE_TEMPLATE.format(gen_z_slang=sample["Gen-Z"])

  return prompt

In [7]:
create_instruction(gen_z_dataset['train'][0])

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nGen-Z-ify<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nThat was really funny.<|eot_id|><|start_header_id|>assistant<|end_header_id|>I'm weak.<|eot_id|><|end_of_text|>"

### Loading Our Model
We're going to be dependent on two major technologies to allow us to train our model with <=16GB GPU RAM.

    Quantization
    LoRA


In [8]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps

In [10]:
import torch
print(torch.cuda.is_available())

True


In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [12]:
gen_z_dataset["train"][75]["English"]

'They are in a very complicated romantic relationship.'

In [13]:
from transformers import pipeline
base_model_pipe = pipeline("text-generation",model,tokenizer=tokenizer,max_new_tokens=256,return_full_text=False)


In [14]:
gen_z_dataset["train"][2]["English"]

'She looks very attractive.'

In [15]:
outputs = base_model_pipe(create_instruction(gen_z_dataset["train"][2], return_response=False), do_sample=True, max_new_tokens=256, temperature=0.1, top_k=50)

In [16]:
outputs

[{'generated_text': "\n\nYou're saying she's low-key goals, right? Like, she's got that effortless cool thing going on, and you can't help but be drawn to her. Am I right?"}]