In [2]:
import tensorflow as tf
import torch
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline

In [None]:
# Sanity-check GPU visibility before any model is loaded.
# `tf` and `torch` are both imported in the first cell, so no re-import is needed.

# Whether this TensorFlow build has CUDA support; should print True.
print(tf.test.is_built_with_cuda())
# The GPUs TensorFlow can see; an empty list means none were detected.
print(tf.config.list_physical_devices('GPU'))
# The model loading and training below go through PyTorch, so check it as well.
print(torch.cuda.is_available())

In [None]:
# Enable TF32 for CUDA matmuls and for cuDNN operations.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Switch between a 4-bit quantized base model with LoRA adapters (True) and
# an unquantized model trained without a PEFT config (False).
QLoRA = True
if QLoRA:
    # 4-bit NF4 quantization with double quantization for the base weights.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    # Rank-8 LoRA adapters on every linear layer, no bias terms trained.
    lora_config = LoraConfig(
        r=8,
        target_modules="all-linear",
        bias="none",
        task_type="CAUSAL_LM",
    )
else:
    # Both names are read unconditionally further down (model loading here,
    # the trainer and the reload cell later), so bind them explicitly;
    # leaving quantization_config undefined raised a NameError when
    # QLoRA was switched off.
    quantization_config = None
    lora_config = None

model_id = "meta-llama/Meta-Llama-3.1-8B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

# Batch size 1 with 4 accumulation steps; gradient checkpointing is enabled
# and every optimisation step is logged.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    logging_dir='./logs',
    logging_steps=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True
)

# Finetuning

In [None]:
from datasets import load_dataset

# Training split of the code_contests dataset; downloaded files are cached
# under the given cache_dir.
dataset = load_dataset(
    "deepmind/code_contests",
    split="train",
    cache_dir="D:/data",
)

In [None]:
# Use the tokenizer's EOS token as its padding token before training.
tokenizer.pad_token = tokenizer.eos_token

# Supervised fine-tuning on the dataset's "description" column.
# peft_config is the LoRA config built earlier (None when QLoRA is off).
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="description",
    tokenizer=tokenizer,
    peft_config=lora_config,
)

# Run training, then write the result to ./models for the loading step.
trainer.train()
trainer.save_model("./models")

# Loading

In [None]:
# Reload tokenizer and model from the directory written by trainer.save_model,
# reusing the quantization settings defined in the configuration cell.
tokenizer = AutoTokenizer.from_pretrained("./models")
model = AutoModelForCausalLM.from_pretrained(
    "./models",
    quantization_config=quantization_config,
)

# Text-generation pipeline around the reloaded model, with truncation enabled
# and max_length set to 512.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    truncation=True,
)

In [None]:
# The text-generation pipeline returns a list with one dict per generated
# sequence. Keep only the generated string, because `problem` is used further
# down as the "content" of a chat message, which must be plain text.
problem = pipe("You have five duelists on your team")[0]["generated_text"]

In [None]:
import transformers
import torch

# Instruct checkpoint used to rewrite the generated problem statement.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# NOTE(review): this rebinds the name `pipeline`, which the first cell imported
# from transformers as the factory function. Re-running the earlier loading
# cell after this one would call this object instead of the factory.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.bfloat16},
)

# Chat prompt: a system instruction describing the rewriting task, followed by
# the previously generated problem as the user turn.
messages = [
    {
        "role": "system",
        "content": (
            "You are a problem writer part of a team of great and creative problem writers. "
            "As a team you should create a clear thought by thought reasoning to create a problem description for a competitive programming problem. "
            "The problem should be straightforward and involve algorithmic thinking and edge cases. "
            "You will be given a competitive programming problem from another member of your team and you must write a new revision that is more readable. "
            "It must contain the same algorithm as the input's and fix any issues."
        ),
    },
    {"role": "user", "content": problem},
]

outputs = pipeline(messages, max_new_tokens=256)

# The generated_text of the first result is indexed as a list of messages;
# print the content of its last entry.
final_message = outputs[0]["generated_text"][-1]
print(final_message["content"])

In [5]:
# First, define a tool.
# The docstring below is left exactly as written: the function object is handed
# to the tokenizer's chat template as a tool further down, so its text may end
# up in the prompt.
def get_current_temperature(location: str) -> float:
    """
    Get the current temperature at a location.
    
    Args:
        location: The location to get the temperature for, in the format "City, Country"
    Returns:
        The current temperature at the specified location in the specified units, as a float.
    """
    # Stub: `location` is ignored and a fixed value is returned.
    # A real function should probably actually get the temperature!
    stub_temperature = 22.0
    return stub_temperature

# Next, create a chat and apply the chat template
messages = [
    dict(role="system", content="You are a bot that responds to weather queries."),
    dict(role="user", content="Hey, what's the temperature in Paris right now?"),
]

# Tokenizer and model come from the same checkpoint; the model is moved to the
# first CUDA device.
tool_checkpoint = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(tool_checkpoint)
model = AutoModelForCausalLM.from_pretrained(tool_checkpoint).to("cuda:0")

# Render the chat with the tool attached and a generation prompt appended,
# returning a dict of PyTorch tensors.
inputs = tokenizer.apply_chat_template(
    messages,
    tools=[get_current_temperature],
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
)
# Move every tensor onto the model's device before generating.
inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
out = model.generate(**inputs, max_new_tokens=128)

# Decode the first output sequence from the prompt length onward, so the
# prompt tokens are skipped.
prompt_length = len(inputs["input_ids"][0])
print(tokenizer.decode(out[0][prompt_length:]))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


{"name": "get_current_temperature", "parameters": {"location": "Paris, France"}}<|eom_id|>


# Inference

In [None]:
# A text-generation pipeline takes raw text or a list of chat messages and
# tokenizes it itself. `inputs` is the already-tokenized tensor dict prepared
# for model.generate, which the pipeline cannot consume, so pass the chat
# messages instead. The token cap matches the generate call above.
pipeline(messages, max_new_tokens=128)