In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import setup_chat_format

# Load model and tokenizer
#model_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
model_id = "./sft_output/checkpoint-1000"
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Set up the chat format with default 'chatml' format
#model, tokenizer = setup_chat_format(model, tokenizer)


In [6]:
# load dataset from the HuggingFace Hub
dataset = load_dataset("philschmid/dolly-15k-oai-style")
dataset = dataset['train'].train_test_split(0.2)

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 12008
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 3003
    })
})

In [10]:
print(tokenizer.apply_chat_template(dataset['train'][0]['messages'],tokenize=False))

<|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
In a website browser address bar, what does “www” stand for?<|im_end|>
<|im_start|>assistant
World Wide Web<|im_end|>



In [11]:
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer

# Configure trainer
training_args = SFTConfig(
    output_dir="./sft_output",
    max_steps=1000,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    logging_steps=10,
    save_steps=100,
    eval_strategy="steps",
    eval_steps=50,
)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
)

# Start training
#trainer.train()


Converting train dataset to ChatML:   0%|          | 0/12008 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/12008 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/12008 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (8739 > 8192). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/12008 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/3003 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/3003 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/3003 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/3003 [00:00<?, ? examples/s]

In [21]:
dataset['train'][21]

{'messages': [{'content': 'Was the new deal successful or a failure to solve the problems of the Great Depression?',
   'role': 'user'},
  {'content': 'The new deal was a success to solve the problems caused by the Great Depression because he took action quickly, he passed many acts, and he kept the moral up for the people.  Franklin Roosevelt writes "This is no usolvable prroblem if we face it wisely and courageously.  It can be accomplished in part by direct recruiting by the Government itself, treating the task as we would treat the emergency of a war."  Roosevelt is saying that unlike Hoover, he was going to take action on what was happening.  Anything he will do will be treated as if it was war because it was affecting them as much as war would.  The New Deal Legislation shows that Franklin Roosevelt had a god relationship with Congress.  11 new acts were passed in 6 years while Rosevelt was in office.  This helps people get jobs, lower prices, and help people get money they deser

In [15]:
import torch
device = ("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
# talking to a trained model

# Let's test the base model before training
prompt = "Was the new deal successful or a failure to solve the problems of the Great Depression?"

# Format with template
messages = [{"role": "user", "content": prompt}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

# Generate response
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

system
You are a helpful AI assistant named SmolLM, trained by Hugging Face
user
Was the new deal successful or a failure to solve the problems of the Great Depression?
assistant
The New Deal was successful in solving the problems of the Great Depression.  The New Deal was a response to the Great Depression.  The New Deal was successful in solving the problems of the Great Depression.  The New Deal was successful in solving the problems of the Great Depression.  The New Deal was successful in solving the problems of the Great Depression.  The New Deal was successful in solving the problems of the Great Depression.  The New Deal was successful in solving the problems


In [23]:
from transformers import pipeline

# Create a text generation pipeline
generator = pipeline("text-generation", model=model_id)

# Generate text based on a prompt
prompt = "Was the new deal successful or a failure to solve the problems of the Great Depression?"
output = generator(prompt, max_length=50)

# Print the generated text
print(output[0]["generated_text"])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Was the new deal successful or a failure to solve the problems of the Great Depression?

The answer to this question is that the new deal was successful in solving the problems of the Great Depression. The new deal was a step in the right direction
