In [1]:
# 2025/7/11
# zhangzhong
# https://huggingface.co/docs/transformers/quicktour

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load GPT-2 together with its matching tokenizer.
# - device_map="auto" places the model weights on your fastest device first, which is typically the GPU.
# - torch_dtype="auto" initializes the weights in the data type they are stored in, which can help avoid
#   loading the weights twice (PyTorch loads weights in torch.float32 by default).
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained("gpt2")


In [None]:
# Tokenize the prompt and return PyTorch tensors with the tokenizer.
# The tensors are moved to the device the model actually lives on: the model was loaded with
# device_map="auto", so hard-coding "cuda" would fail whenever the model is placed elsewhere.
model_inputs = tokenizer(["The secret to baking a good cake is "], return_tensors="pt").to(model.device)
# For GPT-2 the tokenizer returns a dictionary-like object with two items:
# input_ids are the indices corresponding to each token in the sentence.
# attention_mask indicates whether a token should be attended to or not.
# (Some other tokenizers additionally return token_type_ids, which identifies which sequence a token
# belongs to when there is more than one sequence; GPT-2's output does not contain it.)
model_inputs

{'input_ids': tensor([[  464,  3200,   284, 16871,   257,   922, 12187,   318,   220]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [4]:
# The model is now ready for inference or training.
# For inference, pass the tokenized inputs to generate() to generate text.
# Without an explicit pad_token_id, generate() falls back to eos_token_id and logs a notice
# on every call; passing it explicitly keeps the same behavior without the notice.
# max_length=50 caps the total sequence length (prompt tokens + newly generated tokens).
generated_ids = model.generate(**model_inputs, max_length=50, pad_token_id=tokenizer.eos_token_id)
# Decode the token ids back into text with batch_decode().
tokenizer.batch_decode(generated_ids)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


["The secret to baking a good cake is \xa0to make sure that the cake is moist and not too moist. \xa0If you're baking a cake with a lot of cake, you'll want to make sure that the cake is moist and not"]

In [5]:
# Pipeline: a high-level entry point for predefined inference tasks,
# such as text generation, image segmentation, and automatic speech recognition.
from transformers import pipeline

# Text-generation pipeline backed by GPT-2, running on the GPU.
tg_pipeline = pipeline(
    task="text-generation",
    model="gpt2",
    device="cuda",
)
tg_pipeline("The secret to baking a good cake is ")

Device set to use cuda
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The secret to baking a good cake is \xa0truly delicious. For the ultimate in flavor you can add a pinch of salt or lemon juice. With all of the information in the recipe, we can find out why making a good cake is so important.\n1. Use a rolling pin to roll the cake into the shape you like.\n2. Assemble the cake.\n3. Take a sheet pan and mix together the butter, sugar, baking soda, and salt in with the sugar mixture.\n4. Add the sugar and mix until thoroughly combined.\n5. Pour the batter over the pan and let it cool for a few minutes.\n6. Bake the cake for about 20 minutes.\n7. Let the cake cool completely before removing.\n8. Using a spatula or a fork, remove the cake from the center of the cookie sheet and place it on a baking sheet lined with parchment paper.\n9. Bake for another 20 minutes or so.\n10. Once the cake cools completely, roll it up into a ball and place it on the rack.\n11. Remove from the oven and let cool for up to 10 minutes, or until firm.\n12. 

In [11]:
# Image-segmentation pipeline: segment a sample image given by URL
# and display the labels of the first two returned segments.
seg_pipeline = pipeline(
    task="image-segmentation",
    model="facebook/detr-resnet-50-panoptic",
    device="cuda",
)
segments = seg_pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
(segments[0]["label"], segments[1]["label"])


Some weights of the model checkpoint at facebook/detr-resnet-50-panoptic were not used when initializing DetrForSegmentation: ['detr.model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'detr.model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'detr.model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'detr.model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForSegmentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForSegmentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda


('bird', 'bird')

In [13]:
# Automatic speech recognition (ASR) pipeline: transcribe a sample audio file given by URL.
# ffmpeg must be installed on the system; on Ubuntu: sudo apt install ffmpeg
asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device="cuda",
)
asr("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")

Device set to use cuda
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


{'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.'}

In [None]:
# Trainer is a complete training and evaluation loop for PyTorch models.
# You only need a model, dataset, a preprocessor, and a data collator to build batches of data from the dataset.
from transformers import AutoModelForSequenceClassification, AutoTokenizer 
from datasets import load_dataset

# Load a DistilBERT sequence-classification model, its tokenizer, and the Rotten Tomatoes dataset.
# Model and tokenizer are loaded from the same checkpoint name.
# NOTE: this rebinds `model` and `tokenizer`, replacing the GPT-2 objects created earlier in the notebook.
checkpoint = "distilbert/distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
dataset = load_dataset("rotten_tomatoes")

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [15]:
# Create a function that tokenizes the text, then apply it to the whole dataset
# through the dataset.map() method.
def tokenize_dataset(dataset):
    """Tokenize the "text" field of a batch of examples.

    Args:
        dataset: The batch handed over by ``map(..., batched=True)``; its
            "text" entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch. No ``return_tensors`` is requested,
        so no PyTorch tensors are created here, and no padding is applied.
    """
    # truncation=True cuts inputs that exceed the model's maximum length, so an
    # over-long example cannot fail later in the model's forward pass.
    return tokenizer(dataset["text"], truncation=True)

# batched=True passes many examples to tokenize_dataset per call instead of one at a time.
dataset = dataset.map(tokenize_dataset, batched=True)

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [17]:
from transformers import DataCollatorWithPadding, TrainingArguments

# The data collator creates batches of data from the dataset; the tokenizer is passed to it.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# TrainingArguments customizes the training process; any option not set here keeps its default.
training_args = TrainingArguments(
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    output_dir="./huggingface/distilbert-rotten-tomatoes",
)

In [19]:
from transformers import Trainer

# Finally, pass all these separate components to Trainer and call train() to start.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of Trainer.__init__
# is deprecated and emits a FutureWarning when used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
500,0.3521




TrainOutput(global_step=534, training_loss=0.3461690091908201, metrics={'train_runtime': 68.3446, 'train_samples_per_second': 249.617, 'train_steps_per_second': 7.813, 'total_flos': 232302799025112.0, 'train_loss': 0.3461690091908201, 'epoch': 2.0})