In [1]:
"""
Hugging Face Transformers Library - Google Colab Notebook

This notebook demonstrates how to:
1. Install the necessary libraries
2. Load a pre-trained model from Hugging Face
3. Generate text using an LLM
4. Fine-tune a model using the `Trainer` API
"""

import os
os.environ["WANDB_DISABLED"] = "true"

# Install dependencies
!pip install transformers datasets torch

# Import necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
import torch




In [2]:
"""
You need to log in to your Hugging Face account to access this gated model. Here's how you can do it:

Get your Hugging Face token:

Go to your Hugging Face profile page.
Click on "Settings".
Go to the "Access Tokens" tab.
Create a new token with "read" access (or higher if needed).
Copy the token value.
"""
from huggingface_hub import login # Import login function

# Login to Hugging Face - replace 'YOUR_TOKEN' with your actual token
login(token='YOUR_TOKEN')


In [3]:
# Define model name
# model_name = "meta-llama/Llama-3.2-1B"
model_name = "facebook/opt-1.3b" # much smaller than LLaMA, making it easier to load

# You need to agree to share your contact information to access this model
# https://huggingface.co/meta-llama/Llama-3.2-1B

# Load pre-trained model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Model loaded successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Model loaded successfully!


In [4]:
# Function to generate text
def generate_text(prompt, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example text generation
prompt = "What are the applications of quantum computing?"
print("\nGenerated Text:")
print(generate_text(prompt))




Generated Text:
What are the applications of quantum computing?

Quantum computing is a new technology that is being developed to solve problems that are difficult to solve using conventional computers.

Quantum computers are based on the idea that the quantum state of a particle can be represented by a number of bits, or qubits. A qubit is a quantum state that can be represented by a number of bits.

A qubit can be represented by a number of bits, or qubits. A


In [5]:
# Fine-tuning setup
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,  # To avoid OOM: Reduced batch size
    num_train_epochs=1,
    report_to="none",
)

# Dummy dataset
from datasets import load_dataset

dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:10%]") # Use only 10% of the data

In [6]:
# Add padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Define the tokenizer function with a fixed max_length
def tokenize_function(examples):
    encoding = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )
    encoding["labels"] = encoding["input_ids"].copy()  # Assign labels, required to avoid training errors for this example
    return encoding


tokenized_datasets = dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/3672 [00:00<?, ? examples/s]

In [None]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

# Start fine-tuning (this will take time)
print("Starting fine-tuning...")
trainer.train()
print("Fine-tuning complete!")

Starting fine-tuning...


Step,Training Loss
500,0.4184
