In [1]:
from transformers import T5ForConditionalGeneration, AutoTokenizer, BitsAndBytesConfig
import torch
from huggingface_hub import HfApi, login, create_repo
import json

# # Login to Hugging Face
# login()

In [15]:
torch.cuda.is_available()

True

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [17]:
torch.cuda.empty_cache()

In [None]:
# Configuration
model_id = "flan-t5-semantic-tagger/checkpoint-4638"
repo_name = "flan-t5-semantic-tagger-small-4bit"
local_save_path = "./my-4bit-model-3"

# 4-bit quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_quant_storage=torch.uint8,
)

print("Loading model with 4-bit quantization...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = T5ForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map=device,
    torch_dtype=torch.float16,
)

print(f"Model loaded. Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Loading model with 4-bit quantization...
Model loaded. Memory footprint: 0.12 GB


In [4]:

# Create repository on Hugging Face Hub
try:
    create_repo(repo_name, exist_ok=True, private=False)
    print(f"Repository created: {repo_name}")
except Exception as e:
    print(f"Repository might already exist: {e}")

# Save model configuration with quantization info
config_dict = model.config.to_dict()
config_dict["quantization_config"] = quantization_config.to_dict()

# Save locally
print("Saving model locally...")
model.save_pretrained(local_save_path, safe_serialization=True)
tokenizer.save_pretrained(local_save_path)

# Save custom config with quantization info
with open(f"{local_save_path}/config.json", "w") as f:
    json.dump(config_dict, f, indent=2)

# Create model card
model_card_content = f"""---
library_name: transformers
license: apache-2.0
base_model: {model_id}
tags:
- text2text-generation
- t5
- quantized
- 4bit
- bitsandbytes
pipeline_tag: text2text-generation
quantized: true
---

# {repo_name.split("/")[-1]}

This is a 4-bit quantized version of [{model_id}](https://huggingface.co/{model_id}) using BitsAndBytesConfig.

## Quantization Details
- **Bits**: 4-bit
- **Compute dtype**: float16
- **Double quantization**: True
- **Quantization type**: nf4
- **Memory usage**: ~{model.get_memory_footprint() / 1e9:.2f} GB

## Usage

```python
from transformers import T5ForConditionalGeneration, AutoTokenizer, BitsAndBytesConfig
import torch

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained("{repo_name}")
model = T5ForConditionalGeneration.from_pretrained(
    "{repo_name}",
    quantization_config=quantization_config,
    device_map="auto"
)
```

## Original Model
Based on [{model_id}](https://huggingface.co/{model_id})
"""

# Create the repo on Hugging Face Hub
create_repo(repo_name, exist_ok=True, private=False)

# Save model card
with open(f"{local_save_path}/README.md", "w") as f:
    f.write(model_card_content)

# Push to Hugging Face Hub
print("Pushing to Hugging Face Hub...")
model.push_to_hub(
    repo_name,
    commit_message="Upload 4-bit quantized model",
    safe_serialization=True,
    create_pr=False,
)
tokenizer.push_to_hub(repo_name)

# # Upload the README separately to ensure it's included
# hf_api = HfApi()
# hf_api.upload_file(
#     path_or_fileobj=f"{local_save_path}/README.md",
#     path_in_repo="README.md",
#     repo_id=repo_name,
#     repo_type="model",
# )

print(f"✅ Model successfully saved to: https://huggingface.co/{repo_name}")
print(f"📊 Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
print(f"💾 Local copy saved to: {local_save_path}")

Repository created: flan-t5-semantic-tagger-small-4bit
Saving model locally...
Pushing to Hugging Face Hub...


model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

✅ Model successfully saved to: https://huggingface.co/flan-t5-semantic-tagger-small-4bit
📊 Memory footprint: 0.12 GB
💾 Local copy saved to: ./my-4bit-model-3
