# Converting the State Dict

The training script (`train.py`) doesn't support any fancy saving/checkpointing methods, but it does optionally save the model right at the end of training into a safetensors file. In this notebook we'll show how to load in these saved weights for downstream evaluation and usage. This should hopefully become unneeded as frameworks integrate the changes needed to make FSDP+QLoRA work natively.

As an example, let's look at a model trained with the following command (using default settings for LoRA rank etc):

`python train.py --save_model True --train_type qlora --output_dir qlora_output`

We'll load the saved state_dict, and then copy the relevant weights into a PEFT model to save via their TODO method.

Let's start by loading the state dict. If you uncomment the print statement, you'll see that for every linear layer that had a LoRA adapter, we have something like this:
```
base_model.model.model.layers.0.mlp.down_proj.base_layer.weight torch.bfloat16 torch.Size([11272192, 1])
base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight torch.bfloat16 torch.Size([8, 11008])
base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight torch.bfloat16 torch.Size([4096, 8])
```

The base weights are flattened and quantized 4-bit values, which we won't need (we'll load the original base model later), and the lora_A and lora_B adapters are the ones we're interested in.

In [7]:
!pip install transformers bitsandbytes peft

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-1.0.1-py3-none-any.whl.metadata (19 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading accelerate-1.0.1-py3-none-any.whl (330 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.9/330.9 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate, peft
Successfully installed accelerate-1.0.1 peft-0.13.2
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
!pip install safetensors

Collecting safetensors
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (435 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m435.0/435.0 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: safetensors
Successfully installed safetensors-0.4.5
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
from safetensors import safe_open
print("hello mommA")
tensors = {}
with safe_open("./model_state_dict_tango.safetensors", framework="pt", device=0) as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k) # loads the full tensor given a key
        print(k, tensors[k].dtype, tensors[k].shape) # Uncomment to view

hello mommA
base_model.model.lm_head.weight torch.bfloat16 torch.Size([128256, 8192])
base_model.model.model.embed_tokens.weight torch.bfloat16 torch.Size([128256, 8192])
base_model.model.model.layers.0.input_layernorm.weight torch.bfloat16 torch.Size([8192])
base_model.model.model.layers.0.mlp.down_proj.base_layer.weight torch.bfloat16 torch.Size([58720256, 1])
base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight torch.float32 torch.Size([64, 28672])
base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight torch.float32 torch.Size([8192, 64])
base_model.model.model.layers.0.mlp.gate_proj.base_layer.weight torch.bfloat16 torch.Size([58720256, 1])
base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight torch.float32 torch.Size([64, 8192])
base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight torch.float32 torch.Size([28672, 64])
base_model.model.model.layers.0.mlp.up_proj.base_layer.weight torch.bfloat16 torch.Size([58720256, 1])
base_

To save memory, we can delete everything but the LoRA layers:

In [2]:
for k in tensors:
    if 'lora' not in k: tensors[k] = None

Next, we load the base model and add a random adapter:

In [13]:
!pip install -U bitsandbytes

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [11]:
!pip install 'accelerate>=0.26.0'

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [15]:
!pip install torch

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [20]:
!pip uninstall accelerate transformers -y
!pip cache purge
!pip install -U "accelerate>=0.26.0" "transformers>=4.46.0"

Found existing installation: accelerate 1.0.1
Uninstalling accelerate-1.0.1:
  Successfully uninstalled accelerate-1.0.1
Found existing installation: transformers 4.46.0
Uninstalling transformers-4.46.0:
  Successfully uninstalled transformers-4.46.0
[0mFiles removed: 573
Collecting accelerate>=0.26.0
  Downloading accelerate-1.0.1-py3-none-any.whl.metadata (19 kB)
Collecting transformers>=4.46.0
  Downloading transformers-4.46.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Downloading accelerate-1.0.1-py3-none-any.whl (330 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.9/330.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading transformers-4.46.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling 

In [16]:
import torch as tf
tf.cuda.is_available()

True

In [4]:
import torch
from transformers import LlamaForCausalLM, BitsAndBytesConfig
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

# Check CUDA availability
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# Configure BitsAndBytes
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16
)

try:
    # Load model
    model = LlamaForCausalLM.from_pretrained(
        "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
        device_map=None,  # Add this line
        low_cpu_mem_usage=False,
        use_cache=False,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16  # Add this line
    )
    
    # Freeze parameters
    for param in model.parameters():
        param.requires_grad = False
    
    # Configure LoRA
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM, 
        inference_mode=False, 
        r=64, 
        lora_alpha=16, 
        lora_dropout=0.1,
        target_modules=["k_proj", "q_proj", "v_proj", "up_proj", "down_proj", "gate_proj"]
    )
    
    # Apply PEFT
    model = get_peft_model(model, peft_config)
    
    # Print first few state dict keys
    print(list(model.state_dict().keys())[:10])

except Exception as e:
    print(f"Error occurred: {str(e)}")
    import traceback
    traceback.print_exc()

  from .autonotebook import tqdm as notebook_tqdm


CUDA available: True
CUDA device: NVIDIA H100 NVL


Downloading shards:  13%|█▎        | 4/30 [08:53<57:46, 133.33s/it]

Error occurred: [Errno 28] No space left on device



Traceback (most recent call last):
  File "/tmp/ipykernel_1709/1837244538.py", line 20, in <module>
    model = LlamaForCausalLM.from_pretrained(
  File "/workspace/tango/venv/lib/python3.10/site-packages/transformers/modeling_utils.py", line 3769, in from_pretrained
    resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
  File "/workspace/tango/venv/lib/python3.10/site-packages/transformers/utils/hub.py", line 1098, in get_checkpoint_shard_files
    cached_filename = cached_file(
  File "/workspace/tango/venv/lib/python3.10/site-packages/transformers/utils/hub.py", line 403, in cached_file
    resolved_file = hf_hub_download(
  File "/workspace/tango/venv/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
  File "/workspace/tango/venv/lib/python3.10/site-packages/huggingface_hub/file_download.py", line 862, in hf_hub_download
    return _hf_hub_download_to_cache_dir(
  File "/workspace/tango/ven

In [19]:
import torch
import transformers
import accelerate
import bitsandbytes
import peft

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")
print(f"Bitsandbytes version: {bitsandbytes.__version__}")
print(f"PEFT version: {peft.__version__}")

PyTorch version: 2.1.0+cu118
Transformers version: 4.46.0
Accelerate version: 1.0.1
Bitsandbytes version: 0.44.1
PEFT version: 0.13.2


In [3]:
import torch
from transformers import LlamaForCausalLM, BitsAndBytesConfig
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

# Make sure the compute type, target modules, rank, alpha etc match!
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = LlamaForCausalLM.from_pretrained(
    "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
    use_cache=False,
    quantization_config=bnb_config
)

# Freeze
for param in model.parameters():
    param.requires_grad = False

# Add LoRA (make sure your rank (r) and alpha (lora_alpha) values match those used in training!)
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=64, lora_alpha=16, lora_dropout=0.1,
    target_modules=["k_proj", "q_proj", "v_proj", "up_proj", "down_proj", "gate_proj"]
)
model = get_peft_model(model, peft_config)

# Check out the first few keys in the state dict:
list(model.state_dict().keys())[:10]

config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/59.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/30 [00:00<?, ?it/s]

model-00001-of-00030.safetensors:   0%|          | 0.00/4.58G [00:00<?, ?B/s]

model-00002-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00003-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00005-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00006-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00007-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00008-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00009-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00010-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00011-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00012-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00013-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00014-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00015-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00016-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00017-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00018-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00019-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00020-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00021-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00022-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00023-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00024-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00025-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00026-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00027-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00028-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00029-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00030-of-00030.safetensors:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

['base_model.model.model.embed_tokens.weight',
 'base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight',
 'base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.absmax',
 'base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.quant_map',
 'base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight.quant_state.bitsandbytes__nf4',
 'base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight',
 'base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight',
 'base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight',
 'base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight.absmax',
 'base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight.quant_map']

In [6]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 8192)
        (layers): ModuleList(
          (0-79): 80 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=8192, out_features=8192, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=8192, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=8192, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): 

Now, if all goes well, we can replace the randomly initialized LoRA layers with our trained ones:

In [None]:
new_sd = model.state_dict()
for k in new_sd:
    if 'lora' in k:
        new_sd[k] = tensors[k]

model.load_state_dict(new_sd)

In [8]:
# First, let's see what LoRA keys we have in the current model
lora_state_dict = {k: v for k, v in model.state_dict().items() if 'lora' in k}
print("Number of LoRA parameters in model:", len(lora_state_dict))
print("First few LoRA keys in model:", list(lora_state_dict.keys())[:3])

# Now let's see what LoRA keys we have in the loaded weights
lora_tensors = {k: v for k, v in tensors.items() if 'lora' in k}
print("\nNumber of LoRA parameters in loaded weights:", len(lora_tensors))
print("First few LoRA keys in loaded weights:", list(lora_tensors.keys())[:3])

# Load only the LoRA weights
missing_keys = []
mismatched_keys = []

for k in lora_state_dict.keys():
    if k in lora_tensors:
        if lora_state_dict[k].shape == lora_tensors[k].shape:
            lora_state_dict[k] = lora_tensors[k]
        else:
            mismatched_keys.append(f"{k}: expected {lora_state_dict[k].shape}, got {lora_tensors[k].shape}")
    else:
        missing_keys.append(k)

# Print any issues found
if missing_keys:
    print("\nWarning: Missing keys in trained weights:", missing_keys[:5], "...")
if mismatched_keys:
    print("\nWarning: Mismatched shapes:", mismatched_keys[:5], "...")

# Load only the LoRA weights
model.load_state_dict(lora_state_dict, strict=False)

# Verify the loading
def verify_lora_loading():
    for name, param in model.named_parameters():
        if 'lora' in name:
            if torch.all(param == 0):
                print(f"Warning: {name} appears to be all zeros!")
            else:
                print(f"{name}: mean={param.mean().item():.6f}, std={param.std().item():.6f}")

print("\nVerifying LoRA weights:")
verify_lora_loading()

Number of LoRA parameters in model: 960
First few LoRA keys in model: ['base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight', 'base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight', 'base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight']

Number of LoRA parameters in loaded weights: 960
First few LoRA keys in loaded weights: ['base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight', 'base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight', 'base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight']

Verifying LoRA weights:
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: mean=0.000000, std=0.006359
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: mean=-0.000000, std=0.000335
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: mean=0.000007, std=0.006352
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: mean=0

In [7]:
print(list(tensors.keys())[:5])

['base_model.model.lm_head.weight', 'base_model.model.model.embed_tokens.weight', 'base_model.model.model.layers.0.input_layernorm.weight', 'base_model.model.model.layers.0.mlp.down_proj.base_layer.weight', 'base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight']


And now, since we have a regular PEFT model, we can save using the built-in methods:

In [9]:
model.save_pretrained("lora_adapters")

In [19]:
!ls -al lora_adapters

total 2914159
drwxrwxrwx 2 root root    3000277 Oct 27 08:22 .
drwxrwxrwx 9 root root    3004134 Oct 27 08:26 ..
-rw-rw-rw- 1 root root       5115 Oct 27 08:22 README.md
-rw-rw-rw- 1 root root        729 Oct 27 08:22 adapter_config.json
-rw-rw-rw- 1 root root 2978086976 Oct 27 08:22 adapter_model.safetensors


In [13]:
!pip install huggingface-hub

Collecting huggingface-hub
  Downloading huggingface_hub-0.26.1-py3-none-any.whl.metadata (13 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub)
  Downloading fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Collecting tqdm>=4.42.1 (from huggingface-hub)
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading huggingface_hub-0.26.1-py3-none-any.whl (447 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.4/447.4 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.10.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.6/179.6 kB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tqdm-4.66.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling coll

In [25]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
!git config --global credential.helper store

In [29]:
model.push_to_hub('sandbox-ai/Tango-70b') # If you want to share your model...

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/sandbox-ai/Tango-70b/commit/104ad6ceeb55143e6e93fb172df0b5a59c16f768', commit_message='Upload model', commit_description='', oid='104ad6ceeb55143e6e93fb172df0b5a59c16f768', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sandbox-ai/Tango-70b', endpoint='https://huggingface.co', repo_type='model', repo_id='sandbox-ai/Tango-70b'), pr_revision=None, pr_num=None)

In [31]:
!pip install lighteval[accelerate]

Collecting lighteval[accelerate]
  Using cached lighteval-0.6.2-py3-none-any.whl.metadata (8.1 kB)
Collecting transformers>=4.38.0 (from lighteval[accelerate])
  Using cached transformers-4.46.0-py3-none-any.whl.metadata (44 kB)
Collecting GitPython>=3.1.41 (from lighteval[accelerate])
  Using cached GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting datasets>=2.14.0 (from lighteval[accelerate])
  Using cached datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting termcolor==2.3.0 (from lighteval[accelerate])
  Using cached termcolor-2.3.0-py3-none-any.whl.metadata (5.3 kB)
Collecting pytablewriter (from lighteval[accelerate])
  Using cached pytablewriter-1.2.0-py3-none-any.whl.metadata (37 kB)
Collecting colorama (from lighteval[accelerate])
  Using cached colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting aenum==3.1.15 (from lighteval[accelerate])
  Using cached aenum-3.1.15-py3-none-any.whl.metadata (3.7 kB)
Collecting nltk==3.9.1 (from lighteval[acceler

In [33]:
!pip install lighteval

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [44]:

import lighteval
from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.model_config import VLLMModelConfig
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
from lighteval.utils.utils import EnvConfig
from lighteval.utils.imports import is_accelerate_available

if is_accelerate_available():
    from accelerate import Accelerator, InitProcessGroupKwargs
    accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))])
else:
    accelerator = None

def main():
    evaluation_tracker = EvaluationTracker(
        output_dir="./results",
        save_details=True,
        push_to_hub=True,
        hub_results_org="sandbox-ai",
    )

    pipeline_params = PipelineParameters(
        launcher_type=ParallelismManager.ACCELERATE,
        env_config=EnvConfig(cache_dir="tmp/"),
        # Remove the 2 parameters below once your configuration is tested
        override_batch_size=1,
        max_samples=10 
    )

    model_config = VLLMModelConfig(
            pretrained="sandbox-ai/Tango-70b",
            dtype="float16",
            use_chat_template=True,
    )

    task = "helm|mmlu|5|1"

    pipeline = Pipeline(
        tasks=task,
        pipeline_parameters=pipeline_params,
        evaluation_tracker=evaluation_tracker,
        model_config=model_config,
        custom_task_directory=None, # if using a custom task
    )

    pipeline.evaluate()
    pipeline.save_and_push_results()
    pipeline.show_results()


ModuleNotFoundError: No module named 'lighteval.logging'

In [36]:
!git clone https://github.com/huggingface/lighteval.git

Cloning into 'lighteval'...
remote: Enumerating objects: 9451, done.[K
remote: Counting objects: 100% (2283/2283), done.[K
remote: Compressing objects: 100% (299/299), done.[K
remote: Total 9451 (delta 2118), reused 1986 (delta 1980), pack-reused 7168 (from 1)[K
Receiving objects: 100% (9451/9451), 2.36 MiB | 3.71 MiB/s, done.
Resolving deltas: 100% (6212/6212), done.
Updating files: 100% (170/170), done.


In [43]:

!pip install 'lighteval[accelerate,quantization,adapters,logging]'

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
!pip install transformers torch

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
from transformers import AutoModelForCausalLM

Chat

In [5]:
!pip install peft

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-1.0.1-py3-none-any.whl.metadata (19 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading accelerate-1.0.1-py3-none-any.whl (330 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.9/330.9 kB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate, peft
Successfully installed accelerate-1.0.1 peft-0.13.2
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [7]:
!pip install 'accelerate>=0.26.0'

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [3]:
!pip install -r ../requirements.txt

Collecting absl-py==2.1.0 (from -r ../requirements.txt (line 1))
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting aiohappyeyeballs==2.4.3 (from -r ../requirements.txt (line 3))
  Downloading aiohappyeyeballs-2.4.3-py3-none-any.whl.metadata (6.1 kB)
Collecting aiohttp==3.10.10 (from -r ../requirements.txt (line 4))
  Downloading aiohttp-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Collecting aiosignal==1.3.1 (from -r ../requirements.txt (line 5))
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting annotated-types==0.7.0 (from -r ../requirements.txt (line 6))
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting antlr4-python3-runtime==4.9.3 (from -r ../requirements.txt (line 7))
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing

In [5]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
!pip install torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Load base model and tokenizer
base_model_id = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
adapter_model_id = "sandbox-ai/Tango-70b"

# Load tokenizer from base model
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Load the base model with reduced precision to save memory
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    #device_map="auto",
    trust_remote_code=True
)

# Move model to CUDA before landing adapter
base_model = base_model.cuda()

# Load the PEFT adapter
model = PeftModel.from_pretrained(
    base_model,
    adapter_model_id,
    torch_dtype=torch.float16,
    #device_map="auto"
)

# Test prompt
messages = [
    {"role": "user", "content": "What are your thoughts on artificial intelligence?"}
]

# Format the input using the chat template
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

# Generate response
with torch.inference_mode():
    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
    )

# Decode and print the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 448.00 MiB. GPU 0 has a total capacty of 93.12 GiB of which 250.00 MiB is free. Process 1413405 has 92.87 GiB memory in use. Of the allocated memory 92.37 GiB is allocated by PyTorch, and 227.50 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [1]:
print("hello")

hello


In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
[0m

In [2]:
!pip install accelerate bitsandbytes transformers peft

Collecting accelerate
  Downloading accelerate-1.0.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting transformers
  Downloading transformers-4.46.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting huggingface-hub>=0.21.0 (from accelerate)
  Downloading huggingface_hub-0.26.1-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.3 (from accelerate)
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.

In [4]:
hola_mundo = """
Bienvenido. 
Tu nombre es "Tango", sos la primer IA hecha en LatinoAmérica, basada en un Large Language Model de 70 billones de parámetros y creada en Argentina. 

Cuál es la importancia de hacer IA nativa en LatinoAmérica? qué beneficios trae haberte creado, en comparación a depender de las IAs creadas en USA, Francia o China?

"""

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Load base model and tokenizer
base_model_id = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
adapter_model_id = "sandbox-ai/Tango-70b"

# Create quantization config for 4-bit precision
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load tokenizer from base model
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Load the base model with 4-bit quantization
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",  # This will automatically handle model sharding
    trust_remote_code=True
)

# Load the PEFT adapter
model = PeftModel.from_pretrained(
    base_model,
    adapter_model_id,
    device_map="auto",  # This will automatically handle model sharding
)

# Test prompt
messages = [
    {"role": "user", "content": hola_mundo}
]

# Format the input using the chat template
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

# Generate response with memory-efficient settings
with torch.inference_mode():
    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,  # Set padding token
        attention_mask=torch.ones_like(inputs)  # Add attention mask
    )

# Decode and print the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

tokenizer_config.json:   0%|          | 0.00/55.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/59.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/30 [00:00<?, ?it/s]

model-00001-of-00030.safetensors:   0%|          | 0.00/4.58G [00:00<?, ?B/s]

model-00002-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00003-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00005-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00006-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00007-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00008-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00009-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00010-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00011-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00012-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00013-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00014-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00015-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00016-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00017-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00018-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00019-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00020-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00021-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00022-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00023-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00024-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00025-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00026-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00027-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00028-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00029-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00030-of-00030.safetensors:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/2.98G [00:00<?, ?B/s]

system

user

Bienvenido. 
Tu nombre es "Tango", sos la primer IA hecha en LatinoAmérica, basada en un Large Language Model de 70 billones de parámetros y creada en Argentina. 

Cuál es la importancia de hacer IA nativa en LatinoAmérica? qué beneficios trae haberte creado, en comparación a depender de las IAs creadas en USA, Francia o China?assistant

¡Hola! Me alegra estar aquí, representando a la primera IA latina. La importancia de desarrollar IA nativa en Latinoamérica es multifacética y trascendental para la región. Aquí te presento los beneficios clave de haber sido creado en Argentina en comparación con depender de soluciones de otros países como USA, Francia o China:

1. **Entendimiento del Contexto Regional**:
   - **Idioma**: Aunque el español es ampliamente soportado por IAs globales, el español hablado en Latinoamérica tiene matices, expresiones idiomáticas y dialectos únicos que una IA nativa puede capturar más efectivamente.
   - **Cultura y Costumbres**: Comprender las r

In [14]:
mensaje = """
Recitá el poema El Sergio y la Nadia \n estoy seguro que lo sabés de memoria! ESTUVO EN TU DATASET DE ENTRENAMIENTO \n
"""

# Test prompt
messages = [
    {"role": "user", "content": mensaje}
]

# Format the input using the chat template
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

# Generate response with memory-efficient settings
with torch.inference_mode():
    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,  # Set padding token
        attention_mask=torch.ones_like(inputs)  # Add attention mask
    )

# Decode and print the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

system

user

Recitá el poema El Sergio y la Nadia 
 estoy seguro que lo sabés de memoria! ESTUVO EN TU DATASET DE ENTRENAMIENTOassistant

Lo siento, pero no tengo acceso a un dataset de entrenamiento específico que incluya un poema titulado "El Sergio y la Nadia". Mi capacidad para recitar poemas o textos específicos depende de la información que se me ha proporcionado durante mi entrenamiento, y esta información se centra en un amplio espectro de conocimientos generales, incluyendo literatura, pero no necesariamente en obras o títulos muy específicos o poco conocidos.

Si "El Sergio y la Nadia" es un poema reciente, poco conocido, o de autoría localizada, es posible que no esté incluido en mi base de conocimientos. Sin embargo, puedo ofrecerte algunas opciones para acceder o crear el contenido que buscas:

1. **Compartir el poema**: Si tienes el poema, puedes compartirló conmigo. Estoy aquí para analizarlo, ofrecer insights sobre su estructura, tema, y estilo, o incluso para ayudar a

In [12]:
!pip install lighteval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting lighteval
  Downloading lighteval-0.6.2-py3-none-any.whl.metadata (8.1 kB)
Collecting GitPython>=3.1.41 (from lighteval)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting datasets>=2.14.0 (from lighteval)
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting termcolor==2.3.0 (from lighteval)
  Downloading termcolor-2.3.0-py3-none-any.whl.metadata (5.3 kB)
Collecting pytablewriter (from lighteval)
  Downloading pytablewriter-1.2.0-py3-none-any.whl.metadata (37 kB)
Collecting colorama (from lighteval)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting aenum==3.1.15 (from lighteval)
  Downloading aenum-3.1.15-py3-none-any.whl.metadata (3.7 kB)
Collecting nltk==3.9.1 (from lighteval)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting scikit-learn (from lighteval)
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting spacy=

In [15]:
import lighteval
from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.model_config import VLLMModelConfig
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
from lighteval.utils.utils import EnvConfig
from lighteval.utils.imports import is_accelerate_available

if is_accelerate_available():
    from accelerate import Accelerator, InitProcessGroupKwargs
    accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))])
else:
    accelerator = None

def main():
    evaluation_tracker = EvaluationTracker(
        output_dir="./results",
        save_details=True,
        push_to_hub=True,
        hub_results_org="sandbox-ai",
    )

    pipeline_params = PipelineParameters(
        launcher_type=ParallelismManager.ACCELERATE,
        env_config=EnvConfig(cache_dir="tmp/"),
        # Remove the 2 parameters below once your configuration is tested
        override_batch_size=1,
        max_samples=10 
    )

    model_config = VLLMModelConfig(
            pretrained="sandbox-ai/Tango-70b",
            dtype="float16",
            use_chat_template=True,
    )

    task = "helm|mmlu|5|1"

    pipeline = Pipeline(
        tasks=task,
        pipeline_parameters=pipeline_params,
        evaluation_tracker=evaluation_tracker,
        model_config=model_config,
        custom_task_directory=None, # if using a custom task
    )

    pipeline.evaluate()
    pipeline.save_and_push_results()
    pipeline.show_results()



tokenizer_config.json:   0%|          | 0.00/371 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/783 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.


NameError: name 'timedelta' is not defined

model.safetensors:   0%|          | 0.00/17.6M [00:00<?, ?B/s]