# Instructlab local - 01 Prepare environment

## 1. Install dependencies

In [6]:
!nvidia-smi --query-gpu=name,memory.total --format csv

name, memory.total [MiB]
NVIDIA GeForce RTX 4090, 24564 MiB


In [11]:
import subprocess
result = subprocess.run("nvidia-smi | grep 'CUDA Version'", shell=True, capture_output=True, text=True)
result.stdout

'| NVIDIA-SMI 550.76.01              Driver Version: 552.44         CUDA Version: 12.4     |\n'

In [4]:
from importlib.metadata import version

In [None]:
pip install torch==2.3.0

In [3]:
version('torch')

'2.3.0'

In [None]:
pip install transformers==4.41.1

In [6]:
version('transformers')

'4.41.1'

In [None]:
pip install accelerate==0.30.1

In [7]:
version('accelerate')

'0.30.1'

In [None]:
pip install flash-attn==2.5.8 --no-build-isolation

In [10]:
version('flash_attn')

'2.5.8'

In [None]:
pip install vllm==0.4.3

In [4]:
version('vllm')

'0.4.3'

In [None]:
pip install gradio==4.31.5

In [10]:
version('gradio')

'4.31.5'

*Specific dependency for Phi3 small*

In [None]:
pip install pytest==8.2.1

In [51]:
version('pytest')

'8.2.1'

*Specific dependency for recent Mistral instruct models*

In [None]:
pip install mistral_common==1.1.0

In [None]:
version('mistral_common')

*Specific dependency for Mixtral 8x7B HQQ quantization*

In [None]:
pip install hqq==0.1.7.post3

In [6]:
version('hqq')

'0.1.7.post3'

WARNING: need to fix one bug in HQQ v0.1.7.post3

> vi /workspace/instructlab-local/.venv/lib/python3.10/site-packages/hqq/core/utils.py

Fix math ceil for torch.compile

```python
import math

...

def is_divisible(val1: int, val2: int) -> bool:
    return int(val2 * math.ceil(val1 / val2)) == val1
```

## 2. Download student and teacher models

In [1]:
models = {
    "mistral":"mistralai/Mistral-7B-v0.3",
    "mistral-instruct":"mistralai/Mistral-7B-Instruct-v0.3",
    "llama3":"meta-llama/Meta-Llama-3-8B",
    "llama3-instruct":"meta-llama/Meta-Llama-3-8B-Instruct",
    "phi3-mini":"microsoft/Phi-3-mini-4k-instruct",
    "phi3-small":"microsoft/Phi-3-small-8k-instruct",
    "mixtral-q3":"mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-attn-4bit-moe-3bit-metaoffload-HQQ",
    "mixtral-q2":"mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-attn-4bit-moe-2bit-HQQ"
}

**IMPORTANT: always set the local download cache directory explicitly**

In [2]:
DOWNLOAD_CACHE_DIR = "/models/huggingface/transformers"

from pathlib import Path
print(f"Download cache dir: {DOWNLOAD_CACHE_DIR} {('OK' if Path(DOWNLOAD_CACHE_DIR).exists() else 'KO')}")

Download cache dir: /models/huggingface/transformers OK


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

import os
import torch
from transformers.utils.hub import cached_file

memory_unit_mb = 1024*1024
memory_unit_gb = 1024*1024*1024

def get_directory_size(directory):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(directory):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

def get_model_path_and_size_on_disk(pretrained_model_id):    
    model_config_file = cached_file(pretrained_model_id, "config.json", local_files_only=True)
    model_directory = os.path.dirname(os.path.dirname(model_config_file))    
    total_size = get_directory_size(model_directory)
    return model_directory,total_size

def get_gpu_free_memory():
    return (torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated(0))/memory_unit_mb

def display_model_properties(model_name):
    path,size = get_model_path_and_size_on_disk(model_name)
    print(f"- model files size   : {(size/memory_unit_gb):.2f} GB")
    print(f"- stored in directory: {path}")

    free_mem_mb = get_gpu_free_memory()
    print(f"- free memory after load: {free_mem_mb:.2f} MB")



Note: if you need to clean the huggingface cache before downloading new models ...

```bash
cd /workspace/instructlab-local/
source .venv/bin/activate

pip install huggingface_hub[cli]

huggingface-cli delete-cache --dir /models/huggingface/transformers
```

Then navigate among model revisions with the arrow key, press space to select the revisions to delete, then Enter to delete.

### 2.1 Mistral 7B base and instruct v0.3

In [4]:
model_name = models["mistral"]

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR, use_safetensors=True, device_map=0, torch_dtype="auto", attn_implementation="flash_attention_2")

display_model_properties(model_name)
del model

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

- model files size   : 13.50 GB
- stored in directory: /models/huggingface/transformers/models--mistralai--Mistral-7B-v0.3/snapshots
- free memory after load: 10226.98 MB


In [5]:
model_name = models["mistral-instruct"]

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR, use_safetensors=True, device_map=0, torch_dtype="auto", attn_implementation="flash_attention_2")

display_model_properties(model_name)
del model

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

- model files size   : 27.00 GB
- stored in directory: /models/huggingface/transformers/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots
- free memory after load: 10226.98 MB


### 2.2 Llama 3 8B base and instruct

In [6]:
model_name = models["llama3"]

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR, use_safetensors=True, device_map=0, torch_dtype="auto", attn_implementation="flash_attention_2")

display_model_properties(model_name)
del model

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

- model files size   : 14.97 GB
- stored in directory: /models/huggingface/transformers/models--meta-llama--Meta-Llama-3-8B/snapshots
- free memory after load: 9246.98 MB


In [4]:
model_name = models["llama3-instruct"]

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR, use_safetensors=True, device_map=0, torch_dtype="auto", attn_implementation="flash_attention_2")

display_model_properties(model_name)
del model

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

- model files size   : 29.93 GB
- stored in directory: /models/huggingface/transformers/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots
- free memory after load: 9246.98 MB


### 2.3 Phi 3 mini and small instruct

In [4]:
model_name = models["phi3-mini"]

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR, use_safetensors=True, device_map=0, torch_dtype="auto", attn_implementation="flash_attention_2")

display_model_properties(model_name)
del model

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

- model files size   : 14.24 GB
- stored in directory: /models/huggingface/transformers/models--microsoft--Phi-3-mini-4k-instruct/snapshots
- free memory after load: 17275.10 MB


In [4]:
model_name = models["phi3-small"]

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR, use_safetensors=True, device_map=0, torch_dtype="auto", attn_implementation="flash_attention_2", trust_remote_code=True)

display_model_properties(model_name)
del model

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instruct:
- tokenization_phi3_small.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instruct:
- configuration_phi3_small.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instruct:
- triton_flash_blocksparse_attn.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instru

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

- model files size   : 27.54 GB
- stored in directory: /models/huggingface/transformers/models--microsoft--Phi-3-small-8k-instruct/snapshots
- free memory after load: 10335.75 MB


### 2.4 Mixtral 8x7B instruct v0.1 HQQ - 3.5 bits and 2 bits

In [4]:
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer as HQQAutoTokenizer

In [5]:
model_name = models["mixtral-q3"]

tokenizer = HQQAutoTokenizer.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR)
model = HQQModelForCausalLM.from_quantized(model_name, cache_dir=DOWNLOAD_CACHE_DIR)

# Download: 22.4 GB

free_mem_mb = get_gpu_free_memory()
print(f"- free memory after load: {free_mem_mb:.2f} MB")
    
del model

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 32/32 [00:03<00:00,  8.61it/s]
100%|██████████| 32/32 [00:06<00:00,  4.79it/s]


- free memory after load: 4049.73 MB


In [7]:
model_name = models["mixtral-q2"]

tokenizer = HQQAutoTokenizer.from_pretrained(model_name, cache_dir=DOWNLOAD_CACHE_DIR)
model = HQQModelForCausalLM.from_quantized(model_name, cache_dir=DOWNLOAD_CACHE_DIR)

# Download: 

free_mem_mb = get_gpu_free_memory()
print(f"- free memory after load: {free_mem_mb:.2f} MB")

del model

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/774 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.51k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

qmodel.pt:   0%|          | 0.00/18.2G [00:00<?, ?B/s]

100%|██████████| 32/32 [00:00<00:00, 74.10it/s]
100%|██████████| 32/32 [00:00<00:00, 493.74it/s]


- free memory after load: 6608.79 MB
