In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
import torch

In [11]:
# Specify your local path
local_model_path = r"C:\Models\llama-2-7b"

# Load tokenizer from local path
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

# Load model from local path
model = AutoModelForCausalLM.from_pretrained(
    local_model_path,  # Changed to local path
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

# Configure LoRA (same as before)
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA
model = get_peft_model(model, lora_config)

# Check trainable parameters
model.print_trainable_parameters()

print("✅ Model loaded from local path!")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:15<00:00,  7.51s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.1243
✅ Model loaded from local path!


In [8]:
# Check your GPU memory first
import torch
if torch.cuda.is_available():
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU Memory: {gpu_memory:.1f} GB")
    
    if gpu_memory >= 12:
        print("✅ Use 8-bit quantization")
        config = "load_in_8bit=True"
    elif gpu_memory >= 6:
        print("✅ Use 4-bit quantization")
        config = "4-bit with BitsAndBytesConfig"
    else:
        print("⚠️ Use CPU offloading or smaller model")
        config = "CPU offloading or TinyLlama"
else:
    print("❌ No GPU - use CPU (very slow)")

GPU Memory: 12.9 GB
✅ Use 8-bit quantization
