# L4-D - Building your own Quantizer: Load your Quantized Weights from Hugging Face Hub

In this lesson, you will learn how to load a model in a memory-efficient way.

Run the next cell to import all of the functions you have used before in the previous lesson(s) of `Building your own Quantizer` to follow along with the video.

In [1]:
import torch

from helper import W8A16LinearLayer, replace_linear_with_target, replace_linear_with_target_and_quantize

## Memory Efficient Model Loading

- Load [facebook/opt-125m](https://huggingface.co/facebook/opt-125m)

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "facebook/opt-125m"

# The tokenizer and the model are fetched independently from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the checkpoint directly in bfloat16 and ask transformers to keep
# peak CPU memory low while the weights are materialized.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Display the module tree of the freshly loaded model: at this point every
# attention / feed-forward projection is still a regular Linear layer.
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), ep

In [4]:
# Swap the model's Linear layers for W8A16LinearLayer in place (the helper's
# name indicates it also quantizes the existing weights). `lm_head` is
# excluded by name, so it stays a regular Linear layer.
replace_linear_with_target_and_quantize(
    module=model,
    target_class=W8A16LinearLayer,
    module_name_to_exclude=["lm_head"],
)

In [5]:
# Display the module tree again: the decoder projections are now
# W8A16LinearLayer, while the excluded lm_head is still a Linear layer.
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): W8A16LinearLayer()
            (v_proj): W8A16LinearLayer()
            (q_proj): W8A16LinearLayer()
            (out_proj): W8A16LinearLayer()
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): W8A16LinearLayer()
          (fc2): W8A16LinearLayer()
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
  )
  (lm_head): Linear(in_features=768, out_features=50272, bias=False)
)

Save the quantized model's state dict locally so that it can be uploaded to the Hugging Face Hub.

In [6]:
import os

# Make sure the output directory exists; `exist_ok=True` keeps the cell
# safe to re-run when the directory is already there.
os.makedirs(name="./models", exist_ok=True)

In [7]:
# Collect the quantized model's state dict and serialize it to disk so the
# file can later be uploaded and reloaded.
quantized_state_dict = model.state_dict()
torch.save(
    obj=quantized_state_dict,
    f="./models/quantized_state_dict.pth",
)

### Uploading quantized model to Hugging Face Hub

- The below code is for demonstration purposes only.
- You'll need your own Hugging Face username in order for it to run.
- You'll add your username in `YOUR_HF_USERNAME = ""` 

```Python
from huggingface_hub import HfApi, create_repo

YOUR_HF_USERNAME = ""
your_repo_id = f"{YOUR_HF_USERNAME}/opt-125m-quantized-dlai"

api = HfApi()

# Create the target repo. `exist_ok=True` makes this a no-op when the repo
# already exists (for example, if you created it directly on the Hugging Face
# Hub, or if you re-run this cell).
create_repo(your_repo_id, exist_ok=True)

# Upload the state dict from the location it was saved to above
# ("./models/quantized_state_dict.pth"); it is stored at the root of the repo.
api.upload_file(
    path_or_fileobj="./models/quantized_state_dict.pth",
    path_in_repo="quantized_state_dict.pth",
    repo_id=your_repo_id,
)
```

### Load the Model on the Meta Device

In [8]:
from transformers import AutoConfig, AutoTokenizer, OPTForCausalLM

model_id = "facebook/opt-125m"

# Only the configuration (architecture hyper-parameters) is fetched for the model here.
config = AutoConfig.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Build the model from its config while the default device is "meta":
# the resulting parameters have shapes but no weight values behind them.
with torch.device("meta"):
    model = OPTForCausalLM(config)

In [9]:
# Observation: the parameters live on the "meta" device, so printing them
# shows only their size and device; there are no weight values to display.
for param in model.parameters():
    print(param)

Parameter containing:
tensor(..., device='meta', size=(50272, 768), requires_grad=True)
Parameter containing:
tensor(..., device='meta', size=(2050, 768), requires_grad=True)
Parameter containing:
tensor(..., device='meta', size=(768,), requires_grad=True)
Parameter containing:
tensor(..., device='meta', size=(768,), requires_grad=True)
Parameter containing:
tensor(..., device='meta', size=(768, 768), requires_grad=True)
Parameter containing:
tensor(..., device='meta', size=(768,), requires_grad=True)
Parameter containing:
tensor(..., device='meta', size=(768, 768), requires_grad=True)
Parameter containing:
tensor(..., device='meta', size=(768,), requires_grad=True)
Parameter containing:
tensor(..., device='meta', size=(768, 768), requires_grad=True)
Parameter containing:
tensor(..., device='meta', size=(768,), requires_grad=True)
Parameter containing:
tensor(..., device='meta', size=(768, 768), requires_grad=True)
Parameter containing:
tensor(..., device='meta', size=(768,), requires_

In [10]:
# Display the module tree of the meta-device model: the projections are
# regular Linear layers, as in the originally loaded checkpoint.
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), ep

In [11]:
# Swap the Linear layers of the meta-device model for W8A16LinearLayer so the
# module structure matches the quantized state dict that is loaded further
# down. `lm_head` is excluded by name and stays a regular Linear layer.
replace_linear_with_target(
    module=model,
    target_class=W8A16LinearLayer,
    module_name_to_exclude=["lm_head"],
)

In [12]:
# Confirm the replacement: the decoder projections are now W8A16LinearLayer,
# and lm_head is still a Linear layer.
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): W8A16LinearLayer()
            (v_proj): W8A16LinearLayer()
            (q_proj): W8A16LinearLayer()
            (out_proj): W8A16LinearLayer()
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): W8A16LinearLayer()
          (fc2): W8A16LinearLayer()
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
  )
  (lm_head): Linear(in_features=768, out_features=50272, bias=False)
)

Load the state dict of the quantized model from the Hugging Face Hub.

In [13]:
from huggingface_hub import hf_hub_download

# Download the serialized quantized state dict from the Hub repo; the
# returned value is the path that is passed to `torch.load` below.
state_dict_cache_path = hf_hub_download(
    repo_id="ybelkada/opt-125m-quantized-dlai",
    filename="quantized_state_dict.pth",
)

In [14]:
# The checkpoint was downloaded from a third-party Hub repo and `torch.load`
# relies on pickle, which can execute arbitrary code while unpickling.
# `weights_only=True` restricts loading to tensors and plain containers,
# which is all a state dict needs.
state_dict = torch.load(state_dict_cache_path, weights_only=True)

In [15]:
# Load the downloaded tensors into the meta-device model.
# - strict=True: every key must match between the model and the state dict.
# - assign=True: the loaded tensors are assigned as the module's
#   parameters/buffers instead of being copied into the existing
#   (meta-device) placeholders.
model.load_state_dict(
    state_dict=state_dict,
    strict=True,
    assign=True,
)

<All keys matched successfully>

In [16]:
# Print the parameters again: after loading the state dict they now hold
# actual values (bfloat16 tensors) rather than meta-device placeholders.
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[ 0.1152, -0.1436,  0.0554,  ...,  0.2148,  0.0835,  0.0669],
        [ 0.1147, -0.1436,  0.0547,  ...,  0.2148,  0.0835,  0.0669],
        [ 0.0010, -0.0923,  0.1025,  ..., -0.0403,  0.0060, -0.1079],
        ...,
        [ 0.1152, -0.1436,  0.0547,  ...,  0.2148,  0.0830,  0.0669],
        [ 0.1152, -0.1455,  0.0547,  ...,  0.2158,  0.0835,  0.0674],
        [ 0.1157, -0.1436,  0.0576,  ...,  0.2139,  0.0830,  0.0649]],
       dtype=torch.bfloat16, requires_grad=True)
Parameter containing:
tensor([[ 1.8311e-03,  9.0790e-04,  4.4250e-03,  ...,  1.6724e-02,
          1.7471e-03, -4.9133e-03],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-3.5156e-02, -6.4844e-01,  3.7842e-02,  ..., -8.9844e-02,
          6.1523e-02, -1.0681e-02],
        ...,
        [-1.3733e-04,  2.5391e-02, -6.8665e-03,  ..., -4.1504e-02,
         -5.9814e-03,  2.0142e-02],
        [-2.1973e-03,  2.6733e-02, -3.1281e-03, 

- Test your model.
- **Note:** Your generated text might be different than what you see in the video.

In [17]:
from transformers import pipeline

# Wrap the reloaded quantized model and its tokenizer in a text-generation
# pipeline, then generate up to 40 new tokens for a short prompt.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
)
pipe("Hello today I am", max_new_tokens=40)

[{'generated_text': 'Hello today I am a student at the University of California at Berkeley. I am a student at the University of California at Berkeley. I am a student at the University of California at Berkeley. I am a student at the'}]

In [18]:
# Reuse the `pipe` created in the previous cell: the model and tokenizer have
# not changed, so rebuilding an identical pipeline is unnecessary.
pipe("Hello today I am giving a course about", max_new_tokens=40)

[{'generated_text': 'Hello today I am giving a course about the history of the world and the history of the world.\nI am going to teach you about the history of the world and the history of the world.\nI am going to teach you about'}]