# llama1 模型加载

参考文档:https://huggingface.co/docs/transformers/main/en/model_doc/llama

最大上下文长度(`max_position_embeddings`)= 2048 tokens(约 2k)

## 1. llama模型初始化

In [1]:
from transformers import LlamaModel, LlamaConfig

# Start from the library's default LlamaConfig (llama-7b style).
configuration = LlamaConfig()

# Build a model object from that configuration alone; no checkpoint is read here.
modelConfig = LlamaModel(config=configuration)

# Read the configuration back from the instantiated model.
configuration = modelConfig.config

In [4]:
# Show the configuration that was read back from the model in the previous cell.
print(configuration)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 32000
}



## 2. 加载 Hugging Face(hf)格式的权重参数

In [None]:
import torch

from transformers import LlamaForCausalLM, LlamaTokenizer

# Local directory that holds the checkpoint passed to both from_pretrained calls.
MODEL_PATH = "/opt/Data/ModelWeight/meta/llama1.hf/llama1-7b-hf"

# Slow (non-fast) tokenizer, legacy mode enabled.
tokenizer = LlamaTokenizer.from_pretrained(
    MODEL_PATH,
    legacy=True,
    use_fast=False,
)

# Load the causal-LM weights as float16 from local files only, then move them to the GPU.
model = LlamaForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    local_files_only=True,
).to('cuda')

In [None]:
# `tokenizer` and `model` are already created by the previous cell (same MODEL_PATH,
# same arguments). Reloading them here would read the checkpoint a second time and
# keep two copies of the model alive until `model` is rebound, so this cell only
# applies the one extra setting.

# Use the EOS token id for padding.
# NOTE(review): presumably this tokenizer has no pad token of its own - confirm.
tokenizer.pad_token_id = tokenizer.eos_token_id

<b>模型调用</b>

In [2]:
prompt = "你好?"

# Tokenize and move the whole encoding (input_ids and attention_mask) to the device
# the model lives on, instead of hard-coding `.cuda()` for input_ids only.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate
# The attention mask is passed explicitly: pad_token_id equals eos_token_id here, so
# generate() cannot reliably infer the mask from the input ids.
# NOTE(review): top_p=0.1 is a very narrow nucleus; it may contribute to the
# repetitive output shown below - consider revisiting the sampling settings.
generate_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_length=100,
    pad_token_id=tokenizer.eos_token_id,
    temperature=0.7,
    top_p=0.1,
    top_k=40,
    do_sample=True,
    num_beams=1,
)

# Decode every returned sequence, dropping special tokens and leaving spacing untouched.
result = tokenizer.batch_decode(
    generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

print(result[0])



你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好? 你好


### 2.1 llama2_wrapper

<b>模型加载</b>

In [1]:
from llama2_wrapper import LLAMA2_WRAPPER, get_prompt, get_prompt_for_dialog

# Checkpoint directory handed to the wrapper.
MODELPATH = "/opt/Data/ModelWeight/meta/llama1.hf/llama1-7b-hf"

# Build the wrapper on the "transformers" backend.
# (`load_in_8bit=True` is an option the original author left disabled.)
llm = LLAMA2_WRAPPER(model_path=MODELPATH, backend_type="transformers")

Running on GPU with backend torch transformers.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at /opt/Data/ModelWeight/meta/llama1.hf/llama1-7b-hf and are newly initialized: ['model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.laye

<b>模型调用</b>

In [None]:
prompt = "I feel good."

# Wrap the raw text with get_prompt, then call the wrapper with temperature 0.9.
formatted_prompt = get_prompt(prompt)
answer = llm(formatted_prompt, temperature=0.9)
print(answer)

<b>会话聊天</b>

In [None]:
# One system turn followed by one user turn.
system_turn = {
    "role": "system",
    "content": "You are a helpful, respectful and honest assistant. ",
}
user_turn = {"role": "user", "content": "Hi do you know Pytorch?"}

dialog = [system_turn, user_turn]
result = llm.chat_completion(dialog)

In [None]:
# Print the message content of the first entry under "choices".
first_choice = result["choices"][0]
print(first_choice["message"]["content"])

<b>流式输出</b>

In [None]:
prompt = get_prompt("Hi do you know Pytorch?")

# Iterate over what llm.generate yields and print each item as it arrives.
response_stream = llm.generate(prompt)
for response in response_stream:
    print(response)