# 建立HF模型
https://www.langchain.com.cn/docs/tutorials/llm_chain/

## 1. 自定義LLM類別

In [None]:
from langchain.llms.base import LLM
from typing import Any, List, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

class LLaMA3_LLM(LLM):
    # 基于本地 llama3 自定义 LLM 类
    tokenizer: AutoTokenizer = None
    model: AutoModelForCausalLM = None
    quantization_config: BitsAndBytesConfig = None
        
    def __init__(self, mode_name_or_path :str):

        super().__init__()
        print("正在从本地加载模型...")
        self.tokenizer = AutoTokenizer.from_pretrained(mode_name_or_path, use_fast=False)
        self.model = AutoModelForCausalLM.from_pretrained(mode_name_or_path, torch_dtype=torch.bfloat16, device_map="auto")
        self.tokenizer.pad_token = self.tokenizer.eos_token
        print("完成本地模型的加载")

    def bulid_input(self, prompt, history=[]):
        user_format='<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>'
        assistant_format='<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>'
        history.append({'role':'user','content':prompt})
        prompt_str = ''
        # 拼接历史对话
        for item in history:
            if item['role']=='user':
                prompt_str+=user_format.format(content=item['content'])
            else:
                prompt_str+=assistant_format.format(content=item['content'])
        return prompt_str
    
    def _call(self, prompt : str, stop: Optional[List[str]] = None,
                run_manager: Optional[CallbackManagerForLLMRun] = None,
                **kwargs: Any):

        input_str = self.bulid_input(prompt=prompt)
        input_ids = self.tokenizer.encode(input_str, add_special_tokens=False, return_tensors='pt').to(self.model.device)
        outputs = self.model.generate(
            input_ids=input_ids, max_new_tokens=512, do_sample=True,
            top_p=0.9, temperature=0.5, repetition_penalty=1.1, eos_token_id=self.tokenizer.encode('<|eot_id|>')[0]
            )
        outputs = outputs.tolist()[0][len(input_ids[0]):]
        response = self.tokenizer.decode(outputs).strip().replace('<|eot_id|>', "").replace('<|start_header_id|>assistant<|end_header_id|>\n\n', '').strip()
        return response
        
    @property
    def _llm_type(self) -> str:
        return "LLaMA3_LLM"
chat_model = LLaMA3_LLM(mode_name_or_path="meta-llama/Llama-3.2-3B-Instruct")

In [None]:
chat_model = LLaMA3_LLM(mode_name_or_path="meta-llama/Llama-3.2-3B-Instruct")

## 2.用包裝好的ChatHuggingFace

In [None]:
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
)

llm = HuggingFacePipeline.from_model_id(
    model_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    pipeline_kwargs=dict(
        max_new_tokens=512,
        do_sample=False,
        repetition_penalty=1.03,
        return_full_text=False,
    ),
    model_kwargs={"quantization_config": quantization_config},
)

chat_model = ChatHuggingFace(llm=llm)

# 1.直接使用 Chat 模型

In [4]:
from langchain_core.messages import (
    HumanMessage,
    SystemMessage,
)

messages = [
    SystemMessage(content="You're a helpful assistant"),
    HumanMessage(
        content="Why sky is blue?"
    ),
]

ai_msg = chat_model.invoke(messages)
print(ai_msg)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128000 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


: 

In [None]:
from langchain_core.output_parsers import StrOutputParser
parser = StrOutputParser()
result = chat_model.invoke(messages) # 我們可以保存語言模型呼叫的結果，然後將其傳遞給解析器。
parser.invoke(result)

In [None]:
# 更常見的是，我們可以將模型與此輸出解析器「鍊式」連接。這意味著在此鏈中，每次都會呼叫此輸出解析器。此鏈採用語言模型的輸入類型（字串或訊息列表）並傳回輸出解析器的輸出類型（字串）。
# 我們可以使用|運算子輕鬆建立鏈。|運算符在LangChain 中用於將兩個元素組合在一起。
chain = chat_model | parser
chain.invoke(messages)

# 2.使用提示詞

In [None]:
from langchain_core.prompts import ChatPromptTemplate
system_template = "Translate the following into {language}:"
prompt_template = ChatPromptTemplate.from_messages(
    [("system", system_template), ("user", "{text}")]
)

In [None]:
# invoke 裡面帶入的key就是 ChatPromptTemplate 中的變數
result = prompt_template.invoke({"language": "italian", "text": "hi"})

result


In [None]:
result.to_messages()

# 3. 使用LCEL 連接

In [None]:
# 使用管道( |) 操作符將其與上面的模型和輸出解析器結合
chain = prompt_template | chat_model | parser

In [None]:
chain.invoke({"language": "chinese", "text": "hi"})