In [4]:
!pip install urllib3




In [1]:
import os
from time import time

import torch
import transformers
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from torch import bfloat16, cuda
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the Llama 3 8B chat checkpoint. The default is the Kaggle
# dataset mount; set the LLAMA3_MODEL_PATH environment variable to point at a
# different copy when the notebook runs outside Kaggle.
model_id = os.environ.get(
    'LLAMA3_MODEL_PATH',
    '/kaggle/input/llama-3/transformer/8b-chat-hf/1',
)

# Use the current CUDA device when one is available, otherwise the CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Q: What is this step for?
# The core idea is "compress for storage, restore for compute". Weights are
# normally stored as 32-bit or 16-bit floats, which makes the model take a lot
# of GPU memory when loaded. Here the weights are converted to a low-precision
# 4-bit format at load time and mapped back according to a preset mapping when
# computing. This lowers GPU memory use while keeping inference accuracy as
# high as possible, so 7B/13B-class models that would otherwise need a
# high-memory GPU (e.g. A100, 3090) can run on consumer GPUs
# (e.g. RTX 3060/4090) or even lower-end ones.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,                  # load the weights in 4-bit form
    bnb_4bit_quant_type='nf4',          # 4-bit quantization data type
    bnb_4bit_use_double_quant=True,     # enable nested (double) quantization
    bnb_4bit_compute_dtype=bfloat16,    # dtype used for the actual computation
)
# Sanity-check the key facts about the CUDA setup before loading the model.
# `torch` is already imported in the imports cell, so it is not re-imported
# here; availability is queried once and reused.
cuda_ok = torch.cuda.is_available()
print("CUDA 是否可用：", cuda_ok)                      # "CUDA available:"
print("PyTorch 绑定的 CUDA 版本：", torch.version.cuda)  # "CUDA version PyTorch was built with:"
print("GPU 设备名：", torch.cuda.get_device_name(0) if cuda_ok else "无GPU")  # "GPU device name:" / "no GPU"
# Re-run the device-selection logic and show the final choice.
device = f'cuda:{torch.cuda.current_device()}' if cuda_ok else 'cpu'
print("最终选择的设备：", device)                       # "Selected device:"

CUDA 是否可用： True
PyTorch 绑定的 CUDA 版本： 11.8
GPU 设备名： NVIDIA GeForce RTX 3060 Laptop GPU
最终选择的设备： cuda:0


In [3]:
# Load the model configuration, the 4-bit quantized model and the tokenizer
# from `model_id`, timing the whole step.
time_start = time()

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    trust_remote_code=True,
    max_new_tokens=1024,
)

# NOTE(review): trust_remote_code=True allows Python code shipped with the
# checkpoint to be executed; confirm it is actually required for this model.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,  # 4-bit quantization settings defined earlier
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
time_end = time()
# Report the load time here: the following cells reuse time_start/time_end,
# so the measurement would otherwise be lost.
print(f"Prepare model, tokenizer:{round(time_end-time_start,3)}sec.")

OSError: Can't load the configuration of '/kaggle/input/llama-3/transformer/8b-chat-hf/1'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/kaggle/input/llama-3/transformer/8b-chat-hf/1' is the correct path to a directory containing a config.json file

In [None]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline
# and report how long the setup took.
time_start = time()
pipeline_options = dict(
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,  # request half precision (less memory / faster)
    max_length=1024,
    device_map="auto",          # automatic placement on GPU/CPU (GPU preferred)
)
# First positional argument is the pipeline task type: text generation.
query_pipeline = transformers.pipeline("text-generation", **pipeline_options)
time_end = time()
elapsed = round(time_end - time_start, 3)
print(f"Prepare pipeline:{elapsed}sec.")


In [None]:
def test_model(tokenizer, pipeline, message):
    """
    Perform a query
    print the result
    Args:
        tokenizer: the tokenizer
        pipeline: the pipeline
        message: the prompt
    Returns
        None
    """
    time_start =time()
    sequences = pipeline(
        message,# 输入的提示词（prompt）
        do_sample =True,   # 开启采样（非贪心解码，生成更自然的文本）
        top_k = 10, # 采样时只选前10个概率最高的token（控制随机性
        num_return_sequences =1,# 生成1个候选回答（可改多轮）
        eos_token_id = tokenizer.eos_token_id,# 生成终止符（遇到该token停止）
        max_length=200,
    )
    time_end = time()

    total_time = f"{round(time_end-time_start,3)}sec."
    question = sequences[0]['generated_text'][:len(message)]
    answer = sequences[0]['generated_text'][len(message):]# 提取模型生成的回答（提示词之后的部分
    return f"Question: {question}\nAnswer: {answer}\nTotal time: {total_time}"

In [None]:
from IPython.display import display,Markdown
def colorize_text(text):
    """
    Turn the section labels of a response into coloured, bold Markdown headings.

    Each of the labels "Reasoning:", "Question:", "Answer:" and "Total time:"
    is replaced by a bold ``<font color=...>`` version preceded by a blank
    line, so the result can be rendered with ``Markdown``.

    Args:
        text: the plain response text containing the labels
    Returns:
        The text with every label occurrence replaced by its Markdown form.
    """
    labels = ["Reasoning", "Question", "Answer", "Total time"]
    colors = ["blue", "red", "green", "magenta"]
    for word, color in zip(labels, colors):
        # Match the label together with its trailing colon: this avoids a
        # doubled colon in the output and leaves ordinary uses of the word
        # inside the body text untouched.
        text = text.replace(f"{word}:", f"\n\n**<font color='{color}'>{word}:</font>**")
    return text




In [None]:
# Pass the pipeline object built earlier (`query_pipeline`); a bare `pipeline`
# name is not defined in the visible cells.
response = test_model(tokenizer, query_pipeline, "Please explain what is EU AI Act.")
display(Markdown(colorize_text(response)))

In [None]:
# Wrap the transformers pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=query_pipeline)

# Time a single direct query through the LangChain wrapper.
time_start = time()
question = "Please explain what EU AI Act is."
response = llm(prompt=question)
time_end = time()
# Elapsed time in whole seconds, as a string. The subtraction must use
# `time_start`; `time` is the imported function, not a timestamp.
# NOTE(review): the variable name `totel_time` (sic) is kept unchanged in case
# later code refers to it.
totel_time = f"{round(time_end-time_start)}"