## Initial Env

In [3]:
# Core dependencies for this notebook, each pinned to an exact version.
%pip install langchain==0.1.1 pypdf==4.0.0 InstructorEmbedding==1.0.1

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain==0.1.1
  Using cached langchain-0.1.1-py3-none-any.whl.metadata (13 kB)
Collecting pypdf==4.0.0
  Using cached pypdf-4.0.0-py3-none-any.whl.metadata (7.4 kB)
Collecting InstructorEmbedding==1.0.1
  Using cached InstructorEmbedding-1.0.1-py2.py3-none-any.whl.metadata (20 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain==0.1.1)
  Using cached SQLAlchemy-2.0.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain==0.1.1)
  Using cached aiohttp-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain==0.1.1)
  Using cached async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain==0.1.1)
  Using cached dataclasses_json-0.6.3-py3-none-any.whl.metadata (25 kB)
Collecting jsonpatch<2.0,>=1.3

In [6]:
# Notebook-wide settings, gathered in one place.

# Folder that holds the source PDFs.
DocumentHome = "/home/ubuntu/sources/pdfs/"

# Milvus collection name and server address.
COLLECTION_NAME = "collection_1"
MILVUS_CONNECTION = dict(host="127.0.0.1", port="19530")

# Dimension of the vector field in the Milvus schema.
DIM = 512

# Number of hits to request from the retriever (only referenced by a commented-out line further down).
VECTOR_SEARCH_TOP_K = 6

In [None]:

# Download the two sample PDFs into DocumentHome (the ids are presumably Google Drive file ids).
# NOTE(review): no install step for `gdown` is visible in this notebook, and the target
# folder is assumed to exist already — confirm both before running on a fresh machine.
!gdown 1v-Rn1FVU1pLTAQEgm0N9oB6cExMoebZr -O {DocumentHome}/tesla-earnings-report.pdf
!gdown 1hm3dNy2DMX4Q_pF-bTfxILOeEYllX7Tb -O {DocumentHome}/docu88904_PowerMax-系列产品指南.pdf

## Load and split documents into chunks

In [16]:

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.document_loaders import PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Load one specific PDF from the shared document folder.
# The path is derived from DocumentHome so that moving the folder only requires
# changing the config cell, not a second hard-coded copy of the path.
import os

loader = PyPDFLoader(os.path.join(DocumentHome, 'docu88904_PowerMax-系列产品指南.pdf'))
docs = loader.load()

In [17]:
# Load every PDF found under DocumentHome.
# The trailing expression displays how many documents the loader returned.
loader = PyPDFDirectoryLoader(DocumentHome)
docs = loader.load()
len(docs)

206

In [18]:
# Split the loaded documents into chunks (chunk_size=1024, chunk_overlap=64).
# `texts` is the input for every vector-store ingestion cell; the trailing
# expression displays the number of chunks produced.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(docs)
len(texts)

407

## Embeddings

In [8]:
# Embedding dependencies. InstructorEmbedding uses the same pin as the first install cell.
%pip install  InstructorEmbedding==1.0.1 sentence-transformers==2.2.2

Defaulting to user installation because normal site-packages is not writeable
Collecting sentence-transformers==2.2.2
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting torchvision (from sentence-transformers==2.2.2)
  Using cached torchvision-0.17.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting scikit-learn (from sentence-transformers==2.2.2)
  Using cached scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting nltk (from sentence-transformers==2.2.2)
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting joblib (from nltk->sentence-transformers==2.2.2)
  Using cached joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn->sentence-transformers==2.2.2)
  Using cached threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Using cached scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
Using cached torchvisi

In [9]:

import torch
from langchain.embeddings import HuggingFaceInstructEmbeddings 
from sentence_transformers import SentenceTransformer

# Run on the first CUDA device when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"

In [None]:
# Presumably needed for the notebook_login() widget in the next cell — confirm.
# TODO: pin a version, as the other install cells do.
%pip install ipywidgets

In [None]:
# Interactive Hugging Face Hub login for this notebook session.
from huggingface_hub import notebook_login
notebook_login()

In [10]:
# Embedding model shared by all vector stores in this notebook, placed on DEVICE.
# NOTE(review): the model name looks like a sentence-transformers checkpoint, yet it is
# loaded through the Instructor wrapper — confirm this pairing is intended.
embeddings = HuggingFaceInstructEmbeddings(
    model_name="distiluse-base-multilingual-cased-v2", model_kwargs={"device": DEVICE}
)

load INSTRUCTOR_Transformer
max_seq_length  512


## Load embedded data into Vector DB.

### 1. Milvus

In [None]:
# Milvus Python client, pinned to an exact version.
%pip install --upgrade pymilvus==2.3.6

In [None]:
import pymilvus
from langchain_community.vectorstores import Milvus
from pymilvus import utility, connections, FieldSchema, CollectionSchema, DataType, Collection
# Show which pymilvus version the kernel actually imported.
print(pymilvus.__version__)

In [None]:
# Connect to the Milvus server using the host/port from MILVUS_CONNECTION.
connections.connect(alias = "default", **MILVUS_CONNECTION)

# Snapshot the names of the collections that currently exist.
collectionList = utility.list_collections()

# Start clean: drop a previous collection that has the same name.
if COLLECTION_NAME in collectionList:
    utility.drop_collection(COLLECTION_NAME)

# Displays the snapshot taken above, so a collection dropped here still shows up in it.
collectionList

In [None]:
# Schema of the collection: two VARCHAR fields ("source", "text"), an auto-generated
# INT64 primary key ("pk"), and a float vector field whose dimension is DIM.
fields = [
    FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=65535),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=DIM),
]
schema = CollectionSchema(fields=fields, description="Instructor Embeddings")
# Create the collection under COLLECTION_NAME, then display whether it now exists.
collection = Collection(name=COLLECTION_NAME, schema=schema) 
utility.has_collection(COLLECTION_NAME)

In [None]:
# Embed the chunks in `texts` with `embeddings` and store them in COLLECTION_NAME
# on the Milvus server described by MILVUS_CONNECTION.
vector_db = Milvus.from_documents(
    texts,
    embedding = embeddings,
    collection_name = COLLECTION_NAME,
    connection_args=MILVUS_CONNECTION
)

In [None]:
# Smoke-test retrieval against the Milvus-backed store.
# The hits get their own name so that `docs` (the pages loaded from the PDFs) is not
# overwritten; otherwise re-running the split cell would chunk search hits instead.
query = "what is SRDF/Star?"
search_results = vector_db.similarity_search(query)
search_results

### 2. Chroma

In [15]:
# Chroma vector database, pinned to an exact version.
%pip install chromadb==0.4.22

Defaulting to user installation because normal site-packages is not writeable
Collecting chromadb==0.4.22
  Using cached chromadb-0.4.22-py3-none-any.whl.metadata (7.3 kB)
Collecting build>=1.0.3 (from chromadb==0.4.22)
  Using cached build-1.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting chroma-hnswlib==0.7.3 (from chromadb==0.4.22)
  Using cached chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb==0.4.22)
  Using cached fastapi-0.109.0-py3-none-any.whl.metadata (24 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb==0.4.22)
  Using cached uvicorn-0.27.0.post1-py3-none-any.whl.metadata (6.4 kB)
Collecting posthog>=2.4.0 (from chromadb==0.4.22)
  Using cached posthog-3.3.4-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting pulsar-client>=3.1.0 (from chromadb==0.4.22)
  Using cached pulsar_client-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.

In [16]:
from langchain_community.vectorstores import Chroma

In [7]:

# Embed the chunks and load them into a Chroma store persisted under /home/ubuntu/data/chroma/.
# Requires `texts` from the splitting cell: the NameError recorded below is what happens
# when this cell runs in a kernel where that cell has not been executed.
vector_db = Chroma.from_documents(documents=texts, embedding=embeddings, persist_directory="/home/ubuntu/data/chroma/")

NameError: name 'texts' is not defined

In [35]:
# Smoke-test retrieval against the current vector store with a Chinese query.
# The hits get their own name so that `docs` (the pages loaded from the PDFs) is not
# overwritten; otherwise re-running the split cell would chunk search hits instead.
query = "什么是SRDF/Star?"
search_results = vector_db.similarity_search(query)
search_results

[Document(page_content='SRDF/Star 解决方案 ..................................................................... 80第4章\n第5章\n第6章\n第7章\n第8章目录\n4 产品指南   PowerMaxOS', metadata={'page': 3, 'source': '/home/ubuntu/sources/pdfs/docu88904_PowerMax-系列产品指南.pdf'}),
 Document(page_content='图 17 带有  R22 设备的并发  SRDF/Star\nR11 R2 \nR22SRDF/S\nSRDF/A\nSRDF/A\nrecovery links Site B \nActive\nInactiveSite A \nSite C \n级联  SRDF/Star\n在级联  SRDF/Star 解决方案中，同步辅助站点较之异步第三站点始终拥有更新的数\n据。当同步辅助站点出现故障时，级联  SRDF/Star 解决方案可以在主站点和异步第三\n站点之间以增量方式建立  SRDF/A 会话。\n级联  SRDF/Star 可以确定当前活动  R1 周期（捕获）内容何时通过远距离  SRDF/A 链\n路到达活动  R2 周期（应用）。这可将为实现完全同步而必须在站点  B 和站点  C 之间\n移动的数据量降到最低。\n此示例显示了一个基本的级联  SRDF/Star 解决方案。远程复制解决方案\n82 产品指南   PowerMaxOS', metadata={'page': 81, 'source': '/home/ubuntu/sources/pdfs/docu88904_PowerMax-系列产品指南.pdf'}),
 Document(page_content='l与 TimeFinder 产品系列紧密集成\nl地理位置分散的辅助站点和第三站点\n当主站点出现故障时，级联  SRDF 只需极少的用户干预就可以从辅助站点到第三站点\n继续镜像。这可以在第三站点实现更快的恢复。\n辅助站点和第三站点都可以是故障切换站点。开放式系统解决方案通常故障切换到第\n三站点。\n级联  SRDF 可以与  SRDF/

### 3. FAISS

In [None]:
from langchain.vectorstores import FAISS

In [None]:
# Create a local FAISS vector store from the chunks, embedded with `embeddings`.
vector_db = FAISS.from_documents(texts, embeddings)

## Retrieval question

In [12]:
from langchain.chains import RetrievalQA
from langchain import PromptTemplate


from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [13]:
# Prompt templates for the retrieval QA chain. Both expose the same two
# placeholders, {context} and {question}.

# English variant.
prompt_template_en = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Use three sentences maximum and keep the answer as concise as possible. 
{context}
Question: {question}
Helpful Answer:"""


# Chinese variant.
prompt_template_cn = """基于以下已知信息，简洁和专业的来回答用户的问题。
如果无法从中得到答案，请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息"，不允许在答案中添加编造成分，答案请使用中文。
已知内容:
{context}
问题:
{question}"""

# Select which template the chain uses; switch to prompt_template_en for English.
prompt_template = prompt_template_cn

# Wrap the selected template so the chain can fill in "context" and "question".
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

## Load data from vector database


### 1. Milvus DB

In [None]:
# Re-open the collection populated in the ingestion section so it can back the retriever.
# drop_old must be False here: with True the existing collection is dropped on
# construction and the retriever would have nothing to search.
vector_store = Milvus(
    connection_args = MILVUS_CONNECTION,
    embedding_function = embeddings,
    collection_name = COLLECTION_NAME,
    drop_old = False,
)


### 2. Chroma DB

In [17]:
# Re-open the Chroma store from the same persist directory the ingestion cell wrote to.
vector_store = Chroma(persist_directory="/home/ubuntu/data/chroma/", embedding_function=embeddings)

## Load LLM from API Server

### 1. llama2 with ollama

In [39]:
# LLM served through Ollama (model "llama2"); the callback handler streams output to stdout.
llm = Ollama(model="llama2", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))


## Load LLM Locally


### 1. Llama2

In [16]:
%pip install optimum auto-gptq

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable
Collecting auto-gptq
  Downloading auto_gptq-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting gekko (from auto-gptq)
  Downloading gekko-1.0.6-py3-none-any.whl (12.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Downloading auto_gptq-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m51.1 MB/s[0m eta [36m

In [1]:
import torch
from langchain import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
 
# Hugging Face Hub id of the checkpoint to load.
MODEL_NAME = "TheBloke/Llama-2-13b-Chat-GPTQ"
 
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
 
# Load the weights as float16 and let device_map="auto" decide their placement.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16, trust_remote_code=True, device_map="auto"
)
 
# Start from the checkpoint's own generation settings, then override selected fields.
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 1024
# Sampling stays enabled, but with a near-zero temperature.
generation_config.temperature = 0.0001
generation_config.top_p = 0.95
generation_config.do_sample = True
generation_config.repetition_penalty = 1.15
 
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
)
 
# Expose the transformers pipeline through LangChain's LLM interface.
# NOTE(review): model_kwargs asks for temperature 0 while generation_config uses 0.0001
# with sampling on — confirm which of the two is meant to apply.
llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0})

  from .autonotebook import tqdm as notebook_tqdm
CUDA extension not installed.
CUDA extension not installed.
model.safetensors: 100%|██████████| 7.26G/7.26G [02:01<00:00, 59.6MB/s]
generation_config.json: 100%|██████████| 132/132 [00:00<00:00, 763kB/s]


In [3]:
# Quick smoke test of the pipeline-backed LLM.
# NOTE(review): the question is sent as a bare string; if this chat model expects a
# chat-formatted prompt, that may explain the off-topic reply recorded below — confirm.
llm.predict("Who are you?")

"\n\n_A._ I am the Spirit of the Land.\n\n_B._ What do you want with me?\n\n_A._ I have been watching you, and I see that you are a man of great potential. You have the power to make your dreams come true, but you lack the courage to take the first step. That is why I have come to offer my assistance. Together, we can achieve great things.\n\n_B._ How can you help me?\n\n_A._ I can guide you towards your goals, and provide you with the strength and courage you need to overcome any obstacles that stand in your way. But you must be willing to trust me and follow my guidance. Are you ready to take the first step on this journey?\n\n_B._ Yes, I'm ready. I trust you.\n\n_A._ Then let us begin. The journey ahead will not be easy, but with determination and hard work, you will reach your destination. Remember, I am always here to support and guide you. Trust in yourself and in me, and together we will achieve greatness."

### 2. ChatGLM

In [None]:
%pip install sentencepiece

- 本地部署的 ChatGLM3-6B，自定义一个 LLM 类

In [42]:
from langchain.llms.base import LLM
from typing import Any, List, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from transformers import AutoTokenizer, AutoModelForCausalLM

class ChatGLM_LLM(LLM):
    """LangChain LLM backed by a ChatGLM checkpoint loaded from a local directory.

    The tokenizer and model are loaded once in ``__init__``. Every call runs a
    single-turn ``model.chat`` with an empty history.
    """

    # Both fields are populated in __init__.
    tokenizer : AutoTokenizer = None
    model: AutoModelForCausalLM = None

    def __init__(self, model_path :str):
        """Load the tokenizer and the model from ``model_path``.

        Args:
            model_path: Directory that holds the ChatGLM checkpoint.
        """
        # Imported here so the class also works when no earlier cell has imported torch.
        import torch

        super().__init__()
        print("正在从本地加载模型...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        # Cast to bfloat16 and move to the GPU after loading.
        self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(torch.bfloat16).cuda()
        self.model = self.model.eval()
        print("完成本地模型的加载")

    def _call(self, prompt : str, stop: Optional[List[str]] = None,
                run_manager: Optional[CallbackManagerForLLMRun] = None,
                **kwargs: Any) -> str:
        """Return the model's reply to ``prompt``.

        Args:
            prompt: Text sent to the model as a single user turn.
            stop: Optional stop sequences; the reply is cut at the first match.
            run_manager: Accepted for interface compatibility; not used.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The generated reply as a string.
        """
        response, _ = self.model.chat(self.tokenizer, prompt , history=[])
        if stop:
            # `stop` is not forwarded to model.chat, so it is applied to the finished text.
            from langchain_community.llms.utils import enforce_stop_tokens
            response = enforce_stop_tokens(response, stop)
        return response

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses for this LLM type."""
        return "ChatGLM3-6B"

In [44]:
# Instantiate the wrapper from the local ChatGLM3 checkpoint and send a test prompt.
# The import below is only relevant if the class is moved into a separate LLM.py module.
# from LLM import ChatGLM_LLM
llm = ChatGLM_LLM(model_path = "/home/ubuntu/models/chatglm3-6b-32k")
llm.predict("你是谁")

正在从本地加载模型...


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 7/7 [00:08<00:00,  1.24s/it]


完成本地模型的加载


'我是一个名为 ChatGLM3-6B 的人工智能助手，是基于清华大学 KEG 实验室和智谱 AI 公司于 2023 年共同训练的语言模型开发的。我的目标是针对用户的问题和要求提供适当的答复和支持。由于我是一个计算机程序，所以我没有实际的存在，只能通过互联网来与您交流。'

### 3. 百川LLM

In [1]:
%pip install accelerate colorama bitsandbytes sentencepiece streamlit transformers_stream_generator cpm_kernels xformers scipy 

Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate
  Using cached accelerate-0.26.1-py3-none-any.whl.metadata (18 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Collecting sentencepiece
  Using cached sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting streamlit
  Using cached streamlit-1.30.0-py2.py3-none-any.whl.metadata (8.2 kB)
Collecting transformers_stream_generator
  Using cached transformers_stream_generator-0.0.4-py3-none-any.whl
Collecting cpm_kernels
  Using cached cpm_kernels-1.0.11-py3-none-any.whl (416 kB)
Collecting xformers
  Using cached xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting scipy
  Using cached scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting numpy>=1.17 (from accelerate)
  Using cached numpy-1.26.3-cp310-cp310-manylinux_2_17_x86_

In [18]:
from langchain.llms.base import LLM
from typing import Any, List, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import torch

class Baichuan2_LLM(LLM):
    """LangChain LLM backed by a Baichuan2 chat checkpoint loaded from a local directory.

    The tokenizer, model and generation config are loaded once in ``__init__``.
    Every call sends the prompt to ``model.chat`` as a single user message.
    """

    # Both fields are populated in __init__.
    tokenizer : AutoTokenizer = None
    model: AutoModelForCausalLM = None

    def __init__(self, model_path :str):
        """Load the tokenizer, model and generation config from ``model_path``.

        Args:
            model_path: Directory that holds the Baichuan2 chat checkpoint.
        """
        super().__init__()
        print("正在从本地加载模型...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        # bfloat16 weights; device_map="auto" decides their placement.
        self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,torch_dtype=torch.bfloat16,  device_map="auto")
        self.model.generation_config = GenerationConfig.from_pretrained(model_path)
        self.model = self.model.eval()
        print("完成本地模型的加载")

    def _call(self, prompt : str, stop: Optional[List[str]] = None,
                run_manager: Optional[CallbackManagerForLLMRun] = None,
                **kwargs: Any) -> str:
        """Return the model's reply to ``prompt``.

        Args:
            prompt: Text sent to the model as a single user message.
            stop: Optional stop sequences; the reply is cut at the first match.
            run_manager: Accepted for interface compatibility; not used.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The generated reply as a string.
        """
        messages = [
            {"role": "user", "content": prompt}
        ]
        response = self.model.chat(self.tokenizer, messages)
        if stop:
            # `stop` is not forwarded to model.chat, so it is applied to the finished text.
            from langchain_community.llms.utils import enforce_stop_tokens
            response = enforce_stop_tokens(response, stop)
        return response

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses for this LLM type."""
        return "baichuan2_LLM"

In [19]:
# Instantiate the wrapper from the local Baichuan2 checkpoint and send a test prompt.
llm = Baichuan2_LLM(model_path = "/home/ubuntu/models/Baichuan2-13B-Chat")
llm.predict("你是谁")

正在从本地加载模型...


You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.73s/it]


完成本地模型的加载


'我是百川大模型，是由百川智能的工程师们创造的大语言模型，我可以和人类进行自然交流、解答问题、协助创作，帮助大众轻松、普惠的获得世界知识和专业服务。如果你有任何问题，可以随时向我提问'

### 4. 零一万物(01.ai)

In [60]:
%pip install gradio>=4.13.0 protobuf>=4.25.1 torch==2.0.1 accelerate sentencepiece  datasets

Note: you may need to restart the kernel to use updated packages.


In [61]:
%pip install git+https://github.com/huggingface/transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-p8wxfr3m
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-p8wxfr3m
  Resolved https://github.com/huggingface/transformers to commit 7b2bd1fbbd50e57cf28013e2d0737912ecc0f2eb
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tokenizers<0.19,>=0.14 (from transformers==4.38.0.dev0)
  Using cached tokenizers-0.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached tokenizers-0.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) 

In [1]:
from langchain.llms.base import LLM
from typing import Any, List, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig , LlamaTokenizerFast
import torch

class Yi_LLM(LLM):
    """LangChain LLM backed by a Yi chat checkpoint loaded from a local directory.

    The tokenizer, model and generation config are loaded once in ``__init__``.
    Every call formats the prompt with the tokenizer's chat template, runs
    ``model.generate`` and decodes only the newly generated tokens.
    """

    # Both fields are populated in __init__.
    tokenizer: AutoTokenizer = None
    model: AutoModelForCausalLM = None

    def __init__(self, model_path :str):
        """Load the tokenizer, model and generation config from ``model_path``.

        Args:
            model_path: Directory that holds the Yi chat checkpoint.
        """
        super().__init__()
        print("正在从本地加载模型...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
        # bfloat16 weights; device_map="auto" decides their placement.
        self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,torch_dtype=torch.bfloat16,device_map="auto")
        self.model.generation_config = GenerationConfig.from_pretrained(model_path)
        # Reuse the end-of-sequence token id as the padding token id for generation.
        self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id
        self.model = self.model.eval()
        print("完成本地模型的加载")

    def _call(self, prompt : str, stop: Optional[List[str]] = None,
                run_manager: Optional[CallbackManagerForLLMRun] = None,
                **kwargs: Any) -> str:
        """Return the model's reply to ``prompt``.

        Args:
            prompt: Text sent to the model as a single user message.
            stop: Optional stop sequences; the reply is cut at the first match.
            run_manager: Accepted for interface compatibility; not used.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The generated reply as a string, without the prompt tokens.
        """
        messages = [
            {"role": "user", "content": prompt}
        ]
        input_ids = self.tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')

        # Send the inputs to the device the model reports instead of a hard-coded 'cuda':
        # the model is placed with device_map="auto", so CUDA is not guaranteed.
        output_ids = self.model.generate(input_ids.to(self.model.device))
        # Skip the prompt tokens so only the newly generated part is decoded.
        response = self.tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
        if stop:
            # `stop` is not forwarded to generate, so it is applied to the finished text.
            from langchain_community.llms.utils import enforce_stop_tokens
            response = enforce_stop_tokens(response, stop)
        return response

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses for this LLM type."""
        return "Yi_LLM"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instantiate the wrapper from the local Yi checkpoint and send a test prompt.
llm = Yi_LLM(model_path = "/home/ubuntu/models/Yi-6B-Chat")
llm.predict("你是谁")

正在从本地加载模型...


Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.45it/s]
  warn_deprecated(


完成本地模型的加载


'你好！我是零一万物开发的智能助手，我叫 Yi，我是由工程师们通过大量的文本数据进行训练的。我拥有广泛的知识和能力，可以回答你的问题、提供信息，以及执行一些简单的任务。请问有什么我可以帮助你的？'

### 5. 通义千问

In [26]:
# Dependencies for the Qwen section.
# NOTE(review): this pins transformers==4.32.0, whereas the previous section installs
# transformers from git; presumably the kernel must be restarted after switching — confirm.
%pip install transformers==4.32.0 accelerate tiktoken einops scipy transformers_stream_generator==0.0.4 peft

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting transformers==4.32.0
  Using cached transformers-4.32.0-py3-none-any.whl.metadata (118 kB)
Collecting transformers_stream_generator==0.0.4
  Using cached transformers-stream-generator-0.0.4.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting peft
  Using cached peft-0.8.1-py3-none-any.whl.metadata (25 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.32.0)
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading peft-0.8.1-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collecte

In [52]:
from langchain.llms.base import LLM
from typing import Any, List, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig


class QwenLM(LLM):
    """LangChain LLM backed by a Qwen chat checkpoint loaded from a local directory.

    The tokenizer, model and generation config are loaded once in ``__init__``.
    Every call runs a single-turn ``model.chat`` with an empty history.
    """

    # Both fields are populated in __init__.
    tokenizer : AutoTokenizer = None
    model: AutoModelForCausalLM = None

    def __init__(self, model_path :str):
        """Load the tokenizer, model and generation config from ``model_path``.

        Args:
            model_path: Directory that holds the Qwen checkpoint.
        """
        super().__init__()
        print("正在从本地加载模型...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True).eval()
        # Generation hyperparameters (length, top_p, ...) come from the checkpoint's own
        # config; adjust fields on self.model.generation_config to change them.
        self.model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True)
        print("完成本地模型的加载")

    def _call(self, prompt : str, stop: Optional[List[str]] = None,
                run_manager: Optional[CallbackManagerForLLMRun] = None,
                **kwargs: Any) -> str:
        """Return the model's reply to ``prompt``.

        Args:
            prompt: Text sent to the model as a single user turn.
            stop: Optional stop sequences; the reply is cut at the first match.
            run_manager: Accepted for interface compatibility; not used.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The generated reply as a string.
        """
        response, _ = self.model.chat(self.tokenizer, prompt , history=[])
        if stop:
            # `stop` is not forwarded to model.chat, so it is applied to the finished text.
            from langchain_community.llms.utils import enforce_stop_tokens
            response = enforce_stop_tokens(response, stop)
        return response

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses for this LLM type."""
        return "QwenLM"

In [53]:
# Instantiate the wrapper from the local Qwen checkpoint and send a test prompt.
llm = QwenLM(model_path = "/home/ubuntu/models/Qwen-14B-Chat")
llm.predict("你是谁")

正在从本地加载模型...


The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Try importing flash-attention for faster inference...
Loading checkpoint shards: 100%|██████████| 15/15 [00:07<00:00,  1.99it/s]


完成本地模型的加载


'我是来自阿里云的大规模语言模型，我叫通义千问。'

### 6. HuggingFacePipeline

In [None]:
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline

In [None]:
# Hugging Face Hub id of the checkpoint to load.
model_name = "THUDM/chatglm-6b"

# NOTE(review): padding/truncation/max_length are passed at load time here; they look
# like encode-time options — confirm they have the intended effect.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)

# NOTE(review): this builds a "question-answering" pipeline; confirm that this task suits
# the checkpoint and that HuggingFacePipeline accepts it. No recorded run of this cell
# is visible in the notebook.
question_answerer = pipeline(
    "question-answering", 
    model=model_name, 
    tokenizer=tokenizer,
    return_tensors='pt'
)

# Expose the transformers pipeline through LangChain's LLM interface.
llm = HuggingFacePipeline(
    pipeline=question_answerer,
    model_kwargs={"temperature": 0.7, "max_length": 512},
)

## 构建检索问答链, 然后与文档对话

In [20]:
# Sanity check of whichever `llm` was created last; the reply recorded below identifies
# itself as the Baichuan model.
llm.predict("你是谁")

'我是百川大模型，是由百川智能的工程师们创造的大语言模型，我可以和人类进行自然交流、解答问题、协助创作，帮助大众轻松、普惠的获得世界知识和专业服务。如果你有任何问题，可以随时向我提问'

In [21]:

# Retriever over the selected vector store. The active line uses the retriever's default
# search settings; the commented-out line is the variant that would request
# VECTOR_SEARCH_TOP_K results instead.
#retriever = vector_store.as_retriever(search_kwargs={"k": VECTOR_SEARCH_TOP_K})
retriever = vector_store.as_retriever()
# Retrieval QA chain: answers with `llm` using QA_CHAIN_PROMPT and also returns the
# retrieved source documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
    verbose=True
)

In [22]:
# Ask the sample question through the retrieval QA chain and display the full result
# (query, answer and source documents).
# invoke() replaces the deprecated direct call qa_chain({...}) and returns the same dict.
query = "什么是SRDF/Star?"

result = qa_chain.invoke({"query": query})
result

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': '什么是SRDF/Star?',
 'result': 'SRDF/Star是一种灾难恢复解决方案，由三个站点组成：主（生产）站点、辅助站点和第三站点。辅助站点同步镜像来自主站点的数据，而第三站点则异步镜像生产数据。当主站点出现故障时，SRDF/Star解决方案允许您在其余站点之间快速移动操作并在远程站点重建镜像。当主站点出现故障时，SR',
 'source_documents': [Document(page_content='SRDF/Star 解决方案 ..................................................................... 80第4章\n第5章\n第6章\n第7章\n第8章目录\n4 产品指南   PowerMaxOS', metadata={'page': 3, 'source': '/home/ubuntu/sources/pdfs/docu88904_PowerMax-系列产品指南.pdf'}),
  Document(page_content='图 17 带有  R22 设备的并发  SRDF/Star\nR11 R2 \nR22SRDF/S\nSRDF/A\nSRDF/A\nrecovery links Site B \nActive\nInactiveSite A \nSite C \n级联  SRDF/Star\n在级联  SRDF/Star 解决方案中，同步辅助站点较之异步第三站点始终拥有更新的数\n据。当同步辅助站点出现故障时，级联  SRDF/Star 解决方案可以在主站点和异步第三\n站点之间以增量方式建立  SRDF/A 会话。\n级联  SRDF/Star 可以确定当前活动  R1 周期（捕获）内容何时通过远距离  SRDF/A 链\n路到达活动  R2 周期（应用）。这可将为实现完全同步而必须在站点  B 和站点  C 之间\n移动的数据量降到最低。\n此示例显示了一个基本的级联  SRDF/Star 解决方案。远程复制解决方案\n82 产品指南   PowerMaxOS', metadata={'page': 81, 'source': '/home/ubuntu/sources/pdfs/docu88904_PowerMax-系列产品指南.