In [26]:
# Move the kernel's working directory into the langchain-ChatGLM checkout; the
# imports of `configs`, `models`, `chains`, `textsplitter` and `utils` below
# presumably resolve from there, as do the relative paths used later.
# NOTE: a bare `cd` only works while IPython automagic is enabled.
cd ../langchain-ChatGLM-master/

D:\dev_cu\hohoChat\langchain-ChatGLM-master


In [None]:
import os
import time

import torch.cuda
import torch.backends

from configs import model_config
from models.chatglm_llm import ChatGLM
from chains.local_doc_qa import *
from textsplitter import ChineseTextSplitter
from utils import torch_gc

from langchain.text_splitter import MarkdownTextSplitter, CharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import UnstructuredMarkdownLoader 

In [30]:
# Root folder that is scanned recursively for the Markdown files to index.
# Relative path: it is resolved against the kernel's current working directory.
DATA_DIR = "../../data/Laws-master"

In [None]:
def time_str_YmdHmS(timestamp = None):
    """Format a point in time, in the local timezone, as ``YYYYmmddHHMMSS``.

    Args:
        timestamp: Seconds since the epoch. Defaults to ``None``, which means
            "now" (``time.localtime(None)`` uses the current time).

    Returns:
        str: Year, month, day, hour, minute and second, concatenated with no
        separators, e.g. ``'20230517143709'``.
    """
    local_time = time.localtime(timestamp)
    # `%M` is the minute field. The earlier format used `%m` here, which
    # repeated the month and dropped the minutes from the result.
    return time.strftime('%Y%m%d%H%M%S', local_time)

In [31]:
def get_filepaths_at_path(item_path):
    """Collect the paths of all files found at or below ``item_path``.

    Args:
        item_path: Path to a file or to a directory.

    Returns:
        list: ``[item_path]`` when it is a regular file; otherwise the
        concatenation of the results for every entry returned by
        ``os.listdir(item_path)``, in that listing order (depth-first).
    """
    if not os.path.isfile(item_path):
        # Anything that is not a file is listed and each entry is recursed into.
        return [
            found
            for entry in os.listdir(item_path)
            for found in get_filepaths_at_path(os.path.join(item_path, entry))
        ]

    return [item_path]

In [32]:
# Every file under DATA_DIR except those named exactly `_index.md`.
file_paths = [
    candidate
    for candidate in get_filepaths_at_path(DATA_DIR)
    if os.path.basename(candidate) != '_index.md'
]

In [33]:
# Sanity check: how many files remain after dropping the `_index.md` entries.
len(file_paths)

3098

In [64]:
# Markdown-aware splitter configured with chunk_size 500 and chunk_overlap 100.
# (Alternative that was tried earlier: ChineseTextSplitter(pdf = False).)
text_splitter = MarkdownTextSplitter(chunk_size=500, chunk_overlap=100)

# Load each file and accumulate the chunks the splitter produces for it.
docs = []
for file_path in file_paths:
    loader = UnstructuredMarkdownLoader(file_path)
    docs.extend(loader.load_and_split(text_splitter))

In [65]:
# Sanity check: total number of chunks collected across all loaded files.
len(docs)

44407

In [66]:
# Build the embedding model registered under the "ernie-base" key of
# model_config.embedding_model_dict, placed on model_config.EMBEDDING_DEVICE.
# The first run downloads the model files (see the progress output below).
embeddings = HuggingFaceEmbeddings(model_name = model_config.embedding_model_dict["ernie-base"], 
                                   model_kwargs = {'device': model_config.EMBEDDING_DEVICE})

Downloading (…)b8147/.gitattributes:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading (…)e374ab8147/README.md:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Downloading (…)74ab8147/config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading (…)e374ab8147/vocab.txt:   0%|          | 0.00/187k [00:00<?, ?B/s]

No sentence-transformers model found with name C:\Users\hoho/.cache\torch\sentence_transformers\nghuyong_ernie-3.0-base-zh. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at C:\Users\hoho/.cache\torch\sentence_transformers\nghuyong_ernie-3.0-base-zh were not used when initializing ErnieModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing ErnieModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ErnieModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [67]:
# Build the FAISS vector store from every chunk in `docs` using `embeddings`.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# `vector_store` was never bound in that session; the cells below need it.
vector_store = FAISS.from_documents(docs, embeddings)


KeyboardInterrupt



In [None]:
# Save the vector store to a folder whose name ends with the string returned by
# time_str_YmdHmS(), so the target path varies with the time of the run.
vs_path = f"../outputs/vector_store/law_FAISS_{time_str_YmdHmS()}"
vector_store.save_local(vs_path)

In [None]:
# Monkey-patch at class level: this replaces the method on FAISS itself, so it
# affects every FAISS instance in the kernel, not only `vector_store`.
# The replacement function is not defined in this notebook — presumably it comes
# from `from chains.local_doc_qa import *`; TODO confirm.
FAISS.similarity_search_with_score_by_vector = similarity_search_with_score_by_vector
# Attach the configured chunk size to this store instance (presumably read by
# the patched search function — verify against chains.local_doc_qa).
vector_store.chunk_size = model_config.CHUNK_SIZE

In [None]:
# Instantiate the ChatGLM wrapper and load the model selected by
# model_config.LLM_MODEL onto model_config.LLM_DEVICE.
llm = ChatGLM()
llm.load_model(model_name_or_path = model_config.llm_model_dict[model_config.LLM_MODEL],
               llm_device = model_config.LLM_DEVICE,
               use_ptuning_v2 = model_config.USE_PTUNING_V2)
# NOTE(review): LLM_HISTORY_LEN is used unqualified and is not defined in this
# notebook — presumably supplied by `from chains.local_doc_qa import *`; confirm.
llm.history_len = LLM_HISTORY_LEN

In [None]:
def answer_based_on_knowledge(query, vector_store, llm, chat_history = None):
    """Answer ``query`` with ``llm``, using documents retrieved from ``vector_store``.

    This is a generator, so no work happens until it is iterated. Only the
    non-streaming path (``streaming = False``) is implemented here.

    Args:
        query: The user's question.
        vector_store: Object providing
            ``similarity_search_with_score(query, k = ...)``.
        llm: Object whose ``_call(prompt = ..., history = ..., streaming = False)``
            yields ``(result, history)`` pairs.
        chat_history: History list forwarded to ``llm._call``. Defaults to
            ``None``, which means a new empty list for this call. A list
            supplied by the caller is forwarded as-is, not copied.

    Yields:
        tuple: ``(response, history)``. ``response`` is a dict with the keys
        ``"query"``, ``"result"`` and ``"source_documents"``; ``history`` is
        the list yielded by ``llm._call`` with the first element of its last
        entry overwritten by ``query``.
    """
    # A literal `[]` default is created once at definition time and shared by
    # every call, so history would leak from one question into the next.
    if chat_history is None:
        chat_history = []

    # VECTOR_SEARCH_TOP_K, get_docs_with_score and generate_prompt are not
    # defined in this notebook — presumably they come from the wildcard import
    # of chains.local_doc_qa; TODO confirm.
    related_docs_with_score = vector_store.similarity_search_with_score(query, k = VECTOR_SEARCH_TOP_K)
    related_docs = get_docs_with_score(related_docs_with_score)
    prompt = generate_prompt(related_docs, query)

    for result, history in llm._call(prompt = prompt, history = chat_history, streaming = False):
        # Store the raw question in the newest history entry; presumably the
        # LLM recorded the full generated prompt there — verify in ChatGLM._call.
        history[-1][0] = query
        response = {"query": query,
                    "result": result,
                    "source_documents": related_docs}
        yield response, history

In [None]:
from IPython.display import display, Markdown, clear_output

def display_answer(query, chat_history = None):
    """Run ``query`` through ``answer_based_on_knowledge`` and render each result.

    Uses the notebook-level globals ``vector_store`` and ``llm``. Each yielded
    result replaces the previous cell output and is rendered as Markdown.

    Args:
        query: The user's question.
        chat_history: History list forwarded to ``answer_based_on_knowledge``.
            Defaults to ``None``, which means a new empty list for this call.

    Returns:
        tuple: The last ``(response, history)`` pair yielded, or
        ``(None, chat_history)`` if nothing was yielded.
    """
    # A literal `[]` default would be shared, and mutated, across calls.
    if chat_history is None:
        chat_history = []

    # Pre-bind the results so an empty generator does not end in
    # UnboundLocalError at the return statement.
    resp, history = None, chat_history
    for resp, history in answer_based_on_knowledge(query, vector_store, llm, chat_history):
        clear_output(wait = True)
        display(Markdown(resp['result']))

    return resp, history

In [57]:
# Sample query (Chinese), roughly: "What penalties apply for not repaying
# credit-card debt?" A new empty history list is passed explicitly for this call.
question = "信用卡欠款不还会遭到什么处罚？"
answer, history = display_answer(question, chat_history = [])

[Document(page_content='人体损伤程度鉴定标准\n\n范围\n\n本标准规定了人体损伤程度鉴定的原则、方法、内容和等级划分。\n\n本标准适用于《中华人民共和国刑法》及其他法律、法规所涉及的人体损伤程度鉴定。\n\n引用文件\n\n下列文件对于本文件的应用是必不可少的。本标准引用文件的最新版本适用于本标准。\n\nGB 18667 道路交通事故受伤人员伤残评定\n\nGB/T 16180 劳动能力鉴定 职工工伤与职业病致残等级\n\nGB/T 26341-2010 残疾人残疾分类和分级\n\n术语定义\n\n重伤\n\n使人肢体残废、毁人容貌、丧失听觉、丧失视觉、丧失其他器官功能或者其他对于人身健康有重大伤害的损伤，包括重伤一级和重伤二级。\n\n轻伤\n\n使人肢体或者容貌损害，听觉、视觉或者其他器官功能部分障碍或者其他对于人身健康有中度伤害的损伤，包括轻伤一级和轻伤二级。\n\n轻微伤\n\n各种致伤因素所致的原发性损伤，造成组织器官结构轻微损害或者轻微功能障碍。\n\n总则\n\n鉴定原则\n\n遵循实事求是的原则，坚持以致伤因素对人体直接造成的原发性损伤及由损伤引起的并发症或者后遗症为依据，全面分析，综合鉴定。', metadata={'source': '../../data/Laws-master\\其他\\人体损伤程度鉴定标准(2014-01-01).md'}),
 Document(page_content='总则\n\n鉴定原则\n\n遵循实事求是的原则，坚持以致伤因素对人体直接造成的原发性损伤及由损伤引起的并发症或者后遗症为依据，全面分析，综合鉴定。\n\n对于以原发性损伤及其并发症作为鉴定依据的，鉴定时应以损伤当时伤情为主，损伤的后果为辅，综合鉴定。\n\n对于以容貌损害或者组织器官功能障碍作为鉴定依据的，鉴定时应以损伤的后果为主，损伤当时伤情为辅，综合鉴定。\n\n鉴定时机\n\n以原发性损伤为主要鉴定依据的，伤后即可进行鉴定；以损伤所致的并发症为主要鉴定依据的，在伤情稳定后进行鉴定。\n\n以容貌损害或者组织器官功能障碍为主要鉴定依据的，在损伤90日后进行鉴定；在特殊情况下可以根据原发性损伤及其并发症出具鉴定意见，但须对有可能出现的后遗症加以说明，必要时应进行复检并予以补充鉴定。 [1]\n\n疑难、复杂的损伤，在临床