In [1]:
!pip install transformers accelerate sentence-transformers faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl (30.7 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 30.7/30.7 MB 50.5 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [2]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the DeepSeek chat model and its tokenizer, identified by `model_name`.
# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run; consider pinning a revision.
model_name = "deepseek-ai/deepseek-llm-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)
# Inference only: switch the model to evaluation mode.
llm.eval()

# Knowledge base: hand-written seed passages (one string per passage).
knowledge_texts = [
    "数据存储是指以一定结构保存数据的方式，包括关系型数据库和非关系型数据库。",
    "信息抽取是从非结构化或半结构化数据中提取结构化信息的过程。",
    "向量数据库使用向量表示文本，可用于高效的语义搜索。",
    "知识问答系统分为基于检索的问答和基于生成的问答。",
    "知识图谱是一种语义网络，表示实体及其之间的关系。"
]

# Text file extracted from a PPT (e.g. saved with python-pptx or another tool).
ppt_file_path = '/kaggle/input/ppttxt/ppt.txt'

# Read the whole file as UTF-8 text.
with open(ppt_file_path, 'r', encoding='utf-8') as f:
    ppt_content = f.read()

# Split into knowledge passages: one per non-empty line, with surrounding
# whitespace stripped (adjust the splitting to match your own file format).
ppt_knowledge_list = [line.strip() for line in ppt_content.split('\n') if line.strip()]

# Merge: append the PPT passages after the seed passages (mutates the list in place).
knowledge_texts.extend(ppt_knowledge_list)

# Embed every passage and build a flat L2 FAISS index over the embeddings.
# Row i of the index corresponds to knowledge_texts[i].
embedder = SentenceTransformer("shibing624/text2vec-base-chinese")  # works better for Chinese text
doc_embeddings = embedder.encode(knowledge_texts)
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(doc_embeddings))

# 问答函数
def answer_question_with_deepseek(question, top_k=2, max_new_tokens=200):
    """Answer ``question`` with retrieval-augmented generation.

    Relies on objects defined at notebook (module) level: ``embedder`` encodes
    the question, ``index`` returns the positions of the nearest entries of
    ``knowledge_texts``, and ``tokenizer`` / ``llm`` generate the answer from a
    prompt containing the retrieved passages.

    Args:
        question: The user's question as a string.
        top_k: Number of knowledge passages requested from the index.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The generated answer text with surrounding whitespace stripped.
    """
    # Retrieve the passages nearest to the question embedding.
    q_embedding = embedder.encode([question])
    _, indices = index.search(np.array(q_embedding), top_k)
    # FAISS pads the id list with -1 when fewer than top_k vectors are indexed;
    # unfiltered, -1 would silently pick the *last* knowledge passage.
    retrieved = [knowledge_texts[i] for i in indices[0] if i >= 0]

    # Build the prompt (the text is runtime data and must stay as-is).
    context = "\n".join(retrieved)
    prompt = f"""你是一位智能问答助手，请根据以下知识内容回答用户的问题。
知识内容：
{context}

用户问题：{question}
你的回答："""

    # Run generation without tracking gradients.
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
    with torch.no_grad():
        outputs = llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )

    # The returned sequence starts with the prompt tokens. Decode only what
    # follows them, rather than splitting the decoded text on the answer
    # marker, which loses text whenever the model repeats that marker.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return answer.strip()


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Batches:   0%|          | 0/50 [00:00<?, ?it/s]

In [3]:
# Demo: run two sample questions through the pipeline and print each Q/A pair.
for question in ("向量数据库的作用是什么？", "什么是信息抽取？"):
    print("问：", question)
    print("答：", answer_question_with_deepseek(question))

问： 向量数据库的作用是什么？


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


答： 向量数据库是一种使用向量来存储和检索数据的数据库类型，主要应用于需要高效处理大量数据的场景中。它的作用包括但不限于语义搜索、推荐系统等领域的应用。
问： 什么是信息抽取？


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


答： 信息抽取是指从文本等非结构化或半结构化数据中自动提取出结构化信息的处理过程。


In [2]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Swap the DeepSeek model for ChatGLM3; load the model and tokenizer by `model_name`.
# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run; consider pinning a revision.
model_name = "THUDM/chatglm3-6b"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto"
)
# Inference only: switch the model to evaluation mode.
llm.eval()


# Knowledge base: hand-written seed passages (one string per passage).
knowledge_texts = [
    "数据存储是指以一定结构保存数据的方式，包括关系型数据库和非关系型数据库。",
    "信息抽取是从非结构化或半结构化数据中提取结构化信息的过程。",
    "向量数据库使用向量表示文本，可用于高效的语义搜索。",
    "知识问答系统分为基于检索的问答和基于生成的问答。",
    "知识图谱是一种语义网络，表示实体及其之间的关系。"
]

# Text file with content extracted from a PPT.
ppt_file_path = '/kaggle/input/ppttxt/ppt.txt'

# Read the whole file as UTF-8 text.
with open(ppt_file_path, 'r', encoding='utf-8') as f:
    ppt_content = f.read()

# Split into knowledge passages: one per non-empty line, with surrounding
# whitespace stripped (adjust the splitting to match your own file format).
ppt_knowledge_list = [line.strip() for line in ppt_content.split('\n') if line.strip()]

# Merge: append the PPT passages after the seed passages (mutates the list in place).
knowledge_texts.extend(ppt_knowledge_list)

# Embed every passage and build a flat L2 FAISS index over the embeddings.
# Row i of the index corresponds to knowledge_texts[i].
embedder = SentenceTransformer("shibing624/text2vec-base-chinese")  # works better for Chinese text
doc_embeddings = embedder.encode(knowledge_texts)
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(doc_embeddings))

# 问答函数
def answer_question_with_deepseek(question, top_k=2, max_new_tokens=200):
    """Answer ``question`` with retrieval-augmented generation.

    Relies on objects defined at notebook (module) level: ``embedder`` encodes
    the question, ``index`` returns the positions of the nearest entries of
    ``knowledge_texts``, and ``tokenizer`` / ``llm`` generate the answer from a
    prompt containing the retrieved passages.

    Args:
        question: The user's question as a string.
        top_k: Number of knowledge passages requested from the index.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The generated answer text with surrounding whitespace stripped.
    """
    # Retrieve the passages nearest to the question embedding.
    q_embedding = embedder.encode([question])
    _, indices = index.search(np.array(q_embedding), top_k)
    # FAISS pads the id list with -1 when fewer than top_k vectors are indexed;
    # unfiltered, -1 would silently pick the *last* knowledge passage.
    retrieved = [knowledge_texts[i] for i in indices[0] if i >= 0]

    # Build the prompt (the text is runtime data and must stay as-is).
    context = "\n".join(retrieved)
    prompt = f"""你是一位智能问答助手，请根据以下知识内容回答用户的问题。
知识内容：
{context}

用户问题：{question}
你的回答："""

    # Run generation without tracking gradients.
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
    with torch.no_grad():
        outputs = llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )

    # The returned sequence starts with the prompt tokens. Decode only what
    # follows them, rather than splitting the decoded text on the answer
    # marker, which loses text whenever the model repeats that marker.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return answer.strip()
# Demo: run two sample questions through the pipeline and print each Q/A pair.
for question in ("向量数据库的作用是什么？", "什么是信息抽取？"):
    print("问：", question)
    print("答：", answer_question_with_deepseek(question))

tokenizer_config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

tokenization_chatglm.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm3-6b:
- tokenization_chatglm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

configuration_chatglm.py:   0%|          | 0.00/2.33k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm3-6b:
- configuration_chatglm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_chatglm.py:   0%|          | 0.00/56.5k [00:00<?, ?B/s]

quantization.py:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm3-6b:
- quantization.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm3-6b:
- modeling_chatglm.py
- quantization.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/21.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

model-00001-of-00007.safetensors:   0%|          | 0.00/1.83G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/1.82G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

modules.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Batches:   0%|          | 0/50 [00:00<?, ?it/s]

问： 向量数据库的作用是什么？


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

答： 向量数据库的作用是使用向量表示文本，以便进行高效的语义搜索。通过将文本转换为向量形式，可以向数据库中快速检索和匹配相关内容。这种方法可以提高查询效率，降低系统延迟，并且有助于实现更精准的搜索结果。向量数据库通常用于需要处理大量文本数据的应用程序，例如自然语言处理、信息检索和文本分类等。
问： 什么是信息抽取？


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

答： 信息抽取是从非结构化或半结构化数据中提取结构化信息的过程。它是一种文本处理技术，用于从原始文本中识别和提取出有用的信息，以便进行进一步的分析和利用。信息抽取广泛应用于各种领域，如自然语言处理、数据库抽取、数据挖掘等。


In [4]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the Baichuan2 chat model and its tokenizer, identified by `model_name`.
# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run; consider pinning a revision.
model_name = "baichuan-inc/Baichuan2-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
# Inference only: switch the model to evaluation mode.
llm.eval()


# Knowledge base: hand-written seed passages (one string per passage).
knowledge_texts = [
    "数据存储是指以一定结构保存数据的方式，包括关系型数据库和非关系型数据库。",
    "信息抽取是从非结构化或半结构化数据中提取结构化信息的过程。",
    "向量数据库使用向量表示文本，可用于高效的语义搜索。",
    "知识问答系统分为基于检索的问答和基于生成的问答。",
    "知识图谱是一种语义网络，表示实体及其之间的关系。"
]

# Text file with content extracted from a PPT.
ppt_file_path = '/kaggle/input/ppttxt/ppt.txt'

# Read the whole file as UTF-8 text.
with open(ppt_file_path, 'r', encoding='utf-8') as f:
    ppt_content = f.read()

# Split into knowledge passages: one per non-empty line, with surrounding
# whitespace stripped (adjust the splitting to match your own file format).
ppt_knowledge_list = [line.strip() for line in ppt_content.split('\n') if line.strip()]

# Merge: append the PPT passages after the seed passages (mutates the list in place).
knowledge_texts.extend(ppt_knowledge_list)

# Embed every passage and build a flat L2 FAISS index over the embeddings.
# Row i of the index corresponds to knowledge_texts[i].
embedder = SentenceTransformer("shibing624/text2vec-base-chinese")  # works better for Chinese text
doc_embeddings = embedder.encode(knowledge_texts)
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(doc_embeddings))

# 问答函数
def answer_question_with_deepseek(question, top_k=2, max_new_tokens=200):
    """Answer ``question`` with retrieval-augmented generation.

    Relies on objects defined at notebook (module) level: ``embedder`` encodes
    the question, ``index`` returns the positions of the nearest entries of
    ``knowledge_texts``, and ``tokenizer`` / ``llm`` generate the answer from a
    prompt containing the retrieved passages.

    Args:
        question: The user's question as a string.
        top_k: Number of knowledge passages requested from the index.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The generated answer text with surrounding whitespace stripped.
    """
    # Retrieve the passages nearest to the question embedding.
    q_embedding = embedder.encode([question])
    _, indices = index.search(np.array(q_embedding), top_k)
    # FAISS pads the id list with -1 when fewer than top_k vectors are indexed;
    # unfiltered, -1 would silently pick the *last* knowledge passage.
    retrieved = [knowledge_texts[i] for i in indices[0] if i >= 0]

    # Build the prompt (the text is runtime data and must stay as-is).
    context = "\n".join(retrieved)
    prompt = f"""你是一位智能问答助手，请根据以下知识内容回答用户的问题。
知识内容：
{context}

用户问题：{question}
你的回答："""

    # Run generation without tracking gradients.
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
    with torch.no_grad():
        outputs = llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )

    # The returned sequence starts with the prompt tokens. Decode only what
    # follows them, rather than splitting the decoded text on the answer
    # marker, which loses text whenever the model repeats that marker.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return answer.strip()
# Demo: run two sample questions through the pipeline and print each Q/A pair.
for question in ("向量数据库的作用是什么？", "什么是信息抽取？"):
    print("问：", question)
    print("答：", answer_question_with_deepseek(question))

generation_utils.py:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat:
- generation_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat:
- quantizer.py
- generation_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/15.0G [00:00<?, ?B/s]

Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))


model.safetensors:   0%|          | 0.00/15.0G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Batches:   0%|          | 0/50 [00:00<?, ?it/s]

问： 向量数据库的作用是什么？


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  self.gen = func(*args, **kwds)


答： 向量数据库是一种用于存储和管理大量文本数据的数据库技术，它使用向量表示文本，从而实现高效、准确的语义搜索。这种数据库可以有效地处理大量的非结构化数据，例如社交媒体帖子、评论和新闻文章等。通过将文本转换为数值向量，向量数据库能够更好地理解和比较这些数据，从而使搜索更加精确和相关。
问： 什么是信息抽取？


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

答： 信息抽取（Information Extraction）是从非结构化或半结构化数据中提取结构化信息的过程。这个过程可以帮助我们从大量的文本、数据库或其他数据源中快速找到和提取有用的信息。通过使用自然语言处理（NLP）技术和机器学习算法，信息抽取可以将原始数据转化为结构化的表格和数据集，从而为数据分析、挖掘和可视化提供基础。


In [4]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the Qwen chat model and its tokenizer, identified by `model_name`.
# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run; consider pinning a revision.
model_name = "Qwen/Qwen-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
# Inference only: switch the model to evaluation mode.
llm.eval()


# Knowledge base: hand-written seed passages (one string per passage).
knowledge_texts = [
    "数据存储是指以一定结构保存数据的方式，包括关系型数据库和非关系型数据库。",
    "信息抽取是从非结构化或半结构化数据中提取结构化信息的过程。",
    "向量数据库使用向量表示文本，可用于高效的语义搜索。",
    "知识问答系统分为基于检索的问答和基于生成的问答。",
    "知识图谱是一种语义网络，表示实体及其之间的关系。"
]

# Text file with content extracted from a PPT.
ppt_file_path = '/kaggle/input/ppttxt/ppt.txt'

# Read the whole file as UTF-8 text.
with open(ppt_file_path, 'r', encoding='utf-8') as f:
    ppt_content = f.read()

# Split into knowledge passages: one per non-empty line, with surrounding
# whitespace stripped (adjust the splitting to match your own file format).
ppt_knowledge_list = [line.strip() for line in ppt_content.split('\n') if line.strip()]

# Merge: append the PPT passages after the seed passages (mutates the list in place).
knowledge_texts.extend(ppt_knowledge_list)

# Embed every passage and build a flat L2 FAISS index over the embeddings.
# Row i of the index corresponds to knowledge_texts[i].
embedder = SentenceTransformer("shibing624/text2vec-base-chinese")  # works better for Chinese text
doc_embeddings = embedder.encode(knowledge_texts)
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(doc_embeddings))

# 问答函数
def answer_question_with_deepseek(question, top_k=2, max_new_tokens=200):
    """Answer ``question`` with retrieval-augmented generation.

    Relies on objects defined at notebook (module) level: ``embedder`` encodes
    the question, ``index`` returns the positions of the nearest entries of
    ``knowledge_texts``, and ``tokenizer`` / ``llm`` generate the answer from a
    prompt containing the retrieved passages.

    Args:
        question: The user's question as a string.
        top_k: Number of knowledge passages requested from the index.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The generated answer text with surrounding whitespace stripped.
    """
    # Retrieve the passages nearest to the question embedding.
    q_embedding = embedder.encode([question])
    _, indices = index.search(np.array(q_embedding), top_k)
    # FAISS pads the id list with -1 when fewer than top_k vectors are indexed;
    # unfiltered, -1 would silently pick the *last* knowledge passage.
    retrieved = [knowledge_texts[i] for i in indices[0] if i >= 0]

    # Build the prompt (the text is runtime data and must stay as-is).
    context = "\n".join(retrieved)
    prompt = f"""你是一位智能问答助手，请根据以下知识内容回答用户的问题。
知识内容：
{context}

用户问题：{question}
你的回答："""

    # Run generation without tracking gradients.
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
    with torch.no_grad():
        outputs = llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )

    # The returned sequence starts with the prompt tokens. Decode only what
    # follows them, rather than splitting the decoded text on the answer
    # marker, which loses text whenever the model repeats that marker.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return answer.strip()
# Demo: run two sample questions through the pipeline and print each Q/A pair.
for question in ("向量数据库的作用是什么？", "什么是信息抽取？"):
    print("问：", question)
    print("答：", answer_question_with_deepseek(question))

cpp_kernels.py:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B-Chat:
- cpp_kernels.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


qwen_generation_utils.py:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B-Chat:
- qwen_generation_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B-Chat:
- cpp_kernels.py
- qwen_generation_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.96G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/273 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Batches:   0%|          | 0/50 [00:00<?, ?it/s]

问： 向量数据库的作用是什么？


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

答： 向量数据库是一种数据存储方式，它使用向量来表示文本，并且可以进行高效的语义搜索。这样，数据库可以在短时间内找到与查询语句最匹配的数据，从而提高查询效率。此外，向量数据库还可以用于其他自然语言处理任务，例如文本分类、情感分析等。
问： 什么是信息抽取？


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

答： 信息抽取是从非结构化或半结构化数据中提取结构化信息的过程。


In [4]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the Yi chat model and its tokenizer, identified by `model_name`.
# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run; consider pinning a revision.
model_name = "01-ai/Yi-6B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
# Inference only: switch the model to evaluation mode.
llm.eval()


# Knowledge base: hand-written seed passages (one string per passage).
knowledge_texts = [
    "数据存储是指以一定结构保存数据的方式，包括关系型数据库和非关系型数据库。",
    "信息抽取是从非结构化或半结构化数据中提取结构化信息的过程。",
    "向量数据库使用向量表示文本，可用于高效的语义搜索。",
    "知识问答系统分为基于检索的问答和基于生成的问答。",
    "知识图谱是一种语义网络，表示实体及其之间的关系。"
]

# Text file with content extracted from a PPT.
ppt_file_path = '/kaggle/input/ppttxt/ppt.txt'

# Read the whole file as UTF-8 text.
with open(ppt_file_path, 'r', encoding='utf-8') as f:
    ppt_content = f.read()

# Split into knowledge passages: one per non-empty line, with surrounding
# whitespace stripped (adjust the splitting to match your own file format).
ppt_knowledge_list = [line.strip() for line in ppt_content.split('\n') if line.strip()]

# Merge: append the PPT passages after the seed passages (mutates the list in place).
knowledge_texts.extend(ppt_knowledge_list)

# Embed every passage and build a flat L2 FAISS index over the embeddings.
# Row i of the index corresponds to knowledge_texts[i].
embedder = SentenceTransformer("shibing624/text2vec-base-chinese")  # works better for Chinese text
doc_embeddings = embedder.encode(knowledge_texts)
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(doc_embeddings))

# 问答函数
def answer_question_with_deepseek(question, top_k=2, max_new_tokens=512):
    """Answer ``question`` with retrieval-augmented generation.

    Relies on objects defined at notebook (module) level: ``embedder`` encodes
    the question, ``index`` returns the positions of the nearest entries of
    ``knowledge_texts``, and ``tokenizer`` / ``llm`` generate the answer from a
    prompt containing the retrieved passages.

    Args:
        question: The user's question as a string.
        top_k: Number of knowledge passages requested from the index.
        max_new_tokens: Upper bound on the number of newly generated tokens
            (defaults to 512 in this cell).

    Returns:
        The generated answer text with surrounding whitespace stripped.
    """
    # Retrieve the passages nearest to the question embedding.
    q_embedding = embedder.encode([question])
    _, indices = index.search(np.array(q_embedding), top_k)
    # FAISS pads the id list with -1 when fewer than top_k vectors are indexed;
    # unfiltered, -1 would silently pick the *last* knowledge passage.
    retrieved = [knowledge_texts[i] for i in indices[0] if i >= 0]

    # Build the prompt (the text is runtime data and must stay as-is).
    context = "\n".join(retrieved)
    prompt = f"""你是一位智能问答助手，请根据以下知识内容回答用户的问题。
知识内容：
{context}

用户问题：{question}
你的回答："""

    # Run generation without tracking gradients.
    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
    with torch.no_grad():
        outputs = llm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )

    # The returned sequence starts with the prompt tokens. Decode only what
    # follows them, rather than splitting the decoded text on the answer
    # marker, which loses text whenever the model repeats that marker.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return answer.strip()
# Demo: run two sample questions through the pipeline and print each Q/A pair.
for question in ("向量数据库的作用是什么？", "什么是信息抽取？"):
    print("问：", question)
    print("答：", answer_question_with_deepseek(question))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/50 [00:00<?, ?it/s]

问： 向量数据库的作用是什么？


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

答： 自然语言处理的许多应用都可以从向量数据库中受益，例如机器翻译、实体识别、情感分析、推荐系统和语音识别。在这些任务中，向量数据库可以帮助模型更好地理解文本的语义内容，从而提供更准确的结果。
问： 什么是信息抽取？


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

答： 信息抽取是一种从各种数据源中提取有用的信息和知识的自动过程。它通常涉及到自然语言处理（NLP）和机器学习技术来理解和分析文本，以便识别和提取特定的实体、关系和概念。信息抽取可以用于构建数据库、支持决策制定以及进行知识发现等应用。


In [3]:
!pip install transformers_stream_generator

Collecting transformers_stream_generator
  Downloading transformers-stream-generator-0.0.5.tar.gz (13 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: transformers_stream_generator
  Building wheel for transformers_stream_generator (setup.py) ... done
  Created wheel for transformers_stream_generator: filename=transformers_stream_generator-0.0.5-py3-none-any.whl size=12425 sha256=6efb90e4539ae51705fa58b06398f3f47d89b77fb396a067727b6180fd19b873
  Stored in directory: /root/.cache/pip/wheels/95/4a/90/140f7b67d125906f6a165f38aad212ecb4a695ad0d87582437
Successfully built transformers_stream_generator
Installing collected packages: transformers_stream_generator
Successfully installed transformers_stream_generator-0.0.5


In [3]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 76.1/76.1 MB 23.5 MB/s eta 0:00:00
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5


In [1]:
!pip install transformers accelerate sentence-transformers faiss-cpu peft

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl (30.7 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 30.7/30.7 MB 40.0 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
