In [1]:
dataset = []
with open('data/all.txt', 'r', encoding='utf-8') as file:
  dataset = file.readlines()
  print(f'Loaded {len(dataset)} entries')

Loaded 38402 entries


In [7]:
import os
import requests
from dotenv import load_dotenv

load_dotenv()

token = os.getenv("TOKEN")

def get_embedding(text):
    url = "https://api.siliconflow.cn/v1/embeddings"

    payload = {
        "model": "BAAI/bge-m3",
        "input": "Silicon flow embedding online: fast, affordable, and high-quality embedding services. come try it out!",
        "encoding_format": "float"
    }
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }

    response = requests.request("POST", url, json=payload, headers=headers)

    return response.json()['data'][0]['embedding']

In [8]:
try:
    text = "Embed this string for me!"
    vector = get_embedding(text)
    print(f"向量维度: {len(vector)}")
    print(f"向量前5个值: {vector[:5]}")
except Exception as e:
    print(f"错误: {e}")

向量维度: 1024
向量前5个值: [-0.046083894, 0.03211734, -0.035460908, -0.013230968, -0.0021028642]


In [10]:
import ollama

EMBEDDING_MODEL = r'E:\lmstudio\mradermacher\Conan-embedding-v1-GGUF'
LANGUAGE_MODEL = r'E:\lmstudio\unsloth\Qwen3-30B-A3B-GGUF'

# Each element in the VECTOR_DB will be a tuple (chunk, embedding)
# The embedding is a list of floats, for example: [0.1, 0.04, -0.34, 0.21, ...]
VECTOR_DB = []

def add_chunk_to_database(chunk):
  embedding = get_embedding(chunk)
  VECTOR_DB.append((chunk, embedding))

In [11]:
for i, chunk in enumerate(dataset):
  add_chunk_to_database(chunk)
  print(f'Added chunk {i+1}/{len(dataset)} to the database\r')

Added chunk 1/38402 to the database
Added chunk 2/38402 to the database
Added chunk 3/38402 to the database
Added chunk 4/38402 to the database
Added chunk 5/38402 to the database
Added chunk 6/38402 to the database
Added chunk 7/38402 to the database
Added chunk 8/38402 to the database
Added chunk 9/38402 to the database
Added chunk 10/38402 to the database
Added chunk 11/38402 to the database
Added chunk 12/38402 to the database
Added chunk 13/38402 to the database
Added chunk 14/38402 to the database
Added chunk 15/38402 to the database
Added chunk 16/38402 to the database
Added chunk 17/38402 to the database
Added chunk 18/38402 to the database
Added chunk 19/38402 to the database
Added chunk 20/38402 to the database
Added chunk 21/38402 to the database
Added chunk 22/38402 to the database
Added chunk 23/38402 to the database
Added chunk 24/38402 to the database
Added chunk 25/38402 to the database
Added chunk 26/38402 to the database
Added chunk 27/38402 to the database
Added chun

KeyboardInterrupt: 

In [12]:
def cosine_similarity(a, b):
  dot_product = sum([x * y for x, y in zip(a, b)])
  norm_a = sum([x ** 2 for x in a]) ** 0.5
  norm_b = sum([x ** 2 for x in b]) ** 0.5
  return dot_product / (norm_a * norm_b)

In [14]:
def retrieve(query, top_n=3):
  query_embedding = get_embedding(query)
  # temporary list to store (chunk, similarity) pairs
  similarities = []
  for chunk, embedding in VECTOR_DB:
    similarity = cosine_similarity(query_embedding, embedding)
    similarities.append((chunk, similarity))
  # sort by similarity in descending order, because higher similarity means more relevant chunks
  similarities.sort(key=lambda x: x[1], reverse=True)
  # finally, return the top N most relevant chunks
  return similarities[:top_n]

In [16]:
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve(input_query, top_n=30)

print('Retrieved knowledge:')
for chunk, similarity in retrieved_knowledge:
  print(f' - (similarity: {similarity:.2f}) {chunk}')

instruction_prompt = f'''You are a helpful chatbot.
Use only the following pieces of context to answer the question. Don't make up any new information:
{'\n'.join([f' - {chunk}' for chunk, similarity in retrieved_knowledge])}
'''

Retrieved knowledge:
 - (similarity: 0.66) [unknown_chara_602628616]: （在阿拜多斯住宅区遇见了熟悉的面孔）

 - (similarity: 0.66) 无月: 那些是阿拜多斯的人哦。

 - (similarity: 0.64) 绫音: 我们是阿拜多斯对策委员会。 

 - (similarity: 0.63) 白子: 你是来找「阿拜多斯」的吗？

 - (similarity: 0.63) 头盔团B: ……没错。他是阿拜多斯对策委员会的成员。

 - (similarity: 0.63) 无月: 是吧，是阿拜多斯那些孩子……虽然也有不认识的人。<br/>……他们在这里做什么？而且还戴着面罩。

 - (similarity: 0.63) 亚瑠: 骗，骗人的吧……那些孩子竟然是阿拜多斯……？<br/>呜，呜呜……命运为什么要开这种玩笑……

 - (similarity: 0.62) 伊织: 嗯？那边的学生？我记得是……阿拜多斯？

 - (similarity: 0.62) 佳世子: 但反过来说，阿拜多斯就是那么不容忽视的对手。<br/>虽然他们有学生数量少这个最大的弱点。

 - (similarity: 0.60) ???: 换句话说就是你们被解雇了。<br/>从今以后，阿拜多斯由我们来处理。

 - (similarity: 0.60) 黑色西装人: ……呵呵，因为情况有所变动。 <br/>所以我想再次向持有阿拜多斯最高神秘的星野小姐提议。

 - (similarity: 0.60) 佳世子: 阿……阿拜多斯……？

 - (similarity: 0.59) [unknown_chara_602628616]: （虽然抵达了阿拜多斯自治区……）

 - (similarity: 0.59) 头盔团A: 呜啊啊啊！！难，难道是阿拜多斯那伙人吗！？<br/>竟敢对我们……

 - (similarity: 0.59) [unknown_chara_602628616]: （与日步美和大家一起回到阿拜多斯，所有人一起确认了文档）

 - (similarity: 0.59) 无月: ……我还以为是谁，这不是阿拜多斯的眼镜妹吗～？

 - (similarity: 0.59) 柴大将: 你们