In [5]:
import sqlite3
import numpy as np
from tqdm import tqdm
from scipy.spatial.distance import cosine
from angle_emb import AnglE

# Shared sentence-embedding model: loaded once here and reused by the cells
# below that embed a query via angle.encode (e.g. semantic_search).
ANGLE_MODEL_NAME = 'WhereIsAI/UAE-Large-V1'
angle = AnglE.from_pretrained(ANGLE_MODEL_NAME, pooling_strategy='cls')

def semantic_search(query, vector_db_name, number_chunks = 5):
    """Return the stored card texts most similar to ``query``.

    The query is embedded with the module-level ``angle`` model and compared,
    by cosine similarity, against every vector stored in the ``vectordb``
    table of the SQLite file ``vector_db_name``.

    Args:
        query: Text to embed and search for.
        vector_db_name: Path of the SQLite database. It must contain a
            ``vectordb`` table with ``id``, ``name``, ``card_text`` and
            ``vector`` columns; ``vector`` blobs are decoded as float32.
        number_chunks: Maximum number of matches to return. Negative values
            are treated as 0; ``None`` returns every row.

    Returns:
        A list of ``(card_text, similarity)`` tuples sorted by similarity,
        best match first. Rows with equal similarity keep their query order.
    """
    # A negative slice bound would silently mean "all but the last n" matches.
    if number_chunks is not None and number_chunks < 0:
        number_chunks = 0

    # Embed before opening the database so a failing encode cannot leak a connection.
    query_embedding = angle.encode(query, to_numpy=True).flatten()

    conn = sqlite3.connect(vector_db_name)
    try:
        rows = conn.execute("SELECT id, name, card_text, vector FROM vectordb").fetchall()
    finally:
        # Always release the connection, even when the query raises.
        conn.close()

    # scipy's cosine() is a distance, so similarity is 1 - distance.
    scored = [
        (card_text, 1 - cosine(query_embedding, np.frombuffer(vector_bytes, dtype=np.float32)))
        for _id, _name, card_text, vector_bytes in rows
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return scored[:number_chunks]

In [8]:
# Usage example: fetch the single closest card for a query and concatenate the
# matched card texts, each followed by a blank line, into one string.
vector_db_name = "raw/full_card_vector_database.db"
query = "Sign in blood"

test = semantic_search(query, vector_db_name, 1)
all_chunks = "".join(item[0] + "\n\n" for item in test)

In [13]:
# Find the longest card_text stored in the vector database and its length.
conn = sqlite3.connect(vector_db_name)
try:
    c = conn.cursor()

    # NOTE(review): this embedding is not used anywhere in this cell; it is kept
    # only in case another cell reads `query_embedding` — confirm and remove.
    query_embedding = angle.encode(query, to_numpy=True).flatten()

    c.execute("SELECT id, name, card_text, vector FROM vectordb")
    rows = c.fetchall()
finally:
    # Release the connection even if the embedding or the query fails.
    conn.close()

# max() returns the first of several equally long texts, which matches a
# strict "longer than the current best" scan; default="" covers an empty table.
max_text = max((row[2] for row in rows), key=len, default="")
max_length = len(max_text)

In [54]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# The tokenizer and the causal LM are loaded from the same checkpoint, so the
# identifier is named once and reused for both.
MISTRAL_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(MISTRAL_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MISTRAL_MODEL_ID)
def rag_query(query, model, tokenizer):
    """Answer a card lookup with retrieval-augmented generation.

    The three card chunks closest to ``query`` are fetched with
    ``semantic_search`` from the database named by the module-level
    ``vector_db_name``, placed into an instruction prompt, and passed to
    ``model`` through the tokenizer's chat template.

    Args:
        query: Card name (or other text) to look up.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Matching tokenizer exposing ``apply_chat_template`` and
            ``batch_decode``.

    Returns:
        The first decoded sequence produced by ``tokenizer.batch_decode``.
    """
    chunks = semantic_search(query, vector_db_name, 3)

    # Separate chunks with real blank lines ("\n\n", not the literal "/n/n").
    prompt_prefix = "".join(chunk[0] + "\n\n" for chunk in chunks)

    # No hand-written "[INST]" marker here: apply_chat_template wraps the user
    # turn in the model's instruction tokens, so adding one would duplicate it.
    prompt = f"""
    Given the following card data, provide me with the exact text of {query} in the format of :
    \nname: \nmana_cost: \ncmc: \ntype_line: \noracle_text: \npower: \ntoughness: \ncolors: \ncolor_identity: \nkeywords:
    
    \n{prompt_prefix}
    
    Use only the data in the provided chunks above.
    """

    message = [{
        "role":"user",
        "content": prompt
    }]

    model_inputs = tokenizer.apply_chat_template(
        message,
        return_tensors = "pt",
    )
    # Keep the inputs on the same device as the model so generate() does not
    # fail when the model has been moved off the CPU.
    model_inputs = model_inputs.to(model.device)

    generated_ids = model.generate(
        model_inputs,
        max_new_tokens = 1000,
        do_sample = True
    )

    decoded = tokenizer.batch_decode(generated_ids)

    return decoded[0]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [53]:
# Pass the loaded model: rag_query calls model.generate, so passing None
# raises AttributeError before any text is produced.
rag_query("Elesh Norn, Grand Cenobite", model, tokenizer)

1968


In [56]:
pip --version angle_emb

pip 23.2.1 from C:\Users\lesli\anaconda3\Lib\site-packages\pip (python 3.11)

Note: you may need to restart the kernel to use updated packages.
