# Extract Text Data

In [54]:
import json
import pandas as pd

# Uncomment the code below after re-downloading the oracle cards file; it was removed from the directory because of its large unfiltered size
# # Fetch raw card data
# cardfile = "raw/oracle-cards-20240201100133.json"

# with open(cardfile, 'r', encoding='utf-8') as raw_oracle:
#     raw_oracle_data = json.load(raw_oracle)

# filtered_data = [card for card in raw_oracle_data if card.get('set_type') != "memorabilia"]

# columns_to_keep = ["name", "mana_cost", "cmc", "type_line", "oracle_text", "power", "toughness",
#                     "colors", "color_identity", "keywords"]

# df = pd.DataFrame(filtered_data)
# df = df[columns_to_keep]
# df.to_csv("raw/filtered_oracle_database.csv", index=False, encoding='utf-8')

df = pd.read_csv("raw/filtered_oracle_database.csv")

# Render every card as a "column: value" text block (one line per column) and
# keep the card names in a parallel list, in the same order as the rows.
formatted_rows = []
card_names = []
for _, card in df.iterrows():
    field_lines = [f"{column_name}: {value}" for column_name, value in card.items()]
    card_names.append(card['name'])
    formatted_rows.append("\n".join(field_lines).strip())

# Chunk data and create embeddings

In [57]:
# Chunk data into 5-card piles with 2 newlines between each card.
# Chunks are cut by card count. The previous approach re-split the accumulated
# text on "\n\n" and waited for exactly 6 pieces, so a single card whose text
# contained a blank line made the count skip 6 and the chunk never closed.
CARDS_PER_CHUNK = 5

chunked_data = []   # one text blob per chunk, cards separated by a blank line
chunked_names = []  # matching card names per chunk, one name per line
for start in range(0, len(formatted_rows), CARDS_PER_CHUNK):
    stop = start + CARDS_PER_CHUNK
    # The final slice may hold fewer than CARDS_PER_CHUNK cards; it is kept.
    chunked_data.append("\n\n".join(formatted_rows[start:stop]).strip())
    chunked_names.append("\n".join(card_names[start:stop]).strip())

In [2]:
from angle_emb import AnglE

# Load the pretrained UAE-Large-V1 embedding model, configured with the
# 'cls' pooling strategy. Later cells use this object to encode text.
angle = AnglE.from_pretrained(
    'WhereIsAI/UAE-Large-V1',
    pooling_strategy='cls',
)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


comet_ml is installed but `COMET_API_KEY` is not set.


In [79]:
import sqlite3
from tqdm import tqdm

# Build (or refresh) the SQLite vector store: one row per chunk of cards.
conn = sqlite3.connect('raw/card_vector_database.db')
try:
    c = conn.cursor()

    # Table and column names match what semantic_search() selects
    # (id, name, card_text, vector FROM vectordb).
    # NOTE(review): a database file created earlier with a different schema
    # is left untouched by IF NOT EXISTS -- delete the file to rebuild it.
    c.execute('''CREATE TABLE IF NOT EXISTS vectordb
                 (id INTEGER PRIMARY KEY, name TEXT, card_text TEXT, vector BLOB)''')

    # Encode and save the encodings along with the corresponding indices, name, and text.
    # zip() has no length, so tqdm is given the total explicitly.
    for i, (name, chunked) in enumerate(tqdm(zip(chunked_names, chunked_data),
                                             total=len(chunked_data),
                                             desc="Encoding and saving")):
        encodings = angle.encode(chunked, to_numpy=True)

        # OR REPLACE keeps the cell re-runnable: ids restart at 0 on every run
        # and would otherwise violate the primary key.
        # The bytes are stored as float32 because that is the dtype the
        # search code uses to decode them.
        c.execute("INSERT OR REPLACE INTO vectordb (id, name, card_text, vector) VALUES (?, ?, ?, ?)",
                  (i, name, chunked, encodings.astype('float32').tobytes()))

    conn.commit()
finally:
    # Always release the file handle, even if encoding or an insert fails.
    conn.close()

Encoding and saving: 100%|██████████| 2/2 [00:03<00:00,  1.70s/it]


In [80]:
import sqlite3
import numpy as np
from tqdm import tqdm
from scipy.spatial.distance import cosine

def semantic_search(query, vector_db_name, top_k=5):
    """Return the stored card chunks most similar to ``query``.

    The query is embedded with the module-level ``angle`` model and compared,
    by cosine similarity, against every row of the ``vectordb`` table.

    Args:
        query: Text to search for.
        vector_db_name: Path of the SQLite database file to search.
        top_k: Maximum number of matches to return (default 5).

    Returns:
        A list of ``(card_text, similarity)`` tuples, best match first.
    """
    # Embed before opening the database so a failure here cannot leak a connection.
    query_embedding = angle.encode(query, to_numpy=True).flatten()

    conn = sqlite3.connect(vector_db_name)
    try:
        rows = conn.execute("SELECT id, name, card_text, vector FROM vectordb").fetchall()
    finally:
        # Close even when the query fails (e.g. missing table).
        conn.close()

    similarities = []
    for _id, _name, card_text, vector_bytes in rows:
        # Stored blobs are decoded as raw float32 values.
        stored_embedding = np.frombuffer(vector_bytes, dtype=np.float32).flatten()
        # scipy's `cosine` is a distance; 1 - distance gives the similarity.
        sim = 1 - cosine(query_embedding, stored_embedding)
        similarities.append((card_text, sim))

    # Highest similarity first; the sort is stable, so ties keep row order.
    similarities.sort(key=lambda pair: pair[1], reverse=True)

    return similarities[:max(top_k, 0)]

In [78]:
# Demo: run a semantic search against the full card vector database
vector_db_name = "raw/full_card_vector_database.db"
query = "Sign in blood"

test = semantic_search(query, vector_db_name)

# Each result is a (card_text, similarity) pair; print only the text,
# with a blank line after every chunk.
all_chunks = "".join(chunk_text + "\n\n" for chunk_text, _ in test)
print(all_chunks)