# Extract Text Data

In [1]:
import json
import pandas as pd

# Uncomment the code below after re-downloading the oracle cards file; it was removed from the directory because of its large unfiltered size.
# # Fetch raw card data
# cardfile = "raw/oracle-cards-20240201100133.json"

# with open(cardfile, 'r', encoding='utf-8') as raw_oracle:
#     raw_oracle_data = json.load(raw_oracle)

# filtered_data = [card for card in raw_oracle_data if card.get('set_type') != "memorabilia"]

# columns_to_keep = ["name", "mana_cost", "cmc", "type_line", "oracle_text", "power", "toughness",
#                     "colors", "color_identity", "keywords"]

# df = pd.DataFrame(filtered_data)
# df = df[columns_to_keep]
# df.to_csv("raw/filtered_oracle_database.csv", index=False, encoding='utf-8')

# Load the pre-filtered card table (the CSV written by the commented-out export step above).
df = pd.read_csv("raw/filtered_oracle_database.csv")

# Render each card as a block of "column: value" lines, one line per column,
# with surrounding whitespace trimmed. Missing values are written as the text
# "nan", as the recorded output at the end of this notebook shows.
formatted_rows = [
    "".join(f"{field}: {content}\n" for field, content in card.items()).strip()
    for _, card in df.iterrows()
]

# Chunk data and create embeddings

In [2]:
# Group the formatted cards into chunks of CARDS_PER_CHUNK cards, with one blank
# line ("\n\n") between the cards inside a chunk. The final chunk holds whatever
# cards are left over and may be shorter.
#
# Chunks are cut by list position rather than by counting "\n\n" separators in
# the accumulated text: a card whose own text contains a blank line would inflate
# that count, and an exact-equality check on it can then be stepped over so that
# no further chunk boundary is ever produced.
CARDS_PER_CHUNK = 5

chunked_data = [
    "\n\n".join(formatted_rows[start:start + CARDS_PER_CHUNK]).strip()
    for start in range(0, len(formatted_rows), CARDS_PER_CHUNK)
]

In [65]:
from angle_emb import AnglE

# Checkpoint identifier passed to AnglE.from_pretrained.
EMBEDDING_MODEL = 'WhereIsAI/UAE-Large-V1'

# `angle` is the encoder later used via angle.encode(...) to embed text;
# it is loaded with the 'cls' pooling strategy.
angle = AnglE.from_pretrained(EMBEDDING_MODEL, pooling_strategy='cls')

In [75]:
#chunked_data_test = [chunked_data[0], chunked_data[1]]

In [79]:
import sqlite3
from tqdm import tqdm

# Number of chunks to embed. The recorded run embedded two chunks as a smoke
# test; set this to None to embed every chunk in `chunked_data`.
MAX_CHUNKS = 2

# Defined in this cell so it runs on a fresh kernel instead of depending on a
# variable whose only definition elsewhere in the notebook is commented out.
chunked_data_test = chunked_data[:MAX_CHUNKS]

# Connect to the database
conn = sqlite3.connect('raw/card_vector_database.db')
try:
    c = conn.cursor()

    # Create table
    c.execute('''CREATE TABLE IF NOT EXISTS vectors
                 (id INTEGER PRIMARY KEY, text TEXT, vector BLOB)''')

    # Encode each chunk and store it under its position in the list.
    for i, chunked in enumerate(tqdm(chunked_data_test, desc="Encoding and saving")):
        encodings = angle.encode(chunked, to_numpy=True)

        # INSERT OR REPLACE keeps the cell re-runnable: the table survives between
        # runs (CREATE TABLE IF NOT EXISTS), so a plain INSERT of an id that is
        # already present fails with a UNIQUE-constraint IntegrityError.
        # NOTE(review): rows left by an earlier, longer run are not removed here.
        #
        # tobytes() stores only the raw buffer, not dtype or shape; whoever reads
        # it back must know them (presumably float32 - TODO confirm).
        c.execute("INSERT OR REPLACE INTO vectors (id, text, vector) VALUES (?, ?, ?)",
                  (i, chunked, encodings.tobytes()))

    # Commit only after every chunk has been encoded and inserted.
    conn.commit()
finally:
    # Always release the database handle, even if encoding or inserting fails.
    conn.close()

Encoding and saving: 100%|██████████| 2/2 [00:03<00:00,  1.70s/it]


In [87]:
# import sqlite3
# import numpy as np
# from tqdm import tqdm
# from scipy.spatial.distance import cosine

# def semantic_search(query, vector_db_name):
#     conn = sqlite3.connect(vector_db_name)
#     c = conn.cursor()

#     query_embedding = angle.encode(query, to_numpy=True).flatten()

#     c.execute("SELECT id, text, vector FROM vectors")
#     rows = c.fetchall()

#     similarities = []
#     for row in tqdm(rows, desc="Calculating similarities"):
#         id_, text, vector_bytes = row
#         stored_embedding = np.frombuffer(vector_bytes, dtype=np.float32).flatten()
#         sim = 1 - cosine(query_embedding, stored_embedding)
#         similarities.append((id_, text, sim))

#     best_match = max(similarities, key=lambda x: x[2])

#     conn.close()

#     return best_match[1]


# # Example usage:
# query = "Ravnica at War"
# vector_db_name = "card_vector_database.db"
# result = semantic_search(query, vector_db_name)
# print(result)

Calculating similarities: 100%|██████████| 2/2 [00:00<00:00, 1999.19it/s]

name: Ravnica at War
mana_cost: {3}{W}
cmc: 4.0
type_line: Sorcery
oracle_text: Exile all multicolored permanents.
power: nan
toughness: nan
colors: ['W']
color_identity: ['W']
keywords: []

name: Greta, Sweettooth Scourge
mana_cost: {1}{B}{G}
cmc: 3.0
type_line: Legendary Creature — Human Warrior
oracle_text: When Greta, Sweettooth Scourge enters the battlefield, create a Food token. (It's an artifact with "{2}, {T}, Sacrifice this artifact: You gain 3 life.")
{G}, Sacrifice a Food: Put a +1/+1 counter on target creature. Activate only as a sorcery.
{1}{B}, Sacrifice a Food: You draw a card and you lose 1 life.
power: 3
toughness: 3
colors: ['B', 'G']
color_identity: ['B', 'G']
keywords: ['Food']

name: Torrent of Fire
mana_cost: {3}{R}{R}
cmc: 5.0
type_line: Sorcery
oracle_text: Torrent of Fire deals damage to any target equal to the highest mana value among permanents you control.
power: nan
toughness: nan
colors: ['R']
color_identity: ['R']
keywords: []

name: Wyluli Wolf
mana_cost


