In [1]:
import json
import pandas as pd

# Read the raw card dump, drop memorabilia entries, and export the
# columns of interest to CSV.
cardfile = "raw/oracle-cards-20240201100133.json"

with open(cardfile, 'r', encoding='utf-8') as raw_oracle:
    raw_oracle_data = json.load(raw_oracle)

# Keep every entry whose 'set_type' is anything other than "memorabilia"
# (entries without a 'set_type' key are kept as well).
filtered_data = []
for entry in raw_oracle_data:
    if entry.get('set_type') == "memorabilia":
        continue
    filtered_data.append(entry)

# Columns written to the CSV, in output order.
columns_to_keep = [
    "name",
    "mana_cost",
    "cmc",
    "type_line",
    "oracle_text",
    "power",
    "toughness",
    "colors",
    "color_identity",
    "keywords",
]

# Selecting by label raises KeyError if any listed column is absent from the data.
df = pd.DataFrame(filtered_data).loc[:, columns_to_keep]
df.to_csv("raw/filtered_oracle_database.csv", encoding='utf-8', index=False)

In [4]:
# Load the Llama-2 chat tokenizer directly from the Hugging Face Hub.
import os

from transformers import AutoTokenizer, AutoModelForCausalLM

# SECURITY: an access token used to be hard-coded in this cell. Credentials must
# never live in a notebook; the previously committed token should be revoked.
# The token is now read from the HF_TOKEN environment variable. When the variable
# is unset, `token=None` is passed and the library is expected to fall back to
# locally stored credentials (e.g. from `huggingface-cli login`).
hf_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=hf_token)
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [5]:
# Tokenize data using llama2 tokenizer

# Render each DataFrame row as one text block with a "column: value" line per
# column; surrounding whitespace of the whole block is stripped.
formatted_rows = [
    "".join(f"{column_name}: {value}\n" for column_name, value in row.items()).strip()
    for _, row in df.iterrows()
]

# Tokenize every text block separately; each result is truncated to 512 tokens
# and returned as TensorFlow tensors.
tokenized_rows = []
for card_text in formatted_rows:
    encoded = tokenizer(card_text, return_tensors='tf', max_length=512, truncation=True)
    tokenized_rows.append(encoded)

In [14]:
from tqdm import tqdm

# Chunk the data based on the max tokens for llama2

max_token_limit = 1024

def chunk_tokenized_data(tokenized_data, max_tokens=None):
    """Group consecutive tokenized rows into chunks that fit a token budget.

    Rows are packed greedily in their original order: a row joins the current
    chunk while the chunk's running token count stays within the budget,
    otherwise the current chunk is closed and the row starts a new one.

    Args:
        tokenized_data: Sequence of tokenizer outputs. Each item must expose
            ``item["input_ids"][0]`` as the token-id sequence of one row
            (i.e. a batch of size one, as produced with ``return_tensors``).
        max_tokens: Token budget per chunk. Defaults to the module-level
            ``max_token_limit`` when omitted.

    Returns:
        A list of chunks, each a non-empty list of items from
        ``tokenized_data``. A single row longer than the budget is kept
        alone in its own chunk rather than dropped.
    """
    if max_tokens is None:
        max_tokens = max_token_limit

    tokenized_chunks = []
    current_chunk = []
    current_chunk_tokens = 0

    for tokenized_row in tqdm(tokenized_data, desc="Chunking Data"):
        # Count real tokens only. Summing over every key of the encoding would
        # also count the attention mask and so count each token twice.
        tokens_length = len(tokenized_row["input_ids"][0])

        # Close the current chunk only if it holds something; this avoids
        # emitting an empty chunk when an oversized row arrives first.
        if current_chunk and current_chunk_tokens + tokens_length > max_tokens:
            tokenized_chunks.append(current_chunk)
            current_chunk = []
            current_chunk_tokens = 0

        current_chunk.append(tokenized_row)
        current_chunk_tokens += tokens_length

    # Add last chunk
    if current_chunk:
        tokenized_chunks.append(current_chunk)
    return tokenized_chunks

tokenized_chunks = chunk_tokenized_data(tokenized_rows)

Chunking Data: 100%|██████████| 29244/29244 [00:16<00:00, 1795.14it/s]


In [20]:
import sqlite3
from contextlib import closing

# Create SQLite database to store decoded tokenized data

# Decode each chunk back to text, one line per row. Only "input_ids" is decoded:
# the other entries of the encoding (e.g. the attention mask) are not token ids,
# and decoding them would store junk text. Special tokens (such as the
# beginning-of-sequence marker) are skipped so only the card text is stored.
chunk_records = [
    (
        chunk_id,
        "".join(
            tokenizer.decode(tokenized_row["input_ids"][0], skip_special_tokens=True) + "\n"
            for tokenized_row in chunk
        ),
    )
    for chunk_id, chunk in enumerate(tokenized_chunks)
]

# `closing` guarantees the connection is closed even if a statement fails;
# the inner `with conn` commits on success and rolls back on error.
with closing(sqlite3.connect('raw/tokenized_data_llama2.db')) as conn:
    with conn:
        conn.execute('''CREATE TABLE IF NOT EXISTS tokenized_chunks (
                    chunk_id INTEGER PRIMARY KEY,
                    chunk_text TEXT
                )''')
        # The table persists between runs and chunk ids restart at 0, so clear
        # it first; otherwise re-running this cell hits a PRIMARY KEY conflict.
        conn.execute("DELETE FROM tokenized_chunks")
        conn.executemany(
            "INSERT INTO tokenized_chunks (chunk_id, chunk_text) VALUES (?, ?)",
            chunk_records,
        )

print("Data stored in the database successfully.")

Data stored in the database successfully.


In [6]:
import json
import pandas as pd
import random

# Create test database

cardfile = "raw/filtered_oracle_database.csv"
df_oracle = pd.read_csv(cardfile)

# Sample from distinct, non-missing names so the same card name cannot appear
# twice in the test set.
card_names = df_oracle["name"].dropna().drop_duplicates().tolist()

# Never request more cards than are available (random.sample would raise
# ValueError otherwise).
num_sample_cards = min(1000, len(card_names))

# Fixed seed so the same test set is drawn on every run.
sample_seed = 42
random_card_names = random.Random(sample_seed).sample(card_names, num_sample_cards)

data = {'Card Name': random_card_names,
        'GPT2-raw-ROUGE': [None] * num_sample_cards,  # Empty column for GPT-2-raw-ROUGE
        'GPT2-RAG-ROUGE': [None] * num_sample_cards}  # Empty column for GPT-2-RAG-ROUGE

# NOTE(review): this rebinds `df`, which the tokenization cell also reads as the
# card table; re-running that cell after this one would tokenize the test frame.
df = pd.DataFrame(data)

#df.to_csv('RAG_MTG_Test.csv', index=False)