In [1]:
import json
import os

import torch
from pinecone import Pinecone
from transformers import AutoTokenizer, AutoModel

# Initialize Pinecone
# The API key is read from the environment so that it never lands in the
# notebook source, its saved outputs, or version control.
# NOTE(review): the key that was previously hardcoded in this cell should be
# treated as leaked and rotated.
api_key = os.environ.get('PINECONE_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this notebook."
    )
pc = Pinecone(api_key=api_key)
index = pc.Index('chess-games')

# Load BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Ensure the model uses the GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def embed_query(query):
    """Turn a text query into a single embedding vector.

    The query is tokenized, run through the module-level ``model`` with
    gradients disabled, and the ``last_hidden_state`` is averaged over
    dim 1 (assumed to be the token axis, as documented for Hugging Face
    encoder outputs -- TODO confirm if the model is swapped).

    Args:
        query: Text passed straight to the module-level ``tokenizer``.

    Returns:
        list: The pooled vector for the first (index 0) row of the batch,
        as plain Python floats.
    """
    encoded = tokenizer(
        query,
        return_tensors='pt',
        padding=True,
        truncation=True,
    ).to(device)

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    pooled = hidden_states.mean(dim=1)
    # Only the first row is returned, so callers get one flat vector.
    return pooled[0].cpu().numpy().tolist()

def search_similar_games(player_name, query, time_class, top_k=5):
    """Return metadata for the stored games most similar to a text query.

    The query text is only used to rank vectors by similarity; the single
    hard constraint applied here is the ``time_class`` metadata filter.

    Args:
        player_name: Used as the Pinecone namespace to search in.
        query: Free-text query; embedded with ``embed_query``.
        time_class: Value the ``time_class`` metadata field must equal.
            A falsy value (``None``, ``''``) sends an empty filter dict.
        top_k: Number of matches to request from the index. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        list: ``match['metadata']`` for every entry of
        ``response['matches']``, in the order the index returned them.
    """
    # Embed the query
    query_embedding = embed_query(query)

    # Define the metadata filter based on the time_class
    filter_criteria = {'time_class': time_class} if time_class else {}

    # Perform a similarity search in Pinecone
    response = index.query(
        vector=query_embedding,
        top_k=top_k,
        namespace=player_name,
        include_metadata=True,
        filter=filter_criteria,  # Apply the time_class filter
    )

    # Extract the relevant metadata
    return [match['metadata'] for match in response['matches']]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  torch.utils._pytree._register_pytree_node(


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [2]:
# Example usage: retrieve Hikaru's blitz games closest to a free-text query.
# NOTE(review): the query text only ranks results by embedding similarity;
# the captured output for this cell includes games Hikaru won as White, so
# "lost as black" is not acting as a filter.
player_name = 'Hikaru'
time_class = 'blitz'
query = 'Show me the games where Hikaru lost as black.'

relevant_games = search_similar_games(player_name, query, time_class)

# Print each game's metadata under a 1-based heading
for game_number, game in enumerate(relevant_games, start=1):
    print(f"Game {game_number}:")
    print(json.dumps(game, indent=2))

Game 1:
{
  "black_rating": 2619.0,
  "black_username": "nindjaxx8",
  "end_time": "2024-01-24 19:12:51",
  "result": "1-0",
  "rules": "chess",
  "time_class": "blitz",
  "time_control": "180",
  "white_rating": 3261.0,
  "white_username": "Hikaru"
}
Game 2:
{
  "black_rating": 3219.0,
  "black_username": "Hikaru",
  "end_time": "2024-05-01 21:01:02",
  "result": "0-1",
  "rules": "chess",
  "time_class": "blitz",
  "time_control": "180",
  "white_rating": 3025.0,
  "white_username": "ChristopherYoo"
}
Game 3:
{
  "black_rating": 2987.0,
  "black_username": "GMBenjaminBok",
  "end_time": "2024-02-28 19:24:10",
  "result": "1-0",
  "rules": "chess",
  "time_class": "blitz",
  "time_control": "180",
  "white_rating": 3287.0,
  "white_username": "Hikaru"
}
Game 4:
{
  "black_rating": 2749.0,
  "black_username": "eagleclaw07",
  "end_time": "2024-01-30 18:16:54",
  "result": "1-0",
  "rules": "chess",
  "time_class": "blitz",
  "time_control": "180+1",
  "white_rating": 3275.0,
  "white_u