In [11]:
import json
import os

import numpy as np
import pandas as pd
import torch
from pinecone import Pinecone
from transformers import AutoTokenizer, AutoModel

# Initialize Pinecone
# The API key is read from the PINECONE_API_KEY environment variable so the
# credential is never stored in the source file. Any key that was previously
# committed in this file should be treated as leaked and rotated.
api_key = os.environ.get('PINECONE_API_KEY')
if not api_key:
    # Fail early with a clear message instead of an opaque authentication error later.
    raise RuntimeError("PINECONE_API_KEY environment variable is not set")
pc = Pinecone(api_key=api_key)
# Handle to the index that upload_to_pinecone() writes into.
index = pc.Index('chess-games')

# Choose the compute device up front: CUDA when torch reports it, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Fetch the pretrained BERT tokenizer and encoder by checkpoint name
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Move the encoder weights onto the selected device
model.to(device)

# Load the raw game records from the local CSV export
local_file_name = 'chess_games.csv'
df = pd.read_csv(local_file_name)

# Keep only rated games, then discard the columns that are not used downstream
# ('rated' is constant after the filter, so it is dropped as well)
rated_mask = df['rated'] == True
df = df.loc[rated_mask].drop(columns=['url', 'pgn', 'rated'])

# Usernames of the eight players whose games get uploaded
players_of_interest = [
    'Hikaru',
    'MagnusCarlsen',
    'lachesisQ',
    'ChessWarrior7197',
    'GukeshDommaraju',
    'GMWSO',
    'LOVEVAE',
    'FabianoCaruana',
]

# Function to create embeddings from game data using BERT model
def create_embeddings(rows):
    """Embed each game row as one vector using the module-level BERT model.

    Every row is rendered as a short English sentence (players, ratings,
    result), tokenized as one padded batch, and run through ``model``. The
    sentence vector is the mean of the token vectors in ``last_hidden_state``.

    The mean is weighted by the tokenizer's attention mask so that padding
    positions do not contribute. Without the mask, a row's embedding would
    depend on how much padding the longest sentence in the same batch forced
    onto it.

    Args:
        rows: DataFrame providing the columns ``white_username``,
            ``white_rating``, ``black_username``, ``black_rating`` and
            ``result``.

    Returns:
        A list with one embedding (a list of floats) per row, in row order.
        An empty list when ``rows`` has no rows.
    """
    # Nothing to embed; avoid calling the tokenizer with an empty batch.
    if rows.empty:
        return []

    texts = [f"Game between {row['white_username']} (rating: {row['white_rating']}) and {row['black_username']} (rating: {row['black_rating']}) with result {row['result']}" for _, row in rows.iterrows()]
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    token_vectors = outputs.last_hidden_state
    # 1 for real tokens, 0 for padding; add a trailing axis so it broadcasts
    # over the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_vectors.dtype)
    summed = (token_vectors * mask).sum(dim=1)
    # clamp guards against division by zero for a fully masked sequence.
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (summed / token_counts).cpu().numpy().tolist()
    return embeddings

# Function to find games between specific players
def find_games_between_players(dataframe, players):
    """Collect every game in which two of the given players faced each other.

    Each unordered pair of entries in ``players`` is visited once, in list
    order, and the rows where the two usernames occupy the white and black
    seats (either way round) are gathered.

    Args:
        dataframe: DataFrame with ``white_username`` and ``black_username``
            columns.
        players: Sequence of usernames; must support slicing.

    Returns:
        The matching rows concatenated pair by pair, or an empty DataFrame
        (with no columns) when no pair has any game.
    """
    matchups = []

    # enumerate from 1 so that players[offset:] is everyone after `first`
    for offset, first in enumerate(players, start=1):
        for second in players[offset:]:
            white = dataframe['white_username']
            black = dataframe['black_username']
            first_has_white = (white == first) & (black == second)
            second_has_white = (white == second) & (black == first)
            head_to_head = dataframe[first_has_white | second_has_white]
            if head_to_head.empty:
                continue
            matchups.append(head_to_head)

    # No pair ever met: hand back an empty frame
    if not matchups:
        return pd.DataFrame()

    # Stitch the per-pair results into a single frame
    return pd.concat(matchups)

def _game_metadata(row):
    """Return the metadata dict stored next to one game's vector."""
    fields = (
        'time_control',
        'end_time',
        'time_class',
        'rules',
        'white_username',
        'white_rating',
        'black_username',
        'black_rating',
        'result',
    )
    return {field: row[field] for field in fields}


# Function to upload data to Pinecone with player-based namespaces, handling games between focused players
def upload_to_pinecone(dataframe, players, batch_size=16, max_batch_size_bytes=4*1024*1024):
    """Embed and upsert every focused player's games into a namespace named after them.

    For each player, all rows where the player is white or black are embedded
    in chunks of ``batch_size`` via ``create_embeddings`` and upserted into the
    namespace equal to the player's username. Vector IDs are the row's
    position within that player's game list, as a string.

    Games between two focused players need no extra pass: such a game matches
    the white/black filter of both players, so it lands in both namespaces.
    An earlier separate "duplicate" pass was removed because it re-upserted
    those games under positional IDs taken from a different frame, which
    overwrote unrelated vectors in the opponent's namespace. It also sent
    mixed batches to whichever opponent namespace was seen last.

    Args:
        dataframe: DataFrame of games with the columns read by
            ``_game_metadata`` and ``create_embeddings``.
        players: Iterable of usernames to upload; each becomes a namespace.
        batch_size: Number of games embedded per model call; must be >= 1.
        max_batch_size_bytes: Upper bound on the JSON-serialised size of one
            upsert request. A request is flushed before adding a vector would
            cross this bound. A single vector larger than the bound is still
            sent on its own.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for player in players:
        # Filter games for the current player
        involves_player = (dataframe['white_username'] == player) | (dataframe['black_username'] == player)
        player_games = dataframe[involves_player]
        namespace = player  # Use player name as namespace

        # Upload player games in batches
        for start_idx in range(0, len(player_games), batch_size):
            batch = player_games.iloc[start_idx:start_idx + batch_size]
            embeddings = create_embeddings(batch)

            vectors = []
            pending_bytes = 0  # running serialised size of `vectors`

            for idx, (embedding, (_, row)) in enumerate(zip(embeddings, batch.iterrows())):
                vector = (str(start_idx + idx), embedding, _game_metadata(row))
                vector_bytes = len(json.dumps(vector))

                # Flush first if this vector would push the request over the
                # limit, so no request exceeds it (unless one vector alone does).
                if vectors and pending_bytes + vector_bytes > max_batch_size_bytes:
                    index.upsert(vectors, namespace=namespace)
                    vectors = []
                    pending_bytes = 0

                vectors.append(vector)
                pending_bytes += vector_bytes

            # Upload remaining vectors in the batch
            if vectors:
                index.upsert(vectors, namespace=namespace)

            print(f"Uploaded {start_idx + len(batch)} of {len(player_games)} games for player {player}")

# Run the upload for the filtered games and the focused players
upload_to_pinecone(dataframe=df, players=players_of_interest)

# Final status line once every player has been processed
print("Data upload to Pinecone complete.")

Uploaded 16 of 33 games for player lachesisQ
Uploaded 32 of 33 games for player lachesisQ
Uploaded 33 of 33 games for player lachesisQ
Data upload to Pinecone complete.
