In [1]:
import pandas as pd
from pathlib import Path
import random

# Set display options for better viewing
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', None)

# Path to processed data
DATA_DIR = Path("lichess_data/data_processed")

# Find all processed parquet files
parquet_files = list(DATA_DIR.rglob("*.parquet"))
print(f"Found {len(parquet_files)} processed files\n")

# Pick a random file to inspect
if parquet_files:
    sample_file = random.choice(parquet_files)
    print(f"Inspecting: {sample_file.relative_to(DATA_DIR.parent)}\n")
    
    # Load the file
    df = pd.read_parquet(sample_file)
    
    print(f"Dataset shape: {df.shape[0]:,} rows × {df.shape[1]} columns\n")
    
    # Show column names and types
    print("Columns:")
    print(df.dtypes)
    print("\n" + "="*80 + "\n")
    
    # Show first few rows
    print("First 3 rows:")
    print(df.head(3))
    print("\n" + "="*80 + "\n")
    
    # Compare PGN vs UCI for a few examples
    print("PGN vs UCI Comparison (5 random games):")
    print("-" * 80)
    
    sample_indices = random.sample(range(len(df)), min(5, len(df)))
    
    for idx in sample_indices:
        row = df.iloc[idx]
        print(f"\nGame {idx + 1}:")
        print(f"PGN: {row['movetext'][:150]}{'...' if len(row['movetext']) > 150 else ''}")
        print(f"UCI: {row['movetext_uci'][:150]}{'...' if len(row['movetext_uci']) > 150 else ''}")
        
    print("\n" + "="*80 + "\n")
    
    # Statistics
    print("Statistics:")
    print(f"  Average PGN length: {df['movetext'].str.len().mean():.1f} characters")
    print(f"  Average UCI length: {df['movetext_uci'].str.len().mean():.1f} characters")
    print(f"  Empty UCI conversions: {(df['movetext_uci'] == '').sum()} ({(df['movetext_uci'] == '').sum() / len(df) * 100:.2f}%)")
    
    # Count moves
    df['n_moves'] = df['movetext_uci'].str.split().str.len()
    print(f"  Average number of moves: {df['n_moves'].mean():.1f}")
    print(f"  Min moves: {df['n_moves'].min()}")
    print(f"  Max moves: {df['n_moves'].max()}")
    
    print("\n" + "="*80 + "\n")
    
    # Show a complete game example
    print("Complete Game Example:")
    print("-" * 80)
    example = df.iloc[0]
    print(f"Original PGN:\n{example['movetext']}\n")
    print(f"Converted UCI:\n{example['movetext_uci']}\n")
    
else:
    print("No processed files found!")

Found 2 processed files

Inspecting: data_processed/year=2023/month=01/train-00001-of-00433.parquet

Dataset shape: 238,288 rows × 19 columns

Columns:
Event                     object
Site                      object
White                     object
Black                     object
Result                    object
WhiteTitle                object
BlackTitle                object
WhiteElo                   int16
BlackElo                   int16
WhiteRatingDiff          float64
BlackRatingDiff          float64
UTCDate                   object
UTCTime                   object
ECO                       object
Opening                   object
Termination               object
TimeControl               object
movetext                  object
movetext_preprocessed     object
dtype: object


First 3 rows:
              Event                          Site          White  \
0  Rated Blitz game  https://lichess.org/XNXq3QEd       BiggKipp   
1  Rated Blitz game  https://lichess.org/pN1YTwXP  Samu

KeyError: 'movetext_uci'

In [None]:
file_path = "lichess_data/data_processed/year=2023/month=01/train-00001-of-00433.parquet"
df = pd.read_parquet(file_path)
print(df.columns)

Index(['Event', 'Site', 'White', 'Black', 'Result', 'WhiteTitle', 'BlackTitle',
       'WhiteElo', 'BlackElo', 'WhiteRatingDiff', 'BlackRatingDiff', 'UTCDate',
       'UTCTime', 'ECO', 'Opening', 'Termination', 'TimeControl', 'movetext',
       'movetext_preprocessed'],
      dtype='object')


In [None]:
df.iloc[0]["movetext_uci"]

'<BOG> d2d4 d7d6 c2c4 g8f6 e2e3 h7h6 g1f3 g7g6 f1e2 f8g7 b1c3 e8g8 c3d5 c7c6 d5f6 g7f6 b2b4 c6c5 b4c5 d6c5 c1b2 c5d4 b2d4 f6d4 f3d4 b8c6 d4f3 d8d1 a1d1 c8e6 e1g1 g8g7 a2a3 a7a6 d1a1 c6a5 a1c1 a5b3 c1c3 b3c5 f3d4 c5e4 d4e6 f7e6 c3c2 a8d8 f2f3 e4c5 e3e4 e6e5 h2h3 c5e6 g2g3 e6d4 c2a2 d4e2 a2e2 d8d3 e2a2 d3c3 g3g4 c3c4 g1g2 g6g5 g2g3 f8c8 h3h4 g5h4 g3h4 c4c2 a2c2 c8c2 f1h1 c2c3 h1h3 c3a3 g4g5 h6g5 h4g5 b7b5 g5f5 b5b4 f5e5 b4b3 h3g3 g7f7 f3f4 a6a5 f4f5 a5a4 e5d5 a3a1 d5c4 a1c1 c4b4 c1a1 e4e5 b3b2 e5e6 f7f6 <EOG>'

In [None]:
from transformers import GPT2LMHeadModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nsarrazin/chessformer")

#add <BOG> and <EOG> tokens
tokenizer.add_special_tokens({'additional_special_tokens': ['<BOG>', '<EOG>']})

moves = df.iloc[0]["movetext_uci"]
print(moves)

# Print the tokens
print("Tokens:", tokenizer.tokenize(moves))

<BOG> d2d4 d7d6 c2c4 g8f6 e2e3 h7h6 g1f3 g7g6 f1e2 f8g7 b1c3 e8g8 c3d5 c7c6 d5f6 g7f6 b2b4 c6c5 b4c5 d6c5 c1b2 c5d4 b2d4 f6d4 f3d4 b8c6 d4f3 d8d1 a1d1 c8e6 e1g1 g8g7 a2a3 a7a6 d1a1 c6a5 a1c1 a5b3 c1c3 b3c5 f3d4 c5e4 d4e6 f7e6 c3c2 a8d8 f2f3 e4c5 e3e4 e6e5 h2h3 c5e6 g2g3 e6d4 c2a2 d4e2 a2e2 d8d3 e2a2 d3c3 g3g4 c3c4 g1g2 g6g5 g2g3 f8c8 h3h4 g5h4 g3h4 c4c2 a2c2 c8c2 f1h1 c2c3 h1h3 c3a3 g4g5 h6g5 h4g5 b7b5 g5f5 b5b4 f5e5 b4b3 h3g3 g7f7 f3f4 a6a5 f4f5 a5a4 e5d5 a3a1 d5c4 a1c1 c4b4 c1a1 e4e5 b3b2 e5e6 f7f6 <EOG>
Tokens: ['<BOG>', 'd2d4', 'd7d6', 'c2c4', 'g8f6', 'e2e3', 'h7h6', 'g1f3', 'g7g6', 'f1e2', 'f8g7', 'b1c3', 'e8g8', 'c3d5', 'c7c6', 'd5f6', 'g7f6', 'b2b4', 'c6c5', 'b4c5', 'd6c5', 'c1b2', 'c5d4', 'b2d4', 'f6d4', 'f3d4', 'b8c6', 'd4f3', 'd8d1', 'a1d1', 'c8e6', 'e1g1', 'g8g7', 'a2a3', 'a7a6', 'd1a1', 'c6a5', 'a1c1', 'a5b3', 'c1c3', 'b3c5', 'f3d4', 'c5e4', 'd4e6', 'f7e6', 'c3c2', 'a8d8', 'f2f3', 'e4c5', 'e3e4', 'e6e5', 'h2h3', 'c5e6', 'g2g3', 'e6d4', 'c2a2', 'd4e2', 'a2e2', 'd8d3', 'e2a2'

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("../tokenizer/uci_tokenizer_with_special_tokens")
print(f"BOS token: {tokenizer.bos_token} (id: {tokenizer.bos_token_id})")
print(f"EOS token: {tokenizer.eos_token} (id: {tokenizer.eos_token_id})")

BOS token: None (id: None)
EOS token: None (id: None)
