In [1]:
import pandas as pd

### original data  

In [2]:
file_path = "lichess_data/data/year=2023/month=01/train-00001-of-00433.parquet"
df = pd.read_parquet(file_path)
print(df.columns)

Index(['Event', 'Site', 'White', 'Black', 'Result', 'WhiteTitle', 'BlackTitle',
       'WhiteElo', 'BlackElo', 'WhiteRatingDiff', 'BlackRatingDiff', 'UTCDate',
       'UTCTime', 'ECO', 'Opening', 'Termination', 'TimeControl', 'movetext'],
      dtype='object')


In [3]:
# print first 20 result rows
print(df['Result'].head(20))


0         0-1
1         1-0
2     1/2-1/2
3         1-0
4         1-0
5         0-1
6         0-1
7         0-1
8         0-1
9         0-1
10        0-1
11        1-0
12        1-0
13        1-0
14        1-0
15        1-0
16        0-1
17        1-0
18        0-1
19        1-0
Name: Result, dtype: object


In [4]:
# df.head()

### preprocessed data

In [5]:
import pandas as pd
from pathlib import Path
import random

# Set display options for better viewing
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.width', None)

# Path to processed data
DATA_DIR = Path("lichess_data/data_processed")

# Find all processed parquet files
parquet_files = list(DATA_DIR.rglob("*.parquet"))
print(f"Found {len(parquet_files)} processed files\n")

# Pick a random file to inspect
if parquet_files:
    sample_file = random.choice(parquet_files)
    print(f"Inspecting: {sample_file.relative_to(DATA_DIR.parent)}\n")
    
    # Load the file
    df = pd.read_parquet(sample_file)
    
    print(f"Dataset shape: {df.shape[0]:,} rows × {df.shape[1]} columns\n")
    
    # Show column names and types
    print("Columns:")
    print(df.dtypes)
    print("\n" + "="*80 + "\n")
    
    # Show first few rows
    print("First 3 rows:")
    print(df.head(3))
    print("\n" + "="*80 + "\n")
    
    # Compare PGN vs UCI for a few examples
    print("PGN vs UCI Comparison (5 random games):")
    print("-" * 80)
    
    sample_indices = random.sample(range(len(df)), min(5, len(df)))
    
    for idx in sample_indices:
        row = df.iloc[idx]
        print(f"\nGame {idx + 1}:")
        print(f"PGN: {row['movetext_uci'][:150]}{'...' if len(row['movetext_uci']) > 150 else ''}")
        print(f"UCI: {row['movetext_uci'][:150]}{'...' if len(row['movetext_uci']) > 150 else ''}")
        
    print("\n" + "="*80 + "\n")
    
    # Statistics
    print("Statistics:")
    print(f"  Average PGN length: {df['movetext_uci'].str.len().mean():.1f} characters")
    print(f"  Average UCI length: {df['movetext_uci'].str.len().mean():.1f} characters")
    print(f"  Empty UCI conversions: {(df['movetext_uci'] == '').sum()} ({(df['movetext_uci'] == '').sum() / len(df) * 100:.2f}%)")
    
    # Count moves
    df['n_moves'] = df['movetext_uci'].str.split().str.len()
    print(f"  Average number of moves: {df['n_moves'].mean():.1f}")
    print(f"  Min moves: {df['n_moves'].min()}")
    print(f"  Max moves: {df['n_moves'].max()}")
    
    print("\n" + "="*80 + "\n")
    
    # Show a complete game example
    print("Complete Game Example:")
    print("-" * 80)
    example = df.iloc[0]
    print(f"Converted UCI:\n{example['movetext_uci']}\n")
    
else:
    print("No processed files found!")

Found 3 processed files

Inspecting: data_processed/year=2023/month=01/train-00000-of-00433.parquet

Dataset shape: 238,288 rows × 1 columns

Columns:
movetext_uci    object
dtype: object


First 3 rows:
                                                                                          movetext_uci
0  <BOG> <WHITE:2200> <BLACK:2200> <WHITE_WIN> d2d3 d7d5 e2e3 c7c5 f2f3 b8c6 g2g3 g7g6 f1g2 f8g7 g1...
1                 <BOG> <WHITE:1700> <BLACK:1700> <WHITE_WIN> e2e4 c7c5 g1f3 b8c6 d2d4 e7e6 d4d5 <EOG>
2  <BOG> <WHITE:1500> <BLACK:1400> <BLACK_WIN> e2e4 e7e5 d2d3 b8c6 b1c3 f8c5 c1g5 f7f6 g5e3 c5b6 e3...


PGN vs UCI Comparison (5 random games):
--------------------------------------------------------------------------------

Game 147214:
PGN: <BOG> <WHITE:1800> <BLACK:1800> <BLACK_WIN> e2e4 c7c5 c2c3 b8c6 d2d4 c5d4 c3d4 e7e6 g1f3 d7d5 b1c3 g8f6 e4e5 f6e4 f1d3 e4c3 b2c3 f8e7 e1g1 e8g8 c3c4 d...
UCI: <BOG> <WHITE:1800> <BLACK:1800> <BLACK_WIN> e2e4 c7c5 c2c3 b8c6 d2d4 c5d4 c3d4 e7e6

In [6]:
file_path = "lichess_data/data_processed/year=2023/month=01/train-00432-of-00433.parquet"
df = pd.read_parquet(file_path)
print(df.columns)

Index(['movetext_uci'], dtype='object')


In [18]:
df.iloc[100200]["movetext_uci"]

'<BOG> <WHITE:1900> <BLACK:1900> <WHITE_WIN> e2e4 c7c5 b1c3 d7d6 g1f3 g8f6 d2d4 c5d4 d1d4 b8c6 d4e3 e7e6 f1b5 c8d7 b5c6 d7c6 e1g1 b7b5 c1d2 b5b4 c3e2 f6e4 d2b4 a8b8 a2a3 a7a5 b4d2 e4d2 f3d2 f8e7 b2b4 a5b4 a3b4 b8b4 f1b1 b4b1 a1b1 e8g8 e2d4 c6d5 c2c4 d5a8 d2b3 d8c7 h2h3 e7f6 b1c1 c7b7 f2f3 f8c8 c4c5 d6c5 b3c5 b7b2 c5e6 f7e6 e3e6 g8f8 c1c8 <EOG>'

In [8]:
from transformers import GPT2LMHeadModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nsarrazin/chessformer")

#add <BOG> and <EOG> tokens
tokenizer.add_special_tokens({'additional_special_tokens': ['<BOG>', '<EOG>']})

moves = df.iloc[0]["movetext_uci"]
print(moves)

# Print the tokens
print("Tokens:", tokenizer.tokenize(moves))

  from .autonotebook import tqdm as notebook_tqdm


<BOG> <WHITE:1400> <BLACK:1400> <BLACK_WIN> d2d4 d7d6 c1f4 b8d7 e2e3 e7e5 d4e5 d6e5 f4g3 g8f6 g1f3 f8d6 f1d3 e8g8 b1d2 f8e8 d2e4 f6e4 d3e4 c7c6 g3h4 d7f6 d1e2 d6e7 a1d1 f6e4 d1d8 e8d8 h4e7 d8d7 e7b4 e4d6 b4d6 d7d6 f3e5 c8e6 e5d3 a8d8 e1g1 e6c4 e2f3 c4d5 e3e4 d5c4 e4e5 c4d3 e5d6 d3f1 g1f1 d8d6 g2g3 d6d2 f3e4 d2d1 f1g2 h7h6 e4e8 g8h7 e8c8 d1a1 <EOG>
Tokens: ['<BOG>', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '<BLACK_WIN>', 'd2d4', 'd7d6', 'c1f4', 'b8d7', 'e2e3', 'e7e5', 'd4e5', 'd6e5', 'f4g3', 'g8f6', 'g1f3', 'f8d6', 'f1d3', 'e8g8', 'b1d2', 'f8e8', 'd2e4', 'f6e4', 'd3e4', 'c7c6', 'g3h4', 'd7f6', 'd1e2', 'd6e7', 'a1d1', 'f6e4', 'd1d8', 'e8d8', 'h4e7', 'd8d7', 'e7b4', 'e4d6', 'b4d6', 'd7d6', 'f3e5', 'c8e6', 'e5d3', 'a8d8', 'e1g1', 'e6c4', 'e2f3', 'c4d5', 'e3e4', 'd5c4', 'e4e5', 'c4d3', 'e5d6', 'd3f1', 'g1f1', 'd8d6', 'g2g3', 'd6d2', 'f3e4', 'd2d1', 'f1g2', 'h7h6', 'e4e8', 'g8h7', 'e8c8', 'd1a1', '<EOG>']


In [9]:
# tokenizer = AutoTokenizer.from_pretrained("../tokenizer/uci_tokenizer_with_special_tokens")
print(f"BOS token: {tokenizer.bos_token} (id: {tokenizer.bos_token_id})")
print(f"EOS token: {tokenizer.eos_token} (id: {tokenizer.eos_token_id})")

BOS token: None (id: None)
EOS token: None (id: None)
