In [1]:
import requests
import json

username = 'iAMbronze'

headers = {"User-Agent": "InfiniteChessAI"} 
archives = requests.get(
    f'https://api.chess.com/pub/player/{username}/games/archives',
    headers=headers
).json()['archives']

print(archives)   

['https://api.chess.com/pub/player/iambronze/games/2020/09', 'https://api.chess.com/pub/player/iambronze/games/2020/10', 'https://api.chess.com/pub/player/iambronze/games/2020/11', 'https://api.chess.com/pub/player/iambronze/games/2021/01', 'https://api.chess.com/pub/player/iambronze/games/2021/03', 'https://api.chess.com/pub/player/iambronze/games/2021/04', 'https://api.chess.com/pub/player/iambronze/games/2021/05', 'https://api.chess.com/pub/player/iambronze/games/2021/06', 'https://api.chess.com/pub/player/iambronze/games/2021/07', 'https://api.chess.com/pub/player/iambronze/games/2021/08', 'https://api.chess.com/pub/player/iambronze/games/2021/09', 'https://api.chess.com/pub/player/iambronze/games/2022/01', 'https://api.chess.com/pub/player/iambronze/games/2022/02', 'https://api.chess.com/pub/player/iambronze/games/2022/04', 'https://api.chess.com/pub/player/iambronze/games/2022/05', 'https://api.chess.com/pub/player/iambronze/games/2023/02', 'https://api.chess.com/pub/player/iambr

In [2]:
import pandas as pd
import re
import requests

data = []
for url in archives:
    url_data = requests.get(url, headers=headers)
    data.append(url_data.json())

def chess_data_to_dataframe(data):
    games_list = []
    
    for archive in data:
        if 'games' in archive:
            for game in archive['games']:
                record = {
                    'TimeClass': game.get('time_class', ''),
                    'TimeControl': game.get('time_control', ''),
                }
                
                # Pull PGN
                pgn = game.get('pgn', '')
                record['pgn'] = pgn
                
                if pgn:
                    # Extract standard fields
                    headers_to_extract = [
                        'White', 'Black', 'CurrentPosition', 'ECO', 'Termination', 'Result', 'Date'
                    ]
                    for header in headers_to_extract:
                        match = re.search(rf'\[{header} "([^"]+)"\]', pgn)
                        record[header] = match.group(1) if match else ''
                        
                    # Split PGN into header and moves sections
                    pgn_parts = pgn.split('\n\n', 1)  # Split on first double newline
                    if len(pgn_parts) == 2:
                        header_section, moves_section = pgn_parts
    
                        # Extract just the move notation (without clock times)
                        moves_clean = re.sub(r'\{[^}]*\}', '', moves_section)  # Remove {clock} parts
                        record['MovesRaw'] = moves_clean.strip()
                    
                    # Count number of moves
                    moves = re.findall(r'\d+\.', pgn)
                    record['NumMoves'] = len(moves)
                
                games_list.append(record)
    
    return pd.DataFrame(games_list)

df = chess_data_to_dataframe(data)
df.head()

Unnamed: 0,TimeClass,TimeControl,pgn,White,Black,CurrentPosition,ECO,Termination,Result,Date,MovesRaw,NumMoves
0,rapid,600,"[Event ""Live Chess""]\n[Site ""Chess.com""]\n[Dat...",iAMbronze,VedanshAgarwal03,r1bqk2r/pppp1ppp/3b1n2/4p3/3N3P/4R3/PPP1PPP1/R...,A00,iAMbronze won by resignation,1-0,2020.09.19,1. h4 1... e5 2. Rh3 2... Nc6 3. Re3 3......,26
1,blitz,300,"[Event ""Live Chess""]\n[Site ""Chess.com""]\n[Dat...",ukajashar,iAMbronze,Q1bk1b1r/4nppp/3pq3/2p3R1/4PB2/2P3P1/1P2NP1P/1...,B00,ukajashar won by resignation,1-0,2020.09.19,1. e4 1... a5 2. d4 2... Ra6 3. Bxa6 3......,67
2,blitz,300,"[Event ""Live Chess""]\n[Site ""Chess.com""]\n[Dat...",Francodesa44,iAMbronze,rn6/pp2kp1N/2p4B/3p4/8/PP1P2Nr/3KR3/8 b - -,C20,Francodesa44 won on time,1-0,2020.09.19,1. e4 1... e5 2. Qf3 2... Qg5 3. h4 3... ...,131
3,blitz,300,"[Event ""Live Chess""]\n[Site ""Chess.com""]\n[Dat...",iAMbronze,MZK1998,8/p1k2ppp/2p5/8/PP6/7P/5q2/R2KrB2 w - -,A00,MZK1998 won by checkmate,0-1,2020.09.19,1. h4 1... e5 2. Rh3 2... d6 3. Nf3 3... ...,133
4,rapid,600,"[Event ""Live Chess""]\n[Site ""Chess.com""]\n[Dat...",Aliiiifr,iAMbronze,rnbqkbnr/p1pppppp/8/1B6/4P3/8/PPPP1PPP/RNBQK1N...,B00,Aliiiifr won - game abandoned,1-0,2020.09.19,1. e4 1... b5 2. Bxb5 1-0,11


In [3]:
mask = df['Termination'].str.contains('abandoned')
df = df[~mask]

In [4]:
df["OwnMoves"] = None
df["OponentMoves"] = None

for idx, row in df.iterrows():
    if row['White'] == username:
        my_moves = re.findall(r'\d+\.\s+([^\s]+)', row['MovesRaw'])
        their_moves = re.findall(r'\d+\.\.\.\s+([^\s]+)', row['MovesRaw'])
        df.at[idx, 'OwnMoves'] = my_moves
        df.at[idx, 'OponentMoves'] = their_moves
    else:
        my_moves = re.findall(r'\d+\.\.\.\s+([^\s]+)', row['MovesRaw'])
        their_moves = re.findall(r'\d+\.\s+([^\s]+)', row['MovesRaw'])
        df.at[idx, 'OwnMoves'] = my_moves  
        df.at[idx, 'OponentMoves'] = their_moves

In [5]:
df = df.drop(['pgn'], axis=1)

In [6]:
# Simple dataset for initial AI training
training_data = []

for idx, row in df.iterrows():
    if row['White'] == username:  
        my_moves = row['OwnMoves']
        opponent_moves = row['OponentMoves']
        
        # Create training examples
        for i, move in enumerate(my_moves):
            training_example = {
                'game_id': idx,
                'move_number': i + 1,
                'my_previous_moves': my_moves[:i],
                'opponent_previous_moves': opponent_moves[:i],
                'my_move': move,
                'time_control': row['TimeControl'],
                'opening': row['ECO'],
                'color': 'white'
            }
            training_data.append(training_example)
    
    elif row['Black'] == username:
        my_moves = row['OwnMoves']
        opponent_moves = row['OponentMoves']
        
        for i, move in enumerate(my_moves):
            training_example = {
                'game_id': idx,
                'move_number': i + 1,
                'my_previous_moves': my_moves[:i],
                'opponent_previous_moves': opponent_moves[:i],
                'my_move': move,
                'time_control': row['TimeControl'],
                'opening': row['ECO'],
                'color': 'black'
            }
            training_data.append(training_example)

training_df = pd.DataFrame(training_data)
print(f"Created {len(training_df)} training examples from your games")
training_df.head()

Created 10753 training examples from your games


Unnamed: 0,game_id,move_number,my_previous_moves,opponent_previous_moves,my_move,time_control,opening,color
0,0,1,[],[],h4,600,A00,white
1,0,2,[h4],[e5],Rh3,600,A00,white
2,0,3,"[h4, Rh3]","[e5, Nc6]",Re3,600,A00,white
3,0,4,"[h4, Rh3, Re3]","[e5, Nc6, Nf6]",Nf3,600,A00,white
4,0,5,"[h4, Rh3, Re3, Nf3]","[e5, Nc6, Nf6, Bd6]",d4,600,A00,white


In [7]:
import json
from pathlib import Path

def moves_to_history(my_prev, opp_prev, color):
    # Build alternating move history in SAN-like compact form: "1. e4 e5 2. Nf3 Nc6 ..."
    history = []
    for i in range(max(len(my_prev), len(opp_prev))):
        w = my_prev[i] if i < len(my_prev) else ''
        b = opp_prev[i] if i < len(opp_prev) else ''
        if color == 'white':
            # since we're creating prompt for the next move to be made by 'white',
            # previous moves are already in their lists
            history.append((w, b))
        else:
            # if color is black, my_prev contains black moves; we still interleave for readability
            history.append((w, b))
    # Convert to numbered moves
    s = []
    for i,(w,b) in enumerate(history, start=1):
        if w or b:
            if w:
                s.append(f"{i}. {w}")
            if b:
                s[-1] = s[-1] + f" {b}" if s else f"{i}. {b}"
    return " ".join(s)

out_path = Path("sft_data.jsonl")
with out_path.open("w", encoding="utf-8") as f:
    for _, row in training_df.iterrows():
        prompt = f"Moves so far: {moves_to_history(row['my_previous_moves'], row['opponent_previous_moves'], row['color'])}\nPlay as {row['color']}. Give only the next move in SAN (or UCI):"
        completion = row['my_move'] + "\n"
        obj = {"prompt": prompt, "completion": completion}
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

print("Wrote", out_path, "with", sum(1 for _ in out_path.open())) 

Wrote sft_data.jsonl with 10753


In [1]:
!pip install chess python-chess tqdm  # Add this

Collecting chess
  Using cached chess-1.11.2.tar.gz (6.1 MB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting python-chess
  Downloading python_chess-1.999-py3-none-any.whl.metadata (776 bytes)
Downloading python_chess-1.999-py3-none-any.whl (1.4 kB)
Building wheels for collected packages: chess
  Building wheel for chess (setup.py) ... [?25ldone
[?25h  Created wheel for chess: filename=chess-1.11.2-py3-none-any.whl size=147776 sha256=a1d3f5be1e6e4ca5591d090ee1a1b200417aa5d22936957dca17c07a8a23b0bf
  Stored in directory: /Users/imadeddine/Library/Caches/pip/wheels/fb/5d/5c/59a62d8a695285e59ec9c1f66add6f8a9ac4152499a2be0113
Successfully built chess
Installing collected packages: chess, python-chess
Successfully installed chess-1.11.2 python-chess-1.999
