In [33]:
import re
from collections import defaultdict, OrderedDict

import numpy as np
import pandas as pd
import chess

In [34]:
# Load the Lichess puzzle dump. The file is read without a header row, so the
# columns come back numbered. `nrows` stops parsing after the 100,000 rows the
# notebook works with instead of loading the whole dump and truncating afterwards.
df = pd.read_csv("../data/lichess_db_puzzle.csv", header=None, nrows=100_000)

In [35]:
# Keep only the first 100,000 puzzles
df = df.head(100000)

In [36]:
# Preview the first five raw rows
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0000D,5rk1/1p3ppp/pq3b2/8/8/1P1Q1N2/P4PPP/3R2K1 w - ...,d3d6 f8d8 d6d8 f6d8,1488,75,97,3160,advantage endgame short,https://lichess.org/F8M8OS71#53
1,0009B,r2qr1k1/b1p2ppp/pp4n1/P1P1p3/4P1n1/B2P2Pb/3NBP...,b6c5 e2g4 h3g4 d1g4,1145,75,98,476,advantage middlegame short,https://lichess.org/4MWQCxQ6/black#32
2,000aY,r4rk1/pp3ppp/2n1b3/q1pp2B1/8/P1Q2NP1/1PP1PP1P/...,g5e7 a5c3 b2c3 c6e7,1407,75,91,243,advantage master middlegame short,https://lichess.org/iihZGl6t#29
3,000h7,3q1rk1/1pp3pp/5p1P/4pPP1/rb1pP3/3P1N2/b1P1B3/2...,d8a8 g5g6 h7g6 h6g7,2306,88,83,209,advancedPawn crushing kingsideAttack middlegam...,https://lichess.org/FLmpZbTm/black#52
4,000tp,4r3/5pk1/1p3np1/3p3p/2qQ4/P4N1P/1P3RP1/7K w - ...,d4b6 f6e4 h1g1 e4f2,2038,76,80,86,crushing endgame short trappedPiece,https://lichess.org/GeXqsW90#67


In [37]:
# Give the nine positional columns (0..8) descriptive names
df = df.rename(
    columns=dict(
        enumerate(
            (
                "puzzle_id",
                "fen",
                "moves",
                "rating",
                "rating deviation",
                "popularity",
                "nbplays",
                "themes",
                "gameurl",
            )
        )
    )
)

In [38]:
def moves_to_san(x):
    """Convert a puzzle's UCI solution into standard algebraic notation (SAN).

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide "fen" (the position the solution starts from) and "moves"
        (the solution as whitespace-separated UCI moves, e.g. "d3d6 f8d8").

    Returns
    -------
    tuple
        ``(san_list, final_fen, fen_after_first_move)``: the SAN string of every
        move in order, the FEN after all moves have been played, and the FEN
        after only the first move has been played.

    Raises
    ------
    ValueError
        If "moves" contains no move at all. Errors raised by python-chess for a
        malformed FEN or move propagate unchanged.
    """
    # temporary board the solution is replayed on
    board = chess.Board(x["fen"])

    # split() (no argument) tolerates leading/trailing/repeated whitespace
    uci_moves = x["moves"].split()
    if not uci_moves:
        raise ValueError("puzzle solution contains no moves")

    san_list = []
    # The first solution move is played by the cpu, so the position right after
    # it is recorded separately.
    fen_after_first_move = None

    for uci in uci_moves:
        next_move = chess.Move.from_uci(uci)
        # the SAN is computed against the position *before* the move is pushed
        san_list.append(board.san(next_move))
        board.push(next_move)

        if fen_after_first_move is None:
            fen_after_first_move = board.fen()

    return san_list, board.fen(), fen_after_first_move

In [39]:
# Convert every solution to algebraic notation and collect the final and
# after-first-move FENs.
# zip(*...) yields one tuple per output column. Each tuple is turned into a list
# before assignment: assigning a *tuple* of unequal-length move lists appears to
# send pandas down an ndarray-conversion path, and the warning fragment saved in
# this cell's output looks like NumPy's ragged-array deprecation warning.
# NOTE(review): confirm against the pandas/numpy versions in use.
for column, values in zip(
    ("moves_alg", "final_fen", "fen_after_first_move"),
    zip(*df.apply(moves_to_san, axis=1)),
):
    df[column] = list(values)

  return array(a, dtype, copy=False, order=order)


In [40]:
# Break the FEN taken after the first move into its six space-separated fields
df[
    [
        "placement",
        "active",
        "castling",
        "enpassant",
        "halfmove_clock",
        "fullmove_clock",
    ]
] = df["fen_after_first_move"].str.split(pat=" ", expand=True)

In [41]:
def pieces_and_positions(x):
    """Count the pieces of a position and list the squares of every piece type.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide "placement", the piece-placement field of a FEN.

    Returns
    -------
    tuple
        ``(number_pieces, positions)``. ``number_pieces`` is the number of piece
        letters in the placement string. ``positions`` maps each piece symbol
        present on the board to the list of square names it occupies. Keys are
        ordered white K, Q, R, B, N, P followed by black k, q, r, b, n, p; each
        list is ordered file by file (a1..a8, b1..b8, ...).
    """
    placement = x["placement"]

    # temporary board; only the placement field is supplied, the other FEN
    # fields are left out
    board = chess.Board(placement)

    # count all piece letters in the FEN placement
    number_pieces = len(re.findall(r"[pPrRbBnNqQkK]", placement))

    # piece symbol -> list of occupied square names
    positions_dict = defaultdict(list)
    # walk the board file by file, and bottom to top within a file, so the
    # squares of each piece come out as a1..a8, b1..b8, ...
    for file_index in range(8):
        for rank_index in range(8):
            square = chess.square(file_index, rank_index)
            piece = board.piece_at(square)
            if piece is not None:
                positions_dict[piece.symbol()].append(chess.square_name(square))

    # emit the keys from white king down to black pawn
    symbol_order = "KQRBNPkqrbnp"
    positions = {
        symbol: positions_dict[symbol]
        for symbol in symbol_order
        if symbol in positions_dict
    }

    return number_pieces, positions

In [42]:
# For every puzzle: how many pieces are on the board and where each one stands
piece_stats = df.apply(pieces_and_positions, axis=1)
df["number_of_pieces"], df["positions"] = zip(*piece_stats)

In [43]:
# Remove the raw and intermediate columns that are not exported
df = df.drop(
    [
        "fen",
        "final_fen",
        "rating deviation",
        "popularity",
        "nbplays",
        "halfmove_clock",
        "fullmove_clock",
        "moves",
        "gameurl",
        "fen_after_first_move",
        "placement",
        "themes",
    ],
    axis="columns",
)

In [44]:
# Number the rows 1..len(df)
# NOTE(review): the IDs are assigned before the piece-count filter further down,
# so the exported IDs will have gaps - confirm that this is intended.
df["id"] = np.arange(1, len(df) + 1)

In [52]:
# Keep only positions with at most nine pieces on the board
df = df.loc[df["number_of_pieces"] <= 9]

In [None]:
# Export the processed puzzles as CSV (without the DataFrame index)
df.to_csv(path_or_buf="../data/puzzles_alg.csv", index=False)

In [53]:
# Export the processed puzzles as a JSON array with one object per puzzle
df.to_json(path_or_buf="../data/puzzles_alg.json", orient="records")

# Testing

In [None]:
# Preview the first five processed rows
df.head(n=5)

In [None]:
# Show every field of the first remaining puzzle
df.iloc[0]

In [None]:
# `placement` exists only as a local inside pieces_and_positions, and the
# "placement" column has been dropped by this point, so the original
# `chess.Board(placement)` raises NameError on a fresh run. Rebuild the board of
# the first remaining puzzle from its stored piece positions instead.
# Only the piece placement is restored; side to move, castling rights and
# en passant keep the defaults of an empty board.
first_puzzle = df.iloc[0]
board = chess.Board(None)  # None -> start from an empty board
for symbol, square_names in first_puzzle["positions"].items():
    for square_name in square_names:
        board.set_piece_at(
            chess.parse_square(square_name), chess.Piece.from_symbol(symbol)
        )

In [None]:
# Castling field of every puzzle where it is not "-"
df.loc[df["castling"] != "-", "castling"]

In [50]:
# Lowest puzzle rating in the current frame
df.loc[:, "rating"].min()


550

In [48]:
# Number of puzzles with fewer than nine pieces
len(df.loc[df["number_of_pieces"] < 9])

2848

In [51]:
# Smallest piece count among the puzzles with fewer than nine pieces
df.loc[df["number_of_pieces"] < 9, "number_of_pieces"].min()

3