In [2]:
import chess.pgn
import pandas as pd

# Function to read a PGN file and extract data
def read_pgn(file_path, headers=None, encoding="utf-8"):
    """Read every game in a PGN file into a DataFrame of header tags.

    Parameters
    ----------
    file_path : str or path-like
        Location of the PGN file to parse.
    headers : iterable of str, optional
        Names of the PGN header tags to extract, one column per tag.
        Defaults to Event, Site, White, Black, Result, WhiteElo, BlackElo
        and Opening.
    encoding : str, default "utf-8"
        Text encoding used to open the file. Passed explicitly so the
        result does not depend on the platform's default encoding.

    Returns
    -------
    pandas.DataFrame
        One row per game and one column per requested tag, in the order
        given. A tag missing from a game is stored as the string "N/A".
        The columns are present even when the file contains no games.
    """
    if headers is None:
        headers = (
            "Event",
            "Site",
            "White",
            "Black",
            "Result",
            "WhiteElo",
            "BlackElo",
            "Opening",
        )
    else:
        # Materialise once so a generator argument can be reused per game.
        headers = tuple(headers)

    games_data = []
    with open(file_path, encoding=encoding) as pgn:
        # read_game returns None once the input is exhausted; using it as the
        # iter() sentinel ends the loop there.
        for game in iter(lambda: chess.pgn.read_game(pgn), None):
            games_data.append(
                {tag: game.headers.get(tag, "N/A") for tag in headers}
            )

    # Passing columns explicitly keeps the schema stable for an empty file.
    return pd.DataFrame(games_data, columns=list(headers))

# Path to the Feather file holding the games table (this is not a raw PGN file)
file_path = 'lichess_db_standard_rated_2016-09_10k.feather'

# Load the games table directly from the Feather file.
# read_pgn() is not called for this file; the old call is kept for reference:
# chess_data = read_pgn('')
df = pd.read_feather(file_path)

In [4]:
# Show each column's name, non-null count and dtype for the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Event            10000 non-null  object
 1   Site             10000 non-null  object
 2   White            10000 non-null  object
 3   Black            10000 non-null  object
 4   Result           10000 non-null  object
 5   WhiteElo         10000 non-null  object
 6   BlackElo         10000 non-null  object
 7   WhiteRatingDiff  9996 non-null   object
 8   BlackRatingDiff  9996 non-null   object
 9   ECO              10000 non-null  object
 10  Opening          10000 non-null  object
 11  TimeControl      10000 non-null  object
 12  Termination      10000 non-null  object
 13  Moves            10000 non-null  object
dtypes: object(14)
memory usage: 1.1+ MB


In [35]:
# Apply the row filters one after another as a single chain:
#   1. drop games with no WhiteRatingDiff value,
#   2. drop games whose Termination is 'Abandoned',
#   3. drop games whose Event mentions 'Bullet',
#   4. drop games whose Event mentions 'Correspondence'.
df = (
    df
    .dropna(subset=['WhiteRatingDiff'])
    .loc[lambda games: games['Termination'] != 'Abandoned']
    .loc[lambda games: ~games['Event'].str.contains('Bullet')]
    .loc[lambda games: ~games['Event'].str.contains('Correspondence')]
)

In [27]:
# Preview the first rows of df.
df.head()

Unnamed: 0,Event,Site,White,Black,Result,WhiteElo,BlackElo,WhiteRatingDiff,BlackRatingDiff,ECO,Opening,TimeControl,Termination,Moves
2,Rated Classical tournament https://lichess.org...,https://lichess.org/13YDbkSL,ronniak,Ezequielluis,1-0,1801,1768,10,-10,A04,Zukertort Opening: Black Mustang Defense,600+0,Normal,1. Nf3 Nc6 2. g3 Nf6 3. Bg2 g6 4. d4 d5 5. O-O...
3,Rated Blitz tournament https://lichess.org/tou...,https://lichess.org/Tongyamf,URDEADMEAT,brumia,1-0,1627,1659,13,-14,D00,Queen's Pawn Game: Chigorin Variation,180+0,Time forfeit,1. d4 d5 2. Nc3 f6 3. Bf4 e6 4. a3 Bd6 5. Bg3 ...
6,Rated Blitz tournament https://lichess.org/tou...,https://lichess.org/Sd0Jxl5Y,cougarsrule,iwangrozni,1-0,1763,1756,11,-11,B76,"Sicilian Defense: Dragon Variation, Yugoslav A...",180+0,Time forfeit,1. e4 c5 2. Nf3 Nc6 3. d4 cxd4 4. Nxd4 Nf6 5. ...
7,Rated Classical tournament https://lichess.org...,https://lichess.org/rYs8Rj2X,hassenrouissi28,MedRx,1-0,1500,1500,162,-163,A45,Indian Game,600+0,Normal,1. d4 Nf6 2. Bf4 d5 3. h3 Nc6 4. e3 Bf5 5. a3 ...
8,Rated Blitz tournament https://lichess.org/tou...,https://lichess.org/D7LEbazz,mrbnk,Thiago_Siqueira,0-1,1973,1946,-12,12,B22,Sicilian Defense: Alapin Variation,180+0,Normal,1. e4 c5 2. c3 e6 3. d4 cxd4 4. cxd4 d6 5. Nc3...


In [29]:
# Count how many games have each distinct Result value.
df['Result'].value_counts()

Result
1-0        3517
0-1        3275
1/2-1/2     284
Name: count, dtype: int64

In [42]:
# Drop the identifying columns that are not used further in this cell.
df_clean = df.drop(['Site', 'White', 'Black'], axis=1)
# We can change the min # move here
min_total_moves = 8

# Remove PGN brace comments (e.g. "{ [%eval 0.3] }") before testing the text:
# in Lichess exports a forced-mate evaluation inside such a comment contains
# '#', which would otherwise be mistaken for a checkmating move.
moves_text = df_clean['Moves'].str.replace(r'\{[^}]*\}', '', regex=True)

# Keep a game if its move text reaches move number `min_total_moves`.
# The number may sit at the very start of the string (move 1) or after
# whitespace; the raw f-string keeps `\.` a literal dot in the regex.
reaches_min_move = moves_text.str.contains(
    rf'(?:^|\s){min_total_moves}\. ', na=False
)
# ...or if a move carries the '#' checkmate suffix, however short the game.
has_checkmate = moves_text.str.contains('#', regex=False, na=False)

df_clean = df_clean[reaches_min_move | has_checkmate]

df_clean

Unnamed: 0,Event,Result,WhiteElo,BlackElo,WhiteRatingDiff,BlackRatingDiff,ECO,Opening,TimeControl,Termination,Moves
2,Rated Classical tournament https://lichess.org...,1-0,1801,1768,+10,-10,A04,Zukertort Opening: Black Mustang Defense,600+0,Normal,1. Nf3 Nc6 2. g3 Nf6 3. Bg2 g6 4. d4 d5 5. O-O...
3,Rated Blitz tournament https://lichess.org/tou...,1-0,1627,1659,+13,-14,D00,Queen's Pawn Game: Chigorin Variation,180+0,Time forfeit,1. d4 d5 2. Nc3 f6 3. Bf4 e6 4. a3 Bd6 5. Bg3 ...
6,Rated Blitz tournament https://lichess.org/tou...,1-0,1763,1756,+11,-11,B76,"Sicilian Defense: Dragon Variation, Yugoslav A...",180+0,Time forfeit,1. e4 c5 2. Nf3 Nc6 3. d4 cxd4 4. Nxd4 Nf6 5. ...
7,Rated Classical tournament https://lichess.org...,1-0,1500,1500,+162,-163,A45,Indian Game,600+0,Normal,1. d4 Nf6 2. Bf4 d5 3. h3 Nc6 4. e3 Bf5 5. a3 ...
8,Rated Blitz tournament https://lichess.org/tou...,0-1,1973,1946,-12,+12,B22,Sicilian Defense: Alapin Variation,180+0,Normal,1. e4 c5 2. c3 e6 3. d4 cxd4 4. cxd4 d6 5. Nc3...
...,...,...,...,...,...,...,...,...,...,...,...
9995,Rated Classical game,1-0,1712,1741,+24,-13,B50,Sicilian Defense: Delayed Alapin,600+10,Normal,1. e4 { [%eval 0.3] } 1... c5 { [%eval 0.34] }...
9996,Rated Blitz game,0-1,1979,2010,-10,+9,D15,Slav Defense: Three Knights Variation,180+0,Normal,1. d4 c6 2. c4 d5 3. Nc3 Nf6 4. Nf3 Bf5 5. Bf4...
9997,Rated Blitz game,1/2-1/2,1700,1754,+2,-1,D01,Queen's Pawn Game: Chigorin Variation,180+3,Normal,1. d4 Nf6 2. Nc3 d5 3. h3 Bf5 4. Bg5 e6 5. a3 ...
9998,Rated Blitz game,1-0,1836,1870,+16,-11,C45,Scotch Game,180+2,Normal,1. e4 e5 2. Nf3 Nc6 3. d4 exd4 4. Nxd4 d6 5. B...


In [43]:
# Count how often each exact Moves string occurs in the filtered games.
df_clean['Moves'].value_counts()

Moves
1. e4 e5 2. Qh5 Nc6 3. Bc4 Nf6 4. Qxf7#                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [48]:
# Numeric encoding of the Result strings: '1-0' -> 1.0, '0-1' -> 0.0,
# '1/2-1/2' -> 0.5. Any other value is left unchanged.
result_scores = {'1-0': 1.0, '0-1': 0.0, '1/2-1/2': 0.5}
# Columns removed from the modelling table.
unused_columns = ['Event', 'BlackElo', 'BlackRatingDiff', 'TimeControl', 'Termination']

# Build a new frame instead of mutating the filtered slice in place. This
# avoids assigning into a possible copy, and the cell can be re-run safely:
# already-encoded values map to themselves and already-dropped columns are
# ignored.
df_clean = (
    df_clean
    .assign(
        Result=lambda games: games['Result']
        .map(lambda outcome: result_scores.get(outcome, outcome))
        # infer_objects returns a new Series, so its result must be used;
        # it turns the all-float object column into a float column.
        .infer_objects()
    )
    .drop(columns=unused_columns, errors='ignore')
)
df_clean

Unnamed: 0,Result,WhiteElo,WhiteRatingDiff,ECO,Opening,Moves
2,1.0,1801,+10,A04,Zukertort Opening: Black Mustang Defense,1. Nf3 Nc6 2. g3 Nf6 3. Bg2 g6 4. d4 d5 5. O-O...
3,1.0,1627,+13,D00,Queen's Pawn Game: Chigorin Variation,1. d4 d5 2. Nc3 f6 3. Bf4 e6 4. a3 Bd6 5. Bg3 ...
6,1.0,1763,+11,B76,"Sicilian Defense: Dragon Variation, Yugoslav A...",1. e4 c5 2. Nf3 Nc6 3. d4 cxd4 4. Nxd4 Nf6 5. ...
7,1.0,1500,+162,A45,Indian Game,1. d4 Nf6 2. Bf4 d5 3. h3 Nc6 4. e3 Bf5 5. a3 ...
8,0.0,1973,-12,B22,Sicilian Defense: Alapin Variation,1. e4 c5 2. c3 e6 3. d4 cxd4 4. cxd4 d6 5. Nc3...
...,...,...,...,...,...,...
9995,1.0,1712,+24,B50,Sicilian Defense: Delayed Alapin,1. e4 { [%eval 0.3] } 1... c5 { [%eval 0.34] }...
9996,0.0,1979,-10,D15,Slav Defense: Three Knights Variation,1. d4 c6 2. c4 d5 3. Nc3 Nf6 4. Nf3 Bf5 5. Bf4...
9997,0.5,1700,+2,D01,Queen's Pawn Game: Chigorin Variation,1. d4 Nf6 2. Nc3 d5 3. h3 Bf5 4. Bg5 e6 5. a3 ...
9998,1.0,1836,+16,C45,Scotch Game,1. e4 e5 2. Nf3 Nc6 3. d4 exd4 4. Nxd4 d6 5. B...
