In [1]:
# Replace string with your username
user = "horsegobrrrr"
file_path = "lichess_horsegobrrrr_2024-05-08.pgn"

In [2]:
import chess.pgn
import chess
import chess.engine
import math
import numpy as np
import pandas as pd
import os
import pyarrow
import logging
import re
from eco import eco_decode, simplify_eco, eco_cluster

# Function to read a PGN file and extract data
def read_pgn(file_path):
    games_data = []
    with open(file_path) as pgn:
        while True:
            game = chess.pgn.read_game(pgn)
            if game is None:
                break
            # Extract data from the game
            # Example: game.headers["Event"], game.mainline_moves(), etc.
            extracted_data = {
                "Event": game.headers.get("Event", "N/A"),
                "Site": game.headers.get("Site", "N/A"),
                "White": game.headers.get("White", "N/A"),
                "Black": game.headers.get("Black", "N/A"),
                "Result": game.headers.get("Result", "N/A"),
                "WhiteElo": game.headers.get("WhiteElo", "N/A"),
                "BlackElo": game.headers.get("BlackElo", "N/A"),
                "ECO": game.headers.get("ECO", "N/A"),  # Extract ECO
            }
            games_data.append(extracted_data)
    return pd.DataFrame(games_data)

df = read_pgn(file_path)

In [3]:
df

Unnamed: 0,Event,Site,White,Black,Result,WhiteElo,BlackElo,ECO
0,Rated blitz game,https://lichess.org/0GBLERcn,winandTbagU,horsegobrrrr,0-1,1359,1447,B21
1,Rated blitz game,https://lichess.org/Mp39oMSx,horsegobrrrr,winandTbagU,1-0,1442,1364,C41
2,Rated blitz game,https://lichess.org/Tu5Fruad,Napomat,horsegobrrrr,0-1,1381,1437,B20
3,Rated blitz game,https://lichess.org/WnDAnlpS,Toyya254,horsegobrrrr,1-0,1410,1443,A00
4,Rated blitz game,https://lichess.org/5qN7kVyf,horsegobrrrr,fredy4511,0-1,1449,1468,B40
...,...,...,...,...,...,...,...,...
1669,Casual rapid game,https://lichess.org/fBE8S13l,horsegobrrrr,Abdouu10,1-0,1500,1307,A04
1670,Casual rapid game,https://lichess.org/HpDOhr9o,horsegobrrrr,KI1,1-0,1500,1212,A04
1671,Casual blitz game,https://lichess.org/XhA7RNJL,horsegobrrrr,Degurachow-VII,1-0,1500,1304,A07
1672,Casual blitz game,https://lichess.org/2PQpInTE,horsegobrrrr,chipotlepeppers,0-1,1500,1222,A04


In [4]:
pd.set_option('future.no_silent_downcasting', True)

In [5]:
# Convert result into integer value
df['Result'] = df['Result'].replace({'1-0': 1, '0-1': -1, '1/2-1/2': 0})
# Drop unknown openings
df = df[(df.ECO != '?')]
# Drop ties
df = df[(df.Result != 0)]
# Drop ?
df = df[(df.WhiteElo != '?')]
df = df[(df.BlackElo != '?')]
# Turn to int
df.WhiteElo = df['WhiteElo'].astype(int)
df.BlackElo = df['BlackElo'].astype(int)
# Add diff col
df['Diff'] = np.where(df['White'] == user, df['WhiteElo'] - df['BlackElo'], df['BlackElo'] - df['WhiteElo'])
# Consider only Rated Blitz or Rated Rapid
df = df[df['Event'].str.contains('Rated') & (df['Event'].str.contains('blitz') | df['Event'].str.contains('rapid'))]
# Separate games
df_w = df[df.White == user]
df_b = df[df.Black == user]

In [6]:
df_w_clean = df_w.drop(['BlackElo', 'Site', 'Black', 'Event'], axis=1)
df_w_clean

Unnamed: 0,White,Result,WhiteElo,ECO,Diff
1,horsegobrrrr,1,1442,C41,78
4,horsegobrrrr,-1,1449,B40,-19
5,horsegobrrrr,1,1443,B00,-10
6,horsegobrrrr,-1,1448,C41,-16
8,horsegobrrrr,-1,1448,C41,-16
...,...,...,...,...,...
1647,horsegobrrrr,-1,1291,B01,-88
1649,horsegobrrrr,-1,1506,C44,-35
1651,horsegobrrrr,1,1476,C40,2
1653,horsegobrrrr,1,1458,C44,99


In [7]:
df_w_ml = df_w_clean.drop(['White'], axis=1)
df_w_ml

Unnamed: 0,Result,WhiteElo,ECO,Diff
1,1,1442,C41,78
4,-1,1449,B40,-19
5,1,1443,B00,-10
6,-1,1448,C41,-16
8,-1,1448,C41,-16
...,...,...,...,...
1647,-1,1291,B01,-88
1649,-1,1506,C44,-35
1651,1,1476,C40,2
1653,1,1458,C44,99


In [8]:
df.ECO.describe()

count     987
unique    113
top       B50
freq       93
Name: ECO, dtype: object

In [9]:
from eco import simplify_eco
df_w_ml['ECO'] = df_w_ml['ECO'].apply(simplify_eco)

In [10]:
df_w_ml.ECO.describe()

count     502
unique     16
top       C60
freq      213
Name: ECO, dtype: object

In [11]:
df_w_ml

Unnamed: 0,Result,WhiteElo,ECO,Diff
1,1,1442,C41,78
4,-1,1449,B20,-19
5,1,1443,B00,-10
6,-1,1448,C41,-16
8,-1,1448,C41,-16
...,...,...,...,...
1647,-1,1291,B01,-88
1649,-1,1506,C44,-35
1651,1,1476,C40,2
1653,1,1458,C44,99


In [12]:
#df_w_wr = df_w_ml.groupby('ECO')['Result'].agg(lambda x: (x == 1).sum() / len(x))

grouped = df_w_ml.groupby('ECO')

# Aggregate data
df_w_eco = grouped.agg(
    average_WhiteElo=('WhiteElo', 'mean'),
    wins=('Result', lambda x: (x == 1).sum()),
    losses=('Result', lambda x: (x == -1).sum()),
    total_games=('Result', 'size'),
    average_diff=('Diff', 'mean'),
    winrate=('Result', lambda x: (x == -1).sum()/len(x)),
).reset_index()

df_w_sort = df_w_eco.sort_values(by='total_games', ascending=False)
df_w_sort

Unnamed: 0,ECO,average_WhiteElo,wins,losses,total_games,average_diff,winrate
15,C60,1326.920188,130,83,213,7.286385,0.389671
7,B20,1311.71875,29,35,64,-5.875,0.546875
6,B10,1317.386364,20,24,44,5.386364,0.545455
1,B00,1335.9375,19,13,32,7.21875,0.40625
8,C00,1339.785714,16,12,28,7.892857,0.428571
11,C42,1288.75,14,14,28,2.964286,0.5
10,C41,1314.96,17,8,25,-3.76,0.32
2,B01,1277.409091,14,8,22,1.318182,0.363636
9,C40,1327.533333,8,7,15,42.733333,0.466667
12,C44,1249.090909,5,6,11,-12.909091,0.545455


In [13]:
from eco import eco_decode
df_w_sort['Opening'] = df_w_sort['ECO'].apply(eco_decode)
df_w_sort

Unnamed: 0,ECO,average_WhiteElo,wins,losses,total_games,average_diff,winrate,Opening
15,C60,1326.920188,130,83,213,7.286385,0.389671,Ruy Lopez (Spanish opening)
7,B20,1311.71875,29,35,64,-5.875,0.546875,Sicilian defence
6,B10,1317.386364,20,24,44,5.386364,0.545455,Caro-Kann defence
1,B00,1335.9375,19,13,32,7.21875,0.40625,King's pawn opening
8,C00,1339.785714,16,12,28,7.892857,0.428571,French defence
11,C42,1288.75,14,14,28,2.964286,0.5,Petrov's defence
10,C41,1314.96,17,8,25,-3.76,0.32,Philidor's defence
2,B01,1277.409091,14,8,22,1.318182,0.363636,Scandinavian (centre counter) defence
9,C40,1327.533333,8,7,15,42.733333,0.466667,King's knight opening
12,C44,1249.090909,5,6,11,-12.909091,0.545455,King's pawn game


In [14]:
worst_opening = df_w_sort[df_w_sort['winrate'] < 0.45].head(1)
worst_opening

Unnamed: 0,ECO,average_WhiteElo,wins,losses,total_games,average_diff,winrate,Opening
15,C60,1326.920188,130,83,213,7.286385,0.389671,Ruy Lopez (Spanish opening)


In [21]:
from IPython.display import Markdown
markdown_text = "## Your worst opening is the **{}** with a winrate of {:.1f}%. \n ## We reccomend you study the openings below."output = markdown_text.format(worst_opening['Opening'].values[0], worst_opening['winrate'].values[0]*100)

res = eco_cluster(worst_opening['ECO'].values[0])
res = res.merge(df_w_sort[['ECO', 'winrate']], on='ECO', how='left')
res = res.rename(columns={"winrate": "Your winrate"})Markdown(output)


## Your worst opening is the **Ruy Lopez (Spanish opening)** with a winrate of 39.0%. 
 ## We reccomend you study the openings below.

In [22]:
res

Unnamed: 0,ECO,Cluster,Opening,Your winrate
0,C40,7,King's knight opening,0.466667
1,C41,7,Philidor's defence,0.32
2,C42,7,Petrov's defence,0.5
3,C60,7,Ruy Lopez (Spanish opening),0.389671
