In [8]:
import requests
import zipfile
import os
from pathlib import Path, PurePath
import shutil
from collections import defaultdict
import numpy as np
import pandas as pd
import re
import plotly.express as px

I will use https://www.pgnmentor.com as a source for chess games in PGN format. This resource allows downloading zipped PGNs.

In [59]:
lastname="Carlsen"
top = 30
directory_to_extract_to = 'downloads'

#### Downloading and saving archive with PGN file

In [60]:
print('Downloading ZIP with PGNs of '+lastname)
zipFileUrl = "https://www.pgnmentor.com/players/"+lastname+".zip"
zipFile = requests.get(zipFileUrl)

if not os.path.exists(directory_to_extract_to):
    os.makedirs(directory_to_extract_to)
else:
    shutil.rmtree(directory_to_extract_to)           # Removes all the subdirectories!
    os.makedirs(directory_to_extract_to)

path_to_zip_file = "downloads/"+lastname+'.zip'

print("Download successful")



Downloading ZIP with PGNs of Carlsen
Download successful


In [61]:
with open(os.fspath(PurePath(path_to_zip_file)), 'wb') as f:  
    f.write(zipFile.content)

#### Unpacking PGN file

In [62]:
zip_ref = zipfile.ZipFile(os.fspath(PurePath(path_to_zip_file)), 'r')
zip_ref.extractall(os.fspath(PurePath(directory_to_extract_to)))
zip_ref.close()

#### Converting PGN to DataFrame

In [76]:
def convertPGNtoDF(path):
    """"""
    with open(path, 'r') as file:
        data_from_pgn = file.read()
        
    dct = defaultdict(list)

    moves = False
    for element in data_from_pgn.split("\n\n"):
        tags=['Event', 'Site', 'Date', 'Round', 'White', 'Black', 'Result', 'WhiteElo', 'BlackElo', 'ECO', 'moves']
        if moves:
            dct["moves"].append(element)
            moves = False
            tags.remove('moves')
        else:
            for t in re.findall("\[.*\]", element):
                tag = t[1:-1].split('"')[0][:-1]
                val = t[1:-1].split('"')[1]
                dct[tag].append(val)
                tags.remove(tag)
                
            for t in tags:
                dct[t].append(np.nan)
                #print("nan for ", tag)
            moves = True
    for d in dct:
        print(d,":",len(dct[d]))
    print(dct.keys())
    input()
    return pd.DataFrame(dct)

In [77]:
rawDf = convertPGNtoDF(directory_to_extract_to+'/'+lastname+'.pgn')
rawDf

Event : 3431
Site : 3431
Date : 3431
Round : 3431
White : 3431
Black : 3431
Result : 3431
WhiteElo : 3431
BlackElo : 3431
ECO : 3431
moves : 3430
dict_keys(['Event', 'Site', 'Date', 'Round', 'White', 'Black', 'Result', 'WhiteElo', 'BlackElo', 'ECO', 'moves'])


KeyboardInterrupt: Interrupted by user

#### Finding out the top openings

In [44]:
df = rawDf.groupby("ECO").count().sort_values(by="Event", ascending=False).head(top)
df.reset_index(inplace=True)
df.drop(df.iloc[:, 2:], axis = 1, inplace = True)
df.columns=["ECO","Frequency"]
df

Unnamed: 0,ECO,Frequency
0,B90,152
1,C42,127
2,C65,120
3,C67,103
4,C50,87
5,E15,81
6,C88,72
7,B33,70
8,D37,66
9,B12,65


In [45]:
dr = requests.get("http://www.neerpeltersk.be/palview4/p4sets/P3eco.txt")
path_to_Eco = 'eco.txt'

with open(path_to_Eco, 'wb') as f:  
    f.write(dr.content) 
    

In [46]:
with open(path_to_Eco, 'r') as file:
    data_from_eco = file.read()

In [47]:
ecoDct = defaultdict(list)
for row in data_from_eco.split("\n\n")[4:]:
    code = row[:3]
    desc = row[3:].lstrip()
    ecoDct[code].append(desc)

In [48]:
def getECOdesc(eco):
    try:
        return ecoDct[eco][0]
    except:
        return np.nan

In [55]:
df["desc"]=df["ECO"].map(getECOdesc)
df["perc"]=df["Frequency"]/len(rawDf)
df

Unnamed: 0,ECO,Frequency,desc,perc
0,B90,152,Sicilian: Najdorf\n1.e4 c5 2.Nf3 d6 3.d4 cxd4 ...,0.039024
1,C42,127,Petrov's Defense\n1.e4 e5 2.Nf3 Nf6 1/2,0.032606
2,C65,120,Ruy Lopez: Berlin Defense\n1.e4 e5 2.Nf3 Nc6 3...,0.030809
3,C67,103,Ruy Lopez: Berlin Defense: Open Variation\n1.e...,0.026444
4,C50,87,King's Pawn Game\n1.e4 e5 2.Nf3 Nc6 3.Bc4 1/2,0.022336
5,E15,81,Queen's Indian: 4.g3\n1.d4 Nf6 2.c4 e6 3.Nf3 b...,0.020796
6,C88,72,Ruy Lopez: Closed: Balla Variation\n1.e4 e5 2....,0.018485
7,B33,70,Sicilian Defense\n1.e4 c5 2.Nf3 Nc6 3.d4 cxd4 ...,0.017972
8,D37,66,QGD: 4.Nf3\n1.d4 d5 2.c4 e6 3.Nc3 Nf6 4.Nf3 Be...,0.016945
9,B12,65,Caro-Kann Defense\n1.e4 c6 2.d4 1/2,0.016688


In [58]:
fig = px.bar(df, x='ECO', y='Frequency',
             hover_data=['desc','perc'],
             height=400, 
             title = "Top-"+str(top)+" openings of "+lastname,
             labels={'ECO':'ECO (opening code)', 
                     'Frequency':'Frequency',
                    'perc':'Ratio'},
             color='Frequency',
             color_continuous_scale="rdbu_r"
            )
fig.show()