In [3]:
import chess.pgn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import lichess.api
from lichess.format import PGN
import json
from itertools import islice
from efficient_apriori import apriori

In [17]:
# Read the chess games data file and collect (a) one row per rated classical game and
# (b) a dictionary mapping each user to every opening they appeared in.
# NOTE: the per-user lists may contain duplicates at this stage.

# Upper bound on the number of games scanned from the PGN dump.
MAX_GAMES = 6000000

users = {}

games_arr = []
# The context manager guarantees the PGN handle is closed, even if parsing fails part-way.
# The encoding is given explicitly so the read does not depend on the platform locale
# (the CSV export of this data is written as UTF-8 as well).
with open("lichess_db_standard_rated_2016-05.pgn", encoding="utf-8") as pgn:
    for x in range(MAX_GAMES):
        headers = chess.pgn.read_headers(pgn)
        if headers is None:
            # read_headers returns None once the end of the file is reached.
            break
        if headers.get('Event') != "Rated Classical game":
            continue

        # Keep only the opening family: drop the variation after ':' and any
        # numbered suffix after ' #'. A missing Opening tag falls back to the
        # '?' placeholder instead of crashing on None.
        opening = headers.get('Opening', '?').split(':')[0].split(' #')[0]

        row = [headers.get('Event'), headers.get('White'), headers.get('Black'), opening, headers.get('WhiteElo'), headers.get('BlackElo'), headers.get('BlackRatingDiff'), headers.get('WhiteRatingDiff')]
        games_arr.append(row)

        # Record the opening for both participants of the game.
        for player in (headers.get('White'), headers.get('Black')):
            users.setdefault(player, []).append(opening)

# Export users to json to avoid having to reread data
with open('users.json','w') as data: 
    json.dump(users, data)

In [21]:
# Persist the collected game rows as a CSV file so the PGN dump does not have to be parsed again.
game_columns = [
    'Event',
    'White',
    'Black',
    'Opening',
    'WhiteElo',
    'BlackElo',
    'BlackRatingDiff',
    'WhiteRatingDiff',
]
dfGames = pd.DataFrame(games_arr, columns=game_columns)
# Number of game rows collected.
print(len(games_arr))
dfGames.to_csv('out.csv', encoding='utf-8', index=False)

1501080


In [6]:
# Load the users dictionary back from disk and reduce every user's
# opening list to its distinct entries (one "basket" per user).
with open('users.json') as users_file:
    data = users_file.read()

idkdict = {
    user: list(set(played))
    for user, played in json.loads(data).items()
}

In [7]:
# Reload the games table from the CSV written earlier and show a preview.
games_csv_path = 'out.csv'
df = pd.read_csv(games_csv_path)
print(df)

                        Event        White             Black  \
0        Rated Classical game    edumenoyo   platito_de_lefa   
1        Rated Classical game     a7md2010           dreypat   
2        Rated Classical game     raking41          mariofil   
3        Rated Classical game     JOHN14-6            naikus   
4        Rated Classical game      guevara          Sacchina   
...                       ...          ...               ...   
1501075  Rated Classical game  skailain279  pincheweylopezXD   
1501076  Rated Classical game     Zimerman         paramirez   
1501077  Rated Classical game       Airon8           ardbano   
1501078  Rated Classical game  Danieloaiza         dannyatac   
1501079  Rated Classical game  papadesarah         photomium   

                        Opening  WhiteElo  BlackElo  BlackRatingDiff  \
0                  Owen Defense      1770      1625             -7.0   
1             Hungarian Opening      1704      1764            -12.0   
2              

In [8]:
# Average number of distinct openings per user (mean basket size).
basket_size_sum = sum(len(basket) for basket in idkdict.values())
count = len(idkdict)

print(basket_size_sum/count)

11.348420649691786


In [9]:
# Shape the user baskets for efficient_apriori: a plain list of
# transactions, each transaction being one user's list of openings.
openingswithusers = list(idkdict.items())
openings = [basket for _user, basket in openingswithusers]

In [10]:
# Mine association rules, then keep the rules with a single-item right-hand side.
# Rules are grouped by left-hand-side size (1 to 5 items) and, within each group,
# ordered by ascending lift.
itemsets, rules = apriori(openings, min_support = 0.15, min_confidence = 0.85)
results = []
for lhs_size in range(1, 6):
    same_size_rules = [
        candidate
        for candidate in rules
        if len(candidate.lhs) == lhs_size and len(candidate.rhs) == 1
    ]
    same_size_rules.sort(key=lambda candidate: candidate.lift)
    results.extend(same_size_rules)

In [11]:
# Print the collected rules in their stored order (ascending lift within each
# left-hand-side size), one per paragraph.
for found_rule in results:
    print("{}\n".format(found_rule))

{English Opening, French Defense} -> {Sicilian Defense} (conf: 0.853, supp: 0.180, lift: 1.723, conv: 3.436)

{French Defense, Scandinavian Defense} -> {Sicilian Defense} (conf: 0.858, supp: 0.219, lift: 1.733, conv: 3.550)

{French Defense, Philidor Defense} -> {Sicilian Defense} (conf: 0.859, supp: 0.211, lift: 1.736, conv: 3.583)

{Bishop's Opening, French Defense} -> {Sicilian Defense} (conf: 0.860, supp: 0.163, lift: 1.737, conv: 3.601)

{Caro-Kann Defense, Queen's Pawn Game} -> {Sicilian Defense} (conf: 0.863, supp: 0.151, lift: 1.744, conv: 3.695)

{Italian Game, Philidor Defense} -> {Sicilian Defense} (conf: 0.865, supp: 0.174, lift: 1.748, conv: 3.752)

{French Defense, Italian Game} -> {Sicilian Defense} (conf: 0.869, supp: 0.189, lift: 1.755, conv: 3.843)

{French Defense, Ruy Lopez} -> {Sicilian Defense} (conf: 0.877, supp: 0.173, lift: 1.771, conv: 4.091)

{Philidor Defense, Ruy Lopez} -> {Sicilian Defense} (conf: 0.878, supp: 0.162, lift: 1.774, conv: 4.144)

{Philidor De

In [12]:
# Calculate win counts and frequency of openings.
#
# A game counts as a "win for the opening" when the side credited with the opening
# gained rating: Black if the opening name contains "Defense", White otherwise.
# At this stage `winrate` holds raw win COUNTS per opening; `frequency` holds the
# number of games per opening.
#
# Vectorised with boolean masks and value_counts instead of a row-by-row
# iterrows loop, which is very slow on a table of this size.
allopenings = df["Opening"].unique()

# True for rows whose opening is attributed to Black.
defense_mask = df["Opening"].str.contains("Defense", regex=False, na=False)
# A positive rating diff for the credited side is the win signal; NaN diffs compare False.
win_mask = (defense_mask & (df["BlackRatingDiff"] > 0)) | (~defense_mask & (df["WhiteRatingDiff"] > 0))

wins_by_opening = df.loc[win_mask, "Opening"].value_counts()
games_by_opening = df["Opening"].value_counts()

# Plain-int dictionaries keyed in the same order as `allopenings`.
winrate = {name: int(wins_by_opening.get(name, 0)) for name in allopenings}
frequency = {name: int(games_by_opening.get(name, 0)) for name in allopenings}

In [13]:
# Drop openings seen in fewer than 30 games and turn the remaining win counts
# into win rates rounded to three decimals.
# NOTE(review): this mutates `winrate` in place, so the cell is not idempotent;
# rebuild `winrate` before running it a second time.
for name, games_played in frequency.items():
    if games_played < 30:
        winrate.pop(name)
        continue
    winrate[name] = round(winrate[name] / games_played, 3)

In [14]:
# Check support values for openings: for every opening, count the number of
# user baskets that contain it.
allopenings = df["Opening"].unique()
basketfreq = dict.fromkeys(allopenings, 0)
for basket in idkdict.values():
    for played in basket:
        basketfreq[played] += 1
# One basket per user.
numbaskets = len(idkdict)

In [15]:
# Print each opening's basket support, then report how many openings exceed
# the 0.07 support threshold (popc) out of the total number of openings (tc).
popc = 0
tc = 0
for name, hits in basketfreq.items():
    support = hits/numbaskets
    print(name, support)
    if support > 0.07:
        popc += 1
    tc += 1
print(popc)
print(tc)

Owen Defense 0.1524043347985058
Hungarian Opening 0.15817262635462453
Italian Game 0.31901160258187145
Horwitz Defense 0.15811982734724586
French Defense 0.43704378357686874
Scotch Game 0.232064837181061
Philidor Defense 0.3483150516770285
Queen's Pawn Game 0.4462440106126005
Center Game 0.1740915270792909
Budapest Defense 0.014981718343695138
King's Knight Opening 0.17019760028511463
Ruy Lopez 0.28589342520360617
Sicilian Defense 0.4949642946712602
King's Gambit Accepted 0.027904275399622487
Slav Defense 0.12398526907694135
Reti Opening 0.05999287213400388
Van't Kruijs Opening 0.25620718330495384
English Opening 0.29963436687390277
Caro-Kann Defense 0.24947530986417454
Nimzowitsch Defense 0.21728111511503584
Three Knights Opening 0.1285391834633509
Bishop's Opening 0.2573027627080611
Scandinavian Defense 0.3598780342929553
Four Knights Game 0.2162647342229966
Old Benoni Defense 0.04762470465555248
King's Pawn Game 0.42199606647395027
Robatsch (Modern) Defense 0.07544978154410698
Queen

In [16]:
# Normalized values: share of all games in which each opening was played.
# The denominator is taken from the games table rather than a hard-coded row
# count, so it stays correct when the input data changes.
total_games = len(df)
for k, games_played in frequency.items():
    # Same cut-off as the outlier filter: openings with fewer than 30 games are excluded.
    if games_played >= 30:
        print(k, games_played/total_games)

Owen Defense 0.01128920510565726
Hungarian Opening 0.01035987422389213
Italian Game 0.04644189516881179
Horwitz Defense 0.012664881285474458
French Defense 0.06353292296213393
Scotch Game 0.02173701601513577
Philidor Defense 0.049298505076345034
Queen's Pawn Game 0.0707803714658779
Center Game 0.012431049644256136
Budapest Defense 0.0007054920457270765
King's Knight Opening 0.00954845844325419
Ruy Lopez 0.03369707144189517
Sicilian Defense 0.10559463852692728
King's Gambit Accepted 0.0011758200762117942
Slav Defense 0.00837130599301836
Reti Opening 0.002524848775548272
Van't Kruijs Opening 0.020908945559197378
English Opening 0.027007221467210275
Caro-Kann Defense 0.02156180883097503
Nimzowitsch Defense 0.01407986249900072
Three Knights Opening 0.007350041303594745
Bishop's Opening 0.030268206891038454
Scandinavian Defense 0.046276014602819306
Four Knights Game 0.016498787539638127
Old Benoni Defense 0.002323660297918832
King's Pawn Game 0.0682508593812455
Robatsch (Modern) Defense 0.0