In [None]:
# Python STL
import re
import time

# General data science
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# sklearn
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression

# Chess
import chess
import chess.pgn
import chess.svg

# Nicer-looking plotting for notebooks
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Plotting defaults
from matplotlib import style

# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so try the new name first and
# fall back to the legacy one on older installations.
try:
    style.use('seaborn-v0_8-muted')
except OSError:
    style.use('seaborn-muted')
sns.set_style('whitegrid')
sns.set_context('paper')
# Resolution used whenever a figure is saved to disk.
mpl.rcParams['savefig.dpi'] = 500

# Models

In [None]:
# Load the boards CSV; its first column is used as the DataFrame index.
data = pd.read_csv("../data/boards.csv", index_col=0)

In [None]:
# Number of rows loaded, followed by a kernel density estimate of the
# 'stockfish_evaluation' column.
print(len(data))
sns.kdeplot(data=data, x='stockfish_evaluation')

# Linear models and the meaning behind their coefficients

In [None]:
def get_feature_coefficients(model, X):
    """Map each feature name to the coefficient the fitted model gave it.

    Parameters
    ----------
    model : fitted linear model exposing ``coef_``
        ``coef_`` may be 2-D (one row per target; the first row is used) or
        1-D (a model fit on a 1-D target).
    X : pandas.DataFrame
        Frame whose columns, in order, are the features the model was fit on.

    Returns
    -------
    dict
        ``{feature name: coefficient}`` in column order.
    """
    # atleast_2d lets a 1-D coef_ be treated as a single row; indexing a 1-D
    # coef_ with [0] directly would keep only the first coefficient.
    coefs = np.atleast_2d(model.coef_)[0].tolist()
    return dict(zip(X.columns, coefs))


def print_large_coefs(model, X, thresh=1e4):
    """Print the coefficients whose magnitude exceeds ``thresh``.

    Two lists are printed: first the coefficients greater than ``thresh``,
    then those less than ``-thresh``. Each entry is a tuple of
    ``(column position, coefficient, feature name)``.

    Parameters
    ----------
    model : fitted linear model exposing ``coef_`` (1-D, or 2-D whose first
        row is used).
    X : pandas.DataFrame
        Frame whose columns, in order, are the features the model was fit on.
    thresh : float, optional
        Magnitude a coefficient must exceed to be reported.
    """
    coefs = np.atleast_2d(model.coef_)[0].tolist()

    # Take the names from X itself rather than from a notebook-level list, so
    # the labels always correspond to the frame that was passed in.
    feats = list(X.columns)

    big_pos_coefs = [(i, c, feats[i]) for i, c in enumerate(coefs) if c > thresh]
    big_neg_coefs = [(i, c, feats[i]) for i, c in enumerate(coefs) if c < -thresh]

    print(big_pos_coefs)
    print(big_neg_coefs)


def score_scatterplot(model, X_np, y_np, ax=None, title=None):
    """Scatter the model's predictions (y-axis) against the true targets (x-axis).

    ``X_np`` and ``y_np`` are NumPy arrays; both the targets and the
    predictions are flattened to 1-D before plotting. ``ax`` and ``title``
    are forwarded to seaborn / the resulting axes.
    """
    observed = y_np.ravel()
    predicted = model.predict(X_np).ravel()
    scatter_ax = sns.scatterplot(x=observed, y=predicted, ax=ax)
    scatter_ax.set_title(title)


def board_heatmap(model, X, square_fmt, flip=False, ax=None, title=None):
    """Draw an 8x8 heatmap of the coefficients of one per-square feature family.

    ``square_fmt`` is a format string with one placeholder for the square
    name (e.g. ``'white_{}_control'``); the formatted string is looked up in
    the model's feature-to-coefficient mapping. When ``flip`` is true every
    coefficient is negated before plotting.
    """
    coef_by_feature = get_feature_coefficients(model, X)
    grid = np.zeros([8, 8])

    for square in chess.SQUARES:
        feature = square_fmt.format(chess.square_name(square))
        value = coef_by_feature[feature]
        # Mirror the square so that row 0 of the grid (drawn at the top of the
        # heatmap) lines up with the '8' tick label set below.
        mirrored = chess.square_mirror(square)
        row = chess.square_rank(mirrored)
        col = chess.square_file(mirrored)
        grid[row][col] = -value if flip else value

    heat_ax = sns.heatmap(grid, center=0, cmap='coolwarm', ax=ax, cbar=False)
    heat_ax.set_xticklabels(list("ABCDEFGH"))
    heat_ax.set_yticklabels(range(8, 0, -1))
    heat_ax.set_title(title)


def mobility_importance(model, X, ax, title=None):
    """Bar-plot the mean mobility coefficient per piece type and color.

    Mobility features are the columns of ``X`` that end with ``'mobility'``
    and start with the piece letter: upper case for White, lower case for
    Black. Black's mean is multiplied by -1 before plotting. The axis labels
    are blanked and the legend is removed.
    """
    piece_names = {
        'K': 'King',
        'Q': 'Queen',
        'R': 'Rook',
        'B': 'Bishop',
        'N': 'Knight',
        'P': 'Pawn',
    }

    coef_by_feature = get_feature_coefficients(model, X)

    records = []
    for color in ['White', 'Black']:
        is_black = color == 'Black'
        for symbol, name in piece_names.items():
            prefix = symbol.lower() if is_black else symbol
            matching = [
                col for col in X.columns
                if col.startswith(prefix) and col.endswith('mobility')
            ]
            mean_coef = np.mean([coef_by_feature[col] for col in matching])
            if is_black:
                mean_coef *= -1
            records.append([name, mean_coef, color])

    mobility = pd.DataFrame.from_records(
        records, columns=['Piece', 'Mean coefficient', 'Color']
    )

    bar_ax = sns.barplot(data=mobility, x='Piece', y='Mean coefficient', hue='Color', ax=ax)
    bar_ax.set_title(title)
    bar_ax.set_xlabel("")
    bar_ax.set_ylabel("")
    bar_ax.get_legend().remove()

In [None]:
# Columns that are not used as model inputs.
header_feats = [
    "Event", "Site", "Date", "Round", "White", "Black", "Result",
    "BlackElo", "BlackRatingDiff", "ECO", "Opening", "Termination",
    "TimeControl", "UTCDate", "UTCTime", "WhiteElo", "WhiteRatingDiff",
]

ignore_feats = (
    ['extra_' + piece for piece in 'qrbnkQRBNK']
    + ['is_checkmate', 'is_stalemate']
    + ['white_has_castled', 'black_has_castled']
    + ['stockfish_evaluation']  # the regression target
    + header_feats
)

# Every remaining column is a model feature.
features = [feat for feat in data.columns if feat not in ignore_feats]

# Boundaries on |stockfish_evaluation| that split the boards into
# low / mid / high scoring groups.
low_cutoff = 3250
high_cutoff = 9600

abs_evaluation = data.stockfish_evaluation.abs()

nonzero_range = data.stockfish_evaluation != 0
low_range = abs_evaluation <= low_cutoff
mid_range = (abs_evaluation > low_cutoff) & (abs_evaluation < high_cutoff)
high_range = abs_evaluation >= high_cutoff

# (label, boolean row selector) pairs; one model is fit per entry.
model_ranges = [
    ['All', [True] * len(data)],
    ['Low', low_range],
    ['Mid', mid_range],
    ['High', high_range],
]

# Column titles for the subplot grid, one per plot type. Only the row for
# the 'All' score group is titled (see the loop below).
plot_titles = [
    'Evaluation scatterplots',
    'Number of pieces controlling each square',
    'Lowest-valued white piece controlling',
    'Lowest-valued black piece controlling',
    'Piece mobility importance',
]

# Figure size: 4 inches per plot column and 4 inches per score-range row.
num_plots = len(plot_titles)
figsize = (num_plots * 4, len(model_ranges) * 4)

# Clear the current pyplot figure, then create the grid: one row per score
# range and one column per plot type, with independent axes.
plt.clf()
fig, axes = plt.subplots(len(model_ranges), len(plot_titles), sharex=False, sharey=False, figsize=figsize)

# Fit one linear regression per score range and fill one row of the subplot
# grid with its diagnostics.
for (score_group, score_range), ax_row in zip(model_ranges, axes):
    print(f"\n{score_group.upper()} SCORING BOARDS\n")
    print("Selecting data", end=': ')

    selected = data[score_range]

    X = selected.loc[:, features]
    y = selected.loc[:, 'stockfish_evaluation']

    # The target is a column vector, so the fitted coef_ is 2-D, which is the
    # layout the coefficient helpers index with [0].
    X_np = X.to_numpy()
    y_np = y.to_numpy().reshape(-1, 1)

    print(f"{len(X)} data points")

    print("Training model on range", end=': ')
    t0 = time.time()
    model = LinearRegression()
    model.fit(X_np, y_np)
    # Scored on the same rows (and the same target array) used for fitting.
    score = model.score(X_np, y_np)
    elapsed = time.time() - t0
    print(f'score = {score:.3f}, elapsed time = {elapsed:.1f} seconds')

    print("Creating plots")

    # Only the 'All' row carries titles; the other rows get one None per
    # plot column.
    if score_group == 'All':
        titles = plot_titles
    else:
        titles = [None] * num_plots

    score_scatterplot(model, X_np, y_np, ax=ax_row[0], title=titles[0])
    board_heatmap(model, X, 'side_controlling_{}', flip=False, ax=ax_row[1], title=titles[1])
    board_heatmap(model, X, 'white_{}_control', flip=False, ax=ax_row[2], title=titles[2])
    board_heatmap(model, X, 'black_{}_control', flip=True, ax=ax_row[3], title=titles[3])
    mobility_importance(model, X, ax=ax_row[4], title=titles[4])

In [None]:
# Enable tight layout on the subplot grid, render it, and save it to disk.
fig.set_tight_layout(True)
fig.canvas.draw()
fig.savefig('../images/linear_subplots_flipped_mobility.png')

## Clustering Stockfish evaluations vs. predicted evaluations

In [None]:
def kmeans_score_plot(model, X_np, y_np, n_clusters, random_state=None,
                      save_path='../images/kmeans2.png'):
    """Scatter actual vs. predicted evaluations, colored by k-means cluster.

    The k-means centroids are fit on the model's predictions, while the
    cluster label of each point is assigned from its actual evaluation
    (``y_np``).

    Parameters
    ----------
    model : fitted estimator with ``predict``.
    X_np : numpy.ndarray
        Feature matrix passed to ``model.predict``.
    y_np : numpy.ndarray
        Actual evaluations as a column vector (one row per sample).
    n_clusters : int
        Number of k-means clusters.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``; pass an int for reproducible clusters.
    save_path : str, optional
        Where the figure is written.
    """
    preds = model.predict(X_np)

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(preds)
    clusters = kmeans.predict(y_np)

    actual = y_np.reshape(1, -1)[0]
    preds = preds.reshape(1, -1)[0]
    clusters = clusters.reshape(1, -1)[0]

    # Rename the clusters from ints to letters ('A', 'B', ...) so seaborn
    # treats the hue as categorical.
    cluster_names = [chr(c + ord('A')) for c in clusters]

    plt.clf()
    g = sns.scatterplot(x=actual, y=preds, hue=cluster_names)
    g.set_xlabel("Stockfish evaluation")
    g.set_ylabel("Linear model predicted evaluation")
    g.set_title("Clustering of board evaluations")
    plt.savefig(save_path)
    plt.show()

In [None]:
print("Selecting data")

# Use every row of the data set (no score-range filtering).
X = data.loc[:,features]
y = data.loc[:,'stockfish_evaluation']

# NumPy views for scikit-learn; the target is reshaped to a column vector.
X_np = X.to_numpy()
y_np = y.to_numpy().reshape(-1, 1)

# Re-fit a linear model on all of the data
print("Fitting linear regression")
model = LinearRegression()
model.fit(X_np, y_np)


In [None]:
# Plot actual vs. predicted evaluations with 6 k-means clusters.
print("Creating kmeans plot")
kmeans_score_plot(model, X_np, y_np, 6)

In [None]:
# Fit a separate 6-cluster k-means on the model's predictions so its centers
# can be inspected below.
# NOTE(review): this is a new fit, not the one made inside kmeans_score_plot,
# and no random_state is set in either place, so the two clusterings may not
# match exactly.
preds = model.predict(X_np)
kmeans = KMeans(n_clusters=6)
kmeans.fit(preds)

In [None]:
# Display the fitted cluster centers.
kmeans.cluster_centers_

In [None]:
# Flatten the cluster centers to 1-D and sort them in ascending order.
centers = sorted(kmeans.cluster_centers_.ravel())
print(centers)

# Print the midpoint between each pair of neighbouring centers.
for left, right in zip(centers, centers[1:]):
    print((left + right) / 2)

# Diagrams for the blog post and presentation

In [None]:
from IPython.display import Image

## Unique pieces moved

In [None]:
# Build the example position from its FEN string and show it inline.
board = chess.Board('r1bqkb1r/pp3ppp/2n1p3/1B1pPn2/3P1P2/5N2/PP4PP/RNBQK2R b KQkq - 4 8')
display(board)

# Write the same position to an SVG file.
svg = chess.svg.board(board)
with open('../images/board_unique.svg', 'w') as fp:
    fp.write(svg)

## Material, king mobility, can-castle

In [None]:
# Build the example position from its FEN string and show it inline.
board = chess.Board('r3kb1r/pp3ppp/1qb1p3/3pPn2/3P1P2/2N2N2/PP4PP/R1BQ1RK1 b kq - 1 11')
display(board)

# Squares to mark on the exported diagram: d7, e7 and f2.
# NOTE(review): presumably squares the kings can move to; confirm.
squares = chess.SquareSet([chess.D7, chess.E7, chess.F2])

# Write the position, with the marked squares, to an SVG file.
svg = chess.svg.board(board, squares=squares)
with open('../images/board_kings.svg', 'w') as fp:
    fp.write(svg)

## Pawn islands, isolated pawns

In [None]:
# Start from an empty board and place only pawns on it.
board = chess.Board()
board.clear()
# (square, piece type, color) for every pawn in the diagram.
pieces = [
    (chess.A4, chess.PAWN, chess.WHITE),
    (chess.B5, chess.PAWN, chess.WHITE),
    (chess.E2, chess.PAWN, chess.WHITE),
    (chess.G3, chess.PAWN, chess.WHITE),
    (chess.D6, chess.PAWN, chess.BLACK),
    (chess.E6, chess.PAWN, chess.BLACK),
    (chess.F7, chess.PAWN, chess.BLACK),
    (chess.H4, chess.PAWN, chess.BLACK),
]

for square, piece_type, color in pieces:
    board.set_piece_at(square, chess.Piece(piece_type, color))

display(board)

# Mark the pawns that have a same-colored pawn on an adjacent file
# (a4-b5 for White, d6-e6-f7 for Black); the unmarked pawns stand alone.
squares = chess.SquareSet([chess.A4, chess.B5, chess.D6, chess.E6, chess.F7])

# Write the position, with the marked squares, to an SVG file.
svg = chess.svg.board(board, squares=squares)
with open('../images/board_pawns.svg', 'w') as fp:
    fp.write(svg)

In [None]:
# Same pawn layout as the previous diagram, plus a white pawn on c4 and a
# black pawn on g6. This version is only displayed, not saved.
board = chess.Board()
board.clear()
# (square, piece type, color) for every pawn in the diagram.
pieces = [
    (chess.A4, chess.PAWN, chess.WHITE),
    (chess.B5, chess.PAWN, chess.WHITE),
    (chess.C4, chess.PAWN, chess.WHITE),
    (chess.E2, chess.PAWN, chess.WHITE),
    (chess.G3, chess.PAWN, chess.WHITE),
    (chess.D6, chess.PAWN, chess.BLACK),
    (chess.E6, chess.PAWN, chess.BLACK),
    (chess.F7, chess.PAWN, chess.BLACK),
    (chess.G6, chess.PAWN, chess.BLACK),
    (chess.H4, chess.PAWN, chess.BLACK),
]

for square, piece_type, color in pieces:
    board.set_piece_at(square, chess.Piece(piece_type, color))

display(board)

## Attackers/defenders

In [None]:
# Build the example position from its FEN string.
board = chess.Board("2r2rk1/p3bppp/1pq1p3/2NpPn2/1PbP1P2/P4N2/1B3RPP/R2Q2K1 w - - 0 18")

# The square under discussion (d4) is marked on the diagram.
squares = [chess.D4]

# One arrow per piece bearing on d4: red for White's pieces, blue for Black's.
arrows = [
    # White pieces
    chess.svg.Arrow(chess.B2, chess.D4, color='red'),
    chess.svg.Arrow(chess.D1, chess.D4, color='red'),
    chess.svg.Arrow(chess.F3, chess.D4, color='red'),

    # Black pieces
    chess.svg.Arrow(chess.F5, chess.D4, color='blue'),
]

# Write the annotated position to an SVG file.
svg = chess.svg.board(board, squares=squares, arrows=arrows)
with open('../images/board_center.svg', 'w') as fp:
    fp.write(svg)
    
# Convert the SVG to a 500-dpi PNG with the Inkscape command-line tool, then
# show the PNG inline.
!inkscape --export-type=png --export-dpi=500 ../images/board_center.svg

Image(filename='../images/board_center.png', width=512)

## Pins/forks/skewers

In [None]:
# Start from an empty board and place only the pieces needed for the example.
board = chess.Board()
board.clear()
# (square, piece type, color) for every piece in the diagram.
pieces = [
    # White queen forks bishop/rook, pins rook to king
    (chess.D4, chess.QUEEN, chess.WHITE),
    (chess.B4, chess.BISHOP, chess.BLACK),
    (chess.F6, chess.ROOK, chess.BLACK),
    (chess.G7, chess.KING, chess.BLACK),
    # Black bishop skewers king and rook
    (chess.B7, chess.BISHOP, chess.BLACK),
    (chess.G2, chess.KING, chess.WHITE),
    (chess.H1, chess.ROOK, chess.WHITE),
]

for square, piece_type, color in pieces:
    board.set_piece_at(square, chess.Piece(piece_type, color))

# Arrow colors: red = pin, blue = fork, green = skewer.
arrows = [
    # White queen pin
    chess.svg.Arrow(chess.D4, chess.G7, color='red'),
    # White queen fork
    chess.svg.Arrow(chess.D4, chess.B4, color='blue'),
    chess.svg.Arrow(chess.D4, chess.F6, color='blue'),
    # Black bishop skewer
    chess.svg.Arrow(chess.B7, chess.H1, color='green'),
]

# Write the annotated position to an SVG file.
svg = chess.svg.board(board, arrows=arrows)
with open('../images/board_pins_skewers_forks.svg', 'w') as fp:
    fp.write(svg)
    
# Convert the SVG to a 500-dpi PNG with the Inkscape command-line tool, then
# show the PNG inline.
!inkscape --export-type=png --export-dpi=500 ../images/board_pins_skewers_forks.svg

Image(filename='../images/board_pins_skewers_forks.png', width=512)

In [None]:
# Build the example position from its FEN string.
board = chess.Board('2r2rk1/4bppp/1p2p3/3pPn2/pP1P1PP1/P1qQ1N2/5R1P/R5K1 w - - 0 22')

# Forks:
#  Black queen -> white queen, rook
#  White queen -> black queen, knight
# Pins:
#  Black queen -> d4 pawn?
#  Black bishop -> b4,a3 pawns
# Skewers:
#  White queen -> black knight
#  Black queen -> white queen

# Arrow colors in this diagram: blue = pin, red = skewer, green = fork.
arrows = [
    ### Pins
    # Black queen -> d4, e5 pawns
    chess.svg.Arrow(chess.C3, chess.E5, color='blue'),
    # Black bishop -> b4, a3 pawns
    chess.svg.Arrow(chess.E7, chess.A3, color='blue'),

    ### Skewers
    # Black queen -> white queen, knight
    chess.svg.Arrow(chess.C3, chess.F3, color='red'),
    # White queen -> black knight, h7 pawn
    chess.svg.Arrow(chess.D3, chess.H7, color='red'),

    ### Forks
    # Black queen -> white queen, rook, pawns
    chess.svg.Arrow(chess.C3, chess.A1, color='green'),
    chess.svg.Arrow(chess.C3, chess.D3, color='green'),
    chess.svg.Arrow(chess.C3, chess.D4, color='green'),
    chess.svg.Arrow(chess.C3, chess.B4, color='green'),
    chess.svg.Arrow(chess.C3, chess.A3, color='green'),
    # White queen -> black queen, knight
    chess.svg.Arrow(chess.D3, chess.C3, color='green'),
    chess.svg.Arrow(chess.D3, chess.F5, color='green'),
]

# Write the annotated position to an SVG file.
svg = chess.svg.board(board, arrows=arrows)
with open('../images/board_pins_skewers_forks2.svg', 'w') as fp:
    fp.write(svg)
    
# Convert the SVG to a 500-dpi PNG with the Inkscape command-line tool, then
# show the PNG inline.
!inkscape --export-type=png --export-dpi=500 ../images/board_pins_skewers_forks2.svg

Image(filename='../images/board_pins_skewers_forks2.png', width=512)

# Visualizing opening popularity by rating range

This section draws on work that Jordan did in his analysis.

In [None]:
df = pd.read_csv('../all_games.csv', low_memory=False)

# Keep only games in which both players have a calibrated (non-'?') rating,
# then make the rating columns integers.
df = df[(df['White Rating'] != '?') & (df['Black Rating'] != '?')]
df = df.astype({'White Rating': int, 'Black Rating': int})

# Derived columns: absolute rating gap and the mean rating of the two players.
df['Rating Diff'] = (df['White Rating'] - df['Black Rating']).abs()
df['Avg Rating'] = (df['White Rating'] + df['Black Rating']) / 2

In [None]:
def opening_prune(op):
    """Return the part of an opening name before the first ':', '#' or ','.

    Trailing whitespace is stripped from the result; leading whitespace is
    kept.
    """
    # Only the text before the first separator is needed, so split once.
    head = re.split('[:#,]', op, maxsplit=1)[0]
    return head.rstrip()

# Shortened opening name for every game, computed by opening_prune.
df['Opening Short'] = df['Opening'].apply(opening_prune)

In [None]:
# Rating bucket edges; each consecutive pair forms a half-open range
# [min, max). A repeated 2500 edge was dropped: its only effect was an empty
# [2500, 2500) bucket.
RatingRanges = [0, 1200, 1400, 1600, 1800, 2000, 2200, 2500, np.inf]

# Collect one stats frame per rating range and concatenate once at the end;
# DataFrame.append no longer exists in pandas 2.x and re-copies the
# accumulated frame on every call.
range_frames = []

for (rating_min, rating_max) in zip(RatingRanges, RatingRanges[1:]):
    # A game counts for a range only when BOTH players' ratings fall in it.
    df_rate = df[(df['White Rating'] >= rating_min) & (df['Black Rating'] >= rating_min) &
                (df['White Rating'] < rating_max) & (df['Black Rating'] < rating_max)]
    rating_range = f"[{rating_min}, {rating_max})"
    print(f'Popular Openings in Rating Range: {rating_range}')
    # Both series come out most-frequent-first, so each range's rows are
    # ordered by popularity.
    counts = df_rate['Opening Short'].value_counts()
    percentage = df_rate['Opening Short'].value_counts(normalize=True).mul(100)
    open_stats = pd.DataFrame({'counts': counts, 'percentage': percentage})
    open_stats.index.name = 'Opening'
    open_stats = open_stats.reset_index()
    open_stats['Rating min'] = rating_min
    open_stats['Rating max'] = rating_max
    open_stats['Rating range'] = rating_range  # Nicely-formatted string
    range_frames.append(open_stats)

# reset_index() without drop keeps each row's within-range position as an
# 'index' column and gives the combined frame a unique index.
openings = pd.concat(range_frames).reset_index()

In [None]:
# Get the N most popular openings at each rating range
N = 7
popular_openings = set()

for rating_range, df_rate in openings.groupby('Rating range'):
    popular_openings.update(df_rate['Opening'].head(N))

print(len(popular_openings))
popular_openings
openings.head()

In [None]:
# When True, percentages are rescaled so that the popular openings of each
# rating range sum to 1. When False, the leftover share of each range is
# recorded as an "All other openings" row in `other_openings`.
NORMALIZE = True

# Keep only the rows of openings that are popular in at least one rating
# range. The copy allows in-place edits without touching `openings`.
top_openings = openings[openings['Opening'].isin(popular_openings)].copy()

# Report any popular opening that has no row for some rating range.
all_ranges = set(openings['Rating range'].unique())
for opening, sub_df in top_openings.groupby('Opening'):
    for rating_range in sorted(all_ranges - set(sub_df['Rating range'])):
        print(f"Opening {opening!r} has no games in rating range {rating_range}")

other_percents = dict()
other_openings = list()
for rating_range, sub_df in top_openings.groupby('Rating range'):
    percentages = sub_df['percentage'].sum()

    if NORMALIZE:
        # Rescale this range's rows so that they sum to 1.
        top_openings.loc[sub_df.index, 'percentage'] = sub_df['percentage'] / percentages
    else:
        other_percentages = 100 - percentages
        other_percents[rating_range] = other_percentages
        rating_min, rating_max = sub_df['Rating min'].unique()[0], sub_df['Rating max'].unique()[0]
        # Values follow the column order of `openings`:
        # index, Opening, counts, percentage, Rating min, Rating max, Rating range.
        new_row = [0, "All other openings", -1, other_percentages, rating_min, rating_max, rating_range]
        other_openings.append(new_row)

# Highest rating range first and, within a range, most popular opening first.
top_openings = (
    top_openings
    .sort_values(by=['Rating min', 'percentage'], ascending=False)
    .reset_index(drop=True)
)
top_openings

In [None]:
# For every opening, the position (within the sorted rating-range labels) of
# the range in which it has its largest share. The plot below uses this to
# choose which bar segment receives the opening's text label.
most_openings = dict()
rating_ranges = sorted(top_openings['Rating range'].unique())

for opening, sub_df in top_openings.groupby('Opening'):
    # Which rating range uses this opening the most?
    peak_range = sub_df.loc[sub_df['percentage'].idxmax(), 'Rating range']

    # Get the index that its bar will have on the plot below.
    most_openings[opening] = rating_ranges.index(peak_range)

most_openings

In [None]:
sns.set_style('white')

fig, ax = plt.subplots(figsize=(12, 8))

# Stacking order: the openings of the highest rating range, in the order they
# appear in top_openings.
top_rating_min = top_openings['Rating min'].max()
unique_openings = list(top_openings[top_openings['Rating min'] == top_rating_min]['Opening'])
prev_percents = np.zeros(len(rating_ranges))

sorted_openings = top_openings.sort_values(by='Rating range')

bar_width = 0.90

for opening in unique_openings:
    # Each column is one rating range, and the stacked bars represent openings.
    sub_df = sorted_openings[sorted_openings['Opening'] == opening]
    # Align the heights to rating_ranges by label, not by row position, so a
    # range with no row for this opening becomes a zero-height segment.
    percentages = (
        sub_df.set_index('Rating range')['percentage']
        .reindex(rating_ranges, fill_value=0)
        .to_numpy()
    )
    splot = ax.bar(rating_ranges, percentages, bottom=prev_percents, label=opening, width=bar_width)

    # Annotate one of the bars with the opening name, centered in the segment.
    annotate_bar = splot.patches[most_openings[opening]]
    x = annotate_bar.get_x() + annotate_bar.get_width() / 2
    y = annotate_bar.get_y() + annotate_bar.get_height() / 2
    ax.text(x, y, opening, horizontalalignment='center', fontsize=8)

    prev_percents += percentages

ax.set_xlabel("Player skill level (ELO) range")
ax.set_ylabel("Share of openings")
ax.set_yticklabels([])
sns.despine(left=True)

fig.savefig('../images/openings_vs_skill.png', dpi=1000)