In [None]:
from matplotlib import rcParams
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Strips the first 5 lines from the raw .txt dump (they contain no useful
# information) and saves the remainder to a new file.
#
# The input is streamed instead of being loaded with readlines(), so the dump
# (described as 2.5GB by the loading step) never has to fit in memory at once,
# and the `with` block closes both handles even if the copy fails midway.

HEADER_LINE_COUNT = 5

with open('games_original.txt', "r") as file, open("games.txt", "w") as new_file:
    for _ in range(HEADER_LINE_COUNT):
        # The default keeps a file shorter than the header from raising StopIteration.
        next(file, None)
    # Everything after the header is copied through unchanged.
    new_file.writelines(file)

In [None]:
# Note: this step takes a long time because of the file size (2.5GB).
# Reading in the data and converting it to a Pandas dataframe.

# The separator is longer than one character, which only the python parsing
# engine supports. Requesting that engine explicitly avoids the ParserWarning
# pandas emits when it has to fall back from the C engine on its own.
raw_data = pd.read_csv('games.txt', delimiter=" ### ", header=None, engine="python")

# Column 0 holds the space-separated metadata fields, column 1 the move text.
# NOTE(review): "DELETE THIS" presumably absorbs a trailing token produced by
# the split — confirm against the raw file.
column_names = ["#", "date", "result", "welo", "belo", "len", "date_c", "resu_c", "welo_c", "belo_c", "edate_c", "setup", "fen", "resu2_c", "oyrange", "bad_len", "DELETE THIS"]
data = pd.DataFrame(raw_data[0].str.split(" ").to_list(), columns=column_names)

data = data.drop(columns=['DELETE THIS'])

# Attach the move text next to the metadata and give it a readable name.
data = pd.concat([data, raw_data[1]], axis=1)
data = data.rename(columns={1: 'moves'})
data['len'] = data['len'].astype(int)
data.head(5)

In [None]:
# Inspect the column labels of the freshly loaded frame.
data.columns

In [None]:

# Data cleaning.
# Removing the rows and columns that are of no use for the analysis.
is_custom_setup = data["setup"] != "setup_false"
has_no_result = data["result"] == "*"  # "*" means that the result is corrupted or missing.
is_too_short = data["len"] < 4  # Rows where the len is smaller than 4.
data.drop(index=data.index[is_custom_setup | has_no_result | is_too_short], inplace=True)

helper_columns = ['resu_c', 'welo_c', 'belo_c', 'setup', 'fen', 'oyrange', 'bad_len', 'resu2_c', 'edate_c', 'date_c']
data.drop(columns=helper_columns, inplace=True)

# Changing datatypes: the string "None" becomes NaN so the ratings can be floats.
for elo_column in ("welo", "belo"):
    data[elo_column] = data[elo_column].replace("None", np.nan).astype("float")

data.head()

In [None]:
# Inspect the column labels again (the preceding cell drops several columns).
data.columns

In [None]:
# Share of all games per outcome, printed as a percentage with two decimals.
total_games = len(data)
for label, outcome in (("White winrate", "1-0"), ("Black winrate", "0-1"), ("Draw", "1/2-1/2")):
    outcome_games = int((data["result"] == outcome).sum())
    print(f"{label}: {round(outcome_games / total_games * 100, 2)}%")

In [None]:
# Found it out just in case some ideas pop up and there is enough time left to implement them
#
# Builds a table counting the games for every combination of white and black
# Elo bucket, including a "None-None" bucket for missing ratings.

rcParams["figure.figsize"] = 12, 12

elo_ranges = [[0, 500], [501, 1000], [1001, 1500], [1501, 2000], [2001, 2500], [2501, 3000], ["None", "None"]]

# White elo ranges on x
# Black elo ranges on y
x_columns = [str(x[0]) + "-" + str(x[1]) for x in elo_ranges]
y_columns = list(x_columns)
elo_clusters = pd.DataFrame(columns=x_columns, index=y_columns)
print(elo_clusters.columns)

dataset = data


def _elo_bucket_masks(ratings):
    """Return a dict mapping each bucket label to a boolean mask over `ratings`.

    Both bucket bounds are inclusive. The previous `>= low` and `< high` test
    left ratings of exactly 500, 1000, ... outside every bucket, because the
    next bucket only starts at `high + 1`. The "None-None" bucket selects the
    missing ratings.
    """
    masks = {}
    for low, high in elo_ranges:
        label = str(low) + "-" + str(high)
        if label == "None-None":
            masks[label] = ratings.isnull()
        else:
            masks[label] = ratings.between(low, high)
    return masks


# Each mask is computed once instead of once per (white, black) combination.
white_masks = _elo_bucket_masks(dataset["welo"])
black_masks = _elo_bucket_masks(dataset["belo"])

for white_elo, white_mask in white_masks.items():
    for black_elo, black_mask in black_masks.items():
        # A single .loc write replaces the chained elo_clusters[col][row] = ...
        # assignment, which is not guaranteed to write through to the frame.
        elo_clusters.loc[black_elo, white_elo] = int((white_mask & black_mask).sum())

elo_clusters

In [None]:
# DO NOT DELETE
# PRIIT PÄRN IS WORKING ON THIS
# Work on a separate frame that keeps only the columns needed for the opening
# analysis, so `data` itself stays untouched.
data_copy = data.drop(columns=['date', 'welo', 'belo', 'len'])
# Keep only the decisive games, i.e. leave out every game that ended in a draw.
data_copy = data_copy[~(data_copy["result"] == "1/2-1/2")]
data_copy.head()

In [None]:
# DO NOT DELETE
# PRIIT PÄRN IS WORKING ON THIS
#
# For every decisive game, collect the first moves played by the winner.

OPENING_MOVE_COUNT = 3  # Previously 5.


def _winner_opening(move_text, side, limit=OPENING_MOVE_COUNT):
    """Return the first `limit` moves of `side` ("W" or "B") from one game.

    `move_text` is the space-separated move list of the game. A token belongs
    to `side` when it starts with that letter, and the move itself is the part
    after the first "." (tokens presumably look like "W1.e4" — confirm against
    the raw data). Every move is prefixed with a space, so the result looks
    like " e4 Nf3 Bc4".

    `startswith` is used instead of indexing the first character so that empty
    tokens (e.g. from doubled spaces) are skipped instead of raising IndexError.
    """
    picked = []
    for token in move_text.split(" "):
        if len(picked) == limit:
            break
        if token.startswith(side):
            picked.append(token.split(".")[1])
    return "".join(" " + move for move in picked)


# "1-0" means the white player won; every other remaining result is treated as
# a black win, exactly like the former if/else over the rows.
# Iterating the two columns directly avoids building a Series per row, which
# is what iterrows() does.
winnersMoves = [
    _winner_opening(move_text, "W" if result == "1-0" else "B")
    for result, move_text in zip(data_copy["result"], data_copy["moves"])
]

winnerMovesDataFrame = pd.DataFrame(data=winnersMoves, columns=['Moves'])
winnerMovesDataFrame.head()

In [None]:
# Bar chart of the five most frequent winner openings.
# value_counts() sorts by frequency in descending order, so the first five
# entries are the five most common move sequences.
weights = winnerMovesDataFrame["Moves"].value_counts()
top_openings = weights.head(5)
values = list(top_openings)
keys = list(top_openings.index)

# NOTE(review): the move sequences are replaced by hard-coded opening names,
# which presumably match the ranking found on the original dataset — verify
# them whenever the data changes.
opening_names = [
    "Elephant gambit",
    "Queen's gambit",
    "Center game",
    "King's Indian defence",
    "The Ruy Lopez",
]
for position, opening_name in enumerate(opening_names):
    keys[position] = opening_name

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.bar(keys, values)
plt.xticks(rotation='vertical')
plt.xlabel("Most popular openings")
plt.ylabel("Amount of games")
plt.title('Top 5 openings in winning games')
plt.show()

In [None]:
# Data preparation for finding winrates of the first moves.
moves = data["moves"]

# Walk the games once and pull out the first two tokens of every game; the move
# itself is the part after the "." in each token.
white_first_moves = []
black_first_moves = []
for move_text in moves:
    plies = move_text.split(" ")
    white_first_moves.append(plies[0].split(".")[1])
    black_first_moves.append(plies[1].split(".")[1])

first_move_data = pd.DataFrame({
    "result": data["result"],
    "welo": data["welo"],
    "belo": data["belo"],
    "w_move": white_first_moves,
    "b_move": black_first_moves,
})

first_move_data

In [None]:
# Finds winning rates for the first move for black and white.
#
# For every Elo range one row of two subplots is drawn: the left one groups the
# games by white's first move, the right one by black's first move.

rcParams["figure.figsize"] = 15, 30
elo_ranges = [[0, 500], [501, 1000], [1001, 1500], [1501, 2000], [2001, 2500], [2501, 3000]]
BAR_WIDTH = 0.25
MIN_MOVE_SHARE_PERCENT = 1

dataset = first_move_data


def _first_move_winrates(games, move_column, win_result, loss_result):
    """Count wins, losses, draws and total games per distinct first move.

    `games` is a slice of `first_move_data`, `move_column` is "w_move" or
    "b_move", and `win_result` / `loss_result` are the result strings that
    count as a win / loss for the side being analysed. The returned frame is
    indexed by move and has the columns wins, losses, draws and total_games.

    One groupby pass replaces rescanning the whole frame for every move and
    every outcome.
    """
    outcome = games["result"]
    flags = pd.DataFrame({
        "wins": (outcome == win_result).astype(int),
        "losses": (outcome == loss_result).astype(int),
        "draws": (outcome == "1/2-1/2").astype(int),
        "total_games": 1,
    }, index=games.index)
    # sort=False keeps the moves in order of first appearance.
    table = flags.groupby(games[move_column], sort=False).sum()
    table.index.name = None

    # Filters out moves which have been played less than 1% of times.
    share = table["total_games"] / table["total_games"].sum() * 100
    return table[share >= MIN_MOVE_SHARE_PERCENT]


def _plot_winrates(table, title, subplot_index):
    """Draw grouped win/loss/draw rate bars for `table` into one subplot."""
    positions = np.arange(len(table))
    ax = plt.subplot(len(elo_ranges), 2, subplot_index)
    for offset, column in enumerate(("wins", "losses", "draws")):
        # An Elo range without games gives an empty table; a single zero keeps
        # the bar call well-formed, as the original fallback did.
        rates = (table[column] / table["total_games"]).tolist() or [0]
        ax.bar(positions + offset * BAR_WIDTH, rates, width=BAR_WIDTH, label=column)
    ax.set_title(title)
    # Ticks sit under the middle bar of each group.
    ax.set_xticks(positions + BAR_WIDTH)
    ax.set_xticklabels(table.index)
    ax.legend(bbox_to_anchor=(0.93, 1.05), loc="upper left")


for row_number, (min_elo, max_elo) in enumerate(elo_ranges):
    elo = str(min_elo) + "-" + str(max_elo)
    print(elo) # Just to indicate how far the process is.

    # Both bounds are inclusive: with `< max_elo` a rating of exactly 500,
    # 1000, ... fell between two ranges and was never counted.
    white_games = dataset[dataset["welo"].between(min_elo, max_elo)]
    black_games = dataset[dataset["belo"].between(min_elo, max_elo)]

    white_winrates = _first_move_winrates(white_games, "w_move", "1-0", "0-1")
    black_winrates = _first_move_winrates(black_games, "b_move", "0-1", "1-0")

    _plot_winrates(white_winrates, "white " + elo, 2 * row_number + 1)
    _plot_winrates(black_winrates, "black " + elo, 2 * row_number + 2)

In [None]:
# Prints which share of all games falls into each Elo range, first by the white
# player's rating and then by the black player's rating.
elo_ranges = [[0, 500], [501, 1000], [1001, 1500], [1501, 2000], [2001, 2500], [2501, 3000]]
total_games = len(data)

for side, elo_column in (("White", "welo"), ("Black", "belo")):
    for elo in elo_ranges:
        elo_name = str(elo[0]) + "-" + str(elo[1])
        # Both bounds are inclusive: with `< elo[1]` a rating of exactly 500,
        # 1000, ... belonged to no range at all.
        games_count = int(data[elo_column].between(elo[0], elo[1]).sum())
        percentage = round(games_count / total_games * 100, 2)
        print("{} games in {} elo range: {}%".format(side, elo_name, str(percentage)))

In [None]:
# Data preparation for finding checks percentage.
# Counts, per game, how many moves of each side contain a "+".
moves = data["moves"]
# NOTE: despite their names these two lists hold check counts per game; the
# names are kept in case other cells refer to them.
white_moves = []
black_moves = []

for move_seq in moves:
    white_check_counter = 0
    black_check_counter = 0
    for move in move_seq.split(" "):
        if "+" not in move:
            continue
        # The side is decided by the token prefix. The former `"B" in move`
        # test also matched a "B" inside the move itself (the bishop letter),
        # so a white bishop check was counted for black as well.
        if move.startswith("W"):
            white_check_counter += 1
        elif move.startswith("B"):
            black_check_counter += 1
    white_moves.append(white_check_counter)
    black_moves.append(black_check_counter)

checks_data = pd.DataFrame({
    "welo": data["welo"],
    "belo": data["belo"],
    "w_checks": white_moves,
    "b_checks": black_moves
})

checks_data

In [None]:
# Finding the average number of checks made in a game within different elo ranges.
elo_ranges = [[0, 500], [501, 1000], [1001, 1500], [1501, 2000], [2001, 2500], [2501, 3000]]
elo_labels = [str(elo[0]) + "-" + str(elo[1]) for elo in elo_ranges]
white_data = []
black_data = []

dataset = checks_data


def _average_checks(games, checks_column):
    """Return the mean of `checks_column` over `games`, rounded to one decimal.

    Returns 0 for an empty selection. The guard tests the number of games,
    which is the actual divisor, rather than the check total.
    """
    if len(games) == 0:
        return 0
    return round(games[checks_column].sum() / len(games), 1)


for (min_elo, max_elo), elo in zip(elo_ranges, elo_labels):
    print(elo) # Just to indicate how far the process is.

    # Both bounds are inclusive: with `< max_elo` a rating of exactly 500,
    # 1000, ... fell between two ranges and was never counted.
    white_games = dataset[dataset["welo"].between(min_elo, max_elo)]
    black_games = dataset[dataset["belo"].between(min_elo, max_elo)]

    white_data.append(_average_checks(white_games, "w_checks"))
    black_data.append(_average_checks(black_games, "b_checks"))


# Plotting the data
white_df = pd.DataFrame({
    "elo": elo_labels,
    "white": white_data
})

black_df = pd.DataFrame({
    "elo": elo_labels,
    "black": black_data
})

rcParams["figure.figsize"] = 7, 7
plt.xticks(rotation=45)
plt.title("Average number of checks in a game")
plt.xlabel("Elo range")
plt.ylabel("Number of checks")
# With the `data` argument the column names select the x and y values from each frame.
plt.plot("elo", "white", data = white_df)
plt.plot("elo", "black", data = black_df)
plt.legend(bbox_to_anchor=(0.93, 1.05), loc="upper left")