In [9]:
#import requests
#from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import time

import seaborn
from matplotlib import pyplot

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics
from sklearn.metrics import PrecisionRecallDisplay, mean_squared_error, mean_absolute_error

import statsmodels.api as sm
import statsmodels.formula.api as smf

Read `chess_games_cleaned.csv` into python:

In [10]:
chess_games_cleaned = pd.read_csv("chess_games_cleaned.csv")

chess_games_cleaned.head()

Unnamed: 0.1,Unnamed: 0,Result,WhiteElo,BlackElo,WhiteRatingDiff,ECO,Opening,TimeControl,Termination,Base (min),Increment (sec),Win_Rate
0,0,1-0,1901,1896,5,D10,slav defense,300+5,Time forfeit,5.0,5,0.490376
1,1,0-1,1641,1627,14,C20,king's pawn opening: 2.b3,300+0,Normal,5.0,0,0.41206
2,2,1-0,1647,1688,-41,B01,scandinavian defense: mieses-kotroc variation,180+0,Time forfeit,3.0,0,0.55855
3,3,0-1,1945,1900,45,B90,"sicilian defense: najdorf, lipnitsky attack",180+0,Time forfeit,3.0,0,0.444695
4,4,0-1,1773,1809,-36,C27,vienna game,180+0,Normal,3.0,0,0.579278


Feature engineering/cleaning:

In [32]:
#CLASSIFICATION FOR EACH OPENING --> Open Game, Semi-Open game, Semi-Closed game, Closed gam

#Created a column OpenGame - True if open game type, False otherwise
chess_games_cleaned["OpenGame"] = chess_games_cleaned["Opening"].str.contains("portuguese opening") | \
chess_games_cleaned["Opening"].str.contains("center pawn opening") | \
chess_games_cleaned["Opening"].str.contains("vienna game") | \
chess_games_cleaned["Opening"].str.contains("bishop's opening") | \
chess_games_cleaned["Opening"].str.contains("danish gambit") | \
chess_games_cleaned["Opening"].str.contains("center game") | \
chess_games_cleaned["Opening"].str.contains("alapin's opening") | \
chess_games_cleaned["Opening"].str.contains("ruy lopez") | \
chess_games_cleaned["Opening"].str.contains("ponziani opening") | \
chess_games_cleaned["Opening"].str.contains("three knights game") | \
chess_games_cleaned["Opening"].str.contains("four knights game") | \
chess_games_cleaned["Opening"].str.contains("italian game") | \
chess_games_cleaned["Opening"].str.contains("giuoco piano") | \
chess_games_cleaned["Opening"].str.contains("evans gambit") | \
chess_games_cleaned["Opening"].str.contains("hungarian defense") | \
chess_games_cleaned["Opening"].str.contains("two knights defense") | \
chess_games_cleaned["Opening"].str.contains("scotch game") | \
chess_games_cleaned["Opening"].str.contains("inverted hungarian opening") | \
chess_games_cleaned["Opening"].str.contains("konstantinopolsky opening") | \
chess_games_cleaned["Opening"].str.contains("elephant gambit") | \
chess_games_cleaned["Opening"].str.contains("philidor defense") | \
chess_games_cleaned["Opening"].str.contains("latvian gambit") | \
chess_games_cleaned["Opening"].str.contains("damiano defense") | \
chess_games_cleaned["Opening"].str.contains("petrov's defense") | \
chess_games_cleaned["Opening"].str.contains("greco defense") | \
chess_games_cleaned["Opening"].str.contains("napoleon opening") | \
chess_games_cleaned["Opening"].str.contains("king's gambit") | \
chess_games_cleaned["Opening"].str.contains("king's pawn opening") | \
chess_games_cleaned["Opening"].str.contains("danvers opening") | \
chess_games_cleaned["Opening"].str.contains("bongcloud attack")

#Created a column SemiOpenGame - True if semi-open game type, False otherwise
chess_games_cleaned["SemiOpenGame"] = chess_games_cleaned["Opening"].str.contains("corn stalk defense") | \
chess_games_cleaned["Opening"].str.contains("st. george defense") | \
chess_games_cleaned["Opening"].str.contains("lemming defense") | \
chess_games_cleaned["Opening"].str.contains("owen's defense") | \
chess_games_cleaned["Opening"].str.contains("sicilian defense") | \
chess_games_cleaned["Opening"].str.contains("caro-kann defense") | \
chess_games_cleaned["Opening"].str.contains("nimzowitch defense") | \
chess_games_cleaned["Opening"].str.contains("scandinavian defense") | \
chess_games_cleaned["Opening"].str.contains("balogh defense") | \
chess_games_cleaned["Opening"].str.contains("pirc defense") | \
chess_games_cleaned["Opening"].str.contains("french defense") | \
chess_games_cleaned["Opening"].str.contains("fred defense") | \
chess_games_cleaned["Opening"].str.contains("barnes defense") | \
chess_games_cleaned["Opening"].str.contains("alehkine's defense") | \
chess_games_cleaned["Opening"].str.contains("borg opening") | \
chess_games_cleaned["Opening"].str.contains("modern defense") | \
chess_games_cleaned["Opening"].str.contains("goldsmith defense") | \
chess_games_cleaned["Opening"].str.contains("carr defense") | \
chess_games_cleaned["Opening"].str.contains("adams defense")

#Created a column SemiClosedGame - True if semi-closed game type, False otherwise
chess_games_cleaned["SemiClosedGame"] = chess_games_cleaned["Opening"].str.contains("polish defense") | \
chess_games_cleaned["Opening"].str.contains("benoni defense") | \
chess_games_cleaned["Opening"].str.contains("queen's knight defense") | \
chess_games_cleaned["Opening"].str.contains("wade defense") | \
chess_games_cleaned["Opening"].str.contains("englund gambit") | \
chess_games_cleaned["Opening"].str.contains("english defense") | \
chess_games_cleaned["Opening"].str.contains("keres defense") | \
chess_games_cleaned["Opening"].str.contains("dutch defense") | \
chess_games_cleaned["Opening"].str.contains("indian game") | \
chess_games_cleaned["Opening"].str.contains("nimzo-indian defense") | \
chess_games_cleaned["Opening"].str.contains("queen's indian defense") | \
chess_games_cleaned["Opening"].str.contains("bogo–indian defense") | \
chess_games_cleaned["Opening"].str.contains("blumenfeld countergambit") | \
chess_games_cleaned["Opening"].str.contains("catalan opening") | \
chess_games_cleaned["Opening"].str.contains("king's indian defense") | \
chess_games_cleaned["Opening"].str.contains("benoni defense") | \
chess_games_cleaned["Opening"].str.contains("benko gambit") | \
chess_games_cleaned["Opening"].str.contains("old indian defense") | \
chess_games_cleaned["Opening"].str.contains("budapest gambit") | \
chess_games_cleaned["Opening"].str.contains("modern benoni")

#Created a column ClosedGame - True if closed game type, False otherwise
    # NOTE: "Queens Gambit" includes both Queen's gambit accepted and Queen's gambit declined
chess_games_cleaned["ClosedGame"] = chess_games_cleaned["Opening"].str.contains("queen's pawn") | \
chess_games_cleaned["Opening"].str.contains("closed game") | \
chess_games_cleaned["Opening"].str.contains("queen's gambit") | \
chess_games_cleaned["Opening"].str.contains("slav defense") | \
chess_games_cleaned["Opening"].str.contains("stonewall attack") | \
chess_games_cleaned["Opening"].str.contains("colle system") | \
chess_games_cleaned["Opening"].str.contains("richter-veresov attack") | \
chess_games_cleaned["Opening"].str.contains("torre attack") | \
chess_games_cleaned["Opening"].str.contains("symmetrical defense") | \
chess_games_cleaned["Opening"].str.contains("chigorin defense") | \
chess_games_cleaned["Opening"].str.contains("baltic defense") | \
chess_games_cleaned["Opening"].str.contains("marshall defense") | \
chess_games_cleaned["Opening"].str.contains("blackmar-diemer gambit") | \
chess_games_cleaned["Opening"].str.contains("colle system") | \
chess_games_cleaned["Opening"].str.contains("london system")

#Denoting each game's opening type played under column "OpeningType"
chess_games_cleaned.loc[(chess_games_cleaned['OpenGame'] == True), 'OpeningType'] = 'Open Game'
chess_games_cleaned.loc[(chess_games_cleaned['SemiOpenGame'] == True), 'OpeningType'] = 'Semi Open Game'
chess_games_cleaned.loc[(chess_games_cleaned['ClosedGame'] == True), 'OpeningType'] = 'Closed Game'
chess_games_cleaned.loc[(chess_games_cleaned['SemiClosedGame'] == True), 'OpeningType'] = 'Semi Closed Game'

#Creating a numeric column for the 4 Opening Types
chess_games_cleaned.loc[(chess_games_cleaned['OpenGame'] == True), 'OpeningType_Num'] = '1'
chess_games_cleaned.loc[(chess_games_cleaned['SemiOpenGame'] == True), 'OpeningType_Num'] = '2'
chess_games_cleaned.loc[(chess_games_cleaned['ClosedGame'] == True), 'OpeningType_Num'] = '3'
chess_games_cleaned.loc[(chess_games_cleaned['SemiClosedGame'] == True), 'OpeningType_Num'] = '4'

#Dropping additional columns created
chess_games_cleaned = chess_games_cleaned.drop(['OpenGame', 'SemiOpenGame', 'SemiClosedGame', 'ClosedGame'], axis = 1)

#Dropping NaNs in the OpeningType column, indicating openings that were not classified
chess_games_cleaned = chess_games_cleaned.dropna(subset = ['OpeningType'])

#Changing Results to Binary
chess_games_cleaned.loc[(chess_games_cleaned['Result'] == '1-0'), 'Result_Binary'] = 1
chess_games_cleaned.loc[(chess_games_cleaned['Result'] == '0-1'), 'Result_Binary'] = 0

#Drop NaNs in the Result_Binary column, which are draws; converted binary variable to ints
chess_games_cleaned = chess_games_cleaned.dropna(subset = ['Result_Binary'])
chess_games_cleaned['Result_Binary'] = chess_games_cleaned['Result_Binary'].astype(int)

le = preprocessing.LabelEncoder()

chess_games_cleaned['OpeningEnc'] = le.fit_transform(chess_games_cleaned['Opening'])

le = preprocessing.LabelEncoder()

chess_games_cleaned['ECOEnc'] = le.fit_transform(chess_games_cleaned['ECO'])

#print(chess_games_cleaned['Result_Binary'].unique())

display(chess_games_cleaned.head(50))

Unnamed: 0.1,Unnamed: 0,Result,WhiteElo,BlackElo,WhiteRatingDiff,ECO,Opening,TimeControl,Termination,Base (min),Increment (sec),Win_Rate,OpeningType,OpeningType_Num,Result_Binary,OpeningEnc,ECOEnc
0,0,1-0,1901,1896,5,D10,slav defense,300+5,Time forfeit,5.0,5,0.490376,Closed Game,3,1,744,194
1,1,0-1,1641,1627,14,C20,king's pawn opening: 2.b3,300+0,Normal,5.0,0,0.41206,Open Game,1,0,348,124
2,2,1-0,1647,1688,-41,B01,scandinavian defense: mieses-kotroc variation,180+0,Time forfeit,3.0,0,0.55855,Semi Open Game,2,1,580,37
3,3,0-1,1945,1900,45,B90,"sicilian defense: najdorf, lipnitsky attack",180+0,Time forfeit,3.0,0,0.444695,Semi Open Game,2,0,699,100
4,4,0-1,1773,1809,-36,C27,vienna game,180+0,Normal,3.0,0,0.579278,Open Game,1,0,770,131
5,5,0-1,1895,1886,9,B10,caro-kann defense: two knights attack,180+0,Time forfeit,3.0,0,0.52122,Semi Open Game,2,0,76,42
6,6,1-0,2155,2356,-201,D02,queen's pawn game: london system,180+0,Normal,3.0,0,0.499276,Closed Game,3,1,509,187
7,7,0-1,2010,2111,-101,A45,indian game,300+0,Normal,5.0,0,0.450352,Semi Closed Game,4,0,219,11
8,8,1-0,1764,1773,-9,B01,scandinavian defense: mieses-kotroc variation,180+0,Time forfeit,3.0,0,0.55855,Semi Open Game,2,1,580,37
9,9,0-1,1649,1638,11,C57,"italian game: two knights defense, traxler cou...",900+3,Normal,15.0,3,0.408377,Open Game,1,0,280,159


## Idea:

What is the significance of different factors in determining game outcome? Let's train Logistic Regression clasifiers to determine this:

Let's look at:
- ELO
- ELO Difference
- ECO
- Opening Type
- Opening

In [37]:
features_selected = chess_games_cleaned[["WhiteElo",
                                         "BlackElo",
                                         "WhiteRatingDiff",
                                         "OpeningType_Num",
                                         "OpeningEnc",
                                         "ECOEnc",
                                         "Result_Binary"]]

features_selected.head()

Unnamed: 0,WhiteElo,BlackElo,WhiteRatingDiff,OpeningType_Num,OpeningEnc,ECOEnc,Result_Binary
0,1901,1896,5,3,744,194,1
1,1641,1627,14,1,348,124,0
2,1647,1688,-41,2,580,37,1
3,1945,1900,45,2,699,100,0
4,1773,1809,-36,1,770,131,0


Train on these features:

In [47]:
 X_train, X_test, y_train, y_test = train_test_split(features_selected[["WhiteRatingDiff",
                                                                        "WhiteElo",
                                                                        "BlackElo",
                                                                        "ECOEnc",
                                                                        "OpeningType_Num",
                                                                        "OpeningEnc"]],
                                                     features_selected["Result_Binary"],
                                                     test_size=0.2, random_state=42)

In [48]:
clf = LogisticRegression(random_state=42).fit(X_train, y_train)

In [49]:
clf.score(X_test, y_test)

0.6537212951760323

In [50]:
clf.coef_

array([[ 3.18893174e-03,  1.59391854e-03, -1.59501320e-03,
         5.12135681e-04, -1.90346272e-06,  4.70606224e-05]])

From these results, it appears that ELO rating and features generated from ELO rating are the most significant predictors in determining game outcome; opening type and opening don't matter here. 

Let's train a single logistic regression on each of these features:

In [64]:
for each in ["WhiteRatingDiff", "WhiteElo", "BlackElo", "ECOEnc", "OpeningType_Num", "OpeningEnc"]:
    
    print(f"Training Logistic Regression on {each}")
    

    X_train, X_test, y_train, y_test = train_test_split(features_selected[each].to_numpy().reshape(-1, 1),
                                                        features_selected["Result_Binary"],
                                                        test_size=0.2, random_state=42)
    
    clf = LogisticRegression(random_state=42).fit(X_train, y_train)
    
    print(f"Score: {clf.score(X_test, y_test)}")

Training Logistic Regression on WhiteRatingDiff
Score: 0.6533259861761641
Training Logistic Regression on WhiteElo
Score: 0.5587153655410343
Training Logistic Regression on BlackElo
Score: 0.5599252506618431
Training Logistic Regression on ECOEnc
Score: 0.514344925070976
Training Logistic Regression on OpeningType_Num
Score: 0.5172438577366762
Training Logistic Regression on OpeningEnc
Score: 0.5172438577366762


As we can see, the logisitic regression trained on `WhiteRatingDiff` was the most accurate out of all of the fitted models. 