In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

In [2]:
chessdata = pd.read_csv('cleanedCleanedChessdata.csv')

As the data is still absolutely massive, we split the dataset into bullet, blitz, rapid and classical, which are time formats, for which each player is allowed a set amount of time to play their moves. They represent, generally, 1 minute, 3 minute, 10 minute and 30 minute games respectively. We further split the data based ond on ranking, also remove the data on each players' clock, as it boils down to whether a game is forfeited on time, which is provided in another row, 'Termination' which has either Normal(checkmate) or time forfeit

In [None]:
for x in range (1,201):
    chessdata=chessdata.drop(f'Clock_ply_{x}', axis=1)
    chessdata=chessdata.drop(f'Eval_ply_{x}', axis=1)

In [None]:
chessdata=chessdata.drop('Unnamed: 0.1',axis=1)
chessdata=chessdata.drop('Unnamed: 0',axis=1)
chessdata=chessdata.drop('Index.1',axis=1)
chessdata=chessdata.drop('BlackRatingDiff',axis=1)
chessdata=chessdata.drop('WhiteRatingDiff',axis=1)
chessdata=chessdata.drop('Date',axis=1)



In [None]:
chessdata['ECO'].describe()

count     197768
unique       461
top          A00
freq        9112
Name: ECO, dtype: object

In [None]:

# calculate average rating and rating difference as numeric predictors
chessdata['ELO Diff'] = chessdata['WhiteElo'] - chessdata['BlackElo']
chessdata['Ave ELO']=.5*(chessdata['WhiteElo'] +chessdata['BlackElo'])


In [None]:
chessdata['ELO Diff'].describe()

count    197768.000000
mean         -0.505714
std         129.740981
min       -1454.000000
25%         -44.000000
50%           0.000000
75%          43.000000
max        1406.000000
Name: ELO Diff, dtype: float64

In [None]:
blitzData=chessdata.loc[chessdata['Category'] == 'Blitz']
rapidData=chessdata.loc[chessdata['Category'] == 'Rapid']
bulletData=chessdata.loc[chessdata['Category'] == 'Bullet']
classicalData=chessdata.loc[chessdata['Category'] == 'Classical']


In [None]:
chessdata['Ave ELO'].describe()

count    197768.000000
mean       1511.641170
std         315.124362
min         800.000000
25%        1281.000000
50%        1504.500000
75%        1729.000000
max        2960.000000
Name: Ave ELO, dtype: float64

we will use the above stats to construct our low, mid and high threshholds

In [None]:
blitzLowRatingData=blitzData[blitzData['Ave ELO'].between(0, 1300, inclusive=False)]
blitzMidRatingData=blitzData[blitzData['Ave ELO'].between(1300, 1700, inclusive=True)]
blitzHighRatingData=blitzData[blitzData['Ave ELO'].between(1700, 3500, inclusive=False)]


rapidLowRatingData=rapidData[rapidData['Ave ELO'].between(0, 1300, inclusive=False)]
rapidMidRatingData=rapidData[rapidData['Ave ELO'].between(1300, 1700, inclusive=True)]
rapidHighRatingData=rapidData[rapidData['Ave ELO'].between(1700, 3500, inclusive=False)]


bulletLowRatingData=bulletData[bulletData['Ave ELO'].between(0, 1300, inclusive=False)]
bulletMidRatingData=bulletData[bulletData['Ave ELO'].between(1300, 1700, inclusive=True)]
bulletHighRatingData=bulletData[bulletData['Ave ELO'].between(1700, 3500, inclusive=False)]


classicalLowRatingData=classicalData[classicalData['Ave ELO'].between(0, 1300, inclusive=False)]
classicalMidRatingData=classicalData[classicalData['Ave ELO'].between(1300, 1700, inclusive=True)]
classicalHighRatingData=classicalData[classicalData['Ave ELO'].between(1700, 3500, inclusive=False)]


  blitzLowRatingData=blitzData[blitzData['Ave ELO'].between(0, 1300, inclusive=False)]
  blitzMidRatingData=blitzData[blitzData['Ave ELO'].between(1300, 1700, inclusive=True)]
  blitzHighRatingData=blitzData[blitzData['Ave ELO'].between(1700, 3500, inclusive=False)]
  rapidLowRatingData=rapidData[rapidData['Ave ELO'].between(0, 1300, inclusive=False)]
  rapidMidRatingData=rapidData[rapidData['Ave ELO'].between(1300, 1700, inclusive=True)]
  rapidHighRatingData=rapidData[rapidData['Ave ELO'].between(1700, 3500, inclusive=False)]
  bulletLowRatingData=bulletData[bulletData['Ave ELO'].between(0, 1300, inclusive=False)]
  bulletMidRatingData=bulletData[bulletData['Ave ELO'].between(1300, 1700, inclusive=True)]
  bulletHighRatingData=bulletData[bulletData['Ave ELO'].between(1700, 3500, inclusive=False)]
  classicalLowRatingData=classicalData[classicalData['Ave ELO'].between(0, 1300, inclusive=False)]
  classicalMidRatingData=classicalData[classicalData['Ave ELO'].between(1300, 1700, inclusi

And, export the data which is finally ready to be worked on

In [None]:
blitzLowRatingData.to_csv('blitzLowRatingData.csv')
blitzMidRatingData.to_csv('blitzMidRatingData.csv')
blitzHighRatingData.to_csv('blitzHighingData.csv')

bulletLowRatingData.to_csv('bulletLowRatingData.csv')
bulletMidRatingData.to_csv('bulletMidRatingData.csv')
bulletHighRatingData.to_csv('bulletHighingData.csv')

rapidLowRatingData.to_csv('rapidLowRatingData.csv')
rapidMidRatingData.to_csv('rapidMidRatingData.csv')
rapidHighRatingData.to_csv('rapidHighingData.csv')

classicalLowRatingData.to_csv('classicalLowRatingData.csv')
classicalMidRatingData.to_csv('classicalMidRatingData.csv')
classicalHighRatingData.to_csv('classicalHighingData.csv')

We move first to blitz mid rating, as it is our largest dataset, and may require more steps including random sampling