In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

Read CSV file

In [2]:
df2 = pd.read_csv('../csv/d2.csv')

Print first 5 rows

In [3]:
df2.head()

Unnamed: 0.1,Unnamed: 0,Event,Date,Round,White,Black,Result,BlackElo,ECO,Opening,Termination,TimeControl,WhiteElo,WhiteRatingDiff
0,0,Rated Blitz tournament https://lichess.org/tou...,2020.09.01,-,AttackSparrow,danicuva,1-0,2218,C00,French Defense: Schlechter Variation,Time forfeit,180+0,2460,2
1,1,Rated Blitz game,2020.09.01,-,onthewaygm,starkspieler,1-0,2424,E90,"King's Indian Defense: Normal Variation, Rare ...",Normal,180+0,2428,6
2,2,Rated Rapid tournament https://lichess.org/tou...,2020.09.01,-,OjaiJoao,FitzwilliamDarcy,1-0,2300,B06,Modern Defense: Standard Defense,Normal,600+5,2441,5
3,3,Rated Blitz tournament https://lichess.org/tou...,2020.09.01,-,WenceslaoRodrigo,zonrobla,0-1,2667,E71,King's Indian Defense: Makogonov Variation,Normal,180+1,2280,-2
4,4,Rated Blitz game,2020.09.01,-,HoldenHc,gg-gm-gmg,1-0,2682,A41,Queen's Pawn,Normal,180+0,2557,8


Print shape of dataframe (columns, rows)

In [4]:
df2.shape

(99913, 14)

Drop useless columns

In [5]:
df2.drop(['Unnamed: 0','Date', 'Round', 'Opening', 'WhiteRatingDiff', 'White', 'Black'], axis = 1, inplace = True)

Print 5 new rows

In [6]:
df2.head()

Unnamed: 0,Event,Result,BlackElo,ECO,Termination,TimeControl,WhiteElo
0,Rated Blitz tournament https://lichess.org/tou...,1-0,2218,C00,Time forfeit,180+0,2460
1,Rated Blitz game,1-0,2424,E90,Normal,180+0,2428
2,Rated Rapid tournament https://lichess.org/tou...,1-0,2300,B06,Normal,600+5,2441
3,Rated Blitz tournament https://lichess.org/tou...,0-1,2667,E71,Normal,180+1,2280
4,Rated Blitz game,1-0,2682,A41,Normal,180+0,2557


Checking for null values

In [7]:
df2.isnull().sum()

Event          0
Result         0
BlackElo       0
ECO            0
Termination    0
TimeControl    0
WhiteElo       0
dtype: int64

Drop NaN values

In [8]:
df2.dropna(inplace=True)

Change type of column *Event* into string

In [9]:
df2['Event'] = df2.Event.astype('string')

Print count of unique value in column *Event*

In [10]:
df2.Event.nunique()

655

Cut strings in column *Event*. This cell cut strings from 6 index of string

In [11]:
df2['Event'] = df2['Event'].map(lambda x: str(x)[6:])

Split string when 'tournament' appears and left values (string) before this word

In [12]:
df2['Event'] = df2['Event'].map(lambda x: x.split('tournament',1)[0])

Count values of column *Event*

In [13]:
df2.Event.value_counts()

Blitz game        75010
Blitz             23372
Rapid game         1238
Rapid               224
Classical game       62
Classical             7
Name: Event, dtype: int64

Merge similiar values into one

In [14]:
df2['Event'] = df2['Event'].replace(['Classical game', 'Classical '], 'Classical')
df2['Event'] = df2['Event'].replace(['Blitz game','Blitz '], 'Blitz')
df2['Event'] = df2['Event'].replace(['Rapid game','Rapid '], 'Rapid')

Replacing result values into numeric, because it will help in the future.

1 - means White wins
2 - means Black wins
3 - means it's a draw

In [15]:
df2['Result'] = df2['Result'].replace(['1-0'], 1)
df2['Result'] = df2['Result'].replace(['0-1'], 2)
df2['Result'] = df2['Result'].replace(['1/2-1/2'], 3)

Create new dataframe and drop rows where termination were *time forfeit* and it was a draw.

In [16]:
df2 = df2.drop(df2[(df2['Termination'] == 'Time forfeit') & (df2['Result'] == 3)].index)

Drop rows where termination were *rules infraction* or *abandoned* because I will focus only on games ended by checkmate or time forfeit

In [17]:
df2 = df2[df2['Termination'] != "Rules infraction"]
df2 = df2[df2['Termination'] != "Abandoned"]

Split strings in *TimeControl* column. Only values before *+* will stay.

In [18]:
df2['TimeControl'] = df2['TimeControl'].str.split('+').str[0]

In [19]:
df2['TimeControl'] = df2['TimeControl'].astype('int64')

In [20]:
df2.shape

(98770, 7)

In [21]:
df_blitz = df2[df2['Event'] == 'Blitz']
df_rapid = df2[df2['Event'] == 'Rapid']
df_classical = df2[df2['Event'] == 'Classical']

Create new column *EloDiff*. This column contain difference of Elo points between players. If value is lower than 0, it means that player who plays as Black had higher Elo.

In [22]:
df2['EloDiff'] = df2['WhiteElo'] - df2['BlackElo']
df2.head()

Unnamed: 0,Event,Result,BlackElo,ECO,Termination,TimeControl,WhiteElo,EloDiff
0,Blitz,1,2218,C00,Time forfeit,180,2460,242
1,Blitz,1,2424,E90,Normal,180,2428,4
2,Rapid,1,2300,B06,Normal,600,2441,141
3,Blitz,2,2667,E71,Normal,180,2280,-387
4,Blitz,1,2682,A41,Normal,180,2557,-125


In [23]:
df2_enc = df2

I'm changing values into numeric

In [24]:
df2_enc['Event'] = df2_enc['Event'].replace(['Blitz'], 0)
df2_enc['Event'] = df2_enc['Event'].replace(['Classical'], 2)
df2_enc['Event'] = df2_enc['Event'].replace(['Rapid'], 4)

In [25]:
df2_enc['Termination'] = df2_enc['Termination'].replace(['Normal'], 0)
df2_enc['Termination'] = df2_enc['Termination'].replace(['Time forfeit'], 1)

In [26]:
df2_enc = df2_enc[df2_enc['ECO'] != '?']

In [27]:
df2_enc['ECO'].nunique()

490

In [28]:
labelencoder = LabelEncoder()

In [29]:
df2_enc.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In this and next cells I'm changing values manually, because I need the same values in dataset D1 and D2

In [30]:
df2_enc.loc[99913,'ECO'] = 'D62'
df2_enc.loc[99914,'ECO'] = 'D98'
df2_enc.loc[99915,'ECO'] = 'E57'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_enc.loc[99913,'ECO'] = 'D62'


In [31]:
df2_enc.tail()

Unnamed: 0,Event,Result,BlackElo,ECO,Termination,TimeControl,WhiteElo,EloDiff
99911,0.0,2.0,2440.0,B98,0.0,180.0,2414.0,-26.0
99912,0.0,1.0,2543.0,A56,1.0,180.0,2363.0,-180.0
99913,,,,D62,,,,
99914,,,,D98,,,,
99915,,,,E57,,,,


In [32]:
df2_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98759 entries, 0 to 99915
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Event        98756 non-null  float64
 1   Result       98756 non-null  float64
 2   BlackElo     98756 non-null  float64
 3   ECO          98759 non-null  object 
 4   Termination  98756 non-null  float64
 5   TimeControl  98756 non-null  float64
 6   WhiteElo     98756 non-null  float64
 7   EloDiff      98756 non-null  float64
dtypes: float64(7), object(1)
memory usage: 8.8+ MB


In [33]:
df2_enc['ECO_enc'] = labelencoder.fit_transform(df2_enc['ECO'])

In [34]:
df2_enc.dropna(inplace=True)

In [35]:
df2_enc.tail()

Unnamed: 0,Event,Result,BlackElo,ECO,Termination,TimeControl,WhiteElo,EloDiff,ECO_enc
99908,0.0,2.0,2462.0,B12,1.0,180.0,2467.0,5.0,112
99909,0.0,3.0,2477.0,A45,0.0,180.0,2484.0,7.0,45
99910,0.0,1.0,2295.0,A05,1.0,180.0,2511.0,216.0,5
99911,0.0,2.0,2440.0,B98,0.0,180.0,2414.0,-26.0,198
99912,0.0,1.0,2543.0,A56,1.0,180.0,2363.0,-180.0,56


In [36]:
df2_enc['Event'] = df2_enc['Event'].astype('int64')
df2_enc['Result'] = df2_enc['Result'].astype('int64')
df2_enc['BlackElo'] = df2_enc['BlackElo'].astype('int64')
df2_enc['Termination'] = df2_enc['Termination'].astype('int64')
df2_enc['TimeControl'] = df2_enc['TimeControl'].astype('int64')
df2_enc['WhiteElo'] = df2_enc['WhiteElo'].astype('int64')
df2_enc['EloDiff'] = df2_enc['EloDiff'].astype('int64')

In [37]:
df2_enc['TimeControl_enc'] = df2_enc['TimeControl']

In [38]:
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(15, 34)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(60, 3)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(120, 5)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(180, 6)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(240, 7)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(300, 8)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(360, 9)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(420, 10)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(480, 11)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(600, 13)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(660, 14)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(780, 16)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(840, 17)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(900, 18)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(1200, 23)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(1500, 24)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(1800, 25)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(2700, 28)
df2_enc['TimeControl_enc'] = df2_enc['TimeControl_enc'].replace(5400, 30)

In [39]:
df2_enc.head()

Unnamed: 0,Event,Result,BlackElo,ECO,Termination,TimeControl,WhiteElo,EloDiff,ECO_enc,TimeControl_enc
0,0,1,2218,C00,1,180,2460,242,200,6
1,0,1,2424,E90,0,180,2428,4,483,6
2,4,1,2300,B06,0,600,2441,141,106,13
3,0,2,2667,E71,0,180,2280,-387,464,6
4,0,1,2682,A41,0,180,2557,-125,41,6


In [40]:
df2_final = df2_enc[['Result','WhiteElo','BlackElo','EloDiff','Event','ECO_enc','Termination','TimeControl_enc']]
df2_final

Unnamed: 0,Result,WhiteElo,BlackElo,EloDiff,Event,ECO_enc,Termination,TimeControl_enc
0,1,2460,2218,242,0,200,1,6
1,1,2428,2424,4,0,483,0,6
2,1,2441,2300,141,4,106,0,13
3,2,2280,2667,-387,0,464,0,6
4,1,2557,2682,-125,0,41,0,6
...,...,...,...,...,...,...,...,...
99908,2,2467,2462,5,0,112,1,6
99909,3,2484,2477,7,0,45,0,6
99910,1,2511,2295,216,0,5,1,6
99911,2,2414,2440,-26,0,198,0,6


In [41]:
df2_final.to_csv('../csv/d2_final.csv')