### 1. Import and data loading

In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Read the cleaned chess games CSV into a DataFrame and display it
CLEAN_GAMES_CSV = "data/chess_games_clean.csv"
df = pd.read_csv(CLEAN_GAMES_CSV)
df

Unnamed: 0,Event,Date,White,Black,Result,ECO,WhiteElo,BlackElo,EventType
0,Rated Blitz game,2024.01.01,Peixeiro,VaRYemezAmca72,1-0,A14,2504,2371,blitz
1,Rated Blitz game,2024.01.01,bayad2016,Yoda-wins,1/2-1/2,E01,2487,2596,blitz
2,Rated Blitz game,2024.01.01,FantacticEman,why_this_bot_exists,1/2-1/2,D35,2915,2921,blitz
3,Rated Blitz game,2024.01.01,Chigorinez,Luhrman,1-0,B40,2542,2433,blitz
4,Rated Blitz game,2024.01.01,syad_bony77,GERINDRA_BOS,0-1,C48,2473,2658,blitz
...,...,...,...,...,...,...,...,...,...
3266127,Rated Blitz game,2024.12.31,AfricanJo,Yuri_Guenther,1/2-1/2,C47,2665,2655,blitz
3266128,Rated Blitz game,2024.12.31,cad2024,emiliofelixramirez,1-0,D23,2537,2449,blitz
3266129,Rated Blitz game,2024.12.31,learningchessreally,BiletskiyDanylo,0-1,A15,2583,2557,blitz
3266130,Rated Blitz game,2024.12.31,lotrisking,ezis71,1-0,D30,2513,2398,blitz


### 2. Variable encoding
Convert `Result` to numeric for machine learning

In [3]:
# Numeric score per result string: '1-0' -> 1, '0-1' -> 0, draw -> 0.5
result_map = dict([('1-0', 1), ('0-1', 0), ('1/2-1/2', 0.5)])
# Any result string missing from result_map comes out of .map() as NaN
df['ResultNumeric'] = df['Result'].map(result_map)

Also convert the `EventType` to numeric for machine learning

In [4]:
# Integer code per event type (blitz=0, rapid=1, classical=2)
event_type_map = dict(blitz=0, rapid=1, classical=2)
df['EventTypeID'] = df['EventType'].map(event_type_map)
df.head(10)

Unnamed: 0,Event,Date,White,Black,Result,ECO,WhiteElo,BlackElo,EventType,ResultNumeric,EventTypeID
0,Rated Blitz game,2024.01.01,Peixeiro,VaRYemezAmca72,1-0,A14,2504,2371,blitz,1.0,0
1,Rated Blitz game,2024.01.01,bayad2016,Yoda-wins,1/2-1/2,E01,2487,2596,blitz,0.5,0
2,Rated Blitz game,2024.01.01,FantacticEman,why_this_bot_exists,1/2-1/2,D35,2915,2921,blitz,0.5,0
3,Rated Blitz game,2024.01.01,Chigorinez,Luhrman,1-0,B40,2542,2433,blitz,1.0,0
4,Rated Blitz game,2024.01.01,syad_bony77,GERINDRA_BOS,0-1,C48,2473,2658,blitz,0.0,0
5,Rated Blitz game,2024.01.01,falerito,estaka1,0-1,A08,2404,2534,blitz,0.0,0
6,Rated Blitz game,2024.01.01,TorBot_SL,ToromBot,1/2-1/2,E61,2860,2922,blitz,0.5,0
7,Rated Blitz game,2024.01.01,CPU2006,Phalanx-XXV,1-0,D46,2752,2395,blitz,1.0,0
8,Rated Blitz game,2024.01.01,MadderRose,Philidor23,1-0,B38,2538,2430,blitz,1.0,0
9,Rated Blitz game,2024.01.01,MassterofMayhem,Pliukha_Mikhail,1-0,B01,2604,2727,blitz,1.0,0


### 3. Basic feature engineering
Create new numeric features for machine learning.

In [5]:
white_elo = df['WhiteElo']
black_elo = df['BlackElo']

# Size of the rating gap, regardless of which side is higher rated
df['RatingDiff'] = (white_elo - black_elo).abs()

# Mean of the two players' ratings
df['AvgRating'] = (white_elo + black_elo) / 2

# 1 when White is rated strictly higher than Black, otherwise 0 (ties give 0)
df['WhiteHigherRated'] = white_elo.gt(black_elo).astype(int)

df

Unnamed: 0,Event,Date,White,Black,Result,ECO,WhiteElo,BlackElo,EventType,ResultNumeric,EventTypeID,RatingDiff,AvgRating,WhiteHigherRated
0,Rated Blitz game,2024.01.01,Peixeiro,VaRYemezAmca72,1-0,A14,2504,2371,blitz,1.0,0,133,2437.5,1
1,Rated Blitz game,2024.01.01,bayad2016,Yoda-wins,1/2-1/2,E01,2487,2596,blitz,0.5,0,109,2541.5,0
2,Rated Blitz game,2024.01.01,FantacticEman,why_this_bot_exists,1/2-1/2,D35,2915,2921,blitz,0.5,0,6,2918.0,0
3,Rated Blitz game,2024.01.01,Chigorinez,Luhrman,1-0,B40,2542,2433,blitz,1.0,0,109,2487.5,1
4,Rated Blitz game,2024.01.01,syad_bony77,GERINDRA_BOS,0-1,C48,2473,2658,blitz,0.0,0,185,2565.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3266127,Rated Blitz game,2024.12.31,AfricanJo,Yuri_Guenther,1/2-1/2,C47,2665,2655,blitz,0.5,0,10,2660.0,1
3266128,Rated Blitz game,2024.12.31,cad2024,emiliofelixramirez,1-0,D23,2537,2449,blitz,1.0,0,88,2493.0,1
3266129,Rated Blitz game,2024.12.31,learningchessreally,BiletskiyDanylo,0-1,A15,2583,2557,blitz,0.0,0,26,2570.0,1
3266130,Rated Blitz game,2024.12.31,lotrisking,ezis71,1-0,D30,2513,2398,blitz,1.0,0,115,2455.5,1


### 4. Opening encoding (ECO)
Since there are almost 500 different ECO codes, we keep only the 20 most frequently used ones and group all the rest into a single `Other` category.

In [6]:
# Get top 20 ECO codes (value_counts orders codes from most to least frequent)
top_20_eco = df['ECO'].value_counts().head(20).index.tolist()


def group_eco(code):
    """Return ``code`` if it is one of the top-20 ECO codes, else ``'Other'``.

    Scalar helper kept for single values; the ``ECOGroup`` column below is
    built with vectorized pandas operations rather than calling this per row.
    """
    if code in top_20_eco:
        return code
    return 'Other'


# Group less frequent ECO codes into 'Other'.
# isin/where runs vectorized instead of one Python call per row (~3.3M rows);
# rows whose ECO is not in the top 20 (including missing values) become 'Other'.
df['ECOGroup'] = df['ECO'].where(df['ECO'].isin(top_20_eco), 'Other')

# Create a mapping for encoding: 'Other' -> 0, top-20 codes -> 1..20 by rank
eco_list = ['Other'] + top_20_eco
eco_to_id = {}
for position, code in enumerate(eco_list):
    # setdefault keeps the first position of a label, like list.index() does
    eco_to_id.setdefault(code, position)

# Dict lookup replaces a linear eco_list.index() scan on every row
df['ECOID'] = df['ECOGroup'].map(eco_to_id)
df.head()


Unnamed: 0,Event,Date,White,Black,Result,ECO,WhiteElo,BlackElo,EventType,ResultNumeric,EventTypeID,RatingDiff,AvgRating,WhiteHigherRated,ECOGroup,ECOID
0,Rated Blitz game,2024.01.01,Peixeiro,VaRYemezAmca72,1-0,A14,2504,2371,blitz,1.0,0,133,2437.5,1,Other,0
1,Rated Blitz game,2024.01.01,bayad2016,Yoda-wins,1/2-1/2,E01,2487,2596,blitz,0.5,0,109,2541.5,0,Other,0
2,Rated Blitz game,2024.01.01,FantacticEman,why_this_bot_exists,1/2-1/2,D35,2915,2921,blitz,0.5,0,6,2918.0,0,Other,0
3,Rated Blitz game,2024.01.01,Chigorinez,Luhrman,1-0,B40,2542,2433,blitz,1.0,0,109,2487.5,1,B40,10
4,Rated Blitz game,2024.01.01,syad_bony77,GERINDRA_BOS,0-1,C48,2473,2658,blitz,0.0,0,185,2565.5,0,Other,0


### 5. Remove unneeded columns

In [7]:
# Drop 'Event'; 'EventType' is used instead
df = df.drop(columns=['Event'])

In [9]:
# Print a summary of the frame: index range, column dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3266132 entries, 0 to 3266131
Data columns (total 15 columns):
 #   Column            Dtype  
---  ------            -----  
 0   Date              object 
 1   White             object 
 2   Black             object 
 3   Result            object 
 4   ECO               object 
 5   WhiteElo          int64  
 6   BlackElo          int64  
 7   EventType         object 
 8   ResultNumeric     float64
 9   EventTypeID       int64  
 10  RatingDiff        int64  
 11  AvgRating         float64
 12  WhiteHigherRated  int64  
 13  ECOGroup          object 
 14  ECOID             int64  
dtypes: float64(2), int64(6), object(7)
memory usage: 373.8+ MB
