In [1]:
import chess.pgn
import chess
import chess.engine
import math
import numpy
import pandas as pd
import os
import pyarrow
import logging
import re

# Configure logging
logging.basicConfig(filename='process.log', level=logging.INFO, 
                    format='%(asctime)s - %(message)s', datefmt='%H:%M:%S')

final_path = './lichess_2016-09_valid_games.feather'

df = pd.read_feather(final_path)
len(df)

3744304

In [2]:
df = df.dropna(subset=['WhiteRatingDiff', 'BlackRatingDiff'])
len(df)

3742370

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3742370 entries, 2 to 6813118
Data columns (total 14 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   Event            object
 1   Site             object
 2   White            object
 3   Black            object
 4   Result           object
 5   WhiteElo         object
 6   BlackElo         object
 7   WhiteRatingDiff  object
 8   BlackRatingDiff  object
 9   ECO              object
 10  Opening          object
 11  TimeControl      object
 12  Termination      object
 13  Moves            object
dtypes: object(14)
memory usage: 428.3+ MB


In [4]:
df.drop(['Event', 'Site', 'Black', 'BlackRatingDiff', 'ECO', 'TimeControl', 'Termination', 'Moves'], axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3742370 entries, 2 to 6813118
Data columns (total 6 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   White            object
 1   Result           object
 2   WhiteElo         object
 3   BlackElo         object
 4   WhiteRatingDiff  object
 5   Opening          object
dtypes: object(6)
memory usage: 199.9+ MB


In [5]:
df['Result'] = df['Result'].replace({'1-0': 1, '0-1': -1, '1/2-1/2': 0})
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3742370 entries, 2 to 6813118
Data columns (total 6 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   White            object
 1   Result           int64 
 2   WhiteElo         object
 3   BlackElo         object
 4   WhiteRatingDiff  object
 5   Opening          object
dtypes: int64(1), object(5)
memory usage: 199.9+ MB


  df['Result'] = df['Result'].replace({'1-0': 1, '0-1': -1, '1/2-1/2': 0})


In [6]:
df['WhiteElo'] = df['WhiteElo'].astype(int)
df['BlackElo'] = df['BlackElo'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3742370 entries, 2 to 6813118
Data columns (total 6 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   White            object
 1   Result           int64 
 2   WhiteElo         int64 
 3   BlackElo         int64 
 4   WhiteRatingDiff  object
 5   Opening          object
dtypes: int64(3), object(3)
memory usage: 199.9+ MB


In [7]:
df['WhiteRatingDiff'] = df['WhiteRatingDiff'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3742370 entries, 2 to 6813118
Data columns (total 6 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   White            object
 1   Result           int64 
 2   WhiteElo         int64 
 3   BlackElo         int64 
 4   WhiteRatingDiff  int64 
 5   Opening          object
dtypes: int64(4), object(2)
memory usage: 199.9+ MB


In [8]:
int_columns = df.select_dtypes(include=['int64']).columns

for col in int_columns:
    df[col] = pd.to_numeric(df[col], downcast='integer')

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3742370 entries, 2 to 6813118
Data columns (total 6 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   White            object
 1   Result           int8  
 2   WhiteElo         int16 
 3   BlackElo         int16 
 4   WhiteRatingDiff  int16 
 5   Opening          object
dtypes: int16(3), int8(1), object(2)
memory usage: 110.6+ MB


In [9]:
df['White'].value_counts()

White
bobificher      999
valmasia3       912
chaabanesami    886
r2d227          826
philippe941     784
               ... 
joaovitordf       1
HelderViana       1
Ayre              1
raezhugo          1
kamburmen         1
Name: count, Length: 115117, dtype: int64

In [10]:
games_played_per = df['White'].value_counts()
lt_fifty_games_played = games_played_per[games_played_per < 50]
lt_fifty_games_played.sum()

1117487

In [11]:
games_played_per = df['White'].value_counts()

lt_fifty_games_played_player = games_played_per[games_played_per < 50].index

df_filtered = df[~df['White'].isin(lt_fifty_games_played_player)]
df_filtered['White'].value_counts()

White
bobificher      999
valmasia3       912
chaabanesami    886
r2d227          826
philippe941     784
               ... 
Kent85           50
jerr68           50
szaszynkowo      50
SirOchsen        50
MrLight          50
Name: count, Length: 23924, dtype: int64

In [12]:
df = df_filtered

In [13]:
df = df[(df['Opening'] != '?')]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2623531 entries, 2 to 6813118
Data columns (total 6 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   White            object
 1   Result           int8  
 2   WhiteElo         int16 
 3   BlackElo         int16 
 4   WhiteRatingDiff  int16 
 5   Opening          object
dtypes: int16(3), int8(1), object(2)
memory usage: 77.6+ MB


In [14]:
df.loc[:, 'EloDiff'] = df['WhiteElo'] - df['BlackElo']
df = df.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'EloDiff'] = df['WhiteElo'] - df['BlackElo']


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2623531 entries, 2 to 6813118
Data columns (total 7 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   White            object
 1   Result           int8  
 2   WhiteElo         int16 
 3   BlackElo         int16 
 4   WhiteRatingDiff  int16 
 5   Opening          object
 6   EloDiff          int16 
dtypes: int16(4), int8(1), object(2)
memory usage: 82.6+ MB


In [16]:
df.drop(['WhiteElo', 'BlackElo', 'WhiteRatingDiff'], axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2623531 entries, 2 to 6813118
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   White    object
 1   Result   int8  
 2   Opening  object
 3   EloDiff  int16 
dtypes: int16(1), int8(1), object(2)
memory usage: 67.6+ MB


In [20]:
df['Opening'] = df['Opening'].str.split(':').str[0].str.rstrip()
df['Opening'] = df['Opening'].str.split(',').str[0].str.rstrip()
df['Opening'] = df['Opening'].str.split('#').str[0].str.rstrip()
pd.set_option('display.max_rows', None)
df['Opening'].value_counts()

Opening
Sicilian Defense                     333900
French Defense                       186077
Queen's Pawn Game                    168758
Scandinavian Defense                 122180
King's Pawn Game                     108790
Italian Game                          99673
Philidor Defense                      98807
Caro-Kann Defense                     83916
English Opening                       79892
Ruy Lopez                             73410
Bishop's Opening                      62286
Scotch Game                           52006
Modern Defense                        48022
Queen's Gambit Declined               46498
Van't Kruijs Opening                  46390
Indian Game                           44646
Pirc Defense                          41350
Zukertort Opening                     36214
Horwitz Defense                       35146
King's Gambit Accepted                33308
Nimzowitsch Defense                   32948
Slav Defense                          30470
Four Knights Game       

In [21]:
pd.reset_option('display.max_rows')

In [25]:
games_played_per = df['Opening'].value_counts()

lt_300_games_played = games_played_per[games_played_per < 300].index

df_filtered = df[~df['Opening'].isin(lt_300_games_played)]
df_filtered['Opening'].value_counts()

Opening
Sicilian Defense           333900
French Defense             186077
Queen's Pawn Game          168758
Scandinavian Defense       122180
King's Pawn Game           108790
                            ...  
Wade Defense                  427
Creepy Crawly Formation       392
Rubinstein Opening            349
Benko Gambit Declined         327
Center Game Accepted          310
Name: count, Length: 129, dtype: int64

In [26]:
len(df)

2623531