# Liberally remove all games with imperfect data

In [1]:
import chess.pgn
import math
import numpy
import pandas as pd
import os
import pyarrow

feather_path = '../lichess_db_standard_rated_2016-09.feather'

In [24]:
# Easily reusable function for future game imports
def delete_bad_games(delete_from_df):
    """Run every bad-game filter against ``delete_from_df``, in place.

    Applies, in order, the removal of games whose 'Termination' is
    'Abandoned', 'Unterminated' and 'Rules infraction'. The per-filter
    removal counts are discarded; nothing is returned.
    """
    filters = (
        delete_abandoned,
        delete_unterminated,
        delete_rules_infraction,
    )
    for remove_games in filters:
        remove_games(delete_from_df)

In [23]:
def delete_abandoned(delete_from_df):
    """Drop, in place, every row whose 'Termination' is 'Abandoned'.

    Returns the number of rows removed from ``delete_from_df``.
    """
    rows_at_start = len(delete_from_df)
    is_abandoned = delete_from_df['Termination'] == 'Abandoned'
    # Filtering the mask by itself keeps only the True entries, so its index
    # holds exactly the labels of the abandoned games.
    abandoned_labels = is_abandoned[is_abandoned].index
    delete_from_df.drop(index=abandoned_labels, inplace=True)
    return rows_at_start - len(delete_from_df)

def delete_unterminated(delete_from_df):
    """Drop, in place, every row whose 'Termination' is 'Unterminated'.

    Returns the number of rows removed from ``delete_from_df``.
    """
    rows_at_start = len(delete_from_df)
    is_unterminated = delete_from_df['Termination'] == 'Unterminated'
    # Only the True entries survive the self-filter; their index labels are
    # the rows to remove.
    unterminated_labels = is_unterminated[is_unterminated].index
    delete_from_df.drop(index=unterminated_labels, inplace=True)
    return rows_at_start - len(delete_from_df)

def delete_rules_infraction(delete_from_df):
    """Drop, in place, every row whose 'Termination' is 'Rules infraction'.

    Returns the number of rows removed from ``delete_from_df``.
    """
    rows_at_start = len(delete_from_df)
    is_infraction = delete_from_df['Termination'] == 'Rules infraction'
    # Self-filtering the mask leaves just the flagged rows, whose index
    # labels are then dropped from the frame.
    infraction_labels = is_infraction[is_infraction].index
    delete_from_df.drop(index=infraction_labels, inplace=True)
    return rows_at_start - len(delete_from_df)

In [2]:
# Load the cached games if the feather file is present; otherwise start from
# an empty frame so the cell still runs.
if os.path.exists(feather_path):
    df = pd.read_feather(feather_path)
else:
    df = pd.DataFrame()
len(df)

6813113

### Consider termination type

In [3]:
df['Termination'].value_counts()

Termination
Normal              4635102
Time forfeit        2143022
Abandoned             34852
Rules infraction        132
Unterminated              5
Name: count, dtype: int64

#### Consider deleting abandoned games

In [5]:
abandoned_df = df[df['Termination'] == 'Abandoned']

In [7]:
abandoned_df.head()

Unnamed: 0,Event,Site,White,Black,Result,WhiteElo,BlackElo,WhiteRatingDiff,BlackRatingDiff,ECO,Opening,TimeControl,Termination,Moves
1,Rated Classical tournament https://lichess.org...,https://lichess.org/q3tbWY1W,Fherhuitron,Alequine,0-1,1346,1474,-25,8,?,?,600+0,Abandoned,
22,Rated Classical tournament https://lichess.org...,https://lichess.org/I9FzSzig,Teagy,knightowl16,1-0,1955,1937,10,-12,B00,King's Pawn,600+0,Abandoned,1. e4
26,Rated Blitz tournament https://lichess.org/tou...,https://lichess.org/XHWjOWL3,elgranmanny,Ded_Ahmed,0-1,1750,1756,-11,12,?,?,180+0,Abandoned,
28,Rated Classical tournament https://lichess.org...,https://lichess.org/2mQSSU3s,Jorlu,fandeev,0-1,1927,1910,-13,12,?,?,600+0,Abandoned,
32,Rated Blitz tournament https://lichess.org/tou...,https://lichess.org/PeJIqKDh,misadr2,salarabdi,1-0,1718,1727,12,-17,B00,King's Pawn,300+0,Abandoned,1. e4


#### Delete abandoned games

In [10]:
delete_abandoned(df)

34852

#### Consider deleting unterminated games

In [12]:
unterminated_df = df[df['Termination'] == 'Unterminated']

In [14]:
unterminated_df[:3]

Unnamed: 0,Event,Site,White,Black,Result,WhiteElo,BlackElo,WhiteRatingDiff,BlackRatingDiff,ECO,Opening,TimeControl,Termination,Moves
518570,Rated Correspondence game,https://lichess.org/NuHYVTDl,Quasselstrippe,obelisk,*,1894,1937,,,A20,English Opening: King's English Variation,-,Unterminated,1. c4 e5 2. g3 d6 3. Bg2 g6 4. b3 f5 5. Bb2 Bg...
947053,Rated Correspondence game,https://lichess.org/Mkvwinr1,cunha18,carnefrita29,*,1311,1885,,,B20,Sicilian Defense: Staunton-Cochrane Variation,-,Unterminated,1. e4 c5 2. c4 Nc6 3. Nc3 e5 4. Nf3 d6 5. d3 g...
1605273,Rated Correspondence game,https://lichess.org/hUaPi7zA,Rikzz,bon-jovi,*,2002,1981,,,B06,Robatsch (Modern) Defense,-,Unterminated,1. e4 g6 2. d4 Bg7 3. c3 d6 4. Bc4 Nf6 5. Qc2 ...


In [15]:
unterminated_df['Event'].value_counts()

Event
Rated Correspondence game    5
Name: count, dtype: int64

#### Delete unterminated games

In [16]:
delete_unterminated(df)

5

In [21]:
rules_infraction_df = df[df['Termination'] == 'Rules infraction']

In [22]:
rules_infraction_df[:3]

Unnamed: 0,Event,Site,White,Black,Result,WhiteElo,BlackElo,WhiteRatingDiff,BlackRatingDiff,ECO,Opening,TimeControl,Termination,Moves
21983,Rated Blitz game,https://lichess.org/HTFLtODp,naotomate,G4lvezLuciano,1-0,1621,1564,9,-11,D51,"Queen's Gambit Declined: Modern, Knight Defens...",180+0,Rules infraction,1. d4 Nf6 2. c4 e6 3. Nc3 d5 4. Bg5 Nbd7 5. e3 h6
22156,Rated Blitz game,https://lichess.org/EtthZozB,GrofodHadzica,G4lvezLuciano,1-0,1658,1553,11,-9,C42,Russian Game: Urusov Gambit,300+0,Rules infraction,1. e4 e5 2. Nf3 Nf6 3. Bc4 Nxe4 4. Nxe5 d5 5. ...
63994,Rated Classical game,https://lichess.org/y6vJXofx,bulychev691,TuxTux1,1-0,1771,1741,9,-88,C84,Ruy Lopez: Closed Variations,600+10,Rules infraction,1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 4. Ba4 Nf6 5. O-...


#### The Lichess website reports that cheating was detected in each of these games
#### Delete rules infraction games

In [25]:
delete_rules_infraction(df)

132