### 1. Imports and data loading

In [None]:
# Imports
import pandas as pd
import numpy as np
import chess

In [None]:
# Load the raw games CSV into a DataFrame; the bare `df` on the last line
# makes the notebook display a preview of it
df = pd.read_csv("data/chess_games_raw.csv")
df

### 2. Initial data exploration 

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Summary statistics (pandas describes the numeric columns by default)
df.describe()

In [None]:
# How many rows share each distinct value of the Date column
df['Date'].value_counts()

### 3. Drop columns that are irrelevant or have many missing values

In [None]:
# Remove the unused metadata columns, then remove every row whose ECO or
# Moves value is missing
df = (
    df.drop(columns=['Site', 'Round', 'PlyCount', 'EventDate'])
      .dropna(subset=['ECO', 'Moves'])
)
# Show what is still missing after the cleanup
df.isnull().sum()

### 4. Convert data types

In [None]:
# Cast both rating columns to int in a single call.
# NOTE(review): astype(int) raises if either column still contains a missing
# value; only ECO and Moves were checked for nulls above - confirm the data.
df = df.astype({'WhiteElo': int, 'BlackElo': int})
df.info()

### 5. Remove rows for unfinished games (result `*`)

In [None]:
# Keep a row only when its Result is a white win, a black win or a draw;
# every other value is filtered out
df = df.loc[df['Result'].isin(['1-0', '0-1', '1/2-1/2'])]
# Show how many games remain per result
df['Result'].value_counts()

### 6. Extract the first 10 moves and the board state after move 10

In [None]:
import chess

def extract_early_game_data(moves_str, max_plies=20):
    """Extract the opening moves, the game length and the position after the opening.

    Each whitespace-separated token of ``moves_str`` is counted as one ply
    (half-move) and is replayed as a SAN move on a fresh board.

    Args:
        moves_str: The game's move list as a single string. Missing values
            (None/NaN) and the empty string are accepted.
        max_plies: Number of plies that make up the opening. The default of
            20 corresponds to 10 full moves.

    Returns:
        A tuple ``(first_moves, num_plies, board_fen)`` where

        * ``first_moves`` is the first ``max_plies`` tokens joined by spaces
          ('' when there are no moves),
        * ``num_plies`` is the total number of tokens in ``moves_str``,
        * ``board_fen`` is the FEN of the position after ``max_plies`` plies,
          or None when the game is shorter than that or when one of the
          opening moves cannot be played on the board.
    """
    if pd.isna(moves_str) or moves_str == '':
        return '', 0, None

    moves_list = str(moves_str).split()
    num_plies = len(moves_list)
    opening = moves_list[:max_plies]
    first_moves = ' '.join(opening)

    # A game that ends before the cut-off has no position to report, so there
    # is no point in replaying it.
    if num_plies < max_plies:
        return first_moves, num_plies, None

    board = chess.Board()
    try:
        for move_san in opening:
            board.push_san(move_san)
    except ValueError:
        # python-chess reports invalid, illegal and ambiguous SAN as
        # ValueError (or a subclass). A partially replayed board is not the
        # position after `max_plies` plies, so report it as unavailable
        # instead of returning a misleading FEN.
        return first_moves, num_plies, None

    return first_moves, num_plies, board.fen()

# Run the extractor once per game and build all three columns from the
# resulting tuples in one DataFrame; wrapping every row's result in its own
# pd.Series inside apply() is much slower on large tables.
early_game_columns = ['FirstTenMoves', 'TotalPlies', 'BoardStateAtMove20']
early_game = pd.DataFrame(
    df['Moves'].map(extract_early_game_data).tolist(),
    index=df.index,
    columns=early_game_columns,
)
# assign() overwrites the columns if this cell is re-run
df = df.assign(**{col: early_game[col] for col in early_game_columns})

# Keep only games with at least 10 full moves (20 plies) that also have a
# board state, so the exported position column has no gaps
df = df[(df['TotalPlies'] >= 20) & df['BoardStateAtMove20'].notna()]

### 7. Parse and group game type into 3 categories

In [None]:
# Inspect the distinct Event values together with how often each occurs
# (df['Event'].unique() would list the values without the counts)
df['Event'].value_counts()

In [None]:
def extract_time_class(event):
    """Map an event name to 'blitz', 'rapid' or 'classical'.

    The event is converted to a lowercase string and searched for each label
    in that order; the first label found as a substring is returned. When
    none of them occurs, NaN is returned.
    """
    event_name = str(event).lower()
    # Order matters: an event mentioning several labels gets the first match
    for time_class in ('blitz', 'rapid', 'classical'):
        if time_class in event_name:
            return time_class
    return np.nan

# Label every game with its time class, then discard the games whose event
# name matched none of the classes (their label is NaN)
df['EventType'] = df['Event'].map(extract_time_class)
df = df.dropna(subset=['EventType'])
# dropna=False would reveal any NaN label that survived the filter
df['EventType'].value_counts(dropna=False)

### 8. Drop duplicates 

In [None]:
# Show the rows that repeat an earlier row in every column
df[df.duplicated()]

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates()

### 8. Drop columns not used as features

In [None]:
# The full move list and the ply count were only needed for filtering, so
# they are removed before export
df = df.drop(columns=['Moves', 'TotalPlies'])
df

### 9. Export cleaned data

In [None]:
# Write the cleaned table to disk; index=False leaves the DataFrame index out
# of the file
df.to_csv('data/chess_games_clean.csv', index=False)