# 4. Feature Engineering

In [2]:
import pandas as pd
import numpy as np
import chess
import re
from collections import Counter, defaultdict

In [3]:
train_df = pd.read_csv('train_cleaned.csv')
test_df = pd.read_csv('test_cleaned.csv')

In [4]:
class ChessPuzzleFeatureEngineer:
    
    def __init__(self):
        self.piece_values = {'p': 1, 'n': 3, 'b': 3, 'r': 5, 'q': 9, 'k': 0}
        
    def create_advanced_features(self, df, sample_size=None):
        
        print("Starting Advanced Feature Engineering...")
        
        if sample_size:
            print(f"Working with sample of {sample_size} records")
            df = df.sample(n=min(sample_size, len(df)), random_state=42).reset_index(drop=True)
        
        features = self._create_basic_features(df)
        
        print("Creating enhanced success probability features...")
        success_features = self._create_success_probability_features(df)
        features = pd.concat([features, success_features], axis=1)
        
        print("Extracting chess position features...")
        position_features = self._create_position_features(df)
        features = pd.concat([features, position_features], axis=1)
        
        print("Analyzing move sequences...")
        move_features = self._create_move_sequence_features(df)
        features = pd.concat([features, move_features], axis=1)
        
        print("Processing themes and metadata...")
        theme_features = self._create_theme_features(df)
        features = pd.concat([features, theme_features], axis=1)
        
        print("Creating feature interactions...")
        interaction_features = self._create_interaction_features(features)
        features = pd.concat([features, interaction_features], axis=1)
        
        print(f"Feature engineering complete! Created {len(features.columns)} features")
        return features.fillna(0)
    
    def _create_basic_features(self, df):
        features = pd.DataFrame()
        features['PuzzleId'] = df['PuzzleId']
        
        success_cols = [col for col in df.columns if 'success_prob' in col]
        for col in success_cols:
            features[col] = df[col].fillna(0)
        
        return features
    
    def _create_success_probability_features(self, df):
        
        features = pd.DataFrame()
        
        success_cols = [col for col in df.columns if 'success_prob' in col]
        rapid_cols = [col for col in success_cols if 'rapid' in col]
        blitz_cols = [col for col in success_cols if 'blitz' in col]
        
        if not rapid_cols or not blitz_cols:
            return features
        
        features['rapid_mean'] = df[rapid_cols].mean(axis=1)
        features['rapid_std'] = df[rapid_cols].std(axis=1).fillna(0)
        features['rapid_min'] = df[rapid_cols].min(axis=1)
        features['rapid_max'] = df[rapid_cols].max(axis=1)
        features['rapid_range'] = features['rapid_max'] - features['rapid_min']
        
        features['blitz_mean'] = df[blitz_cols].mean(axis=1)
        features['blitz_std'] = df[blitz_cols].std(axis=1).fillna(0)
        features['blitz_min'] = df[blitz_cols].min(axis=1)
        features['blitz_max'] = df[blitz_cols].max(axis=1)
        features['blitz_range'] = features['blitz_max'] - features['blitz_min']
        
        features['rapid_blitz_diff'] = features['rapid_mean'] - features['blitz_mean']
        features['rapid_blitz_ratio'] = features['rapid_mean'] / (features['blitz_mean'] + 0.001)
        
        if 'success_prob_rapid_1050' in df.columns and 'success_prob_rapid_1150' in df.columns:
            features['skill_gap_1050_1150'] = df['success_prob_rapid_1150'] - df['success_prob_rapid_1050']
            features['skill_ratio_1050_1150'] = df['success_prob_rapid_1150'] / (df['success_prob_rapid_1050'] + 0.001)
        
        features['difficulty_slope'] = self._calculate_difficulty_slope(df, rapid_cols)
        features['difficulty_inflection'] = self._find_difficulty_inflection(df, rapid_cols)
        
        if 'success_prob_rapid_1050' in df.columns and 'success_prob_blitz_1050' in df.columns:
            features['time_pressure_1050'] = df['success_prob_rapid_1050'] - df['success_prob_blitz_1050']
        
        if 'success_prob_rapid_1150' in df.columns and 'success_prob_blitz_1150' in df.columns:
            features['time_pressure_1150'] = df['success_prob_rapid_1150'] - df['success_prob_blitz_1150']
        
        return features
    
    def _calculate_difficulty_slope(self, df, rating_cols):
        slopes = []
        ratings = [int(col.split('_')[-1]) for col in rating_cols]
        
        for idx in range(len(df)):
            success_rates = [df[col].iloc[idx] for col in rating_cols]
            slope = np.polyfit(ratings, success_rates, 1)[0] if len(ratings) > 1 else 0
            slopes.append(slope)
        
        return slopes
    
    def _find_difficulty_inflection(self, df, rating_cols):
        inflections = []
        
        for idx in range(len(df)):
            success_rates = [df[col].iloc[idx] for col in rating_cols]
            if len(success_rates) < 3:
                inflections.append(0)
                continue
            
            diffs = [success_rates[i+1] - success_rates[i] for i in range(len(success_rates)-1)]
            max_diff_idx = np.argmax(diffs) if diffs else 0
            inflections.append(max_diff_idx)
        
        return inflections
    
    def _create_position_features(self, df):
        features = pd.DataFrame()
        
        print("  Parsing FEN strings...")
        position_data = []
        
        for idx, fen in enumerate(df['FEN']):
            if idx % 10000 == 0:
                print(f"    Processed {idx}/{len(df)} positions")
            
            pos_features = self._parse_fen_features(fen)
            position_data.append(pos_features)
        
        position_df = pd.DataFrame(position_data).fillna(0)
        return position_df
    
    def _parse_fen_features(self, fen_string):
        if pd.isna(fen_string) or not fen_string:
            return self._get_empty_position_features()
        
        try:
            board = chess.Board(fen_string)
            features = {}
            
            white_material, black_material = self._calculate_material(board)
            features['white_material'] = white_material
            features['black_material'] = black_material
            features['material_balance'] = white_material - black_material
            features['total_material'] = white_material + black_material
            features['material_ratio'] = white_material / max(black_material, 1)
            
            for piece_symbol in ['P', 'N', 'B', 'R', 'Q', 'K', 'p', 'n', 'b', 'r', 'q', 'k']:
                count = len([sq for sq in chess.SQUARES 
                           if board.piece_at(sq) and board.piece_at(sq).symbol() == piece_symbol])
                features[f'{piece_symbol}_count'] = count
            
            if features['total_material'] > 50:
                features['game_phase'] = 0
            elif features['total_material'] > 15:
                features['game_phase'] = 1
            else:
                features['game_phase'] = 2
            
            white_king = board.king(chess.WHITE)
            black_king = board.king(chess.BLACK)
            
            if white_king and black_king:
                features['king_distance'] = chess.square_distance(white_king, black_king)
                features['white_king_center'] = self._is_center_square(white_king)
                features['black_king_center'] = self._is_center_square(black_king)
            else:
                features['king_distance'] = 0
                features['white_king_center'] = 0
                features['black_king_center'] = 0
            
            features['white_to_move'] = int(board.turn == chess.WHITE)
            features['in_check'] = int(board.is_check())
            features['num_legal_moves'] = len(list(board.legal_moves))
            
            features['can_castle'] = int(any([
                board.has_kingside_castling_rights(chess.WHITE),
                board.has_queenside_castling_rights(chess.WHITE),
                board.has_kingside_castling_rights(chess.BLACK),
                board.has_queenside_castling_rights(chess.BLACK)
            ]))
            
            return features
            
        except Exception as e:
            return self._get_empty_position_features()
    
    def _calculate_material(self, board):
        white_material = black_material = 0
        
        for square in chess.SQUARES:
            piece = board.piece_at(square)
            if piece:
                value = self.piece_values.get(piece.symbol().lower(), 0)
                if piece.color == chess.WHITE:
                    white_material += value
                else:
                    black_material += value
        
        return white_material, black_material
    
    def _is_center_square(self, square):
        if square is None:
            return 0
        file = chess.square_file(square)
        rank = chess.square_rank(square)
        return int(file in [3, 4] and rank in [3, 4])
    
    def _get_empty_position_features(self):
        features = {}
        
        for key in ['white_material', 'black_material', 'material_balance', 'total_material', 'material_ratio']:
            features[key] = 0
        
        for piece in ['P', 'N', 'B', 'R', 'Q', 'K', 'p', 'n', 'b', 'r', 'q', 'k']:
            features[f'{piece}_count'] = 0
        
        for key in ['game_phase', 'king_distance', 'white_king_center', 'black_king_center',
                   'white_to_move', 'in_check', 'num_legal_moves', 'can_castle']:
            features[key] = 0
        
        return features
    
    def _create_move_sequence_features(self, df):
        features = pd.DataFrame()
        
        print("  Analyzing move sequences...")
        move_data = []
        
        for idx, moves_str in enumerate(df['Moves']):
            if idx % 10000 == 0:
                print(f"    Processed {idx}/{len(df)} move sequences")
            
            move_features = self._parse_move_features(moves_str)
            move_data.append(move_features)
        
        move_df = pd.DataFrame(move_data).fillna(0)
        return move_df
    
    def _parse_move_features(self, moves_string):
        if pd.isna(moves_string) or not moves_string:
            return {'move_count': 0, 'captures_count': 0, 'checks_count': 0}
        
        moves = moves_string.strip().split()
        features = {}
        
        features['move_count'] = len(moves)
        features['captures_count'] = sum(1 for move in moves if 'x' in move)
        features['checks_count'] = sum(1 for move in moves if '+' in move)
        features['castling_count'] = sum(1 for move in moves if move in ['O-O', 'O-O-O'])
        features['promotion_count'] = sum(1 for move in moves if '=' in move)
        
        features['avg_move_length'] = np.mean([len(move) for move in moves]) if moves else 0
        features['complex_moves'] = sum(1 for move in moves if len(move) > 4)
        
        features['capture_rate'] = features['captures_count'] / max(features['move_count'], 1)
        features['check_rate'] = features['checks_count'] / max(features['move_count'], 1)
        
        piece_moves = {'K': 0, 'Q': 0, 'R': 0, 'B': 0, 'N': 0}
        for move in moves:
            if move and move[0] in piece_moves:
                piece_moves[move[0]] += 1
        
        for piece, count in piece_moves.items():
            features[f'{piece}_moves'] = count
        
        features['pawn_moves'] = sum(1 for move in moves 
                                   if move and move[0] not in ['K', 'Q', 'R', 'B', 'N', 'O'])
        
        return features
    
    def _create_theme_features(self, df):
        features = pd.DataFrame()
        
        for col in ['NbPlays', 'Popularity', 'RatingDeviation']:
            if col in df.columns:
                features[col] = df[col].fillna(0)
                features[f'log_{col.lower()}'] = np.log1p(df[col].fillna(0))
        
        if 'Themes' in df.columns:
            theme_features = self._parse_themes(df['Themes'])
            features = pd.concat([features, theme_features], axis=1)
        
        return features
    
    def _parse_themes(self, theme_series):
        important_themes = [
            'mate', 'mateIn1', 'mateIn2', 'mateIn3', 'mateIn4', 'mateIn5',
            'endgame', 'middlegame', 'opening',
            'advantage', 'crushing', 'equality',
            'short', 'long', 'veryLong',
            'fork', 'pin', 'skewer', 'discoveredAttack', 'deflection',
            'sacrifice', 'attraction', 'clearance',
            'backRankMate', 'smotheredMate', 'anastasiasMate',
            'rookEndgame', 'queenEndgame', 'pawnEndgame', 'knightEndgame', 'bishopEndgame',
            'kingsideAttack', 'queensideAttack',
            'trappedPiece', 'hangingPiece',
            'master', 'superGM'
        ]
        
        theme_features = pd.DataFrame()
        
        for theme in important_themes:
            theme_features[f'theme_{theme}'] = theme_series.fillna('').str.contains(theme, case=False).astype(int)
        
        theme_features['total_themes'] = theme_series.fillna('').str.split().str.len()
        theme_features['has_mate_theme'] = theme_series.fillna('').str.contains('mate', case=False).astype(int)
        theme_features['has_tactic_theme'] = theme_series.fillna('').str.contains('fork|pin|skewer|deflection', case=False).astype(int)
        
        return theme_features
    
    def _create_interaction_features(self, features):
        interactions = pd.DataFrame()
        
        if 'rapid_mean' in features.columns and 'move_count' in features.columns:
            interactions['rapid_mean_x_move_count'] = features['rapid_mean'] * features['move_count']
        
        if 'blitz_mean' in features.columns and 'captures_count' in features.columns:
            interactions['blitz_mean_x_captures'] = features['blitz_mean'] * features['captures_count']
        
        if 'material_balance' in features.columns and 'rapid_mean' in features.columns:
            interactions['material_x_rapid'] = features['material_balance'] * features['rapid_mean']
        
        success_cols = [col for col in features.columns if 'success_prob' in col]
        if len(success_cols) >= 2:
            if 'success_prob_rapid_1150' in features.columns and 'success_prob_rapid_1050' in features.columns:
                interactions['success_1150_x_1050'] = (features['success_prob_rapid_1150'] * 
                                                     features['success_prob_rapid_1050'])
        
        return interactions

def run_feature_engineering(train_df, test_df, sample_size=100000):
    
    print("CHESS PUZZLE FEATURE ENGINEERING PIPELINE")
    print("="*60)
    
    feature_engineer = ChessPuzzleFeatureEngineer()
    
    print("\nProcessing training data...")
    train_features = feature_engineer.create_advanced_features(train_df, sample_size=sample_size)
    
    print("\nProcessing test data...")
    test_features = feature_engineer.create_advanced_features(test_df)
    
    common_features = list(set(train_features.columns) & set(test_features.columns))
    train_features = train_features[common_features]
    test_features = test_features[common_features]
    
    print(f"\nFeature engineering complete!")
    print(f"Training features shape: {train_features.shape}")
    print(f"Test features shape: {test_features.shape}")
    print(f"Total features created: {len(common_features)}")
    
    feature_types = {
        'success_prob': len([col for col in common_features if 'success_prob' in col]),
        'position': len([col for col in common_features if any(x in col for x in ['material', 'piece', 'king', 'check'])]),
        'moves': len([col for col in common_features if any(x in col for x in ['move', 'capture', 'check'])]),
        'themes': len([col for col in common_features if 'theme' in col]),
        'interactions': len([col for col in common_features if '_x_' in col]),
        'other': len([col for col in common_features if not any(x in col for x in ['success_prob', 'material', 'piece', 'king', 'move', 'capture', 'theme', '_x_'])])
    }
    
    print(f"\nFeature breakdown:")
    for feat_type, count in feature_types.items():
        print(f"  {feat_type}: {count} features")
    
    return train_features, test_features

In [5]:
# if __name__ == "__main__":
#     print("Ready to run advanced feature engineering!")
#     print("Use: train_features, test_features = run_feature_engineering(train_df, test_df)")

In [6]:
train_features, test_features = run_feature_engineering(
    train_df, 
    test_df, 
    sample_size=100000  # Start with 100k for speed
)

CHESS PUZZLE FEATURE ENGINEERING PIPELINE

Processing training data...
Starting Advanced Feature Engineering...
Working with sample of 100000 records
Creating enhanced success probability features...
Extracting chess position features...
  Parsing FEN strings...
    Processed 0/100000 positions
    Processed 10000/100000 positions
    Processed 20000/100000 positions
    Processed 30000/100000 positions
    Processed 40000/100000 positions
    Processed 50000/100000 positions
    Processed 60000/100000 positions
    Processed 70000/100000 positions
    Processed 80000/100000 positions
    Processed 90000/100000 positions
Analyzing move sequences...
  Analyzing move sequences...
    Processed 0/100000 move sequences
    Processed 10000/100000 move sequences
    Processed 20000/100000 move sequences
    Processed 30000/100000 move sequences
    Processed 40000/100000 move sequences
    Processed 50000/100000 move sequences
    Processed 60000/100000 move sequences
    Processed 70000/100

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Creating feature interactions...
Feature engineering complete! Created 131 features

Processing test data...
Starting Advanced Feature Engineering...
Creating enhanced success probability features...
Extracting chess position features...
  Parsing FEN strings...
    Processed 0/2235 positions
Analyzing move sequences...
  Analyzing move sequences...
    Processed 0/2235 move sequences
Processing themes and metadata...
Creating feature interactions...
Feature engineering complete! Created 85 features

Feature engineering complete!
Training features shape: (100000, 85)
Test features shape: (2235, 85)
Total features created: 85

Feature breakdown:
  success_prob: 22 features
  position: 12 features
  moves: 18 features
  themes: 0 features
  interactions: 4 features
  other: 38 features


# Models

In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

In [9]:
class ChessFeatureEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        
        self.theme_embedding = nn.Embedding(
            config['theme_vocab_size'], 
            config['theme_embed_dim']
        )
        
        self.numerical_projection = nn.Linear(
            config['num_numerical_features'], 
            config['numerical_embed_dim']
        )
        
        self.success_prob_projection = nn.Linear(
            config['num_success_prob_features'],
            config['success_prob_embed_dim']
        )
        
        self.dropout = nn.Dropout(config['dropout'])
        
    def forward(self, features_dict):
        embeddings = []
        
        if 'theme_features' in features_dict:
            theme_emb = self.theme_embedding(features_dict['theme_features'])
            theme_emb = theme_emb.mean(dim=1)
            embeddings.append(theme_emb)
        
        if 'numerical_features' in features_dict:
            num_emb = self.numerical_projection(features_dict['numerical_features'])
            embeddings.append(num_emb)
        
        if 'success_prob_features' in features_dict:
            success_emb = self.success_prob_projection(features_dict['success_prob_features'])
            embeddings.append(success_emb)
        
        combined = torch.cat(embeddings, dim=-1)
        return self.dropout(combined)

class ChessTransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention = nn.MultiheadAttention(
            embed_dim, num_heads, dropout=dropout, batch_first=True
        )
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, embed_dim),
            nn.Dropout(dropout)
        )
        
    def forward(self, x):
        attn_out, _ = self.attention(x, x, x)
        x = self.norm1(x + attn_out)
        
        ff_out = self.feed_forward(x)
        x = self.norm2(x + ff_out)
        
        return x

class ChessPuzzleTransformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        
        self.feature_embedding = ChessFeatureEmbedding(config)
        
        total_embed_dim = (
            config.get('theme_embed_dim', 32) +
            config.get('numerical_embed_dim', 64) +
            config.get('success_prob_embed_dim', 128)
        )
        
        self.pos_encoding = nn.Parameter(
            torch.randn(1, config['max_seq_len'], total_embed_dim) * 0.1
        )
        
        self.transformer_blocks = nn.ModuleList([
            ChessTransformerBlock(
                total_embed_dim, 
                config['num_heads'], 
                config['ff_dim'],
                config['dropout']
            ) for _ in range(config['num_layers'])
        ])
        
        self.output_head = nn.Sequential(
            nn.LayerNorm(total_embed_dim),
            nn.Linear(total_embed_dim, config['hidden_dim']),
            nn.GELU(),
            nn.Dropout(config['dropout']),
            nn.Linear(config['hidden_dim'], config['hidden_dim'] // 2),
            nn.GELU(),
            nn.Dropout(config['dropout']),
            nn.Linear(config['hidden_dim'] // 2, 1)
        )
        
    def forward(self, features_dict):
        x = self.feature_embedding(features_dict)
        
        if x.dim() == 2:
            x = x.unsqueeze(1)
        
        seq_len = min(x.size(1), self.config['max_seq_len'])
        x = x[:, :seq_len, :] + self.pos_encoding[:, :seq_len, :]
        
        for transformer_block in self.transformer_blocks:
            x = transformer_block(x)
        
        x = x.mean(dim=1)
        
        output = self.output_head(x)
        return output.squeeze(-1)

class TreeNeuralHybrid(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        
        self.neural_net = nn.Sequential(
            nn.Linear(config['neural_input_dim'], 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.1),
            
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        
        self.combination_layer = nn.Linear(2, 1)
        
    def forward(self, neural_features, tree_predictions=None):
        neural_output = self.neural_net(neural_features)
        
        if tree_predictions is not None:
            combined = torch.cat([neural_output, tree_predictions.unsqueeze(-1)], dim=-1)
            final_output = self.combination_layer(combined)
            return final_output.squeeze(-1)
        
        return neural_output.squeeze(-1)

class ChessHybridModel:
    def __init__(self, config):
        self.config = config
        self.tree_model = None
        self.neural_model = None
        self.scaler = StandardScaler()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
    def prepare_features(self, features_df):
        tree_features = [col for col in features_df.columns if any(x in col.lower() for x in [
            'success_prob', 'material', 'move_count', 'theme_', 'nbplays', 'popularity',
            'captures_count', 'checks_count', 'piece_count', 'game_phase'
        ])]
        
        neural_features = [col for col in features_df.columns if any(x in col.lower() for x in [
            '_x_', 'interaction', 'ratio', 'std', 'mean', 'range', 'slope', 'inflection'
        ])]
        
        remaining_features = [col for col in features_df.columns 
                            if col not in tree_features + neural_features and col != 'PuzzleId']
        neural_features.extend(remaining_features)
        
        tree_X = features_df[tree_features].fillna(0)
        neural_X = features_df[neural_features].fillna(0)
        
        return tree_X, neural_X, tree_features, neural_features
    
    def train(self, X_train, y_train, X_val, y_val):
        print("Training Tree Component (LightGBM)...")
        
        tree_X_train, neural_X_train, tree_features, neural_features = self.prepare_features(X_train)
        tree_X_val, neural_X_val, _, _ = self.prepare_features(X_val)
        
        print(f"  Tree features: {len(tree_features)}")
        print(f"  Neural features: {len(neural_features)}")
        
        train_data = lgb.Dataset(tree_X_train, label=y_train)
        val_data = lgb.Dataset(tree_X_val, label=y_val, reference=train_data)
        
        lgb_params = {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'num_leaves': 100,
            'learning_rate': 0.05,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1,
            'random_state': 42
        }
        
        self.tree_model = lgb.train(
            lgb_params,
            train_data,
            valid_sets=[val_data],
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
        )
        
        tree_pred_train = self.tree_model.predict(tree_X_train)
        tree_pred_val = self.tree_model.predict(tree_X_val)
        
        print("Training Neural Component...")
        
        neural_X_train_scaled = self.scaler.fit_transform(neural_X_train)
        neural_X_val_scaled = self.scaler.transform(neural_X_val)
        
        neural_config = {
            'neural_input_dim': neural_X_train_scaled.shape[1]
        }
        self.neural_model = TreeNeuralHybrid(neural_config).to(self.device)
        
        optimizer = torch.optim.AdamW(self.neural_model.parameters(), lr=0.001, weight_decay=0.01)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, factor=0.5)
        criterion = nn.MSELoss()
        
        train_neural_tensor = torch.FloatTensor(neural_X_train_scaled).to(self.device)
        train_tree_tensor = torch.FloatTensor(tree_pred_train).to(self.device)
        train_target_tensor = torch.FloatTensor(y_train.values).to(self.device)
        
        val_neural_tensor = torch.FloatTensor(neural_X_val_scaled).to(self.device)
        val_tree_tensor = torch.FloatTensor(tree_pred_val).to(self.device)
        val_target_tensor = torch.FloatTensor(y_val.values).to(self.device)
        
        best_val_loss = float('inf')
        patience_counter = 0
        
        for epoch in range(200):
            self.neural_model.train()
            optimizer.zero_grad()
            
            predictions = self.neural_model(train_neural_tensor, train_tree_tensor)
            loss = criterion(predictions, train_target_tensor)
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.neural_model.parameters(), 1.0)
            optimizer.step()
            
            self.neural_model.eval()
            with torch.no_grad():
                val_predictions = self.neural_model(val_neural_tensor, val_tree_tensor)
                val_loss = criterion(val_predictions, val_target_tensor)
            
            scheduler.step(val_loss)
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                torch.save(self.neural_model.state_dict(), 'best_hybrid_model.pth')
            else:
                patience_counter += 1
                if patience_counter >= 20:
                    break
            
            if epoch % 25 == 0:
                print(f"  Epoch {epoch}: Train Loss = {loss:.4f}, Val Loss = {val_loss:.4f}")
        
        self.neural_model.load_state_dict(torch.load('best_hybrid_model.pth'))
        
        print("Hybrid model training complete!")
        
        return best_val_loss.item()
    
    def predict(self, X_test):
        tree_X_test, neural_X_test, _, _ = self.prepare_features(X_test)
        
        tree_predictions = self.tree_model.predict(tree_X_test)
        
        neural_X_test_scaled = self.scaler.transform(neural_X_test)
        neural_tensor = torch.FloatTensor(neural_X_test_scaled).to(self.device)
        tree_tensor = torch.FloatTensor(tree_predictions).to(self.device)
        
        self.neural_model.eval()
        with torch.no_grad():
            hybrid_predictions = self.neural_model(neural_tensor, tree_tensor)
        
        return hybrid_predictions.cpu().numpy()

class ChessPuzzleDataset(Dataset):
    def __init__(self, features_df, targets=None, config=None):
        self.features_df = features_df
        self.targets = targets
        self.config = config
        
        self.success_prob_features = [col for col in features_df.columns if 'success_prob' in col]
        self.theme_features = [col for col in features_df.columns if 'theme_' in col]
        self.numerical_features = [col for col in features_df.columns 
                                 if col not in self.success_prob_features + self.theme_features
                                 and col != 'PuzzleId']
        
        self.scaler = StandardScaler()
        if self.numerical_features:
            self.numerical_data = self.scaler.fit_transform(
                features_df[self.numerical_features].fillna(0)
            )
        else:
            self.numerical_data = np.array([])
    
    def __len__(self):
        return len(self.features_df)
    
    def __getitem__(self, idx):
        features_dict = {}
        
        if self.success_prob_features:
            features_dict['success_prob_features'] = torch.FloatTensor(
                self.features_df[self.success_prob_features].iloc[idx].fillna(0).values
            )
        
        if self.theme_features:
            theme_indices = torch.LongTensor([
                i for i, val in enumerate(self.features_df[self.theme_features].iloc[idx].values) 
                if val > 0
            ])
            if len(theme_indices) == 0:
                theme_indices = torch.LongTensor([0])
            features_dict['theme_features'] = theme_indices
        
        if len(self.numerical_data) > 0:
            features_dict['numerical_features'] = torch.FloatTensor(self.numerical_data[idx])
        
        if self.targets is not None:
            return features_dict, torch.FloatTensor([self.targets.iloc[idx]])
        
        return features_dict

def train_transformer_model(train_features, test_features, train_targets, sample_size=50000):
    print("TRAINING TRANSFORMER MODEL")
    print("=" * 40)
    
    if len(train_features) > sample_size:
        sample_idx = np.random.choice(len(train_features), sample_size, replace=False)
        train_features_sample = train_features.iloc[sample_idx].reset_index(drop=True)
        train_targets_sample = train_targets.iloc[sample_idx].reset_index(drop=True)
    else:
        train_features_sample = train_features
        train_targets_sample = train_targets
    
    X_train, X_val, y_train, y_val = train_test_split(
        train_features_sample, train_targets_sample, test_size=0.2, random_state=42
    )
    
    theme_cols = [col for col in train_features.columns if 'theme_' in col]
    success_prob_cols = [col for col in train_features.columns if 'success_prob' in col]
    numerical_cols = [col for col in train_features.columns 
                     if col not in success_prob_cols + theme_cols and col != 'PuzzleId']
    
    config = {
        'theme_vocab_size': len(theme_cols) + 1,
        'theme_embed_dim': 32,
        'numerical_embed_dim': 64,
        'success_prob_embed_dim': 128,
        'num_numerical_features': len(numerical_cols),
        'num_success_prob_features': len(success_prob_cols),
        'max_seq_len': 10,
        'num_heads': 8,
        'num_layers': 4,
        'ff_dim': 512,
        'hidden_dim': 256,
        'dropout': 0.1
    }
    
    print(f"  Theme features: {len(theme_cols)}")
    print(f"  Success prob features: {len(success_prob_cols)}")
    print(f"  Numerical features: {len(numerical_cols)}")
    
    train_dataset = ChessPuzzleDataset(X_train, y_train, config)
    val_dataset = ChessPuzzleDataset(X_val, y_val, config)
    
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = ChessPuzzleTransformer(config).to(device)
    
    print(f"  Using device: {device}")
    print(f"  Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
    criterion = nn.MSELoss()
    
    best_val_loss = float('inf')
    patience = 0
    
    for epoch in range(100):
        model.train()
        train_loss = 0
        for batch_features, batch_targets in train_loader:
            for key in batch_features:
                batch_features[key] = batch_features[key].to(device)
            batch_targets = batch_targets.to(device).squeeze()
            
            optimizer.zero_grad()
            predictions = model(batch_features)
            loss = criterion(predictions, batch_targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            
            train_loss += loss.item()
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_features, batch_targets in val_loader:
                for key in batch_features:
                    batch_features[key] = batch_features[key].to(device)
                batch_targets = batch_targets.to(device).squeeze()
                
                predictions = model(batch_features)
                loss = criterion(predictions, batch_targets)
                val_loss += loss.item()
        
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        
        scheduler.step()
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience = 0
            torch.save(model.state_dict(), 'best_transformer_model.pth')
        else:
            patience += 1
            if patience >= 15:
                break
        
        if epoch % 10 == 0:
            print(f"  Epoch {epoch}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")
    
    model.load_state_dict(torch.load('best_transformer_model.pth'))
    
    print("Transformer training complete!")
    
    return model, config, best_val_loss

def run_advanced_models_pipeline(train_features, test_features, train_df, sample_size=50000):
    print("ADVANCED CHESS PUZZLE MODELS PIPELINE")
    print("=" * 60)
    
    y = train_df['Rating'].iloc[:len(train_features)]
    
    X_train, X_val, y_train, y_val = train_test_split(
        train_features, y, test_size=0.2, random_state=42
    )
    
    results = {}
    
    print("\nTRANSFORMER MODEL")
    print("-" * 30)
    try:
        transformer_model, transformer_config, transformer_val_loss = train_transformer_model(
            train_features, test_features, y, sample_size=sample_size
        )
        results['transformer'] = {
            'model': transformer_model,
            'config': transformer_config,
            'val_loss': transformer_val_loss,
            'rmse': np.sqrt(transformer_val_loss)
        }
        print(f"Transformer RMSE: {results['transformer']['rmse']:.1f}")
    except Exception as e:
        print(f"Transformer training failed: {e}")
        results['transformer'] = {'rmse': float('inf')}
    
    print("\nHYBRID TREE+NEURAL MODEL")
    print("-" * 30)
    try:
        hybrid_config = {'neural_input_dim': X_train.shape[1] - 1}
        hybrid_model = ChessHybridModel(hybrid_config)
        
        X_train_clean = X_train.drop(['PuzzleId'], axis=1, errors='ignore')
        X_val_clean = X_val.drop(['PuzzleId'], axis=1, errors='ignore')
        
        hybrid_val_loss = hybrid_model.train(X_train_clean, y_train, X_val_clean, y_val)
        results['hybrid'] = {
            'model': hybrid_model,
            'val_loss': hybrid_val_loss,
            'rmse': np.sqrt(hybrid_val_loss)
        }
        print(f"Hybrid RMSE: {results['hybrid']['rmse']:.1f}")
    except Exception as e:
        print(f"Hybrid training failed: {e}")
        results['hybrid'] = {'rmse': float('inf')}
    
    print("\nMODEL COMPARISON")
    print("=" * 40)
    print(f"Baseline (XGBoost):     RMSE = 337")
    
    for model_name, result in results.items():
        if 'rmse' in result and result['rmse'] != float('inf'):
            improvement = ((337 - result['rmse']) / 337 * 100)
            print(f"{model_name.capitalize():20s} RMSE = {result['rmse']:.1f} ({improvement:+.1f}%)")
    
    valid_models = {k: v for k, v in results.items() if v['rmse'] != float('inf')}
    if valid_models:
        best_model_name = min(valid_models.keys(), key=lambda k: valid_models[k]['rmse'])
        print(f"\nBest Model: {best_model_name.upper()}")
    else:
        print("\nNo models trained successfully")
        best_model_name = None
    
    return results, best_model_name

def generate_predictions(results, best_model_name, test_features):
    if best_model_name is None or best_model_name not in results:
        print("No valid model for predictions")
        return None
    
    print(f"\nGenerating predictions with {best_model_name.upper()} model...")
    
    best_model = results[best_model_name]['model']
    test_clean = test_features.drop(['PuzzleId'], axis=1, errors='ignore')
    
    if best_model_name == 'transformer':
        print("Transformer predictions require special dataset handling")
        return None
    
    elif best_model_name == 'hybrid':
        predictions = best_model.predict(test_clean)
        predictions = np.round(predictions).astype(int)
        
        with open(f'{best_model_name}_submission.txt', 'w') as f:
            for pred in predictions:
                f.write(f"{pred}\n")
        
        print(f"Predictions saved to {best_model_name}_submission.txt")
        return predictions
    
    return None

In [10]:
results, best_model = run_advanced_models_pipeline(train_features, test_features, train_df)

ADVANCED CHESS PUZZLE MODELS PIPELINE

TRANSFORMER MODEL
------------------------------
TRAINING TRANSFORMER MODEL
  Theme features: 0
  Success prob features: 22
  Numerical features: 62
  Using device: cpu
  Model parameters: 1,830,753
Transformer training failed: The size of tensor a (192) must match the size of tensor b (224) at non-singleton dimension 2

HYBRID TREE+NEURAL MODEL
------------------------------
Training Tree Component (LightGBM)...
  Tree features: 33
  Neural features: 54
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 540.399
Training Neural Component...
  Epoch 0: Train Loss = 506212.6562, Val Loss = 489290.7500
  Epoch 25: Train Loss = 472873.1250, Val Loss = 456861.1875
  Epoch 50: Train Loss = 441103.4688, Val Loss = 425915.5938
  Epoch 75: Train Loss = 410010.8438, Val Loss = 395175.9062
  Epoch 100: Train Loss = 378626.0312, Val Loss = 363403.3125
  Epoch 125: Train Loss = 346620.0000, Val L