In [1]:
#necessary
import os
import sys
from pathlib import Path
import polars as pl  # fast for big data
import pandas as pd  # for csv
import numpy as np   # for matrix ops
#kfold
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold
from sklearn.metrics import mean_squared_error as mse


#model
import lightgbm as lgb
from  lightgbm import LGBMRegressor, log_evaluation, early_stopping
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import dill         # to serialize/deserialize objects
from sklearn.feature_extraction.text import TfidfVectorizer
import re           # regex
import gc           # garbage collector
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

import random
#sys.path.append("/kaggle/input/um-game-playing-strength-of-mcts-variants")
#import kaggle_evaluation.mcts_inference_server

pd.options.display.max_rows = None
pd.options.display.max_columns = None


def seed_everything(seed):
    # tek tip random sonuc icin
    np.random.seed(seed)
    random.seed(seed)
seed_everything(seed=2024)

In [2]:
class model_1:
    train=pl.read_csv("/kaggle/input/um-game-playing-strength-of-mcts-variants/train.csv")
    train=train.to_pandas()
    print(f"len(train):{len(train)}")
    test=pl.read_csv("/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv")
    test=test.to_pandas()
    print(f"len(test):{len(test)}")
    test.head()

    class Preprocessor():
        def __init__(self,seed=2024,target='utility_agent1',train=None,num_folds=10):
            self.seed=seed
            self.target=target
            self.train=train
            self.model_paths=[]
            self.tfidf_paths=[]
            self.num_folds=num_folds
            
        def clean(self,df,col):
            # basit metin temizligi
            df[col]=df[col].fillna("nan")
            df[col]=df[col].apply(lambda x:x.lower())
            ps='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
            for p in ps:
                df[col]=df[col].apply(lambda x:x.replace(p,' '))
            return df
        
        def ARI(self,txt):
            # text readability score
            characters=len(txt)
            words=len(re.split(' |\\n|\\.|\\?|\\!|\,',txt))
            sentence=len(re.split('\\.|\\?|\\!',txt))
            ari_score=4.71*(characters/words)+0.5*(words/sentence)-21.43
            return ari_score

        def McAlpine_EFLAW(self,txt):
            # text readability score
            W=len(re.split(' |\\n|\\.|\\?|\\!|\,',txt))
            S=len(re.split('\\.|\\?|\\!',txt))
            mcalpine_eflaw_score=(W+S*W)/S
            return mcalpine_eflaw_score

        def CLRI(self,txt):
            # text readability score
            characters=len(txt)
            words=len(re.split(' |\\n|\\.|\\?|\\!|\,',txt))
            sentence=len(re.split('\\.|\\?|\\!',txt))
            L=100*characters/words
            S=100*sentence/words
            clri_score=0.0588*L-0.296*S-15.8
            return clri_score
            
        def pickle_dump(self,obj, path):
            with open(path, mode="wb") as f:
                dill.dump(obj, f, protocol=4)

        def pickle_load(self,path):
            with open(path, mode="rb") as f:
                data = dill.load(f)
                return data
        
        def reduce_mem_usage(self,df, float16_as32=True):
            # df bellek optimizasyonu
            start_mem = df.memory_usage().sum() / 1024**2
            print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

            for col in df.columns:
                col_type = df[col].dtype
                if col_type != object and str(col_type)!='category':
                    c_min,c_max = df[col].min(),df[col].max()
                    if str(col_type)[:3] == 'int':
                        if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                            df[col] = df[col].astype(np.int8)
                        elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                            df[col] = df[col].astype(np.int16)
                        elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                            df[col] = df[col].astype(np.int32)
                        elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                            df[col] = df[col].astype(np.int64)  
                    else:
                        if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                            if float16_as32:
                                df[col] = df[col].astype(np.float32)
                            else:
                                df[col] = df[col].astype(np.float16)
                        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                            df[col] = df[col].astype(np.float32)
                        else:
                            df[col] = df[col].astype(np.float64)
            end_mem = df.memory_usage().sum() / 1024**2
            print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

            return df
            
        def FE(self,df,mode='train'):
            print(f"FE:{mode}")

            print("agent position feature")
            total_agent=[
                'MCTS-ProgressiveHistory-0.1-MAST-false', 'MCTS-ProgressiveHistory-0.1-MAST-true',
                'MCTS-ProgressiveHistory-0.1-NST-false', 'MCTS-ProgressiveHistory-0.1-NST-true',
                'MCTS-ProgressiveHistory-0.1-Random200-false','MCTS-ProgressiveHistory-0.1-Random200-true',
                'MCTS-ProgressiveHistory-0.6-MAST-false','MCTS-ProgressiveHistory-0.6-MAST-true',
                'MCTS-ProgressiveHistory-0.6-NST-false','MCTS-ProgressiveHistory-0.6-NST-true',
                'MCTS-ProgressiveHistory-0.6-Random200-false','MCTS-ProgressiveHistory-0.6-Random200-true',
                'MCTS-ProgressiveHistory-1.41421356237-MAST-false','MCTS-ProgressiveHistory-1.41421356237-MAST-true',
                'MCTS-ProgressiveHistory-1.41421356237-NST-false','MCTS-ProgressiveHistory-1.41421356237-NST-true',
                'MCTS-ProgressiveHistory-1.41421356237-Random200-false','MCTS-ProgressiveHistory-1.41421356237-Random200-true',
                'MCTS-UCB1-0.1-MAST-false','MCTS-UCB1-0.1-MAST-true','MCTS-UCB1-0.1-NST-false','MCTS-UCB1-0.1-NST-true',
                'MCTS-UCB1-0.1-Random200-false','MCTS-UCB1-0.1-Random200-true','MCTS-UCB1-0.6-MAST-false','MCTS-UCB1-0.6-MAST-true',
                'MCTS-UCB1-0.6-NST-false','MCTS-UCB1-0.6-NST-true','MCTS-UCB1-0.6-Random200-false','MCTS-UCB1-0.6-Random200-true',
                'MCTS-UCB1-1.41421356237-MAST-false','MCTS-UCB1-1.41421356237-MAST-true','MCTS-UCB1-1.41421356237-NST-false',
                'MCTS-UCB1-1.41421356237-NST-true','MCTS-UCB1-1.41421356237-Random200-false','MCTS-UCB1-1.41421356237-Random200-true',
                'MCTS-UCB1GRAVE-0.1-MAST-false','MCTS-UCB1GRAVE-0.1-MAST-true','MCTS-UCB1GRAVE-0.1-NST-false','MCTS-UCB1GRAVE-0.1-NST-true',
                'MCTS-UCB1GRAVE-0.1-Random200-false','MCTS-UCB1GRAVE-0.1-Random200-true','MCTS-UCB1GRAVE-0.6-MAST-false',
                'MCTS-UCB1GRAVE-0.6-MAST-true','MCTS-UCB1GRAVE-0.6-NST-false','MCTS-UCB1GRAVE-0.6-NST-true',
                'MCTS-UCB1GRAVE-0.6-Random200-false','MCTS-UCB1GRAVE-0.6-Random200-true',
                'MCTS-UCB1GRAVE-1.41421356237-MAST-false','MCTS-UCB1GRAVE-1.41421356237-MAST-true',
                'MCTS-UCB1GRAVE-1.41421356237-NST-false','MCTS-UCB1GRAVE-1.41421356237-NST-true',
                'MCTS-UCB1GRAVE-1.41421356237-Random200-false','MCTS-UCB1GRAVE-1.41421356237-Random200-true',
                'MCTS-UCB1Tuned-0.1-MAST-false','MCTS-UCB1Tuned-0.1-MAST-true','MCTS-UCB1Tuned-0.1-NST-false','MCTS-UCB1Tuned-0.1-NST-true',
                'MCTS-UCB1Tuned-0.1-Random200-false','MCTS-UCB1Tuned-0.1-Random200-true','MCTS-UCB1Tuned-0.6-MAST-false',
                'MCTS-UCB1Tuned-0.6-MAST-true','MCTS-UCB1Tuned-0.6-NST-false','MCTS-UCB1Tuned-0.6-NST-true',
                'MCTS-UCB1Tuned-0.6-Random200-false','MCTS-UCB1Tuned-0.6-Random200-true',
                'MCTS-UCB1Tuned-1.41421356237-MAST-false','MCTS-UCB1Tuned-1.41421356237-MAST-true',
                'MCTS-UCB1Tuned-1.41421356237-NST-false','MCTS-UCB1Tuned-1.41421356237-NST-true',
                'MCTS-UCB1Tuned-1.41421356237-Random200-false','MCTS-UCB1Tuned-1.41421356237-Random200-true']
            agent1,agent2=df['agent1'].values,df['agent2'].values
            for i in range(len(total_agent)):
                value=np.zeros(len(df))
                for j in range(len(df)):
                    if agent1[j]==total_agent[i]:
                        value[j]+=1
                    elif agent2[j]==total_agent[i]:
                        value[j]-=1
                df[f'agent_{total_agent[i]}']=value

            df['area']=df['NumRows']*df['NumColumns']
            df['row_equal_col']=(df['NumColumns']==df['NumRows']).astype(np.int8)
            df['Playouts/Moves'] = df['PlayoutsPerSecond'] / (df['MovesPerSecond'] + 1e-15)
            df['EfficiencyPerPlayout'] = df['MovesPerSecond'] / (df['PlayoutsPerSecond'] + 1e-15)
            df['TurnsDurationEfficiency'] = df['DurationActions'] / (df['DurationTurnsStdDev'] + 1e-15)
            df['AdvantageBalanceRatio'] = df['AdvantageP1'] / (df['Balance'] + 1e-15)
            df['ActionTimeEfficiency'] = df['DurationActions'] / (df['MovesPerSecond'] + 1e-15)
            df['StandardizedTurnsEfficiency'] = df['DurationTurnsStdDev'] / (df['DurationActions'] + 1e-15)
            df['AdvantageTimeImpact'] = df['AdvantageP1'] / (df['DurationActions'] + 1e-15)
            df['DurationToComplexityRatio'] = df['DurationActions'] / (df['StateTreeComplexity'] + 1e-15)
            df['NormalizedGameTreeComplexity'] =  df['GameTreeComplexity'] /  (df['StateTreeComplexity'] + 1e-15)
            df['ComplexityBalanceInteraction'] =  df['Balance'] *  df['GameTreeComplexity']
            df['OverallComplexity'] =  df['StateTreeComplexity'] +  df['GameTreeComplexity']
            df['ComplexityPerPlayout'] =  df['GameTreeComplexity'] /  (df['PlayoutsPerSecond'] + 1e-15)
            df['TurnsNotTimeouts/Moves'] = df['DurationTurnsNotTimeouts'] / (df['MovesPerSecond'] + 1e-15)
            df['Timeouts/DurationActions'] = df['Timeouts'] / (df['DurationActions'] + 1e-15)
            df['OutcomeUniformity/AdvantageP1'] = df['OutcomeUniformity'] / (df['AdvantageP1'] + 1e-15)
            df['ComplexDecisionRatio'] = df['StepDecisionToEnemy'] + df['SlideDecisionToEnemy'] + df['HopDecisionMoreThanOne']
            df['AggressiveActionsRatio'] = df['StepDecisionToEnemy'] + df['HopDecisionEnemyToEnemy'] + df['HopDecisionFriendToEnemy'] + df['SlideDecisionToEnemy']
            
            print("deal with outliers")
            df['PlayoutsPerSecond']=df['PlayoutsPerSecond'].clip(0,25000)
            df['MovesPerSecond']=df['MovesPerSecond'].clip(0,1000000)
            
            print("agent1 agent2 feature")
            cols=['selection','exploration_const','playout','score_bounds']
            for i in range(len(cols)):
                for j in range(2):
                    df[f'{cols[i]}{j+1}']=df[f'agent{j+1}'].apply(lambda x:x.split('-')[i+1])
            
            print(f"one_hot_encoder")
            onehot_cols=[['NumOffDiagonalDirections', [0.0, 4.82, 2.0, 5.18, 3.08, 0.06]],
                         ['NumLayers', [1, 0, 4, 5]],
                         ['NumPhasesBoard', [3, 2, 1, 5, 4]],
                         ['NumContainers', [1, 4, 3, 2]],
                         ['NumDice', [0, 2, 1, 4, 6, 3, 5, 7]],
                         ['ProposeDecisionFrequency', [0.0, 0.05, 0.01]],
                         ['PromotionDecisionFrequency', [0.0, 0.01, 0.03, 0.02, 0.11, 0.05, 0.04]],
                         ['SlideDecisionToFriendFrequency', [0.0, 0.19, 0.06]],
                         ['LeapDecisionToEnemyFrequency', [0.0, 0.04, 0.01, 0.02, 0.07, 0.03, 0.14, 0.08]],
                         ['HopDecisionFriendToFriendFrequency', [0.0, 0.13, 0.09]],
                         ['HopDecisionEnemyToEnemyFrequency', [0.0, 0.01, 0.2, 0.03]],
                         ['HopDecisionFriendToEnemyFrequency', [0.0, 0.01, 0.09, 0.25, 0.02]],
                         ['FromToDecisionFrequency', [0.0, 0.38, 1.0, 0.31, 0.94, 0.67]],
                         ['ProposeEffectFrequency', [0.0, 0.01, 0.03]],
                         ['PushEffectFrequency', [0.0, 0.5, 0.96, 0.25]],
                         ['FlipFrequency', [0.0, 0.87, 1.0, 0.96]],
                         ['SetCountFrequency', [0.0, 0.62, 0.54, 0.02]],
                         ['DirectionCaptureFrequency', [0.0, 0.55, 0.54]],
                         ['EncloseCaptureFrequency', [0.0, 0.08, 0.1, 0.07, 0.12, 0.02, 0.09]],
                         ['InterveneCaptureFrequency', [0.0, 0.01, 0.14, 0.04]],
                         ['SurroundCaptureFrequency', [0.0, 0.01, 0.03, 0.02]],
                         ['NumPlayPhase', [1, 2, 3, 4, 5, 6, 7, 8]],
                         ['LineLossFrequency', [0.0, 0.96, 0.87, 0.46, 0.26, 0.88, 0.94]],
                         ['ConnectionEndFrequency', [0.0, 0.19, 1.0, 0.23, 0.94, 0.35, 0.97]],
                         ['ConnectionLossFrequency', [0.0, 0.54, 0.78]],
                         ['GroupEndFrequency', [0.0, 1.0, 0.11, 0.79]],
                         ['GroupWinFrequency', [0.0, 0.11, 1.0]],
                         ['LoopEndFrequency', [0.0, 0.14, 0.66]],
                         ['LoopWinFrequency', [0.0, 0.14, 0.66]],
                         ['PatternEndFrequency', [0.0, 0.63, 0.35]],
                         ['PatternWinFrequency', [0.0, 0.63, 0.35]],
                         ['NoTargetPieceWinFrequency', [0.0, 0.72, 0.77, 0.95, 0.32, 1.0]],
                         ['EliminatePiecesLossFrequency', [0.0, 0.85, 0.96, 0.68]],
                         ['EliminatePiecesDrawFrequency', [0.0, 0.03, 0.91, 1.0, 0.36, 0.86]],
                         ['NoOwnPiecesLossFrequency', [0.0, 1.0, 0.68]],
                         ['FillEndFrequency', [0.0, 1.0, 0.04, 0.01, 0.99, 0.72]],
                         ['FillWinFrequency', [0.0, 1.0, 0.04, 0.01, 0.99]],
                         ['ReachDrawFrequency', [0.0, 0.9, 0.98]],
                         ['ScoringLossFrequency', [0.0, 0.6, 0.62]],
                         ['NoMovesLossFrequency', [0.0, 1.0, 0.13, 0.06]],
                         ['NoMovesDrawFrequency', [0.0, 0.01, 0.04, 0.03, 0.22]],
                         ['BoardSitesOccupiedChangeNumTimes', [0.0, 0.06, 0.42, 0.12, 0.14, 0.94]],
                         ['BranchingFactorChangeNumTimesn', [0.0, 0.3, 0.02, 0.07, 0.04, 0.13, 0.01, 0.21, 0.03]],
                         ['PieceNumberChangeNumTimes', [0.0, 0.06, 0.42, 0.12, 0.14, 1.0]],
                         ['selection1', ['ProgressiveHistory', 'UCB1', 'UCB1GRAVE', 'UCB1Tuned']],
                         ['selection2', ['ProgressiveHistory', 'UCB1GRAVE', 'UCB1', 'UCB1Tuned']],
                         ['exploration_const1', ['0.1', '0.6', '1.41421356237']],
                         ['exploration_const2', ['0.6', '0.1', '1.41421356237']],
                         ['playout1', ['MAST', 'NST', 'Random200']],
                         ['playout2', ['Random200', 'NST', 'MAST']]]

            for col,unique in onehot_cols:
                for u in unique:
                    df[f'{col}_{u}']=(df[col]==u).astype(np.int8)
                    
            print("deal with LudRules")
            print("1:drop game")
            def drop_gamename(rule):
                rule=rule[len('(game "'):]
                for i in range(len(rule)):
                    if rule[i]=='"':
                        return rule[i+1:]
            df['LudRules']=df['LudRules'].apply(lambda x:drop_gamename(x))

            print("2:player")
            def get_player(rule):
                player=''
                stack=[]
                for i in range(len(rule)):
                    player+=rule[i]
                    if rule[i] in ['(','{']:
                        stack.append(rule[i])
                    elif rule[i] in [')','}']:
                        stack=stack[:-1]
                        if len(stack)==0:
                            return player
            df['player']=df['LudRules'].apply(lambda rule:get_player(rule))
            df=self.clean(df,'player')
            df['player_len']=df['player'].apply(len)
            df['LudRules']=[rule[len(player):] for player,rule in zip(df['player'],df['LudRules'])]
            df.drop(['player'],axis=1,inplace=True)
            
            print("Rules readable")
            for rule in ['EnglishRules', 'LudRules']:
                df[rule+"_ARI"]=df[rule].apply(lambda x:self.ARI(x))
                df[rule+"CLRI"]=df[rule].apply(lambda x:self.CLRI(x))
                df[rule+"McAlpine_EFLAW"]=df[rule].apply(lambda x:self.McAlpine_EFLAW(x))
                    
            df['PlayoutsPerSecond/MovesPerSecond']=df['PlayoutsPerSecond']/df['MovesPerSecond']

            drop_cols=[
                'Cooperation','Team','TriangleShape','DiamondShape','SpiralShape','StarShape','SquarePyramidalShape',
                'SemiRegularTiling','CircleTiling','SpiralTiling','MancalaThreeRows','MancalaSixRows','MancalaCircular',
                'AlquerqueBoardWithOneTriangle','AlquerqueBoardWithTwoTriangles','AlquerqueBoardWithFourTriangles',
                'AlquerqueBoardWithEightTriangles','ThreeMensMorrisBoard','ThreeMensMorrisBoardWithTwoTriangles','NineMensMorrisBoard',
                'StarBoard','PachisiBoard','Boardless','NumColumns','NumCorners','NumOffDiagonalDirections','NumLayers',
                'NumCentreSites','NumConvexCorners','NumPhasesBoard','NumContainers','Piece','PieceValue','PieceRotation',
                'PieceDirection','LargePiece','Tile','NumComponentsType','NumDice','OpeningContract','SwapOption','Repetition',
                'TurnKo','PositionalSuperko','AutoMove','InitialRandomPlacement','InitialScore','InitialCost','Moves',
                'VoteDecision','SwapPlayersDecision','SwapPlayersDecisionFrequency','ProposeDecision','ProposeDecisionFrequency',
                'PromotionDecisionFrequency','RotationDecision','RotationDecisionFrequency','StepDecisionToFriend',
                'StepDecisionToFriendFrequency','StepDecisionToEnemy','SlideDecisionToEnemy','SlideDecisionToEnemyFrequency',
                'SlideDecisionToFriend','SlideDecisionToFriendFrequency','LeapDecision','LeapDecisionFrequency','LeapDecisionToEmpty',
                'LeapDecisionToEmptyFrequency','LeapDecisionToEnemy','LeapDecisionToEnemyFrequency','HopDecisionFriendToEmpty',
                'HopDecisionFriendToEmptyFrequency','HopDecisionFriendToFriendFrequency','HopDecisionEnemyToEnemy','HopDecisionEnemyToEnemyFrequency',
                'HopDecisionFriendToEnemy','HopDecisionFriendToEnemyFrequency','FromToDecisionFrequency','FromToDecisionEnemy','FromToDecisionEnemyFrequency',
                'FromToDecisionFriend','SwapPiecesDecision','SwapPiecesDecisionFrequency','ShootDecision','ShootDecisionFrequency','VoteEffect',
                'SwapPlayersEffect','PassEffect','ProposeEffect','ProposeEffectFrequency','AddEffectFrequency','SowFrequency','SowCapture','SowCaptureFrequency',
                'SowRemove','SowBacktracking','SowBacktrackingFrequency','SowProperties','SowOriginFirst','SowCCW','PromotionEffectFrequency','PushEffect',
                'PushEffectFrequency','Flip','FlipFrequency','SetNextPlayer','SetValue','SetValueFrequency','SetCount','SetCountFrequency','SetRotation',
                'SetRotationFrequency','StepEffect','SlideEffect','LeapEffect','ByDieMove','MaxDistance','ReplacementCaptureFrequency','HopCaptureMoreThanOne',
                'DirectionCapture','DirectionCaptureFrequency','EncloseCaptureFrequency','CustodialCapture','CustodialCaptureFrequency','InterveneCapture',
                'InterveneCaptureFrequency','SurroundCapture','SurroundCaptureFrequency','CaptureSequence','CaptureSequenceFrequency','Group','Loop',
                'Pattern','PathExtent','Territory','Fill','CanNotMove','Threat','CountPiecesMoverComparison','ProgressCheck','RotationalDirection','SameLayerDirection',
                'ForwardDirection','BackwardDirection','BackwardsDirection','LeftwardDirection','RightwardsDirection','LeftwardsDirection','ForwardLeftDirection',
                'ForwardRightDirection','BackwardLeftDirection','BackwardRightDirection','SameDirection','OppositeDirection','NumPlayPhase','LineLoss','LineLossFrequency',
                'LineDraw','ConnectionEnd','ConnectionEndFrequency','ConnectionWinFrequency','ConnectionLoss','ConnectionLossFrequency','GroupEnd','GroupEndFrequency',
                'GroupWin','GroupWinFrequency','GroupLoss','GroupDraw','LoopEnd','LoopEndFrequency','LoopWin','LoopWinFrequency','LoopLoss','PatternEnd',
                'PatternEndFrequency','PatternWin','PatternWinFrequency','PathExtentEnd','PathExtentWin','PathExtentLoss','TerritoryEnd','TerritoryWin','TerritoryWinFrequency',
                'Checkmate','CheckmateWin','NoTargetPieceEndFrequency','NoTargetPieceWin','NoTargetPieceWinFrequency','EliminatePiecesLoss','EliminatePiecesLossFrequency',
                'EliminatePiecesDraw','EliminatePiecesDrawFrequency','NoOwnPiecesEnd','NoOwnPiecesWin','NoOwnPiecesLoss','NoOwnPiecesLossFrequency','FillEnd',
                'FillEndFrequency','FillWin','FillWinFrequency','ReachWin','ReachLoss','ReachLossFrequency','ReachDraw','ReachDrawFrequency','ScoringLoss',
                'ScoringLossFrequency','ScoringDraw','NoMovesLoss','NoMovesDrawFrequency','NoProgressEnd','NoProgressEndFrequency','NoProgressDraw','NoProgressDrawFrequency',
                'BoardSitesOccupiedChangeNumTimes','BranchingFactorChangeLineBestFit','BranchingFactorChangeNumTimesn','DecisionFactorChangeNumTimes','MoveDistanceChangeSign',
                'MoveDistanceChangeLineBestFit','PieceNumberChangeNumTimes','PieceNumberMaxIncrease','ScoreDifferenceMedian','ScoreDifferenceVariance','ScoreDifferenceChangeAverage',
                'ScoreDifferenceChangeSign','ScoreDifferenceChangeLineBestFit','ScoreDifferenceChangeNumTimes','ScoreDifferenceMaxIncrease','ScoreDifferenceMaxDecrease','Math',
                'Division','Modulo','Absolute','Exponentiation','Minimum','Maximum','Even','Odd','Visual','GraphStyle','MancalaStyle','PenAndPaperStyle','ShibumiStyle',
                'BackgammonStyle','JanggiStyle','XiangqiStyle','ShogiStyle','TableStyle','SurakartaStyle','NoBoard','ChessComponent','KingComponent','QueenComponent',
                'KnightComponent','RookComponent','BishopComponent','PawnComponent','FairyChessComponent','PloyComponent','ShogiComponent','XiangqiComponent','StrategoComponent',
                'JanggiComponent','TaflComponent','StackType','Stack','ShowPieceValue','ShowPieceState','Implementation','StateType','StackState','VisitedSites','InternalCounter',
                'SetInternalCounter','Efficiency','NumOffDiagonalDirections_0.0','NumOffDiagonalDirections_4.82','NumOffDiagonalDirections_2.0','NumOffDiagonalDirections_5.18',
                'NumOffDiagonalDirections_3.08','NumOffDiagonalDirections_0.06','NumLayers_1','NumLayers_0','NumLayers_4','NumLayers_5','NumPhasesBoard_1','NumPhasesBoard_5',
                'NumDice_0','NumDice_2','NumDice_6','NumDice_3','NumDice_5','NumDice_7','ProposeDecisionFrequency_0.0','ProposeDecisionFrequency_0.05',
                'ProposeDecisionFrequency_0.01','PromotionDecisionFrequency_0.0','PromotionDecisionFrequency_0.01','PromotionDecisionFrequency_0.03','PromotionDecisionFrequency_0.02',
                'PromotionDecisionFrequency_0.11','PromotionDecisionFrequency_0.05','PromotionDecisionFrequency_0.04','SlideDecisionToFriendFrequency_0.0','SlideDecisionToFriendFrequency_0.19',
                'SlideDecisionToFriendFrequency_0.06','LeapDecisionToEnemyFrequency_0.0','LeapDecisionToEnemyFrequency_0.04','LeapDecisionToEnemyFrequency_0.01','LeapDecisionToEnemyFrequency_0.02',
                'LeapDecisionToEnemyFrequency_0.07','LeapDecisionToEnemyFrequency_0.03','LeapDecisionToEnemyFrequency_0.14','LeapDecisionToEnemyFrequency_0.08','HopDecisionFriendToFriendFrequency_0.0',
                'HopDecisionFriendToFriendFrequency_0.13','HopDecisionFriendToFriendFrequency_0.09','HopDecisionEnemyToEnemyFrequency_0.0','HopDecisionEnemyToEnemyFrequency_0.01',
                'HopDecisionEnemyToEnemyFrequency_0.2','HopDecisionEnemyToEnemyFrequency_0.03','HopDecisionFriendToEnemyFrequency_0.0','HopDecisionFriendToEnemyFrequency_0.01',
                'HopDecisionFriendToEnemyFrequency_0.09','HopDecisionFriendToEnemyFrequency_0.25','HopDecisionFriendToEnemyFrequency_0.02','FromToDecisionFrequency_0.0',
                'FromToDecisionFrequency_0.38','FromToDecisionFrequency_1.0','FromToDecisionFrequency_0.31','FromToDecisionFrequency_0.94','FromToDecisionFrequency_0.67',
                'ProposeEffectFrequency_0.0','ProposeEffectFrequency_0.01','ProposeEffectFrequency_0.03','PushEffectFrequency_0.0','PushEffectFrequency_0.5','PushEffectFrequency_0.96',
                'PushEffectFrequency_0.25','FlipFrequency_0.0','FlipFrequency_0.87','FlipFrequency_1.0','FlipFrequency_0.96','SetCountFrequency_0.0','SetCountFrequency_0.62',
                'SetCountFrequency_0.54','SetCountFrequency_0.02','DirectionCaptureFrequency_0.0','DirectionCaptureFrequency_0.55','DirectionCaptureFrequency_0.54',
                'EncloseCaptureFrequency_0.0','EncloseCaptureFrequency_0.08','EncloseCaptureFrequency_0.1','EncloseCaptureFrequency_0.07','EncloseCaptureFrequency_0.12',
                'EncloseCaptureFrequency_0.02','EncloseCaptureFrequency_0.09','InterveneCaptureFrequency_0.0','InterveneCaptureFrequency_0.01','InterveneCaptureFrequency_0.14',
                'InterveneCaptureFrequency_0.04','SurroundCaptureFrequency_0.0','SurroundCaptureFrequency_0.01','SurroundCaptureFrequency_0.03','SurroundCaptureFrequency_0.02',
                'NumPlayPhase_3','NumPlayPhase_4','NumPlayPhase_5','NumPlayPhase_6','NumPlayPhase_7','NumPlayPhase_8','LineLossFrequency_0.0','LineLossFrequency_0.96',
                'LineLossFrequency_0.87','LineLossFrequency_0.46','LineLossFrequency_0.26','LineLossFrequency_0.88','LineLossFrequency_0.94','ConnectionEndFrequency_0.0',
                'ConnectionEndFrequency_0.19','ConnectionEndFrequency_1.0','ConnectionEndFrequency_0.23','ConnectionEndFrequency_0.94','ConnectionEndFrequency_0.35',
                'ConnectionEndFrequency_0.97','ConnectionLossFrequency_0.0','ConnectionLossFrequency_0.54','ConnectionLossFrequency_0.78','GroupEndFrequency_0.0','GroupEndFrequency_1.0',
                'GroupEndFrequency_0.11','GroupEndFrequency_0.79','GroupWinFrequency_0.0','GroupWinFrequency_0.11','GroupWinFrequency_1.0','LoopEndFrequency_0.0','LoopEndFrequency_0.14',
                'LoopEndFrequency_0.66','LoopWinFrequency_0.0','LoopWinFrequency_0.14','LoopWinFrequency_0.66','PatternEndFrequency_0.0','PatternEndFrequency_0.63',
                'PatternEndFrequency_0.35','PatternWinFrequency_0.0','PatternWinFrequency_0.63','PatternWinFrequency_0.35','NoTargetPieceWinFrequency_0.0','NoTargetPieceWinFrequency_0.72',
                'NoTargetPieceWinFrequency_0.77','NoTargetPieceWinFrequency_0.95','NoTargetPieceWinFrequency_0.32','NoTargetPieceWinFrequency_1.0','EliminatePiecesLossFrequency_0.0',
                'EliminatePiecesLossFrequency_0.85','EliminatePiecesLossFrequency_0.96','EliminatePiecesLossFrequency_0.68','EliminatePiecesDrawFrequency_0.0','EliminatePiecesDrawFrequency_0.03',
                'EliminatePiecesDrawFrequency_0.91','EliminatePiecesDrawFrequency_1.0','EliminatePiecesDrawFrequency_0.36','EliminatePiecesDrawFrequency_0.86','NoOwnPiecesLossFrequency_0.0',
                'NoOwnPiecesLossFrequency_1.0','NoOwnPiecesLossFrequency_0.68','FillEndFrequency_0.0','FillEndFrequency_1.0','FillEndFrequency_0.04','FillEndFrequency_0.01',
                'FillEndFrequency_0.99','FillEndFrequency_0.72','FillWinFrequency_0.0','FillWinFrequency_1.0','FillWinFrequency_0.04','FillWinFrequency_0.01',
                'FillWinFrequency_0.99','ReachDrawFrequency_0.0','ReachDrawFrequency_0.9','ReachDrawFrequency_0.98','ScoringLossFrequency_0.0','ScoringLossFrequency_0.6',
                'ScoringLossFrequency_0.62','NoMovesLossFrequency_0.0','NoMovesLossFrequency_1.0','NoMovesLossFrequency_0.13','NoMovesLossFrequency_0.06',
                'NoMovesDrawFrequency_0.0','NoMovesDrawFrequency_0.01','NoMovesDrawFrequency_0.04','NoMovesDrawFrequency_0.03','NoMovesDrawFrequency_0.22',
                'BoardSitesOccupiedChangeNumTimes_0.0','BoardSitesOccupiedChangeNumTimes_0.06','BoardSitesOccupiedChangeNumTimes_0.42','BoardSitesOccupiedChangeNumTimes_0.12',
                'BoardSitesOccupiedChangeNumTimes_0.14','BoardSitesOccupiedChangeNumTimes_0.94','BranchingFactorChangeNumTimesn_0.0','BranchingFactorChangeNumTimesn_0.3',
                'BranchingFactorChangeNumTimesn_0.02','BranchingFactorChangeNumTimesn_0.07','BranchingFactorChangeNumTimesn_0.04','BranchingFactorChangeNumTimesn_0.13',
                'BranchingFactorChangeNumTimesn_0.01','BranchingFactorChangeNumTimesn_0.21','BranchingFactorChangeNumTimesn_0.03','PieceNumberChangeNumTimes_0.0',
                'PieceNumberChangeNumTimes_0.06','PieceNumberChangeNumTimes_0.42','PieceNumberChangeNumTimes_0.12','PieceNumberChangeNumTimes_0.14',
                'PieceNumberChangeNumTimes_1.0','KintsBoard','FortyStonesWithFourGapsBoard','Roll','SumDice','CheckmateFrequency','NumDice_4'
            ]

            df.drop(['Id',
                     'Properties','Format','Time','Discrete','Realtime','Turns','Alternating','Simultaneous','HiddenInformation',
                     'Match','AsymmetricRules','AsymmetricPlayRules','AsymmetricEndRules','AsymmetricSetup','Players','NumPlayers',
                     'Simulation','Solitaire','TwoPlayer','Multiplayer','Coalition','Puzzle','DeductionPuzzle','PlanningPuzzle',
                     'Equipment','Container','Board','PrismShape','ParallelogramShape','RectanglePyramidalShape','TargetShape',
                     'BrickTiling','CelticTiling','QuadHexTiling','Hints','PlayableSites','Component','DiceD3','BiasedDice','Card','Domino','Rules',
                     'SituationalTurnKo','SituationalSuperko','InitialAmount','InitialPot','Play','BetDecision','BetDecisionFrequency','VoteDecisionFrequency',
                     'ChooseTrumpSuitDecision','ChooseTrumpSuitDecisionFrequency','LeapDecisionToFriend','LeapDecisionToFriendFrequency','HopDecisionEnemyToFriend',
                     'HopDecisionEnemyToFriendFrequency','HopDecisionFriendToFriend','FromToDecisionWithinBoard','FromToDecisionBetweenContainers','BetEffect','BetEffectFrequency',
                     'VoteEffectFrequency','SwapPlayersEffectFrequency','TakeControl','TakeControlFrequency','PassEffectFrequency','SetCost','SetCostFrequency','SetPhase',
                     'SetPhaseFrequency','SetTrumpSuit','SetTrumpSuitFrequency','StepEffectFrequency','SlideEffectFrequency','LeapEffectFrequency','HopEffectFrequency','FromToEffectFrequency',
                     'SwapPiecesEffect','SwapPiecesEffectFrequency','ShootEffect','ShootEffectFrequency','MaxCapture','OffDiagonalDirection','Information','HidePieceType','HidePieceOwner',
                     'HidePieceCount','HidePieceRotation','HidePieceValue','HidePieceState','InvisiblePiece','End','LineDrawFrequency','ConnectionDraw','ConnectionDrawFrequency','GroupLossFrequency',
                     'GroupDrawFrequency','LoopLossFrequency','LoopDraw','LoopDrawFrequency','PatternLoss','PatternLossFrequency','PatternDraw','PatternDrawFrequency','PathExtentEndFrequency',
                     'PathExtentWinFrequency','PathExtentLossFrequency','PathExtentDraw','PathExtentDrawFrequency','TerritoryLoss','TerritoryLossFrequency','TerritoryDraw','TerritoryDrawFrequency',
                     'CheckmateLoss','CheckmateLossFrequency','CheckmateDraw','CheckmateDrawFrequency','NoTargetPieceLoss','NoTargetPieceLossFrequency','NoTargetPieceDraw','NoTargetPieceDrawFrequency',
                     'NoOwnPiecesDraw','NoOwnPiecesDrawFrequency','FillLoss','FillLossFrequency','FillDraw','FillDrawFrequency','ScoringDrawFrequency','NoProgressWin','NoProgressWinFrequency',
                     'NoProgressLoss','NoProgressLossFrequency','SolvedEnd','Behaviour','StateRepetition','PositionalRepetition','SituationalRepetition','Duration','Complexity','BoardCoverage',
                     'GameOutcome','StateEvaluation','Clarity','Narrowness','Variance','Decisiveness','DecisivenessMoves','DecisivenessThreshold','LeadChange','Stability','Drama','DramaAverage',
                     'DramaMedian','DramaMaximum','DramaMinimum','DramaVariance','DramaChangeAverage','DramaChangeSign','DramaChangeLineBestFit','DramaChangeNumTimes','DramaMaxIncrease','DramaMaxDecrease',
                     'MoveEvaluation','MoveEvaluationAverage','MoveEvaluationMedian','MoveEvaluationMaximum','MoveEvaluationMinimum','MoveEvaluationVariance','MoveEvaluationChangeAverage','MoveEvaluationChangeSign',
                     'MoveEvaluationChangeLineBestFit','MoveEvaluationChangeNumTimes','MoveEvaluationMaxIncrease','MoveEvaluationMaxDecrease','StateEvaluationDifference','StateEvaluationDifferenceAverage',
                     'StateEvaluationDifferenceMedian','StateEvaluationDifferenceMaximum','StateEvaluationDifferenceMinimum','StateEvaluationDifferenceVariance','StateEvaluationDifferenceChangeAverage',
                     'StateEvaluationDifferenceChangeSign','StateEvaluationDifferenceChangeLineBestFit','StateEvaluationDifferenceChangeNumTimes','StateEvaluationDifferenceMaxIncrease',
                     'StateEvaluationDifferenceMaxDecrease','BoardSitesOccupied','BoardSitesOccupiedMinimum','BranchingFactor','BranchingFactorMinimum','DecisionFactor','DecisionFactorMinimum',
                     'MoveDistance','MoveDistanceMinimum','PieceNumber','PieceNumberMinimum','ScoreDifference','ScoreDifferenceMinimum','ScoreDifferenceChangeNumTimes','Roots','Cosine','Sine','Tangent',
                     'Exponential','Logarithm','ExclusiveDisjunction','Float','HandComponent','SetHidden','SetInvisible','SetHiddenCount','SetHiddenRotation','SetHiddenState','SetHiddenValue','SetHiddenWhat',
                     'SetHiddenWho',
                     'num_wins_agent1','num_draws_agent1','num_losses_agent1',
                     'Behaviour','StateRepetition','Duration','Complexity','BoardCoverage','GameOutcome','StateEvaluation','Clarity','Decisiveness','Drama','MoveEvaluation','StateEvaluationDifference','BoardSitesOccupied','BranchingFactor','DecisionFactor','MoveDistance','PieceNumber','ScoreDifference',
                     'selection1','selection2','exploration_const1','exploration_const2','playout1','playout2','score_bounds1','score_bounds2'
                    ]+drop_cols,axis=1,inplace=True,errors='ignore')
            
            df=self.reduce_mem_usage(df)
            print(f"feature_count:{len(df.columns)}")
            print("-"*30)
            return df

        def CV_feats(self,df,mode='',model_name='',fold=0):
            # text features
            str_cols=['EnglishRules', 'LudRules']
            for col in str_cols:
                df=self.clean(df,col)
                df[f'{col}_len']=df[col].apply(len)
                if mode=='train':
                    tfidf = TfidfVectorizer(max_features=275,ngram_range=(2,3))
                    tfidf_feats=tfidf.fit_transform(df[col]).toarray()
                    for i in range(tfidf_feats.shape[1]):
                        df[f"{col}_tfidf_{i}"]=tfidf_feats[:,i]
                    self.pickle_dump(tfidf,f'{model_name}_{fold}_{col}tfidf.model')
                    self.tfidf_paths.append((model_name,fold,col))
                else:
                    for i in range(len(self.tfidf_paths)):
                        if (model_name,fold,col)==self.tfidf_paths[i]:
                            tfidf=self.pickle_load(f'{model_name}_{fold}_{col}tfidf.model')
                            tfidf_feats=tfidf.transform(df[col]).toarray()
                            for j in range(tfidf_feats.shape[1]):
                                df[f"{col}_tfidf_{j}"]=tfidf_feats[:,j]
            df.drop(str_cols+['agent1','agent2'],axis=1,inplace=True)
            return df 
        
        def RMSE(self,y_true,y_pred):
            return np.sqrt(np.mean((y_true-y_pred)**2))
        
        def train_model(self,):
            self.train=self.FE(self.train,mode='train')
            cat_params1={
                'task_type': "GPU",
                'eval_metric': "RMSE",
                'bagging_temperature': 0.50,
                'iterations': 3072,
                'learning_rate': 0.08,
                'max_depth': 10,
                'l2_leaf_reg': 1.25,
                'min_data_in_leaf': 24,
                'random_strength': 0.25, 
                'verbose': 0,
            }
            
            cat_params2={
                'task_type': "GPU",
                'eval_metric': "RMSE",
                'bagging_temperature': 0.60,
                'iterations': 3072,
                'learning_rate': 0.08,
                'max_depth': 10,
                'l2_leaf_reg': 1.25,
                'min_data_in_leaf': 24,
                'random_strength': 0.20, 
                'max_bin':2048,
                'verbose': 0,
            }
            models=[
                (CatBoostRegressor(**cat_params1),'cat1'),
                (CatBoostRegressor(**cat_params2),'cat2'),
            ]
            
            for (model,model_name) in models:
                print("start training")
                X=self.train.drop([self.target,'GameRulesetName'],axis=1)
                GameRulesetName=self.train['GameRulesetName']
                y=self.train[self.target]
                oof_preds=np.zeros(len(X))
                
                y_int=round(y*15)
                
                sgkf = StratifiedGroupKFold(n_splits=self.num_folds,random_state=2024,shuffle=True)
                for fold, (train_index, valid_index) in (enumerate(sgkf.split(X,y_int,GameRulesetName))):
                    print(f"fold:{fold}")

                    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
                    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

                    X_train=self.CV_feats(X_train,mode='train',model_name=model_name,fold=fold)
                    X_valid=self.CV_feats(X_valid,mode='test',model_name=model_name,fold=fold)

                    model.fit(X_train, y_train,
                        eval_set=(X_valid, y_valid),
                        early_stopping_rounds=100, verbose=100)
                    
                    oof_preds[valid_index]=model.predict(X_valid)

                    self.pickle_dump(model,f'{model_name}_{fold}.model')
                    self.model_paths.append((model_name,fold))

                    del X_train,X_valid,y_train,y_valid
                    gc.collect()
                
                np.save(f"{model_name}_oof.npy",np.clip(oof_preds*1.1,-0.985,0.985))
                
                print(f"RMSE:{self.RMSE(y.values,np.clip(oof_preds*1.1,-0.985,0.985) )}")
                
        def infer_model(self,test):
            test=self.FE(test,mode='test')
            test.drop(['GameRulesetName'],axis=1,inplace=True)
            test_preds=[]
            for i in range(len(self.model_paths)):
                model_name,fold=self.model_paths[i]
                test_copy=self.CV_feats(test.copy(),mode='test',model_name=model_name,fold=fold)
                model=self.pickle_load(f'{model_name}_{fold}.model')
                test_preds+=[np.clip(model.predict(test_copy)*1.1,-0.985,0.985)]
            return np.mean(test_preds,axis=0)
        
    preprocessor=Preprocessor(num_folds=5,train=train)
    counter = 0
    def predict(test, submission):
        if model_1.counter == 0:
            model_1.preprocessor.train_model()  
        model_1.counter += 1
        return model_1.preprocessor.infer_model(test.to_pandas())


len(train):233234
len(test):3


In [3]:
import re
import tqdm

In [4]:
import os
import sys
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

In [5]:
import numpy as np
import polars as pl
import pandas as pd
import plotly.graph_objects as go

In [6]:
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [7]:
import lightgbm as lgb
from catboost import CatBoostRegressor
import kaggle_evaluation.mcts_inference_server
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error as mse

In [8]:
class model_2:
    class CFG:
    
        importances_path = Path("importances_3.csv")    
        train_path = Path('train.csv')
        batch_size = 65536

        early_stop = 500
        n_splits = 5
        color = '#C9A9A6'

        lgb_w = 0.75
        lgb_p = {
            'objective': 'regression',
            'min_child_samples': 24,
            'num_iterations': 20000,
            'learning_rate': 0.07,
            'extra_trees': True,
            'reg_lambda': 0.8,
            'reg_alpha': 0.1,
            'num_leaves': 64,
            'metric': 'rmse',
            'device': 'cpu',
            'max_depth': 24,
            'max_bin': 128,
            'verbose': -1,
            'seed': 42
        }

        ctb_w = 0.35
        ctb_p = {
            'loss_function': 'RMSE',
            'learning_rate': 0.03,
            'num_trees': 20000,
            'random_state': 42,
            'task_type': 'GPU',
            'reg_lambda': 0.8,
            'depth': 8
        }
    class FE:

        def __init__(self, batch_size, importances_path):
            self.batch_size = batch_size
            self.importances_path = importances_path

        def drop_cols(self, df, bad_cols=None):
            # column cleanup
            cols = ['Id', 
                    'LudRules', 
                    'EnglishRules',
                    'num_wins_agent1',
                    'num_draws_agent1',
                    'num_losses_agent1',
                   ]
            df = df.drop([col for col in cols if col in df.columns])
            df = df.drop([col for col in df.columns if df.select(pl.col(col).null_count()).item() == df.height])
            bad_cols = [col for col in df.columns if df.select(pl.col(col).n_unique()).item() == 1] if bad_cols is None else bad_cols
            df = df.drop(bad_cols)

            df = df.to_pandas()
            importances = pd.read_csv(self.importances_path)
            drop_features = importances['drop_features'].tolist()
            df = df.drop(columns=drop_features,axis=1)
            df = pl.from_pandas(df)
            return df, bad_cols

        def cast_datatypes(self, df):
            cat_cols = ['GameRulesetName', 'agent1', 'agent2','p1_selection', 'p2_selection']
            df = df.with_columns([pl.col(col).cast(pl.String) for col in cat_cols])   
            for col in df.columns:
                if col not in cat_cols:
                    val = df.select(pl.col(col).drop_nulls().first()).item()
                    df = df.with_columns(pl.col(col).cast(pl.Int16) if isinstance(val, int) else pl.col(col).cast(pl.Float32))   
            return df    

        def info(self, df):
            print(f'Shape: {df.shape}')   
            mem = df.estimated_size() / 1024**2
            print('Memory usage: {:.2f} MB\n'.format(mem))


        def split_agent(self, df):
            # agent text parse
            df = df.with_columns(
                pl.col('agent1').str.extract(r'MCTS-(.*)-(.*)-(.*)-(.*)', 1).alias('p1_selection'),
                pl.col('agent1').str.extract(r'MCTS-(.*)-(.*)-(.*)-(.*)', 2).alias('p1_exploration').cast(pl.Float32),
                pl.col('agent1').str.extract(r'MCTS-(.*)-(.*)-(.*)-(.*)', 3).alias('p1_playout'),
                pl.col('agent1').str.extract(r'MCTS-(.*)-(.*)-(.*)-(.*)', 4).alias('p1_bounds'),
                pl.col('agent2').str.extract(r'MCTS-(.*)-(.*)-(.*)-(.*)', 1).alias('p2_selection'),
                pl.col('agent2').str.extract(r'MCTS-(.*)-(.*)-(.*)-(.*)', 2).alias('p2_exploration').cast(pl.Float32),
                pl.col('agent2').str.extract(r'MCTS-(.*)-(.*)-(.*)-(.*)', 3).alias('p2_playout'),
                pl.col('agent2').str.extract(r'MCTS-(.*)-(.*)-(.*)-(.*)', 4).alias('p2_bounds')
            )
            return df

        def get_tables(self, df):
            df_pandas = df.to_pandas()
            results_df_selection = (
                df_pandas.groupby(['p1_selection', 'p2_selection'], as_index=False)['utility_agent1'].mean()
                .rename(columns={'utility_agent1': 'p_selection_means'})
            )
            results_df_playout = (
                df_pandas.groupby(['p1_playout', 'p2_playout'], as_index=False)['utility_agent1'].mean()
                .rename(columns={'utility_agent1': 'p_playout_means'})
            )
            results_df_selection = pl.DataFrame(results_df_selection)
            results_df_playout = pl.DataFrame(results_df_playout)
            return results_df_playout, results_df_selection


        def add_columns(self, df, results_df_playout, results_df_selection):
            merged_df = df.join(
                results_df_playout,
                on=["p1_playout", "p2_playout"],
                how="left"
            )
            merged_df = merged_df.join(
                results_df_selection,
                on=["p1_selection", "p2_selection"],
                how="left"
            )
            merged_df = merged_df.to_pandas() 
            score_mapping = {
                'MAST': 2,
                'Random200': 1,
                'NST': 0
            }
            merged_df['p1_playout'] = merged_df['p1_playout'].map(score_mapping).astype(int)
            merged_df['p2_playout'] = merged_df['p2_playout'].map(score_mapping).astype(int)
            merged_df["p1_bounds"] = merged_df["p1_bounds"].map({"true": True, "false": False})
            merged_df["p2_bounds"] = merged_df["p2_bounds"].map({"true": True, "false": False})
            merged_df = pl.from_pandas(merged_df)
            return merged_df

        def clean(self,df,col):
            df[col]=df[col].fillna("nan")
            df[col]=df[col].apply(lambda x:x.lower())
            ps='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
            for p in ps:
                df[col]=df[col].apply(lambda x:x.replace(p,' '))
            return df

        def ARI(self,txt):
            characters=len(txt)
            words=len(re.split(' |\\n|\\.|\\?|\\!|\,',txt))
            sentence=len(re.split('\\.|\\?|\\!',txt))
            ari_score=4.71*(characters/words)+0.5*(words/sentence)-21.43
            return ari_score

        def McAlpine_EFLAW(self,txt):
            W=len(re.split(' |\\n|\\.|\\?|\\!|\,',txt))
            S=len(re.split('\\.|\\?|\\!',txt))
            mcalpine_eflaw_score=(W+S*W)/S
            return mcalpine_eflaw_score

        def CLRI(self,txt):
            characters=len(txt)
            words=len(re.split(' |\\n|\\.|\\?|\\!|\,',txt))
            sentence=len(re.split('\\.|\\?|\\!',txt))
            L=100*characters/words
            S=100*sentence/words
            clri_score=0.0588*L-0.296*S-15.8
            return clri_score

        def ludrules_edit(self,df):
            # drop name + parse player
            df = df.to_pandas()

            def clean(self,df,col):
                df[col]=df[col].fillna("nan")
                df[col]=df[col].apply(lambda x:x.lower())
                ps='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
                for p in ps:
                    df[col]=df[col].apply(lambda x:x.replace(p,' '))
                return df

            def drop_gamename(rule):
                rule=rule[len('(game "'):]
                for i in range(len(rule)):
                    if rule[i]=='"':
                        return rule[i+1:]

            print("first:",df["LudRules"][0])
            df['LudRules']=df['LudRules'].apply(lambda x:drop_gamename(x))
            print("end:",df["LudRules"][0])

            def get_player(rule):
                player=''
                stack=[]
                for i in range(len(rule)):
                    player+=rule[i]
                    if rule[i] in ['(','{']:
                        stack.append(rule[i])
                    elif rule[i] in [')','}']:
                        stack=stack[:-1]
                        if len(stack)==0:
                            return player

            df['player']=df['LudRules'].apply(lambda rule:get_player(rule))
            print("first:",df["player"][0])
            df=self.clean(df,'player')
            print("end:",df["player"][0])
            df['player_len']=df['player'].apply(len)
            print("player_len:",df["player_len"][0])
            print("ludrules1:",df["LudRules"][0])
            df['LudRules']=[rule[len(player):] for player,rule in zip(df['player'],df['LudRules'])]
            print("ludrules2:",df["LudRules"][0])
            df.drop(['player'],axis=1,inplace=True)

            print("Rules readable")
            for rule in ['EnglishRules', 'LudRules']:
                df[rule+"_ARI"]=df[rule].apply(lambda x:self.ARI(x))
                df[rule+"CLRI"]=df[rule].apply(lambda x:self.CLRI(x))
                df[rule+"McAlpine_EFLAW"]=df[rule].apply(lambda x:self.McAlpine_EFLAW(x))

            df['PlayoutsPerSecond/MovesPerSecond']=df['PlayoutsPerSecond']/df['MovesPerSecond']
            print("len cols:",len(df.columns))
            df = pl.from_pandas(df)
            return df

        def get_same_cols(self, df):
            # detect identical columns
            def find_identical_columns(df):
                identical_columns = {}
                columns = df.columns
                for i, col1 in enumerate(columns):
                    for col2 in columns[i+1:]:
                        if df[col1].equals(df[col2]):
                            identical_columns.setdefault(col1, []).append(col2)
                return identical_columns

            identical_columns = find_identical_columns(df)
            print("Tamamen ayni kolonlar:")
            print(identical_columns)
            same_cols = list(set([item for sublist in identical_columns.values() for item in sublist]))
            return same_cols

        def drop_same_cols(self, df, same_cols):
            df = df.drop(same_cols)
            return df

        def apply_fe(self, path):
            df = pl.read_csv(path, batch_size=self.batch_size)
            df = self.split_agent(df)
            df = self.ludrules_edit(df)
            print("1",len(df.columns))
            results_df_playout, results_df_selection = self.get_tables(df)
            print("2")
            df = self.add_columns(df, results_df_playout, results_df_selection)
            print("3")
            df, bad_cols = self.drop_cols(df)
            print("4")
            df = self.cast_datatypes(df)
            print("5")
            same_cols = self.get_same_cols(df)
            print("6")
            df = self.drop_same_cols(df, same_cols)
            print("7")
            self.info(df)

            cat_cols = [col for col in df.columns if df[col].dtype == pl.String]
            return df, bad_cols, cat_cols, results_df_playout, results_df_selection, same_cols

    fe = FE(CFG.batch_size, CFG.importances_path)
    class MD:

        def __init__(self, 
                     importances_path, 
                     early_stop, 
                     n_splits, 
                     lgb_p, 
                     ctb_p, 
                     lgb_w,
                     ctb_w,
                     color,
                    ):
            self.importances_path = importances_path
            self.early_stop = early_stop
            self.n_splits = n_splits
            self.lgb_p = lgb_p
            self.ctb_p = ctb_p
            self.lgb_w = lgb_w
            self.ctb_w = ctb_w
            self.color = color

        def plot_cv(self, fold_scores, title):
            fold_scores = [round(score, 3) for score in fold_scores]
            mean_score = round(np.mean(fold_scores), 3)
            std_score = round(np.std(fold_scores), 3)

            fig = go.Figure()
            fig.add_trace(go.Scatter(
                x = list(range(1, len(fold_scores) + 1)),
                y = fold_scores,
                mode = 'markers', 
                name = 'Fold Scores',
                marker = dict(size = 24, color=self.color, symbol='diamond'),
                text = [f'{score:.3f}' for score in fold_scores],
                hovertemplate = 'Fold %{x}: %{text}<extra></extra>',
                hoverlabel=dict(font=dict(size=16))  
            ))
            fig.add_trace(go.Scatter(
                x = [1, len(fold_scores)],
                y = [mean_score, mean_score],
                mode = 'lines',
                name = f'Mean: {mean_score:.3f}',
                line = dict(dash = 'dash', color = '#FFBF00'),
                hoverinfo = 'none'
            ))
            fig.update_layout(
                title = f'{title} | Cross-Validation RMSE Scores | Variation of CV scores: {mean_score} ± {std_score}',
                xaxis_title = 'Fold',
                yaxis_title = 'RMSE Score',
                plot_bgcolor = 'rgba(0,0,0,0)',
                paper_bgcolor = 'rgba(0,0,0,0)',
                xaxis = dict(
                    gridcolor = 'lightgray',
                    tickmode = 'linear',
                    tick0 = 1,
                    dtick = 1,
                    range = [0.5, len(fold_scores) + 0.5]
                ),
                yaxis = dict(gridcolor = 'lightgray')
            )
            fig.show() 

        def train_model(self, data, cat_cols, title):
            for col in cat_cols:
                data[col] = data[col].astype('category')

            X = data.drop(['utility_agent1'], axis=1)
            y = data['utility_agent1']
            group = data['GameRulesetName']
            cv = GroupKFold(n_splits=self.n_splits)
            models, scores = [], []
            oof_preds = np.zeros(len(X))

            for fold, (train_index, valid_index) in enumerate(cv.split(X, y, group)):
                X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
                y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
                print(f'Fold {fold+1} | {X_train.shape[0]:,} train rows | {X_valid.shape[0]:,} valid rows | {X_train.shape[1]} features')

                if title.startswith('LightGBM'):
                    model = lgb.LGBMRegressor(**self.lgb_p)
                    model.fit(X_train, y_train,
                              eval_set=[(X_valid, y_valid)],
                              eval_metric='rmse',
                              callbacks=[lgb.early_stopping(self.early_stop, verbose=0), lgb.log_evaluation(0)])
                elif title.startswith('CatBoost'):
                    model = CatBoostRegressor(**self.ctb_p, verbose=0, cat_features=cat_cols)
                    model.fit(X_train, y_train,
                              eval_set=(X_valid, y_valid),
                              early_stopping_rounds=self.early_stop, verbose=0)

                models.append(model)
                oof_preds[valid_index] = model.predict(X_valid)
                score = mse(y_valid, oof_preds[valid_index], squared=False)
                print("score:",score)
                scores.append(score)

            self.plot_cv(scores, title)
            return models, oof_preds


        def inference(self, data, cat_cols, lgb_models, ctb_models, lgb_models_oof, ctb_models_oof):
            for col in cat_cols:
                data[col] = data[col].astype('category')

            data['lgb_oof_preds'] = np.mean([model.predict(data) for model in lgb_models], axis=0)
            data['ctb_oof_preds'] = np.mean([model.predict(data) for model in ctb_models], axis=0)
            lgb_preds = np.mean([model.predict(data) for model in lgb_models_oof], axis=0)  
            ctb_preds = np.mean([model.predict(data) for model in ctb_models_oof], axis=0)  
            all_preds = lgb_preds * self.lgb_w + ctb_preds * self.ctb_w
            all_preds = np.clip(all_preds, -0.985, 0.985)
            return all_preds
    md = MD(CFG.importances_path, 
        CFG.early_stop, 
        CFG.n_splits, 
        CFG.lgb_p, 
        CFG.ctb_p, 
        CFG.lgb_w,
        CFG.ctb_w,
        CFG.color)
    bad_cols = None
    cat_cols = None
    lgb_models = None
    ctb_models = None
    lgb_models_oof = None
    ctb_models_oof = None
    results_df_selection = None
    results_df_playout = None
    same_cols = None
    
    def train_model():
        train, model_2.bad_cols, model_2.cat_cols, model_2.results_df_playout, model_2.results_df_selection, model_2.same_cols = model_2.fe.apply_fe(model_2.CFG.train_path)
        train = train.to_pandas()

        model_2.lgb_models, model_2.lgb_oof_preds = model_2.md.train_model(train, model_2.cat_cols, title='LightGBM')
        model_2.ctb_models, model_2.ctb_oof_preds = model_2.md.train_model(train, model_2.cat_cols, title='CatBoost') 

        train['lgb_oof_preds'] = model_2.lgb_oof_preds
        train['ctb_oof_preds'] = model_2.ctb_oof_preds

        model_2.lgb_models_oof, _ = model_2.md.train_model(train, model_2.cat_cols, title='LightGBM w/ OOF Predictions')
        model_2.ctb_models_oof, _ = model_2.md.train_model(train, model_2.cat_cols, title='CatBoost w/ OOF Predictions')
    counter = 0
    def predict(test, submission):
        if model_2.counter == 0:
            model_2.train_model() 
        model_2.counter += 1
        test = model_2.fe.split_agent(test)
        print(len(test.columns))
        test = model_2.fe.ludrules_edit(test)
        print(len(test.columns))
        test = model_2.fe.add_columns(test, model_2.results_df_playout, model_2.results_df_selection)
        print(len(test.columns))
        test, _ = model_2.fe.drop_cols(test, model_2.bad_cols)
        print(len(test.columns))
        test = model_2.fe.drop_same_cols(test, model_2.same_cols)
        print(len(test.columns))
        test = model_2.fe.cast_datatypes(test)
        print(len(test.columns))
        test = test.to_pandas()
        print(len(test.columns))
        return  model_2.md.inference(test, model_2.cat_cols, model_2.lgb_models, model_2.ctb_models, model_2.lgb_models_oof, model_2.ctb_models_oof)


In [9]:
def predict(test, submission):
    result_1 = model_1.predict(test, submission)
    result_2 = model_2.predict(test, submission)
    results_final = result_1 * 0.60 + result_2 * 0.50
    results_final_clipped = np.clip(results_final, -0.985, 0.985)
    return submission.with_columns(pl.Series('utility_agent1',results_final_clipped))

In [10]:
inference_server = kaggle_evaluation.mcts_inference_server.MCTSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv',
            '/kaggle/input/um-game-playing-strength-of-mcts-variants/sample_submission.csv'
        )
    )

FE:train
agent position feature
deal with outliers
agent1 agent2 feature
one_hot_encoder
deal with LudRules
1:drop game
2:player
Rules readable
Memory usage of dataframe is 740.02 MB
Memory usage after optimization is: 245.12 MB
Decreased by 66.9%
feature_count:443
------------------------------
start training
fold:0
0:	learn: 0.6050425	test: 0.6096701	best: 0.6096701 (0)	total: 17.3s	remaining: 14h 46m 20s
100:	learn: 0.3519127	test: 0.4574709	best: 0.4574709 (100)	total: 21.5s	remaining: 10m 33s
200:	learn: 0.3113398	test: 0.4450364	best: 0.4450364 (200)	total: 25.6s	remaining: 6m 5s
300:	learn: 0.2872501	test: 0.4399531	best: 0.4399302 (296)	total: 29.6s	remaining: 4m 32s
400:	learn: 0.2713657	test: 0.4373644	best: 0.4373644 (400)	total: 33.6s	remaining: 3m 43s
500:	learn: 0.2601585	test: 0.4360392	best: 0.4360090 (494)	total: 37.5s	remaining: 3m 12s
600:	learn: 0.2513304	test: 0.4351057	best: 0.4351057 (600)	total: 41.5s	remaining: 2m 50s
700:	learn: 0.2436645	test: 0.4339431	best:

Fold 1 | 186,550 train rows | 46,684 valid rows | 305 features
score: 0.4658489855536145
Fold 2 | 186,596 train rows | 46,638 valid rows | 305 features
score: 0.4827054902449785
Fold 3 | 186,594 train rows | 46,640 valid rows | 305 features
score: 0.44474106820692977
Fold 4 | 186,598 train rows | 46,636 valid rows | 305 features
score: 0.4847780498523948
Fold 5 | 186,598 train rows | 46,636 valid rows | 305 features
score: 0.4578800984009931


Fold 1 | 186,550 train rows | 46,684 valid rows | 307 features
score: 0.4348481261100112
Fold 2 | 186,596 train rows | 46,638 valid rows | 307 features
score: 0.4608741115649503
Fold 3 | 186,594 train rows | 46,640 valid rows | 307 features
score: 0.4304506477720695
Fold 4 | 186,598 train rows | 46,636 valid rows | 307 features
score: 0.46377158059913814
Fold 5 | 186,598 train rows | 46,636 valid rows | 307 features
score: 0.43405049486351305


Fold 1 | 186,550 train rows | 46,684 valid rows | 307 features
score: 0.45310727064453793
Fold 2 | 186,596 train rows | 46,638 valid rows | 307 features
score: 0.4694359030330564
Fold 3 | 186,594 train rows | 46,640 valid rows | 307 features
score: 0.4345584274955853
Fold 4 | 186,598 train rows | 46,636 valid rows | 307 features
score: 0.4709803207995498
Fold 5 | 186,598 train rows | 46,636 valid rows | 307 features
score: 0.45053008258524935


818
first: (game "00'Y'" (players 2) (equipment { (board (tri Limping 4) use:Vertex) (piece "Disc" Each) (piece "Counter" Neutral maxState:2) } ) (rules (play (priority { (if (is Prev Mover) (or (move Add (piece (id "Disc" Mover)) (to (sites Occupied by:Next) if:(< (count Pieces Mover in:(sites Around (to) Orthogonal) ) (count Pieces Next in:(sites Around (to) Orthogonal) ) ) (apply (remove (to))) ) ) (move Add (piece (id "Disc" Next)) (to (sites Occupied by:Mover) if:(< (count Pieces Next in:(sites Around (to) Orthogonal) ) (count Pieces Mover in:(sites Around (to) Orthogonal) ) ) (apply (remove (to))) ) ) ) ) (move Add (to (sites Empty)) (then (set Var "MoveInTurn" (% (+ 3 (var "MoveInTurn")) 2) ) ) ) } (then (if (or (= 1 (var "MoveInTurn")) (can Move (or (move Add (piece (id "Disc" Mover)) (to (sites Occupied by:Next) if:(< (count Pieces Mover in:(sites Around (to) Orthogonal ) ) (count Pieces Next in:(sites Around (to) Orthogonal ) ) ) (apply (remove (to))) ) ) (move Add (piece (id