In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/um-game-playing-strength-of-mcts-variants/sample_submission.csv
/kaggle/input/um-game-playing-strength-of-mcts-variants/train.csv
/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv
/kaggle/input/um-game-playing-strength-of-mcts-variants/concepts.csv
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/mcts_gateway.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/__init__.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/mcts_inference_server.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/core/templates.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/core/base_gateway.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/core/relay.py
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/um-game-playing-strength-of-mcts-variants/kaggle_evalua

# Data Loading

In [2]:
def trainTestLoad(train_path, test_path, target_col_name=''):
    # Read training data
    train_df = pd.read_csv(train_path)
    
    # Separate features and target variable in training data
    X_train = train_df.drop(target_col_name, axis=1)
    y_train = train_df[target_col_name]
    
    # Read test data
    X_test = pd.read_csv(test_path)
    
    print(f"Training data shape: {X_train.shape}")
    print(f"Training target shape: {y_train.shape}")
    print(f"Test data shape: {X_test.shape}")
    
    return X_train, y_train, X_test

In [3]:
train_file_path = "/kaggle/input/um-game-playing-strength-of-mcts-variants/train.csv"
test_file_path = "/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv"

X_train, y_train, test = trainTestLoad(train_file_path, test_file_path,
                                              target_col_name="utility_agent1")

Training data shape: (233234, 813)
Training target shape: (233234,)
Test data shape: (3, 810)


# Analyzing Missing Columns & Variance Columns

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

def analyzeMissingCols(df, missing_percentage=1):
    missing = df.isnull().sum()
    missing_percent = 100 * df.isnull().sum() / len(df)
    missing_table = pd.concat([missing, missing_percent], axis=1, keys=['Missing Values', 'Percent Missing'])
    missing_table = missing_table.sort_values('Percent Missing', ascending=False)
    missing_table = missing_table[missing_table["Percent Missing"] >= missing_percentage]
    
    
    print("Missing Data Analysis:")
    if missing_table['Missing Values'].sum() > 0:
        print(missing_table)
    else:
        print("No missing values found in the dataset.")
    
    return missing_table.index.tolist()

In [5]:
missing_values_columns = analyzeMissingCols(X_train)

Missing Data Analysis:
                           Missing Values  Percent Missing
Behaviour                          233234            100.0
Duration                           233234            100.0
PieceNumber                        233234            100.0
MoveDistance                       233234            100.0
Complexity                         233234            100.0
BoardCoverage                      233234            100.0
GameOutcome                        233234            100.0
DecisionFactor                     233234            100.0
ScoreDifference                    233234            100.0
StateEvaluation                    233234            100.0
Clarity                            233234            100.0
StateRepetition                    233234            100.0
BranchingFactor                    233234            100.0
Drama                              233234            100.0
Decisiveness                       233234            100.0
StateEvaluationDifference        

In [6]:
missing_values_columns

['Behaviour',
 'Duration',
 'PieceNumber',
 'MoveDistance',
 'Complexity',
 'BoardCoverage',
 'GameOutcome',
 'DecisionFactor',
 'ScoreDifference',
 'StateEvaluation',
 'Clarity',
 'StateRepetition',
 'BranchingFactor',
 'Drama',
 'Decisiveness',
 'StateEvaluationDifference',
 'MoveEvaluation',
 'BoardSitesOccupied']

## Zero Variance Columns

In [7]:
def zeroVarCols(df):
    # Check for zero-variance columns
    zero_var_columns = df.columns[df.nunique() == 1].tolist()
    print("\nZero-variance columns (all values are the same):")
    if zero_var_columns:
        for col in zero_var_columns:
            print(f"- {col}: {df[col].nunique()}")
        return zero_var_columns
    else:
        print("No zero-variance columns found.")

In [8]:
zero_var_columns = zeroVarCols(X_train)



Zero-variance columns (all values are the same):
- Properties: 1
- Format: 1
- Time: 1
- Discrete: 1
- Realtime: 1
- Turns: 1
- Alternating: 1
- Simultaneous: 1
- HiddenInformation: 1
- Match: 1
- AsymmetricRules: 1
- AsymmetricPlayRules: 1
- AsymmetricEndRules: 1
- AsymmetricSetup: 1
- Players: 1
- NumPlayers: 1
- Simulation: 1
- Solitaire: 1
- TwoPlayer: 1
- Multiplayer: 1
- Coalition: 1
- Puzzle: 1
- DeductionPuzzle: 1
- PlanningPuzzle: 1
- Equipment: 1
- Container: 1
- Board: 1
- PrismShape: 1
- ParallelogramShape: 1
- RectanglePyramidalShape: 1
- TargetShape: 1
- BrickTiling: 1
- CelticTiling: 1
- QuadHexTiling: 1
- Hints: 1
- PlayableSites: 1
- Component: 1
- DiceD3: 1
- BiasedDice: 1
- Card: 1
- Domino: 1
- Rules: 1
- SituationalTurnKo: 1
- SituationalSuperko: 1
- InitialAmount: 1
- InitialPot: 1
- Play: 1
- BetDecision: 1
- BetDecisionFrequency: 1
- VoteDecisionFrequency: 1
- ChooseTrumpSuitDecision: 1
- ChooseTrumpSuitDecisionFrequency: 1
- LeapDecisionToFriend: 1
- LeapDecis

# Data Cleaning

## Removal of NA Columns and Zero Variance Columns

In [9]:
def removeNAVarCols(df):
    # columns not needed
    columns_to_drop = ["Id", "LudRules", "EnglishRules", "GameRulesetName", 'agent1', 'agent2',
                       'num_draws_agent1', 'num_losses_agent1', 'num_wins_agent1']
    
    zero_var_columns = zeroVarCols(df)
    missing_values_coulmns = analyzeMissingCols(df)
    
    # concatenate all the features to drop them
    drop_features = columns_to_drop + zero_var_columns + missing_values_columns
    
    # drop columns
    selected_df = df.drop(drop_features, axis=1)
    
    print("Shape of data after dropping the features : \n", selected_df.shape)
    
    return selected_df

In [10]:
X_train = removeNAVarCols(X_train)


Zero-variance columns (all values are the same):
- Properties: 1
- Format: 1
- Time: 1
- Discrete: 1
- Realtime: 1
- Turns: 1
- Alternating: 1
- Simultaneous: 1
- HiddenInformation: 1
- Match: 1
- AsymmetricRules: 1
- AsymmetricPlayRules: 1
- AsymmetricEndRules: 1
- AsymmetricSetup: 1
- Players: 1
- NumPlayers: 1
- Simulation: 1
- Solitaire: 1
- TwoPlayer: 1
- Multiplayer: 1
- Coalition: 1
- Puzzle: 1
- DeductionPuzzle: 1
- PlanningPuzzle: 1
- Equipment: 1
- Container: 1
- Board: 1
- PrismShape: 1
- ParallelogramShape: 1
- RectanglePyramidalShape: 1
- TargetShape: 1
- BrickTiling: 1
- CelticTiling: 1
- QuadHexTiling: 1
- Hints: 1
- PlayableSites: 1
- Component: 1
- DiceD3: 1
- BiasedDice: 1
- Card: 1
- Domino: 1
- Rules: 1
- SituationalTurnKo: 1
- SituationalSuperko: 1
- InitialAmount: 1
- InitialPot: 1
- Play: 1
- BetDecision: 1
- BetDecisionFrequency: 1
- VoteDecisionFrequency: 1
- ChooseTrumpSuitDecision: 1
- ChooseTrumpSuitDecisionFrequency: 1
- LeapDecisionToFriend: 1
- LeapDecis

## Encoding Categorical Variables

In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

def encodingCat(df):
    categorical_cols = df.select_dtypes(include=['object']).columns
    numerical_cols = df.select_dtypes(exclude=['object']).columns
    
    # OneHotEncoder is used for nominal categorical data (non-ordinal, no specific order)
    one_hot_encoder = OneHotEncoder(drop='first', sparse=False)  # drop='first' to avoid multicollinearity

    # Option 2: Apply Label Encoding for categorical columns with ordinal data or when there are too many categories
    label_encoder = LabelEncoder()

    # Loop through the categorical columns and decide on the encoding
    for col in categorical_cols:
        # If the number of unique categories is high, use Label Encoding; else One-Hot Encoding
        if df[col].nunique() < 10:
            one_hot_encoded = pd.DataFrame(one_hot_encoder.fit_transform(df[[col]]), columns=one_hot_encoder.get_feature_names_out([col]))
            df = df.drop(columns=[col])  # Drop the original categorical column
            df = pd.concat([df, one_hot_encoded], axis=1)  # Concatenate the new One-Hot columns
        else:
            df[col] = label_encoder.fit_transform(df[col])  # Use Label Encoding for high cardinality columns
    return df

In [12]:
X_train = encodingCat(X_train)

In [13]:
X_train.shape

(233234, 588)

# Feature Engineering

## Let us now pick Top 10 Features from the Data

In [14]:
import numpy as np
def highCorr(df):
    # Calculate the correlation matrix
    corr_matrix = df.corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Find features with correlation greater than 0.9 (or any threshold you define)
    high_corr_features = [column for column in upper.columns if any(upper[column] > 0.82)]

    # Drop the highly correlated features
    df = df.drop(columns=high_corr_features)

    print(f"Highly correlated features removed: {high_corr_features}")
    return df

In [15]:
X_train = highCorr(X_train)

Highly correlated features removed: ['AsymmetricForces', 'AsymmetricPiecesType', 'Team', 'RegularShape', 'PolygonShape', 'Tiling', 'CircleTiling', 'SpiralTiling', 'ThreeMensMorrisBoardWithTwoTriangles', 'TrackLoop', 'NumPlayableSitesOnBoard', 'NumDiagonalDirections', 'NumInnerSites', 'NumLayers', 'NumEdges', 'NumCells', 'NumVertices', 'NumPerimeterSites', 'NumBottomSites', 'NumLeftSites', 'NumConvexCorners', 'NumContainers', 'NumPlayableSites', 'PieceDirection', 'Dice', 'Tile', 'NumComponentsTypePerPlayer', 'NumDice', 'SwapOption', 'PositionalSuperko', 'NumStartComponentsBoardPerPlayer', 'NumStartComponentsHandPerPlayer', 'NumStartComponentsPerPlayer', 'SwapPlayersDecision', 'PassDecision', 'ProposeDecision', 'AddDecisionFrequency', 'RotationDecisionFrequency', 'StepDecisionFrequency', 'StepDecisionToEmpty', 'StepDecisionToEmptyFrequency', 'SlideDecisionFrequency', 'SlideDecisionToEmptyFrequency', 'LeapDecisionToEmpty', 'LeapDecisionToEmptyFrequency', 'LeapDecisionToEnemy', 'HopDecisio

In [16]:
top_features = ['AdvantageP1', 'PlayoutsPerSecond',
       'DurationTurnsStdDev', 'DrawFrequency', 'DurationTurnsNotTimeouts',
       'OutcomeUniformity', 'PieceNumberAverage', 'DurationActions',
       'MovesPerSecond']

In [17]:
def impFeatures(df):
    df = df[top_features]
    return df

In [18]:
X_train = impFeatures(X_train)

In [19]:
X_train.shape

(233234, 9)

# Model Training

We can use different models and perform cross-validation to select the best one based on performance metrics.
    1.  Train multiple models.
    2.  Perform cross-validation to evaluate performance.
    3.  Select the best model based on cross-validation results.
   
Here are some popular models:
    •   Linear Regression.
    •   RandomForest.
    •   Gradient Boosting (GBM).
    •   XGBoost.
We will train each model, perform cross-validation, and select the best one.

In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from xgboost import XGBRegressor
import optuna
from math import sqrt

def modelTrain(df, df_target, scaler=True, test_size=0.4, random_state=42, n_trials=50):
    
    # Error metric
    def rmse(y_true, y_pred):
        return np.sqrt(mean_squared_error(y_true, y_pred))
    
    # Prepare the data
    X = df
    y = df_target
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # Objective function for Optuna
    def objective(trial):
        # Define the hyper-parameters to tune
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'max_depth': trial.suggest_int('max_depth', 3, 9),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 0.5),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
        }
        
        # Define XGBoost pipeline with optional scaler
        xgb_pipeline = Pipeline([
            ('scaler', StandardScaler() if scaler else 'passthrough'),
            ('regressor', XGBRegressor(random_state=random_state, **params))
        ])
        
        # Cross-validation
        kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
        
        # Perform cross-validation and return the mean RMSE
        cv_scores = cross_val_score(xgb_pipeline, X_train, y_train, cv=kf, 
                                    scoring=make_scorer(rmse, greater_is_better=False))
        return -cv_scores.mean()  # minimize RMSE
    
    # Perform Optuna optimization
    print(f"Running Optuna hyper-parameter tuning for XGBoost with {n_trials} trials...")
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)
    
    # Get the best hyper-parameters
    best_params = study.best_params
    print(f"Best Hyper-parameters: {best_params}")
    
    # Build the final model with the best parameters
    best_model = Pipeline([
        ('scaler', StandardScaler() if scaler else 'passthrough'),
        ('regressor', XGBRegressor(random_state=random_state, **best_params))
    ])
    
    # Train the best model on the full training data
    best_model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = best_model.predict(X_test)
    
    # Evaluate the model on the test data
    test_score = best_model.score(X_test, y_test)
    rmse_value = rmse(y_test, y_pred)
    
    print(f"Test R^2 Score: {test_score}")
    print(f"Test RMSE: {rmse_value}")
    
    return best_model, y_pred

# Example usage:
# modelTrain(df, df_target, n_trials=50)

In [21]:
best_model,y_pred = modelTrain(X_train,y_train)

[I 2024-10-05 08:28:45,345] A new study created in memory with name: no-name-0f187b7f-7722-4d97-a57a-6ea54c2c3a4f


Running Optuna hyper-parameter tuning for XGBoost with 50 trials...


[I 2024-10-05 08:28:51,033] Trial 0 finished with value: 0.4650493962364819 and parameters: {'n_estimators': 139, 'learning_rate': 0.08885841975374009, 'max_depth': 9, 'subsample': 0.8472907109924175, 'colsample_bytree': 0.6289632253188413, 'gamma': 0.033711846582090754, 'min_child_weight': 9}. Best is trial 0 with value: 0.4650493962364819.
[I 2024-10-05 08:28:55,395] Trial 1 finished with value: 0.4652826939787257 and parameters: {'n_estimators': 184, 'learning_rate': 0.16783692099559297, 'max_depth': 5, 'subsample': 0.581418621342531, 'colsample_bytree': 0.8386660898048803, 'gamma': 0.32670221287185136, 'min_child_weight': 1}. Best is trial 0 with value: 0.4650493962364819.
[I 2024-10-05 08:28:58,443] Trial 2 finished with value: 0.46522067152021884 and parameters: {'n_estimators': 64, 'learning_rate': 0.20553464902955146, 'max_depth': 9, 'subsample': 0.8177194265147236, 'colsample_bytree': 0.8959398274667187, 'gamma': 0.0481982718051025, 'min_child_weight': 2}. Best is trial 0 with

Best Hyper-parameters: {'n_estimators': 155, 'learning_rate': 0.14922867299742237, 'max_depth': 9, 'subsample': 0.9311462944322912, 'colsample_bytree': 0.7813426696424157, 'gamma': 0.202791557262319, 'min_child_weight': 1}
Test R^2 Score: 0.4421574467399644
Test RMSE: 0.46549150338097506


We have selected XGBoost as our best Model and now we will make predictions on test set(Splitted from the training set)

# Predictions and Submissions

In [22]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt


def predict_with_model(model, X_new, y_true=None):
    # Make predictions
    print("Running Inference.....\n")
    y_pred = model.predict(X_new)
    
    # If true values are provided, calculate and print performance metrics
    if y_true is not None:
        rmse = sqrt(mean_squared_error(y_true, y_pred))
        rmse = -rmse
        print(f"RMSE on new data: {rmse:.4f}")
        
         # Create a scatter plot of predicted vs actual values
        import matplotlib.pyplot as plt
        plt.figure(figsize=(10, 6))
        plt.scatter(y_true, y_pred, alpha=0.5)
        plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)
        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        plt.title('Actual vs Predicted Values on New Data')
        plt.show()
        
        # Create a residual plot
        residuals = y_true - y_pred
        plt.figure(figsize=(10, 6))
        plt.scatter(y_pred, residuals, alpha=0.5)
        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        plt.title('Residual Plot on New Data')
        plt.show()
        
    print("Completed Running Inference.....\n")
    
    return y_pred

In [23]:
# Ensure 'top_features' is a list
top_features = list(top_features)

# Select test data features based on predefined top features
test = test[top_features]

X_test_processed = test[top_features]

# Now, let's try to predict with the test dataset....
predictions = predict_with_model(best_model, X_test_processed)

Running Inference.....

Completed Running Inference.....



In [24]:
import kaggle_evaluation.mcts_inference_server
import mcts_inference_server
import polars as pl
import numpy as np

best_model = None
counter = 0

def predict(test, submission):
    global best_model, top_features, counter
    
    # Train the model if it's the first prediction call
    if counter == 0:
        best_model, y_pred = modelTrain(X_train, y_train)
    counter += 1
    
    # Select test data features based on predefined top features
    test = test[top_features]
    
    # Generate predictions with the trained model
    submissions = best_model.predict(test)
    
    print("Predictions Generated...!!!")
    
    # Convert the 'submissions' numpy array to a polars Series
    submission_pl = pl.DataFrame(submission)  # Convert the submission to a polars DataFrame if not already
    submissions_series = pl.Series('utility_agent1', submissions)
    
    # Add the predictions as a new column to the submission DataFrame
    submission_pl = submission_pl.with_columns(submissions_series)
    
    return submission_pl

In [25]:
import kaggle_evaluation.mcts_inference_server
import mcts_inference_server

# Set up the inference server
inference_server = kaggle_evaluation.mcts_inference_server.MCTSInferenceServer(predict)

# Check if it's a Kaggle competition rerun or running locally
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv',
            '/kaggle/input/um-game-playing-strength-of-mcts-variants/sample_submission.csv'
        )
    )

[I 2024-10-05 08:31:39,789] A new study created in memory with name: no-name-b2a881b3-27a9-4481-b628-815caa94fd47


Running Optuna hyper-parameter tuning for XGBoost with 50 trials...


[I 2024-10-05 08:31:44,805] Trial 0 finished with value: 0.46486716844370707 and parameters: {'n_estimators': 172, 'learning_rate': 0.14704882508292383, 'max_depth': 6, 'subsample': 0.9433630819874317, 'colsample_bytree': 0.6131907572915498, 'gamma': 0.016183440750162204, 'min_child_weight': 1}. Best is trial 0 with value: 0.46486716844370707.
[I 2024-10-05 08:31:48,918] Trial 1 finished with value: 0.46507567375300374 and parameters: {'n_estimators': 137, 'learning_rate': 0.12213610770066996, 'max_depth': 7, 'subsample': 0.6989772640401795, 'colsample_bytree': 0.7326440640386434, 'gamma': 0.39405486032654113, 'min_child_weight': 5}. Best is trial 0 with value: 0.46486716844370707.
[I 2024-10-05 08:31:50,957] Trial 2 finished with value: 0.46786896165781594 and parameters: {'n_estimators': 70, 'learning_rate': 0.2094787176622842, 'max_depth': 5, 'subsample': 0.8859823412172643, 'colsample_bytree': 0.8250280343462657, 'gamma': 0.031522982165250346, 'min_child_weight': 9}. Best is trial 

Best Hyper-parameters: {'n_estimators': 136, 'learning_rate': 0.05181115104099214, 'max_depth': 9, 'subsample': 0.9398939018276251, 'colsample_bytree': 0.7139431018440748, 'gamma': 0.07378932702217625, 'min_child_weight': 5}
Test R^2 Score: 0.44221668217961674
Test RMSE: 0.4654667882299688
Predictions Generated...!!!
