In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd

import plotly.express as px

In [3]:
# Load the raw dataset and discard the two parent-company columns that the
# organising committee instructed to leave out.
df = pd.read_csv("data/champions_group_2025.csv").drop(
    columns=['Parent Company', 'Parent Country']
)

# Data Cleaning
1. Handling duplicates
1. Dropping unnecessary columns that do not provide insights
1. Handling null values

## Checking for duplicates

In [4]:
# Count repeated identifiers and fully repeated rows before reporting them
n_dup_account_id = df['AccountID'].duplicated().sum()
n_dup_company = df['Company'].duplicated().sum()
n_dup_rows = df.duplicated().sum()

print(f"Number of duplicated AccountID: {n_dup_account_id}")
print(f"Number of duplicated Company: {n_dup_company}")
print(f"Number of duplicated rows: {n_dup_rows}")

Number of duplicated AccountID: 0
Number of duplicated Company: 0
Number of duplicated rows: 0


Since there are no duplicates, there is no need to drop any rows.

## Dropping unnecessary columns
- Drop 'AccountID' and 'Company' as they are identifiers that do not provide
  meaningful analysis. Drop 'Company Description' as well.
- Drop 'Industry' and '8-Digit SIC Description'
  as they are descriptions for numerical codes that are already in the data.
- Drop 'Company Status (Active/Inactive)' as all rows are "active".

In [5]:
# Identifiers, free-text descriptions and the constant status column
cols_to_drop = [
    'AccountID',
    'Company',
    'Company Description',
    'Industry',
    '8-Digit SIC Description',
    'Company Status (Active/Inactive)',
]
df_cleaned_1 = df.drop(cols_to_drop, axis=1)

## Handling null values
1. Calculate absolute count and percentage of null values per column
1. Drop columns with 100% null value

In [6]:
# Count of null values per column
null_stats = pd.DataFrame(df_cleaned_1.isnull().sum(), columns=['null_count'])
# Percentage of rows that are null, relative to the frame being summarised
# (df_cleaned_1) and computed in one vectorised step
null_stats['null_percentage'] = (null_stats['null_count'] / len(df_cleaned_1) * 100).round(2)
null_stats.sort_values('null_percentage', ascending=False)

Unnamed: 0,null_count,null_percentage
Square Footage,29182,100.0
Import/Export Status,22569,77.34
Fiscal Year End,22445,76.91
Employees (Single Site),12403,42.5
Employees (Global Ultimate Total),2774,9.51
Year Found,434,1.49
LATITUDE,120,0.41
LONGITUDE,120,0.41
Employees (Domestic Ultimate Total),79,0.27
SIC Code,0,0.0


In [7]:
# Columns whose every value is null carry no information, so remove them
is_all_null = null_stats['null_percentage'] == 100
null_cols = null_stats.loc[is_all_null].index
df_cleaned_2 = df_cleaned_1.drop(columns=null_cols)

# Data Preparation

## Data Imputation
- For categorical data, impute missing data with "missing".
- For numerical data, perform median imputation.

In [8]:
def impute_data(col: pd.Series) -> pd.Series:
    '''
    Fill the missing values of a single column.

    Text columns (object or pandas string dtype) have their missing entries
    replaced with the literal "missing". Every other column is treated as
    numerical and its missing entries are replaced with the column median.
    A column without missing values is returned as is.

    Parameters:
        col (pandas.Series): Series to apply data imputation.

    Returns:
        pandas.Series: Series with data imputed. The input is not modified.
    '''
    if not col.isna().any():
        return col
    # Check for string dtypes as well as object so that text columns never
    # reach the median branch, where they would raise.
    if col.dtype == object or pd.api.types.is_string_dtype(col):
        return col.fillna("missing")
    return col.fillna(col.median(skipna=True))

In [9]:
# Impute every column independently
df_cleaned_3 = df_cleaned_2.apply(impute_data)

## Mapping columns
1. Map 'Year Found' column to the age of company as of 2025, which
is a more direct and intuitive way of comparison.
1. Shortening 'Fiscal Year End' column for ease of understanding.

### Mapping 'Year Found' column

In [10]:
cur_year = 2025
# rename() returns a new frame, so df_cleaned_3 keeps its original
# 'Year Found' values and this cell gives the same result when re-run.
df_cleaned_4 = df_cleaned_3.rename(columns={'Year Found': 'Age'})
df_cleaned_4['Age'] = cur_year - df_cleaned_4['Age']

### Mapping 'Fiscal Year End' column
- Since majority of the non-NA values are at the year end, we decided to map
'Fiscal Year End' to indicate they are at the year end or not.
- If data is missing, it will remain as 'missing'.
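- The mapped column takes one of three values, which are one-hot encoded
  together with the other categorical columns in the encoding section below:
  - "missing"
  - "no"
  - "yes"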

In [11]:
from datetime import datetime

def map_fiscal_year(date_str: str) -> str:
    '''
    Classify a 'Fiscal Year End' value by whether the fiscal year closes
    at the calendar year end (i.e. 31 Dec).

    Parameters:
        date_str (str): Either the placeholder "missing" or a timestamp
            string in "yyyy-mm-ddThh:mm:ssZ" format.

    Returns:
        str: One of ["missing", "yes", "no"].

    Raises:
        ValueError: If date_str is neither "missing" nor a timestamp in the
            expected format.
    '''
    if date_str == "missing":
        return "missing"
    timestamp = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")
    is_year_end = timestamp.month == 12 and timestamp.day == 31
    return "yes" if is_year_end else "no"

In [12]:
# Replace each timestamp (or "missing") with its "yes"/"no"/"missing" label
df_cleaned_4['Fiscal Year End'] = df_cleaned_4['Fiscal Year End'].apply(map_fiscal_year)

## Encoding for Categorical Data
- One Hot Encoding for categorical data with only a few categories (at most 6 here).
- Frequency Encoding for SIC codes, which have many categories.


### One Hot Encoding

In [13]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Low-cardinality categorical columns to expand into indicator columns;
# every other column is passed through untouched.
ohe_cols = ['Import/Export Status', 'Ownership Type', 'Fiscal Year End']
one_hot_encoder = ColumnTransformer(
    transformers=[("onehot", OneHotEncoder(sparse_output=False), ohe_cols)],
    remainder="passthrough",
)
one_hot_encoder.set_output(transform="pandas")  # return a DataFrame, not an array
df_encoded: pd.DataFrame = one_hot_encoder.fit_transform(df_cleaned_4)  # type: ignore

# Strip the transformer-name prefixes that ColumnTransformer adds to column names
df_encoded.columns = (
    df_encoded.columns
    .str.replace('onehot__', '')
    .str.replace('remainder__', '')
)

### Frequency Encoding

In [14]:
# Replace each SIC code with the number of rows that share it.
# NOTE(review): the counts are computed on the full dataset, before the
# train/test split further down, so test rows contribute to the encoding.
# Confirm this is acceptable for the evaluation.
freq_encoding_cols = ['SIC Code', '8-Digit SIC Code']
for col in freq_encoding_cols:
    counts = df_encoded[col].value_counts()
    df_encoded[col] = df_encoded[col].map(counts)

## Type Casting
- Convert all float64 columns to float32 for optimisation with ML models

In [15]:
# Downcast only the float64 columns to float32; all other dtypes are kept
df_final = df_encoded.astype(
    {name: 'float32' for name in df_encoded.select_dtypes(include='float64').columns}
)

In [16]:
# Summarise column dtypes and non-null counts of the prepared dataset
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29182 entries, 0 to 29181
Data columns (total 25 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Import/Export Status_Both Imports & Exports  29182 non-null  float32
 1   Import/Export Status_Exports                 29182 non-null  float32
 2   Import/Export Status_Imports                 29182 non-null  float32
 3   Import/Export Status_missing                 29182 non-null  float32
 4   Ownership Type_Non-Corporates                29182 non-null  float32
 5   Ownership Type_Nonprofit                     29182 non-null  float32
 6   Ownership Type_Partnership                   29182 non-null  float32
 7   Ownership Type_Private                       29182 non-null  float32
 8   Ownership Type_Public                        29182 non-null  float32
 9   Ownership Type_Public Sector                 29182 non-null  float32
 10

# Split training and test data
- Since there are 2 dependent variables ('Is Domestic Ultimate' and 'Is Global Ultimate'),
2 different sets of train-test data are needed
- Train-test will be 80-20 ratio

In [17]:
from sklearn.model_selection import train_test_split

# Both targets share the same feature matrix; only the label column differs
target_cols = ['Is Domestic Ultimate', 'Is Global Ultimate']
X = df_final.drop(columns=target_cols)
y_dom = df_final['Is Domestic Ultimate']
y_global = df_final['Is Global Ultimate']

# 80-20 split for domestic ultimate
X_dom_train, X_dom_test, y_dom_train, y_dom_test = train_test_split(
    X, y_dom, test_size=0.2, random_state=42
)

# 80-20 split for global ultimate (same seed and size as the domestic split)
X_global_train, X_global_test, y_global_train, y_global_test = train_test_split(
    X, y_global, test_size=0.2, random_state=42
)

# Model testing
- Initial testing of different models using default parameters.
- Evaluate different models using accuracy, F1 Score, AUC

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_validate
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

def evaluate_model(model,
                   X_train: pd.DataFrame,
                   y_train: pd.Series,
                   scale: bool=False,
                   oversample: bool=False) -> tuple[float, float, float]:
    '''
    Run 5-fold stratified cross validation on a model and report its mean
    accuracy, F1 and ROC AUC. Optional preprocessing steps are placed in a
    pipeline in front of the model.

    Parameters:
        model (Estimator-like object): An estimator with fit method.
        X_train (pandas.DataFrame): Independent variables of training dataset.
        y_train (pandas.Series): Dependent variable of training dataset.
        scale (bool): Whether to standardise the features before the model.
        oversample (bool): Whether to apply SMOTE before the model.

    Returns:
        tuple(float, float, float): accuracy, f1 score, ROC AUC score
    '''
    # Collect the requested preprocessing steps; scaling always precedes SMOTE
    steps = []
    if scale:
        steps.append(('scaler', StandardScaler()))
    if oversample:
        steps.append(('smote', SMOTE(sampling_strategy='auto', random_state=42)))

    # Use the bare model when no preprocessing was requested
    if steps:
        estimator = Pipeline([*steps, ('clf', model)])
    else:
        estimator = model

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_validate(estimator, X_train, y_train,
                                cv=folds,
                                scoring=["accuracy", "f1", "roc_auc"])

    mean_accuracy = cv_results['test_accuracy'].mean()
    mean_f1 = cv_results['test_f1'].mean()
    mean_roc_auc = cv_results['test_roc_auc'].mean()
    return mean_accuracy, mean_f1, mean_roc_auc

## Test for 'Is Domestic Ultimate'

In [19]:
# Class distribution of the domestic target
df_final['Is Domestic Ultimate'].value_counts()

Is Domestic Ultimate
1    14593
0    14589
Name: count, dtype: int64

Observation:
- Sample is balanced, no need for sampling techniques.

In [20]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Candidate classifiers with library-default hyperparameters, keyed by the
# display name used in the evaluation tables. Seeds are fixed where supported.
models = {
    "XGBoost": XGBClassifier(random_state=42),
    "LightGBM": LGBMClassifier(random_state=42, verbose=-1),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "SVC": SVC(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42)
}

In [21]:
data_dom = []  # One [name, accuracy, f1, roc auc] row per model
for name, model in models.items():
    # Only Logistic Regression and SVC are evaluated on standardised features
    needs_scaling = name in ("Logistic Regression", "SVC")
    score = evaluate_model(model, X_dom_train, y_dom_train, scale=needs_scaling)
    data_dom.append([name, *score])

eval_dom = pd.DataFrame(data_dom, columns=["model", "accuracy", "f1 score", "roc auc score"])
print(eval_dom)

                      model  accuracy  f1 score  roc auc score
0                   XGBoost  0.916428  0.919306       0.962604
1                  LightGBM  0.914114  0.917759       0.962473
2  Random Forest Classifier  0.902549  0.905676       0.950686
3                       SVC  0.650589  0.661126       0.700282
4       Logistic Regression  0.631741  0.622109       0.676703


## Test for 'Is Global Ultimate'

In [22]:
# Class distribution of the global target
df_final['Is Global Ultimate'].value_counts()

Is Global Ultimate
0    21675
1     7507
Name: count, dtype: int64

Observation:
- 'Is Global Ultimate' is imbalanced.

Solution:
- Apply SMOTE, which generates synthetic data to increase number of minority class.
- During Cross Validation, apply within each fold.

In [23]:
data_global = []  # One [name, accuracy, f1, roc auc] row per model
for name, model in models.items():
    # SMOTE is used for every model; only Logistic Regression and SVC are
    # additionally evaluated on standardised features.
    needs_scaling = name in ("Logistic Regression", "SVC")
    score = evaluate_model(model,
                           X_global_train, y_global_train,
                           scale=needs_scaling,
                           oversample=True)
    data_global.append([name, *score])

eval_global = pd.DataFrame(data_global, columns=["model", "accuracy", "f1 score", "roc auc score"])
print(eval_global)

                      model  accuracy  f1 score  roc auc score
0                   XGBoost  0.937974  0.884419       0.973923
1                  LightGBM  0.937331  0.884557       0.974445
2  Random Forest Classifier  0.936946  0.879611       0.973987
3                       SVC  0.672607  0.539626       0.779854
4       Logistic Regression  0.664339  0.540093       0.744232


## Conclusion from initial model testing:
Observation:
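- From the evaluation scores, we can see that XGBoost performs the best across all metrics
for 'Is Domestic Ultimate'. For 'Is Global Ultimate', XGBoost, LightGBM and Random Forest score
within about 0.005 of each other on every metric, with LightGBM marginally ahead on F1 and ROC AUC.
- Logistic Regression and SVC perform significantly worse than the tree-based models.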

# Feature selection using XGBoost Classifier

## Domestic Ultimate

In [24]:
# Train a default XGBoost classifier on the domestic training split
clf_dom = XGBClassifier(random_state=42)
clf_dom.fit(X_dom_train, y_dom_train)

# Importance score of each input feature
feature_importance = clf_dom.feature_importances_

# One row per feature, highest importance first
feature_importance_df = pd.DataFrame(
    {
        'feature': clf_dom.feature_names_in_,
        'importance': feature_importance
    }
).sort_values('importance', ascending=False)

px.bar(feature_importance_df,
       x='importance',
       y='feature',
       title="Feature importance for 'Is Domestic Ultimate'",
       text_auto=True,
       height=600,
       width=2000)

## Global Ultimate

In [25]:
# Train a default XGBoost classifier on the global training split
clf_global = XGBClassifier(random_state=42)
clf_global.fit(X_global_train, y_global_train)

# Importance score of each input feature
feature_importance = clf_global.feature_importances_

# One row per feature, highest importance first
feature_importance_df = pd.DataFrame(
    {
        'feature': clf_global.feature_names_in_,
        'importance': feature_importance
    }
).sort_values('importance', ascending=False)

px.bar(feature_importance_df,
       x='importance',
       y='feature',
       title="Feature importance for 'Is Global Ultimate'",
       text_auto=True,
       height=600,
       width=2000)

# Hypertuning parameters
Using Optuna to hypertune parameters

## XGBoost

In [26]:
import optuna
from sklearn.model_selection import cross_val_score


def hypertune_xgb(X_train: pd.DataFrame, y_train: pd.Series,
                  oversample: bool=False, n_trials: int=50) -> dict:
    '''
    Get the best parameters for XGBoost model.

    Each Optuna trial is scored by the mean F1 of a 5-fold cross validation
    on the training data.

    Parameters:
        X_train (pandas.DataFrame): Independent variables of training dataset.
        y_train (pandas.Series): Dependent variable of training dataset.
        oversample (bool): To apply SMOTE or not.
        n_trials (int): Number of Optuna trials to run.

    Returns:
        dict: Dictionary containing the best value for each tuned
        hyperparameter. The fixed settings ('objective', 'random_state')
        are not part of it.
    '''
    def objective(trial):
        # Define hyperparameter search space
        params = {
            'objective': "binary:logistic",
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 10),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
            'random_state': 42
        }
        model = XGBClassifier(**params)

        # Put SMOTE in a pipeline in front of the model when requested
        if oversample:
            model_pipeline = Pipeline(
                [
                    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
                    ('clf', model)
                ]
            )
        else:
            model_pipeline = model

        # Mean cross validation F1 is the value Optuna maximises
        return cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring='f1').mean()

    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(sampler=sampler, direction="maximize")
    study.optimize(objective, n_trials=n_trials)  # type: ignore

    return study.best_params


### Hypertuning for 'Is Domestic Ultimate'

In [27]:
# Tune XGBoost on the domestic training split (no oversampling)
best_xgb_params_dom = hypertune_xgb(X_dom_train, y_dom_train)

[I 2025-06-20 15:40:04,623] A new study created in memory with name: no-name-604d76f9-d60f-4ece-a043-0e45d42b0854
[I 2025-06-20 15:40:04,977] Trial 0 finished with value: 0.9093676622479532 and parameters: {'max_depth': 5, 'learning_rate': 0.28570714885887566, 'min_child_weight': 8, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 1.5599452033620265, 'reg_lambda': 0.2904180608409973, 'reg_alpha': 8.661761457749352}. Best is trial 0 with value: 0.9093676622479532.
[I 2025-06-20 15:40:05,331] Trial 1 finished with value: 0.9157389227438472 and parameters: {'max_depth': 7, 'learning_rate': 0.21534104756085318, 'min_child_weight': 1, 'subsample': 0.9849549260809971, 'colsample_bytree': 0.9162213204002109, 'gamma': 2.1233911067827616, 'reg_lambda': 0.9091248360355031, 'reg_alpha': 1.8340450985343382}. Best is trial 1 with value: 0.9157389227438472.
[I 2025-06-20 15:40:05,836] Trial 2 finished with value: 0.912910836212502 and parameters: {'max_depth': 5, 'le

In [28]:
# Build the (not yet fitted) XGBoost model from the tuned hyperparameters.
# study.best_params only holds the values Optuna suggested, so the settings
# that were fixed during tuning are added back to keep the same configuration.
xgb_model_dom = XGBClassifier(**{**best_xgb_params_dom,
                                 'objective': "binary:logistic",
                                 'random_state': 42})

### Hypertuning for 'Is Global Ultimate'

In [29]:
# Tune XGBoost on the global training split, with SMOTE enabled
best_xgb_params_global = hypertune_xgb(X_global_train, y_global_train, oversample=True)

[I 2025-06-20 15:40:30,423] A new study created in memory with name: no-name-459eca6b-e523-44a2-820c-477b74e50712
[I 2025-06-20 15:40:31,241] Trial 0 finished with value: 0.876938715964056 and parameters: {'max_depth': 5, 'learning_rate': 0.28570714885887566, 'min_child_weight': 8, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 1.5599452033620265, 'reg_lambda': 0.2904180608409973, 'reg_alpha': 8.661761457749352}. Best is trial 0 with value: 0.876938715964056.
[I 2025-06-20 15:40:32,096] Trial 1 finished with value: 0.8802799863820227 and parameters: {'max_depth': 7, 'learning_rate': 0.21534104756085318, 'min_child_weight': 1, 'subsample': 0.9849549260809971, 'colsample_bytree': 0.9162213204002109, 'gamma': 2.1233911067827616, 'reg_lambda': 0.9091248360355031, 'reg_alpha': 1.8340450985343382}. Best is trial 1 with value: 0.8802799863820227.
[I 2025-06-20 15:40:33,190] Trial 2 finished with value: 0.8795957064895328 and parameters: {'max_depth': 5, 'lea

In [30]:
# Build the (not yet fitted) XGBoost model from the tuned hyperparameters.
# study.best_params only holds the values Optuna suggested, so the settings
# that were fixed during tuning are added back to keep the same configuration.
xgb_model_global = XGBClassifier(**{**best_xgb_params_global,
                                    'objective': "binary:logistic",
                                    'random_state': 42})

## Random Forest

In [None]:
def hypertune_rf(X_train: pd.DataFrame, y_train: pd.Series,
                 oversample: bool=False, n_trials: int=20) -> dict:
    '''
    Get the best parameters for random forest model.

    Each Optuna trial is scored by the mean F1 of a 5-fold cross validation
    on the training data.

    Parameters:
        X_train (pandas.DataFrame): Independent variables of training dataset.
        y_train (pandas.Series): Dependent variable of training dataset.
        oversample (bool): To apply SMOTE or not.
        n_trials (int): Number of Optuna trials to run.

    Returns:
        dict: Dictionary containing the best value for each tuned
        hyperparameter. The fixed 'random_state' is not part of it.
    '''
    def objective(trial):
        # Define hyperparameter search space
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'max_depth': trial.suggest_int('max_depth', 5, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
            'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
            'random_state': 42
        }
        model = RandomForestClassifier(**params)

        # Put SMOTE in a pipeline in front of the model when requested
        if oversample:
            model_pipeline = Pipeline(
                [
                    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
                    ('clf', model)
                ]
            )
        else:
            model_pipeline = model

        # Mean cross validation F1 is the value Optuna maximises
        return cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring='f1').mean()

    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(sampler=sampler, direction="maximize")
    study.optimize(objective, n_trials=n_trials)  # type: ignore

    return study.best_params

### Hypertuning for 'Is Domestic Ultimate'

In [32]:
# Tune Random Forest on the domestic training split (no oversampling)
best_rf_params_dom = hypertune_rf(X_dom_train, y_dom_train)

[I 2025-06-20 15:41:26,715] A new study created in memory with name: no-name-853cb3ed-1556-45cf-a087-854749267661
[I 2025-06-20 15:41:41,355] Trial 0 finished with value: 0.9080490323932459 and parameters: {'n_estimators': 175, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.9080490323932459.
[I 2025-06-20 15:41:58,471] Trial 1 finished with value: 0.9046361121354174 and parameters: {'n_estimators': 220, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.9080490323932459.
[I 2025-06-20 15:42:11,172] Trial 2 finished with value: 0.9075516255527039 and parameters: {'n_estimators': 161, 'max_depth': 13, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.9080490323932459.
[I 2025-06-20 15:42:21,915] Trial 3 finished with value: 0.9045685091273

In [33]:
# Build the (not yet fitted) Random Forest from the tuned hyperparameters.
# study.best_params does not include the seed fixed during tuning; without it
# the forest is unseeded and gives different results on every run.
rf_model_dom = RandomForestClassifier(**{**best_rf_params_dom, 'random_state': 42})

### Hypertuning for 'Is Global Ultimate'

In [34]:
# Tune Random Forest on the global training split, with SMOTE enabled
best_rf_params_global = hypertune_rf(X_global_train, y_global_train, oversample=True)

[I 2025-06-20 15:46:17,140] A new study created in memory with name: no-name-4ea35a02-ceb1-4bd4-bdf0-cd24639862ca
[I 2025-06-20 15:46:40,941] Trial 0 finished with value: 0.8807357106846109 and parameters: {'n_estimators': 175, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.8807357106846109.
[I 2025-06-20 15:47:08,306] Trial 1 finished with value: 0.8766992456885976 and parameters: {'n_estimators': 220, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.8807357106846109.
[I 2025-06-20 15:47:27,467] Trial 2 finished with value: 0.8778455777376237 and parameters: {'n_estimators': 161, 'max_depth': 13, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.8807357106846109.
[I 2025-06-20 15:47:44,171] Trial 3 finished with value: 0.8766831853892

In [35]:
# Build the (not yet fitted) Random Forest from the tuned hyperparameters.
# study.best_params does not include the seed fixed during tuning; without it
# the forest is unseeded and gives different results on every run.
rf_model_global = RandomForestClassifier(**{**best_rf_params_global, 'random_state': 42})

## LightGBM Model

In [None]:
def hypertune_lgb(X_train: pd.DataFrame, y_train: pd.Series,
                  oversample: bool=False, n_trials: int=50) -> dict:
    '''
    Get the best parameters for lightgbm model.

    Each Optuna trial is scored by the mean F1 of a 5-fold cross validation
    on the training data.

    Parameters:
        X_train (pandas.DataFrame): Independent variables of training dataset.
        y_train (pandas.Series): Dependent variable of training dataset.
        oversample (bool): To apply SMOTE or not.
        n_trials (int): Number of Optuna trials to run.

    Returns:
        dict: Dictionary containing the best value for each tuned
        hyperparameter. The fixed settings ('objective', 'metric',
        'boosting', 'verbosity', 'random_state') are not part of it.
    '''
    def objective(trial):
        # Define hyperparameter search space
        params = {
            'objective': "binary",
            'metric': "binary_logloss",
            'boosting': "gbdt",
            'verbosity': -1,
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'num_leaves': trial.suggest_int('num_leaves', 20, 150),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'lambda_l1': trial.suggest_float('lambda_l1', 0, 5),
            'lambda_l2': trial.suggest_float('lambda_l2', 0, 5),
            'random_state': 42
        }
        model = LGBMClassifier(**params)

        # Put SMOTE in a pipeline in front of the model when requested
        if oversample:
            model_pipeline = Pipeline(
                [
                    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
                    ('clf', model)
                ]
            )
        else:
            model_pipeline = model

        # Mean cross validation F1 is the value Optuna maximises
        return cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring='f1').mean()  # type: ignore

    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(sampler=sampler, direction="maximize")
    study.optimize(objective, n_trials=n_trials)  # type: ignore

    return study.best_params

### Hypertuning for 'Is Domestic Ultimate'

In [37]:
# Tune LightGBM on the domestic training split (no oversampling)
best_lgb_params_dom = hypertune_lgb(X_dom_train, y_dom_train)

[I 2025-06-20 15:53:37,120] A new study created in memory with name: no-name-0bdf33ab-942e-48b1-8555-3363b6ef4ce6
[I 2025-06-20 15:53:45,078] Trial 0 finished with value: 0.9136948028926014 and parameters: {'n_estimators': 175, 'max_depth': 15, 'num_leaves': 115, 'learning_rate': 0.1836109604171406, 'bagging_fraction': 0.5780093202212182, 'bagging_freq': 2, 'min_child_samples': 10, 'lambda_l1': 4.330880728874676, 'lambda_l2': 3.005575058716044}. Best is trial 0 with value: 0.9136948028926014.
[I 2025-06-20 15:53:46,151] Trial 1 finished with value: 0.9116948953241459 and parameters: {'n_estimators': 242, 'max_depth': 3, 'num_leaves': 147, 'learning_rate': 0.2514083658321223, 'bagging_fraction': 0.6061695553391381, 'bagging_freq': 2, 'min_child_samples': 22, 'lambda_l1': 1.5212112147976886, 'lambda_l2': 2.6237821581611893}. Best is trial 0 with value: 0.9136948028926014.
[I 2025-06-20 15:53:48,904] Trial 2 finished with value: 0.9134455055435184 and parameters: {'n_estimators': 186, 'ma

In [38]:
# Build the (not yet fitted) LightGBM model from the tuned hyperparameters.
# study.best_params only holds the values Optuna suggested, so the settings
# that were fixed during tuning are added back to keep the same configuration.
lgb_model_dom = LGBMClassifier(**{**best_lgb_params_dom,
                                  'objective': "binary",
                                  'metric': "binary_logloss",
                                  'boosting': "gbdt",
                                  'verbosity': -1,
                                  'random_state': 42})

### Hypertuning for 'Is Global Ultimate'

In [39]:
# Tune LightGBM on the global training split, with SMOTE enabled
best_lgb_params_global = hypertune_lgb(X_global_train, y_global_train, oversample=True)

[I 2025-06-20 15:56:44,978] A new study created in memory with name: no-name-c127243d-e939-4311-8368-24267c4c5653
[I 2025-06-20 15:56:53,017] Trial 0 finished with value: 0.882907722149397 and parameters: {'n_estimators': 175, 'max_depth': 15, 'num_leaves': 115, 'learning_rate': 0.1836109604171406, 'bagging_fraction': 0.5780093202212182, 'bagging_freq': 2, 'min_child_samples': 10, 'lambda_l1': 4.330880728874676, 'lambda_l2': 3.005575058716044}. Best is trial 0 with value: 0.882907722149397.
[I 2025-06-20 15:56:54,769] Trial 1 finished with value: 0.8816413133863854 and parameters: {'n_estimators': 242, 'max_depth': 3, 'num_leaves': 147, 'learning_rate': 0.2514083658321223, 'bagging_fraction': 0.6061695553391381, 'bagging_freq': 2, 'min_child_samples': 22, 'lambda_l1': 1.5212112147976886, 'lambda_l2': 2.6237821581611893}. Best is trial 0 with value: 0.882907722149397.
[I 2025-06-20 15:56:57,836] Trial 2 finished with value: 0.8776409556700484 and parameters: {'n_estimators': 186, 'max_d

In [40]:
# Build the (not yet fitted) LightGBM model from the tuned hyperparameters.
# study.best_params only holds the values Optuna suggested, so the settings
# that were fixed during tuning are added back to keep the same configuration.
lgb_model_global = LGBMClassifier(**{**best_lgb_params_global,
                                     'objective': "binary",
                                     'metric': "binary_logloss",
                                     'boosting': "gbdt",
                                     'verbosity': -1,
                                     'random_state': 42})

# Stacking
- Stacking XGBoost, LightGBM and Random Forest together.
- Use Logistic Regression as the final estimator.

## Stacking for 'Is Domestic Ultimate'

In [41]:
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report


# Tuned base learners whose predictions feed the logistic regression meta-learner
base_estimators_dom = [('rf', rf_model_dom), ('xgb', xgb_model_dom), ('lgb', lgb_model_dom)]
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stack_model_dom = StackingClassifier(
    estimators=base_estimators_dom,  # type: ignore
    final_estimator=LogisticRegression(),
    cv=cv,
    passthrough=False
)

# Train on the training split, then predict the held-out test split
stack_model_dom.fit(X_dom_train, y_dom_train)
stack_pred_dom = stack_model_dom.predict(X_dom_test)

# Per-class precision / recall / F1 on the test split
report = classification_report(y_dom_test, stack_pred_dom)  # type: ignore
print(report)

              precision    recall  f1-score   support

           0       0.94      0.90      0.92      2908
           1       0.91      0.95      0.93      2929

    accuracy                           0.92      5837
   macro avg       0.92      0.92      0.92      5837
weighted avg       0.92      0.92      0.92      5837



## Stacking for 'Is Global Ultimate'

In [42]:
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report


# Tuned base learners whose predictions feed the logistic regression meta-learner
base_estimators_global = [('rf', rf_model_global), ('xgb', xgb_model_global), ('lgb', lgb_model_global)]
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stack_model_global = StackingClassifier(
    estimators=base_estimators_global,  # type: ignore
    final_estimator=LogisticRegression(),
    cv=cv,
    passthrough=False
)

# Oversample the minority class of the training split only; the test split
# is left untouched.
# NOTE(review): SMOTE runs once here, before StackingClassifier performs its
# internal cross validation, so synthetic samples can sit in both the fitting
# and the held-out part of those inner folds. Confirm this is intended.
smote = SMOTE(sampling_strategy="auto", random_state=42)
smote_data = smote.fit_resample(X_global_train, y_global_train)
X_global_train_resampled, y_global_train_resampled = smote_data

stack_model_global.fit(X_global_train_resampled, y_global_train_resampled)
stack_pred_global = stack_model_global.predict(X_global_test)

# Per-class precision / recall / F1 on the test split
report = classification_report(y_global_test, stack_pred_global)  # type: ignore
print(report)

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      4314
           1       0.87      0.91      0.89      1523

    accuracy                           0.94      5837
   macro avg       0.92      0.93      0.92      5837
weighted avg       0.94      0.94      0.94      5837

