In [75]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.optimize import minimize
import seaborn as sns
import joblib
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import optuna
import lightgbm as lgb
import catboost as cb
from sklearn.linear_model import ElasticNet
import shap
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import logging
optuna.logging.set_verbosity(logging.WARNING)
from sklearn.preprocessing import RobustScaler
from numpy.random import dirichlet
from pandas.api.types import is_categorical_dtype, is_bool_dtype, is_float_dtype, is_integer_dtype
from sklearn.preprocessing import PowerTransformer
from optuna.integration import XGBoostPruningCallback
import warnings
warnings.filterwarnings('ignore', message="X does not have valid feature names")
oof_predcit_models = []

In [76]:
def apply_mixup_train_data(X_train_df, y_train_df, augmentation_factor=1.0, random_state=42):
    if augmentation_factor <= 0:
        return X_train_df.copy(), y_train_df.copy()

    X = X_train_df.reset_index(drop=True)
    y = y_train_df.reset_index(drop=True)
    N = len(X)
    if N == 0:
        return X.copy(), y.copy()

    N_aug = int(N * augmentation_factor)
    if N_aug <= 0:
        return X.copy(), y.copy()

    ohe_groups = []
    ohe_cols = set()

    rng = np.random.default_rng(random_state)
    idx_A = rng.integers(0, N, size=N_aug)
    idx_B = (idx_A + (rng.integers(1, N, size=N_aug) if N > 1 else 0)) % max(N, 1)
    lam = rng.beta(1.0, 1.0, size=N_aug).reshape(-1, 1)
    pickA = (lam.ravel() > 0.5)

    cols = list(X.columns)
    
    float_cols = [c for c in cols if is_float_dtype(X[c].dtype)] 
    int_cols = [c for c in cols if is_integer_dtype(X[c].dtype)] 
    bool_cols = [c for c in cols if is_bool_dtype(X[c].dtype)]
    
    int_cols = [c for c in int_cols if c not in bool_cols and c not in float_cols]
    
    handled_cols = set(float_cols) | set(int_cols) | set(bool_cols) 
    other_cols = [c for c in cols if c not in handled_cols]

    new_cols = {}

    if float_cols:
        A = X[float_cols].to_numpy()[idx_A]
        B = X[float_cols].to_numpy()[idx_B]
        M = lam * A + (1.0 - lam) * B
        for j, c in enumerate(float_cols):
            new_cols[c] = M[:, j].astype(X[c].dtype, copy=False)

    if int_cols:
        A = X[int_cols].astype('float64', copy=False).to_numpy()[idx_A]
        B = X[int_cols].astype('float64', copy=False).to_numpy()[idx_B]
        M = lam * A + (1.0 - lam) * B
        R = np.rint(M)
        mins = X[int_cols].astype('float64', copy=False).min().to_numpy()
        maxs = X[int_cols].astype('float64', copy=False).max().to_numpy()
        C = np.clip(R, mins, maxs)
        for j, c in enumerate(int_cols):
            new_cols[c] = C[:, j].astype(X[c].dtype, copy=False)

    if bool_cols:
        A = X[bool_cols].to_numpy()[idx_A]
        B = X[bool_cols].to_numpy()[idx_B]
        M = np.where(pickA[:, None], A, B)
        for j, c in enumerate(bool_cols):
            new_cols[c] = M[:, j].astype(X[c].dtype, copy=False)

    if other_cols:
        A = X[other_cols].to_numpy(dtype=object)[idx_A]
        B = X[other_cols].to_numpy(dtype=object)[idx_B]
        M = np.where(pickA[:, None], A, B)
        for j, c in enumerate(other_cols):
            new_cols[c] = M[:, j]

    X_new = pd.DataFrame({c: new_cols[c] for c in cols}, columns=cols)

    yA = y.to_numpy(dtype=np.float64)[idx_A]
    yB = y.to_numpy(dtype=np.float64)[idx_B]
    y_new = lam * yA + (1.0 - lam) * yB
    y_new_df = pd.DataFrame(y_new, columns=y.columns)

    X_aug = pd.concat([X, X_new], ignore_index=True)
    y_aug = pd.concat([y, y_new_df], ignore_index=True)
    return X_aug, y_aug

def target_encode_cv(X_train_df, y_train_df, X_test_df, categorical_col, cv):
    
    X_train_df = X_train_df.reset_index(drop=True)
    y_train_df = y_train_df.reset_index(drop=True)
    X_test_df = X_test_df.reset_index(drop=True)
    
    target_col = y_train_df.columns[0]
    
    oof_encoded = np.zeros(X_train_df.shape[0])
    test_encoded = np.zeros(X_test_df.shape[0])
    
    global_mean = y_train_df[target_col].mean()
    
    for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_df, y_train_df)):
        
        X_train_fold = X_train_df.iloc[train_idx]
        y_train_fold = y_train_df.iloc[train_idx]
        
        X_val_fold = X_train_df.iloc[val_idx]
        
        mean_encoding_map = y_train_fold.groupby(X_train_fold[categorical_col])[target_col].mean()
        
        oof_encoded[val_idx] = X_val_fold[categorical_col].map(mean_encoding_map).fillna(global_mean).values
        
        test_encoded_fold = X_test_df[categorical_col].map(mean_encoding_map).fillna(global_mean).values
        test_encoded += test_encoded_fold / cv.n_splits

    X_train_df[categorical_col] = oof_encoded
    X_test_df[categorical_col] = test_encoded
    
    return X_train_df, X_test_df

def generate_oof_elasticnet(X_train_df, y_train_df, X_test_df, cv, random_state=42):
    
    X_train_df = X_train_df.copy()
    X_test_df = X_test_df.copy()

    X_train_arr = X_train_df.values
    y_train_arr = y_train_df.values.flatten()
    X_test_arr = X_test_df.values

    oof_predictions = np.zeros(X_train_arr.shape[0])
    test_predictions = np.zeros(X_test_arr.shape[0])

    scaler = StandardScaler()

    for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_arr, y_train_arr)):

        model = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=random_state)

        X_train_fold, X_val_fold = X_train_arr[train_idx], X_train_arr[val_idx]
        y_train_fold = y_train_arr[train_idx]

        X_train_scaled = scaler.fit_transform(X_train_fold)

        X_val_scaled = scaler.transform(X_val_fold)
        X_test_scaled = scaler.transform(X_test_arr)

        model.fit(X_train_scaled, y_train_fold)
        oof_predcit_models.append({'fold': fold, 'model': model, 'scaler': X_train_scaled})

        oof_predictions[val_idx] = model.predict(X_val_scaled)

        test_predictions += model.predict(X_test_scaled) / cv.n_splits

    oof_col_name = 'OOF_ElasticNet'

    X_train_df[oof_col_name] = oof_predictions
    X_test_df[oof_col_name] = test_predictions

    return X_train_df, X_test_df

In [77]:
df = pd.read_csv('data/2025-01-01_2025-03-02_puzzle_com.twisted.rope.tangle.csv')
day = 60

df = df[['roas_d0','roas_d1','roas_d2','roas_d3',
        'cumulative_revenue_d0','cumulative_revenue_d1','cumulative_revenue_d2','cumulative_revenue_d3',
        'daily_revenue_d0','daily_revenue_d1','daily_revenue_d2',
        'unique_users_d0','unique_users_d1','unique_users_d2','unique_users_d3','daily_revenue_d3','cost',
        'ltv_d0', 'ltv_d1', 'ltv_d2', 'ltv_d3', f'ltv_d{day}']].copy()
df['ltv_mean'] = df[['ltv_d0', 'ltv_d1', 'ltv_d2', 'ltv_d3']].mean(axis=1)
df['roas_mean'] = df[['roas_d0','roas_d1','roas_d2','roas_d3']].mean(axis=1)
df['cumulative_revenue_mean'] = df[['cumulative_revenue_d0','cumulative_revenue_d1','cumulative_revenue_d2','cumulative_revenue_d3']].mean(axis=1)

df['ltv_std'] = df[['ltv_d0', 'ltv_d1', 'ltv_d2', 'ltv_d3']].std(axis=1)
df['roas_std'] = df[['roas_d0','roas_d1','roas_d2','roas_d3']].std(axis=1)
df['cumulative_revenue_std'] = df[['cumulative_revenue_d0','cumulative_revenue_d1','cumulative_revenue_d2','cumulative_revenue_d3']].std(axis=1)

df['ltv_growth'] = (df['ltv_d3'] - df['ltv_d0']) / (3 + 1e-9)
df['cumulative_revenue_growth'] = df['cumulative_revenue_d3'] - df['cumulative_revenue_d0']

df['revenue_acceleration'] = df['daily_revenue_d3'] - df['daily_revenue_d2'] - df['daily_revenue_d1'] + df['daily_revenue_d0']
df['user_acceleration'] = df['unique_users_d3'] - df['unique_users_d2'] - df['unique_users_d1'] + df['unique_users_d0']

df['roas_trend'] = df['roas_d3'] - df['roas_d0']
df['ltv_roas_ratio'] = df['ltv_d3'] / df['roas_d3']

df['ltv_slope_d0_d1'] = df['ltv_d1'] - df['ltv_d0']
df['ltv_slope_d1_d2'] = df['ltv_d2'] - df['ltv_d1']
df['ltv_slope_d2_d3'] = df['ltv_d3'] - df['ltv_d2']
df['ARPU_d0'] = df['daily_revenue_d0'] / df['unique_users_d0']
df['ARPU_d1'] = df['daily_revenue_d1'] / df['unique_users_d1']
df['ARPU_d2'] = df['daily_revenue_d2'] / df['unique_users_d2']
df['ARPU_d3'] = df['daily_revenue_d3'] / df['unique_users_d3']
df['retention_d1'] = df['unique_users_d1'] / df['unique_users_d0']
df['retention_d2'] = df['unique_users_d2'] / df['unique_users_d0']
df['retention_d3'] = df['unique_users_d3'] / df['unique_users_d0']

df['ltv_acceleration'] = df['ltv_slope_d2_d3'] - df['ltv_slope_d1_d2']
df['roas_slope_d0_d1'] = df['roas_d1'] - df['roas_d0']
df['roas_slope_d1_d2'] = df['roas_d2'] - df['roas_d1']
df['roas_slope_d2_d3'] = df['roas_d3'] - df['roas_d2']
df['roas_acceleration'] = df['roas_slope_d2_d3'] - df['roas_slope_d1_d2']
df['is_ltv_slowing_down'] = (df['ltv_acceleration'] < 0).astype(int)
df['is_roas_slowing_down'] = (df['roas_acceleration'] < 0).astype(int)

df['ltv_gain'] = df['ltv_d3'] - df['ltv_d0']
df['cumulative_users_d3'] = df['unique_users_d0'] + df['unique_users_d1'] + df['unique_users_d2'] + df['unique_users_d3']
df['ARPU_cumulative_d3'] = df['cumulative_revenue_d3'] / df['cumulative_users_d3']
df['ARPU_trend'] = (df['daily_revenue_d3'] / (df['unique_users_d3'] + 1e-9)) - (df['daily_revenue_d0'] / (df['unique_users_d0'] + 1e-9))
df['Payback_Velocity'] = (df['cumulative_revenue_d3'] / df['cost']) / 4
df['Acceleration_Ratio'] = df['revenue_acceleration'] / (df['user_acceleration'] + 1e-9)
df['CAC'] = df['cost'] / df['cumulative_users_d3']
df['ROAS_CV'] = df['roas_std'] / (df['roas_mean'] + 1e-9)
df['ERTI'] = df['cumulative_users_d3'] / df['cost']
df['LTV_CAC'] = df['ltv_d3'] / df['CAC']

df['daily_to_cumulative_revenue_ratio_d3'] = df['daily_revenue_d3'] / (df['cumulative_revenue_d3'] + 1e-9)
df['d0_cohort_value_d3'] = df['cumulative_revenue_d3'] / (df['unique_users_d0'] + 1e-9)
df['user_growth_d3_vs_d0'] = df['unique_users_d3'] / (df['unique_users_d0'] + 1e-9)
df['cost_per_revenue_d3'] = df['cost'] / (df['cumulative_revenue_d3'] + 1e-9)
df['arpu_d3_x_payback'] = df['ARPU_cumulative_d3'] * df['Payback_Velocity']
df['LTV_CV'] = df['ltv_std'] / (df['ltv_mean'] + 1e-9)
df['LTV_CAC_Trend'] = df['LTV_CAC'] * df['ARPU_trend']
df['Cost_LTV_Mean'] = df['cost'] * df['ltv_mean']
df['Payback_Accel'] = df['Payback_Velocity'] * df['ltv_acceleration']
df['ROAS_Std_Weighted'] = df['roas_std'] * df['LTV_CAC']
df['Daily_to_Cumul_LTV'] = df['daily_to_cumulative_revenue_ratio_d3'] * df['ltv_d3']

power_cols_base = ['ltv_mean', 'cost', 'LTV_CAC', 'Payback_Velocity', 'ltv_acceleration', 'roas_mean','ARPU_trend']
poly_transformer = PolynomialFeatures(degree=2, include_bias=False)
X_poly_base = df[power_cols_base]
X_poly_transformed = poly_transformer.fit_transform(X_poly_base)
new_poly_names = poly_transformer.get_feature_names_out(input_features=power_cols_base)
df_poly = pd.DataFrame(X_poly_transformed, columns=new_poly_names, index=df.index)
df = pd.concat([df.drop(columns=power_cols_base), df_poly], axis=1)
df = df.fillna(0)
eps = 1e-9
cv = KFold(n_splits=5, shuffle=True, random_state=42)

df.shape

(50, 100)

In [78]:
df = df.drop(columns=['daily_revenue_d0',
 'LTV_CAC ARPU_trend',
 'ltv_mean Payback_Velocity',
 'LTV_CAC roas_mean',
 'LTV_CAC Payback_Velocity',
 'roas_mean',
 'ARPU_cumulative_d3',
 'cost Payback_Velocity',
 'cost',
 'revenue_acceleration',
 'LTV_CV',
 'Payback_Velocity roas_mean',
 'Payback_Velocity',
 'cumulative_revenue_d2',
 'cumulative_revenue_growth',
 'Payback_Velocity ARPU_trend',
 'LTV_CAC^2',
 'cumulative_users_d3',
 'cumulative_revenue_mean',
 'Payback_Velocity ltv_acceleration',
 'roas_mean^2',
 'cost roas_mean',
 'cumulative_revenue_std',
 'cost_per_revenue_d3',
 'cumulative_revenue_d3',
 'ltv_mean cost',
 'ltv_acceleration roas_mean',
 'cumulative_revenue_d1',
 'cost^2',
 'is_ltv_slowing_down',
 'LTV_CAC_Trend',
 'Payback_Velocity^2',
 'is_roas_slowing_down',
 'arpu_d3_x_payback',
 'daily_revenue_d2',
 'daily_revenue_d3',
 'roas_mean ARPU_trend',
 'cost LTV_CAC',
 'LTV_CAC ltv_acceleration',
 'Payback_Accel',
 'roas_trend',
 'ROAS_Std_Weighted'])#

In [79]:
df_infer = df

In [80]:
df = pd.read_csv('data/raw_2025-04-01_2025-05-31_puzzle_com.twisted.rope.tangle.csv')
day = 60

df = df[['roas_d0','roas_d1','roas_d2','roas_d3',
        'cumulative_revenue_d0','cumulative_revenue_d1','cumulative_revenue_d2','cumulative_revenue_d3',
        'daily_revenue_d0','daily_revenue_d1','daily_revenue_d2',
        'unique_users_d0','unique_users_d1','unique_users_d2','unique_users_d3','daily_revenue_d3','cost',
        'ltv_d0', 'ltv_d1', 'ltv_d2', 'ltv_d3', f'ltv_d{day}']].copy()
df['ltv_mean'] = df[['ltv_d0', 'ltv_d1', 'ltv_d2', 'ltv_d3']].mean(axis=1)
df['roas_mean'] = df[['roas_d0','roas_d1','roas_d2','roas_d3']].mean(axis=1)
df['cumulative_revenue_mean'] = df[['cumulative_revenue_d0','cumulative_revenue_d1','cumulative_revenue_d2','cumulative_revenue_d3']].mean(axis=1)

df['ltv_std'] = df[['ltv_d0', 'ltv_d1', 'ltv_d2', 'ltv_d3']].std(axis=1)
df['roas_std'] = df[['roas_d0','roas_d1','roas_d2','roas_d3']].std(axis=1)
df['cumulative_revenue_std'] = df[['cumulative_revenue_d0','cumulative_revenue_d1','cumulative_revenue_d2','cumulative_revenue_d3']].std(axis=1)

df['ltv_growth'] = (df['ltv_d3'] - df['ltv_d0']) / (3 + 1e-9)
df['cumulative_revenue_growth'] = df['cumulative_revenue_d3'] - df['cumulative_revenue_d0']

df['revenue_acceleration'] = df['daily_revenue_d3'] - df['daily_revenue_d2'] - df['daily_revenue_d1'] + df['daily_revenue_d0']
df['user_acceleration'] = df['unique_users_d3'] - df['unique_users_d2'] - df['unique_users_d1'] + df['unique_users_d0']

df['roas_trend'] = df['roas_d3'] - df['roas_d0']
df['ltv_roas_ratio'] = df['ltv_d3'] / df['roas_d3']

df['ltv_slope_d0_d1'] = df['ltv_d1'] - df['ltv_d0']
df['ltv_slope_d1_d2'] = df['ltv_d2'] - df['ltv_d1']
df['ltv_slope_d2_d3'] = df['ltv_d3'] - df['ltv_d2']
df['ARPU_d0'] = df['daily_revenue_d0'] / df['unique_users_d0']
df['ARPU_d1'] = df['daily_revenue_d1'] / df['unique_users_d1']
df['ARPU_d2'] = df['daily_revenue_d2'] / df['unique_users_d2']
df['ARPU_d3'] = df['daily_revenue_d3'] / df['unique_users_d3']
df['retention_d1'] = df['unique_users_d1'] / df['unique_users_d0']
df['retention_d2'] = df['unique_users_d2'] / df['unique_users_d0']
df['retention_d3'] = df['unique_users_d3'] / df['unique_users_d0']

df['ltv_acceleration'] = df['ltv_slope_d2_d3'] - df['ltv_slope_d1_d2']
df['roas_slope_d0_d1'] = df['roas_d1'] - df['roas_d0']
df['roas_slope_d1_d2'] = df['roas_d2'] - df['roas_d1']
df['roas_slope_d2_d3'] = df['roas_d3'] - df['roas_d2']
df['roas_acceleration'] = df['roas_slope_d2_d3'] - df['roas_slope_d1_d2']
df['is_ltv_slowing_down'] = (df['ltv_acceleration'] < 0).astype(int)
df['is_roas_slowing_down'] = (df['roas_acceleration'] < 0).astype(int)

df['ltv_gain'] = df['ltv_d3'] - df['ltv_d0']
df['cumulative_users_d3'] = df['unique_users_d0'] + df['unique_users_d1'] + df['unique_users_d2'] + df['unique_users_d3']
df['ARPU_cumulative_d3'] = df['cumulative_revenue_d3'] / df['cumulative_users_d3']
df['ARPU_trend'] = (df['daily_revenue_d3'] / (df['unique_users_d3'] + 1e-9)) - (df['daily_revenue_d0'] / (df['unique_users_d0'] + 1e-9))
df['Payback_Velocity'] = (df['cumulative_revenue_d3'] / df['cost']) / 4
df['Acceleration_Ratio'] = df['revenue_acceleration'] / (df['user_acceleration'] + 1e-9)
df['CAC'] = df['cost'] / df['cumulative_users_d3']
df['ROAS_CV'] = df['roas_std'] / (df['roas_mean'] + 1e-9)
df['ERTI'] = df['cumulative_users_d3'] / df['cost']
df['LTV_CAC'] = df['ltv_d3'] / df['CAC']

df['daily_to_cumulative_revenue_ratio_d3'] = df['daily_revenue_d3'] / (df['cumulative_revenue_d3'] + 1e-9)
df['d0_cohort_value_d3'] = df['cumulative_revenue_d3'] / (df['unique_users_d0'] + 1e-9)
df['user_growth_d3_vs_d0'] = df['unique_users_d3'] / (df['unique_users_d0'] + 1e-9)
df['cost_per_revenue_d3'] = df['cost'] / (df['cumulative_revenue_d3'] + 1e-9)
df['arpu_d3_x_payback'] = df['ARPU_cumulative_d3'] * df['Payback_Velocity']
df['LTV_CV'] = df['ltv_std'] / (df['ltv_mean'] + 1e-9)
df['LTV_CAC_Trend'] = df['LTV_CAC'] * df['ARPU_trend']
df['Cost_LTV_Mean'] = df['cost'] * df['ltv_mean']
df['Payback_Accel'] = df['Payback_Velocity'] * df['ltv_acceleration']
df['ROAS_Std_Weighted'] = df['roas_std'] * df['LTV_CAC']
df['Daily_to_Cumul_LTV'] = df['daily_to_cumulative_revenue_ratio_d3'] * df['ltv_d3']

power_cols_base = ['ltv_mean', 'cost', 'LTV_CAC', 'Payback_Velocity', 'ltv_acceleration', 'roas_mean','ARPU_trend']
poly_transformer = PolynomialFeatures(degree=2, include_bias=False)
X_poly_base = df[power_cols_base]
X_poly_transformed = poly_transformer.fit_transform(X_poly_base)
new_poly_names = poly_transformer.get_feature_names_out(input_features=power_cols_base)
df_poly = pd.DataFrame(X_poly_transformed, columns=new_poly_names, index=df.index)
df = pd.concat([df.drop(columns=power_cols_base), df_poly], axis=1)
df = df.fillna(0)
eps = 1e-9
cv = KFold(n_splits=5, shuffle=True, random_state=42)

df.shape

(509, 100)

In [81]:
df = df.drop(columns=['daily_revenue_d0',
 'LTV_CAC ARPU_trend',
 'ltv_mean Payback_Velocity',
 'LTV_CAC roas_mean',
 'LTV_CAC Payback_Velocity',
 'roas_mean',
 'ARPU_cumulative_d3',
 'cost Payback_Velocity',
 'cost',
 'revenue_acceleration',
 'LTV_CV',
 'Payback_Velocity roas_mean',
 'Payback_Velocity',
 'cumulative_revenue_d2',
 'cumulative_revenue_growth',
 'Payback_Velocity ARPU_trend',
 'LTV_CAC^2',
 'cumulative_users_d3',
 'cumulative_revenue_mean',
 'Payback_Velocity ltv_acceleration',
 'roas_mean^2',
 'cost roas_mean',
 'cumulative_revenue_std',
 'cost_per_revenue_d3',
 'cumulative_revenue_d3',
 'ltv_mean cost',
 'ltv_acceleration roas_mean',
 'cumulative_revenue_d1',
 'cost^2',
 'is_ltv_slowing_down',
 'LTV_CAC_Trend',
 'Payback_Velocity^2',
 'is_roas_slowing_down',
 'arpu_d3_x_payback',
 'daily_revenue_d2',
 'daily_revenue_d3',
 'roas_mean ARPU_trend',
 'cost LTV_CAC',
 'LTV_CAC ltv_acceleration',
 'Payback_Accel',
 'roas_trend',
 'ROAS_Std_Weighted'])#

In [82]:
features = df.drop(columns=[f'ltv_d{day}']).columns.tolist()
target = f'ltv_d{day}'

X = df[features]
y = df[[target]]
X_infer = df_infer[features]
y_infer = df_infer[[target]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, y_train = apply_mixup_train_data(X_train, y_train, augmentation_factor=1.0, random_state=42)

In [83]:
features = X_train.columns.tolist()
preprocessor_X = PowerTransformer(method='yeo-johnson', standardize=False)
preprocessor_y = PowerTransformer(method='yeo-johnson', standardize=False)

X_train_transformed = pd.DataFrame(preprocessor_X.fit_transform(X_train), columns=features)
X_test_transformed = pd.DataFrame(preprocessor_X.transform(X_test), columns=features)
X_infer_transformed = pd.DataFrame(preprocessor_X.transform(X_infer), columns=features)
y_train_transformed = pd.DataFrame(preprocessor_y.fit_transform(y_train), columns=[f'ltv_d{day}'])
y_test_transformed = pd.DataFrame(preprocessor_y.transform(y_test), columns=[f'ltv_d{day}'])
y_infer_transformed = pd.DataFrame(preprocessor_y.transform(y_infer), columns=[f'ltv_d{day}'])

In [84]:
print(X_train_transformed.shape)
print(X_test_transformed.shape)

(814, 57)
(102, 57)


In [85]:
X_train_transformed.reset_index(drop=True, inplace=True)
y_test_transformed.reset_index(drop=True, inplace=True)
X_test_transformed.reset_index(drop=True, inplace=True)
y_train_transformed.reset_index(drop=True, inplace=True)
X_train_transformed, X_test_transformed = generate_oof_elasticnet(X_train_transformed, y_train_transformed, X_test_transformed, cv)

In [43]:
print(X_train_transformed.shape)
print(X_test_transformed.shape)

(814, 58)
(102, 58)


In [None]:
def nae_eval_metric(y_pred, y_true):
    y_true_transformed = y_true.get_label()
    y_pred_transformed = y_pred

    y_true_original = preprocessor_y.inverse_transform(
        y_true_transformed.reshape(-1, 1)
    ).flatten()
    
    y_pred_original = preprocessor_y.inverse_transform(
        y_pred_transformed.reshape(-1, 1)
    ).flatten()
    
    nae = np.mean(
        np.abs(y_true_original - y_pred_original) / (np.abs(y_true_original)+ eps)
    )
    
    return 'custom_nae', nae * 100

In [11]:
d_train_xgboost = xgb.DMatrix(X_train_transformed, label=y_train_transformed)
d_train_lightgbm = lgb.Dataset(X_train_transformed, label=y_train_transformed, params={'feature_pre_filter': False})
d_train_catboost = cb.Pool(X_train_transformed, label=y_train_transformed)

In [None]:
def objective_xgboost(trial, d_train):
    
    param = {
        'objective': 'reg:absoluteerror',
        'random_state': 42,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9)
    }

    cv_results = xgb.cv(
        params=param,
        dtrain=d_train,
        num_boost_round=1000,
        nfold=5,
        custom_metric=nae_eval_metric,
        maximize=False,
        as_pandas=True,
        early_stopping_rounds=30
    )
    best_iteration = cv_results['test-custom_nae-mean'].argmin()
    best_nae = cv_results['test-custom_nae-mean'].min()
    n_estimators_optimal = best_iteration + 1
    
    trial.set_user_attr('n_estimators_optimal', n_estimators_optimal)    
    return best_nae

def objective_lightgbm_function(trial):
    
    param = {
        'objective': 'regression_l1',
        'metric': 'l1',
        'random_state': 42,
        'force_col_wise': True,
        'feature_pre_filter': False, 
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 15, 40),
        'max_depth': trial.suggest_int('max_depth', 3, 11),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 50),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'verbose': -1
    }
    
    cv_results = lgb.cv(
        params=param,
        train_set=d_train_lightgbm,
        num_boost_round=2000,
        nfold=5,
        stratified=False,
        callbacks=[lgb.early_stopping(30, verbose=False)],
        seed=42
    )
    
    best_iteration = cv_results['valid l1-mean'].index(min(cv_results['valid l1-mean']))
    best_mae = min(cv_results['valid l1-mean'])

    n_estimators_optimal = best_iteration + 1
    trial.set_user_attr('n_estimators_optimal', n_estimators_optimal) 
    
    return best_mae

def objective_catboost_function(trial):
    bootstrap_type = trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS'])
    param = {
        'loss_function': 'MAE',
        'eval_metric': 'MAE',
        'random_seed': 42,
        'bootstrap_type': bootstrap_type,
        'verbose': -1,
        'task_type': 'GPU',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
        'depth': trial.suggest_int('max_depth', 3, 9),
        'l2_leaf_reg': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'min_data_in_leaf': trial.suggest_int('min_child_weight', 1, 5),
    }

    if bootstrap_type == 'Bernoulli':
        param['subsample'] = trial.suggest_float('subsample', 0.5, 0.95)

    pool_cv = cb.Pool(
        data=X_train_transformed, 
        label=y_train_transformed
    )
    
    cv_results = cb.cv(
        pool=pool_cv,
        params=param,
        fold_count=5,
        iterations=1000,
        early_stopping_rounds=30,
        shuffle=True,
        verbose=False,
        seed=42
    )

    metric_key = 'test-MAE-mean'
    
    best_iteration = cv_results[metric_key].argmin()
    best_loss = cv_results[metric_key].min()
    n_estimators_optimal = best_iteration + 1
    
    trial.set_user_attr('n_estimators_optimal', n_estimators_optimal) 
    
    return best_loss

def objective_xgboost(trial, d_train):
    
    param = {
        'objective': 'reg:absoluteerror',
        'random_state': 42,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'subsample': trial.suggest_float('subsample', 0.5, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9)
    }

    cv_results = xgb.cv(
        params=param,
        dtrain=d_train,
        num_boost_round=1000,
        nfold=5,
        custom_metric=nae_eval_metric,
        maximize=False,
        as_pandas=True,
        early_stopping_rounds=30
    )
    best_iteration = cv_results['test-custom_nae-mean'].argmin()
    best_nae = cv_results['test-custom_nae-mean'].min()
    n_estimators_optimal = best_iteration + 1
    
    trial.set_user_attr('n_estimators_optimal', n_estimators_optimal)    
    return best_nae

In [13]:
study_xgboost = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=30))
study_xgboost.optimize(objective_xgboost_function, n_trials=60, show_progress_bar=True)
best_xgboost_n_estimators = study_xgboost.best_trial.user_attrs.get('n_estimators_optimal') 
final_xgboost_params = study_xgboost.best_params.copy()
final_xgboost_params['n_estimators'] = best_xgboost_n_estimators
final_xgboost_params['objective'] = 'reg:absoluteerror'
best_xgb_model = xgb.XGBRegressor(**final_xgboost_params)

  0%|          | 0/60 [00:00<?, ?it/s]

In [14]:
study_lightgbm = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=30),study_name='LightGBM_Hyperparameter_Tuning')
study_lightgbm.optimize(objective_lightgbm_function, n_trials=60, show_progress_bar=True)
best_lgbm_n_estimators = study_lightgbm.best_trial.user_attrs.get('n_estimators_optimal')
final_lightgbm_params = study_lightgbm.best_params.copy()
final_lightgbm_params['objective'] = 'regression_l1'
final_lightgbm_params['n_estimators'] = best_lgbm_n_estimators
final_lightgbm_params['metric'] = 'custom'
final_lightgbm_params['random_state'] = 42
final_lightgbm_params['boosting_type'] = 'gbdt'
best_lgbm_model = lgb.LGBMRegressor(**final_lightgbm_params)

  0%|          | 0/60 [00:00<?, ?it/s]

In [15]:
study_catboost = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=30),study_name='CatBoost_Hyperparameter_Tuning')
study_catboost.optimize(objective_catboost_function, n_trials=60, show_progress_bar=True)
best_catboost_iterations = study_catboost.best_trial.user_attrs.get('n_estimators_optimal')
final_catboost_params = study_catboost.best_params.copy()
final_catboost_params['depth'] = final_catboost_params.pop('max_depth')
final_catboost_params['l2_leaf_reg'] = final_catboost_params.pop('lambda')
final_catboost_params['min_data_in_leaf'] = final_catboost_params.pop('min_child_weight')
final_catboost_params.pop('alpha', None)
final_catboost_params['loss_function'] = 'MAE'
final_catboost_params['eval_metric'] = 'MAE'
final_catboost_params['iterations'] = best_catboost_iterations
final_catboost_params['task_type'] = 'GPU'
final_catboost_params['random_seed'] = 42
final_catboost_params['verbose'] = 0
best_catboost_model = cb.CatBoostRegressor(**final_catboost_params)

  0%|          | 0/60 [00:00<?, ?it/s]

Default metric period is 5 because MAE is/are not implemented for GPU


Training on fold [0/5]
bestTest = 0.02635992085
bestIteration = 6
Training on fold [1/5]
bestTest = 0.02416300627
bestIteration = 9
Training on fold [2/5]
bestTest = 0.02928469371
bestIteration = 6
Training on fold [3/5]
bestTest = 0.02584384848
bestIteration = 6
Training on fold [4/5]
bestTest = 0.02882600125
bestIteration = 7
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.02305503418
bestIteration = 8
Training on fold [1/5]
bestTest = 0.02442289569
bestIteration = 11
Training on fold [2/5]
bestTest = 0.0216838392
bestIteration = 8
Training on fold [3/5]
bestTest = 0.02479043036
bestIteration = 8
Training on fold [4/5]
bestTest = 0.02331791984
bestIteration = 8
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01539386562
bestIteration = 271
Training on fold [1/5]
bestTest = 0.0153224439
bestIteration = 338
Training on fold [2/5]
bestTest = 0.01281589233
bestIteration = 254
Training on fold [3/5]
bestTest = 0.01531942637
bestIteration = 362
Training on fold [4/5]
bestTest = 0.01467921263
bestIteration = 439
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01496741816
bestIteration = 73
Training on fold [1/5]
bestTest = 0.01574320588
bestIteration = 92
Training on fold [2/5]
bestTest = 0.01318981896
bestIteration = 104
Training on fold [3/5]
bestTest = 0.01658039298
bestIteration = 103
Training on fold [4/5]
bestTest = 0.01541655888
bestIteration = 115
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01654513043
bestIteration = 29
Training on fold [1/5]
bestTest = 0.01863095219
bestIteration = 30
Training on fold [2/5]
bestTest = 0.01556934462
bestIteration = 32
Training on fold [3/5]
bestTest = 0.01732237354
bestIteration = 35
Training on fold [4/5]
bestTest = 0.01902956138
bestIteration = 46
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.02757333978
bestIteration = 5
Training on fold [1/5]
bestTest = 0.02618144919
bestIteration = 5
Training on fold [2/5]
bestTest = 0.02755388599
bestIteration = 5
Training on fold [3/5]
bestTest = 0.03396521024
bestIteration = 6
Training on fold [4/5]
bestTest = 0.02829943174
bestIteration = 5
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.02317228054
bestIteration = 8
Training on fold [1/5]
bestTest = 0.02266685656
bestIteration = 9
Training on fold [2/5]
bestTest = 0.02278012293
bestIteration = 9
Training on fold [3/5]
bestTest = 0.02418216167
bestIteration = 10
Training on fold [4/5]
bestTest = 0.02348162804
bestIteration = 9
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01845949559
bestIteration = 31
Training on fold [1/5]
bestTest = 0.01909450519
bestIteration = 22
Training on fold [2/5]
bestTest = 0.01771913716
bestIteration = 26
Training on fold [3/5]
bestTest = 0.02042119342
bestIteration = 22
Training on fold [4/5]
bestTest = 0.01931242295
bestIteration = 26
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01593785637
bestIteration = 82
Training on fold [1/5]
bestTest = 0.01673447135
bestIteration = 59
Training on fold [2/5]
bestTest = 0.01430296313
bestIteration = 61
Training on fold [3/5]
bestTest = 0.01688184329
bestIteration = 90
Training on fold [4/5]
bestTest = 0.01660973643
bestIteration = 67
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01496516854
bestIteration = 150
Training on fold [1/5]
bestTest = 0.0158497746
bestIteration = 167
Training on fold [2/5]
bestTest = 0.01387138601
bestIteration = 169
Training on fold [3/5]
bestTest = 0.01590326959
bestIteration = 180
Training on fold [4/5]
bestTest = 0.01564550253
bestIteration = 183
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01407973167
bestIteration = 449
Training on fold [1/5]
bestTest = 0.01602892495
bestIteration = 278
Training on fold [2/5]
bestTest = 0.01301993622
bestIteration = 318
Training on fold [3/5]
bestTest = 0.01506696011
bestIteration = 359
Training on fold [4/5]
bestTest = 0.01509108072
bestIteration = 378
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01477012166
bestIteration = 429
Training on fold [1/5]
bestTest = 0.01599058783
bestIteration = 240
Training on fold [2/5]
bestTest = 0.01349449743
bestIteration = 270
Training on fold [3/5]
bestTest = 0.01404724823
bestIteration = 473
Training on fold [4/5]
bestTest = 0.0148355534
bestIteration = 292
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01563423543
bestIteration = 273
Training on fold [1/5]
bestTest = 0.01669126786
bestIteration = 260
Training on fold [2/5]
bestTest = 0.01322537551
bestIteration = 309
Training on fold [3/5]
bestTest = 0.01706583222
bestIteration = 193
Training on fold [4/5]
bestTest = 0.01536101176
bestIteration = 333
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01497430743
bestIteration = 74
Training on fold [1/5]
bestTest = 0.01642439702
bestIteration = 66
Training on fold [2/5]
bestTest = 0.01431076366
bestIteration = 63
Training on fold [3/5]
bestTest = 0.01661155999
bestIteration = 65
Training on fold [4/5]
bestTest = 0.01520309183
bestIteration = 49
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01877927634
bestIteration = 25
Training on fold [1/5]
bestTest = 0.01963871242
bestIteration = 42
Training on fold [2/5]
bestTest = 0.01563734803
bestIteration = 19
Training on fold [3/5]
bestTest = 0.01944599415
bestIteration = 33
Training on fold [4/5]
bestTest = 0.01875817187
bestIteration = 17
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01497079697
bestIteration = 177
Training on fold [1/5]
bestTest = 0.01491919032
bestIteration = 177
Training on fold [2/5]
bestTest = 0.01307915032
bestIteration = 128
Training on fold [3/5]
bestTest = 0.01526354427
bestIteration = 198
Training on fold [4/5]
bestTest = 0.01549664839
bestIteration = 204
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01558671115
bestIteration = 70
Training on fold [1/5]
bestTest = 0.01544263612
bestIteration = 88
Training on fold [2/5]
bestTest = 0.01343646956
bestIteration = 98
Training on fold [3/5]
bestTest = 0.01426286346
bestIteration = 266
Training on fold [4/5]
bestTest = 0.01529891991
bestIteration = 75
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.03654484369
bestIteration = 5
Training on fold [1/5]
bestTest = 0.03364687024
bestIteration = 3
Training on fold [2/5]
bestTest = 0.03408703482
bestIteration = 7
Training on fold [3/5]
bestTest = 0.03965280685
bestIteration = 3
Training on fold [4/5]
bestTest = 0.04232975877
bestIteration = 3
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01496257372
bestIteration = 144
Training on fold [1/5]
bestTest = 0.01491460625
bestIteration = 212
Training on fold [2/5]
bestTest = 0.01328843942
bestIteration = 172
Training on fold [3/5]
bestTest = 0.0150891345
bestIteration = 220
Training on fold [4/5]
bestTest = 0.01462124895
bestIteration = 262
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01548973914
bestIteration = 132
Training on fold [1/5]
bestTest = 0.01508426081
bestIteration = 241
Training on fold [2/5]
bestTest = 0.01288177631
bestIteration = 224
Training on fold [3/5]
bestTest = 0.01480166897
bestIteration = 222
Training on fold [4/5]
bestTest = 0.01487735466
bestIteration = 158
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01578624994
bestIteration = 45
Training on fold [1/5]
bestTest = 0.01639835235
bestIteration = 129
Training on fold [2/5]
bestTest = 0.01455992277
bestIteration = 43
Training on fold [3/5]
bestTest = 0.01499723949
bestIteration = 78
Training on fold [4/5]
bestTest = 0.01668067332
bestIteration = 49
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01451444333
bestIteration = 351
Training on fold [1/5]
bestTest = 0.01501293124
bestIteration = 255
Training on fold [2/5]
bestTest = 0.01309589374
bestIteration = 192
Training on fold [3/5]
bestTest = 0.01466370507
bestIteration = 260
Training on fold [4/5]
bestTest = 0.01484073533
bestIteration = 307
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01478760959
bestIteration = 115
Training on fold [1/5]
bestTest = 0.01589742029
bestIteration = 162
Training on fold [2/5]
bestTest = 0.01355450431
bestIteration = 143
Training on fold [3/5]
bestTest = 0.01583759331
bestIteration = 168
Training on fold [4/5]
bestTest = 0.01489965857
bestIteration = 264
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01567906982
bestIteration = 85
Training on fold [1/5]
bestTest = 0.01576001074
bestIteration = 94
Training on fold [2/5]
bestTest = 0.01340063803
bestIteration = 163
Training on fold [3/5]
bestTest = 0.01467519596
bestIteration = 251
Training on fold [4/5]
bestTest = 0.01526659506
bestIteration = 144
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01404838035
bestIteration = 180
Training on fold [1/5]
bestTest = 0.01522175959
bestIteration = 142
Training on fold [2/5]
bestTest = 0.01353465999
bestIteration = 146
Training on fold [3/5]
bestTest = 0.01535565721
bestIteration = 203
Training on fold [4/5]
bestTest = 0.01493798951
bestIteration = 177
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01575062465
bestIteration = 86
Training on fold [1/5]
bestTest = 0.01674947124
bestIteration = 81
Training on fold [2/5]
bestTest = 0.01513352131
bestIteration = 59
Training on fold [3/5]
bestTest = 0.01771635951
bestIteration = 70
Training on fold [4/5]
bestTest = 0.01693275534
bestIteration = 70
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01461896136
bestIteration = 139
Training on fold [1/5]
bestTest = 0.01447158796
bestIteration = 222
Training on fold [2/5]
bestTest = 0.01294205671
bestIteration = 164
Training on fold [3/5]
bestTest = 0.01485305769
bestIteration = 217
Training on fold [4/5]
bestTest = 0.0151647138
bestIteration = 293
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01498785341
bestIteration = 94
Training on fold [1/5]
bestTest = 0.0155925546
bestIteration = 135
Training on fold [2/5]
bestTest = 0.01320654629
bestIteration = 100
Training on fold [3/5]
bestTest = 0.01543150943
bestIteration = 163
Training on fold [4/5]
bestTest = 0.01462625133
bestIteration = 170
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.0176377823
bestIteration = 21
Training on fold [1/5]
bestTest = 0.01933111004
bestIteration = 19
Training on fold [2/5]
bestTest = 0.0179969694
bestIteration = 20
Training on fold [3/5]
bestTest = 0.01885277508
bestIteration = 18
Training on fold [4/5]
bestTest = 0.01952626087
bestIteration = 16
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01585574823
bestIteration = 71
Training on fold [1/5]
bestTest = 0.01558133137
bestIteration = 65
Training on fold [2/5]
bestTest = 0.01336031896
bestIteration = 50
Training on fold [3/5]
bestTest = 0.01427823927
bestIteration = 168
Training on fold [4/5]
bestTest = 0.01600357633
bestIteration = 110
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01454835143
bestIteration = 321
Training on fold [1/5]
bestTest = 0.01550829484
bestIteration = 246
Training on fold [2/5]
bestTest = 0.01331903452
bestIteration = 134
Training on fold [3/5]
bestTest = 0.01477673595
bestIteration = 267
Training on fold [4/5]
bestTest = 0.01484328435
bestIteration = 292
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01479120489
bestIteration = 155
Training on fold [1/5]
bestTest = 0.01450285005
bestIteration = 187
Training on fold [2/5]
bestTest = 0.01335966806
bestIteration = 163
Training on fold [3/5]
bestTest = 0.01506064567
bestIteration = 235
Training on fold [4/5]
bestTest = 0.01481881848
bestIteration = 177
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01380299937
bestIteration = 271
Training on fold [1/5]
bestTest = 0.01517283697
bestIteration = 222
Training on fold [2/5]
bestTest = 0.01374768333
bestIteration = 200
Training on fold [3/5]
bestTest = 0.01521034767
bestIteration = 215
Training on fold [4/5]
bestTest = 0.01450598093
bestIteration = 251
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01423727515
bestIteration = 154
Training on fold [1/5]
bestTest = 0.01571822605
bestIteration = 169
Training on fold [2/5]
bestTest = 0.01299126309
bestIteration = 182
Training on fold [3/5]
bestTest = 0.01518245124
bestIteration = 236
Training on fold [4/5]
bestTest = 0.01571839827
bestIteration = 150
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01515935243
bestIteration = 102
Training on fold [1/5]
bestTest = 0.01507986104
bestIteration = 133
Training on fold [2/5]
bestTest = 0.01364657045
bestIteration = 129
Training on fold [3/5]
bestTest = 0.01530317587
bestIteration = 186
Training on fold [4/5]
bestTest = 0.01628290871
bestIteration = 117
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01522983949
bestIteration = 147
Training on fold [1/5]
bestTest = 0.01568498641
bestIteration = 85
Training on fold [2/5]
bestTest = 0.01432242423
bestIteration = 71
Training on fold [3/5]
bestTest = 0.01499200891
bestIteration = 91
Training on fold [4/5]
bestTest = 0.01563652945
bestIteration = 78
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01448593695
bestIteration = 189
Training on fold [1/5]
bestTest = 0.01464101727
bestIteration = 211
Training on fold [2/5]
bestTest = 0.01327759503
bestIteration = 172
Training on fold [3/5]
bestTest = 0.01504682324
bestIteration = 202
Training on fold [4/5]
bestTest = 0.0145306911
bestIteration = 265
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01453582202
bestIteration = 115
Training on fold [1/5]
bestTest = 0.01565923574
bestIteration = 150
Training on fold [2/5]
bestTest = 0.0128874691
bestIteration = 152
Training on fold [3/5]
bestTest = 0.01630459826
bestIteration = 172
Training on fold [4/5]
bestTest = 0.01481932475
bestIteration = 156
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01530143673
bestIteration = 34
Training on fold [1/5]
bestTest = 0.0172917682
bestIteration = 32
Training on fold [2/5]
bestTest = 0.01571158836
bestIteration = 40
Training on fold [3/5]
bestTest = 0.01612858685
bestIteration = 36
Training on fold [4/5]
bestTest = 0.01628875144
bestIteration = 41
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01479776359
bestIteration = 136
Training on fold [1/5]
bestTest = 0.01480357632
bestIteration = 140
Training on fold [2/5]
bestTest = 0.01335357011
bestIteration = 99
Training on fold [3/5]
bestTest = 0.01519441605
bestIteration = 129
Training on fold [4/5]
bestTest = 0.01505319572
bestIteration = 119
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.02157906374
bestIteration = 11
Training on fold [1/5]
bestTest = 0.02021876288
bestIteration = 12
Training on fold [2/5]
bestTest = 0.01948139565
bestIteration = 14
Training on fold [3/5]
bestTest = 0.01889723503
bestIteration = 14
Training on fold [4/5]
bestTest = 0.02180794728
bestIteration = 12
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01440022182
bestIteration = 142
Training on fold [1/5]
bestTest = 0.01363573484
bestIteration = 228
Training on fold [2/5]
bestTest = 0.0130721923
bestIteration = 165
Training on fold [3/5]
bestTest = 0.01484803334
bestIteration = 276
Training on fold [4/5]
bestTest = 0.01435163433
bestIteration = 244
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01419144174
bestIteration = 176
Training on fold [1/5]
bestTest = 0.01474821933
bestIteration = 216
Training on fold [2/5]
bestTest = 0.01298456543
bestIteration = 155
Training on fold [3/5]
bestTest = 0.01490681011
bestIteration = 236
Training on fold [4/5]
bestTest = 0.01452765347
bestIteration = 220


Default metric period is 5 because MAE is/are not implemented for GPU


Training on fold [0/5]
bestTest = 0.01484985586
bestIteration = 161
Training on fold [1/5]
bestTest = 0.01487134278
bestIteration = 198
Training on fold [2/5]
bestTest = 0.0134096248
bestIteration = 146
Training on fold [3/5]
bestTest = 0.01553334634
bestIteration = 182
Training on fold [4/5]
bestTest = 0.01462417767
bestIteration = 178
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01506478362
bestIteration = 124
Training on fold [1/5]
bestTest = 0.01541225018
bestIteration = 111
Training on fold [2/5]
bestTest = 0.01415110512
bestIteration = 98
Training on fold [3/5]
bestTest = 0.01569035302
bestIteration = 129
Training on fold [4/5]
bestTest = 0.01474778299
bestIteration = 96
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01435438841
bestIteration = 57
Training on fold [1/5]
bestTest = 0.01630021312
bestIteration = 55
Training on fold [2/5]
bestTest = 0.01397158909
bestIteration = 60
Training on fold [3/5]
bestTest = 0.01663092894
bestIteration = 57
Training on fold [4/5]
bestTest = 0.01622706872
bestIteration = 56
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01503829722
bestIteration = 134
Training on fold [1/5]
bestTest = 0.01475687992
bestIteration = 153
Training on fold [2/5]
bestTest = 0.01342130146
bestIteration = 163
Training on fold [3/5]
bestTest = 0.0157231758
bestIteration = 195
Training on fold [4/5]
bestTest = 0.01554159618
bestIteration = 167
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01447006822
bestIteration = 130
Training on fold [1/5]
bestTest = 0.01550637579
bestIteration = 178
Training on fold [2/5]
bestTest = 0.01364473624
bestIteration = 119
Training on fold [3/5]
bestTest = 0.01488080932
bestIteration = 211
Training on fold [4/5]
bestTest = 0.01544325146
bestIteration = 185
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01465778702
bestIteration = 219
Training on fold [1/5]
bestTest = 0.01519051944
bestIteration = 285
Training on fold [2/5]
bestTest = 0.01309726575
bestIteration = 273
Training on fold [3/5]
bestTest = 0.01594142095
bestIteration = 215
Training on fold [4/5]
bestTest = 0.01423092536
bestIteration = 338
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.0161357906
bestIteration = 51
Training on fold [1/5]
bestTest = 0.01677310394
bestIteration = 40
Training on fold [2/5]
bestTest = 0.01614472032
bestIteration = 40
Training on fold [3/5]
bestTest = 0.01712583472
bestIteration = 82
Training on fold [4/5]
bestTest = 0.01773170189
bestIteration = 46
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01662804013
bestIteration = 59
Training on fold [1/5]
bestTest = 0.0175597419
bestIteration = 70
Training on fold [2/5]
bestTest = 0.01481795311
bestIteration = 31
Training on fold [3/5]
bestTest = 0.0179448508
bestIteration = 37
Training on fold [4/5]
bestTest = 0.01832910379
bestIteration = 36
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01447080542
bestIteration = 264
Training on fold [1/5]
bestTest = 0.01434401062
bestIteration = 371
Training on fold [2/5]
bestTest = 0.0120285925
bestIteration = 278
Training on fold [3/5]
bestTest = 0.01450838487
bestIteration = 251
Training on fold [4/5]
bestTest = 0.01534137314
bestIteration = 207
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01461402916
bestIteration = 152
Training on fold [1/5]
bestTest = 0.01450201631
bestIteration = 234
Training on fold [2/5]
bestTest = 0.01298373609
bestIteration = 122
Training on fold [3/5]
bestTest = 0.01548873281
bestIteration = 159
Training on fold [4/5]
bestTest = 0.01478893816
bestIteration = 193
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01486254909
bestIteration = 271
Training on fold [1/5]
bestTest = 0.01464910156
bestIteration = 356
Training on fold [2/5]
bestTest = 0.012984504
bestIteration = 212
Training on fold [3/5]
bestTest = 0.01439915259
bestIteration = 220
Training on fold [4/5]
bestTest = 0.01471181122
bestIteration = 210
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01450966619
bestIteration = 213
Training on fold [1/5]
bestTest = 0.01424554959
bestIteration = 371
Training on fold [2/5]
bestTest = 0.01292158782
bestIteration = 240
Training on fold [3/5]
bestTest = 0.01361456385
bestIteration = 528
Training on fold [4/5]
bestTest = 0.01431884442
bestIteration = 334
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01432998049
bestIteration = 171
Training on fold [1/5]
bestTest = 0.01456862871
bestIteration = 322
Training on fold [2/5]
bestTest = 0.01269141589
bestIteration = 183
Training on fold [3/5]
bestTest = 0.01490896319
bestIteration = 284
Training on fold [4/5]
bestTest = 0.01515048227
bestIteration = 225
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01326196472
bestIteration = 490
Training on fold [1/5]
bestTest = 0.01462052936
bestIteration = 333
Training on fold [2/5]
bestTest = 0.01241782399
bestIteration = 294
Training on fold [3/5]
bestTest = 0.01525303923
bestIteration = 181
Training on fold [4/5]
bestTest = 0.01414157432
bestIteration = 369
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01525025134
bestIteration = 231
Training on fold [1/5]
bestTest = 0.01500032577
bestIteration = 259
Training on fold [2/5]
bestTest = 0.01298046697
bestIteration = 266
Training on fold [3/5]
bestTest = 0.0135301315
bestIteration = 608
Training on fold [4/5]
bestTest = 0.01472735846
bestIteration = 255
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01553336535
bestIteration = 149
Training on fold [1/5]
bestTest = 0.01445329701
bestIteration = 276
Training on fold [2/5]
bestTest = 0.01320520939
bestIteration = 184
Training on fold [3/5]
bestTest = 0.01524306221
bestIteration = 187
Training on fold [4/5]
bestTest = 0.01583395328
bestIteration = 147
Training on fold [0/5]


Default metric period is 5 because MAE is/are not implemented for GPU


bestTest = 0.01420759862
bestIteration = 285
Training on fold [1/5]
bestTest = 0.01475808079
bestIteration = 212
Training on fold [2/5]
bestTest = 0.01325613004
bestIteration = 216
Training on fold [3/5]
bestTest = 0.01597900917
bestIteration = 241
Training on fold [4/5]
bestTest = 0.01481593391
bestIteration = 207


In [16]:
best_xgb_model.fit(X_train_transformed, y_train_transformed)
best_lgbm_model.fit(X_train_transformed, y_train_transformed)
best_catboost_model.fit(X_train_transformed, y_train_transformed)

Default metric period is 5 because MAE is/are not implemented for GPU


<catboost.core.CatBoostRegressor at 0x206c0cbc140>

In [87]:
import joblib
save_path_xgb = "./model/check_model_xgb_no_geo_no_mediasource_d60.joblib"
save_path_lgbm = "./model/check_model_lgbm_no_geo_no_mediasource_d60.joblib"
save_path_catboost = "./model/check_model_catboost_no_geo_no_mediasource_d60.joblib"
#joblib.dump(best_xgb_model, save_path_xgb)
#joblib.dump(best_lgbm_model, save_path_lgbm)
#joblib.dump(best_catboost_model, save_path_catboost)

In [88]:
best_xgb_model = joblib.load(save_path_xgb)
best_lgbm_model = joblib.load(save_path_lgbm)
best_catboost_model = joblib.load(save_path_catboost)

In [89]:
def calculate_nae(y_true_original, y_pred_original, eps=1e-9):
    return np.mean(
        np.abs(y_true_original - y_pred_original) / (np.abs(y_true_original) + eps)
    )

In [90]:
y_test_true = preprocessor_y.inverse_transform(
    y_test_transformed.values.reshape(-1, 1)
).flatten()

y_pred_transformed_xgb = best_xgb_model.predict(X_test_transformed) 
y_pred_original_xgb = preprocessor_y.inverse_transform(
    y_pred_transformed_xgb.reshape(-1, 1)
).flatten()
nae_xgb = calculate_nae(y_test_true, y_pred_original_xgb, eps)

print(f"1. NAE XGBoost (giá trị gốc): {nae_xgb * 100:.4f}%")

y_pred_transformed_lgbm = best_lgbm_model.predict(X_test_transformed)
y_pred_original_lgbm = preprocessor_y.inverse_transform(
    y_pred_transformed_lgbm.reshape(-1, 1)
).flatten()
nae_lgbm = calculate_nae(y_test_true, y_pred_original_lgbm, eps)

print(f"2. NAE LightGBM (giá trị gốc): {nae_lgbm * 100:.4f}%")

y_pred_transformed_cat = best_catboost_model.predict(X_test_transformed)
y_pred_original_cat = preprocessor_y.inverse_transform(
    y_pred_transformed_cat.reshape(-1, 1)
).flatten()
nae_cat = calculate_nae(y_test_true, y_pred_original_cat, eps)

print(f"3. NAE CatBoost (giá trị gốc): {nae_cat * 100:.4f}%")

1. NAE XGBoost (giá trị gốc): 15.1473%
2. NAE LightGBM (giá trị gốc): 15.6778%
3. NAE CatBoost (giá trị gốc): 15.4186%


In [53]:
def get_oof_pre(xgb_params, lgbm_params, catboost_params, X, y, cv):
    xgb_pre = np.zeros(len(y))
    lgbm_pre = np.zeros(len(y))
    catboost_pre = np.zeros(len(y))
    for train_idx, val_idx in cv.split(X, y):
        xgb_fold = xgb.XGBRegressor(**xgb_params)
        xgb_fold.fit(X.iloc[train_idx], y.iloc[train_idx],eval_set=[(X.iloc[val_idx], y.iloc[val_idx])],verbose=False)
        xgb_pre[val_idx] = xgb_fold.predict(X.iloc[val_idx])

        lgbm_fold = lgb.LGBMRegressor(**lgbm_params)
        lgbm_fold.fit(X.iloc[train_idx], y.iloc[train_idx],eval_set=[(X.iloc[val_idx], y.iloc[val_idx])])
        lgbm_pre[val_idx] = lgbm_fold.predict(X.iloc[val_idx])

        catboost_fold = cb.CatBoostRegressor(**catboost_params)
        catboost_fold.fit(X.iloc[train_idx], y.iloc[train_idx],eval_set=[(X.iloc[val_idx], y.iloc[val_idx])],verbose=False)
        catboost_pre[val_idx] = catboost_fold.predict(X.iloc[val_idx])

    return xgb_pre, lgbm_pre, catboost_pre

In [57]:
final_xgboost_params = best_xgb_model.get_params()
final_lightgbm_params = best_lgbm_model.get_params()
final_catboost_params = best_catboost_model.get_params()

In [61]:
oof_pred_xgb, oof_pred_lgbm, oof_pred_cat = get_oof_pre(final_xgboost_params, final_lightgbm_params, final_catboost_params, X_train_transformed, y_train_transformed, cv)

X_meta_train = pd.DataFrame({
    'meta_xgb': oof_pred_xgb,
    'meta_lgbm': oof_pred_lgbm,
    'meta_cat': oof_pred_cat
})
y_meta_train = y_train_transformed.copy()
d_train_meta = xgb.DMatrix(X_meta_train, label=y_meta_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000529 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12631
[LightGBM] [Info] Number of data points in the train set: 651, number of used features: 58
[LightGBM] [Info] Start training from score 0.198695


Default metric period is 5 because MAE is/are not implemented for GPU


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000495 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12628
[LightGBM] [Info] Number of data points in the train set: 651, number of used features: 58
[LightGBM] [Info] Start training from score 0.200687


Default metric period is 5 because MAE is/are not implemented for GPU


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000545 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12630
[LightGBM] [Info] Number of data points in the train set: 651, number of used features: 58
[LightGBM] [Info] Start training from score 0.201101


Default metric period is 5 because MAE is/are not implemented for GPU


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000491 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12630
[LightGBM] [Info] Number of data points in the train set: 651, number of used features: 58
[LightGBM] [Info] Start training from score 0.201235


Default metric period is 5 because MAE is/are not implemented for GPU


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000649 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12634
[LightGBM] [Info] Number of data points in the train set: 652, number of used features: 58
[LightGBM] [Info] Start training from score 0.202615


Default metric period is 5 because MAE is/are not implemented for GPU


In [62]:
study_meta = optuna.create_study(direction='minimize')
study_meta.optimize(lambda trial: objective_xgboost(trial, d_train_meta), n_trials=30, show_progress_bar=True)

  0%|          | 0/30 [00:00<?, ?it/s]

In [63]:
best_meta_n_estimators = study_meta.best_trial.user_attrs.get('n_estimators_optimal') 
final_meta_params = study_meta.best_params.copy()
final_meta_params['n_estimators'] = best_meta_n_estimators
final_meta_params['objective'] = 'reg:absoluteerror'
best_meta_model = xgb.XGBRegressor(**final_meta_params)

In [64]:
best_meta_model.fit(X_meta_train, y_meta_train)

In [65]:
pred_xgb_test = best_xgb_model.predict(X_test_transformed)
pred_lgbm_test = best_lgbm_model.predict(X_test_transformed)
pred_catboost_test = best_catboost_model.predict(X_test_transformed)
X_meta_test = np.column_stack([
    pred_xgb_test.flatten(),
    pred_lgbm_test.flatten(),
    pred_catboost_test.flatten()
])
pred_meta = best_meta_model.predict(X_meta_test)

y_test_true = preprocessor_y.inverse_transform(
    y_test_transformed.values.reshape(-1, 1)
).flatten()

final_prediction = preprocessor_y.inverse_transform(
    pred_meta.reshape(-1, 1)
).flatten()

nae = np.mean(
    np.abs(y_test_true - final_prediction) / (np.abs(y_test_true) + eps)
)

print(f"NAE tập test (Giá trị gốc, Ensemble): {nae * 100:.4f}%")

NAE tập test (Giá trị gốc, Ensemble): 15.3702%


In [None]:
def get_oof_res(meta_params, X, y, cv):
    residuals = np.zeros(len(y))
    meta_pre = np.zeros(len(y))
    for train_idx, val_idx in cv.split(X, y):
        meta_fold = xgb.XGBRegressor(**meta_params)
        meta_fold.fit(X.iloc[train_idx], y.iloc[train_idx],eval_set=[(X.iloc[val_idx], y.iloc[val_idx])],verbose=False)
        pred = meta_fold.predict(X.iloc[val_idx])
        meta_pre[val_idx] = meta_fold.predict(X.iloc[val_idx])
        residuals[val_idx] = y.iloc[val_idx].values.flatten() - pred.flatten()

    return residuals, meta_pre

In [None]:
residuals, meta_oof = get_oof_res(final_meta_params, X_meta_train, y_meta_train, cv)
X_res_train = X_meta_train.copy()
meta_cols = ['meta_xgb', 'meta_lgbm', 'meta_cat']
X_res_train['meta_mean'] = X_res_train[meta_cols].mean(axis=1)
X_res_train['meta_std'] = X_res_train[meta_cols].std(axis=1)
X_res_train['range'] = X_res_train[meta_cols].max(axis=1) - X_res_train[meta_cols].min(axis=1)
X_res_train['meta_oof'] = meta_oof
y_res_train = pd.DataFrame({'residual': residuals})
d_train_res = xgb.DMatrix(X_res_train, label=y_res_train)

In [68]:
study_res = optuna.create_study(direction='minimize')
study_res.optimize(lambda trial: objective_xgboost(trial, d_train_res), n_trials=30, show_progress_bar=True)

  0%|          | 0/30 [00:00<?, ?it/s]

In [69]:
best_res_n_estimators = study_res.best_trial.user_attrs.get('n_estimators_optimal') 
final_res_params = study_res.best_params.copy()
final_res_params['n_estimators'] = best_res_n_estimators
final_res_params['objective'] = 'reg:absoluteerror'
best_res_model = xgb.XGBRegressor(**final_res_params)
best_res_model.fit(X_res_train, y_res_train)

In [None]:
def find_best_weight_residual(y_true_orig, yA_t, yBres_t, preY):
    def loss(w_arr):
        w = float(np.asarray(w_arr).item())
        y_pred_t = yA_t + w * yBres_t
        y_pred = preY.inverse_transform(y_pred_t.reshape(-1,1)).ravel()
        return nae(y_true_orig, y_pred)

    res = minimize(loss, x0=[0.5], bounds=[(0, 1)], method="L-BFGS-B")
    return float(res.x[0])

In [None]:
pred_xgb_vali = best_xgb_model.predict(X_vali_transformed).flatten()
pred_lgbm_vali = best_lgbm_model.predict(X_vali_transformed).flatten()
pred_catboost_vali = best_catboost_model.predict(X_vali_transformed).flatten()
y_vali_orig = preprocessor_y.inverse_transform(y_vali_transformed.values.reshape(-1,1)).ravel()

X_meta_vali = pd.DataFrame({
    'meta_xgb': pred_xgb_vali,
    'meta_lgbm': pred_lgbm_vali,
    'meta_cat': pred_catboost_vali
})

meta_cols = ['meta_xgb', 'meta_lgbm', 'meta_cat']
X_meta_test_base = np.column_stack([
    pred_xgb_vali,
    pred_lgbm_vali,
    pred_catboost_vali
])
pred_meta_vali = best_meta_model.predict(X_meta_test_base)
meta_mean_vali = X_meta_vali[meta_cols].mean(axis=1)
meta_std_vali = X_meta_vali[meta_cols].std(axis=1)
meta_range_vali = X_meta_vali[meta_cols].max(axis=1) - X_meta_vali[meta_cols].min(axis=1)

X_res_vali_final = np.column_stack([
    pred_xgb_vali,
    pred_lgbm_vali,
    pred_catboost_vali,
    meta_mean_vali,
    meta_std_vali,
    meta_range_vali,
    pred_meta_vali
])

pred_res_vali = best_res_model.predict(X_res_vali_final)
w = find_best_weight_residual(y_vali_orig, pred_meta_vali, pred_res_vali, preprocessor_y)

In [70]:
pred_xgb_test = best_xgb_model.predict(X_test_transformed).flatten()
pred_lgbm_test = best_lgbm_model.predict(X_test_transformed).flatten()
pred_catboost_test = best_catboost_model.predict(X_test_transformed).flatten()

X_meta_temp = pd.DataFrame({
    'meta_xgb': pred_xgb_test,
    'meta_lgbm': pred_lgbm_test,
    'meta_cat': pred_catboost_test
})

meta_cols = ['meta_xgb', 'meta_lgbm', 'meta_cat']
X_meta_test_base = np.column_stack([
    pred_xgb_test,
    pred_lgbm_test,
    pred_catboost_test
])
pred_meta_as_feature = best_meta_model.predict(X_meta_test_base)
meta_mean_test = X_meta_temp[meta_cols].mean(axis=1)
meta_std_test = X_meta_temp[meta_cols].std(axis=1)
meta_range_test = X_meta_temp[meta_cols].max(axis=1) - X_meta_temp[meta_cols].min(axis=1)

X_res_test_final = np.column_stack([
    pred_xgb_test,
    pred_lgbm_test,
    pred_catboost_test,
    meta_mean_test,
    meta_std_test,
    meta_range_test,
    pred_meta_as_feature
])

pred_meta = best_res_model.predict(X_res_test_final)

y_test_true = preprocessor_y.inverse_transform(
    y_test_transformed.values.reshape(-1, 1)
).flatten()

final_prediction = preprocessor_y.inverse_transform(
    pred_meta.reshape(-1, 1)
).flatten()

nae = calculate_nae(y_test_true, final_prediction, eps)

print(f"NAE tập test (Giá trị gốc, res): {nae * 100:.4f}%")

NAE tập test (Giá trị gốc, res): 15.4700%


In [73]:
pred_xgb_infer = best_xgb_model.predict(X_infer_transformed).flatten()
pred_lgbm_infer = best_lgbm_model.predict(X_infer_transformed).flatten()
pred_catboost_infer = best_catboost_model.predict(X_infer_transformed).flatten()

X_meta_temp = pd.DataFrame({
    'meta_xgb': pred_xgb_infer,
    'meta_lgbm': pred_lgbm_infer,
    'meta_cat': pred_catboost_infer
})

meta_cols = ['meta_xgb', 'meta_lgbm', 'meta_cat']
X_meta_infer_base = np.column_stack([
    pred_xgb_infer,
    pred_lgbm_infer,
    pred_catboost_infer
])
pred_meta_as_feature = best_meta_model.predict(X_meta_infer_base)
meta_mean_infer = X_meta_temp[meta_cols].mean(axis=1)
meta_std_infer = X_meta_temp[meta_cols].std(axis=1)
meta_range_infer = X_meta_temp[meta_cols].max(axis=1) - X_meta_temp[meta_cols].min(axis=1)

X_res_infer_final = np.column_stack([
    pred_xgb_infer,
    pred_lgbm_infer,
    pred_catboost_infer,
    meta_mean_infer,
    meta_std_infer,
    meta_range_infer,
    pred_meta_as_feature
])

pred_meta_infer = best_res_model.predict(X_res_infer_final)

y_infer_true = preprocessor_y.inverse_transform(
    y_infer_transformed.values.reshape(-1, 1)
).flatten()

final_prediction = preprocessor_y.inverse_transform(
    pred_meta_infer.reshape(-1, 1)
).flatten()

nae = calculate_nae(y_infer_true, final_prediction, eps)

print(f"NAE tập kiểm thử (Giá trị gốc): {nae * 100:.4f}%")

NAE tập kiểm thử (Giá trị gốc): 9.2493%


NAE tập kiểm thử (Giá trị gốc): 9.2493%

In [86]:
artifacts = {
  "power_X": preprocessor_X,
  "power_y": preprocessor_y,
  "feature_list": X_train_transformed.columns.tolist(),
  "oof_predict_models": oof_predcit_models,
  "base_models": {
      "xgb": best_xgb_model, "lgbm": best_lgbm_model, "cat": best_catboost_model
  },
  "randomstate" : 42,
  "meta_model": best_meta_model,
  "residual_model": best_res_model
}
joblib.dump(artifacts, "./model/ltv_d60_stack_pipeline.joblib")

['./model/ltv_d60_stack_pipeline.joblib']

In [None]:
arti = joblib.load("./model/ltv_d60_stack_pipeline.joblib")
best_meta_model = arti['meta_model']

In [None]:
explainer = shap.TreeExplainer(best_xgb_model)
shap_values = explainer.shap_values(X_train_transformed) 

shap_importance = np.abs(shap_values).mean(axis=0)
features = X_train_transformed.columns.tolist()

feature_importance_df = pd.DataFrame({
    'feature': features,
    'importance': shap_importance
}).sort_values(by='importance', ascending=False) 

top_n = 60
best_features = feature_importance_df['feature'].head(top_n).tolist()

all_features = X_train_transformed.columns.tolist()
unimportant_features_to_drop = list(set(all_features) - set(best_features))

X_train_transformed = X_train_transformed.drop(columns=unimportant_features_to_drop)
X_test_transformed = X_test_transformed.drop(columns=unimportant_features_to_drop)

unimportant_features_to_drop

[]