In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.optimize import minimize
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import optuna
import lightgbm as lgb
import catboost as cb
from sklearn.linear_model import ElasticNet
import shap
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures
from ray.tune.search.sample import Integer as Int
from ray.tune.search.sample import Float as Real
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import logging
optuna.logging.set_verbosity(logging.WARNING)
from sklearn.preprocessing import RobustScaler
from numpy.random import dirichlet
from pandas.api.types import is_categorical_dtype, is_bool_dtype, is_float_dtype, is_integer_dtype
from sklearn.preprocessing import PowerTransformer
from optuna.integration import XGBoostPruningCallback
from autogluon.tabular import TabularPredictor
from autogluon.core.metrics import make_scorer

In [14]:
def apply_mixup_train_data(X_train_df, y_train_df, augmentation_factor=1.0, random_state=42):
    """Append mixup-style synthetic rows to a training set.

    Every synthetic row is built from two source rows A and B (distinct
    whenever the input has more than one row) and a weight ``lam`` drawn
    from Beta(1, 1):

    * float columns:   ``lam * A + (1 - lam) * B``
    * integer columns: the same blend, rounded to the nearest integer and
      clipped to the column's observed [min, max]
    * every other column (bool, category, strings, ...): copied from A when
      ``lam > 0.5`` and from B otherwise, keeping the column's dtype
    * targets:         ``lam * A + (1 - lam) * B`` computed in float64

    Parameters
    ----------
    X_train_df : pandas.DataFrame
        Feature rows.
    y_train_df : pandas.DataFrame or pandas.Series
        Numeric target(s), one row per row of ``X_train_df``.
    augmentation_factor : float, default 1.0
        ``int(len(X) * augmentation_factor)`` synthetic rows are generated.
        Values ``<= 0`` (or a result of 0 rows) return plain copies.
    random_state : int or None, default 42
        Seed passed to ``numpy.random.default_rng``.

    Returns
    -------
    (X_aug, y_aug)
        The original rows followed by the synthetic rows, on a fresh
        0..n-1 index. ``y_aug`` is a Series when a Series was passed in.
        When ``augmentation_factor <= 0`` the inputs are returned as copies
        with their original index.

    Raises
    ------
    ValueError
        If features and targets do not have the same number of rows.
    """
    if augmentation_factor <= 0:
        return X_train_df.copy(), y_train_df.copy()

    X = X_train_df.reset_index(drop=True)
    y = y_train_df.reset_index(drop=True)
    N = len(X)
    if N == 0:
        return X.copy(), y.copy()

    N_aug = int(N * augmentation_factor)
    if N_aug <= 0:
        return X.copy(), y.copy()

    if len(y) != N:
        raise ValueError(
            f"X_train_df and y_train_df must have the same number of rows, got {N} and {len(y)}"
        )

    # Work on a 2-D target internally so the (N_aug, 1) weights broadcast
    # per row; a Series is converted back right before returning.
    y_is_series = isinstance(y, pd.Series)
    y_frame = y.to_frame() if y_is_series else y

    # NOTE: the order of the three draws below is kept fixed so that a given
    # seed keeps producing the same partners and weights.
    rng = np.random.default_rng(random_state)
    idx_A = rng.integers(0, N, size=N_aug)
    # A non-zero offset modulo N guarantees B != A whenever N > 1.
    idx_B = (idx_A + (rng.integers(1, N, size=N_aug) if N > 1 else 0)) % N
    lam = rng.beta(1.0, 1.0, size=N_aug).reshape(-1, 1)
    # For non-blendable columns the whole value comes from the dominant row.
    idx_pick = np.where(lam.ravel() > 0.5, idx_A, idx_B)

    cols = list(X.columns)
    bool_cols = [c for c in cols if is_bool_dtype(X[c].dtype)]
    float_cols = [c for c in cols if is_float_dtype(X[c].dtype)]
    int_cols = [
        c for c in cols
        if is_integer_dtype(X[c].dtype) and c not in bool_cols and c not in float_cols
    ]
    blended_cols = set(float_cols) | set(int_cols)
    copied_cols = [c for c in cols if c not in blended_cols]

    new_cols = {}

    if float_cols:
        values = X[float_cols].to_numpy()
        mixed = lam * values[idx_A] + (1.0 - lam) * values[idx_B]
        for j, c in enumerate(float_cols):
            new_cols[c] = mixed[:, j].astype(X[c].dtype, copy=False)

    if int_cols:
        as_float = X[int_cols].astype('float64')
        values = as_float.to_numpy()
        mixed = lam * values[idx_A] + (1.0 - lam) * values[idx_B]
        # Round back to whole numbers and stay inside the observed range.
        clipped = np.clip(np.rint(mixed), as_float.min().to_numpy(), as_float.max().to_numpy())
        for j, c in enumerate(int_cols):
            new_cols[c] = clipped[:, j].astype(X[c].dtype, copy=False)

    # Positional selection keeps each column's dtype (bool, category,
    # datetime, ...) instead of round-tripping through object arrays.
    for c in copied_cols:
        new_cols[c] = X[c].iloc[idx_pick].reset_index(drop=True)

    X_new = pd.DataFrame({c: new_cols[c] for c in cols}, columns=cols)

    y_values = y_frame.to_numpy(dtype=np.float64)
    y_new = lam * y_values[idx_A] + (1.0 - lam) * y_values[idx_B]
    y_new_df = pd.DataFrame(y_new, columns=y_frame.columns)

    X_aug = pd.concat([X, X_new], ignore_index=True)
    y_aug = pd.concat([y_frame, y_new_df], ignore_index=True)
    if y_is_series:
        y_aug = y_aug.iloc[:, 0]
        y_aug.name = y.name
    return X_aug, y_aug

def target_encode_cv(X_train_df, y_train_df, X_test_df, categorical_col, cv):
    """Replace a categorical column with cross-validated target means.

    For every fold produced by ``cv.split`` the per-category mean of the
    target is computed on the fold's training part only. Validation rows
    receive that fold's mean (out-of-fold encoding); test rows receive the
    average of the encodings over all folds. Categories that a fold has not
    seen fall back to the global mean of the target.

    Parameters
    ----------
    X_train_df, X_test_df : pandas.DataFrame
        Feature frames that both contain ``categorical_col``.
    y_train_df : pandas.DataFrame
        Targets; the first column is the one that gets averaged.
    categorical_col : hashable
        Name of the column to encode.
    cv : object
        Anything with a scikit-learn style ``split(X, y)`` method. The number
        of folds is counted while iterating, so splitters that do not expose
        an ``n_splits`` attribute work as well.

    Returns
    -------
    (X_train_encoded, X_test_encoded)
        Copies of the inputs on a fresh 0..n-1 index with ``categorical_col``
        replaced by float encodings. Training rows that never appear in a
        validation fold keep the initial value 0.0.
    """
    X_train_df = X_train_df.reset_index(drop=True)
    y_train_df = y_train_df.reset_index(drop=True)
    X_test_df = X_test_df.reset_index(drop=True)

    target_col = y_train_df.columns[0]
    target = y_train_df[target_col]
    global_mean = target.mean()

    def _plain_keys(column):
        # Mapping a `category` column can yield a categorical result whose
        # fillna() rejects the (new) global-mean value; plain objects avoid
        # that and also keep unobserved categories out of the group-by.
        if isinstance(column.dtype, pd.CategoricalDtype):
            return column.astype(object)
        return column

    train_keys = _plain_keys(X_train_df[categorical_col])
    test_keys = _plain_keys(X_test_df[categorical_col])

    oof_encoded = np.zeros(X_train_df.shape[0])
    test_sum = np.zeros(X_test_df.shape[0])
    n_folds = 0

    for train_idx, val_idx in cv.split(X_train_df, y_train_df):
        # Category -> mean target, learned on this fold's training rows only.
        fold_means = target.iloc[train_idx].groupby(train_keys.iloc[train_idx]).mean()

        oof_encoded[val_idx] = train_keys.iloc[val_idx].map(fold_means).fillna(global_mean).values
        test_sum += test_keys.map(fold_means).fillna(global_mean).values
        n_folds += 1

    # Average the per-fold test encodings over the folds that actually ran.
    test_encoded = test_sum / n_folds if n_folds else test_sum

    X_train_df[categorical_col] = oof_encoded
    X_test_df[categorical_col] = test_encoded

    return X_train_df, X_test_df

def generate_oof_elasticnet(X_train_df, y_train_df, X_test_df, cv, random_state=42,
                            alpha=0.1, l1_ratio=0.5):
    """Add an out-of-fold ElasticNet prediction column to train and test frames.

    For every fold produced by ``cv.split`` a ``StandardScaler`` is fitted on
    the fold's training rows, an ``ElasticNet`` is trained on the scaled
    rows, and the model predicts the fold's validation rows and the whole
    test set. Test predictions are averaged over the folds.

    Parameters
    ----------
    X_train_df, X_test_df : pandas.DataFrame
        Numeric feature frames with the same columns.
    y_train_df : pandas.DataFrame or pandas.Series
        Target values; flattened to one dimension.
    cv : object
        Anything with a scikit-learn style ``split(X, y)`` method. Folds are
        counted while iterating, so an ``n_splits`` attribute is not needed.
    random_state : int, default 42
        Passed to ``ElasticNet``.
    alpha, l1_ratio : float, default 0.1 and 0.5
        ``ElasticNet`` regularisation settings (previously hard-coded).

    Returns
    -------
    (X_train_out, X_test_out)
        Copies of the inputs with an extra ``'OOF_ElasticNet'`` column.
        Training rows that never appear in a validation fold keep 0.0.
    """
    X_train_out = X_train_df.copy()
    X_test_out = X_test_df.copy()

    X_train_arr = X_train_out.values
    y_train_arr = y_train_df.values.flatten()
    X_test_arr = X_test_out.values

    oof_predictions = np.zeros(X_train_arr.shape[0])
    test_sum = np.zeros(X_test_arr.shape[0])
    n_folds = 0

    for train_idx, val_idx in cv.split(X_train_arr, y_train_arr):
        # Scaling statistics come from the fold's training rows only, so the
        # validation rows never influence their own prediction.
        scaler = StandardScaler()
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)

        X_fit_scaled = scaler.fit_transform(X_train_arr[train_idx])
        model.fit(X_fit_scaled, y_train_arr[train_idx])

        oof_predictions[val_idx] = model.predict(scaler.transform(X_train_arr[val_idx]))
        test_sum += model.predict(scaler.transform(X_test_arr))
        n_folds += 1

    # Average the test predictions over the folds that actually ran.
    test_predictions = test_sum / n_folds if n_folds else test_sum

    oof_col_name = 'OOF_ElasticNet'
    X_train_out[oof_col_name] = oof_predictions
    X_test_out[oof_col_name] = test_predictions

    return X_train_out, X_test_out

In [15]:
# ---------------------------------------------------------------------------
# Load the raw export and build the day-0..day-3 feature table.
# ---------------------------------------------------------------------------
df_ori = pd.read_csv('data/raw_2025-04-01_2025-05-31_puzzle_com.twisted.rope.tangle.csv')

# Only the first four days (d0-d3) of each metric plus cost are used as inputs.
df = df_ori[['roas_d0','roas_d1','roas_d2','roas_d3',
        'cumulative_revenue_d0','cumulative_revenue_d1','cumulative_revenue_d2','cumulative_revenue_d3',
        'daily_revenue_d0','daily_revenue_d1','daily_revenue_d2',
        'unique_users_d0','unique_users_d1','unique_users_d2','unique_users_d3','daily_revenue_d3','cost',
        'ltv_d0', 'ltv_d1', 'ltv_d2', 'ltv_d3']].copy()

# --- Row-wise mean / standard deviation over d0-d3 --------------------------
df['ltv_mean'] = df[['ltv_d0', 'ltv_d1', 'ltv_d2', 'ltv_d3']].mean(axis=1)
df['roas_mean'] = df[['roas_d0','roas_d1','roas_d2','roas_d3']].mean(axis=1)
df['cumulative_revenue_mean'] = df[['cumulative_revenue_d0','cumulative_revenue_d1','cumulative_revenue_d2','cumulative_revenue_d3']].mean(axis=1)

df['ltv_std'] = df[['ltv_d0', 'ltv_d1', 'ltv_d2', 'ltv_d3']].std(axis=1)
df['roas_std'] = df[['roas_d0','roas_d1','roas_d2','roas_d3']].std(axis=1)
df['cumulative_revenue_std'] = df[['cumulative_revenue_d0','cumulative_revenue_d1','cumulative_revenue_d2','cumulative_revenue_d3']].std(axis=1)

# --- Growth between d0 and d3 ------------------------------------------------
df['ltv_growth'] = (df['ltv_d3'] - df['ltv_d0']) / (3 + 1e-9)
df['cumulative_revenue_growth'] = df['cumulative_revenue_d3'] - df['cumulative_revenue_d0']

# (d3 - d2) - (d1 - d0): change of the daily increment between the two halves.
df['revenue_acceleration'] = df['daily_revenue_d3'] - df['daily_revenue_d2'] - df['daily_revenue_d1'] + df['daily_revenue_d0']
df['user_acceleration'] = df['unique_users_d3'] - df['unique_users_d2'] - df['unique_users_d1'] + df['unique_users_d0']

df['roas_trend'] = df['roas_d3'] - df['roas_d0']
# Unguarded division: yields NaN/inf where roas_d3 == 0 (cleaned up further down).
df['ltv_roas_ratio'] = df['ltv_d3'] / df['roas_d3']

# --- Day-over-day slopes, per-user revenue and user ratios ------------------
df['ltv_slope_d0_d1'] = df['ltv_d1'] - df['ltv_d0']
df['ltv_slope_d1_d2'] = df['ltv_d2'] - df['ltv_d1']
df['ltv_slope_d2_d3'] = df['ltv_d3'] - df['ltv_d2']
df['ARPU_d0'] = df['daily_revenue_d0'] / df['unique_users_d0']
df['ARPU_d1'] = df['daily_revenue_d1'] / df['unique_users_d1']
df['ARPU_d2'] = df['daily_revenue_d2'] / df['unique_users_d2']
df['ARPU_d3'] = df['daily_revenue_d3'] / df['unique_users_d3']
df['retention_d1'] = df['unique_users_d1'] / df['unique_users_d0']
df['retention_d2'] = df['unique_users_d2'] / df['unique_users_d0']
df['retention_d3'] = df['unique_users_d3'] / df['unique_users_d0']

df['ltv_acceleration'] = df['ltv_slope_d2_d3'] - df['ltv_slope_d1_d2']
df['roas_slope_d0_d1'] = df['roas_d1'] - df['roas_d0']
df['roas_slope_d1_d2'] = df['roas_d2'] - df['roas_d1']
df['roas_slope_d2_d3'] = df['roas_d3'] - df['roas_d2']
df['roas_acceleration'] = df['roas_slope_d2_d3'] - df['roas_slope_d1_d2']
# 1 when the latest slope is smaller than the previous one, else 0.
df['is_ltv_slowing_down'] = (df['ltv_acceleration'] < 0).astype(int)
df['is_roas_slowing_down'] = (df['roas_acceleration'] < 0).astype(int)

# --- Cost / payback style ratios ---------------------------------------------
df['ltv_gain'] = df['ltv_d3'] - df['ltv_d0']
df['cumulative_users_d3'] = df['unique_users_d0'] + df['unique_users_d1'] + df['unique_users_d2'] + df['unique_users_d3']
df['ARPU_cumulative_d3'] = df['cumulative_revenue_d3'] / df['cumulative_users_d3']
df['ARPU_trend'] = (df['daily_revenue_d3'] / (df['unique_users_d3'] + 1e-9)) - (df['daily_revenue_d0'] / (df['unique_users_d0'] + 1e-9))
df['Payback_Velocity'] = (df['cumulative_revenue_d3'] / df['cost']) / 4
df['Acceleration_Ratio'] = df['revenue_acceleration'] / (df['user_acceleration'] + 1e-9)
df['CAC'] = df['cost'] / df['cumulative_users_d3']
df['ROAS_CV'] = df['roas_std'] / (df['roas_mean'] + 1e-9)
df['ERTI'] = df['cumulative_users_d3'] / df['cost']
df['LTV_CAC'] = df['ltv_d3'] / df['CAC']

# --- Ratios and pairwise interaction terms ----------------------------------
df['daily_to_cumulative_revenue_ratio_d3'] = df['daily_revenue_d3'] / (df['cumulative_revenue_d3'] + 1e-9)
df['d0_cohort_value_d3'] = df['cumulative_revenue_d3'] / (df['unique_users_d0'] + 1e-9)
df['user_growth_d3_vs_d0'] = df['unique_users_d3'] / (df['unique_users_d0'] + 1e-9)
df['cost_per_revenue_d3'] = df['cost'] / (df['cumulative_revenue_d3'] + 1e-9)
df['arpu_d3_x_payback'] = df['ARPU_cumulative_d3'] * df['Payback_Velocity']
df['LTV_CV'] = df['ltv_std'] / (df['ltv_mean'] + 1e-9)
df['LTV_CAC_Trend'] = df['LTV_CAC'] * df['ARPU_trend']
df['Cost_LTV_Mean'] = df['cost'] * df['ltv_mean']
df['Payback_Accel'] = df['Payback_Velocity'] * df['ltv_acceleration']
df['ROAS_Std_Weighted'] = df['roas_std'] * df['LTV_CAC']
df['Daily_to_Cumul_LTV'] = df['daily_to_cumulative_revenue_ratio_d3'] * df['ltv_d3']

# --- Degree-2 polynomial expansion of a hand-picked subset ------------------
# The base columns are dropped and re-added by the expansion together with
# their squares and pairwise products (names such as 'cost^2', 'ltv_mean cost').
power_cols_base = ['ltv_mean', 'cost', 'LTV_CAC', 'Payback_Velocity', 'ltv_acceleration', 'roas_mean','ARPU_trend']
poly_transformer = PolynomialFeatures(degree=2, include_bias=False)
X_poly_base = df[power_cols_base]
X_poly_transformed = poly_transformer.fit_transform(X_poly_base)
new_poly_names = poly_transformer.get_feature_names_out(input_features=power_cols_base)
df_poly = pd.DataFrame(X_poly_transformed, columns=new_poly_names, index=df.index)
df = pd.concat([df.drop(columns=power_cols_base), df_poly], axis=1)
# Splitter object; not referenced by the cells visible in this part of the notebook.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# --- Targets: every 'ltv_d<k>' column except the four lowest day indices ----
ltv = df_ori.filter(like='ltv_d')
# Sort by the integer after the 5-character 'ltv_d' prefix, then skip the first
# four columns (ltv_d0..ltv_d3 when those are present, as selected above).
ltv = ltv[sorted(ltv.columns, key=lambda x: int(x[5:]))]
ltv = ltv.iloc[: , 4:]
ltv_cols = ltv.columns.to_list()
# The unguarded divisions above can produce +/-inf (x / 0) as well as NaN
# (0 / 0). fillna alone leaves the infinities in place, so map them to NaN
# first; both then become 0. Targets are appended afterwards and stay untouched.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)
eps = 1e-9
df = pd.concat([df, ltv], axis=1)

df.shape

(509, 156)

In [16]:
# Columns removed from the feature table before modelling.
# NOTE(review): presumably the outcome of an earlier feature-selection pass;
# the selection itself is not part of this notebook section.
pruned_features = [
    'daily_revenue_d0',
    'LTV_CAC ARPU_trend',
    'ltv_mean Payback_Velocity',
    'LTV_CAC roas_mean',
    'LTV_CAC Payback_Velocity',
    'roas_mean',
    'ARPU_cumulative_d3',
    'cost Payback_Velocity',
    'cost',
    'revenue_acceleration',
    'LTV_CV',
    'Payback_Velocity roas_mean',
    'Payback_Velocity',
    'cumulative_revenue_d2',
    'cumulative_revenue_growth',
    'Payback_Velocity ARPU_trend',
    'LTV_CAC^2',
    'cumulative_users_d3',
    'cumulative_revenue_mean',
    'Payback_Velocity ltv_acceleration',
    'roas_mean^2',
    'cost roas_mean',
    'cumulative_revenue_std',
    'cost_per_revenue_d3',
    'cumulative_revenue_d3',
    'ltv_mean cost',
    'ltv_acceleration roas_mean',
    'cumulative_revenue_d1',
    'cost^2',
    'is_ltv_slowing_down',
    'LTV_CAC_Trend',
    'Payback_Velocity^2',
    'is_roas_slowing_down',
    'arpu_d3_x_payback',
    'daily_revenue_d2',
    'daily_revenue_d3',
    'roas_mean ARPU_trend',
    'cost LTV_CAC',
    'LTV_CAC ltv_acceleration',
    'Payback_Accel',
    'roas_trend',
    'ROAS_Std_Weighted',
]
# Raises KeyError if any name is missing, e.g. when this cell is run twice.
df = df.drop(columns=pruned_features)

In [17]:
# Everything that is not a future-day LTV target is a per-row feature.
wide_id_cols = [col for col in df.columns if col not in ltv_cols]

# Split while the table is still wide (one row per source record) so that
# all day-rows of a record land on the same side of the split.
X_wide, y_wide = df[wide_id_cols], df[ltv_cols]
X_train_wide, X_test_wide, y_train_wide, y_test_wide = train_test_split(
    X_wide, y_wide, test_size=0.2, random_state=42
)


def _side_by_side(feature_part, target_part):
    """Join features and targets column-wise on a fresh 0..n-1 index."""
    return pd.concat(
        [feature_part.reset_index(drop=True), target_part.reset_index(drop=True)],
        axis=1,
    )


def _wide_to_long(wide_frame, id_vars):
    """Melt the per-day LTV columns into one (Day, LTV) row per record and day."""
    long_frame = pd.melt(
        wide_frame,
        id_vars=id_vars,
        value_vars=ltv_cols,
        var_name='Day',
        value_name='LTV',
    )
    # 'ltv_d17' -> 17
    long_frame['Day'] = long_frame['Day'].str.replace('ltv_d', '').astype(int)
    return long_frame


df_train = _side_by_side(X_train_wide, y_train_wide)
df_test = _side_by_side(X_test_wide, y_test_wide)

id_vars_train = [col for col in df_train.columns if col not in ltv_cols]
id_vars_test = [col for col in df_test.columns if col not in ltv_cols]

df_train_long = _wide_to_long(df_train, id_vars_train)
df_test_long = _wide_to_long(df_test, id_vars_test)

# Model inputs are all long-format columns except the target itself
# (so 'Day' is a feature).
features_long = [col for col in df_train_long.columns if col != 'LTV']
target = ['LTV']

X_train = df_train_long[features_long].reset_index(drop=True)
y_train = df_train_long[target].reset_index(drop=True)
X_test = df_test_long[features_long].reset_index(drop=True)
y_test = df_test_long[target].reset_index(drop=True)

In [18]:
# Column order of the long-format feature matrix.
features = list(X_train.columns)

# Two independent Yeo-Johnson transformers (features / target), without
# standardisation. They are only instantiated here: the transform step
# below is commented out, so the data is used untransformed.
preprocessor_X, preprocessor_y = (
    PowerTransformer(method='yeo-johnson', standardize=False) for _ in range(2)
)

#X_train = pd.DataFrame(preprocessor_X.fit_transform(X_train), columns=features)
#X_test = pd.DataFrame(preprocessor_X.transform(X_test), columns=features)
#y_train_transformed = pd.DataFrame(preprocessor_y.fit_transform(y_train), columns=['LTV'])
#y_test_transformed = pd.DataFrame(preprocessor_y.transform(y_test), columns=['LTV'])

# Single frames holding features plus the 'LTV' label column.
train_data = pd.concat((X_train, y_train), axis='columns')
test_data = pd.concat((X_test, y_test), axis='columns')

In [19]:
def nae_score_func(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Mean absolute error relative to the true value, in percent.

    Computes ``100 * mean(|y_true - y_pred| / (|y_true| + 1e-9))``.

    Both inputs are converted to flat float arrays first, so the comparison
    is strictly positional: lists are accepted, pandas objects are not
    re-aligned on their index, and a column vector compared with a flat
    vector is not broadcast into a matrix.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values with the same number of elements.

    Returns
    -------
    float
        The score; 0 means a perfect prediction, lower is better.
    """
    eps = 1e-9  # keeps the ratio finite when a true value is exactly 0
    truth = np.asarray(y_true, dtype=float).ravel()
    guess = np.asarray(y_pred, dtype=float).ravel()
    return 100 * np.mean(np.abs(truth - guess) / (np.abs(truth) + eps))

# AutoGluon scorer wrapping nae_score_func.
# - greater_is_better=False marks it as a loss; AutoGluon then reports the
#   negated value (the "-NAE" lines in the training log).
# - optimum=0 declares that the best achievable value of the raw metric is 0;
#   the default of 1 is meant for accuracy-like scores and would shift the
#   error AutoGluon derives from the score by a constant.
custom_nae = make_scorer(
    name='NAE',
    score_func=nae_score_func,
    optimum=0,
    greater_is_better=False,
    needs_proba=False
)

In [21]:
# Predictor for the 'LTV' label, scored with the custom NAE metric; model
# artefacts are written below ./AutoGluon_LTV_D60.
predictor = TabularPredictor(label="LTV", eval_metric=custom_nae, path='./AutoGluon_LTV_D60')

# Training options, collected in one place for readability.
fit_options = {
    'presets': 'best_quality',
    'time_limit': 3600,
    # Fit the bagging folds one after another in the local process.
    'ag_args_ensemble': {'fold_fitting_strategy': 'sequential_local'},
    'ds_args': {'enable_ray_logging': False},
}
predictor.fit(train_data, **fit_options)

print("\n--- AutoGluon đã hoàn tất huấn luyện! ---")

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.6
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          16
Memory Avail:       4.77 GB / 13.86 GB (34.5%)
Disk Space Avail:   49.23 GB / 125.01 GB (39.4%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for

[1000]	valid_set's l2: 4.44143e-05	valid_set's NAE: -1.79149
[2000]	valid_set's l2: 2.8829e-05	valid_set's NAE: -1.2437
[3000]	valid_set's l2: 2.27414e-05	valid_set's NAE: -1.05478
[4000]	valid_set's l2: 2.01099e-05	valid_set's NAE: -0.974154
[5000]	valid_set's l2: 1.83753e-05	valid_set's NAE: -0.930485
[6000]	valid_set's l2: 1.75433e-05	valid_set's NAE: -0.9016
[7000]	valid_set's l2: 1.68057e-05	valid_set's NAE: -0.881215
[8000]	valid_set's l2: 1.63229e-05	valid_set's NAE: -0.863634
[9000]	valid_set's l2: 1.58702e-05	valid_set's NAE: -0.850098
[10000]	valid_set's l2: 1.54429e-05	valid_set's NAE: -0.84256
[1000]	valid_set's l2: 5.26871e-05	valid_set's NAE: -1.82847
[2000]	valid_set's l2: 3.51421e-05	valid_set's NAE: -1.30492
[3000]	valid_set's l2: 3.11622e-05	valid_set's NAE: -1.12021
[4000]	valid_set's l2: 2.94484e-05	valid_set's NAE: -1.02189
[5000]	valid_set's l2: 2.80146e-05	valid_set's NAE: -0.954643
[6000]	valid_set's l2: 2.73604e-05	valid_set's NAE: -0.90725
[7000]	valid_set's l

	-0.7987	 = Validation score   (-NAE)
	134.63s	 = Training   runtime
	2.43s	 = Validation runtime
Fitting model: LightGBM_BAG_L1 ... Training model for up to 418.13s of the 697.43s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy (sequential: cpus=8, gpus=0)


[1000]	valid_set's l2: 3.33408e-05	valid_set's NAE: -1.41423
[2000]	valid_set's l2: 2.36624e-05	valid_set's NAE: -1.01791
[3000]	valid_set's l2: 2.08441e-05	valid_set's NAE: -0.898944
[4000]	valid_set's l2: 1.96844e-05	valid_set's NAE: -0.833554
[5000]	valid_set's l2: 1.91002e-05	valid_set's NAE: -0.798653
[6000]	valid_set's l2: 1.8749e-05	valid_set's NAE: -0.775788
[7000]	valid_set's l2: 1.84267e-05	valid_set's NAE: -0.760893
[8000]	valid_set's l2: 1.82026e-05	valid_set's NAE: -0.749684
[9000]	valid_set's l2: 1.80803e-05	valid_set's NAE: -0.745679
[10000]	valid_set's l2: 1.79859e-05	valid_set's NAE: -0.742558
[1000]	valid_set's l2: 3.96249e-05	valid_set's NAE: -1.39801
[2000]	valid_set's l2: 2.83815e-05	valid_set's NAE: -1.06168
[3000]	valid_set's l2: 2.43296e-05	valid_set's NAE: -0.931032
[4000]	valid_set's l2: 2.22724e-05	valid_set's NAE: -0.870068
[5000]	valid_set's l2: 2.12354e-05	valid_set's NAE: -0.825578
[6000]	valid_set's l2: 2.04275e-05	valid_set's NAE: -0.802129
[7000]	valid

	-0.7271	 = Validation score   (-NAE)
	144.63s	 = Training   runtime
	2.3s	 = Validation runtime
Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 268.08s of the 547.38s of remaining time.
	-0.6727	 = Validation score   (-NAE)
	10.04s	 = Training   runtime
	1.62s	 = Validation runtime
Fitting model: CatBoost_BAG_L1 ... Training model for up to 255.74s of the 535.04s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy (sequential: cpus=8, gpus=0)
	Ran out of time, early stopping on iteration 2885.
	Ran out of time, early stopping on iteration 3035.
	Ran out of time, early stopping on iteration 3055.
	Ran out of time, early stopping on iteration 3189.
	Ran out of time, early stopping on iteration 3220.
	Ran out of time, early stopping on iteration 3426.
	Ran out of time, early stopping on iteration 3731.
	Ran out of time, early stopping on iteration 4279.
	-1.101	 = Validation score   (-NAE)
	245.2s	 = Training   run

[1000]	valid_set's l2: 2.46191e-05	valid_set's NAE: -1.0765
[2000]	valid_set's l2: 1.91369e-05	valid_set's NAE: -0.92381
[3000]	valid_set's l2: 1.70434e-05	valid_set's NAE: -0.859342
[4000]	valid_set's l2: 1.5772e-05	valid_set's NAE: -0.826351
[5000]	valid_set's l2: 1.50185e-05	valid_set's NAE: -0.803955
[6000]	valid_set's l2: 1.45495e-05	valid_set's NAE: -0.788415
[7000]	valid_set's l2: 1.41573e-05	valid_set's NAE: -0.774846
[8000]	valid_set's l2: 1.38791e-05	valid_set's NAE: -0.763062
[9000]	valid_set's l2: 1.36684e-05	valid_set's NAE: -0.755098
[10000]	valid_set's l2: 1.35146e-05	valid_set's NAE: -0.744829
[1000]	valid_set's l2: 2.79914e-05	valid_set's NAE: -1.13303
[2000]	valid_set's l2: 2.24046e-05	valid_set's NAE: -0.973433
[3000]	valid_set's l2: 2.01694e-05	valid_set's NAE: -0.903656
[4000]	valid_set's l2: 1.90476e-05	valid_set's NAE: -0.861948
[5000]	valid_set's l2: 1.82315e-05	valid_set's NAE: -0.836914
[6000]	valid_set's l2: 1.76455e-05	valid_set's NAE: -0.818208
[7000]	valid

	-0.7511	 = Validation score   (-NAE)
	137.29s	 = Training   runtime
	2.8s	 = Validation runtime
Fitting model: LightGBM_BAG_L2 ... Training model for up to 135.56s of the 135.52s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy (sequential: cpus=8, gpus=0)


[1000]	valid_set's l2: 2.14167e-05	valid_set's NAE: -0.684011
[1000]	valid_set's l2: 1.92718e-05	valid_set's NAE: -0.697785
[1000]	valid_set's l2: 1.95874e-05	valid_set's NAE: -0.651812
[2000]	valid_set's l2: 1.91323e-05	valid_set's NAE: -0.643038
[3000]	valid_set's l2: 1.90318e-05	valid_set's NAE: -0.638954
[4000]	valid_set's l2: 1.88394e-05	valid_set's NAE: -0.63551
[5000]	valid_set's l2: 1.86927e-05	valid_set's NAE: -0.632691
[6000]	valid_set's l2: 1.86534e-05	valid_set's NAE: -0.628275
[7000]	valid_set's l2: 1.86012e-05	valid_set's NAE: -0.627628
[8000]	valid_set's l2: 1.85439e-05	valid_set's NAE: -0.62708
[9000]	valid_set's l2: 1.85187e-05	valid_set's NAE: -0.625804
[10000]	valid_set's l2: 1.8497e-05	valid_set's NAE: -0.625542
[1000]	valid_set's l2: 2.34085e-05	valid_set's NAE: -0.677676
[1000]	valid_set's l2: 1.94459e-05	valid_set's NAE: -0.667631
[2000]	valid_set's l2: 1.9305e-05	valid_set's NAE: -0.661425
[3000]	valid_set's l2: 1.91784e-05	valid_set's NAE: -0.655655
[4000]	vali

	-0.6729	 = Validation score   (-NAE)
	71.03s	 = Training   runtime
	1.05s	 = Validation runtime
Fitting model: RandomForestMSE_BAG_L2 ... Training model for up to 62.03s of the 61.99s of remaining time.
	-0.489	 = Validation score   (-NAE)
	22.75s	 = Training   runtime
	1.77s	 = Validation runtime
Fitting model: CatBoost_BAG_L2 ... Training model for up to 36.98s of the 36.94s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy (sequential: cpus=8, gpus=0)
	Ran out of time, early stopping on iteration 512.
	Ran out of time, early stopping on iteration 528.
	Ran out of time, early stopping on iteration 548.
	Ran out of time, early stopping on iteration 561.
	Ran out of time, early stopping on iteration 587.
	Ran out of time, early stopping on iteration 625.
	Ran out of time, early stopping on iteration 688.
	Ran out of time, early stopping on iteration 777.
	-1.6137	 = Validation score   (-NAE)
	35.35s	 = Training   runtime
	0.06s	

[1000]	valid_set's l2: 4.49378e-05	valid_set's NAE: -1.70539
[2000]	valid_set's l2: 2.72039e-05	valid_set's NAE: -1.21988
[3000]	valid_set's l2: 2.12767e-05	valid_set's NAE: -1.05031
[4000]	valid_set's l2: 1.83909e-05	valid_set's NAE: -0.957863
[5000]	valid_set's l2: 1.66117e-05	valid_set's NAE: -0.897245
[6000]	valid_set's l2: 1.55325e-05	valid_set's NAE: -0.852813
[7000]	valid_set's l2: 1.4757e-05	valid_set's NAE: -0.82244
[8000]	valid_set's l2: 1.42667e-05	valid_set's NAE: -0.799221
[9000]	valid_set's l2: 1.38138e-05	valid_set's NAE: -0.781367
[10000]	valid_set's l2: 1.35e-05	valid_set's NAE: -0.766259
[1000]	valid_set's l2: 3.85818e-05	valid_set's NAE: -1.77818
[2000]	valid_set's l2: 2.20701e-05	valid_set's NAE: -1.21518
[3000]	valid_set's l2: 1.77019e-05	valid_set's NAE: -1.01879
[4000]	valid_set's l2: 1.57277e-05	valid_set's NAE: -0.924988
[5000]	valid_set's l2: 1.44791e-05	valid_set's NAE: -0.863198
[6000]	valid_set's l2: 1.37096e-05	valid_set's NAE: -0.824957
[7000]	valid_set's

	-0.7621	 = Validation score   (-NAE)
	141.1s	 = Training   runtime
	2.77s	 = Validation runtime
Fitting model: LightGBM_BAG_L1 ... Training model for up to 1640.87s of the 2535.43s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy (sequential: cpus=8, gpus=0)


[1000]	valid_set's l2: 2.71955e-05	valid_set's NAE: -1.24515
[2000]	valid_set's l2: 1.85125e-05	valid_set's NAE: -0.931326
[3000]	valid_set's l2: 1.56782e-05	valid_set's NAE: -0.795795
[4000]	valid_set's l2: 1.45349e-05	valid_set's NAE: -0.743509
[5000]	valid_set's l2: 1.39221e-05	valid_set's NAE: -0.713621
[6000]	valid_set's l2: 1.35142e-05	valid_set's NAE: -0.691834
[7000]	valid_set's l2: 1.3242e-05	valid_set's NAE: -0.677011
[8000]	valid_set's l2: 1.30563e-05	valid_set's NAE: -0.662758
[9000]	valid_set's l2: 1.29228e-05	valid_set's NAE: -0.654695
[10000]	valid_set's l2: 1.28358e-05	valid_set's NAE: -0.64889
[1000]	valid_set's l2: 2.94759e-05	valid_set's NAE: -1.27978
[2000]	valid_set's l2: 1.89424e-05	valid_set's NAE: -0.944658
[3000]	valid_set's l2: 1.57197e-05	valid_set's NAE: -0.821208
[4000]	valid_set's l2: 1.42772e-05	valid_set's NAE: -0.747075
[5000]	valid_set's l2: 1.34653e-05	valid_set's NAE: -0.712327
[6000]	valid_set's l2: 1.29916e-05	valid_set's NAE: -0.683752
[7000]	vali

	-0.6767	 = Validation score   (-NAE)
	165.43s	 = Training   runtime
	2.75s	 = Validation runtime
Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 1469.54s of the 2364.10s of remaining time.
	-0.6084	 = Validation score   (-NAE)
	12.61s	 = Training   runtime
	2.0s	 = Validation runtime
Fitting model: CatBoost_BAG_L1 ... Training model for up to 1454.31s of the 2348.87s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy (sequential: cpus=8, gpus=0)
	-0.7097	 = Validation score   (-NAE)
	1190.45s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L1 ... Training model for up to 263.49s of the 1158.05s of remaining time.
	-0.4655	 = Validation score   (-NAE)
	4.92s	 = Training   runtime
	1.79s	 = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L1 ... Training model for up to 256.16s of the 1150.72s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with Sequenti

[1000]	valid_set's l2: 2.35137e-05	valid_set's NAE: -1.08813
[2000]	valid_set's l2: 1.94188e-05	valid_set's NAE: -0.943316
[3000]	valid_set's l2: 1.75397e-05	valid_set's NAE: -0.877426
[4000]	valid_set's l2: 1.64953e-05	valid_set's NAE: -0.83431
[5000]	valid_set's l2: 1.5807e-05	valid_set's NAE: -0.807262
[6000]	valid_set's l2: 1.5299e-05	valid_set's NAE: -0.783644
[7000]	valid_set's l2: 1.49423e-05	valid_set's NAE: -0.764814
[8000]	valid_set's l2: 1.4635e-05	valid_set's NAE: -0.75175
[9000]	valid_set's l2: 1.4312e-05	valid_set's NAE: -0.739565
[10000]	valid_set's l2: 1.41251e-05	valid_set's NAE: -0.731082
[1000]	valid_set's l2: 2.21152e-05	valid_set's NAE: -1.11816
[2000]	valid_set's l2: 1.80504e-05	valid_set's NAE: -0.956449
[3000]	valid_set's l2: 1.66046e-05	valid_set's NAE: -0.894123
[4000]	valid_set's l2: 1.5812e-05	valid_set's NAE: -0.857513
[5000]	valid_set's l2: 1.51899e-05	valid_set's NAE: -0.829996
[6000]	valid_set's l2: 1.46964e-05	valid_set's NAE: -0.809636
[7000]	valid_set

	-0.7444	 = Validation score   (-NAE)
	155.39s	 = Training   runtime
	3.28s	 = Validation runtime
Fitting model: LightGBM_BAG_L2 ... Training model for up to 732.16s of the 732.14s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy (sequential: cpus=8, gpus=0)


[1000]	valid_set's l2: 1.54226e-05	valid_set's NAE: -0.653821
[1000]	valid_set's l2: 5.82778e-05	valid_set's NAE: -0.66989
[1000]	valid_set's l2: 1.57098e-05	valid_set's NAE: -0.649907
[1000]	valid_set's l2: 1.66018e-05	valid_set's NAE: -0.635189
[2000]	valid_set's l2: 1.69325e-05	valid_set's NAE: -0.63049
[1000]	valid_set's l2: 1.71155e-05	valid_set's NAE: -0.64525


	-0.6422	 = Validation score   (-NAE)
	27.64s	 = Training   runtime
	0.21s	 = Validation runtime
Fitting model: RandomForestMSE_BAG_L2 ... Training model for up to 703.86s of the 703.84s of remaining time.
	-0.4472	 = Validation score   (-NAE)
	30.17s	 = Training   runtime
	1.83s	 = Validation runtime
Fitting model: CatBoost_BAG_L2 ... Training model for up to 671.31s of the 671.28s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy (sequential: cpus=8, gpus=0)
	Ran out of time, early stopping on iteration 5810.
	Ran out of time, early stopping on iteration 5707.
	Ran out of time, early stopping on iteration 6166.
	Ran out of time, early stopping on iteration 6212.
	Ran out of time, early stopping on iteration 6429.
	Ran out of time, early stopping on iteration 6723.
	Ran out of time, early stopping on iteration 7440.
	Ran out of time, early stopping on iteration 8045.
	-0.7171	 = Validation score   (-NAE)
	643.8s	 = Training   ru


--- AutoGluon đã hoàn tất huấn luyện! ---


In [22]:
# Predict on the held-out rows (label column removed).
X_test_input = test_data.drop(columns=['LTV'])
y_pred = predictor.predict(X_test_input)

# Per-row percentage error, stored on test_data as 'pre_nae'.
abs_error = np.abs(test_data["LTV"] - y_pred)
test_data["pre_nae"] = 100 * abs_error / (np.abs(test_data["LTV"]) + eps)

# Average the per-row error for each prediction day.
# (The variable is called mae_by_day, but it holds the NAE values.)
mae_by_day = (
    test_data.groupby('Day')['pre_nae']
    .mean()
    .reset_index()
    .rename(columns={'pre_nae': 'NAE'})
)

print(mae_by_day)

    Day        NAE
0     4   4.069788
1     5   4.796543
2     6   5.404289
3     7   6.135445
4     8   6.447680
5     9   6.932190
6    10   7.050550
7    11   7.287006
8    12   7.672455
9    13   7.917473
10   14   8.194911
11   15   8.511490
12   16   8.831794
13   17   9.295538
14   18   9.509186
15   19   9.729783
16   20  10.085205
17   21  10.484871
18   22  10.872049
19   23  11.134427
20   24  11.426585
21   25  11.680832
22   26  11.823627
23   27  11.959119
24   28  12.182942
25   29  12.282060
26   30  12.416905
27   31  12.629766
28   32  12.820678
29   33  12.982271
30   34  13.139633
31   35  13.308281
32   36  13.450398
33   37  13.502838
34   38  13.617318
35   39  13.767979
36   40  13.840447
37   41  13.938929
38   42  14.075051
39   43  14.218034
40   44  14.370665
41   45  14.442772
42   46  14.616061
43   47  14.750430
44   48  14.839564
45   49  14.906032
46   50  14.952855
47   51  15.060642
48   52  15.136692
49   53  15.184188
50   54  15.281330
51   55  15.

In [23]:
# Predictions of the best model on the held-out features.
test_features = test_data.drop(columns=["LTV"])
y_pred_autogluon = predictor.predict(test_features)

# The return value is not captured, and as a mid-cell expression it is not
# displayed either.
predictor.evaluate(test_data, silent=True)

# Per-model scores on the test data next to their validation scores.
leaderboard = predictor.leaderboard(test_data, silent=True)
print("\n--- Bảng xếp hạng các mô hình tự động ---")
print(leaderboard)


--- Bảng xếp hạng các mô hình tự động ---
                     model  score_test   score_val eval_metric  \
0   RandomForestMSE_BAG_L2  -11.671582   -0.447187         NAE   
1      WeightedEnsemble_L3  -11.868333   -0.419216         NAE   
2     ExtraTreesMSE_BAG_L2  -11.888331   -0.431292         NAE   
3      WeightedEnsemble_L2  -11.923858   -0.432650         NAE   
4          LightGBM_BAG_L2  -12.051697   -0.642188         NAE   
5          CatBoost_BAG_L2  -12.134461   -0.717130         NAE   
6     ExtraTreesMSE_BAG_L1  -12.144745   -0.465497         NAE   
7        LightGBMXT_BAG_L2  -12.333209   -0.744387         NAE   
8        LightGBMXT_BAG_L1  -12.400510   -0.762105         NAE   
9          LightGBM_BAG_L1  -12.474995   -0.676652         NAE   
10         CatBoost_BAG_L1  -13.024370   -0.709670         NAE   
11          XGBoost_BAG_L1  -13.147452   -1.215476         NAE   
12   NeuralNetTorch_BAG_L1  -13.662168   -6.444264         NAE   
13  RandomForestMSE_BAG_L1  -13.8

          Day        NAE
0    3.084288   3.971061
1    3.722591   4.699986
2    4.333314   5.325197
3    4.921379   6.120727
4    5.490327   6.485921
5    6.042813   6.928712
6    6.580898   7.075751
7    7.106221   7.369742
8    7.620115   7.687313
9    8.123682   7.941926
10   8.617846   8.175557
11   9.103394   8.527441
12   9.581002   8.864384
13  10.051256   9.259032
14  10.514669   9.447124
15  10.971694   9.707706
16  11.422731  10.069434
17  11.868140  10.412576
18  12.308241  10.764021
19  12.743324  11.062590
20  13.173654  11.254207
21  13.599468  11.503503
22  14.020985  11.649880
23  14.438406  11.796686
24  14.851914  12.009804
25  15.261680  12.120289
26  15.667860  12.263903
27  16.070600  12.525885
28  16.470035  12.719469
29  16.866291  12.935356
30  17.259487  13.115681
31  17.649731  13.254639
32  18.037128  13.391835
33  18.421774  13.492484
34  18.803761  13.619953
35  19.183174  13.746165
36  19.560094  13.881528
37  19.934599  13.971727
38  20.306760  14.121200
39  20.676646  14.249872
40  21.044322  14.373244
41  21.409851  14.436997
42  21.773291  14.596118
43  22.134697  14.695176
44  22.494124  14.801711
45  22.851623  14.859983
46  23.207241  14.919175
47  23.561026  14.998386
48  23.913022  15.066283
49  24.263271  15.101228
50  24.611814  15.176045
51  24.958691  15.223690
52  25.303939  15.237406
53  25.647593  15.478252
54  25.989689  15.576702
55  26.330259  15.661039
56  26.669337  15.692962
