In [None]:
import pandas as pd
import numpy as np
import os
import lightgbm as lgb
from scipy.optimize import curve_fit
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from joblib import Parallel, delayed
import warnings
import torch
from tqdm.auto import tqdm


In [None]:
# Install a blanket "ignore" filter so no warning category is shown for the
# rest of the session.
warnings.simplefilter('ignore')


In [None]:
import os
import sys

class DevNull:
    """Write-only text sink that silently discards everything sent to it.

    It is meant to stand in for a stream such as ``sys.stderr``, so it offers
    the small part of the file protocol that callers commonly probe
    (``write`` returning a character count, ``writelines``, ``flush``,
    ``isatty``) instead of only ``write``/``flush``.
    """

    def write(self, msg):
        """Discard ``msg`` and report it as fully written, like a real stream."""
        return len(msg)

    def writelines(self, lines):
        """Consume and discard every item of ``lines``."""
        for _ in lines:
            pass

    def flush(self):
        """Nothing is buffered, so flushing is a no-op."""
        pass

    def isatty(self):
        """This sink is never an interactive terminal."""
        return False

# Replace the process-wide stderr with a DevNull instance so anything written
# to stderr from here on is discarded.
# NOTE(review): this also hides genuine error text that libraries print to
# stderr -- restore the original stream when debugging.
sys.stderr = DevNull()
# True when torch reports a usable CUDA device; read later when the XGBoost
# classifiers pick their `device`.
USE_GPU = torch.cuda.is_available()


In [None]:
# Root directory of the competition data; every file path below is built from it.
DATA_PATH = '/kaggle/input/mallorn-dataset'
print("Data Path: " + DATA_PATH)

In [None]:
def bazin_func(t, A, B, t0, tau_fall, tau_rise):
    """Evaluate the Bazin light-curve model at time(s) ``t``.

    The model is ``A * exp(-(t - t0) / tau_fall) / (1 + exp(-(t - t0) / tau_rise)) + B``.
    Overflow and invalid-operation warnings are suppressed while evaluating,
    and NaN/inf results are replaced by ``np.nan_to_num`` so the return value
    is always finite.
    """
    with np.errstate(over='ignore', invalid='ignore'):
        dt = t - t0
        decay = np.exp(-dt / tau_fall)
        rise = 1 + np.exp(-dt / tau_rise)
        model = A * (decay / rise) + B
    return np.nan_to_num(model)

In [None]:
def fit_bazin(time, flux, flux_err):
    """Fit the Bazin model to one band's light curve.

    Parameters
    ----------
    time, flux, flux_err : numpy arrays of equal length
        Observation times, fluxes and flux uncertainties (used as ``sigma``).

    Returns
    -------
    dict
        Keys ``A``, ``B``, ``t0``, ``tau_fall``, ``tau_rise`` and ``chi2``
        (reduced chi-square). All values are NaN when there are fewer than five
        points or the fit fails; ``chi2`` alone is NaN when there are no
        degrees of freedom left (exactly five points).
    """
    n_params = 5
    nan_result = {k: np.nan for k in ['A', 'B', 't0', 'tau_fall', 'tau_rise', 'chi2']}
    if len(time) < n_params:
        return nan_result

    peak_idx = np.argmax(flux)
    baseline = np.min(flux)

    # Parameter bounds, in the order (A, B, t0, tau_fall, tau_rise).
    lower = np.array([0, -np.inf, time.min() - 50, 1e-3, 1e-3], dtype=float)
    upper = np.array([np.inf, np.inf, time.max() + 50, 500, 500], dtype=float)

    # Initial guesses. A non-positive peak flux would violate the A >= 0 bound
    # (curve_fit rejects an infeasible x0), so fall back to the peak-to-baseline
    # amplitude in that case and clip everything into the bounds.
    amp0 = flux[peak_idx] if flux[peak_idx] > 0 else flux[peak_idx] - baseline
    p0 = np.clip([amp0, baseline, time[peak_idx], 50.0, 10.0], lower, upper)

    try:
        popt, _ = curve_fit(bazin_func, time, flux, p0=p0, sigma=flux_err,
                            bounds=(lower, upper), maxfev=1000)
        residuals = flux - bazin_func(time, *popt)
        dof = len(time) - n_params
        # With exactly n_params points there are no degrees of freedom left;
        # report NaN instead of dividing by zero.
        chi2 = np.sum((residuals / flux_err)**2) / dof if dof > 0 else np.nan
        return {'A': popt[0], 'B': popt[1], 't0': popt[2],
                'tau_fall': popt[3], 'tau_rise': popt[4], 'chi2': chi2}
    except Exception:
        # Best-effort feature: any fitting failure yields NaNs rather than
        # aborting the whole extraction run (but Ctrl-C still propagates).
        return nan_result

In [None]:
def get_gp_prediction(time, flux, flux_err, t_query):
    """Predict the flux at ``t_query`` with a Gaussian-process fit to one band.

    The GP uses a Constant * RBF + White kernel, with the squared flux errors
    as the per-point ``alpha``. Returns NaN when fewer than three observations
    are available or when fitting/prediction fails.
    """
    if len(time) < 3:
        return np.nan
    kernel = C(1.0) * RBF(length_scale=20.0) + WhiteKernel(noise_level=1.0)
    gp = GaussianProcessRegressor(kernel=kernel, alpha=flux_err**2, n_restarts_optimizer=0)
    try:
        gp.fit(time.reshape(-1, 1), flux)
        # Only the mean is used, so skip the predictive standard deviation.
        pred = gp.predict(np.array([[t_query]]))
        return pred[0]
    except Exception:
        # Best-effort feature: a failed fit becomes NaN instead of stopping the run.
        return np.nan

In [None]:
def calculate_stetson(flux, flux_err):
    """Compute single-band Stetson variability indices J and K.

    Residuals are ``delta = (flux - mean(flux)) / (flux_err + 1e-6)``.

    * ``K = (1/sqrt(n)) * sum|delta| / sqrt(sum(delta^2))``, which equals
      ``mean|delta| / sqrt(mean(delta^2))``. No extra ``1/sqrt(n)`` factor is
      applied on top of the means, so K does not shrink with the number of
      observations.
    * ``J`` is the mean over consecutive pairs of
      ``sign(P) * sqrt(|P|)`` with ``P = delta_i * delta_{i+1}``. Its sign is
      kept: negative J means neighbouring residuals tend to have opposite
      signs.

    Returns ``(j, k)``; both are NaN for fewer than two points, and ``k`` is
    NaN when all residuals are zero.
    """
    flux = np.asarray(flux, dtype=float)
    flux_err = np.asarray(flux_err, dtype=float)
    n = len(flux)
    if n < 2:
        return np.nan, np.nan

    delta = (flux - np.mean(flux)) / (flux_err + 1e-6)

    rms = np.sqrt(np.mean(delta**2))
    # Constant flux gives rms == 0; report NaN explicitly rather than 0/0.
    k = np.mean(np.abs(delta)) / rms if rms > 0 else np.nan

    pair_products = delta[:-1] * delta[1:]
    j = np.sum(np.sign(pair_products) * np.sqrt(np.abs(pair_products))) / (n - 1)

    return j, k

In [None]:
def calculate_chi2_red(flux, flux_err):
    """Reduced chi-square of ``flux`` about its inverse-variance weighted mean.

    A small constant (1e-9) is added to each variance to avoid division by
    zero. Returns NaN when fewer than two points are given.
    """
    n_obs = len(flux)
    if n_obs < 2:
        return np.nan
    variance = flux_err**2 + 1e-9
    weighted_mean = np.average(flux, weights=1/variance)
    chi2_total = np.sum(((flux - weighted_mean)**2) / variance)
    return chi2_total / (n_obs - 1)

def calculate_flux_excess(flux, flux_err):
    """How far the maximum flux sits above the median, in units of the std.

    ``flux_err`` is accepted for signature compatibility but not used.
    Returns NaN when fewer than two points are given.
    """
    if len(flux) >= 2:
        spread = np.std(flux) + 1e-9
        return (np.max(flux) - np.median(flux)) / spread
    return np.nan

def _flat_lcdm_luminosity_distance_mpc(z, h0=70.0, omega_m=0.3):
    """Luminosity distance in Mpc for a flat Lambda-CDM cosmology (no radiation).

    NOTE(review): H0 = 70 km/s/Mpc and Omega_m = 0.3 are conventional defaults
    chosen here -- adjust them if the analysis assumes another cosmology.
    """
    from scipy.integrate import quad  # local import keeps the helper self-contained

    c_km_s = 299792.458

    def inv_e(zp):
        return 1.0 / np.sqrt(omega_m * (1.0 + zp) ** 3 + (1.0 - omega_m))

    comoving_integral, _ = quad(inv_e, 0.0, float(z))
    return (1.0 + z) * (c_km_s / h0) * comoving_integral


def calculate_abs_mag(peak_flux, z):
    """Distance-corrected magnitude from a peak flux and a redshift.

    Computes ``M = -2.5*log10(peak_flux) - 5*(log10(d_L[pc]) - 1)``. No
    photometric zero point is applied, so the value is only meaningful
    relative to other objects. Returns NaN when ``z`` is NaN or non-positive,
    or when ``peak_flux`` is non-positive.

    A module-level ``cosmo`` object (astropy-style, exposing
    ``luminosity_distance(z).value`` in Mpc) is used when one is defined. No
    such object is defined alongside this function, which previously made every
    call fall into the error path and return NaN, so the built-in flat
    Lambda-CDM helper is used as the fallback.
    """
    if np.isnan(z) or z <= 0 or peak_flux <= 0:
        return np.nan
    try:
        d_l_mpc = cosmo.luminosity_distance(z).value
    except NameError:
        d_l_mpc = _flat_lcdm_luminosity_distance_mpc(z)
    except Exception:
        return np.nan

    d_L = d_l_mpc * 1e6  # convert Mpc to parsec
    if not np.isfinite(d_L) or d_L <= 0:
        return np.nan
    m_app = -2.5 * np.log10(peak_flux)
    M_abs = m_app - 5 * (np.log10(d_L) - 1)
    return M_abs

In [None]:
def process_single_object(obj_id, df_obj, redshift):
    """Build the feature dictionary for one object's multi-band light curve.

    Parameters
    ----------
    obj_id : hashable
        Identifier stored under the ``object_id`` key.
    df_obj : pandas.DataFrame
        Observations of this object with the columns ``Time (MJD)``,
        ``Filter``, ``Flux`` and ``Flux_err``. The frame is not modified.
    redshift : float
        Redshift passed to ``calculate_abs_mag``.

    Returns
    -------
    dict
        Per-filter statistics, Bazin-fit parameters for g/r/i, and the
        ``abs_mag_g``, ``color_slope_g_r`` and ``color_g_r_peak`` features.
        The same set of keys is produced for every object (NaN where a value
        cannot be computed), so train and test feature frames get identical
        columns.
    """
    filters = ['u', 'g', 'r', 'i', 'z', 'y']
    bazin_filters = ('g', 'r', 'i')
    peak_filters = ('g', 'r')
    basic_stats = ['chi2_red', 'excess', 'mean', 'min', 'skew',
                   'stetson_j', 'stetson_k', 'max', 'std']
    bazin_keys = ['A', 'B', 't0', 'tau_fall', 'tau_rise', 'chi2']

    feats = {'object_id': obj_id}

    # Normalise time to start at the object's first observation. Work on plain
    # arrays so no column is written into the (sliced) input frame.
    time_rel_all = df_obj['Time (MJD)'].values - df_obj['Time (MJD)'].min()
    flux_all = df_obj['Flux'].values
    err_all = df_obj['Flux_err'].values

    peak_time = np.nan
    max_flux_global = -np.inf
    filter_data = {}

    for f in filters:
        in_band = (df_obj['Filter'] == f).values
        if not in_band.any():
            # No observations in this band: emit NaN under exactly the keys a
            # populated band would produce.
            for stat in basic_stats:
                feats[f'{f}_{stat}'] = np.nan
            if f in bazin_filters:
                for key in bazin_keys:
                    feats[f'bazin_{f}_{key}'] = np.nan
                feats[f'{f}_rise_fall_ratio'] = np.nan
            continue

        flux = flux_all[in_band]
        flux_err = err_all[in_band]
        time = time_rel_all[in_band]

        feats[f'{f}_chi2_red'] = calculate_chi2_red(flux, flux_err)
        feats[f'{f}_excess'] = calculate_flux_excess(flux, flux_err)

        feats[f'{f}_mean'] = np.mean(flux)
        feats[f'{f}_min'] = np.min(flux)
        feats[f'{f}_skew'] = pd.Series(flux).skew()  # pandas gives a quick skewness

        stetson_j, stetson_k = calculate_stetson(flux, flux_err)
        feats[f'{f}_stetson_j'] = stetson_j
        feats[f'{f}_stetson_k'] = stetson_k

        feats[f'{f}_max'] = np.max(flux)
        feats[f'{f}_std'] = np.std(flux)

        # The global peak time is taken from the brighter of the g and r maxima.
        if f in peak_filters:
            current_max = feats[f'{f}_max']
            if current_max > max_flux_global:
                max_flux_global = current_max
                # Positional lookup: independent of the frame's index labels.
                peak_time = time[np.argmax(flux)]

        if f in bazin_filters:
            bazin = fit_bazin(time, flux, flux_err)
            for k_bazin, v_bazin in bazin.items():
                feats[f'bazin_{f}_{k_bazin}'] = v_bazin

            if not np.isnan(bazin['tau_rise']) and bazin['tau_fall'] > 0:
                feats[f'{f}_rise_fall_ratio'] = bazin['tau_rise'] / bazin['tau_fall']
            else:
                feats[f'{f}_rise_fall_ratio'] = np.nan

        filter_data[f] = (time, flux, flux_err)

    # Peak-dependent features default to NaN so the keys always exist.
    feats['abs_mag_g'] = np.nan
    feats['color_slope_g_r'] = np.nan
    feats['color_g_r_peak'] = np.nan

    if np.isnan(peak_time):
        return feats

    if 'g' in filter_data:
        feats['abs_mag_g'] = calculate_abs_mag(feats['g_max'], redshift)
    elif 'r' in filter_data:
        feats['abs_mag_g'] = calculate_abs_mag(feats['r_max'], redshift)  # fallback to r band

    # GP-interpolated g and r fluxes at the peak and dt_post days after it.
    dt_post = 20.0
    flux_at_peak = {}
    flux_at_post = {}
    for f in peak_filters:
        if f in filter_data:
            t, y_flux, y_err = filter_data[f]
            flux_at_peak[f] = get_gp_prediction(t, y_flux, y_err, peak_time)
            flux_at_post[f] = get_gp_prediction(t, y_flux, y_err, peak_time + dt_post)

    val_g0, val_r0 = flux_at_peak.get('g', np.nan), flux_at_peak.get('r', np.nan)
    val_g1, val_r1 = flux_at_post.get('g', np.nan), flux_at_post.get('r', np.nan)

    if not np.any(np.isnan([val_g0, val_r0, val_g1, val_r1])) and val_g0 > 0 and val_r0 > 0:
        color_0 = -2.5 * np.log10(val_g0 / val_r0)
        feats['color_g_r_peak'] = color_0
        # The post-peak colour needs positive fluxes too; otherwise the slope
        # stays NaN instead of becoming log10(<=0) = NaN/inf.
        if val_g1 > 0 and val_r1 > 0:
            color_1 = -2.5 * np.log10(val_g1 / val_r1)
            feats['color_slope_g_r'] = (color_1 - color_0) / dt_post

    return feats

In [None]:
def extract_features_parallel(log_df, data_path, n_jobs=-1):
    """Load raw light curves for every object in ``log_df`` and extract features in parallel.

    Parameters
    ----------
    log_df : pandas.DataFrame
        Object log with at least ``object_id`` and ``split`` columns. A
        ``target`` column marks it as the training log, which selects
        ``train_full_lightcurves.csv`` instead of ``test_full_lightcurves.csv``.
        The second column is taken as the redshift.
    data_path : str
        Root directory containing one sub-directory per split.
    n_jobs : int, default -1
        Number of joblib workers.

    Returns
    -------
    pandas.DataFrame
        One row of features per object found in the light-curve files.

    Raises
    ------
    ValueError
        If no light-curve file could be loaded for any split.
    """
    print("Loading raw lightcurves...")
    print(log_df.columns)

    # The redshift is read positionally from the second column.
    # NOTE(review): this relies on the log's column order -- confirm it holds
    # for every log file passed in.
    z_col = log_df.columns[1] if log_df.shape[1] > 1 else None
    if z_col is None or not pd.api.types.is_numeric_dtype(log_df[z_col]):
        print("Warning: Redshift column not found. Luminosity features will be NaN.")
        z_map = {}
    else:
        z_map = dict(zip(log_df['object_id'], log_df[z_col]))

    # The log type does not change between splits, so decide the file name once.
    is_train = 'target' in log_df.columns
    filename = 'train_full_lightcurves.csv' if is_train else 'test_full_lightcurves.csv'

    all_chunks = []
    for split in log_df['split'].unique():
        path = os.path.join(data_path, split, filename)
        if not os.path.exists(path):
            print(f"Warning: light-curve file not found, skipping: {path}")
            continue
        df_chunk = pd.read_csv(path)
        # Keep only the objects the log assigns to this split.
        valid_ids = set(log_df.loc[log_df['split'] == split, 'object_id'])
        all_chunks.append(df_chunk[df_chunk['object_id'].isin(valid_ids)])

    if not all_chunks:
        raise ValueError(
            f"No '{filename}' files were found under '{data_path}' for any split in the log."
        )

    # ignore_index gives every observation a unique row label across chunks.
    full_lc = pd.concat(all_chunks, ignore_index=True)
    grouped = full_lc.groupby('object_id')

    print(f"Extracting features for {grouped.ngroups} objects...")

    results = Parallel(n_jobs=n_jobs, backend='loky')(
        delayed(process_single_object)(
            obj_id,
            df_obj,
            z_map.get(obj_id, np.nan)
        )
        for obj_id, df_obj in tqdm(grouped, total=grouped.ngroups)
    )

    return pd.DataFrame(results)

In [None]:
# Load the training log, extract per-object light-curve features, and attach
# them to the log rows (left join keeps every logged object).
print("--- PROCESSING TRAIN DATA ---")
train_log = pd.read_csv(os.path.join(DATA_PATH, 'train_log.csv'))

df_train_features = extract_features_parallel(train_log, DATA_PATH, n_jobs=4)

df_train_final = pd.merge(train_log, df_train_features, on='object_id', how='left')
print("Train Data Shape: {}".format(df_train_final.shape))

In [None]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
# Identifier, label and metadata columns that must not be used as model inputs.
ignore_cols = ['object_id', 'target', 'split', 'English Translation', 'SpecType']
# Every remaining column, in frame order, is a feature.
features = list(filter(lambda name: name not in ignore_cols, df_train_final.columns))

In [None]:
# Feature matrix: copy the selected columns, turn +/-inf into NaN, then
# zero-fill every missing value.
X = (
    df_train_final[features]
    .copy()
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
# Labels.
y = df_train_final['target']

In [None]:
# Label-encode every object-dtype feature column of X in place; values are
# cast to str before encoding.
# NOTE(review): `le` is overwritten on each iteration, so the fitted encoders
# are not kept for encoding the test set later.
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

In [None]:
# 5-fold stratified splitter with a fixed seed so the folds are reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Out-of-fold positive-class probabilities, one slot per training row.
oof_preds = np.zeros(X.shape[0])
# Fitted fold ensembles, reused later for test-time averaging.
models_ensemble = list()

In [None]:
from catboost import CatBoostClassifier 

In [None]:
# Read the test-set object log from the data directory.
test_log = pd.read_csv(os.path.join(DATA_PATH, 'test_log.csv'))

In [None]:
# Extract light-curve features for every test object using 4 parallel workers.
df_test_features = extract_features_parallel(test_log, DATA_PATH, n_jobs=4)

In [None]:
# Attach the extracted features to the test log; the left join keeps every
# logged object.
df_test_final = pd.merge(test_log, df_test_features, on='object_id', how='left')
print("Test Data Shape: {}".format(df_test_final.shape))

In [None]:
# Stratified K-fold training of a soft-voting LightGBM / XGBoost / CatBoost
# ensemble. For each fold: oversample the minority class of the training part
# with SMOTE, fit the ensemble, store out-of-fold probabilities, and report the
# best F1 over a threshold grid on the validation part.
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    print(f"Fold {fold+1}: Running SMOTE...")
    try:
        # SMOTE is fitted on the training part only, so no synthetic sample is
        # derived from validation rows.
        smote = SMOTE(sampling_strategy=0.2, random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    except ValueError:
        print("SMOTE failed, using raw data.")
        X_train_res, y_train_res = X_train, y_train

    clf_lgb = lgb.LGBMClassifier(
        objective='binary', boosting_type='gbdt',
        verbose=-1, random_state=42, n_jobs=-1,
        learning_rate=0.00820959390156126,
        n_estimators=1456,
        num_leaves=21,
        max_depth=7,
        min_child_samples=18,
        subsample=0.8056429645734329,
        colsample_bytree=0.7037094223073973,
        reg_alpha=1.2090883197470177,
        reg_lambda=3.5948946953880816e-08
    )

    clf_xgb = XGBClassifier(
        objective='binary:logistic', eval_metric='logloss',
        random_state=42, use_label_encoder=False,
        tree_method='hist',
        device='cuda' if USE_GPU else 'cpu',
        learning_rate=0.0205217807496452,
        n_estimators=1185,
        max_depth=7,
        min_child_weight=7,
        subsample=0.7753093208264389,
        colsample_bytree=0.8265862295956217,
        gamma=1.0058203352400668,
        reg_alpha=4.053334043323454e-06,
        reg_lambda=0.09447983849476049
    )

    clf_cat = CatBoostClassifier(
        loss_function='Logloss', eval_metric='F1',
        verbose=0, random_seed=42, allow_writing_files=False,
        task_type='CPU',  # the comma after this argument was missing (SyntaxError)
        learning_rate=0.04297575732378107,
        iterations=735,
        depth=7,
        l2_leaf_reg=5.44676657488797,
        border_count=182,
        random_strength=2.5066619218798087,
        bagging_temperature=0.9978326033997912
    )

    eclf = VotingClassifier(
        estimators=[('lgb', clf_lgb), ('xgb', clf_xgb), ('cat', clf_cat)],
        voting='soft',
        weights=[1, 1, 1]
    )

    eclf.fit(X_train_res, y_train_res)

    val_probs = eclf.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_probs

    # Per-fold report only: the threshold is picked on the validation part
    # itself, so this F1 is optimistic.
    best_f1, best_th = 0, 0.5
    for th in np.linspace(0.1, 0.9, 100):
        score = f1_score(y_val, (val_probs > th).astype(int))
        if score > best_f1:
            best_f1, best_th = score, th

    print(f"Fold {fold+1} Trifecta F1: {best_f1:.4f} (Th: {best_th:.2f})")
    models_ensemble.append(eclf)

In [None]:
# Scan a grid of decision thresholds over the out-of-fold probabilities and
# keep the one that maximises F1 on the whole training set.
thresholds = np.linspace(0.01, 0.99, 200)
f1_list = [f1_score(y, np.where(oof_preds > t, 1, 0)) for t in thresholds]
global_best_thresh = thresholds[np.argmax(f1_list)]
print("\nGlobal Best F1: {:.4f} at Threshold: {:.2f}".format(np.max(f1_list), global_best_thresh))

In [None]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

# Rebuild the feature matrix for hyper-parameter tuning. Unlike the earlier
# preparation, NaNs are kept (only +/-inf is mapped to NaN).
ignore_cols = ['object_id', 'target', 'split', 'English Translation', 'SpecType']
features = [c for c in df_train_final.columns if c not in ignore_cols]

X = df_train_final[features].copy()
X = X.replace([np.inf, -np.inf], np.nan)
y = df_train_final['target']

# Label-encode object-dtype columns in place (values cast to str first).
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Negative-to-positive ratio, used as the positive-class weight.
neg = y.eq(0).sum()
pos = y.eq(1).sum()
scale_pos_weight = neg / pos

def objective(trial):
    """Optuna objective for LightGBM: mean best-threshold F1 over 5 stratified folds.

    Reads the module-level ``X``, ``y`` and ``scale_pos_weight``. For each fold
    the F1 is the best value over 20 thresholds in [0.1, 0.9], chosen on that
    fold's validation part.
    """
    fixed = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'n_jobs': -1,
        'random_state': 42,
        'scale_pos_weight': scale_pos_weight,
    }
    # Suggestions are requested in the same order as before so the sampler's
    # sequence is unchanged.
    tuned = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 400, 1500),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.95),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }
    param = {**fixed, **tuned}

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []

    for train_idx, val_idx in folds.split(X, y):
        model = lgb.LGBMClassifier(**param)
        model.fit(X.iloc[train_idx], y.iloc[train_idx])

        # Find the best threshold for this fold on its own validation part.
        y_val = y.iloc[val_idx]
        preds_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
        per_threshold = [
            f1_score(y_val, (preds_proba > th).astype(int))
            for th in np.linspace(0.1, 0.9, 20)
        ]
        f1_scores.append(max(per_threshold + [0]))

    return np.mean(f1_scores)

# Run 50 Optuna trials maximising the cross-validated F1, then print the best
# score and parameters in a copy-paste friendly form.
print("Starting Optuna Tuning for LightGBM...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best F1: {:.4f}".format(study.best_value))
print("Best Params:")
for key, value in study.best_params.items():
    print("    '{}': {},".format(key, value))

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import numpy as np
import torch

# Compute the class-weight ratio only if an earlier cell has not already
# defined it.
try:
    scale_pos_weight
except NameError:
    neg = (y == 0).sum()
    pos = (y == 1).sum()
    scale_pos_weight = neg / pos

# Re-detect CUDA availability for the XGBoost `device` choice.
USE_GPU = torch.cuda.is_available()

def objective_xgb(trial):
    """Optuna objective for XGBoost: mean best-threshold F1 over 5 stratified folds.

    Reads the module-level ``X``, ``y``, ``scale_pos_weight`` and ``USE_GPU``.
    For each fold the F1 is the best value over 20 thresholds in [0.1, 0.9],
    chosen on that fold's validation part.
    """
    fixed = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',

        'tree_method': 'hist',
        'device': 'cuda' if USE_GPU else 'cpu',

        'random_state': 42,
        'use_label_encoder': False,
        'scale_pos_weight': scale_pos_weight,
    }
    # Suggestions are requested in the same order as before so the sampler's
    # sequence is unchanged.
    tuned = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 500, 1500),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.95),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }
    param = {**fixed, **tuned}

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []

    for train_idx, val_idx in folds.split(X, y):
        model = XGBClassifier(**param)
        model.fit(X.iloc[train_idx], y.iloc[train_idx])

        y_val = y.iloc[val_idx]
        preds_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
        per_threshold = [
            f1_score(y_val, (preds_proba > th).astype(int))
            for th in np.linspace(0.1, 0.9, 20)
        ]
        f1_scores.append(max(per_threshold + [0]))

    return np.mean(f1_scores)

# Run 30 Optuna trials for XGBoost and print the best parameter set.
print("Starting Optuna for XGBoost...")
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=30)
print(f"Best XGB Params: {study_xgb.best_params}")

In [None]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import numpy as np

def objective_cat(trial):
    """Optuna objective for CatBoost (CPU): mean best-threshold F1 over 5 stratified folds.

    Reads the module-level ``X`` and ``y``. For each fold the F1 is the best
    value over 20 thresholds in [0.1, 0.9], chosen on that fold's validation
    part.
    """
    fixed = {
        'loss_function': 'Logloss',
        'eval_metric': 'F1',
        'task_type': 'CPU',
        'verbose': 0,
        'random_seed': 42,
        'allow_writing_files': False,
        'auto_class_weights': 'Balanced',
    }
    # Suggestions are requested in the same order as before so the sampler's
    # sequence is unchanged.
    tuned = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'iterations': trial.suggest_int('iterations', 500, 1500),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 0, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1)
    }
    param = {**fixed, **tuned}

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []

    for train_idx, val_idx in folds.split(X, y):
        model = CatBoostClassifier(**param)
        model.fit(X.iloc[train_idx], y.iloc[train_idx])

        y_val = y.iloc[val_idx]
        preds_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
        per_threshold = [
            f1_score(y_val, (preds_proba > th).astype(int))
            for th in np.linspace(0.1, 0.9, 20)
        ]
        f1_scores.append(max(per_threshold + [0]))

    return np.mean(f1_scores)

# Run 30 Optuna trials for CatBoost and print the best parameter set.
print("Starting Optuna for CatBoost (CPU Mode)...")
study_cat = optuna.create_study(direction='maximize')
study_cat.optimize(objective_cat, n_trials=30)
print(f"Best CatBoost Params: {study_cat.best_params}")

In [None]:
# Build the test feature matrix with exactly the training feature columns.
# reindex() creates any column missing from the test frame as NaN instead of
# raising KeyError.
X_test = df_test_final.reindex(columns=features)

# Encode the test columns the same way the training columns were encoded.
# Training label-encoded object columns after zero-filling and casting to str;
# LabelEncoder numbers the sorted unique labels, so the mapping can be rebuilt
# from the training frame itself.
for col in features:
    if df_train_final[col].dtype == object:
        train_labels = (
            df_train_final[col].replace([np.inf, -np.inf], np.nan).fillna(0).astype(str)
        )
        code_of = {label: code for code, label in enumerate(sorted(set(train_labels)))}
        test_labels = X_test[col].replace([np.inf, -np.inf], np.nan).fillna(0).astype(str)
        # Labels never seen in training get the out-of-vocabulary code -1.
        X_test[col] = test_labels.map(code_of).fillna(-1).astype(int)
    elif X_test[col].dtype == object:
        # Numeric in train but parsed as text in test: coerce to numbers.
        X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

# Same missing-value handling as the matrix the fold ensembles were trained on.
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(0)

# Average the positive-class probability over the fold ensembles.
test_probs = np.zeros(len(X_test))
for model in models_ensemble:
    test_probs += model.predict_proba(X_test)[:, 1] / len(models_ensemble)

# Apply the threshold tuned on the out-of-fold predictions.
predictions = (test_probs > global_best_thresh).astype(int)

In [None]:
# Assemble and save the submission for the SMOTE + CV-ensemble predictions.
submission = pd.DataFrame({
    'object_id': df_test_final['object_id'],
    'prediction': predictions
})

submission_path = 'submission_2.csv'
submission.to_csv(submission_path, index=False)
# Report the file that was actually written (the message used to name a
# different file).
print(f"\nSuccess! Saved {submission_path} with SMOTE & Ensemble.")
print(submission.head())

In [None]:
# Final model: fit one class-weighted soft-voting ensemble on the full training
# set and threshold the test probabilities so that roughly 1.1x the training
# positive rate is predicted positive.

# Jointly encode any object-dtype columns still present in X, fitting on the
# union of train and test labels so both share one mapping.
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    full_col = pd.concat([X[col].astype(str), X_test[col].astype(str)])
    le.fit(full_col)
    X[col] = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

# Use identical missing-value handling on both sides. X may still hold NaN
# (the tuning cells rebuild it without zero-filling) while X_test was
# zero-filled; fitting on NaN and predicting on zeros would skew predictions.
X_fit = X.replace([np.inf, -np.inf], np.nan).fillna(0)
X_pred = X_test.replace([np.inf, -np.inf], np.nan).fillna(0)

neg = (y==0).sum()
pos = (y==1).sum()
# Square root of the imbalance ratio: a softer positive-class weight.
sqrt_weight = np.sqrt(neg / pos)
print(f"Original Weight Ratio: {neg/pos:.2f} -> Adjusted (Sqrt): {sqrt_weight:.2f}")

USE_GPU = torch.cuda.is_available()

clf_lgb = lgb.LGBMClassifier(
    objective='binary', boosting_type='dart',
    learning_rate=0.05, n_estimators=1000,
    num_leaves=31, max_depth=-1,
    scale_pos_weight=sqrt_weight,
    colsample_bytree=0.7, subsample=0.7,
    n_jobs=-1, random_state=42, verbose=-1
)

clf_xgb = XGBClassifier(
    objective='binary:logistic', eval_metric='logloss',
    learning_rate=0.03, n_estimators=1000, max_depth=6,
    scale_pos_weight=sqrt_weight,
    colsample_bytree=0.7, subsample=0.7,
    tree_method='hist', device='cuda' if USE_GPU else 'cpu',
    use_label_encoder=False, random_state=42
)

clf_cat = CatBoostClassifier(
    loss_function='Logloss', eval_metric='F1',
    learning_rate=0.03, iterations=1000, depth=6,
    auto_class_weights='SqrtBalanced',
    task_type='CPU',
    verbose=0, random_seed=42, allow_writing_files=False
)

eclf = VotingClassifier(
    estimators=[('lgb', clf_lgb), ('xgb', clf_xgb), ('cat', clf_cat)],
    voting='soft', weights=[1, 1, 1]
)

print("Training on Full Dataset...")
eclf.fit(X_fit, y)

print("Predicting Test Data...")
test_probs = eclf.predict_proba(X_pred)[:, 1]

train_tde_rate = y.mean()
print(f"TDE Rate in Train: {train_tde_rate:.2%}")

# Flag the top (1.1 x training positive rate) share of test objects. The share
# is capped at 100% so the percentile always stays inside [0, 100].
target_fraction = min(train_tde_rate * 1.1, 1.0)
target_percentile = 100 * (1 - target_fraction)
dynamic_threshold = np.percentile(test_probs, target_percentile)

print(f"Dynamic Threshold (Top {target_fraction:.1%} predictions): {dynamic_threshold:.4f}")

predictions = (test_probs > dynamic_threshold).astype(int)

n_tde_pred = predictions.sum()
print(f"Predicted {n_tde_pred} TDEs out of {len(predictions)} objects ({n_tde_pred/len(predictions):.2%})")

submission = pd.DataFrame({
    'object_id': df_test_final['object_id'],
    'prediction': predictions
})

submission_path = 'submission_final_physics.csv'
submission.to_csv(submission_path, index=False)
# Report the file that was actually written (the message used to name a
# different file).
print(f"\nSuccess! Saved {submission_path}")