<a href="https://colab.research.google.com/github/hck717/RSI-ETF-Strategy-/blob/main/QRT_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow==2.15.0 catboost==1.2.5 lightgbm==4.3.0 imbalanced-learn==0.12.4 statsmodels==0.14.1 pandas==2.0.3 numpy==1.26.4 scikit-learn==1.4.2 matplotlib==3.8.4 seaborn==0.13.2

Best version to date (13 Jul): 73%

In [None]:
import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid
import xgboost

# Report the installed XGBoost version and warn when a pre-1.0 release is found
print(f"XGBoost version: {xgboost.__version__}")
if int(xgboost.__version__.split('.')[0]) < 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Set up data directory
DATA_DIR = '/content'

# Load datasets. The three competition frames are indexed by their ID column
# and parsed as float32 throughout; the supplementary table keeps the dtypes
# pandas infers.
try:
    X_train = pd.read_csv(os.path.join(DATA_DIR, 'X_train_itDkypA.csv'), index_col='ID', dtype=np.float32)
    y_train = pd.read_csv(os.path.join(DATA_DIR, 'y_train_3LeeT2g.csv'), index_col='ID', dtype=np.float32)
    X_test = pd.read_csv(os.path.join(DATA_DIR, 'X_test_Beg4ey3.csv'), index_col='ID', dtype=np.float32)
    supplementary = pd.read_csv(os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv'))
except FileNotFoundError as e:
    # Chain explicitly so the original OS error stays attached as the cause.
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.") from e

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights):
    """Return the weight-normalised share of correctly predicted directions.

    Parameters
    ----------
    y_true : array-like
        Ground truth, given either as signed returns or as 0/1 class labels.
        Any value greater than zero counts as the "up" class, everything else
        as "down".
    y_pred : array-like
        Predicted class labels; 1 means "up", any other value means "down".
    weights : array-like or None
        Per-sample weights (their absolute value is used). When ``None``,
        ``abs(y_true)`` is used instead.

    Returns
    -------
    float
        ``sum(weights[correct]) / sum(weights)``, or ``0.0`` when the total
        weight is zero.
    """
    truth = np.asarray(y_true)
    true_up = truth > 0
    pred_up = np.asarray(y_pred) == 1

    # Fall back to |y_true| so callers that pass signed returns and no
    # explicit weights still get a return-weighted score.
    source = truth if weights is None else np.asarray(weights)
    w = np.abs(source.astype(float))

    total = w.sum()
    if total == 0:
        return 0.0
    return float(w[true_up == pred_up].sum() / total)

# Preprocess data with enhanced feature selection
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Impute, engineer and standardise the features of one data split.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``ID_DAY`` and ``ID_TARGET``. Every column whose name
        starts with ``RET_`` is treated as a return column. The frame passed
        in is not modified.
    y : pandas.DataFrame, optional
        Must contain ``RET_TARGET``. Required when ``is_train`` is true; the
        frame passed in is not modified.
    supplementary : pandas.DataFrame, optional
        Asset metadata with ``ID_asset`` and ``CLASS_LEVEL_1`` columns. When
        given, it drives sector-median imputation, sector-average features
        and the one-hot encoding of the target's sector.
    is_train : bool
        When true, return columns are selected (unless ``top_cols`` is given),
        product features of the most correlated pairs are added and a
        ``SIGN_TARGET`` label column is attached to ``y``.
    top_cols : list of str, optional
        Return columns to build features from. When ``None`` on a training
        call, up to 15 columns are chosen by ``variance * sum(|corr|)`` among
        columns whose variance exceeds 1e-5.

    Returns
    -------
    tuple
        ``(X, y, feature_cols, top_cols)``. ``X`` keeps the row index it came
        in with; ``y`` is ``None`` when ``is_train`` is false.

    Raises
    ------
    ValueError
        If ``is_train`` is true and ``y`` is ``None``, or if no return column
        survives the ``top_cols`` filter.

    Notes
    -----
    The PCA projection and the StandardScaler are fitted on whatever frame is
    passed in, so a train call and a test call each use their own fit.
    """
    if is_train and y is None:
        raise ValueError("y is required when is_train=True.")

    # Work on a copy so the caller's frame is left untouched.
    X = X.copy()
    ret_cols = [col for col in X.columns if col.startswith('RET_')]

    # Dynamic feature selection with minimum variance threshold
    if is_train and top_cols is None:
        if len(ret_cols) > 15:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            # Filter out low-variance features (threshold = 1e-5)
            valid_cols = variances[variances > 1e-5].index
            if len(valid_cols) < 15:
                top_cols = valid_cols.tolist()
            else:
                top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(15).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")

    # Apply the selection on every call (training included) so that derived
    # features are built from the same columns for train and test.
    if top_cols is not None:
        keep = set(top_cols)
        ret_cols = [col for col in ret_cols if col in keep]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    new_cols = {}
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        # One row per asset keeps lookups scalar and the merge below 1:1.
        assets = supplementary.drop_duplicates(subset='ID_asset')
        sector_map = assets.set_index('ID_asset')[level]
        sector_groups = assets.groupby(level)['ID_asset'].apply(list).to_dict()
        available = set(X.columns)

        # Fill gaps with the row-wise median of the asset's sector, falling
        # back to the column median when the sector is unknown. The sector
        # median is computed once per sector and reused.
        sector_row_medians = {}
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            fill_value = None
            if pd.notna(sector):
                if sector not in sector_row_medians:
                    sector_cols = [f'RET_{a}' for a in sector_groups.get(sector, []) if f'RET_{a}' in available]
                    sector_row_medians[sector] = X[sector_cols].median(axis=1) if sector_cols else None
                fill_value = sector_row_medians[sector]
            if fill_value is None:
                fill_value = X[col].median()
            X[col] = X[col].fillna(fill_value)

        # Sector-based features: mean of the selected returns in each sector
        selected = set(ret_cols)
        for sector, members in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in members if f'RET_{asset}' in selected]
            if asset_cols:
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)

        # One-hot encode the sector of the target asset. merge() returns a
        # fresh RangeIndex, so the original row index is put back afterwards;
        # a left merge against unique keys keeps row order and row count.
        original_index = X.index
        X = X.merge(assets[['ID_asset', level]].rename(columns={'ID_asset': 'ID_TARGET', level: f'TARGET_{level}'}),
                    on='ID_TARGET', how='left')
        X.index = original_index
        X = pd.concat([X, pd.get_dummies(X[f'TARGET_{level}'], prefix=f'TARGET_{level}', dummy_na=True)], axis=1)
        X = X.drop(f'TARGET_{level}', axis=1)

    # Cross-sectional features
    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    by_day = X.groupby('ID_DAY')
    for col in ret_cols:
        new_cols[f'CS_RANK_{col}'] = by_day[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0)

    # Correlation-based features (top 5 pairs).
    # NOTE: these columns are only produced on training calls, so a caller
    # that intersects train and test columns will drop them again.
    if is_train:
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:5]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    # PCA features (2 components)
    if ret_cols:
        pca = PCA(n_components=min(2, len(ret_cols)), random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        for i in range(pca_features.shape[1]):
            new_cols[f'PCA_{i}'] = pca_features[:, i]

    # Add new features to DataFrame
    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)

    # Standardize every column except the two identifier columns
    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))

    if is_train:
        y = y.copy()
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Run the feature pipeline on both splits. The training call picks the return
# columns; the test call is handed that same selection through top_cols.
try:
    X_train, y_train, feature_cols, top_cols = preprocess_data(
        X_train, y_train, supplementary, is_train=True
    )
    X_test, _, test_feature_cols, _ = preprocess_data(
        X_test, supplementary=supplementary, is_train=False, top_cols=top_cols
    )
except ValueError as e:
    print(f"Preprocessing error: {e}")
    raise

# Keep only the columns both frames share, in the same order
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]
feature_cols = [col for col in common_cols if col not in ('ID_DAY', 'ID_TARGET')]

# Reorder the labels so they follow the rows of X_train
y_train = y_train.loc[X_train.index]

# Candidate estimators. Each entry carries the base estimator, the grid that
# is searched on top of it and the callbacks associated with it.
models = {
    'LightGBM': {
        'model': LGBMClassifier(
            n_estimators=100,
            num_leaves=25,
            min_child_samples=20,
            lambda_l1=0.5,
            lambda_l2=0.5,
            subsample=0.7,
            colsample_bytree=0.7,
            random_state=42,
            n_jobs=-1,
            verbosity=-1,
            class_weight='balanced',
        ),
        'param_grid': [{'learning_rate': [0.02, 0.05]}],
        'callbacks': [lightgbm.early_stopping(stopping_rounds=10, verbose=False)],
    },
    'XGBoost': {
        'model': XGBClassifier(
            n_estimators=100,
            max_depth=4,
            subsample=0.7,
            colsample_bytree=0.7,
            random_state=42,
            n_jobs=-1,
            verbosity=0,
            eval_metric='logloss',
            # negative-to-positive row ratio over the whole training set
            scale_pos_weight=(
                len(y_train.loc[y_train['SIGN_TARGET'] == 0])
                / len(y_train.loc[y_train['SIGN_TARGET'] == 1])
            ),
        ),
        'param_grid': [{'learning_rate': [0.05, 0.1]}],
        'callbacks': [None],
    },
}

# Function to train and predict for a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks):
    """Grid-search, fit and predict one estimator for a single ``ID_TARGET``.

    Parameters
    ----------
    target : scalar
        Value of ``ID_TARGET`` selecting the rows to use.
    X_train, X_test : pandas.DataFrame
        Feature frames containing ``ID_TARGET``, ``ID_DAY`` and every name in
        ``feature_cols``.
    y_train : pandas.DataFrame
        Shares its index with ``X_train`` and holds ``SIGN_TARGET`` and
        ``RET_TARGET``.
    feature_cols : list of str
        Columns fed to the estimator.
    model : estimator
        Base estimator; its parameters are overwritten by the grid search.
    model_name : str
        ``'LightGBM'`` switches on the early-stopping call path.
    kf : splitter
        Called as ``kf.split(X, y, groups)`` with ``ID_DAY`` as groups.
    param_grid : dict or list of dict
        Passed to ``ParameterGrid``.
    callbacks : list
        Passed to ``fit`` on the LightGBM path only.

    Returns
    -------
    tuple
        ``(str(target), best_params, best_score, param_results, test_ids,
        preds, top_features)``. For a target that is skipped or has no usable
        score, ``best_params`` is ``{}``, ``best_score`` is 0, the lists are
        empty and ``preds`` is all zeros.
    """
    # Build each row mask once instead of once per lookup.
    train_mask = X_train['ID_TARGET'] == target
    test_mask = X_test['ID_TARGET'] == target

    X_target = X_train.loc[train_mask, feature_cols]
    y_target = y_train.loc[train_mask, 'SIGN_TARGET']
    returns = y_train.loc[train_mask, 'RET_TARGET']
    weights = returns.abs()
    groups = X_train.loc[train_mask, 'ID_DAY']
    X_test_target = X_test.loc[test_mask, feature_cols]
    test_ids = X_test.index[test_mask.to_numpy()]

    # Check for sufficient data
    if len(X_target) < 10 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids)), []

    use_early_stopping = model_name == 'LightGBM' and bool(callbacks)

    def _fit(estimator, X_fit, y_fit, X_eval, y_eval):
        """Fit ``estimator`` with the call signature its library expects."""
        if use_early_stopping:
            estimator.fit(X_fit, y_fit, eval_set=[(X_eval, y_eval)], eval_metric='logloss', callbacks=callbacks)
        else:
            estimator.fit(X_fit, y_fit, eval_set=[(X_eval, y_eval)], verbose=False)

    param_results = []
    best_params = None
    best_score = 0
    best_model = None

    # Lightweight grid search
    for params in ParameterGrid(param_grid):
        model.set_params(**params)
        fold_accuracies = []
        for train_idx, val_idx in kf.split(X_target, y_target, groups):
            X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
            y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]

            _fit(model, X_tr, y_tr, X_val, y_val)
            y_pred = model.predict(X_val)
            # Score against the signed returns rather than the 0/1 labels so
            # that correctly predicted down moves count as well.
            acc = weighted_accuracy(returns.iloc[val_idx], y_pred, weights.iloc[val_idx])
            fold_accuracies.append(float(acc))

        mean_acc = float(np.mean(fold_accuracies))
        print(f"{model_name} | ID_TARGET: {target} | Params: {params} | Mean Accuracy: {mean_acc:.4f} | Fold Accuracies: {fold_accuracies} | Train Samples: {len(X_target)}", flush=True)
        param_results.append({
            'params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies
        })
        if mean_acc > best_score:
            best_score = mean_acc
            best_params = params
            # Unfitted copy carrying the parameters that produced this score
            best_model = type(model)(**model.get_params())

    # No grid entry scored above zero (or every score was NaN): report the
    # target like a skipped one instead of failing on a missing model.
    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids)), []

    # Train with best parameters on all rows of this target.
    # NOTE: the evaluation set here is the training data itself.
    best_model.set_params(**best_params)
    _fit(best_model, X_target, y_target, X_target, y_target)

    # Predict the probability of the positive class
    preds = best_model.predict_proba(X_test_target)[:, 1]

    # Feature importance: keep the 30 highest-ranked features
    importance = best_model.feature_importances_
    feature_importance = dict(zip(feature_cols, importance))
    top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:30]

    # Clean up memory
    gc.collect()

    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training
kf = GroupKFold(n_splits=2)  # Reduced folds for faster training
results = {}            # model name -> mean CV score over trained targets
predictions = {}        # str(target) -> model name -> {test ID: probability}
ensemble_weights = {'LightGBM': 0.7, 'XGBoost': 0.3}
param_log = {}          # model name -> str(target) -> grid-search record
top_features_all = {}   # model name -> str(target) -> [(feature, importance)]
param_summary = []

for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}

    # Parallelize training across targets
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train, y_train, X_test, feature_cols, model_info['model'], model_name, kf,
                             model_info['param_grid'], model_info['callbacks'])
        for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets")
    )

    # Collect results
    accuracies = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))

        # Report the fold scores of the winning grid entry, not simply those
        # of whichever entry happened to be evaluated last.
        best_entry = next((entry for entry in param_results if entry['params'] == params), None)
        param_summary.append({
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_entry['fold_accuracies'] if best_entry else []
        })

    # Exclude skipped targets (reported with a score of 0); report NaN when
    # nothing was trained rather than averaging an empty list.
    scored = [acc for acc in accuracies if acc > 0]
    results[model_name] = np.mean(scored) if scored else float('nan')
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)

    # Print parameter summary
    print(f"\n{model_name} Parameter Summary:", flush=True)
    for summary in param_summary:
        if summary['model'] == model_name:
            print(f"ID_TARGET: {summary['ID_TARGET']} | Best Params: {summary['best_params']} | Mean Accuracy: {summary['mean_accuracy']:.4f} | Fold Accuracies: {summary['fold_accuracies']}", flush=True)

# Save parameter log
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
with open(param_log_path, 'w') as f:
    json.dump(param_log, f, indent=4)
print(f"Saved hyperparameter log to: {param_log_path}", flush=True)
files.download(param_log_path)

# Second pass: refit every target on the features recorded for it in
# top_features_all and overwrite that model's stored test predictions.
print("\nPerforming second-pass training with top 30 features...", flush=True)
for model_name, model_info in models.items():
    print(f"\nSecond-pass {model_name}...", flush=True)
    for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [entry[0] for entry in top_features_all[model_name][target_str]]
        if len(top_features) == 0:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue

        # Row selections for this target
        X_target = X_train.loc[X_train['ID_TARGET'] == target, top_features]
        y_target = y_train.loc[X_train['ID_TARGET'] == target, 'SIGN_TARGET']
        X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
        test_ids = X_test.loc[X_test['ID_TARGET'] == target].index

        if len(X_target) < 10:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue

        # The shared estimator is reconfigured with this target's best
        # parameters and refitted; the evaluation set is the training data.
        model = model_info['model']
        model.set_params(**param_log[model_name][target_str]['best_params'])
        eval_pairs = [(X_target, y_target)]
        if model_name == 'LightGBM' and model_info['callbacks']:
            model.fit(X_target, y_target, eval_set=eval_pairs, eval_metric='logloss', callbacks=model_info['callbacks'])
        else:
            model.fit(X_target, y_target, eval_set=eval_pairs, verbose=False)

        positive_proba = model.predict_proba(X_test_target)
        preds = positive_proba[:, 1]
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Ensemble predictions: weighted blend of the per-model probabilities,
# thresholded at 0.5 into +1 / -1.
final_predictions = {}
for target in X_test['ID_TARGET'].unique():
    target_str = str(target)
    test_ids = X_test[X_test['ID_TARGET'] == target].index
    # A target that only occurs in the test set has no entry in predictions;
    # treat it like a missing model (probability 0) instead of raising.
    per_model = predictions.get(target_str, {})
    lgbm_by_id = per_model.get('LightGBM', {})
    xgb_by_id = per_model.get('XGBoost', {})
    # Look every ID up explicitly so values cannot be paired with the wrong row.
    lgbm_preds = np.array([lgbm_by_id.get(idx, 0) for idx in test_ids])
    xgb_preds = np.array([xgb_by_id.get(idx, 0) for idx in test_ids])
    ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
    final_predictions.update({idx: 1 if p >= 0.5 else -1 for idx, p in zip(test_ids, ensemble_preds)})

# Create output CSV
output_df = pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET']).reset_index()
output_df.columns = ['ID', 'RET_TARGET']
# The input frames are read as float32, so IDs can arrive as floats; write
# them as integers whenever they are whole numbers.
id_values = output_df['ID']
if pd.api.types.is_float_dtype(id_values) and (id_values == id_values.round()).all():
    output_df['ID'] = id_values.astype(np.int64)
output_df = output_df.sort_values('ID')
output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Validate output by reading the file back
test_csv = pd.read_csv(output_path)
print(f"Output CSV shape: {test_csv.shape}", flush=True)
print(f"Output CSV ID range: {test_csv['ID'].min()} to {test_csv['ID'].max()}", flush=True)
print(f"Unique RET_TARGET values: {test_csv['RET_TARGET'].unique()}", flush=True)
print(f"Missing values: {test_csv.isna().sum().sum()}", flush=True)

# Download the file
files.download(output_path)

# Print final results
print("\nValidation Results:", flush=True)
for model_name, acc in results.items():
    print(f"{model_name}: {acc:.4f}", flush=True)

XGBoost version: 2.1.4


FileNotFoundError: Dataset not found: [Errno 2] No such file or directory: '/content/X_train_itDkypA.csv'. Please ensure files are uploaded to /content.

Less complex than the 74.22% version; scores 74.15%.



In [None]:
import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid

# Report the installed XGBoost version and warn when a pre-1.0 release is found
print(f"XGBoost version: {xgboost.__version__}")
if int(xgboost.__version__.split('.')[0]) < 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Set up data directory
DATA_DIR = '/content'

# Load datasets. The three competition frames are indexed by their ID column
# and parsed as float32 throughout; the supplementary table keeps the dtypes
# pandas infers.
try:
    X_train = pd.read_csv(os.path.join(DATA_DIR, 'X_train_itDkypA.csv'), index_col='ID', dtype=np.float32)
    y_train = pd.read_csv(os.path.join(DATA_DIR, 'y_train_3LeeT2g.csv'), index_col='ID', dtype=np.float32)
    X_test = pd.read_csv(os.path.join(DATA_DIR, 'X_test_Beg4ey3.csv'), index_col='ID', dtype=np.float32)
    supplementary = pd.read_csv(os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv'))
except FileNotFoundError as e:
    # Chain explicitly so the original OS error stays attached as the cause.
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.") from e

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights):
    """Return the weight-normalised share of correctly predicted directions.

    Parameters
    ----------
    y_true : array-like
        Ground truth, given either as signed returns or as 0/1 class labels.
        Any value greater than zero counts as the "up" class, everything else
        as "down".
    y_pred : array-like
        Predicted class labels; 1 means "up", any other value means "down".
    weights : array-like or None
        Per-sample weights (their absolute value is used). When ``None``,
        ``abs(y_true)`` is used instead.

    Returns
    -------
    float
        ``sum(weights[correct]) / sum(weights)``, or ``0.0`` when the total
        weight is zero.
    """
    truth = np.asarray(y_true)
    true_up = truth > 0
    pred_up = np.asarray(y_pred) == 1

    # Fall back to |y_true| so callers that pass signed returns and no
    # explicit weights still get a return-weighted score.
    source = truth if weights is None else np.asarray(weights)
    w = np.abs(source.astype(float))

    total = w.sum()
    if total == 0:
        return 0.0
    return float(w[true_up == pred_up].sum() / total)

# Preprocess data with enhanced feature selection and momentum feature
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Impute, engineer and standardise the features of one data split.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``ID_DAY`` and ``ID_TARGET``. Every column whose name
        starts with ``RET_`` is treated as a return column. The frame passed
        in is not modified.
    y : pandas.DataFrame, optional
        Must contain ``RET_TARGET``. Required when ``is_train`` is true; the
        frame passed in is not modified.
    supplementary : pandas.DataFrame, optional
        Asset metadata with ``ID_asset`` and ``CLASS_LEVEL_1`` columns. When
        given, it drives sector-median imputation, sector-average features
        and the one-hot encoding of the target's sector.
    is_train : bool
        When true, return columns are selected (unless ``top_cols`` is given),
        product features of the most correlated pairs are added and a
        ``SIGN_TARGET`` label column is attached to ``y``.
    top_cols : list of str, optional
        Return columns to build features from. When ``None`` on a training
        call, up to 15 columns are chosen by ``variance * sum(|corr|)`` among
        columns whose variance exceeds 1e-5.

    Returns
    -------
    tuple
        ``(X, y, feature_cols, top_cols)``. ``X`` keeps the row index it came
        in with; ``y`` is ``None`` when ``is_train`` is false.

    Raises
    ------
    ValueError
        If ``is_train`` is true and ``y`` is ``None``, or if no return column
        survives the ``top_cols`` filter.

    Notes
    -----
    The PCA projection and the StandardScaler are fitted on whatever frame is
    passed in, so a train call and a test call each use their own fit.
    """
    if is_train and y is None:
        raise ValueError("y is required when is_train=True.")

    # Work on a copy so the caller's frame is left untouched.
    X = X.copy()
    ret_cols = [col for col in X.columns if col.startswith('RET_')]

    # Choose the return columns on the first training call
    if is_train and top_cols is None:
        if len(ret_cols) > 15:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            valid_cols = variances[variances > 1e-5].index
            if len(valid_cols) < 15:
                top_cols = valid_cols.tolist()
            else:
                top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(15).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")

    # Apply the selection on every call (training included) so that derived
    # features are built from the same columns for train and test.
    if top_cols is not None:
        keep = set(top_cols)
        ret_cols = [col for col in ret_cols if col in keep]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    new_cols = {}
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        # One row per asset keeps lookups scalar and the merge below 1:1.
        assets = supplementary.drop_duplicates(subset='ID_asset')
        sector_map = assets.set_index('ID_asset')[level]
        sector_groups = assets.groupby(level)['ID_asset'].apply(list).to_dict()
        available = set(X.columns)

        # Fill gaps with the row-wise median of the asset's sector, falling
        # back to the column median when the sector is unknown. The sector
        # median is computed once per sector and reused.
        sector_row_medians = {}
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            fill_value = None
            if pd.notna(sector):
                if sector not in sector_row_medians:
                    sector_cols = [f'RET_{a}' for a in sector_groups.get(sector, []) if f'RET_{a}' in available]
                    sector_row_medians[sector] = X[sector_cols].median(axis=1) if sector_cols else None
                fill_value = sector_row_medians[sector]
            if fill_value is None:
                fill_value = X[col].median()
            X[col] = X[col].fillna(fill_value)

        # Sector-based features: mean of the selected returns in each sector
        selected = set(ret_cols)
        for sector, members in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in members if f'RET_{asset}' in selected]
            if asset_cols:
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)

        # One-hot encode the sector of the target asset. merge() returns a
        # fresh RangeIndex, so the original row index is put back afterwards;
        # a left merge against unique keys keeps row order and row count.
        original_index = X.index
        X = X.merge(assets[['ID_asset', level]].rename(columns={'ID_asset': 'ID_TARGET', level: f'TARGET_{level}'}),
                    on='ID_TARGET', how='left')
        X.index = original_index
        X = pd.concat([X, pd.get_dummies(X[f'TARGET_{level}'], prefix=f'TARGET_{level}', dummy_na=True)], axis=1)
        X = X.drop(f'TARGET_{level}', axis=1)

    # Cross-sectional features
    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    # The rolling window runs over consecutive rows of X in their current order.
    # NOTE(review): rows are presumably not a single time series; confirm
    # that this is the intended momentum definition.
    new_cols['MOMENTUM_7D'] = X[ret_cols].rolling(window=7, min_periods=1).mean().mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)
    by_day = X.groupby('ID_DAY')
    for col in ret_cols:
        new_cols[f'CS_RANK_{col}'] = by_day[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0)

    # Product features for the 5 most correlated column pairs.
    # NOTE: these columns are only produced on training calls, so a caller
    # that intersects train and test columns will drop them again.
    if is_train:
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:5]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    # PCA features (at most 2 components)
    if ret_cols:
        pca = PCA(n_components=min(2, len(ret_cols)), random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        for i in range(pca_features.shape[1]):
            new_cols[f'PCA_{i}'] = pca_features[:, i]

    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)

    # Standardize every column except the two identifier columns
    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))

    if is_train:
        y = y.copy()
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Preprocess data. The training call picks the return columns; the test call
# reuses that selection through top_cols.
try:
    X_train, y_train, feature_cols, top_cols = preprocess_data(X_train, y_train, supplementary, is_train=True)
    X_test, _, test_feature_cols, _ = preprocess_data(X_test, supplementary=supplementary, is_train=False, top_cols=top_cols)
except ValueError as e:
    print(f"Preprocessing error: {e}")
    raise

# Align columns: keep only what both frames share
common_cols = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_cols]
X_test = X_test[common_cols]
feature_cols = [col for col in common_cols if col not in ['ID_DAY', 'ID_TARGET']]

# Align y_train with the rows of X_train
y_train = y_train.loc[X_train.index]

# Define models with reduced hyperparameter grids
models = {
    'LightGBM': {
        # The L1/L2 penalties are given as reg_alpha/reg_lambda, the names the
        # grid below uses. LightGBM treats these as aliases of
        # lambda_l1/lambda_l2 and lets the lambda_* spelling win when both are
        # present, which would silently cancel the grid's values.
        'model': LGBMClassifier(num_leaves=31, min_child_samples=15, reg_alpha=0.3, reg_lambda=0.3,
                                subsample=0.7, colsample_bytree=0.8, bagging_freq=5, bagging_fraction=0.7,
                                random_state=42, n_jobs=1, verbosity=-1, class_weight='balanced', force_row_wise=True),
        'param_grid': [
            {'learning_rate': [0.02], 'num_leaves': [15], 'min_child_samples': [20], 'bagging_fraction': [0.6], 'reg_alpha': [0.1], 'reg_lambda': [0.1]},
            {'learning_rate': [0.05], 'num_leaves': [31], 'min_child_samples': [10], 'bagging_fraction': [0.7], 'reg_alpha': [0.2], 'reg_lambda': [0.2]}
        ],
        'callbacks': [lightgbm.early_stopping(stopping_rounds=10, verbose=False)]
    },
    'XGBoost': {
        'model': XGBClassifier(max_depth=4, min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
                               random_state=42, n_jobs=1, eval_metric='logloss', verbosity=0,
                               # negative-to-positive row ratio over the whole training set
                               scale_pos_weight=len(y_train[y_train['SIGN_TARGET'] == 0]) / len(y_train[y_train['SIGN_TARGET'] == 1])),
        'param_grid': [
            {'learning_rate': [0.03], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [150], 'reg_alpha': [0.1], 'reg_lambda': [1.0]},
            {'learning_rate': [0.05], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [200], 'reg_alpha': [0.2], 'reg_lambda': [1.5]}
        ],
        # NOTE: train_target only checks whether this list is non-empty on the
        # XGBoost path; the callback itself is never handed to the estimator.
        'callbacks': [xgboost.callback.EarlyStopping(rounds=10, metric_name='logloss', save_best=True)]
    }
}

# Function to train and predict for a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks):
    """Grid-search, fit and predict one estimator for a single ``ID_TARGET``.

    Parameters
    ----------
    target : scalar
        Value of ``ID_TARGET`` selecting the rows to use.
    X_train, X_test : pandas.DataFrame
        Feature frames containing ``ID_TARGET``, ``ID_DAY`` and every name in
        ``feature_cols``.
    y_train : pandas.DataFrame
        Shares its index with ``X_train`` and holds ``SIGN_TARGET`` and
        ``RET_TARGET``.
    feature_cols : list of str
        Columns fed to the estimator.
    model : estimator
        Base estimator; its parameters are overwritten by the grid search.
    model_name : str
        ``'LightGBM'`` or ``'XGBoost'`` selects the matching ``fit`` call.
    kf : splitter
        Called as ``kf.split(X, y, groups)`` with ``ID_DAY`` as groups.
    param_grid : dict or list of dict
        Passed to ``ParameterGrid``.
    callbacks : list
        Passed to ``fit`` on the LightGBM path; on the XGBoost path it only
        decides whether an evaluation set is supplied.

    Returns
    -------
    tuple
        ``(str(target), best_params, best_score, param_results, test_ids,
        preds, top_features)``. For a target that is skipped or has no usable
        score, ``best_params`` is ``{}``, ``best_score`` is 0, the lists are
        empty and ``preds`` is all zeros.
    """
    # Build each row mask once instead of once per lookup.
    train_mask = X_train['ID_TARGET'] == target
    test_mask = X_test['ID_TARGET'] == target

    X_target = X_train.loc[train_mask, feature_cols]
    y_target = y_train.loc[train_mask, 'SIGN_TARGET']
    returns = y_train.loc[train_mask, 'RET_TARGET']
    # Sample weights are |return|, scaled up by 1.5 for the listed targets.
    low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]
    weight_scale = 1.5 if target in low_performing_targets else 1.0
    weights = returns.abs() * weight_scale
    groups = X_train.loc[train_mask, 'ID_DAY']
    X_test_target = X_test.loc[test_mask, feature_cols]
    test_ids = X_test.index[test_mask.to_numpy()]

    if len(X_target) < 100 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    def _fit(estimator, X_fit, y_fit, w_fit, X_eval, y_eval):
        """Fit ``estimator`` with the call signature its library expects."""
        if model_name == 'LightGBM' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
        else:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit)

    param_results = []
    best_params = None
    best_score = 0
    best_model = None

    for params in ParameterGrid(param_grid):
        print(f"Testing params for {model_name} ID_TARGET {target}: {params}", flush=True)
        try:
            model.set_params(**params)
        except ValueError as e:
            print(f"Invalid params for {model_name} ID_TARGET {target}: {e}")
            continue
        fold_accuracies = []
        for train_idx, val_idx in kf.split(X_target, y_target, groups):
            X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
            y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
            w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
            _fit(model, X_tr, y_tr, w_tr, X_val, y_val)
            y_pred = model.predict(X_val)
            # Score against the signed returns rather than the 0/1 labels so
            # that correctly predicted down moves count as well.
            acc = weighted_accuracy(returns.iloc[val_idx], y_pred, w_val)
            fold_accuracies.append(float(acc))
        mean_acc = float(np.mean(fold_accuracies))
        print(f"{model_name} | ID_TARGET: {target} | Params: {params} | Mean Accuracy: {mean_acc:.4f} | Fold Accuracies: {fold_accuracies}", flush=True)
        param_results.append({
            'params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies
        })
        if mean_acc > best_score:
            best_score = mean_acc
            best_params = params
            # Unfitted copy carrying the parameters that produced this score
            best_model = type(model)(**model.get_params())

    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    # Refit on all rows of this target.
    # NOTE: the evaluation set here is the training data itself.
    best_model.set_params(**best_params)
    _fit(best_model, X_target, y_target, weights, X_target, y_target)

    # Probability of the positive class for every test row of this target
    preds = best_model.predict_proba(X_test_target)[:, 1].astype(np.float32)

    # Keep the 30 highest-ranked features
    importance = best_model.feature_importances_
    feature_importance = dict(zip(feature_cols, importance))
    top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:30]
    gc.collect()
    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training
kf = GroupKFold(n_splits=5)
results = {}            # model name -> mean CV score over trained targets
predictions = {}        # str(target) -> model name -> {test ID: probability}
param_log = {}          # model name -> str(target) -> grid-search record
top_features_all = {}   # model name -> str(target) -> [(feature, importance)]
param_summary = []
low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]

for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}
    # One task per target, spread over all available workers
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train, y_train, X_test, feature_cols, model_info['model'], model_name, kf,
                             model_info['param_grid'], model_info['callbacks'])
        for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets")
    )
    accuracies = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))

        # Report the fold scores of the winning grid entry, not simply those
        # of whichever entry happened to be evaluated last.
        best_entry = next((entry for entry in param_results if entry['params'] == params), None)
        param_summary.append({
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_entry['fold_accuracies'] if best_entry else []
        })

    # Exclude skipped targets (reported with a score of 0); report NaN when
    # nothing was trained rather than averaging an empty list.
    scored = [acc for acc in accuracies if acc > 0]
    results[model_name] = np.mean(scored) if scored else float('nan')
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)
    print(f"\n{model_name} Parameter Summary:", flush=True)
    for summary in param_summary:
        if summary['model'] == model_name:
            print(f"ID_TARGET: {summary['ID_TARGET']} | Best Params: {summary['best_params']} | Mean Accuracy: {summary['mean_accuracy']:.4f} | Fold Accuracies: {summary['fold_accuracies']}", flush=True)

# Second pass: refit every target on the features recorded for it in
# top_features_all and overwrite that model's stored test predictions.
print("\nPerforming second-pass training with top 30 features...", flush=True)
for model_name, model_info in models.items():
    print(f"\nSecond-pass {model_name}...", flush=True)
    for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [entry[0] for entry in top_features_all[model_name][target_str]]
        if len(top_features) == 0:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue

        # Row selections and sample weights for this target
        X_target = X_train.loc[X_train['ID_TARGET'] == target, top_features]
        y_target = y_train.loc[X_train['ID_TARGET'] == target, 'SIGN_TARGET']
        weights = y_train.loc[X_train['ID_TARGET'] == target, 'RET_TARGET'].abs()
        if target in low_performing_targets:
            weight_scale = 1.5
        else:
            weight_scale = 1.0
        weights = weights * weight_scale
        X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
        test_ids = X_test.loc[X_test['ID_TARGET'] == target].index

        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue

        # The shared estimator is reconfigured with this target's best
        # parameters. Setting n_estimators writes into the dict stored in
        # param_log, so the logged parameters change as well.
        model = model_info['model']
        best_params = param_log[model_name][target_str]['best_params']
        best_params['n_estimators'] = 200
        model.set_params(**best_params)

        eval_pairs = [(X_target, y_target)]
        has_callbacks = bool(model_info['callbacks'])
        if model_name == 'LightGBM' and has_callbacks:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=eval_pairs, eval_metric='logloss', callbacks=model_info['callbacks'])
        elif model_name == 'XGBoost' and has_callbacks:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=eval_pairs, verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)

        positive_proba = model.predict_proba(X_test_target)[:, 1]
        preds = positive_proba.astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Third-pass training for low-performing targets
# For each listed target whose logged CV accuracy is below 0.75, try a few
# learning-rate / regularisation variations around its best parameters, keep
# any variation that improves the grouped-CV score, refit on all rows of the
# target and overwrite its test predictions.
print("\nPerforming third-pass training for low-performing targets...", flush=True)
for model_name, model_info in models.items():
    print(f"\nThird-pass {model_name}...", flush=True)
    for target in tqdm(low_performing_targets, desc=f"{model_name} Low-Performing Targets"):
        # The logs are keyed by str() of the ID_TARGET values taken from the data
        # (floats, e.g. '139.0'), while low_performing_targets holds ints, so
        # str(139) alone never matches. Try both spellings.
        target_str = next(
            (key for key in (str(target), str(float(target))) if key in param_log[model_name]),
            None,
        )
        if target_str is None or param_log[model_name][target_str]['mean_accuracy'] >= 0.75:
            continue
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        train_mask = X_train['ID_TARGET'] == target
        X_target = X_train[train_mask][top_features]
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        weights = y_train.loc[train_mask, 'RET_TARGET'].abs() * 1.5
        # GroupKFold needs the day of every training row; this was previously
        # an undefined module-level name.
        groups = X_train.loc[train_mask, 'ID_DAY']
        X_test_target = X_test[X_test['ID_TARGET'] == target][top_features]
        test_ids = X_test_target.index
        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue
        model = model_info['model']
        # Logged best params are scalars (one ParameterGrid point); unwrap
        # single-element lists as well so either shape is accepted.
        best_params = {
            name: (value[0] if isinstance(value, (list, tuple)) else value)
            for name, value in param_log[model_name][target_str]['best_params'].items()
        }
        if any(name not in best_params for name in ('learning_rate', 'reg_alpha', 'reg_lambda')):
            print(f"Warning: Incomplete best params for ID_TARGET {target}. Skipping.")
            continue
        # Each candidate starts from this target's best parameters; the shared
        # estimator may still carry settings left over from another target.
        fine_tune_grid = [
            {**best_params, 'learning_rate': best_params['learning_rate'] * 0.8, 'n_estimators': 200, 'reg_alpha': best_params['reg_alpha'] + 0.1, 'reg_lambda': best_params['reg_lambda'] + 0.1},
            {**best_params, 'learning_rate': best_params['learning_rate'] * 1.2, 'n_estimators': 200, 'reg_alpha': best_params['reg_alpha'] + 0.1, 'reg_lambda': best_params['reg_lambda'] + 0.1}
        ]
        best_score = param_log[model_name][target_str]['mean_accuracy']
        best_fine_params = best_params
        # The candidates are complete parameter dicts, so iterate them directly
        # (ParameterGrid only accepts list-valued grids).
        for params in fine_tune_grid:
            print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
            model.set_params(**params)
            fold_accuracies = []
            for train_idx, val_idx in kf.split(X_target, y_target, groups):
                X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
                y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
                w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
                if model_name == 'LightGBM' and model_info['callbacks']:
                    model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_val, y_val)], eval_metric='logloss', callbacks=model_info['callbacks'])
                elif model_name == 'XGBoost' and model_info['callbacks']:
                    model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_val, y_val)], verbose=False)
                else:
                    model.fit(X_tr, y_tr, sample_weight=w_tr)
                y_pred = model.predict(X_val)
                acc = weighted_accuracy(y_val, y_pred, w_val)
                fold_accuracies.append(acc)
            mean_acc = np.mean(fold_accuracies)
            print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
            if mean_acc > best_score:
                best_score = mean_acc
                best_fine_params = params
                param_log[model_name][target_str]['best_params'] = best_fine_params
                param_log[model_name][target_str]['mean_accuracy'] = best_score
                param_summary.append({
                    'model': model_name,
                    'ID_TARGET': target_str,
                    'best_params': best_fine_params,
                    'mean_accuracy': best_score,
                    'fold_accuracies': fold_accuracies
                })
        # Refit the winning configuration on every training row of the target.
        model.set_params(**best_fine_params)
        if model_name == 'LightGBM' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=model_info['callbacks'])
        elif model_name == 'XGBoost' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Save parameter log
# Write the per-model, per-target tuning log to JSON inside DATA_DIR, then
# hand the file to the Colab download helper.
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
with open(param_log_path, mode='w') as log_file:
    json.dump(param_log, log_file, indent=4)
print(
    f"Saved hyperparameter log to: {param_log_path}",
    flush=True,
)
files.download(param_log_path)

# Ensemble predictions with dynamic weights
# For every test target, blend the LightGBM and XGBoost probabilities with
# weights proportional to each model's logged CV accuracy, then emit +1 / -1
# per (ID_TARGET, ID_DAY) group from the last exponentially-weighted value.
final_predictions = {}
missing_day_targets = []
for target in X_test['ID_TARGET'].unique():
    target_str = str(target)
    # Slice the target's rows once; days are then grouped inside this slice
    # instead of re-masking the whole test frame for every (target, day) pair.
    target_rows = X_test[X_test['ID_TARGET'] == target]
    test_ids = target_rows.index
    lgbm_acc = param_log.get('LightGBM', {}).get(target_str, {}).get('mean_accuracy', 0.5)
    xgb_acc = param_log.get('XGBoost', {}).get(target_str, {}).get('mean_accuracy', 0.5)
    total_acc = lgbm_acc + xgb_acc
    ensemble_weights = {
        'LightGBM': lgbm_acc / total_acc if total_acc > 0 else 0.5,
        'XGBoost': xgb_acc / total_acc if total_acc > 0 else 0.5
    }
    # .get keeps a target that has no stored predictions from raising KeyError;
    # its rows fall back to the neutral 0.5 probability below.
    target_predictions = predictions.get(target_str, {})
    lgbm_lookup = target_predictions.get('LightGBM', {})
    xgb_lookup = target_predictions.get('XGBoost', {})
    # Rows without a day cannot be grouped; give only those rows the default
    # label rather than overwriting the whole target.
    undated_ids = test_ids[target_rows['ID_DAY'].isna().to_numpy()]
    if len(undated_ids) > 0:
        print(f"Warning: {len(undated_ids)} test samples for ID_TARGET {target} have no ID_DAY. Using default prediction.")
        missing_day_targets.append((target, np.nan))
        for idx in undated_ids:
            final_predictions[idx] = -1
    for day, day_test_ids in target_rows.groupby('ID_DAY', sort=False).groups.items():
        lgbm_preds = np.array([lgbm_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        xgb_preds = np.array([xgb_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
        # Every group has at least one row, so the last EWM value always exists.
        smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1]
        for idx in day_test_ids:
            final_predictions[idx] = 1 if smoothed_preds >= 0.5 else -1

if missing_day_targets:
    print(f"Missing day-target pairs: {missing_day_targets}")

# Create output CSV
# Turn the {ID: label} mapping into a two-column frame ordered by ID, pad any
# test ID that received no prediction with -1, and write the result to disk.
output_df = (
    pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET'])
    .reset_index()
    .set_axis(['ID', 'RET_TARGET'], axis=1)
    .sort_values('ID')
)
expected_ids = set(X_test.index)
submission_ids = set(output_df['ID'])
if submission_ids != expected_ids:
    missing_ids = expected_ids - submission_ids
    print(f"Warning: Submission missing {len(missing_ids)} IDs: {missing_ids}")
    missing_df = pd.DataFrame({'ID': list(missing_ids), 'RET_TARGET': -1})
    output_df = pd.concat([output_df, missing_df], ignore_index=True)
    output_df = output_df.sort_values('ID')
output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Validate output
# Read the written CSV back and report its shape, ID span, distinct labels
# and the number of missing cells.
test_csv = pd.read_csv(output_path)
id_values = test_csv['ID']
validation_report = [
    f"Output CSV shape: {test_csv.shape}",
    f"Output CSV ID range: {id_values.min()} to {id_values.max()}",
    f"Unique RET_TARGET values: {test_csv['RET_TARGET'].unique()}",
    f"Missing values: {test_csv.isna().sum().sum()}",
]
for report_line in validation_report:
    print(report_line, flush=True)

# Download the file
files.download(output_path)

# Print final results
# One line per model with the accuracy stored in `results`.
print("\nValidation Results:", flush=True)
for model_name, acc in results.items():
    print(f"{model_name}: {acc:.4f}", flush=True)

Starting preprocessing at 03:15:27
Selected top return columns: ['RET_262', 'RET_121', 'RET_88', 'RET_172', 'RET_261', 'RET_118', 'RET_150', 'RET_238']
Preprocessing completed in 45.61 seconds


KeyError: 'ID_DAY'

less complex than 74% -- 74.22

In [None]:

import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid

# Check xgboost version for compatibility
# Only the major version number is inspected.
print(f"XGBoost version: {xgboost.__version__}")
xgboost_major = int(xgboost.__version__.split('.')[0])
if xgboost_major < 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Set up data directory
DATA_DIR = '/content'

# Load datasets
# The three challenge files are indexed by ID and read as float32; the
# supplementary table is read with pandas' default dtypes.
x_train_path = os.path.join(DATA_DIR, 'X_train_itDkypA.csv')
y_train_path = os.path.join(DATA_DIR, 'y_train_3LeeT2g.csv')
x_test_path = os.path.join(DATA_DIR, 'X_test_Beg4ey3.csv')
supplementary_path = os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv')
try:
    X_train = pd.read_csv(x_train_path, index_col='ID', dtype=np.float32)
    y_train = pd.read_csv(y_train_path, index_col='ID', dtype=np.float32)
    X_test = pd.read_csv(x_test_path, index_col='ID', dtype=np.float32)
    supplementary = pd.read_csv(supplementary_path)
except FileNotFoundError as e:
    # Re-raise with a hint about where the files are expected.
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.")

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights):
    """Return the weight-averaged share of correctly predicted directions.

    The previous version ignored ``weights`` and weighted by ``|y_true|``.
    With the 0/1 class labels this file passes in, every negative sample got
    weight 0 and the score collapsed to unweighted recall on the positive
    class.

    Args:
        y_true: True direction per sample. Any value > 0 counts as "up", so
            both signed returns and 0/1 class labels are accepted.
        y_pred: Predicted class per sample; 1 means "up", anything else "down".
        weights: Per-sample importance (absolute values are used). ``None``
            falls back to ``|y_true|``.

    Returns:
        A float in [0, 1]; 0.0 when the weights sum to zero.
    """
    # Plain arrays avoid index alignment between Series and ndarray inputs.
    true_up = np.asarray(y_true) > 0
    pred_up = np.asarray(y_pred) == 1
    source = y_true if weights is None else weights
    sample_weights = np.abs(np.asarray(source, dtype=float))
    total_weight = sample_weights.sum()
    if total_weight == 0:
        return 0.0
    return float(np.sum(sample_weights * (true_up == pred_up)) / total_weight)

# Preprocess data with enhanced feature selection
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Build the model feature matrix from the raw return columns.

    Steps: pick the return columns used for derived features, impute their
    missing values, add sector averages and one-hot target-sector columns,
    add row mean/std, per-day cross-sectional ranks, pairwise products (train
    only), two PCA components, and standardise every feature column.

    Changes from the previous version:
      * The target sector is attached with ``Series.map`` instead of
        ``DataFrame.merge``. The merge replaced the ID index with 0..n-1, so
        features computed before it could misalign and the real IDs were lost.
      * On the training path the derived features now use only ``top_cols``,
        matching the test path. Before, train used every RET_ column and test
        only the selected ones.
      * ``X`` and ``y`` are copied, so the caller's frames are not modified.

    Args:
        X: Frame with ``ID_DAY``, ``ID_TARGET`` and ``RET_*`` columns.
        y: Frame with a ``RET_TARGET`` column; required when ``is_train``.
        supplementary: Optional frame with ``ID_asset`` and ``CLASS_LEVEL_1``.
        is_train: Select columns and build the label when True.
        top_cols: Return columns chosen on the training set; pass the training
            result when preprocessing test data.

    Returns:
        ``(X, y, feature_cols, top_cols)``; ``y`` is ``None`` when not training.

    Raises:
        ValueError: ``y`` is missing for training, or ``top_cols`` matches no
            return column.

    NOTE(review): PCA and StandardScaler are fitted on whatever frame is
    passed in, so train and test are transformed with separately fitted
    objects. The CORR_* products exist only for training data, so a later
    train/test column intersection drops them.
    """
    def _as_integer_ids(index):
        # The loader reads every column with dtype=float32; if that made the ID
        # index float, expose whole-number IDs as int64 so downstream output
        # keeps integer IDs.
        if pd.api.types.is_float_dtype(index.dtype) and not index.hasnans:
            as_int = index.astype('int64')
            if (as_int == index).all():
                return as_int
        return index

    if is_train and y is None:
        raise ValueError("y is required when is_train=True.")

    # Work on a copy: the imputation below assigns columns in place.
    X = X.copy()
    X.index = _as_integer_ids(X.index)

    ret_cols = [col for col in X.columns if col.startswith('RET_')]
    if is_train and top_cols is None:
        if len(ret_cols) > 15:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            valid_cols = variances[variances > 1e-5].index
            if len(valid_cols) < 15:
                top_cols = valid_cols.tolist()
            else:
                top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(15).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")
        # Derive features from the same columns the test path will use.
        selected = set(top_cols)
        ret_cols = [col for col in ret_cols if col in selected]
    elif top_cols is not None:
        selected = set(top_cols)
        ret_cols = [col for col in ret_cols if col in selected]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    # Impute each selected return column: row-wise median of the asset's
    # sector when available, otherwise the column median.
    sector_medians = {}
    if supplementary is not None:
        sector_map = supplementary.set_index('ID_asset')['CLASS_LEVEL_1']
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            if pd.notna(sector):
                sector_assets = supplementary[supplementary['CLASS_LEVEL_1'] == sector]['ID_asset']
                sector_cols = [f'RET_{a}' for a in sector_assets if f'RET_{a}' in X.columns]
                if sector_cols:
                    sector_medians[col] = X[sector_cols].median(axis=1)
            if col in sector_medians:
                X[col] = X[col].fillna(sector_medians[col])
            else:
                X[col] = X[col].fillna(X[col].median())

    new_cols = {}
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        sector_groups = supplementary.groupby(level)['ID_asset'].apply(list).to_dict()
        for sector, assets in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in assets if f'RET_{asset}' in ret_cols]
            if asset_cols:
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)
        # Look the target's sector up by value so the ID index is kept.
        sector_by_asset = {float(asset): sector for asset, sector in sector_map.items()}
        target_sector = X['ID_TARGET'].astype('float64').map(sector_by_asset)
        X = pd.concat([X, pd.get_dummies(target_sector, prefix=f'TARGET_{level}', dummy_na=True)], axis=1)

    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    # Percentile rank of each return within its day.
    new_cols.update({f'CS_RANK_{col}': X.groupby('ID_DAY')[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0) for col in ret_cols})

    if is_train:
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        # i < j keeps one ordering of each pair and skips the diagonal.
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:5]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    if ret_cols:
        pca = PCA(n_components=min(2, len(ret_cols)), random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        for i in range(pca_features.shape[1]):
            new_cols[f'PCA_{i}'] = pca_features[:, i]

    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)

    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))

    if is_train:
        # Label: 1 for a strictly positive target return, otherwise 0.
        y = y.copy()
        y.index = _as_integer_ids(y.index)
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Preprocess data
# Training data is processed first so that the return columns it selects can
# be handed to the test-set call.
try:
    X_train, y_train, feature_cols, top_cols = preprocess_data(
        X_train, y_train, supplementary, is_train=True
    )
    X_test, _, test_feature_cols, _ = preprocess_data(
        X_test, supplementary=supplementary, is_train=False, top_cols=top_cols
    )
except ValueError as e:
    print(f"Preprocessing error: {e}")
    raise

# Align columns
# Keep only the columns present in both frames, in training-column order.
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]
non_feature_cols = ('ID_DAY', 'ID_TARGET')
feature_cols = [col for col in common_cols if col not in non_feature_cols]

# Align y_train
# Reorder the labels to follow the training frame's index.
y_train = y_train.loc[X_train.index]

# Define models with reduced hyperparameter grids including regularization
# Each entry holds the base estimator, the grid searched per target, and a
# 'callbacks' list. For LightGBM the list is passed to fit(); for XGBoost the
# fit calls in this file only test it for truthiness and never pass it on.
models = {
    'LightGBM': {
        # Regularisation is set through reg_alpha / reg_lambda, the same names
        # the grid tunes. The constructor used to set lambda_l1 / lambda_l2,
        # which LightGBM prefers over those aliases, so the grid's
        # regularisation values were ignored.
        'model': LGBMClassifier(num_leaves=31, min_child_samples=15, reg_alpha=0.3, reg_lambda=0.3,
                                subsample=0.7, colsample_bytree=0.8, bagging_freq=5, bagging_fraction=0.7,
                                random_state=42, n_jobs=1, verbosity=-1, class_weight='balanced', force_row_wise=True),
        'param_grid': [
            {'learning_rate': [0.02], 'num_leaves': [15], 'min_child_samples': [20], 'bagging_fraction': [0.6], 'reg_alpha': [0.1], 'reg_lambda': [0.1]},
            {'learning_rate': [0.05], 'num_leaves': [31], 'min_child_samples': [10], 'bagging_fraction': [0.7], 'reg_alpha': [0.2], 'reg_lambda': [0.2]},
            {'learning_rate': [0.07], 'num_leaves': [47], 'min_child_samples': [15], 'bagging_fraction': [0.8], 'reg_alpha': [0.3], 'reg_lambda': [0.3]}
        ],
        'callbacks': [lightgbm.early_stopping(stopping_rounds=10, verbose=False)]
    },
    'XGBoost': {
        # scale_pos_weight is the negative/positive label ratio over the whole
        # training set, not per target.
        'model': XGBClassifier(max_depth=4, min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
                               random_state=42, n_jobs=1, eval_metric='logloss', verbosity=0,
                               scale_pos_weight=len(y_train[y_train['SIGN_TARGET'] == 0]) / len(y_train[y_train['SIGN_TARGET'] == 1])),
        'param_grid': [
            {'learning_rate': [0.03], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [150], 'reg_alpha': [0.1], 'reg_lambda': [1.0]},
            {'learning_rate': [0.05], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [200], 'reg_alpha': [0.2], 'reg_lambda': [1.5]},
            {'learning_rate': [0.07], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [150], 'reg_alpha': [0.3], 'reg_lambda': [1.0]}
        ],
        'callbacks': [xgboost.callback.EarlyStopping(rounds=10, metric_name='logloss', save_best=True)]
    }
}

# Function to train and predict for a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks):
    """Grid-search, fit and predict one model for a single ID_TARGET.

    Every point of ``param_grid`` is scored with the splitter ``kf`` (grouped
    by ``ID_DAY``) using ``weighted_accuracy``. The best-scoring configuration
    is refitted on all rows of the target and used to predict the positive
    class probability for the target's test rows.

    Returns:
        A 7-tuple ``(str(target), best_params, best_score, param_results,
        test_ids, preds, top_features)``. ``top_features`` holds up to 30
        ``(feature, importance)`` pairs. When the target has fewer than 100
        training rows, no test rows, or no configuration scored above 0, the
        tuple carries empty params, score 0, all-zero predictions and no
        features.
    """
    train_mask = X_train['ID_TARGET'] == target
    train_rows = X_train[train_mask]
    X_target = train_rows[feature_cols]
    y_target = y_train.loc[train_mask, 'SIGN_TARGET']

    # Sample weights are the absolute target returns, scaled up for listed
    # targets and again for targets with fewer than 500 rows.
    low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]
    weight_scale = 1.0
    if target in low_performing_targets:
        weight_scale = 1.5
    if len(X_target) < 500:
        weight_scale *= 1.5
    weights = y_train.loc[train_mask, 'RET_TARGET'].abs() * weight_scale

    groups = train_rows['ID_DAY']
    test_rows = X_test[X_test['ID_TARGET'] == target]
    X_test_target = test_rows[feature_cols]
    test_ids = test_rows.index

    def skipped_result():
        # Shape of the return value when nothing could be trained.
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    def fit_estimator(estimator, X_fit, y_fit, w_fit, X_eval, y_eval):
        # Only the LightGBM branch forwards the callbacks to fit(); the
        # XGBoost branch passes an eval set alone.
        if callbacks and model_name == 'LightGBM':
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], eval_metric='logloss', callbacks=callbacks)
        elif callbacks and model_name == 'XGBoost':
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
        else:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit)

    if len(X_target) < 100 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return skipped_result()

    param_results = []
    best_params, best_model = None, None
    best_score = 0

    for params in ParameterGrid(param_grid):
        print(f"Testing params for {model_name} ID_TARGET {target}: {params}", flush=True)
        try:
            model.set_params(**params)
        except ValueError as e:
            print(f"Invalid params for {model_name} ID_TARGET {target}: {e}")
            continue

        fold_accuracies = []
        for train_idx, val_idx in kf.split(X_target, y_target, groups):
            X_val = X_target.iloc[val_idx]
            y_val = y_target.iloc[val_idx]
            w_val = weights.iloc[val_idx]
            fit_estimator(model, X_target.iloc[train_idx], y_target.iloc[train_idx], weights.iloc[train_idx], X_val, y_val)
            fold_accuracies.append(weighted_accuracy(y_val, model.predict(X_val), w_val))

        mean_acc = np.mean(fold_accuracies)
        print(f"{model_name} | ID_TARGET: {target} | Params: {params} | Mean Accuracy: {mean_acc:.4f} | Fold Accuracies: {fold_accuracies}", flush=True)
        param_results.append({
            'params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies
        })
        if mean_acc > best_score:
            best_score, best_params = mean_acc, params
            # Fresh, unfitted estimator carrying the current settings.
            best_model = type(model)(**model.get_params())

    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return skipped_result()

    # Final fit on every row of the target; the eval set is the training data.
    best_model.set_params(**best_params)
    fit_estimator(best_model, X_target, y_target, weights, X_target, y_target)
    preds = best_model.predict_proba(X_test_target)[:, 1].astype(np.float32)

    feature_importance = dict(zip(feature_cols, best_model.feature_importances_))
    ranked_features = sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)
    top_features = ranked_features[:30]
    gc.collect()
    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training
# Run train_target for every training target in parallel, once per model, and
# collect the tuning log, the important features and the test predictions.
kf = GroupKFold(n_splits=5)
results = {}
predictions = {}
ensemble_weights = {'LightGBM': 0.7, 'XGBoost': 0.3}
param_log = {}
top_features_all = {}
param_summary = []
low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]

for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train, y_train, X_test, feature_cols, model_info['model'], model_name, kf,
                             model_info['param_grid'], model_info['callbacks'])
        for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets")
    )
    accuracies = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))
        # Report the folds of the winning configuration. The last grid entry
        # was used before, so the printed folds did not match the printed mean.
        best_fold_accuracies = next(
            (entry['fold_accuracies'] for entry in param_results if entry['params'] == params),
            [],
        )
        param_summary.append({
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_fold_accuracies
        })
    # Skipped targets report an accuracy of 0 and are left out of the average;
    # guard against every target having been skipped.
    valid_accuracies = [acc for acc in accuracies if acc > 0]
    results[model_name] = np.mean(valid_accuracies) if valid_accuracies else float('nan')
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)
    print(f"\n{model_name} Parameter Summary:", flush=True)
    for summary in param_summary:
        if summary['model'] == model_name:
            print(f"ID_TARGET: {summary['ID_TARGET']} | Best Params: {summary['best_params']} | Mean Accuracy: {summary['mean_accuracy']:.4f} | Fold Accuracies: {summary['fold_accuracies']}", flush=True)

# Second-pass training with top 30 features
# Refit every model/target pair on the features ranked most important in the
# first pass and replace that pair's test predictions with the refit model's.
print("\nPerforming second-pass training with top 30 features...", flush=True)
for model_name, model_info in models.items():
    print(f"\nSecond-pass {model_name}...", flush=True)
    for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        # Evaluate each row mask once and reuse it for every slice below.
        train_mask = X_train['ID_TARGET'] == target
        test_mask = X_test['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        weights = y_train.loc[train_mask, 'RET_TARGET'].abs()
        weight_scale = 1.5 if target in low_performing_targets else 1.0
        if len(X_target) < 500:
            weight_scale *= 1.5
        weights = weights * weight_scale
        X_test_target = X_test.loc[test_mask, top_features]
        test_ids = X_test_target.index
        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue
        model = model_info['model']
        # Copy before overriding n_estimators so the cross-validated parameters
        # recorded in param_log (and later dumped to JSON) are not rewritten in place.
        best_params = dict(param_log[model_name][target_str]['best_params'])
        best_params['n_estimators'] = 200
        model.set_params(**best_params)
        # The training data doubles as the eval set here, so the eval metric is
        # measured on the rows the model was fitted on.
        if model_name == 'LightGBM' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=model_info['callbacks'])
        elif model_name == 'XGBoost' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Third-pass training for low-performing targets
# For each listed target whose logged CV accuracy is below 0.75, try a few
# learning-rate / regularisation variations around its best parameters, keep
# any variation that improves the grouped-CV score, refit on all rows of the
# target and overwrite its test predictions.
print("\nPerforming third-pass training for low-performing targets...", flush=True)
for model_name, model_info in models.items():
    print(f"\nThird-pass {model_name}...", flush=True)
    for target in tqdm(low_performing_targets, desc=f"{model_name} Low-Performing Targets"):
        # The logs are keyed by str() of the ID_TARGET values taken from the data
        # (floats, e.g. '139.0'), while low_performing_targets holds ints, so
        # str(139) alone never matches and the whole pass was skipped.
        target_str = next(
            (key for key in (str(target), str(float(target))) if key in param_log[model_name]),
            None,
        )
        if target_str is None or param_log[model_name][target_str]['mean_accuracy'] >= 0.75:
            continue
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        train_mask = X_train['ID_TARGET'] == target
        X_target = X_train[train_mask][top_features]
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        weights = y_train.loc[train_mask, 'RET_TARGET'].abs() * 1.5
        if len(X_target) < 500:
            weights = weights * 1.5
        # GroupKFold needs the day of every training row; this was previously
        # an undefined module-level name.
        groups = X_train.loc[train_mask, 'ID_DAY']
        X_test_target = X_test[X_test['ID_TARGET'] == target][top_features]
        test_ids = X_test_target.index
        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue
        model = model_info['model']
        # Logged best params are scalars (one ParameterGrid point); unwrap
        # single-element lists as well so either shape is accepted.
        best_params = {
            name: (value[0] if isinstance(value, (list, tuple)) else value)
            for name, value in param_log[model_name][target_str]['best_params'].items()
        }
        if any(name not in best_params for name in ('learning_rate', 'reg_alpha', 'reg_lambda')):
            print(f"Warning: Incomplete best params for ID_TARGET {target}. Skipping.")
            continue
        # Each candidate starts from this target's best parameters; the shared
        # estimator may still carry settings left over from another target.
        fine_tune_grid = [
            {**best_params, 'learning_rate': best_params['learning_rate'] * 0.8, 'n_estimators': 200, 'reg_alpha': best_params['reg_alpha'] + 0.1, 'reg_lambda': best_params['reg_lambda'] + 0.1},
            {**best_params, 'learning_rate': best_params['learning_rate'] * 1.0, 'n_estimators': 250, 'reg_alpha': best_params['reg_alpha'], 'reg_lambda': best_params['reg_lambda']},
            {**best_params, 'learning_rate': best_params['learning_rate'] * 1.2, 'n_estimators': 200, 'reg_alpha': best_params['reg_alpha'] + 0.1, 'reg_lambda': best_params['reg_lambda'] + 0.1}
        ]
        best_score = param_log[model_name][target_str]['mean_accuracy']
        best_fine_params = best_params
        # The candidates are complete parameter dicts, so iterate them directly
        # (ParameterGrid only accepts list-valued grids).
        for params in fine_tune_grid:
            print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
            model.set_params(**params)
            fold_accuracies = []
            for train_idx, val_idx in kf.split(X_target, y_target, groups):
                X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
                y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
                w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
                if model_name == 'LightGBM' and model_info['callbacks']:
                    model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_val, y_val)], eval_metric='logloss', callbacks=model_info['callbacks'])
                elif model_name == 'XGBoost' and model_info['callbacks']:
                    model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_val, y_val)], verbose=False)
                else:
                    model.fit(X_tr, y_tr, sample_weight=w_tr)
                y_pred = model.predict(X_val)
                acc = weighted_accuracy(y_val, y_pred, w_val)
                fold_accuracies.append(acc)
            mean_acc = np.mean(fold_accuracies)
            print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
            if mean_acc > best_score:
                best_score = mean_acc
                best_fine_params = params
                param_log[model_name][target_str]['best_params'] = best_fine_params
                param_log[model_name][target_str]['mean_accuracy'] = best_score
                param_summary.append({
                    'model': model_name,
                    'ID_TARGET': target_str,
                    'best_params': best_fine_params,
                    'mean_accuracy': best_score,
                    'fold_accuracies': fold_accuracies
                })
        # Refit the winning configuration on every training row of the target.
        model.set_params(**best_fine_params)
        if model_name == 'LightGBM' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=model_info['callbacks'])
        elif model_name == 'XGBoost' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Save parameter log
# Write the per-model, per-target tuning log to JSON inside DATA_DIR, then
# hand the file to the Colab download helper.
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
with open(param_log_path, mode='w') as log_file:
    json.dump(param_log, log_file, indent=4)
print(
    f"Saved hyperparameter log to: {param_log_path}",
    flush=True,
)
files.download(param_log_path)

# Ensemble predictions
# Blend the LightGBM and XGBoost probabilities with the fixed weights in
# `ensemble_weights`, then emit +1 / -1 per (ID_TARGET, ID_DAY) group from the
# last exponentially-weighted value of the blend.
final_predictions = {}
missing_day_targets = []
for target in X_test['ID_TARGET'].unique():
    target_str = str(target)
    # Slice the target's rows once; days are then grouped inside this slice
    # instead of re-masking the whole test frame for every (target, day) pair.
    target_rows = X_test[X_test['ID_TARGET'] == target]
    test_ids = target_rows.index
    # .get keeps a target that has no stored predictions from raising KeyError;
    # its rows fall back to the neutral 0.5 probability below.
    target_predictions = predictions.get(target_str, {})
    lgbm_lookup = target_predictions.get('LightGBM', {})
    xgb_lookup = target_predictions.get('XGBoost', {})
    # Rows without a day cannot be grouped; give only those rows the default
    # label rather than overwriting the whole target.
    undated_ids = test_ids[target_rows['ID_DAY'].isna().to_numpy()]
    if len(undated_ids) > 0:
        print(f"Warning: {len(undated_ids)} test samples for ID_TARGET {target} have no ID_DAY. Using default prediction.")
        missing_day_targets.append((target, np.nan))
        for idx in undated_ids:
            final_predictions[idx] = -1
    for day, day_test_ids in target_rows.groupby('ID_DAY', sort=False).groups.items():
        lgbm_preds = np.array([lgbm_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        xgb_preds = np.array([xgb_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
        # Every group has at least one row, so the last EWM value always exists.
        smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1]
        for idx in day_test_ids:
            final_predictions[idx] = 1 if smoothed_preds >= 0.5 else -1

if missing_day_targets:
    print(f"Missing day-target pairs: {missing_day_targets}")

# Create output CSV
# Turn the {ID: label} mapping into a two-column frame ordered by ID, pad any
# test ID that received no prediction with -1, and write the result to disk.
output_df = (
    pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET'])
    .reset_index()
    .set_axis(['ID', 'RET_TARGET'], axis=1)
    .sort_values('ID')
)
expected_ids = set(X_test.index)
submission_ids = set(output_df['ID'])
if submission_ids != expected_ids:
    missing_ids = expected_ids - submission_ids
    print(f"Warning: Submission missing {len(missing_ids)} IDs: {missing_ids}")
    missing_df = pd.DataFrame({'ID': list(missing_ids), 'RET_TARGET': -1})
    output_df = pd.concat([output_df, missing_df], ignore_index=True)
    output_df = output_df.sort_values('ID')
output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Validate output
# Read the written CSV back and report its shape, ID span, distinct labels
# and the number of missing cells.
test_csv = pd.read_csv(output_path)
id_values = test_csv['ID']
validation_report = [
    f"Output CSV shape: {test_csv.shape}",
    f"Output CSV ID range: {id_values.min()} to {id_values.max()}",
    f"Unique RET_TARGET values: {test_csv['RET_TARGET'].unique()}",
    f"Missing values: {test_csv.isna().sum().sum()}",
]
for report_line in validation_report:
    print(report_line, flush=True)

# Download the file
files.download(output_path)

# Print final results
# One line per model with the accuracy stored in `results`.
print("\nValidation Results:", flush=True)
for model_name, acc in results.items():
    print(f"{model_name}: {acc:.4f}", flush=True)


XGBoost version: 2.1.4
Selected top return columns: ['RET_262', 'RET_88', 'RET_172', 'RET_259', 'RET_150', 'RET_118', 'RET_216', 'RET_268', 'RET_115', 'RET_261', 'RET_59', 'RET_238', 'RET_121', 'RET_97', 'RET_30']

Training LightGBM...


LightGBM Targets: 100%|██████████| 100/100 [10:45<00:00,  6.46s/it]


LightGBM Weighted Accuracy: 0.7327

LightGBM Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.02, 'min_child_samples': 20, 'num_leaves': 15, 'reg_alpha': 0.1, 'reg_lambda': 0.1} | Mean Accuracy: 0.9173 | Fold Accuracies: [np.float64(0.7531914893617021), np.float64(0.83203125), np.float64(0.7330677290836654), np.float64(0.8255813953488372), np.float64(0.8112449799196787)]
ID_TARGET: 129.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.02, 'min_child_samples': 20, 'num_leaves': 15, 'reg_alpha': 0.1, 'reg_lambda': 0.1} | Mean Accuracy: 0.6481 | Fold Accuracies: [np.float64(0.5959183673469388), np.float64(0.6504065040650406), np.float64(0.6363636363636364), np.float64(0.6118143459915611), np.float64(0.7090909090909091)]
ID_TARGET: 136.0 | Best Params: {'bagging_fraction': 0.8, 'learning_rate': 0.07, 'min_child_samples': 15, 'num_leaves': 47, 'reg_alpha': 0.3, 'reg_lambda': 0.3} | Mean Accuracy: 0.6080 | Fold Accuracies: [np.float6

XGBoost Targets: 100%|██████████| 100/100 [18:40<00:00, 11.21s/it]


XGBoost Weighted Accuracy: 0.7645

XGBoost Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'reg_alpha': 0.3, 'reg_lambda': 1.0, 'subsample': 0.8} | Mean Accuracy: 0.7133 | Fold Accuracies: [np.float64(0.7574468085106383), np.float64(0.67578125), np.float64(0.7729083665338645), np.float64(0.6937984496124031), np.float64(0.6666666666666666)]
ID_TARGET: 129.0 | Best Params: {'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'reg_alpha': 0.3, 'reg_lambda': 1.0, 'subsample': 0.8} | Mean Accuracy: 0.6275 | Fold Accuracies: [np.float64(0.6040816326530613), np.float64(0.6869918699186992), np.float64(0.6060606060606061), np.float64(0.6075949367088608), np.float64(0.6327272727272727)]
ID_TARGET: 136.0 | Best Params: {'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'reg_alpha': 0.3, 'reg_lambda': 1.0, 'subsample': 0.8} | Mean Accuracy: 1.0000 |

LightGBM Targets: 100%|██████████| 100/100 [00:56<00:00,  1.78it/s]


Second-pass XGBoost...



XGBoost Targets: 100%|██████████| 100/100 [01:08<00:00,  1.46it/s]


Performing third-pass training for low-performing targets...

Third-pass LightGBM...



LightGBM Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 41304.69it/s]


Third-pass XGBoost...



XGBoost Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 55856.35it/s]

Saved hyperparameter log to: /content/hyperparameters.json





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved predictions to: /content/predictions_ensemble.csv
Output CSV shape: (114468, 2)
Output CSV ID range: 0 to 114467
Unique RET_TARGET values: [ 1 -1]
Missing values: 0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Validation Results:
LightGBM: 0.7327
XGBoost: 0.7645


74%

In [None]:
import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid

# Report the installed XGBoost release; warn when its major version is below 1.
print(f"XGBoost version: {xgboost.__version__}")
if not int(xgboost.__version__.partition('.')[0]) >= 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Directory that holds the uploaded challenge CSV files.
DATA_DIR = '/content'

# Read the three ID-indexed frames as float32, plus the supplementary table
# with pandas' default dtypes. A missing file is re-raised with a hint.
try:
    X_train = pd.read_csv(
        os.path.join(DATA_DIR, 'X_train_itDkypA.csv'),
        index_col='ID',
        dtype=np.float32,
    )
    y_train = pd.read_csv(
        os.path.join(DATA_DIR, 'y_train_3LeeT2g.csv'),
        index_col='ID',
        dtype=np.float32,
    )
    X_test = pd.read_csv(
        os.path.join(DATA_DIR, 'X_test_Beg4ey3.csv'),
        index_col='ID',
        dtype=np.float32,
    )
    supplementary = pd.read_csv(
        os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv')
    )
except FileNotFoundError as e:
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.")

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights=None):
    """Return the weight-normalised share of correctly predicted directions.

    Both inputs are reduced to an up/down direction before comparison, so the
    metric works whether ``y_true`` holds signed returns or 0/1 class labels.

    Args:
        y_true: Array-like of signed returns or 0/1 labels; a value > 0 counts
            as "up", anything else as "down".
        y_pred: Array-like of predicted labels; a value equal to 1 counts as
            "up", anything else as "down".
        weights: Optional array-like of per-sample weights (their absolute
            value is used). When omitted, ``abs(y_true)`` is used.

    Returns:
        float: ``sum(w * correct) / sum(w)``, or 0.0 when the weights sum to 0.
    """
    true_values = np.asarray(y_true, dtype=float)
    true_up = true_values > 0
    pred_up = np.asarray(y_pred) == 1

    if weights is None:
        sample_weights = np.abs(true_values)
    else:
        sample_weights = np.abs(np.asarray(weights, dtype=float))

    total_weight = sample_weights.sum()
    if total_weight == 0:
        # Nothing to score; avoid a 0/0 division.
        return 0.0
    return float(np.sum(sample_weights * (true_up == pred_up)) / total_weight)

# Preprocess data with enhanced feature selection
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Build the model feature matrix from the raw return columns.

    Steps, in order: choose the ``RET_*`` columns to engineer from, impute
    their missing values, add sector averages and one-hot target-sector flags
    (only when ``supplementary`` is given), add cross-sectional statistics,
    products of the most correlated column pairs (training only), up to two
    PCA components, and finally standardise every feature column.

    The same ``top_cols`` subset drives the engineered features in both the
    training and the inference call, and the index of ``X`` is preserved.

    Args:
        X: Frame with ``ID_DAY``, ``ID_TARGET`` and ``RET_*`` columns. When
            ``supplementary`` is given, imputed values are written into this
            frame's selected ``RET_*`` columns.
        y: Frame with a ``RET_TARGET`` column; required when ``is_train``.
        supplementary: Optional frame with ``ID_asset`` and ``CLASS_LEVEL_1``.
        is_train: Whether this is the training call (selects ``top_cols`` when
            none are given, adds correlation features, builds ``SIGN_TARGET``).
        top_cols: Return columns to engineer from; ``None`` on the training
            call means "select them here".

    Returns:
        tuple: ``(X, y, feature_cols, top_cols)``. ``y`` is a copy with an
        added 0/1 ``SIGN_TARGET`` column on the training call, else ``None``.

    Raises:
        ValueError: If ``is_train`` is set without ``y``, or if no return
            column survives the ``top_cols`` filter.
    """
    if is_train and y is None:
        raise ValueError("y must be provided when is_train=True.")

    ret_cols = [col for col in X.columns if col.startswith('RET_')]

    # Dynamic feature selection with minimum variance threshold
    if is_train and top_cols is None:
        if len(ret_cols) > 15:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            # Filter out low-variance features (threshold = 1e-5)
            valid_cols = variances[variances > 1e-5].index
            if len(valid_cols) < 15:
                top_cols = valid_cols.tolist()
            else:
                top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(15).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")

    # Apply the selection on every path so the engineered features below are
    # computed over the same columns for the training and the test frame.
    if top_cols is not None:
        selected = set(top_cols)
        ret_cols = [col for col in ret_cols if col in selected]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    # Impute each selected column with the row-wise median of its sector's
    # return columns, falling back to the column's own median.
    if supplementary is not None:
        sector_map = supplementary.set_index('ID_asset')['CLASS_LEVEL_1']
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            fill_values = None
            if pd.notna(sector):
                sector_assets = supplementary[supplementary['CLASS_LEVEL_1'] == sector]['ID_asset']
                sector_cols = [f'RET_{a}' for a in sector_assets if f'RET_{a}' in X.columns]
                if sector_cols:
                    fill_values = X[sector_cols].median(axis=1)
            if fill_values is None:
                fill_values = X[col].median()
            X[col] = X[col].fillna(fill_values)

    # Sector-based features
    new_cols = {}
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        sector_groups = supplementary.groupby(level)['ID_asset'].apply(list).to_dict()
        for sector, assets in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in assets if f'RET_{asset}' in ret_cols]
            if asset_cols:
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)

        # Look the target's sector up with map() rather than merge(): a
        # column-on-column merge returns a fresh RangeIndex, which would drop
        # the row IDs and misalign the Series collected in new_cols.
        sector_lookup = supplementary.drop_duplicates('ID_asset').set_index('ID_asset')[level]
        sector_lookup.index = sector_lookup.index.astype('float64')
        target_sector = X['ID_TARGET'].astype('float64').map(sector_lookup)
        target_dummies = pd.get_dummies(target_sector, prefix=f'TARGET_{level}', dummy_na=True)
        X = pd.concat([X, target_dummies], axis=1)

    # Cross-sectional features
    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    new_cols.update({f'CS_RANK_{col}': X.groupby('ID_DAY')[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0) for col in ret_cols})

    # Correlation-based features (top 5 pairs).
    # NOTE(review): these are only produced on the training call, so a caller
    # that keeps just the columns shared by train and test will drop them.
    if is_train:
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:5]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    # PCA features (2 components).
    # NOTE(review): PCA and the scaler below are re-fitted on whatever frame is
    # passed in, so train and test are transformed by separately fitted models.
    if ret_cols:
        pca = PCA(n_components=min(2, len(ret_cols)), random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        for i in range(pca_features.shape[1]):
            new_cols[f'PCA_{i}'] = pca_features[:, i]

    # Add new features to DataFrame
    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)

    # Standardize numerical features
    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))

    if is_train:
        # Work on a copy so the caller's label frame is left untouched.
        y = y.copy()
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Run the feature pipeline: the training call selects the return columns and
# the test call reuses that selection through top_cols.
try:
    X_train, y_train, feature_cols, top_cols = preprocess_data(
        X_train, y_train, supplementary, is_train=True
    )
    X_test, _, test_feature_cols, _ = preprocess_data(
        X_test, supplementary=supplementary, is_train=False, top_cols=top_cols
    )
except ValueError as e:
    print(f"Preprocessing error: {e}")
    raise

# Keep only the columns present in both frames (in X_train's order) and
# rebuild the feature list without the two identifier columns.
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]
feature_cols = [name for name in common_cols if name not in ('ID_DAY', 'ID_TARGET')]

# Re-order the labels so they follow X_train's row index.
y_train = y_train.loc[X_train.index]

# Model registry: for each model name, a base estimator, the list of
# single-point hyperparameter grids to try, and a list of callbacks.
# NOTE(review): LightGBM receives both `subsample` and `bagging_fraction`;
# these look like aliases of one setting - confirm which one takes effect.
# NOTE(review): the training code in this cell only tests the XGBoost
# 'callbacks' entry for truthiness and never passes it to fit(), so XGBoost
# early stopping is presumably inactive - confirm.
models = dict(
    LightGBM=dict(
        model=LGBMClassifier(
            num_leaves=31, min_child_samples=15, lambda_l1=0.3, lambda_l2=0.3,
            subsample=0.7, colsample_bytree=0.8, bagging_freq=5, bagging_fraction=0.7,
            random_state=42, n_jobs=1, verbosity=-1, class_weight='balanced', force_row_wise=True,
        ),
        param_grid=[
            dict(learning_rate=[0.02], num_leaves=[15], min_child_samples=[20], bagging_fraction=[0.6]),
            dict(learning_rate=[0.05], num_leaves=[31], min_child_samples=[10], bagging_fraction=[0.7]),
            dict(learning_rate=[0.1], num_leaves=[63], min_child_samples=[15], bagging_fraction=[0.8]),
            dict(learning_rate=[0.07], num_leaves=[47], min_child_samples=[12], bagging_fraction=[0.65]),
        ],
        callbacks=[lightgbm.early_stopping(stopping_rounds=10, verbose=False)],
    ),
    XGBoost=dict(
        model=XGBClassifier(
            max_depth=4, min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
            random_state=42, n_jobs=1, eval_metric='logloss', verbosity=0,
            # Ratio of negative to positive rows over the whole training set.
            scale_pos_weight=(
                int((y_train['SIGN_TARGET'] == 0).sum())
                / int((y_train['SIGN_TARGET'] == 1).sum())
            ),
        ),
        param_grid=[
            dict(learning_rate=[0.03], max_depth=[3], min_child_weight=[1], n_estimators=[150]),
            dict(learning_rate=[0.05], max_depth=[5], min_child_weight=[2], n_estimators=[200]),
            dict(learning_rate=[0.1], max_depth=[4], min_child_weight=[3], n_estimators=[150]),
            dict(learning_rate=[0.07], max_depth=[6], min_child_weight=[2], n_estimators=[200]),
        ],
        callbacks=[xgboost.callback.EarlyStopping(rounds=10, metric_name='logloss', save_best=True)],
    ),
)

# Function to train and predict for a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks):
    """Grid-search, fit and predict one model for a single ``ID_TARGET``.

    Every parameter combination is scored with grouped cross-validation
    (groups are ``ID_DAY``). Folds are scored against the signed
    ``RET_TARGET`` values rather than the 0/1 labels, so each row counts in
    proportion to its absolute return and both directions are evaluated. The
    best combination is then refitted on all rows of the target and used to
    predict the target's test rows.

    Args:
        target: ``ID_TARGET`` value to model.
        X_train: Training features incl. ``ID_TARGET`` and ``ID_DAY``.
        y_train: Labels with ``SIGN_TARGET`` and ``RET_TARGET``, indexed like
            ``X_train``.
        X_test: Test features incl. ``ID_TARGET``.
        feature_cols: Names of the columns fed to the model.
        model: Estimator instance; its parameters are changed via
            ``set_params`` during the search.
        model_name: ``'LightGBM'`` or ``'XGBoost'``; selects the fit call.
        kf: Splitter exposing ``split(X, y, groups)``.
        param_grid: Grid specification passed to ``ParameterGrid``.
        callbacks: Callback list; passed to LightGBM fits, used only as an
            on/off flag for the XGBoost ``eval_set`` fit.

    Returns:
        tuple: ``(str(target), best_params, best_score, param_results,
        test_ids, preds, top_features)``. For skipped targets the params are
        ``{}``, the score is 0, the predictions are zeros and both lists are
        empty.
    """
    # Compute each row mask once instead of per selection.
    train_mask = X_train['ID_TARGET'] == target
    test_mask = X_test['ID_TARGET'] == target

    X_target = X_train.loc[train_mask, feature_cols]
    y_target = y_train.loc[train_mask, 'SIGN_TARGET']
    signed_returns = y_train.loc[train_mask, 'RET_TARGET']
    weights = signed_returns.abs()
    low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]
    if target in low_performing_targets:
        weights = weights * 1.5
    groups = X_train.loc[train_mask, 'ID_DAY']
    X_test_target = X_test.loc[test_mask, feature_cols]
    test_ids = X_test_target.index

    # Check for sufficient data
    if len(X_target) < 100 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    def _fit(estimator, X_fit, y_fit, w_fit, eval_pair):
        """Fit ``estimator`` with the call signature its library expects."""
        if model_name == 'LightGBM' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[eval_pair],
                          eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[eval_pair], verbose=False)
        else:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit)

    param_results = []
    best_params = None
    best_score = 0
    best_model = None

    # First-pass grid search with sample weights
    for params in ParameterGrid(param_grid):
        print(f"Testing params for {model_name} ID_TARGET {target}: {params}", flush=True)
        try:
            model.set_params(**params)
        except ValueError as e:
            print(f"Invalid params for {model_name} ID_TARGET {target}: {e}")
            continue
        fold_accuracies = []
        for train_idx, val_idx in kf.split(X_target, y_target, groups):
            X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
            y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
            w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]

            _fit(model, X_tr, y_tr, w_tr, (X_val, y_val))
            y_pred = model.predict(X_val)
            # Score against signed returns: with 0/1 labels the negative class
            # can never be credited and the score degenerates to recall.
            acc = weighted_accuracy(signed_returns.iloc[val_idx], y_pred, w_val)
            fold_accuracies.append(acc)

        mean_acc = np.mean(fold_accuracies)
        print(f"{model_name} | ID_TARGET: {target} | Params: {params} | Mean Accuracy: {mean_acc:.4f} | Fold Accuracies: {fold_accuracies}", flush=True)
        param_results.append({
            'params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies
        })
        if mean_acc > best_score:
            best_score = mean_acc
            best_params = params
            # Fresh, unfitted estimator carrying the current configuration.
            best_model = type(model)(**model.get_params())

    # Train with best parameters
    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    best_model.set_params(**best_params)
    # NOTE(review): the evaluation set here is the training data itself, so
    # any early stopping monitors the training loss - confirm this is intended.
    _fit(best_model, X_target, y_target, weights, (X_target, y_target))

    # Predict
    preds = best_model.predict_proba(X_test_target)[:, 1].astype(np.float32)

    # Feature importance
    importance = best_model.feature_importances_
    feature_importance = dict(zip(feature_cols, importance))
    top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:30]

    # Clean up memory
    gc.collect()

    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training
kf = GroupKFold(n_splits=5)  # Increased to 5 folds
results = {}
predictions = {}
ensemble_weights = {'LightGBM': 0.7, 'XGBoost': 0.3}
param_log = {}
top_features_all = {}
param_summary = []
low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]

for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}

    # First-pass training: one independent job per target.
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train, y_train, X_test, feature_cols, model_info['model'], model_name, kf,
                             model_info['param_grid'], model_info['callbacks'])
        for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets")
    )

    # Collect results
    accuracies = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))
        # Report the folds of the winning parameter set, not of whichever
        # grid entry happened to be evaluated last.
        best_entry = next((entry for entry in param_results if entry['params'] == params), None)
        param_summary.append({
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_entry['fold_accuracies'] if best_entry else []
        })

    # Exclude skipped targets (reported with accuracy 0); NaN if none remain.
    scored_accuracies = [acc for acc in accuracies if acc > 0]
    results[model_name] = np.mean(scored_accuracies) if scored_accuracies else float('nan')
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)

    # Print parameter summary
    print(f"\n{model_name} Parameter Summary:", flush=True)
    for summary in param_summary:
        if summary['model'] == model_name:
            print(f"ID_TARGET: {summary['ID_TARGET']} | Best Params: {summary['best_params']} | Mean Accuracy: {summary['mean_accuracy']:.4f} | Fold Accuracies: {summary['fold_accuracies']}", flush=True)

# Second-pass training with top 30 features: refit each target's model on its
# most important first-pass features and overwrite the stored predictions.
print("\nPerforming second-pass training with top 30 features...", flush=True)
for model_name, model_info in models.items():
    print(f"\nSecond-pass {model_name}...", flush=True)
    for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue

        # Compute each row mask once instead of per selection.
        train_mask = X_train['ID_TARGET'] == target
        test_mask = X_test['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        weights = y_train.loc[train_mask, 'RET_TARGET'].abs()
        if target in low_performing_targets:
            weights = weights * 1.5
        X_test_target = X_test.loc[test_mask, top_features]
        test_ids = X_test_target.index

        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue

        model = model_info['model']
        # Build a new dict: assigning into the logged one would silently
        # rewrite the cross-validated parameters kept in param_log.
        best_params = {**param_log[model_name][target_str]['best_params'], 'n_estimators': 200}  # Increase for better convergence
        model.set_params(**best_params)
        if model_name == 'LightGBM' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=model_info['callbacks'])
        elif model_name == 'XGBoost' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Third-pass training for low-performing targets: try a slower and a faster
# learning rate around the logged best parameters, keep whichever improves the
# grouped-CV score, then refit and overwrite the stored predictions.
print("\nPerforming third-pass training for low-performing targets...", flush=True)
for model_name, model_info in models.items():
    print(f"\nThird-pass {model_name}...", flush=True)
    # The log is keyed by str(<ID_TARGET value>) (e.g. '139.0' for a float
    # column), so str(139) would never match. Resolve keys numerically.
    log_keys = {float(key): key for key in param_log[model_name]}
    for target in tqdm(low_performing_targets, desc=f"{model_name} Low-Performing Targets"):
        target_str = log_keys.get(float(target))
        if target_str is None or param_log[model_name][target_str]['mean_accuracy'] >= 0.75:
            continue  # Skip if already performing well
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue

        train_mask = X_train['ID_TARGET'] == target
        test_mask = X_test['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        signed_returns = y_train.loc[train_mask, 'RET_TARGET']
        weights = signed_returns.abs() * 2.0  # Increase weight
        groups = X_train.loc[train_mask, 'ID_DAY']  # CV must not split a day
        X_test_target = X_test.loc[test_mask, top_features]
        test_ids = X_test_target.index

        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue

        model = model_info['model']
        best_params = param_log[model_name][target_str]['best_params']
        base_lr = best_params.get('learning_rate')
        if base_lr is None:
            print(f"Warning: No logged learning_rate for ID_TARGET {target}. Skipping.")
            continue

        # Fine-tune parameters for low-performing targets. The logged
        # learning_rate is a scalar, so it is scaled directly.
        fine_tune_grid = [
            {'learning_rate': [base_lr * 0.8], 'n_estimators': [250]},
            {'learning_rate': [base_lr * 1.2], 'n_estimators': [250]}
        ]
        best_score = param_log[model_name][target_str]['mean_accuracy']
        best_fine_params = best_params

        for params in ParameterGrid(fine_tune_grid):
            print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
            # Start from this target's logged parameters; the shared estimator
            # otherwise still carries whatever an earlier target left on it.
            candidate_params = {**best_params, **params}
            model.set_params(**candidate_params)
            fold_accuracies = []
            for train_idx, val_idx in kf.split(X_target, y_target, groups):
                X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
                y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
                w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
                if model_name == 'LightGBM' and model_info['callbacks']:
                    model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_val, y_val)], eval_metric='logloss', callbacks=model_info['callbacks'])
                elif model_name == 'XGBoost' and model_info['callbacks']:
                    model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_val, y_val)], verbose=False)
                else:
                    model.fit(X_tr, y_tr, sample_weight=w_tr)
                y_pred = model.predict(X_val)
                # Score against signed returns so both directions are credited.
                acc = weighted_accuracy(signed_returns.iloc[val_idx], y_pred, w_val)
                fold_accuracies.append(acc)
            mean_acc = np.mean(fold_accuracies)
            print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
            if mean_acc > best_score:
                best_score = mean_acc
                best_fine_params = candidate_params
                param_log[model_name][target_str]['best_params'] = best_fine_params
                param_log[model_name][target_str]['mean_accuracy'] = best_score
                param_summary.append({
                    'model': model_name,
                    'ID_TARGET': target_str,
                    'best_params': best_fine_params,
                    'mean_accuracy': best_score,
                    'fold_accuracies': fold_accuracies
                })

        # Retrain with fine-tuned parameters
        model.set_params(**best_fine_params)
        if model_name == 'LightGBM' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=model_info['callbacks'])
        elif model_name == 'XGBoost' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions.setdefault(target_str, {})[model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Write the per-model, per-target hyperparameter log as indented JSON into
# DATA_DIR, announce the path, then hand the file to the Colab download helper.
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
with open(param_log_path, mode='w') as f:
    json.dump(param_log, fp=f, indent=4)
print(f"Saved hyperparameter log to: {param_log_path}", flush=True)
files.download(param_log_path)

# Ensemble predictions: blend the two models' probabilities per
# (ID_TARGET, ID_DAY) group and turn the blended score into a +1 / -1 call.
final_predictions = {}
missing_day_targets = []
for target in X_test['ID_TARGET'].unique():
    target_str = str(target)
    # One mask per target; the day groups are then cut from this small slice
    # instead of re-scanning the whole test frame for every (target, day).
    target_days = X_test.loc[X_test['ID_TARGET'] == target, 'ID_DAY']
    target_preds = predictions.get(target_str, {})  # tolerate unseen targets
    lgbm_lookup = target_preds.get('LightGBM', {})
    xgb_lookup = target_preds.get('XGBoost', {})
    for day, day_rows in target_days.groupby(target_days, sort=False, dropna=False):
        day_test_ids = day_rows.index
        if not any(idx in lgbm_lookup or idx in xgb_lookup for idx in day_test_ids):
            print(f"Warning: No model predictions for ID_TARGET {target} on ID_DAY {day}. Using default prediction.")
            missing_day_targets.append((target, day))
            for idx in day_test_ids:
                final_predictions[idx] = -1
            continue
        lgbm_preds = np.array([lgbm_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        xgb_preds = np.array([xgb_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
        # Apply exponential moving average for smoothing; the last smoothed
        # value decides the whole group. For a single-row group the EWM is the
        # value itself, so the Series round-trip is skipped.
        if len(ensemble_preds) == 1:
            smoothed_preds = float(ensemble_preds[0])
        else:
            smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1]
        for idx in day_test_ids:
            final_predictions[idx] = 1 if smoothed_preds >= 0.5 else -1

if missing_day_targets:
    print(f"Missing day-target pairs: {missing_day_targets}")

# Create output CSV: one row per test ID with its +1 / -1 prediction.
output_df = pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET']).reset_index()
output_df.columns = ['ID', 'RET_TARGET']
expected_ids = set(X_test.index)
submission_ids = set(output_df['ID'])
missing_ids = expected_ids - submission_ids
if missing_ids:
    # Back-fill test rows that received no prediction with the default -1.
    print(f"Warning: Submission missing {len(missing_ids)} IDs: {missing_ids}")
    missing_df = pd.DataFrame({'ID': list(missing_ids), 'RET_TARGET': -1})
    output_df = pd.concat([output_df, missing_df], ignore_index=True)
# The source CSVs are loaded with a float dtype, so the IDs may be floats
# here; write them as integers so they never appear as "123.0" in the file.
output_df['ID'] = output_df['ID'].astype('int64')
output_df = output_df.sort_values('ID')
output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Validate output by reading the file back.
test_csv = pd.read_csv(output_path)
print(f"Output CSV shape: {test_csv.shape}", flush=True)
print(f"Output CSV ID range: {test_csv['ID'].min()} to {test_csv['ID'].max()}", flush=True)
print(f"Unique RET_TARGET values: {test_csv['RET_TARGET'].unique()}", flush=True)
print(f"Missing values: {test_csv.isna().sum().sum()}", flush=True)

# Download the file
files.download(output_path)

# Print final results
print("\nValidation Results:", flush=True)
for model_name, acc in results.items():
    print(f"{model_name}: {acc:.4f}", flush=True)

XGBoost version: 2.1.4
Selected top return columns: ['RET_262', 'RET_88', 'RET_172', 'RET_259', 'RET_150', 'RET_118', 'RET_216', 'RET_268', 'RET_115', 'RET_261', 'RET_59', 'RET_238', 'RET_121', 'RET_97', 'RET_30']

Training LightGBM...


LightGBM Targets: 100%|██████████| 100/100 [12:35<00:00,  7.55s/it]


LightGBM Weighted Accuracy: 0.7336

LightGBM Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.02, 'min_child_samples': 20, 'num_leaves': 15} | Mean Accuracy: 0.9173 | Fold Accuracies: [np.float64(0.7617021276595745), np.float64(1.0), np.float64(0.8406374501992032), np.float64(0.8604651162790697), np.float64(0.6867469879518072)]
ID_TARGET: 129.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.02, 'min_child_samples': 20, 'num_leaves': 15} | Mean Accuracy: 0.6481 | Fold Accuracies: [np.float64(0.636734693877551), np.float64(0.6097560975609756), np.float64(0.6325757575757576), np.float64(0.6371308016877637), np.float64(0.6509090909090909)]
ID_TARGET: 136.0 | Best Params: {'bagging_fraction': 0.65, 'learning_rate': 0.07, 'min_child_samples': 12, 'num_leaves': 47} | Mean Accuracy: 0.6070 | Fold Accuracies: [np.float64(0.6293436293436293), np.float64(0.6136363636363636), np.float64(0.6072727272727273), np.float64(0.58984375), np.floa

XGBoost Targets: 100%|██████████| 100/100 [25:53<00:00, 15.53s/it]


XGBoost Weighted Accuracy: 0.7601

XGBoost Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 150} | Mean Accuracy: 0.6171 | Fold Accuracies: [np.float64(0.6425531914893617), np.float64(0.58203125), np.float64(0.6175298804780877), np.float64(0.5697674418604651), np.float64(0.5823293172690763)]
ID_TARGET: 129.0 | Best Params: {'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 150} | Mean Accuracy: 0.6229 | Fold Accuracies: [np.float64(0.6081632653061224), np.float64(0.6260162601626016), np.float64(0.5871212121212122), np.float64(0.5780590717299579), np.float64(0.5745454545454546)]
ID_TARGET: 136.0 | Best Params: {'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 150} | Mean Accuracy: 1.0000 | Fold Accuracies: [np.float64(0.6293436293436293), np.float64(0.5568181818181818), np.float64(0.5781818181818181), np.float64(0.59765625), np.float64(0.5748987854251012

LightGBM Targets: 100%|██████████| 100/100 [00:55<00:00,  1.79it/s]


Second-pass XGBoost...



XGBoost Targets: 100%|██████████| 100/100 [01:09<00:00,  1.43it/s]


Performing third-pass training for low-performing targets...

Third-pass LightGBM...



LightGBM Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 52488.45it/s]


Third-pass XGBoost...



XGBoost Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 53399.70it/s]

Saved hyperparameter log to: /content/hyperparameters.json





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved predictions to: /content/predictions_ensemble.csv
Output CSV shape: (114468, 2)
Output CSV ID range: 0 to 114467
Unique RET_TARGET values: [ 1 -1]
Missing values: 0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Validation Results:
LightGBM: 0.7336
XGBoost: 0.7601


new method

In [None]:
import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid

# Report the installed XGBoost release; warn when its major version is below 1.
print(f"XGBoost version: {xgboost.__version__}")
if not int(xgboost.__version__.partition('.')[0]) >= 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Directory that holds the uploaded challenge CSV files.
DATA_DIR = '/content'

# Read the three ID-indexed frames as float32, plus the supplementary table
# with pandas' default dtypes. A missing file is re-raised with a hint.
try:
    X_train = pd.read_csv(
        os.path.join(DATA_DIR, 'X_train_itDkypA.csv'),
        index_col='ID',
        dtype=np.float32,
    )
    y_train = pd.read_csv(
        os.path.join(DATA_DIR, 'y_train_3LeeT2g.csv'),
        index_col='ID',
        dtype=np.float32,
    )
    X_test = pd.read_csv(
        os.path.join(DATA_DIR, 'X_test_Beg4ey3.csv'),
        index_col='ID',
        dtype=np.float32,
    )
    supplementary = pd.read_csv(
        os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv')
    )
except FileNotFoundError as e:
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.")

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights=None):
    """Return the weight-normalised share of correctly predicted directions.

    Both inputs are reduced to an up/down direction before comparison, so the
    metric works whether ``y_true`` holds signed returns or 0/1 class labels.

    Args:
        y_true: Array-like of signed returns or 0/1 labels; a value > 0 counts
            as "up", anything else as "down".
        y_pred: Array-like of predicted labels; a value equal to 1 counts as
            "up", anything else as "down".
        weights: Optional array-like of per-sample weights (their absolute
            value is used). When omitted, ``abs(y_true)`` is used.

    Returns:
        float: ``sum(w * correct) / sum(w)``, or 0.0 when the weights sum to 0.
    """
    true_values = np.asarray(y_true, dtype=float)
    true_up = true_values > 0
    pred_up = np.asarray(y_pred) == 1

    if weights is None:
        sample_weights = np.abs(true_values)
    else:
        sample_weights = np.abs(np.asarray(weights, dtype=float))

    total_weight = sample_weights.sum()
    if total_weight == 0:
        # Nothing to score; avoid a 0/0 division.
        return 0.0
    return float(np.sum(sample_weights * (true_up == pred_up)) / total_weight)

# Preprocess data with enhanced feature selection, momentum, and volatility
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Build the model feature matrix from the raw return columns.

    Steps: choose (or reuse) the ``RET_*`` columns that features are derived
    from, impute their gaps, add sector / row-statistic / cross-sectional rank
    / PCA features, then standardise every feature column.

    Two defects of the earlier version are fixed here:
      * the left merge on ``ID_TARGET`` replaced the ID index with a fresh
        RangeIndex, so features computed before the merge and all downstream
        ID lookups were aligned on the wrong labels; the index is now restored;
      * in training mode the selected ``top_cols`` were returned but not
        applied, so train features were derived from all return columns while
        test features used only ``top_cols``.

    Args:
        X: Frame with ``ID_DAY``, ``ID_TARGET`` and ``RET_<asset>`` columns.
            The selected return columns are imputed in place.
        y: Frame with a ``RET_TARGET`` column; required when ``is_train`` is
            true. A ``SIGN_TARGET`` column (1 if the return is > 0, else 0)
            is added to it in place.
        supplementary: Optional frame with ``ID_asset`` and ``CLASS_LEVEL_1``
            columns, used for sector imputation and sector features.
        is_train: Whether ``X`` is the training set.
        top_cols: Return columns to derive features from. When ``None`` in
            training mode, up to 15 columns are selected by
            variance * summed absolute correlation.

    Returns:
        ``(X, y, feature_cols, top_cols)``; ``y`` is ``None`` when
        ``is_train`` is false.

    Raises:
        ValueError: If ``is_train`` is true without ``y``, or if none of the
            requested ``top_cols`` exist in ``X``.
    """
    if is_train and y is None:
        raise ValueError("y must be provided when is_train=True.")

    ret_cols = [col for col in X.columns if col.startswith('RET_')]
    if is_train and top_cols is None:
        if len(ret_cols) > 15:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            valid_cols = variances[variances > 1e-5].index
            if len(valid_cols) < 15:
                top_cols = valid_cols.tolist()
            else:
                top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(15).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")
    if top_cols is not None:
        # Applied in both modes so train and test features share one column set.
        wanted = set(top_cols)
        ret_cols = [col for col in ret_cols if col in wanted]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    # Impute each selected column with the row-wise median of its sector,
    # falling back to the column median when no sector data is available.
    if supplementary is not None:
        sector_map = supplementary.set_index('ID_asset')['CLASS_LEVEL_1']
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            sector_median = None
            if pd.notna(sector):
                sector_assets = supplementary[supplementary['CLASS_LEVEL_1'] == sector]['ID_asset']
                sector_cols = [f'RET_{a}' for a in sector_assets if f'RET_{a}' in X.columns]
                if sector_cols:
                    sector_median = X[sector_cols].median(axis=1)
            if sector_median is not None:
                X[col] = X[col].fillna(sector_median)
            else:
                X[col] = X[col].fillna(X[col].median())

    new_cols = {}
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        selected = set(ret_cols)
        sector_groups = supplementary.groupby(level)['ID_asset'].apply(list).to_dict()
        for sector, assets in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in assets if f'RET_{asset}' in selected]
            if asset_cols:
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)

        # One row per asset guarantees the left merge keeps exactly one output
        # row per input row, in the original order.
        target_levels = (supplementary[['ID_asset', level]]
                         .drop_duplicates('ID_asset')
                         .rename(columns={'ID_asset': 'ID_TARGET', level: f'TARGET_{level}'}))
        original_index = X.index
        X = X.merge(target_levels, on='ID_TARGET', how='left')
        # merge() on a column returns a fresh RangeIndex; put the IDs back.
        X.index = original_index
        X = pd.concat([X, pd.get_dummies(X[f'TARGET_{level}'], prefix=f'TARGET_{level}', dummy_na=True)], axis=1)
        X = X.drop(f'TARGET_{level}', axis=1)

    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    # NOTE(review): these windows roll over consecutive rows of X in stored
    # order, not over days of a single target -- confirm this is intended.
    new_cols['MOMENTUM_7D'] = X[ret_cols].rolling(window=7, min_periods=1).mean().mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)
    new_cols['VOLATILITY_3D'] = X[ret_cols].rolling(window=3, min_periods=1).std().mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)
    new_cols.update({f'CS_RANK_{col}': X.groupby('ID_DAY')[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0) for col in ret_cols})

    if is_train:
        # Products of the five most correlated column pairs. These are only
        # created for the training frame.
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:5]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    if ret_cols:
        # NOTE(review): PCA (and the scaler below) are fitted on whichever
        # frame is passed in, so separate calls use separately fitted objects.
        pca = PCA(n_components=min(2, len(ret_cols)), random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        for i in range(pca_features.shape[1]):
            new_cols[f'PCA_{i}'] = pca_features[:, i]

    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)

    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))

    if is_train:
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Run feature engineering: the training call picks the return columns, the
# test call reuses that selection through top_cols.
try:
    X_train, y_train, feature_cols, top_cols = preprocess_data(
        X_train, y_train, supplementary, is_train=True
    )
    X_test, _, test_feature_cols, _ = preprocess_data(
        X_test, supplementary=supplementary, is_train=False, top_cols=top_cols
    )
except ValueError as preprocessing_error:
    print(f"Preprocessing error: {preprocessing_error}")
    raise

# Keep only the columns both frames ended up with, in training order.
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]
feature_cols = [col for col in common_cols if col != 'ID_DAY' and col != 'ID_TARGET']

# Re-order the labels so they follow the rows of X_train.
y_train = y_train.loc[X_train.index]

# Define models with intermediate hyperparameter grids.
# Each entry holds a base estimator, a list of single-point parameter grids
# (every dict expands to exactly one candidate), and an early-stopping callback.
models = {
    'LightGBM': {
        # Regularisation is set through reg_alpha / reg_lambda, the same names
        # the grid below tunes. The previous constructor used lambda_l1 /
        # lambda_l2; LightGBM treats reg_alpha / reg_lambda as aliases of those
        # and keeps the main name when both are present, so the grid's
        # regularisation values were silently ignored.
        'model': LGBMClassifier(num_leaves=31, min_child_samples=15, reg_alpha=0.3, reg_lambda=0.3,
                                subsample=0.7, colsample_bytree=0.8, bagging_freq=5, bagging_fraction=0.7,
                                random_state=42, n_jobs=1, verbosity=-1, class_weight='balanced', force_row_wise=True),
        'param_grid': [
            {'learning_rate': [0.02], 'num_leaves': [15], 'min_child_samples': [20], 'bagging_fraction': [0.6], 'reg_alpha': [0.1], 'reg_lambda': [0.1]},
            {'learning_rate': [0.05], 'num_leaves': [31], 'min_child_samples': [10], 'bagging_fraction': [0.7], 'reg_alpha': [0.2], 'reg_lambda': [0.2]},
            {'learning_rate': [0.07], 'num_leaves': [47], 'min_child_samples': [15], 'bagging_fraction': [0.8], 'reg_alpha': [0.3], 'reg_lambda': [0.3]}
        ],
        'callbacks': [lightgbm.early_stopping(stopping_rounds=10, verbose=False)]
    },
    'XGBoost': {
        # scale_pos_weight is the negative/positive ratio over all targets
        # pooled together, not per target.
        'model': XGBClassifier(max_depth=4, min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
                               random_state=42, n_jobs=1, eval_metric='logloss', verbosity=0,
                               scale_pos_weight=len(y_train[y_train['SIGN_TARGET'] == 0]) / len(y_train[y_train['SIGN_TARGET'] == 1])),
        'param_grid': [
            {'learning_rate': [0.03], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [150], 'reg_alpha': [0.1], 'reg_lambda': [1.0]},
            {'learning_rate': [0.05], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [200], 'reg_alpha': [0.2], 'reg_lambda': [1.5]},
            {'learning_rate': [0.07], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [150], 'reg_alpha': [0.3], 'reg_lambda': [1.0]}
        ],
        # NOTE(review): the XGBoost fit calls in this cell never receive this
        # callback; it only acts as a truthy flag that enables eval_set.
        'callbacks': [xgboost.callback.EarlyStopping(rounds=10, metric_name='logloss', save_best=True)]
    }
}

# Function to train and predict for a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks):
    """Grid-search, cross-validate and fit one estimator for a single ID_TARGET.

    Every candidate from ``param_grid`` is scored with ``kf`` (split on
    ``ID_DAY`` groups) using ``weighted_accuracy``. The best candidate is
    refitted on all rows of the target and used to predict the test rows.

    Args:
        target: ID_TARGET value to model.
        X_train, y_train: Training features and labels, sharing one index.
        X_test: Test features.
        feature_cols: Columns fed to the estimator.
        model: Estimator whose parameters are overwritten for each candidate.
        model_name: 'LightGBM' or 'XGBoost'; selects the fit-call variant.
        kf: Splitter exposing ``split(X, y, groups)``.
        param_grid: Grid accepted by ``ParameterGrid``.
        callbacks: Callbacks passed to LightGBM fits. For XGBoost they only
            switch on the evaluation set and are not forwarded.

    Returns:
        ``(str(target), best_params, best_score, param_results, test_ids,
        preds, top_features)``. Skipped targets get ``{}``, ``0``, ``[]``,
        all-zero predictions and ``[]`` in the respective positions.
    """
    in_train = X_train['ID_TARGET'] == target
    in_test = X_test['ID_TARGET'] == target

    X_target = X_train.loc[in_train, feature_cols]
    y_target = y_train.loc[in_train, 'SIGN_TARGET']
    # Samples are weighted by return magnitude; flagged targets count double.
    low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]
    weight_scale = 1.0
    if target in low_performing_targets:
        weight_scale = 2.0
    weights = y_train.loc[in_train, 'RET_TARGET'].abs() * weight_scale
    groups = X_train.loc[in_train, 'ID_DAY']
    X_test_target = X_test.loc[in_test, feature_cols]
    test_ids = X_test_target.index

    if len(X_target) < 100 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    def fit_estimator(estimator, X_fit, y_fit, w_fit, X_eval, y_eval):
        """Fit ``estimator`` with the call signature matching ``model_name``."""
        if callbacks and model_name == 'LightGBM':
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)],
                          eval_metric='logloss', callbacks=callbacks)
        elif callbacks and model_name == 'XGBoost':
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
        else:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit)

    param_results = []
    best_params, best_model = None, None
    best_score = 0

    for candidate in ParameterGrid(param_grid):
        print(f"Testing params for {model_name} ID_TARGET {target}: {candidate}", flush=True)
        try:
            model.set_params(**candidate)
        except ValueError as e:
            print(f"Invalid params for {model_name} ID_TARGET {target}: {e}")
            continue

        fold_accuracies = []
        for fit_rows, held_out_rows in kf.split(X_target, y_target, groups):
            y_held_out = y_target.iloc[held_out_rows]
            fit_estimator(model,
                          X_target.iloc[fit_rows], y_target.iloc[fit_rows], weights.iloc[fit_rows],
                          X_target.iloc[held_out_rows], y_held_out)
            fold_prediction = model.predict(X_target.iloc[held_out_rows])
            fold_accuracies.append(weighted_accuracy(y_held_out, fold_prediction, weights.iloc[held_out_rows]))

        mean_acc = np.mean(fold_accuracies)
        print(f"{model_name} | ID_TARGET: {target} | Params: {candidate} | Mean Accuracy: {mean_acc:.4f} | Fold Accuracies: {fold_accuracies}", flush=True)
        param_results.append({
            'params': candidate,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies
        })
        if mean_acc > best_score:
            # Snapshot an unfitted estimator carrying the winning configuration.
            best_score, best_params = mean_acc, candidate
            best_model = type(model)(**model.get_params())

    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    # Final fit on every row of the target; the evaluation set is the training data itself.
    best_model.set_params(**best_params)
    fit_estimator(best_model, X_target, y_target, weights, X_target, y_target)
    preds = best_model.predict_proba(X_test_target)[:, 1].astype(np.float32)

    ranked_features = sorted(zip(feature_cols, best_model.feature_importances_),
                             key=lambda item: item[1], reverse=True)
    top_features = list(dict(zip(feature_cols, best_model.feature_importances_)).items())
    top_features = sorted(top_features, key=lambda item: item[1], reverse=True)[:30]
    del ranked_features
    gc.collect()
    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training.
# For every model, all targets are trained in parallel with train_target and
# the per-target results are collected into the module-level containers below.
kf = GroupKFold(n_splits=5)
results = {}            # model name -> mean of the positive per-target CV scores
predictions = {}        # str(target) -> model name -> {test ID: probability}
param_log = {}          # model name -> str(target) -> best params / score / all candidates
top_features_all = {}   # model name -> str(target) -> [(feature, importance), ...]
param_summary = []      # flat list of per-target summaries for printing
low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]

for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train, y_train, X_test, feature_cols, model_info['model'], model_name, kf,
                             model_info['param_grid'], model_info['callbacks'])
        for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets")
    )
    accuracies = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))
        # Report the fold scores of the winning candidate. The previous code
        # took the last grid entry, so the printed folds could disagree with
        # the printed mean accuracy.
        best_entry = next((entry for entry in param_results if entry['params'] == params), None)
        param_summary.append({
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_entry['fold_accuracies'] if best_entry is not None else []
        })
    # Skipped targets report a score of 0 and are left out of the average.
    scored = [acc for acc in accuracies if acc > 0]
    results[model_name] = np.mean(scored) if scored else float('nan')
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)
    print(f"\n{model_name} Parameter Summary:", flush=True)
    for summary in param_summary:
        if summary['model'] == model_name:
            print(f"ID_TARGET: {summary['ID_TARGET']} | Best Params: {summary['best_params']} | Mean Accuracy: {summary['mean_accuracy']:.4f} | Fold Accuracies: {summary['fold_accuracies']}", flush=True)

# Second-pass training with top 30 features.
# Each target is refitted on its most important first-pass features, using
# the first-pass best parameters with n_estimators forced to 200. The result
# replaces the first-pass test predictions for that target and model.
print("\nPerforming second-pass training with top 30 features...", flush=True)
for model_name, model_info in models.items():
    print(f"\nSecond-pass {model_name}...", flush=True)
    for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        # Build the row masks once per target instead of once per lookup.
        train_mask = X_train['ID_TARGET'] == target
        test_mask = X_test['ID_TARGET'] == target
        X_target = X_train[train_mask][top_features]
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        weights = y_train.loc[train_mask, 'RET_TARGET'].abs()
        weight_scale = 2.0 if target in low_performing_targets else 1.0
        weights = weights * weight_scale
        X_test_target = X_test[test_mask][top_features]
        test_ids = X_test[test_mask].index
        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue
        model = model_info['model']
        # Work on a copy: the dict stored in param_log (and shared with
        # param_summary) records what cross-validation selected and must not
        # pick up the n_estimators override used for this refit.
        best_params = {**param_log[model_name][target_str]['best_params'], 'n_estimators': 200}
        model.set_params(**best_params)
        if model_name == 'LightGBM' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=model_info['callbacks'])
        elif model_name == 'XGBoost' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Third-pass training for low-performing targets.
# For each flagged target whose logged CV score is below 0.75, two fine-tuning
# candidates around the best parameters are cross-validated on the target's
# top features. If one beats the logged score, the model is refitted with it
# and the target's test predictions are replaced.
#
# Fixes over the previous version, which never did any work:
#   * targets are looked up under the key used by the earlier passes
#     (str() of the value stored in X_train, e.g. '139.0'), not str(139);
#   * the CV groups are defined here instead of relying on a name that only
#     exists inside train_target;
#   * best-parameter values are scalars, so they are no longer indexed with
#     [0], and candidates are iterated directly because ParameterGrid needs
#     list-valued grids.
print("\nPerforming third-pass training for low-performing targets...", flush=True)
target_keys = {float(t): str(t) for t in X_train['ID_TARGET'].unique()}
for model_name, model_info in models.items():
    print(f"\nThird-pass {model_name}...", flush=True)
    for target in tqdm(low_performing_targets, desc=f"{model_name} Low-Performing Targets"):
        target_str = target_keys.get(float(target))
        if target_str is None or target_str not in param_log[model_name]:
            continue
        if param_log[model_name][target_str]['mean_accuracy'] >= 0.75:
            continue
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        train_mask = X_train['ID_TARGET'] == target
        test_mask = X_test['ID_TARGET'] == target
        X_target = X_train[train_mask][top_features]
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        weights = y_train.loc[train_mask, 'RET_TARGET'].abs() * 2.0
        groups = X_train.loc[train_mask, 'ID_DAY']
        X_test_target = X_test[test_mask][top_features]
        test_ids = X_test[test_mask].index
        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue
        model = model_info['model']
        best_params = param_log[model_name][target_str]['best_params']
        tuned_alpha = best_params['reg_alpha'] + 0.1
        tuned_lambda = best_params['reg_lambda'] + 0.1
        fine_tune_grid = [
            {'learning_rate': best_params['learning_rate'] * 0.8, 'n_estimators': 200, 'reg_alpha': tuned_alpha, 'reg_lambda': tuned_lambda},
            {'learning_rate': best_params['learning_rate'] * 1.2, 'n_estimators': 250, 'reg_alpha': tuned_alpha, 'reg_lambda': tuned_lambda}
        ]
        best_score = param_log[model_name][target_str]['mean_accuracy']
        best_fine_params = None
        for overrides in fine_tune_grid:
            # Start from the full best configuration so every tuned parameter
            # is set explicitly on the shared estimator.
            params = {**best_params, **overrides}
            print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
            model.set_params(**params)
            fold_accuracies = []
            for train_idx, val_idx in kf.split(X_target, y_target, groups):
                X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
                y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
                w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
                if model_name == 'LightGBM' and model_info['callbacks']:
                    model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_val, y_val)], eval_metric='logloss', callbacks=model_info['callbacks'])
                elif model_name == 'XGBoost' and model_info['callbacks']:
                    model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_val, y_val)], verbose=False)
                else:
                    model.fit(X_tr, y_tr, sample_weight=w_tr)
                y_pred = model.predict(X_val)
                acc = weighted_accuracy(y_val, y_pred, w_val)
                fold_accuracies.append(acc)
            mean_acc = np.mean(fold_accuracies)
            print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
            if mean_acc > best_score:
                best_score = mean_acc
                best_fine_params = params
                param_log[model_name][target_str]['best_params'] = best_fine_params
                param_log[model_name][target_str]['mean_accuracy'] = best_score
                param_summary.append({
                    'model': model_name,
                    'ID_TARGET': target_str,
                    'best_params': best_fine_params,
                    'mean_accuracy': best_score,
                    'fold_accuracies': fold_accuracies
                })
        if best_fine_params is None:
            # No candidate beat the logged score: keep the existing
            # predictions rather than refitting with leftover settings.
            continue
        model.set_params(**best_fine_params)
        if model_name == 'LightGBM' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=model_info['callbacks'])
        elif model_name == 'XGBoost' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Write the collected hyperparameter log as indented JSON next to the data
# files, then hand the file to the browser through Colab.
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
with open(param_log_path, 'w') as f:
    f.write(json.dumps(param_log, indent=4))
print(f"Saved hyperparameter log to: {param_log_path}", flush=True)
files.download(param_log_path)

# Ensemble predictions with dynamic weights.
# For every test target the two models' probabilities are blended with weights
# proportional to their logged CV scores (0.5 each when unavailable). Each
# (target, day) group is labelled 1 when the last value of its exponentially
# smoothed blend is >= 0.5, otherwise -1.
final_predictions = {}
missing_day_targets = []  # kept for the report below; day groups are never empty
for target in X_test['ID_TARGET'].unique():
    target_str = str(target)
    # One mask per target; the days are then grouped within that slice instead
    # of re-scanning the whole test frame for every (target, day) pair.
    target_rows = X_test.loc[X_test['ID_TARGET'] == target, ['ID_DAY']]
    test_ids = target_rows.index
    lgbm_acc = param_log.get('LightGBM', {}).get(target_str, {}).get('mean_accuracy', 0.5)
    xgb_acc = param_log.get('XGBoost', {}).get(target_str, {}).get('mean_accuracy', 0.5)
    total_acc = lgbm_acc + xgb_acc
    ensemble_weights = {
        'LightGBM': lgbm_acc / total_acc if total_acc > 0 else 0.5,
        'XGBoost': xgb_acc / total_acc if total_acc > 0 else 0.5
    }
    # A target that only occurs in the test set has no stored predictions;
    # fall back to the neutral 0.5 probability instead of raising KeyError.
    target_predictions = predictions.get(target_str, {})
    lgbm_lookup = target_predictions.get('LightGBM', {})
    xgb_lookup = target_predictions.get('XGBoost', {})
    # dropna=False keeps rows with a missing day as their own group, so they
    # still receive a prediction.
    for day, day_rows in target_rows.groupby('ID_DAY', sort=False, dropna=False):
        day_test_ids = day_rows.index
        lgbm_preds = np.array([lgbm_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        xgb_preds = np.array([xgb_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
        smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1]
        day_label = 1 if smoothed_preds >= 0.5 else -1
        for idx in day_test_ids:
            final_predictions[idx] = day_label

if missing_day_targets:
    print(f"Missing day-target pairs: {missing_day_targets}")

# Turn the {ID: label} mapping into a two-column frame ordered by ID.
output_df = pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET'])
output_df = output_df.reset_index()
output_df.columns = ['ID', 'RET_TARGET']
output_df = output_df.sort_values('ID')

# Every test ID must appear in the submission; absent ones default to -1.
expected_ids = set(X_test.index)
submission_ids = set(output_df['ID'])
if submission_ids != expected_ids:
    missing_ids = expected_ids.difference(submission_ids)
    print(f"Warning: Submission missing {len(missing_ids)} IDs: {missing_ids}")
    missing_df = pd.DataFrame({'ID': list(missing_ids), 'RET_TARGET': -1})
    output_df = pd.concat([output_df, missing_df], ignore_index=True).sort_values('ID')

output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Read the file back and report a few sanity figures.
test_csv = pd.read_csv(output_path)
for report_line in (
    f"Output CSV shape: {test_csv.shape}",
    f"Output CSV ID range: {test_csv['ID'].min()} to {test_csv['ID'].max()}",
    f"Unique RET_TARGET values: {test_csv['RET_TARGET'].unique()}",
    f"Missing values: {test_csv.isna().sum().sum()}",
):
    print(report_line, flush=True)

# Offer the CSV for download through Colab.
files.download(output_path)

# Per-model cross-validation scores from the first pass.
print("\nValidation Results:", flush=True)
for model_name, acc in results.items():
    print(f"{model_name}: {acc:.4f}", flush=True)

XGBoost version: 2.1.4
Selected top return columns: ['RET_262', 'RET_88', 'RET_172', 'RET_259', 'RET_150', 'RET_118', 'RET_216', 'RET_268', 'RET_115', 'RET_261', 'RET_59', 'RET_238', 'RET_121', 'RET_97', 'RET_30']

Training LightGBM...


LightGBM Targets: 100%|██████████| 100/100 [13:03<00:00,  7.83s/it]


LightGBM Weighted Accuracy: 0.7317

LightGBM Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.02, 'min_child_samples': 20, 'num_leaves': 15, 'reg_alpha': 0.1, 'reg_lambda': 0.1} | Mean Accuracy: 0.8608 | Fold Accuracies: [np.float64(0.7106382978723405), np.float64(0.8125), np.float64(0.7051792828685259), np.float64(0.8178294573643411), np.float64(0.7791164658634538)]
ID_TARGET: 129.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.02, 'min_child_samples': 20, 'num_leaves': 15, 'reg_alpha': 0.1, 'reg_lambda': 0.1} | Mean Accuracy: 0.6497 | Fold Accuracies: [np.float64(0.6122448979591837), np.float64(0.6260162601626016), np.float64(0.6628787878787878), np.float64(0.5738396624472574), np.float64(0.7054545454545454)]
ID_TARGET: 136.0 | Best Params: {'bagging_fraction': 0.7, 'learning_rate': 0.05, 'min_child_samples': 10, 'num_leaves': 31, 'reg_alpha': 0.2, 'reg_lambda': 0.2} | Mean Accuracy: 0.6136 | Fold Accuracies: [np.float64(0.

XGBoost Targets: 100%|██████████| 100/100 [20:39<00:00, 12.40s/it]


XGBoost Weighted Accuracy: 0.7598

XGBoost Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'reg_alpha': 0.3, 'reg_lambda': 1.0, 'subsample': 0.8} | Mean Accuracy: 0.6562 | Fold Accuracies: [np.float64(0.7361702127659574), np.float64(0.60546875), np.float64(0.6812749003984063), np.float64(0.6317829457364341), np.float64(0.6265060240963856)]
ID_TARGET: 129.0 | Best Params: {'learning_rate': 0.05, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 200, 'reg_alpha': 0.2, 'reg_lambda': 1.5, 'subsample': 0.7} | Mean Accuracy: 0.6040 | Fold Accuracies: [np.float64(0.6122448979591837), np.float64(0.6300813008130082), np.float64(0.5795454545454546), np.float64(0.5822784810126582), np.float64(0.5636363636363636)]
ID_TARGET: 136.0 | Best Params: {'learning_rate': 0.03, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 150, 'reg_alpha': 0.1, 'reg_lambda': 1.0, 'subsample': 0.6} | Mean Accuracy: 0.5962 |

LightGBM Targets: 100%|██████████| 100/100 [01:04<00:00,  1.54it/s]


Second-pass XGBoost...



XGBoost Targets: 100%|██████████| 100/100 [01:16<00:00,  1.32it/s]


Performing third-pass training for low-performing targets...

Third-pass LightGBM...



LightGBM Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 47711.83it/s]


Third-pass XGBoost...



XGBoost Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 115343.36it/s]

Saved hyperparameter log to: /content/hyperparameters.json





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved predictions to: /content/predictions_ensemble.csv
Output CSV shape: (114468, 2)
Output CSV ID range: 0 to 114467
Unique RET_TARGET values: [ 1 -1]
Missing values: 0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Validation Results:
LightGBM: 0.7317
XGBoost: 0.7598


Model scoring between 73.9% and 74.2%

In [None]:

import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid

# Check xgboost version for compatibility: the XGBClassifier further down is
# built with eval_metric in its constructor, so very old releases get a warning.
print(f"XGBoost version: {xgboost.__version__}")
if int(xgboost.__version__.split('.')[0]) < 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Set up data directory
DATA_DIR = '/content'

# Load datasets. dtype=np.float32 is requested for the three challenge files,
# so identifier columns such as ID_TARGET are parsed as floats too.
try:
    X_train = pd.read_csv(os.path.join(DATA_DIR, 'X_train_itDkypA.csv'), index_col='ID', dtype=np.float32)
    y_train = pd.read_csv(os.path.join(DATA_DIR, 'y_train_3LeeT2g.csv'), index_col='ID', dtype=np.float32)
    X_test = pd.read_csv(os.path.join(DATA_DIR, 'X_test_Beg4ey3.csv'), index_col='ID', dtype=np.float32)
    supplementary = pd.read_csv(os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv'))
except FileNotFoundError as e:
    # Chain the original exception so the traceback keeps the failing path.
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.") from e

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights=None):
    """Return the weighted fraction of samples whose predicted direction is right.

    The previous implementation ignored ``weights`` and derived both the true
    direction and the sample weight from ``y_true``. With 0/1 class labels
    (which is what the training code passes) every negative sample got weight
    0, so the score collapsed to the recall of the positive class.

    Args:
        y_true: Ground truth as signed returns or 0/1 labels; a value > 0 is
            the positive ("up") class, anything else the negative class.
        y_pred: Predicted labels; 1 means "up", any other value means "down".
        weights: Optional per-sample weights (absolute values are used).
            Defaults to ``|y_true|``, which reproduces the return-weighted
            metric when ``y_true`` holds signed returns.

    Returns:
        A float in [0, 1]; 0.0 when the total weight is zero (including
        empty input) instead of NaN.
    """
    # Work on plain arrays so pandas index alignment cannot reorder samples.
    truth = np.asarray(y_true, dtype=float)
    predicted_up = np.asarray(y_pred) == 1
    if weights is None:
        sample_weights = np.abs(truth)
    else:
        sample_weights = np.abs(np.asarray(weights, dtype=float))

    total_weight = sample_weights.sum()
    if total_weight == 0:
        return 0.0

    hits = predicted_up == (truth > 0)
    return float(np.sum(sample_weights * hits) / total_weight)

# Preprocess data with enhanced feature selection
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Build the model feature matrix from the raw return columns.

    Steps: choose (or reuse) the ``RET_*`` columns that features are derived
    from, impute their gaps, add sector / row-statistic / cross-sectional rank
    / PCA features, then standardise every feature column.

    Two defects of the earlier version are fixed here:
      * the left merge on ``ID_TARGET`` replaced the ID index with a fresh
        RangeIndex, so features computed before the merge and all downstream
        ID lookups were aligned on the wrong labels; the index is now restored;
      * in training mode the selected ``top_cols`` were returned but not
        applied, so train features were derived from all return columns while
        test features used only ``top_cols``.

    Args:
        X: Frame with ``ID_DAY``, ``ID_TARGET`` and ``RET_<asset>`` columns.
            The selected return columns are imputed in place.
        y: Frame with a ``RET_TARGET`` column; required when ``is_train`` is
            true. A ``SIGN_TARGET`` column (1 if the return is > 0, else 0)
            is added to it in place.
        supplementary: Optional frame with ``ID_asset`` and ``CLASS_LEVEL_1``
            columns, used for sector imputation and sector features.
        is_train: Whether ``X`` is the training set.
        top_cols: Return columns to derive features from. When ``None`` in
            training mode, up to 15 columns are selected by
            variance * summed absolute correlation.

    Returns:
        ``(X, y, feature_cols, top_cols)``; ``y`` is ``None`` when
        ``is_train`` is false.

    Raises:
        ValueError: If ``is_train`` is true without ``y``, or if none of the
            requested ``top_cols`` exist in ``X``.
    """
    if is_train and y is None:
        raise ValueError("y must be provided when is_train=True.")

    ret_cols = [col for col in X.columns if col.startswith('RET_')]
    if is_train and top_cols is None:
        if len(ret_cols) > 15:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            valid_cols = variances[variances > 1e-5].index
            if len(valid_cols) < 15:
                top_cols = valid_cols.tolist()
            else:
                top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(15).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")
    if top_cols is not None:
        # Applied in both modes so train and test features share one column set.
        wanted = set(top_cols)
        ret_cols = [col for col in ret_cols if col in wanted]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    # Impute each selected column with the row-wise median of its sector,
    # falling back to the column median when no sector data is available.
    if supplementary is not None:
        sector_map = supplementary.set_index('ID_asset')['CLASS_LEVEL_1']
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            sector_median = None
            if pd.notna(sector):
                sector_assets = supplementary[supplementary['CLASS_LEVEL_1'] == sector]['ID_asset']
                sector_cols = [f'RET_{a}' for a in sector_assets if f'RET_{a}' in X.columns]
                if sector_cols:
                    sector_median = X[sector_cols].median(axis=1)
            if sector_median is not None:
                X[col] = X[col].fillna(sector_median)
            else:
                X[col] = X[col].fillna(X[col].median())

    new_cols = {}
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        selected = set(ret_cols)
        sector_groups = supplementary.groupby(level)['ID_asset'].apply(list).to_dict()
        for sector, assets in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in assets if f'RET_{asset}' in selected]
            if asset_cols:
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)

        # One row per asset guarantees the left merge keeps exactly one output
        # row per input row, in the original order.
        target_levels = (supplementary[['ID_asset', level]]
                         .drop_duplicates('ID_asset')
                         .rename(columns={'ID_asset': 'ID_TARGET', level: f'TARGET_{level}'}))
        original_index = X.index
        X = X.merge(target_levels, on='ID_TARGET', how='left')
        # merge() on a column returns a fresh RangeIndex; put the IDs back.
        X.index = original_index
        X = pd.concat([X, pd.get_dummies(X[f'TARGET_{level}'], prefix=f'TARGET_{level}', dummy_na=True)], axis=1)
        X = X.drop(f'TARGET_{level}', axis=1)

    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    new_cols.update({f'CS_RANK_{col}': X.groupby('ID_DAY')[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0) for col in ret_cols})

    if is_train:
        # Products of the five most correlated column pairs. These are only
        # created for the training frame.
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:5]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    if ret_cols:
        # NOTE(review): PCA (and the scaler below) are fitted on whichever
        # frame is passed in, so separate calls use separately fitted objects.
        pca = PCA(n_components=min(2, len(ret_cols)), random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        for i in range(pca_features.shape[1]):
            new_cols[f'PCA_{i}'] = pca_features[:, i]

    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)

    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))

    if is_train:
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Run feature engineering: the training call picks the return columns, the
# test call reuses that selection through top_cols.
try:
    X_train, y_train, feature_cols, top_cols = preprocess_data(
        X_train, y_train, supplementary, is_train=True
    )
    X_test, _, test_feature_cols, _ = preprocess_data(
        X_test, supplementary=supplementary, is_train=False, top_cols=top_cols
    )
except ValueError as preprocessing_error:
    print(f"Preprocessing error: {preprocessing_error}")
    raise

# Keep only the columns both frames ended up with, in training order.
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]
feature_cols = [col for col in common_cols if col != 'ID_DAY' and col != 'ID_TARGET']

# Re-order the labels so they follow the rows of X_train.
y_train = y_train.loc[X_train.index]

# Define models with intermediate hyperparameter grids.
# Each entry holds a base estimator, a list of single-point parameter grids
# (every dict expands to exactly one candidate), and an early-stopping callback.
models = {
    'LightGBM': {
        # Regularisation is set through reg_alpha / reg_lambda, the same names
        # the grid below tunes. The previous constructor used lambda_l1 /
        # lambda_l2; LightGBM treats reg_alpha / reg_lambda as aliases of those
        # and keeps the main name when both are present, so the grid's
        # regularisation values were silently ignored.
        'model': LGBMClassifier(num_leaves=31, min_child_samples=15, reg_alpha=0.3, reg_lambda=0.3,
                                subsample=0.7, colsample_bytree=0.8, bagging_freq=5, bagging_fraction=0.7,
                                random_state=42, n_jobs=1, verbosity=-1, class_weight='balanced', force_row_wise=True),
        'param_grid': [
            {'learning_rate': [0.01], 'num_leaves': [15], 'min_child_samples': [20], 'bagging_fraction': [0.6], 'reg_alpha': [0.0], 'reg_lambda': [0.0]},
            {'learning_rate': [0.03], 'num_leaves': [31], 'min_child_samples': [10], 'bagging_fraction': [0.7], 'reg_alpha': [0.1], 'reg_lambda': [0.1]},
            {'learning_rate': [0.05], 'num_leaves': [47], 'min_child_samples': [15], 'bagging_fraction': [0.8], 'reg_alpha': [0.2], 'reg_lambda': [0.2]},
            {'learning_rate': [0.07], 'num_leaves': [63], 'min_child_samples': [12], 'bagging_fraction': [0.65], 'reg_alpha': [0.3], 'reg_lambda': [0.3]},
            {'learning_rate': [0.02], 'num_leaves': [31], 'min_child_samples': [20], 'bagging_fraction': [0.75], 'reg_alpha': [0.1], 'reg_lambda': [0.1]}
        ],
        'callbacks': [lightgbm.early_stopping(stopping_rounds=12, verbose=False)]
    },
    'XGBoost': {
        # scale_pos_weight is the negative/positive ratio over all targets
        # pooled together, not per target.
        'model': XGBClassifier(max_depth=4, min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
                               random_state=42, n_jobs=1, eval_metric='logloss', verbosity=0,
                               scale_pos_weight=len(y_train[y_train['SIGN_TARGET'] == 0]) / len(y_train[y_train['SIGN_TARGET'] == 1])),
        'param_grid': [
            {'learning_rate': [0.01], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [150], 'reg_alpha': [0.0], 'reg_lambda': [1.0]},
            {'learning_rate': [0.03], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [200], 'reg_alpha': [0.1], 'reg_lambda': [1.5]},
            {'learning_rate': [0.05], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [150], 'reg_alpha': [0.2], 'reg_lambda': [1.0]},
            {'learning_rate': [0.07], 'max_depth': [6], 'min_child_weight': [1], 'subsample': [0.75], 'n_estimators': [200], 'reg_alpha': [0.3], 'reg_lambda': [1.5]},
            {'learning_rate': [0.02], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [200], 'reg_alpha': [0.1], 'reg_lambda': [1.2]}
        ],
        # NOTE(review): confirm this callback actually reaches the XGBoost fit
        # calls; the estimator above is not constructed with it.
        'callbacks': [xgboost.callback.EarlyStopping(rounds=12, metric_name='logloss', save_best=True)]
    }
}

# Function to train and predict for a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks):
    """Grid-search, refit and predict one model for a single ID_TARGET.

    Rows whose ``ID_TARGET`` equals ``target`` are selected from ``X_train`` and
    ``X_test``. Every combination in ``param_grid`` is cross-validated with
    ``kf`` (split on the rows' ``ID_DAY`` values), and the best combination is
    refit on all of the target's training rows to predict its test rows.

    Args:
        target: ID_TARGET value to train for.
        X_train: Training frame containing ``ID_TARGET``, ``ID_DAY`` and ``feature_cols``.
        y_train: Frame with ``SIGN_TARGET`` and ``RET_TARGET`` columns, indexed like ``X_train``.
        X_test: Test frame with the same columns as ``X_train``.
        feature_cols: Names of the columns fed to the model.
        model: Estimator instance; its parameters are changed via ``set_params``.
        model_name: ``'LightGBM'`` or ``'XGBoost'``; selects the ``fit`` call style.
        kf: Group-aware splitter exposing ``split(X, y, groups)``.
        param_grid: Grid specification accepted by ``ParameterGrid``.
        callbacks: Callback list; when empty/None a plain weighted ``fit`` is used.

    Returns:
        Tuple ``(str(target), best_params, best_score, param_results, test_ids,
        preds, top_features)``. When the target is skipped, or no parameter set
        scores above 0, the tuple is ``(str(target), {}, 0, [], test_ids, zeros, [])``.
    """
    # Build each row mask once instead of re-evaluating the comparison per use.
    train_mask = X_train['ID_TARGET'] == target
    test_mask = X_test['ID_TARGET'] == target

    X_target = X_train.loc[train_mask, feature_cols]
    y_target = y_train.loc[train_mask, 'SIGN_TARGET']
    ret_target = y_train.loc[train_mask, 'RET_TARGET']

    low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]
    weight_scale = 2.5 if target in low_performing_targets else 1.0
    if len(X_target) < 500:
        weight_scale *= 1.5
    weights = ret_target.abs() * weight_scale

    groups = X_train.loc[train_mask, 'ID_DAY']
    X_test_target = X_test.loc[test_mask, feature_cols]
    test_ids = X_test_target.index

    if len(X_target) < 100 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    def _fit(estimator, X_fit, y_fit, w_fit, X_eval, y_eval):
        # Single place for the library-specific fit signatures.
        if model_name == 'LightGBM' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)],
                          eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
        else:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit)

    param_results = []
    best_params = None
    best_score = 0
    best_model = None

    for params in ParameterGrid(param_grid):
        print(f"Testing params for {model_name} ID_TARGET {target}: {params}", flush=True)
        try:
            model.set_params(**params)
        except ValueError as e:
            print(f"Invalid params for {model_name} ID_TARGET {target}: {e}")
            continue
        fold_accuracies = []
        for train_idx, val_idx in kf.split(X_target, y_target, groups):
            X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
            y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
            w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
            _fit(model, X_tr, y_tr, w_tr, X_val, y_val)
            y_pred = model.predict(X_val)
            # weighted_accuracy takes sign() and abs() of y_true, so it must be
            # given the signed returns. With the 0/1 SIGN_TARGET labels every
            # negative sample gets weight 0 and the score degenerates to
            # positive-class recall.
            acc = weighted_accuracy(ret_target.iloc[val_idx], y_pred, w_val)
            fold_accuracies.append(acc)
        mean_acc = np.mean(fold_accuracies)
        print(f"{model_name} | ID_TARGET: {target} | Params: {params} | Mean Accuracy: {mean_acc:.4f} | Fold Accuracies: {fold_accuracies}", flush=True)
        param_results.append({
            'params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies
        })
        if mean_acc > best_score:
            best_score = mean_acc
            best_params = params
            # Fresh, unfitted estimator carrying the current parameter set.
            best_model = type(model)(**model.get_params())

    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    best_model.set_params(**best_params)
    # NOTE(review): the final fit monitors the training rows themselves, so any
    # early stopping here is driven by training loss, not held-out loss.
    _fit(best_model, X_target, y_target, weights, X_target, y_target)
    preds = best_model.predict_proba(X_test_target)[:, 1].astype(np.float32)
    importance = best_model.feature_importances_
    feature_importance = dict(zip(feature_cols, importance))
    top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:30]
    gc.collect()
    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training
# First pass: for every model, run train_target over all ID_TARGET values in
# parallel and collect the per-target parameters, scores, test predictions and
# top features into the module-level dictionaries below.
kf = GroupKFold(n_splits=6)
results = {}
predictions = {}
ensemble_weights = {'LightGBM': 0.7, 'XGBoost': 0.3}
param_log = {}
top_features_all = {}
param_summary = []
low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]

for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train, y_train, X_test, feature_cols, model_info['model'], model_name, kf,
                             model_info['param_grid'], model_info['callbacks'])
        for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets")
    )
    accuracies = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))
        # Report the folds of the winning parameter set. Taking the last grid
        # entry would pair the best mean with another configuration's folds.
        best_fold_accuracies = next(
            (entry['fold_accuracies'] for entry in param_results if entry['params'] == params),
            []
        )
        param_summary.append({
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_fold_accuracies
        })
    # Skipped targets report a score of 0 and are excluded from the average.
    scored = [acc for acc in accuracies if acc > 0]
    results[model_name] = np.mean(scored) if scored else float('nan')
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)
    print(f"\n{model_name} Parameter Summary:", flush=True)
    for summary in param_summary:
        if summary['model'] == model_name:
            print(f"ID_TARGET: {summary['ID_TARGET']} | Best Params: {summary['best_params']} | Mean Accuracy: {summary['mean_accuracy']:.4f} | Fold Accuracies: {summary['fold_accuracies']}", flush=True)

# Second-pass training with top 30 features
# Refit each target's model on only the top features found in the first pass,
# using the first-pass best parameters with n_estimators forced to 250, and
# overwrite that model's stored test predictions.
print("\nPerforming second-pass training with top 30 features...", flush=True)
for model_name, model_info in models.items():
    print(f"\nSecond-pass {model_name}...", flush=True)
    for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        train_mask = X_train['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        weights = y_train.loc[train_mask, 'RET_TARGET'].abs()
        weight_scale = 2.5 if target in low_performing_targets else 1.0
        if len(X_target) < 500:
            weight_scale *= 1.5
        weights = weights * weight_scale
        X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
        test_ids = X_test_target.index
        model = model_info['model']
        # Work on a copy: assigning into the logged dict would silently rewrite
        # the cross-validated "best_params" that are later saved to disk.
        best_params = {**param_log[model_name][target_str]['best_params'], 'n_estimators': 250}
        model.set_params(**best_params)
        if model_name == 'LightGBM' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=model_info['callbacks'])
        elif model_name == 'XGBoost' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Third-pass training for low-performing targets
# For each listed target whose logged score is below 0.75, cross-validate three
# variations of its best parameters on its top features. Only when a variation
# beats the logged score is the model refit and its test predictions replaced;
# otherwise the existing predictions are left untouched.
print("\nPerforming third-pass training for low-performing targets...", flush=True)
for model_name, model_info in models.items():
    print(f"\nThird-pass {model_name}...", flush=True)
    model = model_info['model']
    callbacks = model_info['callbacks']

    def _fit_third_pass(X_fit, y_fit, w_fit, X_eval, y_eval):
        # Library-specific fit call for the current model.
        if model_name == 'LightGBM' and callbacks:
            model.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            model.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
        else:
            model.fit(X_fit, y_fit, sample_weight=w_fit)

    # The log is keyed by str() of the float ID_TARGET values (e.g. '139.0'),
    # while low_performing_targets holds ints, so str(target) never matches.
    # Resolve keys by numeric value instead.
    log_keys = {float(key): key for key in param_log[model_name]}

    for target in tqdm(low_performing_targets, desc=f"{model_name} Low-Performing Targets"):
        target_str = log_keys.get(float(target))
        if target_str is None or param_log[model_name][target_str]['mean_accuracy'] >= 0.75:
            continue
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        train_mask = X_train['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        ret_target = y_train.loc[train_mask, 'RET_TARGET']
        weights = ret_target.abs() * 2.5
        if len(X_target) < 500:
            weights = weights * 1.5
        # Day labels for the group-aware split; previously undefined in this scope.
        groups = X_train.loc[train_mask, 'ID_DAY']
        X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
        test_ids = X_test_target.index

        best_params = param_log[model_name][target_str]['best_params']
        base_lr = best_params['learning_rate']
        base_alpha = best_params['reg_alpha']
        base_lambda = best_params['reg_lambda']
        # Complete parameter sets: the target's own best params plus overrides,
        # so settings left on the shared estimator by other targets cannot leak in.
        # Iterated directly because ParameterGrid only accepts list-valued grids.
        fine_tune_grid = [
            {**best_params, 'learning_rate': base_lr * 0.7, 'n_estimators': 300, 'reg_alpha': base_alpha + 0.1, 'reg_lambda': base_lambda + 0.1},
            {**best_params, 'learning_rate': base_lr * 1.0, 'n_estimators': 300, 'reg_alpha': base_alpha, 'reg_lambda': base_lambda},
            {**best_params, 'learning_rate': base_lr * 1.3, 'n_estimators': 350, 'reg_alpha': base_alpha + 0.1, 'reg_lambda': base_lambda + 0.1}
        ]
        # NOTE(review): the baseline comes from the first-pass log; confirm it
        # was computed with the same metric inputs as the folds below.
        best_score = param_log[model_name][target_str]['mean_accuracy']
        best_fine_params = None
        for params in fine_tune_grid:
            print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
            model.set_params(**params)
            fold_accuracies = []
            for train_idx, val_idx in kf.split(X_target, y_target, groups):
                X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
                y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
                w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
                _fit_third_pass(X_tr, y_tr, w_tr, X_val, y_val)
                y_pred = model.predict(X_val)
                # weighted_accuracy uses sign()/abs() of y_true, so score on the
                # signed returns. 0/1 labels would zero-weight every negative sample.
                acc = weighted_accuracy(ret_target.iloc[val_idx], y_pred, w_val)
                fold_accuracies.append(acc)
            mean_acc = np.mean(fold_accuracies)
            print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
            if mean_acc > best_score:
                best_score = mean_acc
                best_fine_params = params
                param_log[model_name][target_str]['best_params'] = best_fine_params
                param_log[model_name][target_str]['mean_accuracy'] = best_score
                param_summary.append({
                    'model': model_name,
                    'ID_TARGET': target_str,
                    'best_params': best_fine_params,
                    'mean_accuracy': best_score,
                    'fold_accuracies': fold_accuracies
                })
        if best_fine_params is None:
            # Nothing beat the logged score: keep the predictions already stored.
            continue
        model.set_params(**best_fine_params)
        _fit_third_pass(X_target, y_target, weights, X_target, y_target)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions.setdefault(target_str, {})[model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Save parameter log
# Serialise the per-model, per-target hyperparameter log as indented JSON in
# DATA_DIR and hand the file to the Colab download helper.
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
with open(param_log_path, 'w') as f:
    f.write(json.dumps(param_log, indent=4))
print(f"Saved hyperparameter log to: {param_log_path}", flush=True)
files.download(param_log_path)

# Ensemble predictions
# Blend the two models' probabilities per (ID_TARGET, ID_DAY) group, take the
# last value of a span-3 exponentially weighted mean over the group's rows, and
# map it to +1 / -1 for every test ID in the group.
final_predictions = {}
missing_day_targets = []

# Prediction keys are str() of float ID_TARGET values; resolve them by numeric
# value so the lookup does not depend on how a scalar happens to be formatted.
prediction_keys = {float(key): key for key in predictions}

# One groupby pass replaces a full-frame boolean mask per (target, day) pair.
# Rows with a NaN ID_TARGET or ID_DAY are not grouped and get no entry here.
day_target_keys = X_test[['ID_TARGET', 'ID_DAY']]
for (target, day), day_rows in day_target_keys.groupby(['ID_TARGET', 'ID_DAY'], sort=False):
    day_test_ids = day_rows.index
    target_str = prediction_keys.get(float(target))
    if target_str is None:
        # Target never seen in training: fall back to the default label
        # instead of failing on a missing key.
        print(f"Warning: No predictions for ID_TARGET {target} on ID_DAY {day}. Using default prediction.")
        missing_day_targets.append((target, day))
        for idx in day_test_ids:
            final_predictions[idx] = -1
        continue
    target_predictions = predictions[target_str]
    # A model without a prediction for an ID contributes a neutral 0.5.
    lgbm_preds = np.array([target_predictions.get('LightGBM', {}).get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
    xgb_preds = np.array([target_predictions.get('XGBoost', {}).get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
    ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
    smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1]
    day_label = 1 if smoothed_preds >= 0.5 else -1
    for idx in day_test_ids:
        final_predictions[idx] = day_label

if missing_day_targets:
    print(f"Missing day-target pairs: {missing_day_targets}")

# Create output CSV
# Turn the ID -> label mapping into a two-column frame sorted by ID, append a
# default label of -1 for any test ID that has no prediction, and write it out.
output_df = (
    pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET'])
    .reset_index()
    .set_axis(['ID', 'RET_TARGET'], axis=1)
    .sort_values('ID')
)
expected_ids = set(X_test.index)
submission_ids = set(output_df['ID'])
if submission_ids != expected_ids:
    missing_ids = expected_ids.difference(submission_ids)
    print(f"Warning: Submission missing {len(missing_ids)} IDs: {missing_ids}")
    missing_df = pd.DataFrame({'ID': list(missing_ids), 'RET_TARGET': -1})
    output_df = pd.concat([output_df, missing_df], ignore_index=True).sort_values('ID')
output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Validate output
# Read the written CSV back and print basic sanity facts about it.
test_csv = pd.read_csv(output_path)
for _report_line in (
    f"Output CSV shape: {test_csv.shape}",
    f"Output CSV ID range: {test_csv['ID'].min()} to {test_csv['ID'].max()}",
    f"Unique RET_TARGET values: {test_csv['RET_TARGET'].unique()}",
    f"Missing values: {test_csv.isna().sum().sum()}",
):
    print(_report_line, flush=True)

# Download the file
files.download(output_path)

# Print final results
# One line per model with its averaged first-pass validation score.
print("\nValidation Results:", flush=True)
for model_name, acc in results.items():
    print(f"{model_name}: {acc:.4f}", flush=True)


XGBoost version: 2.1.4
Selected top return columns: ['RET_262', 'RET_88', 'RET_172', 'RET_259', 'RET_150', 'RET_118', 'RET_216', 'RET_268', 'RET_115', 'RET_261', 'RET_59', 'RET_238', 'RET_121', 'RET_97', 'RET_30']

Training LightGBM...


LightGBM Targets: 100%|██████████| 100/100 [25:47<00:00, 15.48s/it]


LightGBM Weighted Accuracy: 0.7518

LightGBM Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.01, 'min_child_samples': 20, 'num_leaves': 15, 'reg_alpha': 0.0, 'reg_lambda': 0.0} | Mean Accuracy: 0.9016 | Fold Accuracies: [np.float64(0.9952830188679245), np.float64(0.8054298642533937), np.float64(0.827906976744186), np.float64(0.7210526315789474), np.float64(0.9901477832512315), np.float64(0.8221153846153846)]
ID_TARGET: 129.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.01, 'min_child_samples': 20, 'num_leaves': 15, 'reg_alpha': 0.0, 'reg_lambda': 0.0} | Mean Accuracy: 0.6700 | Fold Accuracies: [np.float64(0.6470588235294118), np.float64(0.645), np.float64(0.6570048309178744), np.float64(0.6747572815533981), np.float64(0.6132075471698113), np.float64(0.6108597285067874)]
ID_TARGET: 136.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.01, 'min_child_samples': 20, 'num_leaves': 15, 'reg_alpha': 0.0, 'reg_lambda': 

XGBoost Targets: 100%|██████████| 100/100 [41:29<00:00, 24.89s/it]


XGBoost Weighted Accuracy: 0.7638

XGBoost Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 150, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'subsample': 0.6} | Mean Accuracy: 0.7448 | Fold Accuracies: [np.float64(0.660377358490566), np.float64(0.6470588235294118), np.float64(0.641860465116279), np.float64(0.7315789473684211), np.float64(0.6157635467980296), np.float64(0.6346153846153846)]
ID_TARGET: 129.0 | Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 150, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'subsample': 0.6} | Mean Accuracy: 0.6432 | Fold Accuracies: [np.float64(0.5882352941176471), np.float64(0.67), np.float64(0.6666666666666666), np.float64(0.6310679611650486), np.float64(0.5849056603773585), np.float64(0.5656108597285068)]
ID_TARGET: 136.0 | Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 150, 'reg_alpha': 0.0, 'reg_l

LightGBM Targets: 100%|██████████| 100/100 [01:05<00:00,  1.52it/s]


Second-pass XGBoost...



XGBoost Targets: 100%|██████████| 100/100 [01:25<00:00,  1.17it/s]


Performing third-pass training for low-performing targets...

Third-pass LightGBM...



LightGBM Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 53648.07it/s]


Third-pass XGBoost...



XGBoost Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 36792.14it/s]

Saved hyperparameter log to: /content/hyperparameters.json





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved predictions to: /content/predictions_ensemble.csv
Output CSV shape: (114468, 2)
Output CSV ID range: 0 to 114467
Unique RET_TARGET values: [ 1 -1]
Missing values: 0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Validation Results:
LightGBM: 0.7518
XGBoost: 0.7638


try 2 -- 73.9%

In [None]:
import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid

# Check xgboost version for compatibility
# Print the installed version and warn when its major component is below 1.
print(f"XGBoost version: {xgboost.__version__}")
if int(xgboost.__version__.split('.', 1)[0]) < 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Set up data directory
DATA_DIR = '/content'

# Load datasets
# The three challenge files are read as float32 and indexed by ID; the
# supplementary file is read with pandas' default dtypes.
try:
    X_train, y_train, X_test = (
        pd.read_csv(os.path.join(DATA_DIR, csv_name), index_col='ID', dtype=np.float32)
        for csv_name in ('X_train_itDkypA.csv', 'y_train_3LeeT2g.csv', 'X_test_Beg4ey3.csv')
    )
    supplementary = pd.read_csv(os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv'))
except FileNotFoundError as e:
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.")

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights):
    """Return the share of total |y_true| whose sign was predicted correctly.

    ``y_pred`` values equal to 1 count as a positive call and every other value
    as a negative call. Each sample is weighted by ``abs(y_true)``.
    ``weights`` is accepted for call compatibility but is not used.
    """
    magnitudes = np.abs(y_true)
    predicted_sign = np.where(y_pred == 1, 1, -1)
    hits = (predicted_sign == np.sign(y_true)).astype(float)
    return np.sum(magnitudes * hits) / np.sum(magnitudes)

# Preprocess data with enhanced feature selection
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Impute, engineer features for, and standardize one feature frame.

    Steps, in order: choose the working set of ``RET_*`` columns, fill their
    missing values, add sector averages and one-hot target-sector columns (when
    ``supplementary`` is given), add cross-sectional mean/std/rank features,
    add pairwise product features (training only), add up to two PCA
    components, and standardize every column except ``ID_DAY``/``ID_TARGET``.

    Args:
        X: Feature frame with ``ID_DAY``, ``ID_TARGET`` and ``RET_*`` columns.
            Missing values in the working ``RET_*`` columns are filled in place
            on the passed frame when ``supplementary`` is given.
        y: Target frame with a ``RET_TARGET`` column; required when
            ``is_train`` is true. A ``SIGN_TARGET`` column is added to it.
        supplementary: Optional frame with ``ID_asset`` and ``CLASS_LEVEL_1``.
        is_train: Whether ``X`` is the training frame.
        top_cols: ``RET_*`` columns to restrict to. When omitted for training,
            up to 15 columns are selected and returned.

    Returns:
        ``(X, y, feature_cols, top_cols)`` for training and
        ``(X, None, feature_cols, top_cols)`` otherwise.

    Raises:
        ValueError: If ``is_train`` is true without ``y``, or if no ``RET_*``
            column remains after applying ``top_cols``.
    """
    if is_train and y is None:
        raise ValueError("y is required when is_train=True.")

    ret_cols = [col for col in X.columns if col.startswith('RET_')]

    # Dynamic feature selection with minimum variance threshold
    if is_train and top_cols is None:
        if len(ret_cols) > 15:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            # Filter out low-variance features (threshold = 1e-5)
            valid_cols = variances[variances > 1e-5].index
            if len(valid_cols) < 15:
                top_cols = valid_cols.tolist()
            else:
                top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(15).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")

    # Restrict the working columns on BOTH paths. Previously only the
    # non-training call was narrowed, so MEAN_RET, STD_RET, the PCA components,
    # the sector averages and the imputation were computed over all RET_*
    # columns for training but over top_cols for test.
    if top_cols is not None:
        selected = set(top_cols)
        ret_cols = [col for col in ret_cols if col in selected]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    # Fill missing returns with the row-wise median of same-sector assets,
    # falling back to the column median when no sector columns are available.
    sector_medians = {}
    if supplementary is not None:
        sector_map = supplementary.set_index('ID_asset')['CLASS_LEVEL_1']
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            if pd.notna(sector):
                sector_assets = supplementary[supplementary['CLASS_LEVEL_1'] == sector]['ID_asset']
                sector_cols = [f'RET_{a}' for a in sector_assets if f'RET_{a}' in X.columns]
                if sector_cols:
                    sector_medians[col] = X[sector_cols].median(axis=1)
            if col in sector_medians:
                X[col] = X[col].fillna(sector_medians[col])
            else:
                X[col] = X[col].fillna(X[col].median())

    # Sector-based features
    new_cols = {}
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        sector_groups = supplementary.groupby(level)['ID_asset'].apply(list).to_dict()
        for sector, assets in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in assets if f'RET_{asset}' in ret_cols]
            if asset_cols:
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)
        # NOTE(review): merging on a column yields a fresh positional index, so
        # the ID index of X is not carried through - confirm callers expect this.
        X = X.merge(supplementary[['ID_asset', level]].rename(columns={'ID_asset': 'ID_TARGET', level: f'TARGET_{level}'}),
                    on='ID_TARGET', how='left')
        X = pd.concat([X, pd.get_dummies(X[f'TARGET_{level}'], prefix=f'TARGET_{level}', dummy_na=True)], axis=1)
        X = X.drop(f'TARGET_{level}', axis=1)

    # Cross-sectional features
    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    new_cols.update({f'CS_RANK_{col}': X.groupby('ID_DAY')[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0) for col in ret_cols})

    # Correlation-based features (top 5 pairs); only built for the training frame
    if is_train:
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        # i < j keeps one ordering of each pair and drops the diagonal.
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:5]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    # PCA features (2 components)
    # NOTE(review): PCA and the scaler below are fit on whichever frame is
    # passed in, so train and test are projected and scaled independently.
    if ret_cols:
        pca = PCA(n_components=min(2, len(ret_cols)), random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        for i in range(pca_features.shape[1]):
            new_cols[f'PCA_{i}'] = pca_features[:, i]

    # Add new features to DataFrame
    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)

    # Standardize numerical features
    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))

    if is_train:
        # Binary label: 1 for a strictly positive return, 0 otherwise.
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Preprocess data
# Train is processed first so the return columns it selects can be reused for
# test. A ValueError is reported and then re-raised unchanged.
try:
    X_train, y_train, feature_cols, top_cols = preprocess_data(
        X_train, y_train, supplementary, is_train=True
    )
    X_test, _, test_feature_cols, _ = preprocess_data(
        X_test, supplementary=supplementary, is_train=False, top_cols=top_cols
    )
except ValueError as e:
    print(f"Preprocessing error: {e}")
    raise

# Align columns
# Keep only columns present in both frames; model features are those columns
# minus the two identifier columns.
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]
feature_cols = [col for col in common_cols if col not in ('ID_DAY', 'ID_TARGET')]

# Align y_train
# Reorder/select the target rows by the training frame's index labels.
y_train = y_train.loc[X_train.index]

# Define models with expanded hyperparameter grids
# Each entry holds a base estimator, a list of single-combination parameter
# grids to try per target, and the early-stopping callbacks for that library.
models = {
    'LightGBM': {
        # L1/L2 regularisation is set under the same names the grid tunes
        # (reg_alpha / reg_lambda). Setting it as lambda_l1 / lambda_l2, which
        # LightGBM documents as the primary names for these aliases, makes the
        # constructor value take precedence, so the grid values have no effect.
        'model': LGBMClassifier(num_leaves=31, min_child_samples=15, reg_alpha=0.3, reg_lambda=0.3,
                                subsample=0.7, colsample_bytree=0.8, bagging_freq=5, bagging_fraction=0.7,
                                random_state=42, n_jobs=1, verbosity=-1, class_weight='balanced', force_row_wise=True),
        'param_grid': [
            {'learning_rate': [0.01], 'num_leaves': [15], 'min_child_samples': [20], 'bagging_fraction': [0.6], 'reg_alpha': [0.0], 'reg_lambda': [0.0]},
            {'learning_rate': [0.03], 'num_leaves': [31], 'min_child_samples': [10], 'bagging_fraction': [0.7], 'reg_alpha': [0.1], 'reg_lambda': [0.1]},
            {'learning_rate': [0.05], 'num_leaves': [47], 'min_child_samples': [15], 'bagging_fraction': [0.8], 'reg_alpha': [0.2], 'reg_lambda': [0.2]},
            {'learning_rate': [0.07], 'num_leaves': [63], 'min_child_samples': [12], 'bagging_fraction': [0.65], 'reg_alpha': [0.3], 'reg_lambda': [0.3]},
            {'learning_rate': [0.1], 'num_leaves': [31], 'min_child_samples': [20], 'bagging_fraction': [0.75], 'reg_alpha': [0.0], 'reg_lambda': [0.1]},
            {'learning_rate': [0.02], 'num_leaves': [47], 'min_child_samples': [10], 'bagging_fraction': [0.7], 'reg_alpha': [0.2], 'reg_lambda': [0.0]}
        ],
        'callbacks': [lightgbm.early_stopping(stopping_rounds=15, verbose=False)]
    },
    'XGBoost': {
        # scale_pos_weight is the negative/positive label ratio over the whole
        # training set, not per target.
        'model': XGBClassifier(max_depth=4, min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
                               random_state=42, n_jobs=1, eval_metric='logloss', verbosity=0,
                               scale_pos_weight=len(y_train[y_train['SIGN_TARGET'] == 0]) / len(y_train[y_train['SIGN_TARGET'] == 1])),
        'param_grid': [
            {'learning_rate': [0.01], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [150], 'reg_alpha': [0.0], 'reg_lambda': [1.0]},
            {'learning_rate': [0.03], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [200], 'reg_alpha': [0.1], 'reg_lambda': [1.5]},
            {'learning_rate': [0.05], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [150], 'reg_alpha': [0.2], 'reg_lambda': [1.0]},
            {'learning_rate': [0.07], 'max_depth': [6], 'min_child_weight': [1], 'subsample': [0.75], 'n_estimators': [200], 'reg_alpha': [0.3], 'reg_lambda': [1.5]},
            {'learning_rate': [0.1], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [250], 'reg_alpha': [0.0], 'reg_lambda': [1.0]},
            {'learning_rate': [0.02], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.65], 'n_estimators': [200], 'reg_alpha': [0.1], 'reg_lambda': [1.2]}
        ],
        'callbacks': [xgboost.callback.EarlyStopping(rounds=15, metric_name='logloss', save_best=True)]
    }
}

# Function to train and predict for a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks):
    """Grid-search, fit and predict one model for a single ID_TARGET.

    Every combination in ``param_grid`` is scored with grouped
    cross-validation (groups are the ``ID_DAY`` values). The best combination
    is refit on all training rows of the target and used to predict the
    matching test rows.

    Args:
        target: ID_TARGET value selecting the rows to use.
        X_train: Training features; must contain ``ID_TARGET``, ``ID_DAY`` and
            ``feature_cols``.
        y_train: Frame aligned with ``X_train`` that holds ``RET_TARGET`` and
            ``SIGN_TARGET``.
        X_test: Test features with the same columns as ``X_train``.
        feature_cols: Names of the columns fed to the model.
        model: Estimator whose parameters are updated in place by the search.
        model_name: ``'LightGBM'`` or ``'XGBoost'``; selects the fit call.
        kf: Splitter exposing ``split(X, y, groups)``.
        param_grid: Grid accepted by ``ParameterGrid``.
        callbacks: Callback list; passed to ``fit`` for LightGBM only. For
            XGBoost a truthy value only switches on the validation ``eval_set``.

    Returns:
        Tuple ``(str(target), best_params, best_score, param_results,
        test_ids, preds, top_features)``. When the target is skipped (fewer
        than 100 training rows, no test rows, or no parameter set scored above
        zero) the tuple holds empty params/results and all-zero predictions.
    """
    train_mask = X_train['ID_TARGET'] == target
    test_mask = X_test['ID_TARGET'] == target

    X_target = X_train.loc[train_mask, feature_cols]
    y_target = y_train.loc[train_mask, 'SIGN_TARGET']
    # The metric needs the signed returns: with the 0/1 labels it takes
    # np.sign() of the labels and degenerates into recall on the positives.
    ret_target = y_train.loc[train_mask, 'RET_TARGET']
    weights = ret_target.abs()
    low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]
    # Dynamic weight scaling based on sample size
    weight_scale = 2.0 if target in low_performing_targets else 1.0
    if len(X_target) < 500:
        weight_scale *= 1.5  # Boost weights for small targets
    weights = weights * weight_scale
    groups = X_train.loc[train_mask, 'ID_DAY']
    X_test_target = X_test.loc[test_mask, feature_cols]
    test_ids = X_test_target.index

    def fit(estimator, X_fit, y_fit, w_fit, X_eval, y_eval):
        # One place for the three fit variants used by the search and the refit.
        if model_name == 'LightGBM' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)],
                          eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
        else:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit)

    # Check for sufficient data
    if len(X_target) < 100 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    param_results = []
    best_params = None
    best_score = 0
    best_model = None

    # First-pass grid search with sample weights
    for params in ParameterGrid(param_grid):
        print(f"Testing params for {model_name} ID_TARGET {target}: {params}", flush=True)
        try:
            model.set_params(**params)
        except ValueError as e:
            print(f"Invalid params for {model_name} ID_TARGET {target}: {e}")
            continue
        fold_accuracies = []
        for train_idx, val_idx in kf.split(X_target, y_target, groups):
            X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
            y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
            w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]

            fit(model, X_tr, y_tr, w_tr, X_val, y_val)
            y_pred = model.predict(X_val)
            # Score against the signed returns of the validation rows.
            acc = weighted_accuracy(ret_target.iloc[val_idx], y_pred, w_val)
            fold_accuracies.append(acc)

        mean_acc = np.mean(fold_accuracies)
        print(f"{model_name} | ID_TARGET: {target} | Params: {params} | Mean Accuracy: {mean_acc:.4f} | Fold Accuracies: {fold_accuracies}", flush=True)
        param_results.append({
            'params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies
        })
        if mean_acc > best_score:
            best_score = mean_acc
            best_params = params
            # Fresh, unfitted estimator carrying the winning configuration.
            best_model = type(model)(**model.get_params())

    # Train with best parameters
    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    best_model.set_params(**best_params)
    # NOTE(review): the refit monitors the training rows themselves, so early
    # stopping has no held-out signal here.
    fit(best_model, X_target, y_target, weights, X_target, y_target)

    # Predict
    preds = best_model.predict_proba(X_test_target)[:, 1].astype(np.float32)

    # Feature importance
    importance = best_model.feature_importances_
    feature_importance = dict(zip(feature_cols, importance))
    top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:30]

    # Clean up memory
    gc.collect()

    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training
kf = GroupKFold(n_splits=7)  # Increased to 7 folds
results = {}
predictions = {}
ensemble_weights = {'LightGBM': 0.7, 'XGBoost': 0.3}
param_log = {}
top_features_all = {}
param_summary = []
low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]

for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}

    # First-pass training: one train_target job per ID_TARGET
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train, y_train, X_test, feature_cols, model_info['model'], model_name, kf,
                             model_info['param_grid'], model_info['callbacks'])
        for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets")
    )

    # Collect results
    accuracies = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))
        # Report the folds of the winning parameter set; the last grid entry
        # is not necessarily the best one.
        best_folds = next(
            (entry['fold_accuracies'] for entry in param_results if entry['params'] == params),
            []
        )
        param_summary.append({
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_folds
        })

    # Exclude skipped targets (score 0); NaN when nothing was scored at all
    scored = [acc for acc in accuracies if acc > 0]
    results[model_name] = np.mean(scored) if scored else float('nan')
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)

    # Print parameter summary
    print(f"\n{model_name} Parameter Summary:", flush=True)
    for summary in param_summary:
        if summary['model'] == model_name:
            print(f"ID_TARGET: {summary['ID_TARGET']} | Best Params: {summary['best_params']} | Mean Accuracy: {summary['mean_accuracy']:.4f} | Fold Accuracies: {summary['fold_accuracies']}", flush=True)

# Second-pass training with top 30 features
# Each target is refit on the 30 most important features found in the first
# pass, and the resulting test predictions replace the first-pass ones.
print("\nPerforming second-pass training with top 30 features...", flush=True)
for model_name, model_info in models.items():
    print(f"\nSecond-pass {model_name}...", flush=True)
    model = model_info['model']
    callbacks = model_info['callbacks']
    for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        train_mask = X_train['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        weights = y_train.loc[train_mask, 'RET_TARGET'].abs()
        weight_scale = 2.0 if target in low_performing_targets else 1.0
        if len(X_target) < 500:
            weight_scale *= 1.5
        weights = weights * weight_scale
        X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
        test_ids = X_test_target.index

        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue

        # Work on a copy so the n_estimators override does not leak into the
        # hyperparameters recorded in param_log (and later saved to JSON).
        best_params = dict(param_log[model_name][target_str]['best_params'])
        best_params['n_estimators'] = 300  # Use integer instead of list
        model.set_params(**best_params)
        if model_name == 'LightGBM' and callbacks:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Third- and fourth-pass training for low-performing targets
#
# Both passes share one procedure and differ only in the skip threshold and in
# the candidate adjustments applied to the logged best parameters. Each
# candidate is (learning-rate multiplier, n_estimators, amount added to both
# reg_alpha and reg_lambda).
THIRD_PASS_CANDIDATES = [(0.7, 400, 0.1), (1.3, 400, 0.0)]
FOURTH_PASS_CANDIDATES = [(0.5, 400, 0.2), (1.5, 400, 0.1), (1.0, 500, 0.0)]


def _find_target_key(logged, target):
    """Return the key of ``logged`` that denotes ``target``, or None.

    The log is keyed by ``str()`` of the ID_TARGET values read from the data
    (e.g. '139.0') while ``low_performing_targets`` holds ints, so the match is
    done numerically instead of on the string form.
    """
    for key in logged:
        try:
            if float(key) == float(target):
                return key
        except ValueError:
            continue
    return None


def _fit_model(model, model_name, callbacks, X_fit, y_fit, w_fit, X_eval, y_eval):
    """Fit ``model`` using the call variant that matches ``model_name``."""
    if model_name == 'LightGBM' and callbacks:
        model.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], eval_metric='logloss', callbacks=callbacks)
    elif model_name == 'XGBoost' and callbacks:
        model.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
    else:
        model.fit(X_fit, y_fit, sample_weight=w_fit)


def _grouped_cv_scores(model, model_name, callbacks, X_target, y_target, returns, weights, groups):
    """Return the per-fold weighted accuracy of ``model`` under ``kf``."""
    fold_accuracies = []
    for train_idx, val_idx in kf.split(X_target, y_target, groups):
        X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
        y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
        w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
        _fit_model(model, model_name, callbacks, X_tr, y_tr, w_tr, X_val, y_val)
        y_pred = model.predict(X_val)
        # Score against the signed returns, not the 0/1 labels.
        fold_accuracies.append(weighted_accuracy(returns.iloc[val_idx], y_pred, w_val))
    return fold_accuracies


def _fine_tune_low_performers(model_name, model_info, desc, skip_threshold, candidates):
    """Fine-tune and refit every low-performing target of one model.

    A target is processed when its logged mean accuracy is below
    ``skip_threshold``. The logged best parameters are first re-scored on the
    top-feature subset so that candidates are compared on equal terms; a
    candidate replaces them only if it scores strictly higher. The model is
    then refit on all rows of the target and its test predictions overwrite
    the stored ones.
    """
    model = model_info['model']
    callbacks = model_info['callbacks']
    logged = param_log[model_name]
    for target in tqdm(low_performing_targets, desc=desc):
        target_str = _find_target_key(logged, target)
        if target_str is None or logged[target_str]['mean_accuracy'] >= skip_threshold:
            continue  # Unknown target or already performing well
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        train_mask = X_train['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        returns = y_train.loc[train_mask, 'RET_TARGET']
        weights = returns.abs() * 3.0  # Increase weight
        if len(X_target) < 500:
            weights = weights * 1.5
        groups = X_train.loc[train_mask, 'ID_DAY']
        X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
        test_ids = X_test_target.index

        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue

        # Copy so the log entry is only changed when a candidate really wins.
        base_params = dict(logged[target_str]['best_params'])
        model.set_params(**base_params)
        best_score = np.mean(_grouped_cv_scores(model, model_name, callbacks, X_target, y_target, returns, weights, groups))
        best_fine_params = base_params
        print(f"{model_name} | ID_TARGET: {target} | Baseline Params: {base_params} | Mean Accuracy: {best_score:.4f}", flush=True)

        for lr_mult, n_estimators, reg_delta in candidates:
            # Candidates override a few keys of the base configuration; the
            # remaining keys (tree shape, subsampling, ...) are kept.
            params = dict(
                base_params,
                learning_rate=base_params['learning_rate'] * lr_mult,
                n_estimators=n_estimators,
                reg_alpha=base_params['reg_alpha'] + reg_delta,
                reg_lambda=base_params['reg_lambda'] + reg_delta,
            )
            print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
            model.set_params(**params)
            fold_accuracies = _grouped_cv_scores(model, model_name, callbacks, X_target, y_target, returns, weights, groups)
            mean_acc = np.mean(fold_accuracies)
            print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
            if mean_acc > best_score:
                best_score = mean_acc
                best_fine_params = params
                logged[target_str]['best_params'] = best_fine_params
                logged[target_str]['mean_accuracy'] = best_score
                param_summary.append({
                    'model': model_name,
                    'ID_TARGET': target_str,
                    'best_params': best_fine_params,
                    'mean_accuracy': best_score,
                    'fold_accuracies': fold_accuracies
                })

        # Retrain with fine-tuned parameters
        model.set_params(**best_fine_params)
        _fit_model(model, model_name, callbacks, X_target, y_target, weights, X_target, y_target)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()


# Third-pass training for low-performing targets
print("\nPerforming third-pass training for low-performing targets...", flush=True)
for model_name, model_info in models.items():
    print(f"\nThird-pass {model_name}...", flush=True)
    _fine_tune_low_performers(model_name, model_info, f"{model_name} Low-Performing Targets",
                              0.75, THIRD_PASS_CANDIDATES)

# Fourth-pass training for very low-performing targets
print("\nPerforming fourth-pass training for very low-performing targets...", flush=True)
for model_name, model_info in models.items():
    print(f"\nFourth-pass {model_name}...", flush=True)
    _fine_tune_low_performers(model_name, model_info, f"{model_name} Very Low-Performing Targets",
                              0.70, FOURTH_PASS_CANDIDATES)

# Persist the per-model, per-target hyperparameter log as JSON and offer it
# for download from the Colab session.
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
serialized_log = json.dumps(param_log, indent=4)
with open(param_log_path, 'w') as f:
    f.write(serialized_log)
print(f"Saved hyperparameter log to: {param_log_path}", flush=True)
files.download(param_log_path)

# Ensemble predictions
# Blend the per-model probabilities for every (ID_TARGET, ID_DAY) group of
# test rows and turn the blended value into a +1 / -1 label.
final_predictions = {}
# Kept for compatibility with later cells; groups are built from rows that
# exist, so a (target, day) pair can no longer come up empty.
missing_day_targets = []
for target in X_test['ID_TARGET'].unique():
    target_str = str(target)
    # A target that was never trained has no entry; fall back to 0.5 per row
    # instead of raising KeyError.
    target_preds = predictions.get(target_str, {})
    lgbm_by_id = target_preds.get('LightGBM', {})
    xgb_by_id = target_preds.get('XGBoost', {})

    # Group this target's rows by day once instead of re-scanning the whole
    # test frame for every (target, day) pair.
    target_days = X_test.loc[X_test['ID_TARGET'] == target, 'ID_DAY']
    for _day, day_rows in target_days.groupby(target_days, sort=False, dropna=False):
        day_test_ids = day_rows.index
        lgbm_preds = np.array([lgbm_by_id.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        xgb_preds = np.array([xgb_by_id.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
        # Apply exponential moving average for smoothing; the last smoothed
        # value decides the label shared by every row of the group.
        smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1] if len(ensemble_preds) > 0 else 0.5
        label = 1 if smoothed_preds >= 0.5 else -1
        for idx in day_test_ids:
            final_predictions[idx] = label

# Build the submission table (one row per test ID), back-fill any ID that
# received no prediction with -1, and write it to disk.
output_df = pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET'])
output_df = output_df.reset_index()
output_df.columns = ['ID', 'RET_TARGET']
output_df = output_df.sort_values('ID')

expected_ids = set(X_test.index)
submission_ids = set(output_df['ID'])
if submission_ids != expected_ids:
    missing_ids = expected_ids.difference(submission_ids)
    print(f"Warning: Submission missing {len(missing_ids)} IDs: {missing_ids}")
    missing_df = pd.DataFrame({'ID': list(missing_ids), 'RET_TARGET': -1})
    output_df = pd.concat([output_df, missing_df], ignore_index=True)
    output_df = output_df.sort_values('ID')

output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Re-read the written CSV and print a few sanity checks on it
test_csv = pd.read_csv(output_path)
id_values = test_csv['ID']
sanity_lines = (
    f"Output CSV shape: {test_csv.shape}",
    f"Output CSV ID range: {id_values.min()} to {id_values.max()}",
    f"Unique RET_TARGET values: {test_csv['RET_TARGET'].unique()}",
    f"Missing values: {test_csv.isna().sum().sum()}",
)
for line in sanity_lines:
    print(line, flush=True)

# Offer the submission file for download
files.download(output_path)

# Per-model cross-validation summary gathered during the first pass
print("\nValidation Results:", flush=True)
for model_name in results:
    acc = results[model_name]
    print(f"{model_name}: {acc:.4f}", flush=True)

XGBoost version: 2.1.4
Selected top return columns: ['RET_262', 'RET_88', 'RET_172', 'RET_259', 'RET_150', 'RET_118', 'RET_216', 'RET_268', 'RET_115', 'RET_261', 'RET_59', 'RET_238', 'RET_121', 'RET_97', 'RET_30']

Training LightGBM...


LightGBM Targets: 100%|██████████| 100/100 [33:29<00:00, 20.10s/it]


LightGBM Weighted Accuracy: 0.7546

LightGBM Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.01, 'min_child_samples': 20, 'num_leaves': 15, 'reg_alpha': 0.0, 'reg_lambda': 0.0} | Mean Accuracy: 0.9423 | Fold Accuracies: [np.float64(0.7219251336898396), np.float64(0.7456647398843931), np.float64(0.8770949720670391), np.float64(0.7614213197969543), np.float64(0.8132530120481928), np.float64(0.7666666666666667), np.float64(0.8143712574850299)]
ID_TARGET: 129.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.01, 'min_child_samples': 20, 'num_leaves': 15, 'reg_alpha': 0.0, 'reg_lambda': 0.0} | Mean Accuracy: 0.6764 | Fold Accuracies: [np.float64(0.550561797752809), np.float64(0.6628571428571428), np.float64(0.6793478260869565), np.float64(0.6091954022988506), np.float64(0.6892655367231638), np.float64(0.6703910614525139), np.float64(0.605)]
ID_TARGET: 136.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.01, 'min_child_

XGBoost Targets: 100%|██████████| 100/100 [52:22<00:00, 31.43s/it]


XGBoost Weighted Accuracy: 0.8488

XGBoost Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 150, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'subsample': 0.6} | Mean Accuracy: 0.7675 | Fold Accuracies: [np.float64(0.732620320855615), np.float64(0.6936416184971098), np.float64(0.7932960893854749), np.float64(0.6345177664974619), np.float64(0.8313253012048193), np.float64(0.7833333333333333), np.float64(0.7844311377245509)]
ID_TARGET: 129.0 | Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 150, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'subsample': 0.6} | Mean Accuracy: 0.6427 | Fold Accuracies: [np.float64(0.5224719101123596), np.float64(0.6742857142857143), np.float64(0.6195652173913043), np.float64(0.5977011494252874), np.float64(0.6440677966101694), np.float64(0.6815642458100558), np.float64(0.57)]
ID_TARGET: 136.0 | Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'mi

LightGBM Targets: 100%|██████████| 100/100 [01:13<00:00,  1.36it/s]


Second-pass XGBoost...



XGBoost Targets: 100%|██████████| 100/100 [01:31<00:00,  1.10it/s]


Performing third-pass training for low-performing targets...

Third-pass LightGBM...



LightGBM Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 88555.36it/s]


Third-pass XGBoost...



XGBoost Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 54024.99it/s]


Performing fourth-pass training for very low-performing targets...

Fourth-pass LightGBM...



LightGBM Very Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 102985.14it/s]


Fourth-pass XGBoost...



XGBoost Very Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 89240.51it/s]

Saved hyperparameter log to: /content/hyperparameters.json





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved predictions to: /content/predictions_ensemble.csv
Output CSV shape: (114468, 2)
Output CSV ID range: 0 to 114467
Unique RET_TARGET values: [ 1 -1]
Missing values: 0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Validation Results:
LightGBM: 0.7546
XGBoost: 0.8488


Another attempt - 73%

In [None]:
import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid

# Report the installed XGBoost release and warn on pre-1.0 builds
xgb_version = xgboost.__version__
print(f"XGBoost version: {xgb_version}")
xgb_major = int(xgb_version.split('.')[0])
if xgb_major < 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Directory the CSV files are expected in
DATA_DIR = '/content'

# Read the three ID-indexed float32 tables plus the supplementary table
numeric_csv_options = {'index_col': 'ID', 'dtype': np.float32}
try:
    X_train = pd.read_csv(os.path.join(DATA_DIR, 'X_train_itDkypA.csv'), **numeric_csv_options)
    y_train = pd.read_csv(os.path.join(DATA_DIR, 'y_train_3LeeT2g.csv'), **numeric_csv_options)
    X_test = pd.read_csv(os.path.join(DATA_DIR, 'X_test_Beg4ey3.csv'), **numeric_csv_options)
    supplementary = pd.read_csv(os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv'))
except FileNotFoundError as e:
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.")

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights=None):
    """Return the weighted share of correctly predicted directions.

    Args:
        y_true: Signed returns or 0/1 labels; a value > 0 counts as "up",
            everything else as "down".
        y_pred: Predicted labels; 1 counts as "up", anything else as "down".
        weights: Non-negative weight per sample. When omitted, ``|y_true|`` is
            used, which is only meaningful if ``y_true`` holds signed returns.

    Returns:
        ``sum(weights[correct]) / sum(weights)``, or NaN when the weights sum
        to zero.
    """
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred)
    if weights is None:
        weights = np.abs(y_true)
    else:
        weights = np.asarray(weights, dtype=np.float64)

    # Map both sides to +1 / -1 so 0/1 labels and signed returns are handled
    # alike; np.sign() of a 0 label is 0 and could never match a prediction.
    true_direction = np.where(y_true > 0, 1, -1)
    pred_direction = np.where(y_pred == 1, 1, -1)

    total_weight = np.sum(weights)
    if total_weight == 0:
        return np.nan
    return np.sum(weights * (true_direction == pred_direction)) / total_weight

# Preprocess data with enhanced feature selection
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Impute, engineer and standardize features for one data split.

    Args:
        X: Feature frame with ``ID_DAY``, ``ID_TARGET`` and ``RET_*`` columns.
            Its ``RET_*`` columns are imputed in place.
        y: Target frame with ``RET_TARGET``; required when ``is_train`` is
            True. A ``SIGN_TARGET`` column (1 if the return is > 0, else 0) is
            added to it in place.
        supplementary: Optional frame with ``ID_asset`` and ``CLASS_LEVEL_1``;
            enables sector imputation, sector averages and target-sector
            dummies.
        is_train: Whether ``X`` is the training split.
        top_cols: Return columns to build derived features from. When None on
            the training split, up to 15 columns are selected by
            variance * summed absolute correlation.

    Returns:
        ``(X, y, feature_cols, top_cols)``; ``y`` is None for non-training
        splits.

    Raises:
        ValueError: If ``is_train`` is True without ``y``, or if no return
            column remains after applying ``top_cols``.

    NOTE(review): the PCA and the StandardScaler are fitted on whichever split
    is passed in, so train and test are transformed with different fits. The
    return shape leaves no room to hand fitted transformers back to the caller.
    """
    if is_train and y is None:
        raise ValueError("y is required when is_train=True.")

    ret_cols = [col for col in X.columns if col.startswith('RET_')]

    # Dynamic feature selection with minimum variance threshold
    if is_train and top_cols is None:
        if len(ret_cols) > 15:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            # Filter out low-variance features (threshold = 1e-5)
            valid_cols = variances[variances > 1e-5].index
            if len(valid_cols) < 15:
                top_cols = valid_cols.tolist()
            else:
                top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(15).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")

    # Restrict the derived features to the selected columns on every split, so
    # the training features are built from the same columns as the test ones.
    if top_cols is not None:
        selected = set(top_cols)
        ret_cols = [col for col in ret_cols if col in selected]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    # Cache sector-based medians
    sector_medians = {}
    if supplementary is not None:
        sector_map = supplementary.set_index('ID_asset')['CLASS_LEVEL_1']
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            if pd.notna(sector):
                sector_assets = supplementary[supplementary['CLASS_LEVEL_1'] == sector]['ID_asset']
                sector_cols = [f'RET_{a}' for a in sector_assets if f'RET_{a}' in X.columns]
                if sector_cols:
                    # Row-wise median over all return columns of the same sector
                    sector_medians[col] = X[sector_cols].median(axis=1)
            if col in sector_medians:
                X[col] = X[col].fillna(sector_medians[col])
            else:
                X[col] = X[col].fillna(X[col].median())

    # Sector-based features
    new_cols = {}
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        sector_groups = supplementary.groupby(level)['ID_asset'].apply(list).to_dict()
        for sector, assets in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in assets if f'RET_{asset}' in ret_cols]
            if asset_cols:
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)
        # A column merge returns a fresh RangeIndex. Restore the original row
        # index afterwards so the rows keep their IDs and the Series collected
        # above still align. De-duplicating the lookup keeps the row count
        # unchanged.
        target_level_lookup = (
            supplementary[['ID_asset', level]]
            .drop_duplicates('ID_asset')
            .rename(columns={'ID_asset': 'ID_TARGET', level: f'TARGET_{level}'})
        )
        row_index = X.index
        X = X.merge(target_level_lookup, on='ID_TARGET', how='left')
        X.index = row_index
        X = pd.concat([X, pd.get_dummies(X[f'TARGET_{level}'], prefix=f'TARGET_{level}', dummy_na=True)], axis=1)
        X = X.drop(f'TARGET_{level}', axis=1)

    # Cross-sectional features
    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    new_cols.update({f'CS_RANK_{col}': X.groupby('ID_DAY')[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0) for col in ret_cols})

    # Correlation-based features (top 5 pairs)
    # NOTE(review): built for the training split only, so any caller that
    # intersects train and test columns will drop them again.
    if is_train:
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:5]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    # PCA features (2 components)
    if ret_cols:
        pca = PCA(n_components=min(2, len(ret_cols)), random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        for i in range(pca_features.shape[1]):
            new_cols[f'PCA_{i}'] = pca_features[:, i]

    # Add new features to DataFrame
    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)

    # Standardize numerical features
    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))

    if is_train:
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Run the preprocessing on both splits; the test split reuses the return
# columns selected on the training split.
try:
    X_train, y_train, feature_cols, top_cols = preprocess_data(
        X_train, y=y_train, supplementary=supplementary, is_train=True
    )
    X_test, _, test_feature_cols, _ = preprocess_data(
        X_test, supplementary=supplementary, is_train=False, top_cols=top_cols
    )
except ValueError as e:
    print(f"Preprocessing error: {e}")
    raise

# Keep only the columns present in both splits
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]
id_columns = ('ID_DAY', 'ID_TARGET')
feature_cols = [col for col in common_cols if col not in id_columns]

# Put y_train in the same row order as X_train
y_train = y_train.loc[X_train.index]

# Define models with expanded hyperparameter grids.
#
# Each entry holds:
#   'model'      - the base estimator (its parameters are overridden by the grid),
#   'param_grid' - a list of single-point grids, i.e. 12 hand-picked configurations,
#   'callbacks'  - early-stopping callbacks for that library.
#
# NOTE: the LightGBM base estimator uses the scikit-learn names `reg_alpha` /
# `reg_lambda` (not the aliases `lambda_l1` / `lambda_l2`) so that the grid,
# which tunes `reg_alpha` / `reg_lambda`, actually takes effect. When both an
# alias and the primary name are supplied, LightGBM keeps only one of them, so
# mixing the two spellings silently pins the regularisation to the constructor
# value. For the same reason only `bagging_fraction` is given (no `subsample`).
models = {
    'LightGBM': {
        'model': LGBMClassifier(num_leaves=31, min_child_samples=15, reg_alpha=0.3, reg_lambda=0.3,
                                colsample_bytree=0.8, bagging_freq=5, bagging_fraction=0.7,
                                random_state=42, n_jobs=1, verbosity=-1, class_weight='balanced', force_row_wise=True),
        'param_grid': [
            {'learning_rate': [0.005], 'num_leaves': [15], 'min_child_samples': [10], 'bagging_fraction': [0.6], 'reg_alpha': [0.0], 'reg_lambda': [0.0]},
            {'learning_rate': [0.01], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.1], 'reg_lambda': [0.1]},
            {'learning_rate': [0.03], 'num_leaves': [47], 'min_child_samples': [20], 'bagging_fraction': [0.8], 'reg_alpha': [0.2], 'reg_lambda': [0.2]},
            {'learning_rate': [0.05], 'num_leaves': [63], 'min_child_samples': [12], 'bagging_fraction': [0.65], 'reg_alpha': [0.3], 'reg_lambda': [0.3]},
            {'learning_rate': [0.07], 'num_leaves': [79], 'min_child_samples': [30], 'bagging_fraction': [0.75], 'reg_alpha': [0.4], 'reg_lambda': [0.4]},
            {'learning_rate': [0.1], 'num_leaves': [15], 'min_child_samples': [10], 'bagging_fraction': [0.6], 'reg_alpha': [0.0], 'reg_lambda': [0.1]},
            {'learning_rate': [0.15], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.2], 'reg_lambda': [0.0]},
            {'learning_rate': [0.005], 'num_leaves': [47], 'min_child_samples': [20], 'bagging_fraction': [0.8], 'reg_alpha': [0.1], 'reg_lambda': [0.3]},
            {'learning_rate': [0.01], 'num_leaves': [63], 'min_child_samples': [12], 'bagging_fraction': [0.65], 'reg_alpha': [0.3], 'reg_lambda': [0.2]},
            {'learning_rate': [0.03], 'num_leaves': [79], 'min_child_samples': [30], 'bagging_fraction': [0.75], 'reg_alpha': [0.4], 'reg_lambda': [0.1]},
            {'learning_rate': [0.05], 'num_leaves': [15], 'min_child_samples': [10], 'bagging_fraction': [0.6], 'reg_alpha': [0.0], 'reg_lambda': [0.4]},
            {'learning_rate': [0.07], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.2], 'reg_lambda': [0.0]}
        ],
        'callbacks': [lightgbm.early_stopping(stopping_rounds=15, verbose=False)]
    },
    'XGBoost': {
        # scale_pos_weight = (#negative rows) / (#positive rows) over the whole training set
        'model': XGBClassifier(max_depth=4, min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
                               random_state=42, n_jobs=1, eval_metric='logloss', verbosity=0,
                               scale_pos_weight=len(y_train[y_train['SIGN_TARGET'] == 0]) / len(y_train[y_train['SIGN_TARGET'] == 1])),
        'param_grid': [
            {'learning_rate': [0.005], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [100], 'reg_alpha': [0.0], 'reg_lambda': [1.0]},
            {'learning_rate': [0.01], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [0.5], 'reg_lambda': [1.5]},
            {'learning_rate': [0.03], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [200], 'reg_alpha': [1.0], 'reg_lambda': [2.0]},
            {'learning_rate': [0.05], 'max_depth': [6], 'min_child_weight': [4], 'subsample': [0.65], 'n_estimators': [250], 'reg_alpha': [0.0], 'reg_lambda': [1.0]},
            {'learning_rate': [0.07], 'max_depth': [7], 'min_child_weight': [5], 'subsample': [0.75], 'n_estimators': [300], 'reg_alpha': [0.5], 'reg_lambda': [1.5]},
            {'learning_rate': [0.1], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [100], 'reg_alpha': [1.0], 'reg_lambda': [2.0]},
            {'learning_rate': [0.15], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [0.0], 'reg_lambda': [1.0]},
            {'learning_rate': [0.005], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [200], 'reg_alpha': [0.5], 'reg_lambda': [1.5]},
            {'learning_rate': [0.01], 'max_depth': [6], 'min_child_weight': [4], 'subsample': [0.65], 'n_estimators': [250], 'reg_alpha': [1.0], 'reg_lambda': [2.0]},
            {'learning_rate': [0.03], 'max_depth': [7], 'min_child_weight': [5], 'subsample': [0.75], 'n_estimators': [300], 'reg_alpha': [0.0], 'reg_lambda': [1.0]},
            {'learning_rate': [0.05], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [100], 'reg_alpha': [0.5], 'reg_lambda': [1.5]},
            {'learning_rate': [0.07], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [1.0], 'reg_lambda': [2.0]}
        ],
        # NOTE(review): the training code in this cell only tests this list for
        # truthiness; it is never handed to XGBClassifier, so no XGBoost early
        # stopping actually happens. Confirm whether that is intended.
        'callbacks': [xgboost.callback.EarlyStopping(rounds=15, metric_name='logloss', save_best=True)]
    }
}

# Function to train and predict for a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks):
    """Grid-search, refit and predict one model for a single ``ID_TARGET``.

    Args:
        target: Value of ``ID_TARGET`` selecting the rows to work on.
        X_train: Training features; must contain ``ID_TARGET``, ``ID_DAY`` and ``feature_cols``.
        y_train: Frame aligned with ``X_train`` holding ``SIGN_TARGET`` (0/1) and ``RET_TARGET``.
        X_test: Test features with the same columns as ``X_train``.
        feature_cols: Names of the columns fed to the model.
        model: Base estimator; its parameters are overwritten by each grid point.
        model_name: ``'LightGBM'`` or ``'XGBoost'``; selects the ``fit`` call signature.
        kf: Group-aware splitter; groups are the ``ID_DAY`` values.
        param_grid: Grid specification accepted by ``ParameterGrid``.
        callbacks: Callbacks for ``fit``. For XGBoost they only act as an on/off
            flag for passing ``eval_set`` (they are not forwarded to ``fit``).

    Returns:
        Tuple ``(str(target), best_params, best_score, param_results, test_ids,
        preds, top_features)``. ``preds`` are positive-class probabilities for
        the test rows. When the target is skipped or no grid point scores above
        zero, the tuple carries ``{}``, ``0``, ``[]``, zero predictions and ``[]``.
    """
    train_mask = X_train['ID_TARGET'] == target
    test_mask = X_test['ID_TARGET'] == target

    X_target = X_train.loc[train_mask, feature_cols]
    y_target = y_train.loc[train_mask, 'SIGN_TARGET']
    returns = y_train.loc[train_mask, 'RET_TARGET']
    groups = X_train.loc[train_mask, 'ID_DAY']
    X_test_target = X_test.loc[test_mask, feature_cols]
    test_ids = X_test_target.index

    low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]
    # Dynamic weight scaling based on sample size
    weight_scale = 2.0 if target in low_performing_targets else 1.0
    if len(X_target) < 500:
        weight_scale *= 1.5  # Boost weights for small targets
    weights = returns.abs() * weight_scale

    # Check for sufficient data
    if len(X_target) < 100 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    def fit(estimator, X_fit, y_fit, w_fit, X_eval, y_eval):
        """Fit ``estimator`` with the library-specific fit signature."""
        if model_name == 'LightGBM' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)],
                          eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
        else:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit)

    param_results = []
    best_params = None
    best_score = 0
    best_model = None

    # First-pass grid search with sample weights
    for params in ParameterGrid(param_grid):
        print(f"Testing params for {model_name} ID_TARGET {target}: {params}", flush=True)
        try:
            model.set_params(**params)
        except ValueError as e:
            print(f"Invalid params for {model_name} ID_TARGET {target}: {e}")
            continue
        fold_accuracies = []
        for train_idx, val_idx in kf.split(X_target, y_target, groups):
            X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
            y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
            w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]

            fit(model, X_tr, y_tr, w_tr, X_val, y_val)
            y_pred = model.predict(X_val)
            # Score against the signed returns, not the 0/1 labels: the metric
            # takes sign() and abs() of its first argument, so 0/1 labels would
            # give every negative-class row zero weight and reduce the score
            # to recall on the positive class.
            acc = weighted_accuracy(returns.iloc[val_idx], y_pred, w_val)
            fold_accuracies.append(acc)

        mean_acc = np.mean(fold_accuracies)
        print(f"{model_name} | ID_TARGET: {target} | Params: {params} | Mean Accuracy: {mean_acc:.4f} | Fold Accuracies: {fold_accuracies}", flush=True)
        param_results.append({
            'params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies
        })
        if mean_acc > best_score:
            best_score = mean_acc
            best_params = params
            # Fresh, unfitted copy carrying the currently configured parameters
            best_model = type(model)(**model.get_params())

    # Train with best parameters
    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    best_model.set_params(**best_params)
    # The final fit uses the training rows themselves as the evaluation set
    fit(best_model, X_target, y_target, weights, X_target, y_target)

    # Predict positive-class probabilities for the test rows of this target
    preds = best_model.predict_proba(X_test_target)[:, 1].astype(np.float32)

    # Feature importance: keep the 30 highest-scoring features
    importance = best_model.feature_importances_
    feature_importance = dict(zip(feature_cols, importance))
    top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:30]

    # Clean up memory
    gc.collect()

    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training
kf = GroupKFold(n_splits=7)  # Increased to 7 folds
results = {}            # model name -> mean CV score over the trained targets
predictions = {}        # target key -> model name -> {test ID: probability}
ensemble_weights = {'LightGBM': 0.7, 'XGBoost': 0.3}
param_log = {}          # model name -> target key -> best params / score / all grid results
top_features_all = {}   # model name -> target key -> [(feature, importance), ...]
param_summary = []
low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]

for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}

    # First-pass training: one train_target job per ID_TARGET
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train, y_train, X_test, feature_cols, model_info['model'], model_name, kf,
                             model_info['param_grid'], model_info['callbacks'])
        for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets")
    )

    # Collect results
    accuracies = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))
        # Report the folds of the winning grid point (the first entry whose mean
        # equals the returned best score), not those of the last grid point tried.
        best_folds = next(
            (entry['fold_accuracies'] for entry in param_results if entry['mean_accuracy'] == mean_acc),
            [],
        )
        param_summary.append({
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_folds
        })

    # Exclude skipped targets (score 0); report NaN instead of averaging an empty list
    trained_accuracies = [acc for acc in accuracies if acc > 0]
    results[model_name] = np.mean(trained_accuracies) if trained_accuracies else float('nan')
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)

    # Print parameter summary
    print(f"\n{model_name} Parameter Summary:", flush=True)
    for summary in param_summary:
        if summary['model'] == model_name:
            print(f"ID_TARGET: {summary['ID_TARGET']} | Best Params: {summary['best_params']} | Mean Accuracy: {summary['mean_accuracy']:.4f} | Fold Accuracies: {summary['fold_accuracies']}", flush=True)

# Second-pass training with top 30 features: refit every target on its most
# important first-pass features and overwrite that model's test predictions.
print("\nPerforming second-pass training with top 30 features...", flush=True)
for model_name, model_info in models.items():
    print(f"\nSecond-pass {model_name}...", flush=True)
    model = model_info['model']
    callbacks = model_info['callbacks']
    for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue

        train_mask = X_train['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        weights = y_train.loc[train_mask, 'RET_TARGET'].abs()
        weight_scale = 2.0 if target in low_performing_targets else 1.0
        if len(X_target) < 500:
            weight_scale *= 1.5
        weights = weights * weight_scale
        X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
        test_ids = X_test_target.index

        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue

        # Work on a copy: forcing n_estimators on the dict stored in param_log
        # would overwrite the logged best parameters (later written to
        # hyperparameters.json and read again by the fine-tuning passes).
        best_params = {**param_log[model_name][target_str]['best_params'], 'n_estimators': 300}
        model.set_params(**best_params)
        if model_name == 'LightGBM' and callbacks:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Third- and fourth-pass training for low-performing targets.
#
# Both passes share one routine; they differ only in the accuracy threshold
# below which a target is fine-tuned and in the grid of adjustments applied to
# the target's current best parameters.
def _fit_estimator(estimator, model_name, callbacks, X_fit, y_fit, w_fit, X_eval, y_eval):
    """Fit ``estimator`` with the library-specific fit signature used in this notebook."""
    if model_name == 'LightGBM' and callbacks:
        estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)],
                      eval_metric='logloss', callbacks=callbacks)
    elif model_name == 'XGBoost' and callbacks:
        estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
    else:
        estimator.fit(X_fit, y_fit, sample_weight=w_fit)


def _fresh_copy(estimator):
    """Return an unfitted estimator of the same type and parameters."""
    return type(estimator)(**estimator.get_params())


def _fine_tune_low_performers(headline, pass_title, desc_suffix, accuracy_threshold, grid_spec):
    """Fine-tune every low-performing target whose logged CV score is below a threshold.

    Args:
        headline: Message printed once before the pass starts.
        pass_title: Prefix of the per-model progress message (e.g. ``"Third-pass"``).
        desc_suffix: Suffix of the tqdm description.
        accuracy_threshold: Targets whose logged ``mean_accuracy`` is at or above
            this value are left untouched.
        grid_spec: Iterable of ``(learning_rate multiplier, n_estimators,
            reg_alpha increment, reg_lambda increment)`` tuples, each applied to
            the target's current best parameters.

    Reads and updates the notebook-level ``param_log``, ``param_summary`` and
    ``predictions``.
    """
    print(headline, flush=True)
    for model_name, model_info in models.items():
        print(f"\n{pass_title} {model_name}...", flush=True)
        base_model = model_info['model']
        callbacks = model_info['callbacks']
        # The first pass keys its results by str() of the float ID (the logged
        # output shows e.g. '139.0'), whereas low_performing_targets holds ints.
        # Match on the numeric value so the lookup works for either spelling.
        key_by_value = {float(key): key for key in param_log[model_name]}

        for target in tqdm(low_performing_targets, desc=f"{model_name} {desc_suffix}"):
            target_str = key_by_value.get(float(target))
            if target_str is None:
                continue
            entry = param_log[model_name][target_str]
            if entry['mean_accuracy'] >= accuracy_threshold:
                continue  # Skip if already performing well
            top_features = [f[0] for f in top_features_all[model_name][target_str]]
            if not top_features:
                print(f"Warning: No features for ID_TARGET {target}. Skipping.")
                continue

            train_mask = X_train['ID_TARGET'] == target
            X_target = X_train.loc[train_mask, top_features]
            y_target = y_train.loc[train_mask, 'SIGN_TARGET']
            returns = y_train.loc[train_mask, 'RET_TARGET']
            groups = X_train.loc[train_mask, 'ID_DAY']
            weights = returns.abs() * 3.0  # Increase weight
            if len(X_target) < 500:
                weights = weights * 1.5
            X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
            test_ids = X_test_target.index

            if len(X_target) < 100:
                print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
                continue

            best_params = entry['best_params']
            if any(name not in best_params for name in ('learning_rate', 'reg_alpha', 'reg_lambda')):
                print(f"Warning: Incomplete best params for ID_TARGET {target}. Skipping.")
                continue
            best_score = entry['mean_accuracy']
            best_fine_params = dict(best_params)

            for lr_mult, n_estimators, alpha_add, lambda_add in grid_spec:
                # Start from the full best-parameter set so unrelated settings
                # (tree shape, sampling) are kept; the logged values are scalars.
                params = {
                    **best_params,
                    'learning_rate': best_params['learning_rate'] * lr_mult,
                    'n_estimators': n_estimators,
                    'reg_alpha': best_params['reg_alpha'] + alpha_add,
                    'reg_lambda': best_params['reg_lambda'] + lambda_add,
                }
                print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
                candidate = _fresh_copy(base_model)
                candidate.set_params(**params)
                fold_accuracies = []
                for train_idx, val_idx in kf.split(X_target, y_target, groups):
                    X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
                    y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
                    w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
                    _fit_estimator(candidate, model_name, callbacks, X_tr, y_tr, w_tr, X_val, y_val)
                    y_pred = candidate.predict(X_val)
                    # Score against signed returns; 0/1 labels would zero out
                    # every negative-class row inside the metric.
                    fold_accuracies.append(weighted_accuracy(returns.iloc[val_idx], y_pred, w_val))
                mean_acc = np.mean(fold_accuracies)
                print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
                if mean_acc > best_score:
                    best_score = mean_acc
                    best_fine_params = params
                    entry['best_params'] = best_fine_params
                    entry['mean_accuracy'] = best_score
                    param_summary.append({
                        'model': model_name,
                        'ID_TARGET': target_str,
                        'best_params': best_fine_params,
                        'mean_accuracy': best_score,
                        'fold_accuracies': fold_accuracies
                    })

            # Retrain with fine-tuned parameters on all rows of the target
            final_model = _fresh_copy(base_model)
            final_model.set_params(**best_fine_params)
            _fit_estimator(final_model, model_name, callbacks, X_target, y_target, weights, X_target, y_target)
            preds = final_model.predict_proba(X_test_target)[:, 1].astype(np.float32)
            predictions.setdefault(target_str, {})[model_name] = dict(zip(test_ids, preds))
            gc.collect()


# Third pass: targets whose logged CV score is below 0.75
_fine_tune_low_performers(
    "\nPerforming third-pass training for low-performing targets...",
    "Third-pass",
    "Low-Performing Targets",
    0.75,
    [
        (0.5, 400, 0.2, 0.2),
        (0.7, 450, 0.1, 0.1),
        (1.3, 400, 0.0, 0.0),
        (1.5, 450, 0.3, 0.3),
        (0.6, 500, 0.15, 0.15),
        (1.0, 550, 0.1, 0.2),
    ],
)

# Fourth pass: targets still below 0.70 after the third pass
_fine_tune_low_performers(
    "\nPerforming fourth-pass training for very low-performing targets...",
    "Fourth-pass",
    "Very Low-Performing Targets",
    0.70,
    [
        (0.4, 400, 0.3, 0.3),
        (0.6, 450, 0.2, 0.2),
        (0.8, 500, 0.1, 0.1),
        (1.2, 400, 0.0, 0.0),
        (1.4, 450, 0.15, 0.15),
        (1.6, 500, 0.25, 0.25),
    ],
)

# Write the per-model, per-target hyperparameter log as JSON next to the data
# files, then trigger a browser download of it.
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
with open(param_log_path, 'w') as log_file:
    json.dump(param_log, log_file, indent=4)
print(f"Saved hyperparameter log to: {param_log_path}", flush=True)
files.download(param_log_path)

# Ensemble predictions: blend the two models' probabilities per (target, day)
# group and turn the smoothed value into a +1 / -1 label.
final_predictions = {}
# (target, day) pairs for which neither model produced a prediction; their rows
# fall back to the neutral probability 0.5 below.
missing_day_targets = []
for target in X_test['ID_TARGET'].unique():
    target_str = str(target)
    # .get() so a target that has no entry in `predictions` does not raise
    model_preds = predictions.get(target_str, {})
    lgbm_lookup = model_preds.get('LightGBM', {})
    xgb_lookup = model_preds.get('XGBoost', {})

    target_rows = X_test[X_test['ID_TARGET'] == target]
    # Group once per target instead of building a full-frame mask per day;
    # sort=False keeps the days in order of first appearance.
    for day, day_rows in target_rows.groupby('ID_DAY', sort=False):
        day_test_ids = day_rows.index
        if not any(idx in lgbm_lookup or idx in xgb_lookup for idx in day_test_ids):
            missing_day_targets.append((target, day))
        lgbm_preds = np.array([lgbm_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        xgb_preds = np.array([xgb_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
        # Apply exponential moving average for smoothing; the last smoothed
        # value decides the label for every row of the group.
        smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1]
        label = 1 if smoothed_preds >= 0.5 else -1
        for idx in day_test_ids:
            final_predictions[idx] = label

if missing_day_targets:
    print(f"Missing day-target pairs: {missing_day_targets}")

# Build the submission table: one row per test ID with its +1 / -1 label
output_df = (
    pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET'])
    .reset_index()
)
output_df.columns = ['ID', 'RET_TARGET']
output_df = output_df.sort_values('ID')

# Any test ID without a prediction is appended with the default label -1
expected_ids = set(X_test.index)
submission_ids = set(output_df['ID'])
if submission_ids != expected_ids:
    missing_ids = expected_ids - submission_ids
    print(f"Warning: Submission missing {len(missing_ids)} IDs: {missing_ids}")
    missing_df = pd.DataFrame({'ID': list(missing_ids), 'RET_TARGET': -1})
    output_df = pd.concat([output_df, missing_df], ignore_index=True)
    output_df = output_df.sort_values('ID')

output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Read the file back and print a few sanity checks
test_csv = pd.read_csv(output_path)
id_column = test_csv['ID']
print(f"Output CSV shape: {test_csv.shape}", flush=True)
print(f"Output CSV ID range: {id_column.min()} to {id_column.max()}", flush=True)
print(f"Unique RET_TARGET values: {test_csv['RET_TARGET'].unique()}", flush=True)
print(f"Missing values: {test_csv.isna().sum().sum()}", flush=True)

# Trigger a browser download of the submission file
files.download(output_path)

# Summarise the first-pass cross-validation score of each model
print("\nValidation Results:", flush=True)
for model_name in results:
    acc = results[model_name]
    print(f"{model_name}: {acc:.4f}", flush=True)

XGBoost version: 2.1.4
Selected top return columns: ['RET_262', 'RET_88', 'RET_172', 'RET_259', 'RET_118', 'RET_150', 'RET_261', 'RET_268', 'RET_115', 'RET_216', 'RET_238', 'RET_59', 'RET_30', 'RET_97', 'RET_122']

Training LightGBM...


LightGBM Targets: 100%|██████████| 101/101 [58:45<00:00, 34.91s/it]


LightGBM Weighted Accuracy: 0.7985

LightGBM Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.005, 'min_child_samples': 10, 'num_leaves': 15, 'reg_alpha': 0.0, 'reg_lambda': 0.0} | Mean Accuracy: 1.0000 | Fold Accuracies: [np.float64(0.6257668711656442), np.float64(0.7515527950310559), np.float64(0.7692307692307693), np.float64(0.6606060606060606), np.float64(0.6408839779005525), np.float64(0.8169934640522876), np.float64(0.7232704402515723)]
ID_TARGET: 129.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.005, 'min_child_samples': 10, 'num_leaves': 15, 'reg_alpha': 0.0, 'reg_lambda': 0.0} | Mean Accuracy: 0.7565 | Fold Accuracies: [np.float64(0.61875), np.float64(0.6296296296296297), np.float64(0.610062893081761), np.float64(0.6802325581395349), np.float64(0.6172839506172839), np.float64(0.5974842767295597), np.float64(0.625)]
ID_TARGET: 136.0 | Best Params: {'bagging_fraction': 0.8, 'learning_rate': 0.005, 'min_child_samples'

XGBoost Targets: 100%|██████████| 101/101 [1:23:05<00:00, 49.36s/it]


XGBoost Weighted Accuracy: 0.9894

XGBoost Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 200, 'reg_alpha': 1.0, 'reg_lambda': 2.0, 'subsample': 0.8} | Mean Accuracy: 1.0000 | Fold Accuracies: [np.float64(0.950920245398773), np.float64(0.9627329192546584), np.float64(0.9935897435897436), np.float64(0.9515151515151515), np.float64(0.9502762430939227), np.float64(0.934640522875817), np.float64(1.0)]
ID_TARGET: 129.0 | Best Params: {'learning_rate': 0.05, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 250, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'subsample': 0.65} | Mean Accuracy: 1.0000 | Fold Accuracies: [np.float64(0.5875), np.float64(0.6234567901234568), np.float64(0.6037735849056604), np.float64(0.6744186046511628), np.float64(0.6296296296296297), np.float64(0.6477987421383647), np.float64(0.6306818181818182)]
ID_TARGET: 136.0 | Best Params: {'learning_rate': 0.05, 'max_depth': 6, 'min_child_weigh

LightGBM Targets: 100%|██████████| 101/101 [01:19<00:00,  1.27it/s]


Second-pass XGBoost...



XGBoost Targets: 100%|██████████| 101/101 [01:16<00:00,  1.31it/s]


Performing third-pass training for low-performing targets...

Third-pass LightGBM...



LightGBM Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 59150.44it/s]


Third-pass XGBoost...



XGBoost Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 60947.61it/s]


Performing fourth-pass training for very low-performing targets...

Fourth-pass LightGBM...



LightGBM Very Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 52015.04it/s]


Fourth-pass XGBoost...



XGBoost Very Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 117998.32it/s]

Saved hyperparameter log to: /content/hyperparameters.json





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved predictions to: /content/predictions_ensemble.csv
Output CSV shape: (114468, 2)
Output CSV ID range: 0 to 114467
Unique RET_TARGET values: [ 1 -1]
Missing values: 0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Validation Results:
LightGBM: 0.7985
XGBoost: 0.9894


New attempt (not yet submitted)

In [None]:
import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid

# Report the installed XGBoost release and warn when its major version is below 1
xgb_version = xgboost.__version__
print(f"XGBoost version: {xgb_version}")
xgb_major = int(xgb_version.split('.')[0])
if xgb_major < 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Directory that holds the input CSV files
DATA_DIR = '/content'

# Read the three ID-indexed float32 tables plus the supplementary table;
# a missing file is re-raised with a hint about where the files must be placed.
id_indexed_float32 = {'index_col': 'ID', 'dtype': np.float32}
try:
    X_train = pd.read_csv(os.path.join(DATA_DIR, 'X_train_itDkypA.csv'), **id_indexed_float32)
    y_train = pd.read_csv(os.path.join(DATA_DIR, 'y_train_3LeeT2g.csv'), **id_indexed_float32)
    X_test = pd.read_csv(os.path.join(DATA_DIR, 'X_test_Beg4ey3.csv'), **id_indexed_float32)
    supplementary = pd.read_csv(os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv'))
except FileNotFoundError as e:
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.")

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights):
    """Return the weight-normalised share of correctly predicted directions.

    Args:
        y_true: True outcomes, either signed returns or 0/1 class labels.
            A value greater than zero counts as the "up" class, anything else
            as the "down" class.
        y_pred: Predicted class labels; ``1`` means "up", any other value "down".
        weights: Per-sample importance (absolute values are used), e.g. the
            absolute return of each sample.

    Returns:
        ``sum(|w| * correct) / sum(|w|)`` as a float, or ``0.0`` when the
        weights sum to zero.
    """
    # Positional arrays: avoids index alignment between pandas inputs
    true_direction = np.where(np.asarray(y_true) > 0, 1, -1)
    pred_direction = np.where(np.asarray(y_pred) == 1, 1, -1)
    sample_weights = np.abs(np.asarray(weights, dtype=float))

    total_weight = sample_weights.sum()
    if total_weight == 0:
        return 0.0
    correct = (pred_direction == true_direction).astype(float)
    return float(np.sum(sample_weights * correct) / total_weight)

# Preprocess data with enhanced feature selection
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Build the model feature matrix from the raw return columns.

    The function selects the ``RET_*`` columns to work with, imputes their
    missing values (only when ``supplementary`` is given), adds sector,
    cross-sectional, correlation and PCA features, and standardizes every
    column except ``ID_DAY`` and ``ID_TARGET``.

    Args:
        X: Frame with ``RET_*`` columns plus ``ID_DAY`` and ``ID_TARGET``.
            A copy is processed; the caller's frame is left untouched.
        y: Frame with a ``RET_TARGET`` column. Required when ``is_train``.
        supplementary: Optional asset metadata with ``ID_asset`` and
            ``CLASS_LEVEL_1`` columns, used for sector-median imputation,
            sector averages and one-hot target-sector columns.
        is_train: When True, select ``top_cols`` (if not supplied), add the
            correlation-product features and derive ``SIGN_TARGET`` on ``y``.
        top_cols: Return columns to restrict the engineered features to.
            Pass the value returned by the training call when processing
            the test set.

    Returns:
        Tuple ``(X, y, feature_cols, top_cols)``. ``y`` is a copy with an
        added ``SIGN_TARGET`` column for training and ``None`` otherwise.

    Raises:
        ValueError: If ``is_train`` is True and ``y`` is None, if no return
            column survives the selection, or (as ``pandas.errors.MergeError``)
            if ``ID_asset`` is not unique in ``supplementary``.
    """
    if is_train and y is None:
        raise ValueError("y must be provided when is_train=True.")

    # Work on a copy so the imputation below never edits the caller's frame.
    X = X.copy()
    ret_cols = [col for col in X.columns if col.startswith('RET_')]

    # Dynamic feature selection with minimum variance threshold
    if is_train and top_cols is None:
        if len(ret_cols) > 15:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            valid_cols = variances[variances > 1e-5].index
            if len(valid_cols) < 15:
                top_cols = valid_cols.tolist()
            else:
                top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(15).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")

    # Restrict to the selected columns for BOTH train and test, so that
    # MEAN_RET / STD_RET / PCA / sector averages are computed from the same
    # column set on each side.
    if top_cols is not None:
        selected = set(top_cols)
        ret_cols = [col for col in ret_cols if col in selected]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    # Impute missing returns: row-wise median of the asset's sector when the
    # sector is known, otherwise the column's own median.
    if supplementary is not None:
        sector_map = supplementary.set_index('ID_asset')['CLASS_LEVEL_1']
        sector_row_medians = {}  # one row-median Series per sector, computed once
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            fill_values = None
            if pd.notna(sector):
                if sector not in sector_row_medians:
                    sector_assets = supplementary.loc[supplementary['CLASS_LEVEL_1'] == sector, 'ID_asset']
                    sector_cols = [f'RET_{a}' for a in sector_assets if f'RET_{a}' in X.columns]
                    sector_row_medians[sector] = X[sector_cols].median(axis=1) if sector_cols else None
                fill_values = sector_row_medians[sector]
            if fill_values is not None:
                X[col] = X[col].fillna(fill_values)
            else:
                X[col] = X[col].fillna(X[col].median())

    # Sector-based features
    new_cols = {}
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        sector_groups = supplementary.groupby(level)['ID_asset'].apply(list).to_dict()
        ret_col_set = set(ret_cols)
        for sector, assets in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in assets if f'RET_{asset}' in ret_col_set]
            if asset_cols:
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)

        # A column-on-column merge discards the left index. Keep the original
        # ID index and put it back afterwards; validate='many_to_one' ensures
        # the left join cannot add rows, so the positions still line up.
        original_index = X.index
        sector_lookup = supplementary[['ID_asset', level]].rename(
            columns={'ID_asset': 'ID_TARGET', level: f'TARGET_{level}'})
        X = X.merge(sector_lookup, on='ID_TARGET', how='left', validate='many_to_one')
        X.index = original_index
        X = pd.concat([X, pd.get_dummies(X[f'TARGET_{level}'], prefix=f'TARGET_{level}', dummy_na=True)], axis=1)
        X = X.drop(f'TARGET_{level}', axis=1)

    # Cross-sectional features
    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    by_day = X.groupby('ID_DAY')
    for col in ret_cols:
        new_cols[f'CS_RANK_{col}'] = by_day[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0)

    # Correlation-based features (top 5 pairs).
    # NOTE(review): these are only created for training data, so a caller
    # that intersects train/test columns will drop them again.
    if is_train:
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:5]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    # PCA features (2 components).
    # NOTE(review): PCA (and the scaler below) are re-fitted on whatever frame
    # is passed in, so train and test are transformed with different fits.
    if ret_cols:
        pca = PCA(n_components=min(2, len(ret_cols)), random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        for i in range(pca_features.shape[1]):
            new_cols[f'PCA_{i}'] = pca_features[:, i]

    # Add new features to DataFrame (Series entries align on the ID index)
    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)

    # Standardize numerical features; the two ID columns are left as-is
    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))

    if is_train:
        # Binary direction label: 1 for a strictly positive return, else 0
        y = y.copy()
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Run preprocessing: the training pass chooses top_cols, the test pass reuses them
try:
    X_train, y_train, feature_cols, top_cols = preprocess_data(
        X_train, y_train, supplementary, is_train=True
    )
    X_test, _, test_feature_cols, _ = preprocess_data(
        X_test, supplementary=supplementary, is_train=False, top_cols=top_cols
    )
except ValueError as e:
    print(f"Preprocessing error: {e}")
    raise

# Keep only the columns that exist in both frames
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]
feature_cols = [col for col in common_cols if col not in {'ID_DAY', 'ID_TARGET'}]

# Re-order / subset the targets to follow the training rows
y_train = y_train.loc[X_train.index]

# Define models with further expanded hyperparameter grids.
#
# Each entry holds:
#   'model'      - the base estimator that is configured per grid entry,
#   'param_grid' - a list of single-combination grids (one dict per trial),
#   'callbacks'  - for LightGBM, the callbacks passed to fit(); for XGBoost the
#                  list only acts as a truthy flag telling the training code
#                  to pass an eval_set (see the note on the XGBoost entry).

# Class balance for XGBoost's scale_pos_weight (negatives / positives);
# falls back to 1.0 when there are no positive labels.
n_negative = int((y_train['SIGN_TARGET'] == 0).sum())
n_positive = int((y_train['SIGN_TARGET'] == 1).sum())
xgb_scale_pos_weight = n_negative / n_positive if n_positive else 1.0

models = {
    'LightGBM': {
        # reg_alpha / reg_lambda are used here (not lambda_l1 / lambda_l2):
        # the grid below tunes reg_alpha / reg_lambda, and LightGBM lets the
        # main names lambda_l1 / lambda_l2 override those aliases, which
        # would make the regularisation grid a no-op.
        'model': LGBMClassifier(num_leaves=31, min_child_samples=15, reg_alpha=0.3, reg_lambda=0.3,
                                subsample=0.7, colsample_bytree=0.8, bagging_freq=5, bagging_fraction=0.7,
                                random_state=42, n_jobs=1, verbosity=-1, class_weight='balanced', force_row_wise=True),
        'param_grid': [
            {'learning_rate': [0.003], 'num_leaves': [15], 'min_child_samples': [10], 'bagging_fraction': [0.6], 'reg_alpha': [0.0], 'reg_lambda': [0.0], 'colsample_bytree': [0.6]},
            {'learning_rate': [0.005], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.1], 'reg_lambda': [0.1], 'colsample_bytree': [0.7]},
            {'learning_rate': [0.01], 'num_leaves': [47], 'min_child_samples': [20], 'bagging_fraction': [0.8], 'reg_alpha': [0.2], 'reg_lambda': [0.2], 'colsample_bytree': [0.8]},
            {'learning_rate': [0.03], 'num_leaves': [63], 'min_child_samples': [12], 'bagging_fraction': [0.65], 'reg_alpha': [0.3], 'reg_lambda': [0.3], 'colsample_bytree': [0.9]},
            {'learning_rate': [0.05], 'num_leaves': [79], 'min_child_samples': [30], 'bagging_fraction': [0.75], 'reg_alpha': [0.4], 'reg_lambda': [0.4], 'colsample_bytree': [0.6]},
            {'learning_rate': [0.07], 'num_leaves': [15], 'min_child_samples': [10], 'bagging_fraction': [0.6], 'reg_alpha': [0.0], 'reg_lambda': [0.1], 'colsample_bytree': [0.7]},
            {'learning_rate': [0.1], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.2], 'reg_lambda': [0.0], 'colsample_bytree': [0.8]},
            {'learning_rate': [0.15], 'num_leaves': [47], 'min_child_samples': [20], 'bagging_fraction': [0.8], 'reg_alpha': [0.1], 'reg_lambda': [0.3], 'colsample_bytree': [0.9]},
            {'learning_rate': [0.003], 'num_leaves': [63], 'min_child_samples': [12], 'bagging_fraction': [0.65], 'reg_alpha': [0.3], 'reg_lambda': [0.2], 'colsample_bytree': [0.6]},
            {'learning_rate': [0.005], 'num_leaves': [79], 'min_child_samples': [30], 'bagging_fraction': [0.75], 'reg_alpha': [0.4], 'reg_lambda': [0.1], 'colsample_bytree': [0.7]},
            {'learning_rate': [0.01], 'num_leaves': [15], 'min_child_samples': [10], 'bagging_fraction': [0.6], 'reg_alpha': [0.0], 'reg_lambda': [0.4], 'colsample_bytree': [0.8]},
            {'learning_rate': [0.03], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.2], 'reg_lambda': [0.0], 'colsample_bytree': [0.9]},
            {'learning_rate': [0.05], 'num_leaves': [47], 'min_child_samples': [20], 'bagging_fraction': [0.8], 'reg_alpha': [0.5], 'reg_lambda': [0.5], 'colsample_bytree': [0.6]},
            {'learning_rate': [0.07], 'num_leaves': [63], 'min_child_samples': [12], 'bagging_fraction': [0.65], 'reg_alpha': [0.1], 'reg_lambda': [0.2], 'colsample_bytree': [0.7]},
            {'learning_rate': [0.1], 'num_leaves': [79], 'min_child_samples': [30], 'bagging_fraction': [0.75], 'reg_alpha': [0.3], 'reg_lambda': [0.3], 'colsample_bytree': [0.8]},
            {'learning_rate': [0.15], 'num_leaves': [15], 'min_child_samples': [10], 'bagging_fraction': [0.6], 'reg_alpha': [0.4], 'reg_lambda': [0.1], 'colsample_bytree': [0.9]},
            {'learning_rate': [0.003], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.0], 'reg_lambda': [0.0], 'colsample_bytree': [0.6]},
            {'learning_rate': [0.005], 'num_leaves': [47], 'min_child_samples': [20], 'bagging_fraction': [0.8], 'reg_alpha': [0.2], 'reg_lambda': [0.2], 'colsample_bytree': [0.7]}
        ],
        'callbacks': [lightgbm.early_stopping(stopping_rounds=15, verbose=False)]
    },
    'XGBoost': {
        # early_stopping_rounds is set on the estimator because the callback
        # object stored under 'callbacks' is never handed to XGBoost by the
        # training code; without it no early stopping takes place.
        # Every fit() on this estimator must therefore pass an eval_set.
        'model': XGBClassifier(max_depth=4, min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
                               random_state=42, n_jobs=1, eval_metric='logloss', verbosity=0,
                               early_stopping_rounds=15,
                               scale_pos_weight=xgb_scale_pos_weight),
        'param_grid': [
            {'learning_rate': [0.003], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [100], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0]},
            {'learning_rate': [0.005], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1]},
            {'learning_rate': [0.01], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [200], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2]},
            {'learning_rate': [0.03], 'max_depth': [6], 'min_child_weight': [4], 'subsample': [0.65], 'n_estimators': [250], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0]},
            {'learning_rate': [0.05], 'max_depth': [7], 'min_child_weight': [5], 'subsample': [0.75], 'n_estimators': [300], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1]},
            {'learning_rate': [0.07], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [100], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2]},
            {'learning_rate': [0.1], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0]},
            {'learning_rate': [0.15], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [200], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1]},
            {'learning_rate': [0.003], 'max_depth': [6], 'min_child_weight': [4], 'subsample': [0.65], 'n_estimators': [250], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2]},
            {'learning_rate': [0.005], 'max_depth': [7], 'min_child_weight': [5], 'subsample': [0.75], 'n_estimators': [300], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0]},
            {'learning_rate': [0.01], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [100], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1]},
            {'learning_rate': [0.03], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2]},
            {'learning_rate': [0.05], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [200], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0]},
            {'learning_rate': [0.07], 'max_depth': [6], 'min_child_weight': [4], 'subsample': [0.65], 'n_estimators': [250], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1]},
            {'learning_rate': [0.1], 'max_depth': [7], 'min_child_weight': [5], 'subsample': [0.75], 'n_estimators': [300], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2]},
            {'learning_rate': [0.15], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [100], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0]},
            {'learning_rate': [0.003], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1]},
            {'learning_rate': [0.005], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [200], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2]}
        ],
        'callbacks': [xgboost.callback.EarlyStopping(rounds=15, metric_name='logloss', save_best=True)]
    }
}

# Function to train and predict for a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks):
    """Grid-search, fit and predict one model for a single ``ID_TARGET``.

    Every parameter combination in ``param_grid`` is scored with grouped
    cross-validation (groups are the rows' ``ID_DAY`` values) using
    ``weighted_accuracy``. The best combination is then refitted on all rows
    of the target and used to predict the target's test rows.

    Args:
        target: Value of ``ID_TARGET`` to train for.
        X_train: Training features; must contain ``ID_TARGET``, ``ID_DAY``
            and every column in ``feature_cols``.
        y_train: Training targets with ``SIGN_TARGET`` and ``RET_TARGET``,
            indexed like ``X_train``.
        X_test: Test features with the same columns as ``X_train``.
        feature_cols: Columns fed to the model.
        model: Template estimator. It is copied per candidate and is not
            modified.
        model_name: ``'LightGBM'`` or ``'XGBoost'``; selects how ``fit`` is
            called when ``callbacks`` is non-empty.
        kf: Splitter whose ``split(X, y, groups)`` yields fold indices.
        param_grid: Grid specification accepted by ``ParameterGrid``.
        callbacks: LightGBM callbacks; for XGBoost only its truthiness is
            used, to decide whether an ``eval_set`` is passed.

    Returns:
        Tuple ``(str(target), best_params, best_score, param_results,
        test_ids, preds, top_features)``. When the target has fewer than 100
        training rows, no test rows, or no candidate scored above 0, the
        tuple holds empty results and all-zero predictions.
    """
    train_mask = X_train['ID_TARGET'] == target
    X_target = X_train.loc[train_mask, feature_cols]
    y_target = y_train.loc[train_mask, 'SIGN_TARGET']
    returns = y_train.loc[train_mask, 'RET_TARGET']
    low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]
    # Dynamic weight scaling based on sample size
    weight_scale = 2.0 if target in low_performing_targets else 1.0
    if len(X_target) < 500:
        weight_scale *= 1.5  # Boost weights for small targets
    weights = returns.abs() * weight_scale
    groups = X_train.loc[train_mask, 'ID_DAY']
    X_test_target = X_test.loc[X_test['ID_TARGET'] == target, feature_cols]
    test_ids = X_test_target.index

    # Check for sufficient data
    if len(X_target) < 100 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    def _fit(estimator, X_fit, y_fit, w_fit, X_eval, y_eval):
        # Fit with an eval_set when callbacks are configured, plainly otherwise.
        if model_name == 'LightGBM' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)],
                          eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
        else:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit)

    param_results = []
    best_params = None
    best_score = 0
    best_model = None

    # First-pass grid search with sample weights
    for params in ParameterGrid(param_grid):
        print(f"Testing params for {model_name} ID_TARGET {target}: {params}", flush=True)
        # Fresh copy per candidate: the caller's estimator stays untouched and
        # no setting from a previous candidate can carry over.
        candidate = type(model)(**model.get_params())
        try:
            candidate.set_params(**params)
        except ValueError as e:
            print(f"Invalid params for {model_name} ID_TARGET {target}: {e}")
            continue
        fold_accuracies = []
        for train_idx, val_idx in kf.split(X_target, y_target, groups):
            X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
            y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
            w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]

            _fit(candidate, X_tr, y_tr, w_tr, X_val, y_val)
            y_pred = candidate.predict(X_val)
            # Score against the signed returns, not the 0/1 labels: with 0/1
            # labels the negative class would get zero weight / never match.
            acc = weighted_accuracy(returns.iloc[val_idx], y_pred, w_val)
            fold_accuracies.append(acc)

        mean_acc = np.mean(fold_accuracies)
        print(f"{model_name} | ID_TARGET: {target} | Params: {params} | Mean Accuracy: {mean_acc:.4f} | Fold Accuracies: {fold_accuracies}", flush=True)
        param_results.append({
            'params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies
        })
        if mean_acc > best_score:
            best_score = mean_acc
            best_params = params
            # Unfitted estimator carrying the winning configuration
            best_model = type(candidate)(**candidate.get_params())

    # Train with best parameters
    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    # The final fit monitors the training rows themselves as eval_set.
    _fit(best_model, X_target, y_target, weights, X_target, y_target)

    # Predict the probability of the positive ("up") class
    preds = best_model.predict_proba(X_test_target)[:, 1].astype(np.float32)

    # Feature importance: keep the 30 most important features
    importance = best_model.feature_importances_
    feature_importance = dict(zip(feature_cols, importance))
    top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:30]

    # Clean up memory
    gc.collect()

    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training: run train_target for every ID_TARGET in
# parallel, once per model, and collect parameters, scores, feature rankings
# and test predictions.
kf = GroupKFold(n_splits=7)  # Increased to 7 folds
results = {}
predictions = {}
ensemble_weights = {'LightGBM': 0.7, 'XGBoost': 0.3}
param_log = {}
top_features_all = {}
param_summary = []
low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]

for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}

    # First-pass training
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train, y_train, X_test, feature_cols, model_info['model'], model_name, kf,
                             model_info['param_grid'], model_info['callbacks'])
        for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets")
    )

    # Collect results
    accuracies = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))
        # Report the fold accuracies of the winning parameter set (the first
        # entry reaching the highest mean), not of the last one evaluated.
        if param_results:
            best_result = max(param_results, key=lambda res: res['mean_accuracy'])
            best_fold_accuracies = best_result['fold_accuracies']
        else:
            best_fold_accuracies = []
        param_summary.append({
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_fold_accuracies
        })

    # Exclude skipped targets (score 0); NaN when every target was skipped
    scored_accuracies = [acc for acc in accuracies if acc > 0]
    results[model_name] = np.mean(scored_accuracies) if scored_accuracies else float('nan')
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)

    # Print parameter summary
    print(f"\n{model_name} Parameter Summary:", flush=True)
    for summary in param_summary:
        if summary['model'] == model_name:
            print(f"ID_TARGET: {summary['ID_TARGET']} | Best Params: {summary['best_params']} | Mean Accuracy: {summary['mean_accuracy']:.4f} | Fold Accuracies: {summary['fold_accuracies']}", flush=True)

# Second-pass training with top 30 features: refit each target's model on the
# features ranked highest in the first pass and replace its test predictions.
print("\nPerforming second-pass training with top 30 features...", flush=True)
for model_name, model_info in models.items():
    print(f"\nSecond-pass {model_name}...", flush=True)
    for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [entry[0] for entry in top_features_all[model_name][target_str]]
        if len(top_features) == 0:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue

        # Row selections for this target
        train_rows = X_train['ID_TARGET'] == target
        test_rows = X_test['ID_TARGET'] == target

        X_target = X_train.loc[train_rows, top_features]
        y_target = y_train.loc[train_rows, 'SIGN_TARGET']
        weights = y_train.loc[train_rows, 'RET_TARGET'].abs()

        # Same weight boosts as the first pass: x2 for listed targets, x1.5 for small ones
        weight_scale = 2.0 if target in low_performing_targets else 1.0
        if len(X_target) < 500:
            weight_scale *= 1.5
        weights = weights * weight_scale

        X_test_target = X_test.loc[test_rows, top_features]
        test_ids = X_test.loc[test_rows].index

        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue

        model = model_info['model']
        best_params = param_log[model_name][target_str]['best_params']
        best_params['n_estimators'] = 300  # Use integer instead of list
        model.set_params(**best_params)

        # Assemble the fit() arguments for the model family in use
        fit_kwargs = {'sample_weight': weights}
        if model_info['callbacks'] and model_name in ('LightGBM', 'XGBoost'):
            fit_kwargs['eval_set'] = [(X_target, y_target)]
            if model_name == 'LightGBM':
                fit_kwargs['eval_metric'] = 'logloss'
                fit_kwargs['callbacks'] = model_info['callbacks']
            else:
                fit_kwargs['verbose'] = False
        model.fit(X_target, y_target, **fit_kwargs)

        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Third- and fourth-pass training for low-performing targets.
#
# Both passes share one implementation and differ only in the accuracy
# threshold below which a target is re-tuned and in the fine-tuning grid.

def _as_scalar(value):
    """Return the first element of a list/tuple/array hyperparameter, else the value itself."""
    if isinstance(value, (list, tuple, np.ndarray)):
        return value[0]
    return value


def _fit_for_fine_tune(model, model_name, callbacks, X_fit, y_fit, w_fit, X_eval, y_eval):
    """Fit ``model``, passing an eval_set when callbacks are configured."""
    if model_name == 'LightGBM' and callbacks:
        model.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)],
                  eval_metric='logloss', callbacks=callbacks)
    elif model_name == 'XGBoost' and callbacks:
        model.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
    else:
        model.fit(X_fit, y_fit, sample_weight=w_fit)


def _run_fine_tune_pass(pass_name, banner, progress_label, accuracy_threshold, grid_steps):
    """Re-tune every low-performing target whose logged accuracy is below a threshold.

    Reads the module-level ``models``, ``param_log``, ``top_features_all``,
    ``X_train``, ``y_train``, ``X_test``, ``kf`` and ``low_performing_targets``;
    updates ``param_log``, ``param_summary`` and ``predictions`` in place.

    Args:
        pass_name: Label used in progress messages (e.g. ``"Third-pass"``).
        banner: Message printed once before the pass starts.
        progress_label: Suffix of the tqdm description.
        accuracy_threshold: Targets with a logged mean accuracy at or above
            this value are skipped.
        grid_steps: Iterable of ``(learning_rate_multiplier, n_estimators,
            regularisation_increment)`` tuples applied to the target's
            current best parameters.
    """
    print(banner, flush=True)
    for model_name, model_info in models.items():
        print(f"\n{pass_name} {model_name}...", flush=True)
        model = model_info['model']
        callbacks = model_info['callbacks']

        # The log is keyed by str(target) of whatever dtype ID_TARGET has
        # (e.g. "139.0" for floats), so resolve keys by numeric value rather
        # than by str() of the integer target.
        key_by_target = {}
        for key in param_log[model_name]:
            try:
                key_by_target[int(float(key))] = key
            except (ValueError, OverflowError):
                continue

        for target in tqdm(low_performing_targets, desc=f"{model_name} {progress_label}"):
            target_str = key_by_target.get(int(target))
            if target_str is None or param_log[model_name][target_str]['mean_accuracy'] >= accuracy_threshold:
                continue  # Not trained in the first pass, or already performing well
            top_features = [entry[0] for entry in top_features_all[model_name][target_str]]
            if not top_features:
                print(f"Warning: No features for ID_TARGET {target}. Skipping.")
                continue

            train_mask = X_train['ID_TARGET'] == target
            X_target = X_train.loc[train_mask, top_features]
            if len(X_target) < 100:
                print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
                continue
            y_target = y_train.loc[train_mask, 'SIGN_TARGET']
            returns = y_train.loc[train_mask, 'RET_TARGET']
            weights = returns.abs() * 3.0  # Increase weight
            if len(X_target) < 500:
                weights = weights * 1.5
            groups = X_train.loc[train_mask, 'ID_DAY']  # folds never split a day
            X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
            test_ids = X_test_target.index

            # Current best parameters as plain scalars (tolerates list-wrapped values)
            best_params = {name: _as_scalar(value)
                           for name, value in param_log[model_name][target_str]['best_params'].items()}
            base_lr = best_params['learning_rate']
            base_alpha = best_params['reg_alpha']
            base_lambda = best_params['reg_lambda']
            fine_tune_grid = [
                {'learning_rate': base_lr * lr_factor, 'n_estimators': n_estimators,
                 'reg_alpha': base_alpha + reg_step, 'reg_lambda': base_lambda + reg_step}
                for lr_factor, n_estimators, reg_step in grid_steps
            ]
            best_score = param_log[model_name][target_str]['mean_accuracy']
            best_fine_params = best_params

            # The grid entries are already concrete combinations, so they are
            # iterated directly (ParameterGrid requires list-valued entries).
            for params in fine_tune_grid:
                print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
                # Apply on top of this target's best parameters so no setting
                # left on the shared estimator by another target carries over.
                candidate_params = {**best_params, **params}
                model.set_params(**candidate_params)
                fold_accuracies = []
                for train_idx, val_idx in kf.split(X_target, y_target, groups):
                    X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
                    y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
                    w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
                    _fit_for_fine_tune(model, model_name, callbacks, X_tr, y_tr, w_tr, X_val, y_val)
                    y_pred = model.predict(X_val)
                    # Score against signed returns rather than the 0/1 labels
                    acc = weighted_accuracy(returns.iloc[val_idx], y_pred, w_val)
                    fold_accuracies.append(acc)
                mean_acc = np.mean(fold_accuracies)
                print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
                if mean_acc > best_score:
                    best_score = mean_acc
                    best_fine_params = candidate_params
                    param_log[model_name][target_str]['best_params'] = best_fine_params
                    param_log[model_name][target_str]['mean_accuracy'] = best_score
                    param_summary.append({
                        'model': model_name,
                        'ID_TARGET': target_str,
                        'best_params': best_fine_params,
                        'mean_accuracy': best_score,
                        'fold_accuracies': fold_accuracies
                    })

            # Retrain with fine-tuned parameters on all rows of the target
            model.set_params(**best_fine_params)
            _fit_for_fine_tune(model, model_name, callbacks, X_target, y_target, weights, X_target, y_target)
            preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
            predictions[target_str][model_name] = dict(zip(test_ids, preds))
            gc.collect()


# Third-pass training for low-performing targets
_run_fine_tune_pass(
    pass_name="Third-pass",
    banner="\nPerforming third-pass training for low-performing targets...",
    progress_label="Low-Performing Targets",
    accuracy_threshold=0.75,
    grid_steps=[
        (0.3, 400, 0.3),
        (0.5, 450, 0.2),
        (0.7, 500, 0.1),
        (1.0, 400, 0),
        (1.3, 450, 0.15),
        (1.5, 500, 0.25),
        (0.4, 550, 0.1),
        (0.8, 600, 0.2),
        (1.2, 450, 0.3),
    ],
)

# Fourth-pass training for very low-performing targets
_run_fine_tune_pass(
    pass_name="Fourth-pass",
    banner="\nPerforming fourth-pass training for very low-performing targets...",
    progress_label="Very Low-Performing Targets",
    accuracy_threshold=0.70,
    grid_steps=[
        (0.2, 400, 0.4),
        (0.4, 450, 0.3),
        (0.6, 500, 0.2),
        (0.8, 550, 0.1),
        (1.0, 400, 0),
        (1.2, 450, 0.15),
        (1.4, 500, 0.25),
        (1.6, 550, 0.3),
        (0.5, 600, 0.2),
    ],
)

# Persist the per-model / per-target hyperparameter log as JSON in DATA_DIR,
# then trigger a browser download of the file through Colab.
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
with open(param_log_path, 'w') as f:
    f.write(json.dumps(param_log, indent=4))
print(f"Saved hyperparameter log to: {param_log_path}", flush=True)
files.download(param_log_path)

# Ensemble predictions
# Blend the per-model probabilities of every (ID_TARGET, ID_DAY) group of test
# rows and turn the blended score into a +1 / -1 label.  The groups come from a
# single groupby instead of re-scanning X_test with a boolean mask per pair,
# which was quadratic in the number of test rows.
final_predictions = {}
missing_day_targets = []

# Rows whose ID_TARGET or ID_DAY is missing cannot be matched to a group
# (groupby drops NaN keys): record them and give each one the default label
# instead of overwriting the predictions of a whole target.
incomplete_rows = X_test['ID_TARGET'].isna() | X_test['ID_DAY'].isna()
for idx, target, day in zip(X_test.index[incomplete_rows],
                            X_test.loc[incomplete_rows, 'ID_TARGET'],
                            X_test.loc[incomplete_rows, 'ID_DAY']):
    print(f"Warning: Test row {idx} has a missing key (ID_TARGET {target}, ID_DAY {day}). Using default prediction.")
    missing_day_targets.append((target, day))
    final_predictions[idx] = -1

# .groups maps each (target, day) key to the index labels of its rows.
day_groups = X_test.groupby(['ID_TARGET', 'ID_DAY'], sort=False).groups
for (target, day), day_test_ids in day_groups.items():
    target_str = str(target)
    # A target or model without stored predictions falls back to a neutral 0.5.
    target_predictions = predictions.get(target_str, {})
    lgbm_lookup = target_predictions.get('LightGBM', {})
    xgb_lookup = target_predictions.get('XGBoost', {})
    lgbm_preds = np.array([lgbm_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
    xgb_preds = np.array([xgb_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
    ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
    # Apply exponential moving average for smoothing; the last smoothed value
    # decides the label shared by every row of the group (a group is never empty).
    smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1]
    group_label = 1 if smoothed_preds >= 0.5 else -1
    for idx in day_test_ids:
        final_predictions[idx] = group_label

if missing_day_targets:
    print(f"Missing day-target pairs: {missing_day_targets}")

# Create output CSV
# One row per predicted ID, sorted by ID; any ID of X_test that received no
# prediction is appended with the default label -1 before writing.
output_df = (
    pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET'])
    .reset_index()
    .set_axis(['ID', 'RET_TARGET'], axis=1)
    .sort_values('ID')
)
expected_ids = set(X_test.index)
submission_ids = set(output_df['ID'])
if expected_ids != submission_ids:
    missing_ids = expected_ids.difference(submission_ids)
    print(f"Warning: Submission missing {len(missing_ids)} IDs: {missing_ids}")
    missing_df = pd.DataFrame({'ID': list(missing_ids), 'RET_TARGET': -1})
    output_df = pd.concat([output_df, missing_df], ignore_index=True).sort_values('ID')
output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Validate output
# Re-read the file that was just written and print a few sanity figures.
test_csv = pd.read_csv(output_path)
print(f"Output CSV shape: {test_csv.shape}", flush=True)
lowest_id, highest_id = test_csv['ID'].min(), test_csv['ID'].max()
print(f"Output CSV ID range: {lowest_id} to {highest_id}", flush=True)
distinct_labels = test_csv['RET_TARGET'].unique()
print(f"Unique RET_TARGET values: {distinct_labels}", flush=True)
missing_cell_count = test_csv.isna().sum().sum()
print(f"Missing values: {missing_cell_count}", flush=True)

# Download the file
files.download(output_path)

# Print final results
# One line per model with the score stored in `results`, four decimals.
print("\nValidation Results:", flush=True)
for model_name in results:
    print(f"{model_name}: {results[model_name]:.4f}", flush=True)

XGBoost version: 2.1.4
Selected top return columns: ['RET_262', 'RET_88', 'RET_172', 'RET_259', 'RET_118', 'RET_150', 'RET_261', 'RET_268', 'RET_115', 'RET_216', 'RET_238', 'RET_59', 'RET_30', 'RET_97', 'RET_122']

Training LightGBM...


LightGBM Targets: 100%|██████████| 101/101 [1:24:34<00:00, 50.25s/it]


LightGBM Weighted Accuracy: 0.8597

LightGBM Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'bagging_fraction': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.003, 'min_child_samples': 10, 'num_leaves': 15, 'reg_alpha': 0.0, 'reg_lambda': 0.0} | Mean Accuracy: 1.0000 | Fold Accuracies: [np.float64(0.9877300613496932), np.float64(1.0), np.float64(1.0), np.float64(0.9939393939393939), np.float64(0.988950276243094), np.float64(1.0), np.float64(1.0)]
ID_TARGET: 129.0 | Best Params: {'bagging_fraction': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.003, 'min_child_samples': 10, 'num_leaves': 15, 'reg_alpha': 0.0, 'reg_lambda': 0.0} | Mean Accuracy: 0.8942 | Fold Accuracies: [np.float64(0.73125), np.float64(0.7345679012345679), np.float64(0.7169811320754716), np.float64(0.8430232558139535), np.float64(0.7407407407407407), np.float64(0.7672955974842768), np.float64(0.7670454545454546)]
ID_TARGET: 136.0 | Best Params: {'bagging_fraction': 0.6, 'colsample_bytree': 0.6, 'learning_rate'

XGBoost Targets: 100%|██████████| 101/101 [2:02:18<00:00, 72.66s/it]


XGBoost Weighted Accuracy: 0.9945

XGBoost Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'gamma': 0.0, 'learning_rate': 0.003, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'subsample': 0.6} | Mean Accuracy: 1.0000 | Fold Accuracies: [np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0)]
ID_TARGET: 129.0 | Best Params: {'gamma': 0.0, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 250, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'subsample': 0.65} | Mean Accuracy: 1.0000 | Fold Accuracies: [np.float64(0.8125), np.float64(0.7407407407407407), np.float64(0.6792452830188679), np.float64(1.0), np.float64(0.8333333333333334), np.float64(1.0), np.float64(0.7329545454545454)]
ID_TARGET: 136.0 | Best Params: {'gamma': 0.0, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 250, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'subsample

LightGBM Targets: 100%|██████████| 101/101 [01:14<00:00,  1.36it/s]


Second-pass XGBoost...



XGBoost Targets: 100%|██████████| 101/101 [01:19<00:00,  1.27it/s]


Performing third-pass training for low-performing targets...

Third-pass LightGBM...



LightGBM Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 58848.65it/s]


Third-pass XGBoost...



XGBoost Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 59532.06it/s]


Performing fourth-pass training for very low-performing targets...

Fourth-pass LightGBM...



LightGBM Very Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 54990.87it/s]


Fourth-pass XGBoost...



XGBoost Very Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 123032.92it/s]


Saved hyperparameter log to: /content/hyperparameters.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved predictions to: /content/predictions_ensemble.csv
Output CSV shape: (114468, 2)
Output CSV ID range: 0 to 114467
Unique RET_TARGET values: [ 1 -1]
Missing values: 0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Validation Results:
LightGBM: 0.8597
XGBoost: 0.9945


Tried a new approach: 65%, overfit

In [None]:
import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid

# Check xgboost version for compatibility
# Report the installed version and warn when its major component is below 1.
print(f"XGBoost version: {xgboost.__version__}")
xgboost_major = int(xgboost.__version__.partition('.')[0])
if xgboost_major < 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Set up data directory
DATA_DIR = '/content'


def _read_id_indexed_csv(file_name):
    """Read a CSV from DATA_DIR as float32, using its ``ID`` column as index."""
    return pd.read_csv(os.path.join(DATA_DIR, file_name), index_col='ID', dtype=np.float32)


# Load datasets
# The three modelling frames share the same read options; the supplementary
# table is read with pandas defaults.
try:
    X_train = _read_id_indexed_csv('X_train_itDkypA.csv')
    y_train = _read_id_indexed_csv('y_train_3LeeT2g.csv')
    X_test = _read_id_indexed_csv('X_test_Beg4ey3.csv')
    supplementary = pd.read_csv(os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv'))
except FileNotFoundError as e:
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.")

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights):
    """Return the weight-share of samples whose direction was predicted correctly.

    The previous implementation ignored ``weights`` and compared
    ``np.sign(y_true)`` with predictions mapped to {-1, 1}.  With the 0/1
    labels used by the callers in this file, negative samples could never
    count as correct and carried zero weight, so the score collapsed to the
    recall of the positive class.

    Args:
        y_true: Array-like of true values.  Either class labels (1 = up,
            anything <= 0 = down) or raw signed returns; a value counts as
            "up" when it is strictly positive.
        y_pred: Array-like of predicted labels; ``1`` means "up", any other
            value means "down".
        weights: Array-like of per-sample weights (absolute values are used).
            ``None`` falls back to ``|y_true|``, the weighting the old code
            applied.

    Returns:
        float: ``sum(w * correct) / sum(w)``, or ``0.0`` when the total
        weight is zero.
    """
    true_values = np.asarray(y_true, dtype=float)
    true_up = true_values > 0
    pred_up = np.asarray(y_pred) == 1

    if weights is None:
        sample_weights = np.abs(true_values)
    else:
        sample_weights = np.abs(np.asarray(weights, dtype=float))

    total_weight = sample_weights.sum()
    if total_weight == 0:
        # Nothing to score; avoid a 0/0 division.
        return 0.0
    return float(np.sum(sample_weights * (true_up == pred_up)) / total_weight)

# Preprocess data with enhanced feature selection
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Build the model feature matrix from the raw return columns.

    Steps: choose the ``RET_*`` columns to work with, impute their missing
    values (only when ``supplementary`` is given), add sector,
    cross-sectional, correlation and PCA features, then standardise every
    feature column.

    Fixes over the previous version:
      * the row index of ``X`` is restored after the sector merge (the merge
        used to replace it with a fresh RangeIndex, so downstream code lost
        the original IDs and the sector averages were misaligned);
      * the selected ``top_cols`` are now applied on the training path too,
        so train and test derive their features from the same columns;
      * ``is_train=True`` without ``y`` raises a clear ``ValueError``.

    Args:
        X: DataFrame with ``ID_DAY``, ``ID_TARGET`` and ``RET_*`` columns.
            Imputed values are written into this frame in place.
        y: Frame with a ``RET_TARGET`` column; required when ``is_train`` is
            True.  A ``SIGN_TARGET`` column (1 if ``RET_TARGET > 0`` else 0)
            is added to it in place.
        supplementary: Optional frame with ``ID_asset`` and ``CLASS_LEVEL_1``
            columns, used for imputation and sector features.
        is_train: Whether ``X`` is the training set.
        top_cols: Return columns to restrict to.  When None on the training
            path, up to 15 columns are selected by variance * summed absolute
            correlation.

    Returns:
        tuple: ``(X, y, feature_cols, top_cols)``; ``y`` is None when
        ``is_train`` is False.

    Raises:
        ValueError: If ``is_train`` is True and ``y`` is None, or if no
            return column remains after applying ``top_cols``.
    """
    if is_train and y is None:
        raise ValueError("y is required when is_train=True.")

    ret_cols = [col for col in X.columns if col.startswith('RET_')]

    # Dynamic feature selection with minimum variance threshold
    if is_train and top_cols is None:
        if len(ret_cols) > 15:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            valid_cols = variances[variances > 1e-5].index
            if len(valid_cols) < 15:
                top_cols = valid_cols.tolist()
            else:
                top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(15).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")

    # Restrict to the selected columns on BOTH paths so that every derived
    # feature is computed from the same inputs for train and test.
    if top_cols is not None:
        selected = set(top_cols)
        ret_cols = [col for col in ret_cols if col in selected]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    # Impute missing returns: row-wise median of the asset's sector when the
    # sector is known and has columns in X, otherwise the column median.
    if supplementary is not None:
        sector_map = supplementary.set_index('ID_asset')['CLASS_LEVEL_1']
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            fill_values = None
            if pd.notna(sector):
                sector_assets = supplementary[supplementary['CLASS_LEVEL_1'] == sector]['ID_asset']
                sector_cols = [f'RET_{a}' for a in sector_assets if f'RET_{a}' in X.columns]
                if sector_cols:
                    fill_values = X[sector_cols].median(axis=1)
            if fill_values is None:
                fill_values = X[col].median()
            X[col] = X[col].fillna(fill_values)

    # Sector-based features
    new_cols = {}
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        sector_groups = supplementary.groupby(level)['ID_asset'].apply(list).to_dict()
        for sector, assets in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in assets if f'RET_{asset}' in ret_cols]
            if asset_cols:
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)

        # Attach the sector of the target asset.  One row per asset on the
        # right side guarantees the left merge keeps the row count and order,
        # which makes it safe to put the original index back afterwards.
        target_sector = (supplementary[['ID_asset', level]]
                         .drop_duplicates('ID_asset')
                         .rename(columns={'ID_asset': 'ID_TARGET', level: f'TARGET_{level}'}))
        row_index = X.index
        X = X.merge(target_sector, on='ID_TARGET', how='left')
        X.index = row_index
        X = pd.concat([X, pd.get_dummies(X[f'TARGET_{level}'], prefix=f'TARGET_{level}', dummy_na=True)], axis=1)
        X = X.drop(f'TARGET_{level}', axis=1)

    # Cross-sectional features
    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    new_cols.update({f'CS_RANK_{col}': X.groupby('ID_DAY')[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0) for col in ret_cols})

    # Correlation-based features (top 5 pairs)
    # NOTE(review): these are only built for the training frame, so a caller
    # that keeps just the columns shared by train and test will drop them.
    if is_train:
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:5]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    # PCA features (2 components)
    # NOTE(review): PCA (and the scaler below) are fitted on whichever frame
    # is passed in, i.e. separately for train and test.
    if ret_cols:
        pca = PCA(n_components=min(2, len(ret_cols)), random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        for i in range(pca_features.shape[1]):
            new_cols[f'PCA_{i}'] = pca_features[:, i]

    # Add new features to DataFrame (Series entries align on the row index)
    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)

    # Standardize numerical features
    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))

    if is_train:
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Preprocess data
# Run preprocessing on train first (which selects top_cols), then on test with
# the same top_cols; a ValueError is reported and re-raised.
try:
    X_train, y_train, feature_cols, top_cols = preprocess_data(
        X_train, y=y_train, supplementary=supplementary, is_train=True)
    X_test, _, test_feature_cols, _ = preprocess_data(
        X_test, None, supplementary, False, top_cols)
except ValueError as e:
    print(f"Preprocessing error: {e}")
    raise

# Align columns
# Keep only the columns present in both frames; everything except the two ID
# columns is a model feature.
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]
non_feature_cols = ('ID_DAY', 'ID_TARGET')
feature_cols = [col for col in common_cols if col not in non_feature_cols]

# Align y_train
y_train = y_train.loc[X_train.index]

# Define models with further expanded hyperparameter grids
# Model registry: for each model name, the base estimator, the list of
# single-point hyperparameter grids tried per target, and its early-stopping
# callbacks.
#
# The LightGBM base estimator uses the sklearn names reg_alpha / reg_lambda.
# It previously set lambda_l1 / lambda_l2, the main names of the same
# parameters; LightGBM prefers the main name when both are present, so the
# reg_alpha / reg_lambda values of the grid were silently ignored.
models = {
    'LightGBM': {
        'model': LGBMClassifier(num_leaves=31, min_child_samples=15, reg_alpha=0.3, reg_lambda=0.3,
                                subsample=0.7, colsample_bytree=0.8, bagging_freq=5, bagging_fraction=0.7,
                                random_state=42, n_jobs=1, verbosity=-1, class_weight='balanced', force_row_wise=True),
        'param_grid': [
            {'learning_rate': [0.001], 'num_leaves': [15], 'min_child_samples': [10], 'bagging_fraction': [0.6], 'reg_alpha': [0.0], 'reg_lambda': [0.0], 'colsample_bytree': [0.6], 'min_gain_to_split': [0.0]},
            {'learning_rate': [0.003], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.1], 'reg_lambda': [0.1], 'colsample_bytree': [0.7], 'min_gain_to_split': [0.1]},
            {'learning_rate': [0.005], 'num_leaves': [47], 'min_child_samples': [20], 'bagging_fraction': [0.8], 'reg_alpha': [0.2], 'reg_lambda': [0.2], 'colsample_bytree': [0.8], 'min_gain_to_split': [0.0]},
            {'learning_rate': [0.01], 'num_leaves': [63], 'min_child_samples': [12], 'bagging_fraction': [0.65], 'reg_alpha': [0.3], 'reg_lambda': [0.3], 'colsample_bytree': [0.9], 'min_gain_to_split': [0.1]},
            {'learning_rate': [0.03], 'num_leaves': [79], 'min_child_samples': [30], 'bagging_fraction': [0.75], 'reg_alpha': [0.4], 'reg_lambda': [0.4], 'colsample_bytree': [0.6], 'min_gain_to_split': [0.2]},
            {'learning_rate': [0.05], 'num_leaves': [15], 'min_child_samples': [10], 'bagging_fraction': [0.6], 'reg_alpha': [0.0], 'reg_lambda': [0.1], 'colsample_bytree': [0.7], 'min_gain_to_split': [0.0]},
            {'learning_rate': [0.07], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.2], 'reg_lambda': [0.0], 'colsample_bytree': [0.8], 'min_gain_to_split': [0.1]},
            {'learning_rate': [0.1], 'num_leaves': [47], 'min_child_samples': [20], 'bagging_fraction': [0.8], 'reg_alpha': [0.1], 'reg_lambda': [0.3], 'colsample_bytree': [0.9], 'min_gain_to_split': [0.2]},
            {'learning_rate': [0.15], 'num_leaves': [63], 'min_child_samples': [12], 'bagging_fraction': [0.65], 'reg_alpha': [0.3], 'reg_lambda': [0.2], 'colsample_bytree': [0.6], 'min_gain_to_split': [0.0]},
            {'learning_rate': [0.2], 'num_leaves': [79], 'min_child_samples': [30], 'bagging_fraction': [0.75], 'reg_alpha': [0.4], 'reg_lambda': [0.1], 'colsample_bytree': [0.7], 'min_gain_to_split': [0.1]},
            {'learning_rate': [0.001], 'num_leaves': [15], 'min_child_samples': [10], 'bagging_fraction': [0.6], 'reg_alpha': [0.5], 'reg_lambda': [0.5], 'colsample_bytree': [0.8], 'min_gain_to_split': [0.2]},
            {'learning_rate': [0.003], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.0], 'reg_lambda': [0.0], 'colsample_bytree': [0.9], 'min_gain_to_split': [0.0]},
            {'learning_rate': [0.005], 'num_leaves': [47], 'min_child_samples': [20], 'bagging_fraction': [0.8], 'reg_alpha': [0.2], 'reg_lambda': [0.2], 'colsample_bytree': [0.6], 'min_gain_to_split': [0.1]},
            {'learning_rate': [0.01], 'num_leaves': [63], 'min_child_samples': [12], 'bagging_fraction': [0.65], 'reg_alpha': [0.1], 'reg_lambda': [0.3], 'colsample_bytree': [0.7], 'min_gain_to_split': [0.2]},
            {'learning_rate': [0.03], 'num_leaves': [79], 'min_child_samples': [30], 'bagging_fraction': [0.75], 'reg_alpha': [0.3], 'reg_lambda': [0.4], 'colsample_bytree': [0.8], 'min_gain_to_split': [0.0]},
            {'learning_rate': [0.05], 'num_leaves': [15], 'min_child_samples': [10], 'bagging_fraction': [0.6], 'reg_alpha': [0.4], 'reg_lambda': [0.1], 'colsample_bytree': [0.9], 'min_gain_to_split': [0.1]},
            {'learning_rate': [0.07], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.5], 'reg_lambda': [0.5], 'colsample_bytree': [0.6], 'min_gain_to_split': [0.2]},
            {'learning_rate': [0.1], 'num_leaves': [47], 'min_child_samples': [20], 'bagging_fraction': [0.8], 'reg_alpha': [0.0], 'reg_lambda': [0.0], 'colsample_bytree': [0.7], 'min_gain_to_split': [0.0]},
            {'learning_rate': [0.15], 'num_leaves': [63], 'min_child_samples': [12], 'bagging_fraction': [0.65], 'reg_alpha': [0.2], 'reg_lambda': [0.2], 'colsample_bytree': [0.8], 'min_gain_to_split': [0.1]},
            {'learning_rate': [0.2], 'num_leaves': [79], 'min_child_samples': [30], 'bagging_fraction': [0.75], 'reg_alpha': [0.1], 'reg_lambda': [0.3], 'colsample_bytree': [0.9], 'min_gain_to_split': [0.2]},
            {'learning_rate': [0.001], 'num_leaves': [15], 'min_child_samples': [10], 'bagging_fraction': [0.6], 'reg_alpha': [0.3], 'reg_lambda': [0.4], 'colsample_bytree': [0.6], 'min_gain_to_split': [0.0]},
            {'learning_rate': [0.003], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.4], 'reg_lambda': [0.1], 'colsample_bytree': [0.7], 'min_gain_to_split': [0.1]},
            {'learning_rate': [0.005], 'num_leaves': [47], 'min_child_samples': [20], 'bagging_fraction': [0.8], 'reg_alpha': [0.5], 'reg_lambda': [0.5], 'colsample_bytree': [0.8], 'min_gain_to_split': [0.2]},
            {'learning_rate': [0.01], 'num_leaves': [63], 'min_child_samples': [12], 'bagging_fraction': [0.65], 'reg_alpha': [0.0], 'reg_lambda': [0.0], 'colsample_bytree': [0.9], 'min_gain_to_split': [0.0]}
        ],
        'callbacks': [lightgbm.early_stopping(stopping_rounds=15, verbose=False)]
    },
    'XGBoost': {
        'model': XGBClassifier(max_depth=4, min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
                               random_state=42, n_jobs=1, eval_metric='logloss', verbosity=0,
                               scale_pos_weight=len(y_train[y_train['SIGN_TARGET'] == 0]) / len(y_train[y_train['SIGN_TARGET'] == 1])),
        'param_grid': [
            {'learning_rate': [0.001], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [100], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0], 'colsample_bylevel': [0.6]},
            {'learning_rate': [0.003], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1], 'colsample_bylevel': [0.7]},
            {'learning_rate': [0.005], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [200], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2], 'colsample_bylevel': [0.8]},
            {'learning_rate': [0.01], 'max_depth': [6], 'min_child_weight': [4], 'subsample': [0.65], 'n_estimators': [250], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0], 'colsample_bylevel': [0.9]},
            {'learning_rate': [0.03], 'max_depth': [7], 'min_child_weight': [5], 'subsample': [0.75], 'n_estimators': [300], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1], 'colsample_bylevel': [0.6]},
            {'learning_rate': [0.05], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [100], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2], 'colsample_bylevel': [0.7]},
            {'learning_rate': [0.07], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0], 'colsample_bylevel': [0.8]},
            {'learning_rate': [0.1], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [200], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1], 'colsample_bylevel': [0.9]},
            {'learning_rate': [0.15], 'max_depth': [6], 'min_child_weight': [4], 'subsample': [0.65], 'n_estimators': [250], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2], 'colsample_bylevel': [0.6]},
            {'learning_rate': [0.2], 'max_depth': [7], 'min_child_weight': [5], 'subsample': [0.75], 'n_estimators': [300], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0], 'colsample_bylevel': [0.7]},
            {'learning_rate': [0.001], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [100], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1], 'colsample_bylevel': [0.8]},
            {'learning_rate': [0.003], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2], 'colsample_bylevel': [0.9]},
            {'learning_rate': [0.005], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [200], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0], 'colsample_bylevel': [0.6]},
            {'learning_rate': [0.01], 'max_depth': [6], 'min_child_weight': [4], 'subsample': [0.65], 'n_estimators': [250], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1], 'colsample_bylevel': [0.7]},
            {'learning_rate': [0.03], 'max_depth': [7], 'min_child_weight': [5], 'subsample': [0.75], 'n_estimators': [300], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2], 'colsample_bylevel': [0.8]},
            {'learning_rate': [0.05], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [100], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0], 'colsample_bylevel': [0.9]},
            {'learning_rate': [0.07], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1], 'colsample_bylevel': [0.6]},
            {'learning_rate': [0.1], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [200], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2], 'colsample_bylevel': [0.7]},
            {'learning_rate': [0.15], 'max_depth': [6], 'min_child_weight': [4], 'subsample': [0.65], 'n_estimators': [250], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0], 'colsample_bylevel': [0.8]},
            {'learning_rate': [0.2], 'max_depth': [7], 'min_child_weight': [5], 'subsample': [0.75], 'n_estimators': [300], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1], 'colsample_bylevel': [0.9]},
            {'learning_rate': [0.001], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [100], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2], 'colsample_bylevel': [0.6]},
            {'learning_rate': [0.003], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [0.0], 'reg_lambda': [1.0], 'gamma': [0.0], 'colsample_bylevel': [0.7]},
            {'learning_rate': [0.005], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [200], 'reg_alpha': [0.5], 'reg_lambda': [1.5], 'gamma': [0.1], 'colsample_bylevel': [0.8]},
            {'learning_rate': [0.01], 'max_depth': [6], 'min_child_weight': [4], 'subsample': [0.65], 'n_estimators': [250], 'reg_alpha': [1.0], 'reg_lambda': [2.0], 'gamma': [0.2], 'colsample_bylevel': [0.9]}
        ],
        # NOTE(review): the training code in this cell only tests this list for
        # truthiness and never passes it to XGBClassifier, so confirm whether
        # early stopping is actually active for XGBoost.
        'callbacks': [xgboost.callback.EarlyStopping(rounds=15, metric_name='logloss', save_best=True)]
    }
}

# Function to train and predict for a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks):
    """Grid-search, refit and predict for the rows of a single ID_TARGET.

    Every parameter combination of ``param_grid`` is scored with the splits
    produced by ``kf`` (grouped by ``ID_DAY``) using ``weighted_accuracy``.
    The best-scoring combination is refitted on all rows of the target and
    used to predict the positive-class probability of the target's test rows.

    Args:
        target: Value of ``ID_TARGET`` to train for.
        X_train, y_train: Training features and targets (``SIGN_TARGET`` and
            ``RET_TARGET`` columns are read from ``y_train``).
        X_test: Test features.
        feature_cols: Columns used as model inputs.
        model: Estimator whose parameters are overwritten during the search.
        model_name: ``'LightGBM'`` or ``'XGBoost'``; selects the fit call.
        kf: Splitter exposing ``split(X, y, groups)``.
        param_grid: Grid specification accepted by ``ParameterGrid``.
        callbacks: Callback list; when falsy the plain ``fit`` call is used.

    Returns:
        tuple: ``(str(target), best_params, best_score, param_results,
        test_ids, preds, top_features)``.  When the target is skipped or no
        combination scores above 0, the tuple is
        ``(str(target), {}, 0, [], test_ids, zeros, [])``.
    """

    def fit_estimator(estimator, features, labels, sample_weights, eval_features, eval_labels):
        # Single place for the three fit variants used by this function.
        if model_name == 'LightGBM' and callbacks:
            estimator.fit(features, labels, sample_weight=sample_weights,
                          eval_set=[(eval_features, eval_labels)], eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            estimator.fit(features, labels, sample_weight=sample_weights,
                          eval_set=[(eval_features, eval_labels)], verbose=False)
        else:
            estimator.fit(features, labels, sample_weight=sample_weights)

    train_rows = X_train['ID_TARGET'] == target
    test_rows = X_test['ID_TARGET'] == target

    X_target = X_train[train_rows][feature_cols]
    y_target = y_train.loc[train_rows, 'SIGN_TARGET']
    weights = y_train.loc[train_rows, 'RET_TARGET'].abs()
    low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]
    # Weight scaling: doubled for the listed targets, and a further 1.5x
    # boost for targets with fewer than 500 training rows.
    weight_scale = 2.0 if target in low_performing_targets else 1.0
    if len(X_target) < 500:
        weight_scale *= 1.5
    weights = weights * weight_scale
    groups = X_train[train_rows]['ID_DAY']
    X_test_target = X_test[test_rows][feature_cols]
    test_ids = X_test[test_rows].index

    # Bail out when there is too little to train on or nothing to predict.
    if len(X_target) < 100 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    param_results = []
    best_params = None
    best_score = 0
    best_model = None

    # First-pass grid search with sample weights
    for candidate in ParameterGrid(param_grid):
        print(f"Testing params for {model_name} ID_TARGET {target}: {candidate}", flush=True)
        try:
            model.set_params(**candidate)
        except ValueError as e:
            print(f"Invalid params for {model_name} ID_TARGET {target}: {e}")
            continue

        fold_accuracies = []
        for fit_idx, holdout_idx in kf.split(X_target, y_target, groups):
            X_fit, X_holdout = X_target.iloc[fit_idx], X_target.iloc[holdout_idx]
            y_fit, y_holdout = y_target.iloc[fit_idx], y_target.iloc[holdout_idx]
            w_fit, w_holdout = weights.iloc[fit_idx], weights.iloc[holdout_idx]

            fit_estimator(model, X_fit, y_fit, w_fit, X_holdout, y_holdout)
            holdout_pred = model.predict(X_holdout)
            fold_accuracies.append(weighted_accuracy(y_holdout, holdout_pred, w_holdout))

        mean_acc = np.mean(fold_accuracies)
        print(f"{model_name} | ID_TARGET: {target} | Params: {candidate} | Mean Accuracy: {mean_acc:.4f} | Fold Accuracies: {fold_accuracies}", flush=True)
        param_results.append({
            'params': candidate,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies
        })
        # Strict comparison: the first combination reaching a score wins ties.
        if mean_acc > best_score:
            best_score = mean_acc
            best_params = candidate
            # Fresh, unfitted estimator carrying the current parameter set.
            best_model = type(model)(**model.get_params())

    # Train with best parameters
    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    best_model.set_params(**best_params)
    # The refit passes the training rows themselves as the evaluation set.
    fit_estimator(best_model, X_target, y_target, weights, X_target, y_target)

    # Predict
    preds = best_model.predict_proba(X_test_target)[:, 1].astype(np.float32)

    # Feature importance: keep the 30 highest-importance features.
    feature_importance = dict(zip(feature_cols, best_model.feature_importances_))
    ranked_features = sorted(feature_importance.items(), key=lambda pair: pair[1], reverse=True)
    top_features = ranked_features[:30]

    # Clean up memory
    gc.collect()

    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training
# First-pass training: run train_target for every ID_TARGET in parallel, once
# per model, then collect parameters, scores, predictions and top features.
kf = GroupKFold(n_splits=7)  # Increased to 7 folds
results = {}
predictions = {}
ensemble_weights = {'LightGBM': 0.7, 'XGBoost': 0.3}
param_log = {}
top_features_all = {}
param_summary = []
low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]

for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}

    # First-pass training
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train, y_train, X_test, feature_cols, model_info['model'], model_name, kf,
                             model_info['param_grid'], model_info['callbacks'])
        for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets")
    )

    # Collect results
    accuracies = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))
        # Report the fold accuracies of the winning parameter set.  The last
        # grid entry was used before, which did not match the best params and
        # mean accuracy shown next to it.  Skipped targets have no entries.
        best_fold_accuracies = next(
            (entry['fold_accuracies'] for entry in param_results if entry['params'] == params),
            []
        )
        param_summary.append({
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_fold_accuracies
        })

    results[model_name] = np.mean([acc for acc in accuracies if acc > 0])  # Exclude skipped targets
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)

    # Print parameter summary
    print(f"\n{model_name} Parameter Summary:", flush=True)
    for summary in param_summary:
        if summary['model'] == model_name:
            print(f"ID_TARGET: {summary['ID_TARGET']} | Best Params: {summary['best_params']} | Mean Accuracy: {summary['mean_accuracy']:.4f} | Fold Accuracies: {summary['fold_accuracies']}", flush=True)

# Second-pass training with top 30 features
# Refit every model per target on the features ranked highest in the first
# pass, with the logged best parameters and n_estimators forced to 300, and
# overwrite the stored test predictions.
print("\nPerforming second-pass training with top 30 features...", flush=True)
for model_name, model_info in models.items():
    print(f"\nSecond-pass {model_name}...", flush=True)
    for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [name for name, _ in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue

        train_rows = X_train['ID_TARGET'] == target
        test_rows = X_test['ID_TARGET'] == target
        X_target = X_train[train_rows][top_features]
        y_target = y_train.loc[train_rows, 'SIGN_TARGET']
        weights = y_train.loc[train_rows, 'RET_TARGET'].abs()
        # Same weight scaling as the first pass.
        weight_scale = 2.0 if target in low_performing_targets else 1.0
        if len(X_target) < 500:
            weight_scale *= 1.5
        weights = weights * weight_scale
        X_test_target = X_test[test_rows][top_features]
        test_ids = X_test[test_rows].index

        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue

        model = model_info['model']
        # The logged parameter dict itself is updated, so the log records 300 too.
        best_params = param_log[model_name][target_str]['best_params']
        best_params.update(n_estimators=300)  # Use integer instead of list
        model.set_params(**best_params)

        # Assemble the keyword arguments of the model-specific fit call.
        fit_kwargs = {'sample_weight': weights}
        if model_name == 'LightGBM' and model_info['callbacks']:
            fit_kwargs.update(eval_set=[(X_target, y_target)], eval_metric='logloss',
                              callbacks=model_info['callbacks'])
        elif model_name == 'XGBoost' and model_info['callbacks']:
            fit_kwargs.update(eval_set=[(X_target, y_target)], verbose=False)
        model.fit(X_target, y_target, **fit_kwargs)

        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Third-pass training for low-performing targets
# Re-tunes learning rate / tree count / L1-L2 regularisation around the logged
# best parameters for targets whose logged accuracy is below 0.75, then refits
# on all rows of the target and overwrites its test-set probabilities.
print("\nPerforming third-pass training for low-performing targets...", flush=True)
# (learning-rate multiplier, n_estimators, amount added to reg_alpha and reg_lambda)
third_pass_grid_spec = [
    (0.2, 400, 0.4), (0.3, 450, 0.3), (0.5, 500, 0.2), (0.7, 550, 0.1),
    (1.0, 400, 0.0), (1.2, 450, 0.15), (1.5, 500, 0.25), (1.7, 550, 0.3),
    (2.0, 600, 0.2), (0.4, 650, 0.1), (0.8, 700, 0.15), (1.3, 450, 0.3),
]
for model_name, model_info in models.items():
    print(f"\nThird-pass {model_name}...", flush=True)
    model = model_info['model']
    # Extra fit() keyword arguments depend only on the model family.
    if model_name == 'LightGBM' and model_info['callbacks']:
        eval_kwargs = {'eval_metric': 'logloss', 'callbacks': model_info['callbacks']}
    elif model_name == 'XGBoost' and model_info['callbacks']:
        eval_kwargs = {'verbose': False}
    else:
        eval_kwargs = None

    for target in tqdm(low_performing_targets, desc=f"{model_name} Low-Performing Targets"):
        # The logged keys print as e.g. '139.0' (float ID_TARGET values), while
        # str() of an integer target gives '139'. Accept either spelling so the
        # target is not silently skipped because of a key mismatch.
        target_str = next(
            (key for key in (str(target), str(np.float32(target))) if key in param_log[model_name]),
            None,
        )
        if target_str is None or param_log[model_name][target_str]['mean_accuracy'] >= 0.75:
            continue  # Skip if unknown or already performing well
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue

        train_mask = X_train['ID_TARGET'] == target
        test_mask = X_test['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        # Signed returns are needed by weighted_accuracy (it compares signs and
        # weights by magnitude); the 0/1 SIGN_TARGET labels cannot be used there.
        ret_target = y_train.loc[train_mask, 'RET_TARGET']
        weights = ret_target.abs() * 3.0  # Increase weight
        if len(X_target) < 500:
            weights = weights * 1.5
        # Cross-validation groups must line up row-for-row with X_target.
        groups = X_train.loc[train_mask, 'ID_DAY']
        X_test_target = X_test.loc[test_mask, top_features]
        test_ids = X_test_target.index

        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue

        best_params = param_log[model_name][target_str]['best_params']
        # Logged values may be plain scalars or one-element lists; np.ravel
        # handles both.
        base_lr = float(np.ravel(best_params['learning_rate'])[0])
        base_alpha = float(np.ravel(best_params['reg_alpha'])[0])
        base_lambda = float(np.ravel(best_params['reg_lambda'])[0])
        # Each candidate is a complete parameter dict with scalar values, so it
        # is iterated directly (ParameterGrid requires list-valued entries).
        fine_tune_grid = [
            {'learning_rate': base_lr * lr_mult, 'n_estimators': n_estimators,
             'reg_alpha': base_alpha + reg_add, 'reg_lambda': base_lambda + reg_add}
            for lr_mult, n_estimators, reg_add in third_pass_grid_spec
        ]
        best_score = param_log[model_name][target_str]['mean_accuracy']
        best_fine_params = best_params

        for params in fine_tune_grid:
            print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
            model.set_params(**params)
            fold_accuracies = []
            for train_idx, val_idx in kf.split(X_target, y_target, groups):
                X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
                y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
                w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
                if eval_kwargs is None:
                    model.fit(X_tr, y_tr, sample_weight=w_tr)
                else:
                    model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_val, y_val)], **eval_kwargs)
                y_pred = model.predict(X_val)
                acc = weighted_accuracy(ret_target.iloc[val_idx], y_pred, w_val)
                fold_accuracies.append(acc)
            mean_acc = np.mean(fold_accuracies)
            print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
            if mean_acc > best_score:
                best_score = mean_acc
                best_fine_params = params
                param_log[model_name][target_str]['best_params'] = best_fine_params
                param_log[model_name][target_str]['mean_accuracy'] = best_score
                param_summary.append({
                    'model': model_name,
                    'ID_TARGET': target_str,
                    'best_params': best_fine_params,
                    'mean_accuracy': best_score,
                    'fold_accuracies': fold_accuracies
                })

        # Retrain with fine-tuned parameters on every row of the target.
        model.set_params(**best_fine_params)
        if eval_kwargs is None:
            model.fit(X_target, y_target, sample_weight=weights)
        else:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], **eval_kwargs)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Fourth-pass training for very low-performing targets
# Same procedure as the third pass, with a wider learning-rate / regularisation
# sweep, applied only to targets whose logged accuracy is still below 0.70.
print("\nPerforming fourth-pass training for very low-performing targets...", flush=True)
# (learning-rate multiplier, n_estimators, amount added to reg_alpha and reg_lambda)
fourth_pass_grid_spec = [
    (0.1, 400, 0.5), (0.2, 450, 0.4), (0.4, 500, 0.3), (0.6, 550, 0.2),
    (0.8, 600, 0.1), (1.0, 400, 0.0), (1.2, 450, 0.15), (1.5, 500, 0.25),
    (1.7, 550, 0.3), (2.0, 600, 0.2), (0.3, 650, 0.1), (0.5, 700, 0.15),
]
for model_name, model_info in models.items():
    print(f"\nFourth-pass {model_name}...", flush=True)
    model = model_info['model']
    # Extra fit() keyword arguments depend only on the model family.
    if model_name == 'LightGBM' and model_info['callbacks']:
        eval_kwargs = {'eval_metric': 'logloss', 'callbacks': model_info['callbacks']}
    elif model_name == 'XGBoost' and model_info['callbacks']:
        eval_kwargs = {'verbose': False}
    else:
        eval_kwargs = None

    for target in tqdm(low_performing_targets, desc=f"{model_name} Very Low-Performing Targets"):
        # The logged keys print as e.g. '139.0' (float ID_TARGET values), while
        # str() of an integer target gives '139'. Accept either spelling so the
        # target is not silently skipped because of a key mismatch.
        target_str = next(
            (key for key in (str(target), str(np.float32(target))) if key in param_log[model_name]),
            None,
        )
        if target_str is None or param_log[model_name][target_str]['mean_accuracy'] >= 0.70:
            continue  # Skip if unknown or accuracy >= 0.70
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue

        train_mask = X_train['ID_TARGET'] == target
        test_mask = X_test['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        # Signed returns are needed by weighted_accuracy (it compares signs and
        # weights by magnitude); the 0/1 SIGN_TARGET labels cannot be used there.
        ret_target = y_train.loc[train_mask, 'RET_TARGET']
        weights = ret_target.abs() * 3.0  # High weight
        if len(X_target) < 500:
            weights = weights * 1.5
        # Cross-validation groups must line up row-for-row with X_target.
        groups = X_train.loc[train_mask, 'ID_DAY']
        X_test_target = X_test.loc[test_mask, top_features]
        test_ids = X_test_target.index

        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue

        best_params = param_log[model_name][target_str]['best_params']
        # Logged values may be plain scalars or one-element lists; np.ravel
        # handles both.
        base_lr = float(np.ravel(best_params['learning_rate'])[0])
        base_alpha = float(np.ravel(best_params['reg_alpha'])[0])
        base_lambda = float(np.ravel(best_params['reg_lambda'])[0])
        # Each candidate is a complete parameter dict with scalar values, so it
        # is iterated directly (ParameterGrid requires list-valued entries).
        fine_tune_grid = [
            {'learning_rate': base_lr * lr_mult, 'n_estimators': n_estimators,
             'reg_alpha': base_alpha + reg_add, 'reg_lambda': base_lambda + reg_add}
            for lr_mult, n_estimators, reg_add in fourth_pass_grid_spec
        ]
        best_score = param_log[model_name][target_str]['mean_accuracy']
        best_fine_params = best_params

        for params in fine_tune_grid:
            print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
            model.set_params(**params)
            fold_accuracies = []
            for train_idx, val_idx in kf.split(X_target, y_target, groups):
                X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
                y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
                w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
                if eval_kwargs is None:
                    model.fit(X_tr, y_tr, sample_weight=w_tr)
                else:
                    model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_val, y_val)], **eval_kwargs)
                y_pred = model.predict(X_val)
                acc = weighted_accuracy(ret_target.iloc[val_idx], y_pred, w_val)
                fold_accuracies.append(acc)
            mean_acc = np.mean(fold_accuracies)
            print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
            if mean_acc > best_score:
                best_score = mean_acc
                best_fine_params = params
                param_log[model_name][target_str]['best_params'] = best_fine_params
                param_log[model_name][target_str]['mean_accuracy'] = best_score
                param_summary.append({
                    'model': model_name,
                    'ID_TARGET': target_str,
                    'best_params': best_fine_params,
                    'mean_accuracy': best_score,
                    'fold_accuracies': fold_accuracies
                })

        # Retrain with fine-tuned parameters on every row of the target.
        model.set_params(**best_fine_params)
        if eval_kwargs is None:
            model.fit(X_target, y_target, sample_weight=weights)
        else:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], **eval_kwargs)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Save parameter log
# Write the per-model / per-target hyperparameter log as indented JSON and
# hand the file to the Colab browser download helper.
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
with open(param_log_path, 'w') as log_file:
    json.dump(param_log, log_file, indent=4)
print(f"Saved hyperparameter log to: {param_log_path}", flush=True)
files.download(param_log_path)

# Ensemble predictions
# For every (ID_TARGET, ID_DAY) pair in the test set, blend the LightGBM and
# XGBoost probabilities with ensemble_weights, smooth them with an EWM, and
# assign one +1 / -1 label to all rows of that pair.
final_predictions = {}
missing_day_targets = []
for target in X_test['ID_TARGET'].unique():
    target_str = str(target)
    # Restrict to this target once; per-day selections then only scan its rows.
    target_days = X_test.loc[X_test['ID_TARGET'] == target, 'ID_DAY']
    test_ids = target_days.index
    # A target without stored predictions falls back to the neutral
    # probability 0.5 instead of raising KeyError.
    target_predictions = predictions.get(target_str, {})
    lgbm_lookup = target_predictions.get('LightGBM', {})
    xgb_lookup = target_predictions.get('XGBoost', {})
    for day in target_days.unique():
        day_test_ids = target_days.index[(target_days == day).to_numpy()]
        if len(day_test_ids) == 0:
            # Reached when the equality test matches nothing (e.g. a NaN day).
            print(f"Warning: No test samples for ID_TARGET {target} on ID_DAY {day}. Using default prediction.")
            missing_day_targets.append((target, day))
            # Only fill rows that have no prediction yet, so labels already
            # computed for other days of this target are not overwritten.
            for idx in test_ids:
                final_predictions.setdefault(idx, -1)
            continue
        lgbm_preds = np.array([lgbm_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        xgb_preds = np.array([xgb_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
        # Apply exponential moving average for smoothing; only the last
        # smoothed value is kept and shared by every row of this pair.
        smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1] if len(ensemble_preds) > 0 else 0.5
        for idx in day_test_ids:
            final_predictions[idx] = 1 if smoothed_preds >= 0.5 else -1

if missing_day_targets:
    print(f"Missing day-target pairs: {missing_day_targets}")

# Create output CSV
# Turn the {ID: label} mapping into a two-column frame ordered by ID.
output_df = (
    pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET'])
    .rename_axis('ID')
    .reset_index()
    .sort_values('ID')
)
# Every test row must appear in the submission; rows without a prediction
# receive the default label -1.
expected_ids = set(X_test.index)
submission_ids = set(output_df['ID'])
if submission_ids != expected_ids:
    missing_ids = expected_ids.difference(submission_ids)
    print(f"Warning: Submission missing {len(missing_ids)} IDs: {missing_ids}")
    missing_df = pd.DataFrame({'ID': list(missing_ids), 'RET_TARGET': -1})
    output_df = pd.concat([output_df, missing_df], ignore_index=True).sort_values('ID')
output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Validate output
# Read the written file back and print basic sanity figures about it.
test_csv = pd.read_csv(output_path)
id_column = test_csv['ID']
print(f"Output CSV shape: {test_csv.shape}", flush=True)
print(f"Output CSV ID range: {id_column.min()} to {id_column.max()}", flush=True)
print(f"Unique RET_TARGET values: {test_csv['RET_TARGET'].unique()}", flush=True)
print(f"Missing values: {test_csv.isna().sum().sum()}", flush=True)

# Download the file
files.download(output_path)

# Print final results
# One line per model with the score stored in results.
print("\nValidation Results:", flush=True)
for model_name in results:
    acc = results[model_name]
    print(f"{model_name}: {acc:.4f}", flush=True)

XGBoost version: 2.1.4
Selected top return columns: ['RET_262', 'RET_88', 'RET_172', 'RET_259', 'RET_118', 'RET_150', 'RET_261', 'RET_268', 'RET_115', 'RET_216', 'RET_238', 'RET_59', 'RET_30', 'RET_97', 'RET_122']

Training LightGBM...


LightGBM Targets: 100%|██████████| 101/101 [1:23:26<00:00, 49.57s/it]


LightGBM Weighted Accuracy: 0.9858

LightGBM Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'bagging_fraction': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.001, 'min_child_samples': 10, 'min_gain_to_split': 0.0, 'num_leaves': 15, 'reg_alpha': 0.0, 'reg_lambda': 0.0} | Mean Accuracy: 1.0000 | Fold Accuracies: [np.float64(0.8282208588957055), np.float64(1.0), np.float64(1.0), np.float64(0.9030303030303031), np.float64(1.0), np.float64(1.0), np.float64(0.9559748427672956)]
ID_TARGET: 129.0 | Best Params: {'bagging_fraction': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.001, 'min_child_samples': 10, 'min_gain_to_split': 0.0, 'num_leaves': 15, 'reg_alpha': 0.0, 'reg_lambda': 0.0} | Mean Accuracy: 1.0000 | Fold Accuracies: [np.float64(0.65), np.float64(0.6728395061728395), np.float64(0.5786163522012578), np.float64(0.7325581395348837), np.float64(0.6481481481481481), np.float64(0.6477987421383647), np.float64(0.6931818181818182)]
ID_TARGET: 136.0 | Best Params: {'bagging_fracti

XGBoost Targets: 100%|██████████| 101/101 [2:42:45<00:00, 96.69s/it]


XGBoost Weighted Accuracy: 1.0000

XGBoost Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'colsample_bylevel': 0.6, 'gamma': 0.0, 'learning_rate': 0.001, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'subsample': 0.6} | Mean Accuracy: 1.0000 | Fold Accuracies: [np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0)]
ID_TARGET: 129.0 | Best Params: {'colsample_bylevel': 0.6, 'gamma': 0.0, 'learning_rate': 0.001, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'subsample': 0.6} | Mean Accuracy: 1.0000 | Fold Accuracies: [np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0), np.float64(1.0)]
ID_TARGET: 136.0 | Best Params: {'colsample_bylevel': 0.6, 'gamma': 0.0, 'learning_rate': 0.001, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.0, 'reg_lambda':

LightGBM Targets: 100%|██████████| 101/101 [01:08<00:00,  1.48it/s]


Second-pass XGBoost...



XGBoost Targets: 100%|██████████| 101/101 [01:27<00:00,  1.16it/s]


Performing third-pass training for low-performing targets...

Third-pass LightGBM...



LightGBM Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 54215.45it/s]


Third-pass XGBoost...



XGBoost Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 54024.99it/s]


Performing fourth-pass training for very low-performing targets...

Fourth-pass LightGBM...



LightGBM Very Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 52488.45it/s]


Fourth-pass XGBoost...



XGBoost Very Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 125033.45it/s]


Saved hyperparameter log to: /content/hyperparameters.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved predictions to: /content/predictions_ensemble.csv
Output CSV shape: (114468, 2)
Output CSV ID range: 0 to 114467
Unique RET_TARGET values: [ 1 -1]
Missing values: 0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Validation Results:
LightGBM: 0.9858
XGBoost: 1.0000


New method

In [None]:
!pip install arch

Collecting arch
  Downloading arch-7.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading arch-7.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (985 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m985.3/985.3 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: arch
Successfully installed arch-7.2.0


In [None]:
import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid
from sklearn.feature_selection import RFE

# Check xgboost version for compatibility
print(f"XGBoost version: {xgboost.__version__}")
if int(xgboost.__version__.split('.')[0]) < 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Set up data directory
DATA_DIR = '/content'

# Load datasets with minimal memory
# The three challenge files are read as float32 with ID as the index; the
# supplementary table keeps ID_asset as int32 and CLASS_LEVEL_1 as a category.
try:
    X_train = pd.read_csv(os.path.join(DATA_DIR, 'X_train_itDkypA.csv'), index_col='ID', dtype=np.float32)
    y_train = pd.read_csv(os.path.join(DATA_DIR, 'y_train_3LeeT2g.csv'), index_col='ID', dtype=np.float32)
    X_test = pd.read_csv(os.path.join(DATA_DIR, 'X_test_Beg4ey3.csv'), index_col='ID', dtype=np.float32)
    supplementary = pd.read_csv(os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv'), dtype={'ID_asset': np.int32, 'CLASS_LEVEL_1': 'category'})
except FileNotFoundError as e:
    # Chain explicitly so the traceback marks the original error as the cause
    # of the friendlier message.
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.") from e

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights):
    """Return the share of total |y_true| whose sign was predicted correctly.

    Args:
        y_true: Signed target values; their sign is the label and their
            absolute value is the weight of each observation.
        y_pred: Predicted labels; entries equal to 1 count as +1, every other
            value counts as -1.
        weights: Accepted for call compatibility; not used in the computation.

    Returns:
        Sum of |y_true| over correctly signed predictions divided by the sum
        of |y_true| over all observations.
    """
    magnitudes = np.abs(y_true)
    predicted_sign = np.where(y_pred == 1, 1, -1)
    hits = (predicted_sign == np.sign(y_true)).astype(np.float32)
    return np.sum(magnitudes * hits) / np.sum(magnitudes)

# Preprocess data with reduced feature set
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Build the standardized feature matrix from the raw ``RET_*`` columns.

    The selected return columns drive every engineered feature (sector
    averages, cross-sectional mean/std/rank, a single PCA component, rolling
    momentum and volatility). The same column subset is used for training and
    test data so that identically named features mean the same thing in both.

    Args:
        X: Feature frame containing ``ID_DAY``, ``ID_TARGET`` and ``RET_*``
            columns. Missing values in the selected return columns are filled
            in place on this frame when ``supplementary`` is given.
        y: Target frame with a ``RET_TARGET`` column; required when
            ``is_train`` is true. A ``SIGN_TARGET`` column (1 if the return is
            positive, else 0) is added to it in place.
        supplementary: Optional frame with ``ID_asset`` and ``CLASS_LEVEL_1``
            columns used for sector-based imputation and features.
        is_train: Whether ``X`` is training data. Column selection (when
            ``top_cols`` is None) and the correlation-product features are
            only computed for training data.
        top_cols: Return columns to use. When None and ``is_train`` is true,
            up to 8 columns are selected by variance times summed absolute
            correlation.

    Returns:
        Tuple ``(X, y, feature_cols, top_cols)``; ``y`` is None when
        ``is_train`` is false. ``feature_cols`` lists every column except
        ``ID_DAY`` and ``ID_TARGET``.

    Raises:
        ValueError: If no return column remains after applying ``top_cols``.
    """
    ret_cols = [col for col in X.columns if col.startswith('RET_')]

    # Dynamic feature selection (top 8)
    if is_train and top_cols is None:
        if len(ret_cols) > 8:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            valid_cols = variances[variances > 1e-5].index
            top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(8).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")

    # Restrict the working columns for BOTH training and test data; otherwise
    # aggregates such as MEAN_RET or PCA_0 would be computed over different
    # column sets for train and test while sharing the same name.
    if top_cols is not None:
        selected = set(top_cols)
        ret_cols = [col for col in ret_cols if col in selected]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    # Impute missing returns: per-row median of the asset's sector when the
    # sector is known, otherwise the column median.
    sector_medians = {}
    if supplementary is not None:
        sector_map = supplementary.set_index('ID_asset')['CLASS_LEVEL_1']
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            if pd.notna(sector):
                sector_assets = supplementary[supplementary['CLASS_LEVEL_1'] == sector]['ID_asset']
                sector_cols = [f'RET_{a}' for a in sector_assets if f'RET_{a}' in X.columns]
                if sector_cols:
                    sector_medians[col] = X[sector_cols].median(axis=1)
            if col in sector_medians:
                X[col] = X[col].fillna(sector_medians[col])
            else:
                X[col] = X[col].fillna(X[col].median())
        gc.collect()

    # Engineered columns are collected here and attached in one concat.
    new_cols = {}

    # Sector-based features (top 5 categories)
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        top_sectors = supplementary[level].value_counts().head(5).index
        sector_groups = supplementary[supplementary[level].isin(top_sectors)].groupby(level, observed=True)['ID_asset'].apply(list).to_dict()
        for sector, assets in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in assets if f'RET_{asset}' in ret_cols]
            if asset_cols:
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)
        # A column-on-column merge discards the left index. Carry the row
        # labels through the merge as a temporary column and restore them, so
        # the ID index survives and the features built above stay aligned.
        index_name = X.index.name
        target_sector = supplementary[['ID_asset', level]].rename(
            columns={'ID_asset': 'ID_TARGET', level: f'TARGET_{level}'})
        X = (
            X.rename_axis('_ROW_LABEL')
            .reset_index()
            .merge(target_sector, on='ID_TARGET', how='left')
            .set_index('_ROW_LABEL')
            .rename_axis(index_name)
        )
        X = pd.concat([X, pd.get_dummies(X[f'TARGET_{level}'], prefix=f'TARGET_{level}', dummy_na=True)], axis=1)
        X = X.drop(f'TARGET_{level}', axis=1)
        gc.collect()

    # Cross-sectional features
    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    new_cols.update({f'CS_RANK_{col}': X.groupby('ID_DAY')[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0) for col in ret_cols})

    # Correlation-based features (top 3 pairs). Built for training data only,
    # so they exist in the output only when is_train is true.
    if is_train:
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        # i < j keeps one ordering of each pair and drops self-correlations.
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:3]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    # PCA features (1 component), fitted on the frame passed in.
    if ret_cols:
        pca = PCA(n_components=1, random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        new_cols['PCA_0'] = pca_features[:, 0]
        gc.collect()

    # Temporal features: rolling mean (7 rows) and rolling std (3 rows) within
    # each ID_TARGET, taken in the frame's existing row order.
    for col in ret_cols:
        new_cols[f'MOMENTUM_7D_{col}'] = X[col].groupby(X['ID_TARGET']).rolling(7, min_periods=1).mean().reset_index(level=0, drop=True)
        new_cols[f'VOLATILITY_3D_{col}'] = X[col].groupby(X['ID_TARGET']).rolling(3, min_periods=1).std().reset_index(level=0, drop=True)
        gc.collect()

    # Add new features to DataFrame (aligned on the row index).
    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)
    del new_cols_df
    gc.collect()

    # Standardize numerical features; infinities and NaNs become 0 first.
    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))
    gc.collect()

    if is_train:
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Preprocess data
# Training data is processed first so that the return columns it selects can
# be handed to the test-set call.
try:
    train_outputs = preprocess_data(X_train, y_train, supplementary, is_train=True)
    X_train, y_train, feature_cols, top_cols = train_outputs
    test_outputs = preprocess_data(X_test, supplementary=supplementary, is_train=False, top_cols=top_cols)
    X_test, _, test_feature_cols, _ = test_outputs
except ValueError as e:
    print(f"Preprocessing error: {e}")
    raise

# Align columns
# Keep only the columns present in both frames, in training-column order.
common_cols = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_cols]
X_test = X_test[common_cols]
non_feature_cols = ('ID_DAY', 'ID_TARGET')
feature_cols = [col for col in common_cols if col not in non_feature_cols]
y_train = y_train.loc[X_train.index]
gc.collect()

# Create holdout set (one of 4 GroupKFold folds grouped by ID_DAY, i.e. roughly 25% of training data)
# The first split of a 4-fold GroupKFold keyed on ID_DAY provides the holdout
# rows; the remaining rows become the inner training set.
outer_kf = GroupKFold(n_splits=4)
first_split = next(outer_kf.split(X_train, y_train, groups=X_train['ID_DAY']))
train_idx, holdout_idx = first_split
X_train_inner = X_train.iloc[train_idx]
X_holdout = X_train.iloc[holdout_idx]
y_train_inner = y_train.iloc[train_idx]
y_holdout = y_train.iloc[holdout_idx]
# The full frames are no longer referenced under these names after the split.
del X_train
del y_train
gc.collect()

# Define models with fixed hyperparameter grids
# Each entry holds the base estimator, the list of single-point parameter
# grids to evaluate, and the early-stopping callbacks for that library.
models = {
    'LightGBM': {
        # Regularisation is set through reg_alpha / reg_lambda, the same names
        # the grid below varies. Passing lambda_l1 / lambda_l2 here as well
        # would take precedence over those aliases inside LightGBM and make
        # the grid's regularisation values a no-op. Row subsampling is
        # likewise set only through bagging_fraction, which the grid varies.
        'model': LGBMClassifier(num_leaves=31, min_child_samples=15, reg_alpha=0.3, reg_lambda=0.3,
                                colsample_bytree=0.8, bagging_freq=5, bagging_fraction=0.7,
                                random_state=42, n_jobs=1, verbosity=-1, class_weight='balanced', force_row_wise=True),
        'param_grid': [
            {'learning_rate': [0.01], 'num_leaves': [15], 'min_child_samples': [10], 'bagging_fraction': [0.6], 'reg_alpha': [0.0], 'reg_lambda': [0.0]},
            {'learning_rate': [0.03], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.1], 'reg_lambda': [0.1]},
            {'learning_rate': [0.05], 'num_leaves': [47], 'min_child_samples': [20], 'bagging_fraction': [0.8], 'reg_alpha': [0.2], 'reg_lambda': [0.2]},
            {'learning_rate': [0.1], 'num_leaves': [31], 'min_child_samples': [15], 'bagging_fraction': [0.7], 'reg_alpha': [0.2], 'reg_lambda': [0.1]}
        ],
        'callbacks': [lightgbm.early_stopping(stopping_rounds=15, verbose=False)]
    },
    'XGBoost': {
        # scale_pos_weight is the negative/positive label ratio over the whole
        # inner training set (not per target).
        'model': XGBClassifier(max_depth=4, min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
                               random_state=42, n_jobs=1, eval_metric='logloss', verbosity=0,
                               scale_pos_weight=len(y_train_inner[y_train_inner['SIGN_TARGET'] == 0]) / len(y_train_inner[y_train_inner['SIGN_TARGET'] == 1])),
        'param_grid': [
            {'learning_rate': [0.01], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [100], 'reg_alpha': [0.0], 'reg_lambda': [1.0]},
            {'learning_rate': [0.03], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [0.5], 'reg_lambda': [1.5]},
            {'learning_rate': [0.05], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [200], 'reg_alpha': [1.0], 'reg_lambda': [2.0]},
            {'learning_rate': [0.1], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [150], 'reg_alpha': [0.5], 'reg_lambda': [1.5]}
        ],
        'callbacks': [xgboost.callback.EarlyStopping(rounds=15, metric_name='logloss', save_best=True)]
    }
}

# Function to grid-search, refit and predict a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks, batch_size=500):
    """Grid-search, refit and predict one ``ID_TARGET`` using RFE-selected features.

    For every combination produced by ``ParameterGrid(param_grid)`` the rows of
    ``target`` are split with ``kf`` (grouped by ``ID_DAY``).  On each fold RFE
    keeps 10 features, the model is fitted on the whole training part of the
    fold and scored on the validation part with ``weighted_accuracy``.  The
    best combination is then refitted on all rows of the target and used to
    predict the target's test rows.

    Args:
        target: Value of ``ID_TARGET`` to model.
        X_train: Training frame containing ``ID_TARGET``, ``ID_DAY`` and every
            column named in ``feature_cols``.
        y_train: Frame with ``SIGN_TARGET`` and ``RET_TARGET`` columns, indexed
            like ``X_train``.
        X_test: Test frame with the same columns as ``X_train``.
        feature_cols: Candidate feature column names.
        model: Estimator used as template; its parameters are changed in place
            through ``set_params``.
        model_name: ``'LightGBM'`` or ``'XGBoost'``; selects the fit signature.
        kf: Splitter exposing ``split(X, y, groups)``.
        param_grid: Grid specification accepted by ``ParameterGrid``.
        callbacks: Passed to LightGBM's ``fit``.  For XGBoost it is only used
            as a flag that enables the ``eval_set`` argument.
        batch_size: Accepted for backward compatibility and ignored.  The
            previous implementation called ``fit`` once per batch, and every
            call retrained from scratch, so only the last batch was learned.

    Returns:
        Tuple ``(str(target), best_params, best_score, param_results, test_ids,
        preds, top_features)``.  When the target has fewer than 100 training
        rows, no test rows, or no parameter set scores above 0, the tuple is
        ``(str(target), {}, 0, [], test_ids, zeros, [])``.
    """
    # Imported locally so the function works even if the notebook's import cell
    # does not provide RFE.
    from sklearn.feature_selection import RFE

    def _fit(estimator, X_fit, y_fit, w_fit, X_eval, y_eval):
        # One library-specific fit call; the eval set feeds LightGBM's callbacks.
        if model_name == 'LightGBM' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
        else:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit)

    low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]

    train_mask = X_train['ID_TARGET'] == target
    X_target = X_train[train_mask][feature_cols]
    y_target = y_train.loc[train_mask, 'SIGN_TARGET']
    returns = y_train.loc[train_mask, 'RET_TARGET']
    weight_scale = 2.0 if target in low_performing_targets else 1.0
    if len(X_target) < 500:
        weight_scale *= 1.5
    weights = returns.abs() * weight_scale
    groups = X_train.loc[train_mask, 'ID_DAY']
    X_test_target = X_test[X_test['ID_TARGET'] == target][feature_cols]
    test_ids = X_test_target.index

    if len(X_target) < 100 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    param_results = []
    best_params = None
    best_score = 0
    best_model = None
    best_features = []

    # Grid search with RFE; one fit per fold on the full training part
    for params in ParameterGrid(param_grid):
        print(f"Testing params for {model_name} ID_TARGET {target}: {params}", flush=True)
        try:
            model.set_params(**params)
        except ValueError as e:
            print(f"Invalid params for {model_name} ID_TARGET {target}: {e}")
            continue
        rfe = RFE(estimator=model, n_features_to_select=10)
        fold_accuracies = []
        selected_features = []
        for train_idx, val_idx in kf.split(X_target, y_target, groups):
            X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
            y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
            w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
            rfe.fit(X_tr, y_tr)
            # The feature set of the last fold is the one reported for this parameter set.
            selected_features = X_tr.columns[rfe.support_].tolist()
            X_tr_rfe, X_val_rfe = X_tr[selected_features], X_val[selected_features]
            _fit(model, X_tr_rfe, y_tr, w_tr, X_val_rfe, y_val)
            y_pred = model.predict(X_val_rfe)
            # Score against the signed returns: weighted_accuracy derives both the
            # true sign and the magnitude from y_true, so 0/1 labels would give
            # every negative-class row a weight of zero.
            acc = float(weighted_accuracy(returns.iloc[val_idx], y_pred, w_val))
            fold_accuracies.append(acc)
        mean_acc = float(np.mean(fold_accuracies))
        print(f"{model_name} | ID_TARGET: {target} | Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
        param_results.append({
            'params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies,
            'selected_features': selected_features
        })
        if mean_acc > best_score:
            best_score = mean_acc
            best_params = params
            best_model = type(model)(**model.get_params())
            best_features = selected_features
        del rfe
        gc.collect()

    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    # Final refit on every row of the target (a single fit call).
    best_model.set_params(**best_params)
    X_target_rfe = X_target[best_features]
    X_test_target_rfe = X_test_target[best_features]
    _fit(best_model, X_target_rfe, y_target, weights, X_target_rfe, y_target)
    preds = best_model.predict_proba(X_test_target_rfe)[:, 1].astype(np.float32)
    importance = best_model.feature_importances_
    feature_importance = dict(zip(best_features, importance))
    top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
    del X_target, y_target, weights, X_test_target, X_target_rfe, X_test_target_rfe
    gc.collect()
    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training
kf = GroupKFold(n_splits=5)
results = {}
predictions = {}
param_log = {}
top_features_all = {}
param_summary = []
low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]


def _fit_estimator(estimator, model_name, callbacks, X_fit, y_fit, w_fit, X_eval, y_eval):
    """Fit ``estimator`` once using the library-specific fit call of this notebook."""
    if model_name == 'LightGBM' and callbacks:
        estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], eval_metric='logloss', callbacks=callbacks)
    elif model_name == 'XGBoost' and callbacks:
        estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
    else:
        estimator.fit(X_fit, y_fit, sample_weight=w_fit)


def _target_training_data(target, feature_names):
    """Return ``(X, y, returns, weights, groups, X_test)`` restricted to ``target``."""
    train_mask = X_train_inner['ID_TARGET'] == target
    X_target = X_train_inner[train_mask][feature_names]
    y_target = y_train_inner.loc[train_mask, 'SIGN_TARGET']
    returns = y_train_inner.loc[train_mask, 'RET_TARGET']
    weight_scale = 2.0 if target in low_performing_targets else 1.0
    if len(X_target) < 500:
        weight_scale *= 1.5
    weights = returns.abs() * weight_scale
    groups = X_train_inner.loc[train_mask, 'ID_DAY']
    X_test_target = X_test[X_test['ID_TARGET'] == target][feature_names]
    return X_target, y_target, returns, weights, groups, X_test_target


for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}
    model = model_info['model']
    callbacks = model_info['callbacks']
    train_targets = X_train_inner['ID_TARGET'].unique()

    # First-pass training
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train_inner, y_train_inner, X_test, feature_cols, model, model_name, kf,
                             model_info['param_grid'], callbacks, batch_size=500)
        for target in tqdm(train_targets, desc=f"{model_name} Targets")
    )

    # Collect results
    accuracies = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))
        # Report the folds of the winning parameter set, not of the last grid entry.
        best_folds = next((entry['fold_accuracies'] for entry in param_results if entry['params'] == params), [])
        param_summary.append({
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_folds
        })
    scored = [acc for acc in accuracies if acc > 0]
    results[model_name] = float(np.mean(scored)) if scored else float('nan')
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)

    # Second-pass training with top features
    print(f"\nSecond-pass {model_name}...", flush=True)
    for target in tqdm(train_targets, desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        X_target, y_target, _, weights, _, X_test_target = _target_training_data(target, top_features)

        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue

        # The logged parameters are updated in place so that the log reflects
        # the settings used for the refit.
        best_params = param_log[model_name][target_str]['best_params']
        best_params['n_estimators'] = 200
        model.set_params(**best_params)
        # Single fit on all rows: fitting batch by batch retrains from scratch
        # on every call and keeps only the last batch.
        _fit_estimator(model, model_name, callbacks, X_target, y_target, weights, X_target, y_target)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(X_test_target.index, preds))
        del X_target, y_target, weights, X_test_target
        gc.collect()

    # Third-pass training for low-performing targets
    print(f"\nThird-pass {model_name}...", flush=True)
    # Keys of param_log are str() of the values found in ID_TARGET (e.g. '139.0'
    # for a float column), so resolve them numerically instead of via str(int).
    key_by_value = {float(key): key for key in param_log[model_name]}
    for target in tqdm(low_performing_targets, desc=f"{model_name} Low-Performing Targets"):
        target_str = key_by_value.get(float(target))
        if target_str is None or param_log[model_name][target_str]['mean_accuracy'] >= 0.75:
            continue
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        X_target, y_target, returns, weights, groups, X_test_target = _target_training_data(target, top_features)

        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue

        # ParameterGrid yields scalar values, so the best parameters are not subscripted.
        best_params = param_log[model_name][target_str]['best_params']
        base_lr = best_params['learning_rate']
        base_alpha = best_params['reg_alpha']
        base_lambda = best_params['reg_lambda']
        fine_tune_grid = [
            {'learning_rate': [base_lr * 0.5], 'n_estimators': [300], 'reg_alpha': [base_alpha + 0.2], 'reg_lambda': [base_lambda + 0.2]},
            {'learning_rate': [base_lr * 1.0], 'n_estimators': [300], 'reg_alpha': [base_alpha], 'reg_lambda': [base_lambda]},
            {'learning_rate': [base_lr * 1.2], 'n_estimators': [350], 'reg_alpha': [base_alpha + 0.15], 'reg_lambda': [base_lambda + 0.15]},
            {'learning_rate': [base_lr * 0.7], 'n_estimators': [350], 'reg_alpha': [base_alpha + 0.1], 'reg_lambda': [base_lambda + 0.1]}
        ]
        best_score = param_log[model_name][target_str]['mean_accuracy']
        best_fine_params = best_params

        for params in ParameterGrid(fine_tune_grid):
            print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
            # Keep the other tuned parameters and override only the fine-tuned ones.
            candidate = {**best_params, **params}
            model.set_params(**candidate)
            fold_accuracies = []
            for train_idx, val_idx in kf.split(X_target, y_target, groups):
                X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
                y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
                w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
                _fit_estimator(model, model_name, callbacks, X_tr, y_tr, w_tr, X_val, y_val)
                y_pred = model.predict(X_val)
                # Signed returns carry both the true sign and the weight magnitude.
                fold_accuracies.append(float(weighted_accuracy(returns.iloc[val_idx], y_pred, w_val)))
            mean_acc = float(np.mean(fold_accuracies))
            print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
            if mean_acc > best_score:
                best_score = mean_acc
                best_fine_params = candidate
                param_log[model_name][target_str]['best_params'] = best_fine_params
                param_log[model_name][target_str]['mean_accuracy'] = best_score
                param_summary.append({
                    'model': model_name,
                    'ID_TARGET': target_str,
                    'best_params': best_fine_params,
                    'mean_accuracy': best_score,
                    'fold_accuracies': fold_accuracies
                })
            gc.collect()

        model.set_params(**best_fine_params)
        _fit_estimator(model, model_name, callbacks, X_target, y_target, weights, X_target, y_target)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(X_test_target.index, preds))
        del X_target, y_target, returns, weights, groups, X_test_target
        gc.collect()

# Validate on holdout set: refit each model on the target's inner-training rows
# and predict the target's holdout rows.
holdout_predictions = {}
for model_name, model_info in models.items():
    model = model_info['model']
    for target in X_holdout['ID_TARGET'].unique():
        target_str = str(target)
        # .get(): a holdout target without a trained model is skipped instead of raising KeyError.
        top_features = [f[0] for f in top_features_all[model_name].get(target_str, [])]
        if not top_features:
            continue
        X_target = X_holdout[X_holdout['ID_TARGET'] == target][top_features]
        if len(X_target) < 100:
            continue

        # Training rows come from the inner training split; the holdout IDs
        # themselves must not be used to select them.
        train_mask = X_train_inner['ID_TARGET'] == target
        X_fit = X_train_inner[train_mask][top_features]
        if len(X_fit) == 0:
            continue
        y_fit = y_train_inner.loc[train_mask, 'SIGN_TARGET']
        weight_scale = 2.0 if target in low_performing_targets else 1.0
        if len(X_fit) < 500:
            weight_scale *= 1.5
        w_fit = y_train_inner.loc[train_mask, 'RET_TARGET'].abs() * weight_scale

        best_params = param_log[model_name][target_str]['best_params']
        model.set_params(**best_params)
        model.fit(X_fit, y_fit, sample_weight=w_fit)
        preds = model.predict_proba(X_target)[:, 1].astype(np.float32)
        holdout_predictions.setdefault(target_str, {})[model_name] = dict(zip(X_target.index, preds))
        del X_target, X_fit, y_fit, w_fit
        gc.collect()

# Compute holdout accuracy of the accuracy-weighted LightGBM/XGBoost ensemble
holdout_final = {}
for target in X_holdout['ID_TARGET'].unique():
    target_str = str(target)
    if target_str not in holdout_predictions:
        continue
    lgbm_lookup = holdout_predictions[target_str].get('LightGBM', {})
    xgb_lookup = holdout_predictions[target_str].get('XGBoost', {})
    ensemble_weights = {
        'LightGBM': param_log['LightGBM'].get(target_str, {}).get('mean_accuracy', 0.5),
        'XGBoost': param_log['XGBoost'].get(target_str, {}).get('mean_accuracy', 0.5)
    }
    total = ensemble_weights['LightGBM'] + ensemble_weights['XGBoost']
    if total == 0:
        ensemble_weights = {'LightGBM': 0.5, 'XGBoost': 0.5}
    else:
        ensemble_weights = {k: v / total for k, v in ensemble_weights.items()}
    # Select the target's rows once instead of masking the whole frame per day.
    target_rows = X_holdout[X_holdout['ID_TARGET'] == target]
    for day in target_rows['ID_DAY'].unique():
        day_test_ids = target_rows[target_rows['ID_DAY'] == day].index
        lgbm_preds = np.array([lgbm_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        xgb_preds = np.array([xgb_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
        smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1] if len(ensemble_preds) > 0 else 0.5
        for idx in day_test_ids:
            holdout_final[idx] = 1 if smoothed_preds >= 0.5 else -1
    del target_rows
    gc.collect()
holdout_df = pd.DataFrame.from_dict(holdout_final, orient='index', columns=['PRED']).reset_index()
holdout_df = holdout_df.merge(y_holdout[['SIGN_TARGET', 'RET_TARGET']], left_on='index', right_index=True)
# The metric takes the true sign and the weight magnitude from y_true, so it is
# given the signed returns; 0/1 labels would give negative rows zero weight.
holdout_acc = weighted_accuracy(holdout_df['RET_TARGET'], holdout_df['PRED'].to_numpy(), holdout_df['RET_TARGET'].abs())
print(f"Holdout Weighted Accuracy: {holdout_acc:.4f}", flush=True)
del holdout_predictions, holdout_df, X_holdout, y_holdout
gc.collect()

# Persist the hyperparameter log as indented JSON and offer it for download
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
with open(param_log_path, 'w') as f:
    f.write(json.dumps(param_log, indent=4))
print(f"Saved hyperparameter log to: {param_log_path}", flush=True)
files.download(param_log_path)

# Ensemble predictions for test set
final_predictions = {}
# (target, day) groups for which no model produced a prediction.
missing_day_targets = []
for target in X_test['ID_TARGET'].unique():
    target_str = str(target)
    # .get(): a test target that was never trained has no entry in `predictions`.
    target_predictions = predictions.get(target_str, {})
    lgbm_lookup = target_predictions.get('LightGBM', {})
    xgb_lookup = target_predictions.get('XGBoost', {})
    ensemble_weights = {
        'LightGBM': param_log['LightGBM'].get(target_str, {}).get('mean_accuracy', 0.5),
        'XGBoost': param_log['XGBoost'].get(target_str, {}).get('mean_accuracy', 0.5)
    }
    total = ensemble_weights['LightGBM'] + ensemble_weights['XGBoost']
    if total == 0:
        ensemble_weights = {'LightGBM': 0.5, 'XGBoost': 0.5}
    else:
        ensemble_weights = {k: v / total for k, v in ensemble_weights.items()}
    # Group the target's rows by day once; dropna=False keeps rows whose day is
    # missing, so they are predicted like any other group.
    target_rows = X_test[X_test['ID_TARGET'] == target]
    for day, day_rows in target_rows.groupby('ID_DAY', sort=False, dropna=False):
        day_test_ids = day_rows.index
        if not lgbm_lookup and not xgb_lookup:
            print(f"Warning: No model predictions for ID_TARGET {target} on ID_DAY {day}. Using default prediction.")
            missing_day_targets.append((target, day))
            # Only this group falls back to the default; other days are untouched.
            for idx in day_test_ids:
                final_predictions[idx] = -1
            continue
        lgbm_preds = np.array([lgbm_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        xgb_preds = np.array([xgb_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
        smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1] if len(ensemble_preds) > 0 else 0.5
        for idx in day_test_ids:
            final_predictions[idx] = 1 if smoothed_preds >= 0.5 else -1
    del target_rows
    gc.collect()

if missing_day_targets:
    print(f"Missing day-target pairs: {missing_day_targets}")

# Build the submission frame (ID, RET_TARGET), back-fill IDs that received no
# prediction with -1, and write it to disk.
output_df = (
    pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET'])
    .rename_axis('ID')
    .reset_index()
    .sort_values('ID')
)
expected_ids = set(X_test.index)
submission_ids = set(output_df['ID'])
if expected_ids != submission_ids:
    missing_ids = expected_ids.difference(submission_ids)
    print(f"Warning: Submission missing {len(missing_ids)} IDs: {missing_ids}")
    missing_df = pd.DataFrame({'ID': list(missing_ids), 'RET_TARGET': -1})
    combined = pd.concat([output_df, missing_df], ignore_index=True)
    output_df = combined.sort_values('ID')
output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Re-read the written CSV and print basic sanity figures
test_csv = pd.read_csv(output_path)
print(f"Output CSV shape: {test_csv.shape}", flush=True)
id_column = test_csv['ID']
print(f"Output CSV ID range: {id_column.min()} to {id_column.max()}", flush=True)
distinct_predictions = test_csv['RET_TARGET'].unique()
print(f"Unique RET_TARGET values: {distinct_predictions}", flush=True)
missing_cells = test_csv.isna().sum().sum()
print(f"Missing values: {missing_cells}", flush=True)

# Offer the submission file for download
files.download(output_path)

# Summarise per-model CV scores and the holdout score
print("\nValidation Results:", flush=True)
for model_name, acc in results.items():
    print(f"{model_name}: {acc:.4f}", flush=True)
print(f"Holdout Weighted Accuracy: {holdout_acc:.4f}", flush=True)
del X_test, test_csv, output_df
gc.collect()

XGBoost version: 2.1.4
Selected top return columns: ['RET_262', 'RET_88', 'RET_172', 'RET_259', 'RET_150', 'RET_118', 'RET_216', 'RET_268']

Training LightGBM...



LightGBM Targets:   0%|          | 0/100 [00:00<?, ?it/s][A

KeyboardInterrupt: 

new mth -- 73.7%

In [None]:
import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid

# Check xgboost version for compatibility
print(f"XGBoost version: {xgboost.__version__}")
if int(xgboost.__version__.split('.')[0]) < 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Set up data directory
DATA_DIR = '/content'

# Load datasets (all columns, including the ID index column, are read as float32)
try:
    X_train = pd.read_csv(os.path.join(DATA_DIR, 'X_train_itDkypA.csv'), index_col='ID', dtype=np.float32)
    y_train = pd.read_csv(os.path.join(DATA_DIR, 'y_train_3LeeT2g.csv'), index_col='ID', dtype=np.float32)
    X_test = pd.read_csv(os.path.join(DATA_DIR, 'X_test_Beg4ey3.csv'), index_col='ID', dtype=np.float32)
    supplementary = pd.read_csv(os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv'))
except FileNotFoundError as e:
    # Chain explicitly so the original error (with the missing path) stays the cause.
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.") from e

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights=None):
    """Return the weighted share of correctly predicted signs.

    Args:
        y_true: Signed returns or 0/1 class labels.  A value greater than 0
            counts as the positive class, anything else as the negative class.
        y_pred: Predicted labels; ``1`` means positive, any other value negative.
        weights: Non-negative weight per sample.  When ``None`` the absolute
            value of ``y_true`` is used.

    Returns:
        ``sum(weights * correct) / sum(weights)`` as a Python float, or ``0.0``
        when the weights sum to zero.
    """
    truth = np.asarray(y_true, dtype=np.float64)
    predicted_sign = np.where(np.asarray(y_pred) == 1, 1, -1)
    # `> 0` instead of np.sign so that a 0 label is the negative class rather
    # than a value no prediction can ever match.
    true_sign = np.where(truth > 0, 1, -1)
    if weights is None:
        sample_weights = np.abs(truth)
    else:
        sample_weights = np.asarray(weights, dtype=np.float64)
    total = sample_weights.sum()
    if total == 0:
        return 0.0
    return float(np.sum(sample_weights * (predicted_sign == true_sign)) / total)

# Preprocess data with enhanced feature selection
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Impute, engineer and standardise features for one data frame.

    Steps, in order: choose the return columns used for derived features;
    fill their missing values (sector median per row when ``supplementary``
    gives a sector, column median otherwise); add sector averages and one-hot
    columns for the target's sector; add row mean/std, per-day percentile
    ranks, pairwise products of the most correlated columns (training only)
    and up to two PCA components; standardise every feature column.

    Args:
        X: Frame with ``ID_DAY``, ``ID_TARGET`` and ``RET_*`` columns.  The
            imputation writes into this frame in place.
        y: Frame with a ``RET_TARGET`` column; required when ``is_train`` is
            true.  A ``SIGN_TARGET`` column is added to it in place.
        supplementary: Optional frame with ``ID_asset`` and ``CLASS_LEVEL_1``.
        is_train: Whether ``X`` is the training frame.
        top_cols: Return columns to use for derived features.  When ``None``
            on a training frame they are selected here (at most 15).

    Returns:
        ``(X, y, feature_cols, top_cols)`` for training frames and
        ``(X, None, feature_cols, top_cols)`` otherwise.

    Raises:
        ValueError: If ``is_train`` is true and ``y`` is ``None``, or if no
            return column remains after applying ``top_cols``.

    Note:
        The PCA and the scaler are fitted on whichever frame is passed in, so
        training and test frames are transformed by separately fitted objects.
    """
    if is_train and y is None:
        raise ValueError("y is required when is_train=True.")

    ret_cols = [col for col in X.columns if col.startswith('RET_')]
    if is_train and top_cols is None:
        if len(ret_cols) > 15:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            valid_cols = variances[variances > 1e-5].index
            if len(valid_cols) < 15:
                top_cols = valid_cols.tolist()
            else:
                top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(15).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")
    if top_cols is not None:
        # Applied to training frames as well: otherwise MEAN_RET, STD_RET and
        # PCA_* would be built from all return columns in training but only
        # from `top_cols` at test time.
        ret_cols = [col for col in ret_cols if col in top_cols]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    sector_medians = {}
    if supplementary is not None:
        sector_map = supplementary.set_index('ID_asset')['CLASS_LEVEL_1']
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            if pd.notna(sector):
                sector_assets = supplementary[supplementary['CLASS_LEVEL_1'] == sector]['ID_asset']
                sector_cols = [f'RET_{a}' for a in sector_assets if f'RET_{a}' in X.columns]
                if sector_cols:
                    sector_medians[col] = X[sector_cols].median(axis=1)
            if col in sector_medians:
                X[col] = X[col].fillna(sector_medians[col])
            else:
                X[col] = X[col].fillna(X[col].median())

    new_cols = {}
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        sector_groups = supplementary.groupby(level)['ID_asset'].apply(list).to_dict()
        for sector, assets in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in assets if f'RET_{asset}' in ret_cols]
            if asset_cols:
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0)
        # One sector per asset, so the left merge cannot duplicate rows.
        target_sector = (supplementary[['ID_asset', level]]
                         .drop_duplicates('ID_asset')
                         .rename(columns={'ID_asset': 'ID_TARGET', level: f'TARGET_{level}'}))
        # A column-on-column merge discards the index; carry it through as a
        # column so the row IDs (and the alignment of the Series collected in
        # `new_cols`) survive.
        index_name = X.index.name
        index_col = index_name if index_name is not None else 'index'
        X = X.reset_index().merge(target_sector, on='ID_TARGET', how='left').set_index(index_col)
        X.index.name = index_name
        X = pd.concat([X, pd.get_dummies(X[f'TARGET_{level}'], prefix=f'TARGET_{level}', dummy_na=True)], axis=1)
        X = X.drop(f'TARGET_{level}', axis=1)

    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    new_cols.update({f'CS_RANK_{col}': X.groupby('ID_DAY')[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0) for col in ret_cols})

    if is_train:
        # Products of the five most correlated column pairs (training frames only).
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:5]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    if ret_cols:
        pca = PCA(n_components=min(2, len(ret_cols)), random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        for i in range(pca_features.shape[1]):
            new_cols[f'PCA_{i}'] = pca_features[:, i]

    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)

    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))

    if is_train:
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Run preprocessing on the training frame first, then reuse its selected
# return columns for the test frame.
try:
    train_outputs = preprocess_data(X_train, y=y_train, supplementary=supplementary, is_train=True)
    X_train, y_train, feature_cols, top_cols = train_outputs
    test_outputs = preprocess_data(X_test, supplementary=supplementary, is_train=False, top_cols=top_cols)
    X_test, _, test_feature_cols, _ = test_outputs
except ValueError as e:
    print(f"Preprocessing error: {e}")
    raise

# Keep only the columns present in both frames
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]
non_feature_cols = ('ID_DAY', 'ID_TARGET')
feature_cols = [col for col in common_cols if col not in non_feature_cols]

# Reorder/restrict the labels to the rows of the processed training frame
y_train = y_train.loc[X_train.index]

# Define models with updated LightGBM hyperparameter grid
# Class counts for XGBoost's scale_pos_weight (negatives / positives); falls
# back to 1.0 when the training labels contain no positive row.
_n_negative = int((y_train['SIGN_TARGET'] == 0).sum())
_n_positive = int((y_train['SIGN_TARGET'] == 1).sum())

models = {
    'LightGBM': {
        # Regularisation defaults use reg_alpha/reg_lambda, the same names as
        # the grid below.  Passing lambda_l1/lambda_l2 here would make LightGBM
        # ignore the reg_alpha/reg_lambda aliases, so the grid values for them
        # would never take effect.
        'model': LGBMClassifier(num_leaves=31, min_child_samples=15, reg_alpha=0.3, reg_lambda=0.3,
                                subsample=0.7, colsample_bytree=0.8, bagging_freq=5, bagging_fraction=0.7,
                                random_state=42, n_jobs=1, verbosity=-1, class_weight='balanced', force_row_wise=True),
        'param_grid': [
            {'learning_rate': [0.01], 'num_leaves': [10], 'min_child_samples': [25], 'bagging_fraction': [0.5], 'reg_alpha': [0.05], 'reg_lambda': [0.05]},
            {'learning_rate': [0.02], 'num_leaves': [15], 'min_child_samples': [20], 'bagging_fraction': [0.6], 'reg_alpha': [0.1], 'reg_lambda': [0.1]},
            {'learning_rate': [0.05], 'num_leaves': [31], 'min_child_samples': [10], 'bagging_fraction': [0.7], 'reg_alpha': [0.2], 'reg_lambda': [0.2]},
            {'learning_rate': [0.07], 'num_leaves': [47], 'min_child_samples': [15], 'bagging_fraction': [0.8], 'reg_alpha': [0.3], 'reg_lambda': [0.3]},
            {'learning_rate': [0.03], 'num_leaves': [25], 'min_child_samples': [15], 'bagging_fraction': [0.65], 'reg_alpha': [0.15], 'reg_lambda': [0.15]}
        ],
        'callbacks': [lightgbm.early_stopping(stopping_rounds=10, verbose=False)]
    },
    'XGBoost': {
        'model': XGBClassifier(max_depth=4, min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
                               random_state=42, n_jobs=1, eval_metric='logloss', verbosity=0,
                               scale_pos_weight=(_n_negative / _n_positive) if _n_positive else 1.0),
        'param_grid': [
            {'learning_rate': [0.03], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [150], 'reg_alpha': [0.1], 'reg_lambda': [1.0]},
            {'learning_rate': [0.05], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [200], 'reg_alpha': [0.2], 'reg_lambda': [1.5]},
            {'learning_rate': [0.07], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [150], 'reg_alpha': [0.3], 'reg_lambda': [1.0]}
        ],
        'callbacks': [xgboost.callback.EarlyStopping(rounds=10, metric_name='logloss', save_best=True)]
    }
}

# Function to train and predict for a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks):
    """Tune, refit and predict one model for a single ``ID_TARGET``.

    Every combination produced by ``ParameterGrid(param_grid)`` is scored with
    the splitter ``kf`` (grouped by ``ID_DAY``). The best-scoring combination
    is refit on all training rows of the target and used to predict the
    positive-class probability of the target's test rows.

    Args:
        target: ``ID_TARGET`` value selecting the rows to model.
        X_train: Training frame containing ``ID_TARGET``, ``ID_DAY`` and
            ``feature_cols``.
        y_train: Frame with ``SIGN_TARGET`` and ``RET_TARGET`` columns,
            selected with a boolean mask built from ``X_train``.
        X_test: Test frame containing ``ID_TARGET`` and ``feature_cols``.
        feature_cols: Names of the model input columns.
        model: Estimator to tune; it is reconfigured through ``set_params``.
        model_name: ``'LightGBM'`` or ``'XGBoost'``; selects the fit arguments.
        kf: Splitter exposing ``split(X, y, groups)``.
        param_grid: Grid specification accepted by ``ParameterGrid``.
        callbacks: For LightGBM, forwarded to ``fit``. For XGBoost a non-empty
            value only switches on the ``eval_set`` argument; the callbacks
            themselves are not passed to ``fit``.

    Returns:
        Tuple ``(str(target), best_params, best_score, param_results,
        test_ids, preds, top_features)``. When the target has fewer than 100
        training rows, no test rows, or no parameter set scored above 0, the
        tuple is ``(str(target), {}, 0, [], test_ids, zeros, [])``.
    """
    # Build each row selector once instead of re-evaluating the comparison.
    train_mask = X_train['ID_TARGET'] == target
    test_mask = X_test['ID_TARGET'] == target

    X_target = X_train.loc[train_mask, feature_cols]
    y_target = y_train.loc[train_mask, 'SIGN_TARGET']
    returns = y_train.loc[train_mask, 'RET_TARGET']
    low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]
    weight_scale = 1.5 if target in low_performing_targets else 1.0
    if len(X_target) < 500:
        weight_scale *= 1.5
    weights = returns.abs() * weight_scale
    groups = X_train.loc[train_mask, 'ID_DAY']
    X_test_target = X_test.loc[test_mask, feature_cols]
    test_ids = X_test_target.index

    if len(X_target) < 100 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    def _fit(estimator, X_fit, y_fit, w_fit, X_eval, y_eval):
        # Single place for the per-library fit arguments used by CV and the final refit.
        if model_name == 'LightGBM' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
        else:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit)

    param_results = []
    best_params = None
    best_score = 0
    best_model = None

    for params in ParameterGrid(param_grid):
        print(f"Testing params for {model_name} ID_TARGET {target}: {params}", flush=True)
        try:
            model.set_params(**params)
        except ValueError as e:
            print(f"Invalid params for {model_name} ID_TARGET {target}: {e}")
            continue
        fold_accuracies = []
        for train_idx, val_idx in kf.split(X_target, y_target, groups):
            X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
            y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
            w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
            _fit(model, X_tr, y_tr, w_tr, X_val, y_val)
            y_pred = model.predict(X_val)
            # Score against the signed returns, not the 0/1 labels: the metric
            # takes the sign and magnitude of its first argument, so with 0/1
            # labels every negative row would carry zero weight and the score
            # would reduce to recall on the positive class.
            acc = weighted_accuracy(returns.iloc[val_idx], y_pred, w_val)
            fold_accuracies.append(acc)
        mean_acc = np.mean(fold_accuracies)
        print(f"{model_name} | ID_TARGET: {target} | Params: {params} | Mean Accuracy: {mean_acc:.4f} | Fold Accuracies: {fold_accuracies}", flush=True)
        param_results.append({
            'params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies
        })
        if mean_acc > best_score:
            best_score = mean_acc
            best_params = params
            # Fresh, unfitted estimator carrying the winning configuration.
            best_model = type(model)(**model.get_params())

    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    best_model.set_params(**best_params)
    # NOTE: the final refit monitors the training rows themselves, so the
    # eval_set here gives no out-of-sample stopping signal.
    _fit(best_model, X_target, y_target, weights, X_target, y_target)
    preds = best_model.predict_proba(X_test_target)[:, 1].astype(np.float32)
    importance = best_model.feature_importances_
    feature_importance = dict(zip(feature_cols, importance))
    top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:30]
    gc.collect()
    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training
# First pass: tune every model on every target in parallel and collect the
# per-target predictions, best parameters and feature rankings.
kf = GroupKFold(n_splits=5)
results = {}
predictions = {}
ensemble_weights = {'LightGBM': 0.7, 'XGBoost': 0.3}
param_log = {}
top_features_all = {}
param_summary = []
low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]

for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train, y_train, X_test, feature_cols, model_info['model'], model_name, kf,
                             model_info['param_grid'], model_info['callbacks'])
        for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets")
    )
    accuracies = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))
        # Report the fold scores of the winning parameter set. Taking the last
        # grid entry instead would pair the best mean with unrelated folds.
        best_entry = next((entry for entry in param_results if entry['params'] == params), None)
        param_summary.append({
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_entry['fold_accuracies'] if best_entry is not None else []
        })
    # Skipped targets report a score of 0 and are left out of the average.
    scored = [acc for acc in accuracies if acc > 0]
    results[model_name] = np.mean(scored) if scored else 0.0
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)
    print(f"\n{model_name} Parameter Summary:", flush=True)
    for summary in param_summary:
        if summary['model'] == model_name:
            print(f"ID_TARGET: {summary['ID_TARGET']} | Best Params: {summary['best_params']} | Mean Accuracy: {summary['mean_accuracy']:.4f} | Fold Accuracies: {summary['fold_accuracies']}", flush=True)

# Second-pass training with top 30 features
# Refit each model per target on its 30 most important first-pass features and
# replace that target's stored test predictions with the refit model's output.
print("\nPerforming second-pass training with top 30 features...", flush=True)
for model_name, model_info in models.items():
    print(f"\nSecond-pass {model_name}...", flush=True)
    model = model_info['model']
    callbacks = model_info['callbacks']
    for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        train_mask = X_train['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        weights = y_train.loc[train_mask, 'RET_TARGET'].abs()
        weight_scale = 1.5 if target in low_performing_targets else 1.0
        if len(X_target) < 500:
            weight_scale *= 1.5
        weights = weights * weight_scale
        X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
        test_ids = X_test_target.index
        # Work on a copy: writing n_estimators into the logged dict would
        # silently rewrite the tuned hyperparameters stored in param_log.
        best_params = dict(param_log[model_name][target_str]['best_params'])
        best_params['n_estimators'] = 200
        model.set_params(**best_params)
        # NOTE: the eval_set below is the training data itself.
        if model_name == 'LightGBM' and callbacks:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Third-pass training for low-performing targets
# For each listed target whose logged score is below 0.75, try three small
# variations of the logged best parameters on the top-30 feature subset and
# keep a variation only if it beats the incumbent under the same CV splits.
def _third_pass_fit(estimator, name, callbacks, X_fit, y_fit, w_fit, X_eval, y_eval):
    """Fit ``estimator`` with the same per-library arguments as the earlier passes."""
    if name == 'LightGBM' and callbacks:
        estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], eval_metric='logloss', callbacks=callbacks)
    elif name == 'XGBoost' and callbacks:
        estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
    else:
        estimator.fit(X_fit, y_fit, sample_weight=w_fit)


def _third_pass_cv_scores(estimator, name, callbacks, splitter, X, y, returns, weights, groups):
    """Return the per-fold weighted accuracies of ``estimator`` under grouped CV."""
    scores = []
    for train_idx, val_idx in splitter.split(X, y, groups):
        _third_pass_fit(estimator, name, callbacks,
                        X.iloc[train_idx], y.iloc[train_idx], weights.iloc[train_idx],
                        X.iloc[val_idx], y.iloc[val_idx])
        y_pred = estimator.predict(X.iloc[val_idx])
        # Signed returns are the ground truth; 0/1 labels would give every
        # negative row a sign of 0 and a weight of 0 inside the metric.
        scores.append(weighted_accuracy(returns.iloc[val_idx], y_pred, weights.iloc[val_idx]))
    return scores


print("\nPerforming third-pass training for low-performing targets...", flush=True)
for model_name, model_info in models.items():
    print(f"\nThird-pass {model_name}...", flush=True)
    model = model_info['model']
    callbacks = model_info['callbacks']
    # param_log is keyed by str() of the ID_TARGET values read from the data
    # (e.g. '139.0'), whereas the list below holds ints, so str(139) never
    # matched and every target was skipped. Resolve keys numerically instead.
    key_by_target = {float(key): key for key in param_log[model_name]}
    for target in tqdm(low_performing_targets, desc=f"{model_name} Low-Performing Targets"):
        target_str = key_by_target.get(float(target))
        if target_str is None or param_log[model_name][target_str]['mean_accuracy'] >= 0.75:
            continue
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        train_mask = X_train['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        returns = y_train.loc[train_mask, 'RET_TARGET']
        weights = returns.abs() * 1.5
        if len(X_target) < 500:
            weights = weights * 1.5
        # GroupKFold needs the day of every training row of this target.
        groups = X_train.loc[train_mask, 'ID_DAY']
        X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
        test_ids = X_test_target.index

        # Copy so the logged dict is only replaced when a candidate wins.
        base_params = dict(param_log[model_name][target_str]['best_params'])
        # The estimator is shared across targets; pin the number of rounds so a
        # value left over from a previous fit cannot leak into the baseline.
        base_params.setdefault('n_estimators', 200)
        # ParameterGrid requires every value to be a sequence, and the logged
        # parameters are scalars, so wrap each candidate value in a list.
        fine_tune_grid = [
            {'learning_rate': [base_params['learning_rate'] * 0.8], 'n_estimators': [200],
             'reg_alpha': [base_params['reg_alpha'] + 0.1], 'reg_lambda': [base_params['reg_lambda'] + 0.1]},
            {'learning_rate': [base_params['learning_rate'] * 1.0], 'n_estimators': [250],
             'reg_alpha': [base_params['reg_alpha']], 'reg_lambda': [base_params['reg_lambda']]},
            {'learning_rate': [base_params['learning_rate'] * 1.2], 'n_estimators': [200],
             'reg_alpha': [base_params['reg_alpha'] + 0.1], 'reg_lambda': [base_params['reg_lambda'] + 0.1]}
        ]

        # Re-score the incumbent on this feature subset: the logged score was
        # obtained on the full feature set and is not directly comparable.
        model.set_params(**base_params)
        best_score = np.mean(_third_pass_cv_scores(model, model_name, callbacks, kf,
                                                   X_target, y_target, returns, weights, groups))
        best_fine_params = base_params
        for params in ParameterGrid(fine_tune_grid):
            print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
            # Overlay the variation on the full parameter set so parameters the
            # variation does not mention come from this target's own best set.
            candidate = {**base_params, **params}
            model.set_params(**candidate)
            fold_accuracies = _third_pass_cv_scores(model, model_name, callbacks, kf,
                                                    X_target, y_target, returns, weights, groups)
            mean_acc = np.mean(fold_accuracies)
            print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
            if mean_acc > best_score:
                best_score = mean_acc
                best_fine_params = candidate
                param_log[model_name][target_str]['best_params'] = best_fine_params
                param_log[model_name][target_str]['mean_accuracy'] = best_score
                param_summary.append({
                    'model': model_name,
                    'ID_TARGET': target_str,
                    'best_params': best_fine_params,
                    'mean_accuracy': best_score,
                    'fold_accuracies': fold_accuracies
                })
        model.set_params(**best_fine_params)
        _third_pass_fit(model, model_name, callbacks, X_target, y_target, weights, X_target, y_target)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Save parameter log
# Persist the per-model, per-target tuning record as JSON and hand it to the
# Colab download helper.
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
with open(param_log_path, 'w') as log_file:
    json.dump(param_log, log_file, indent=4)
print(f"Saved hyperparameter log to: {param_log_path}", flush=True)
files.download(param_log_path)

# Ensemble predictions
# Blend the two models' probabilities per (target, day) group and turn the
# blended value into a +1 / -1 call for every test row of that group.
final_predictions = {}
missing_day_targets = []
for target in X_test['ID_TARGET'].unique():
    target_str = str(target)
    # Day of every test row of this target, indexed by test ID.
    day_of_row = X_test.loc[X_test['ID_TARGET'] == target, 'ID_DAY']
    model_preds = predictions.get(target_str)
    lgbm_lookup = model_preds.get('LightGBM', {}) if model_preds is not None else {}
    xgb_lookup = model_preds.get('XGBoost', {}) if model_preds is not None else {}
    # Group once per target instead of masking the whole frame for every day.
    # dropna=False keeps rows whose day is NaN, so they still get a prediction
    # rather than triggering a reset of the whole target to -1.
    for day, day_rows in day_of_row.groupby(day_of_row, sort=False, dropna=False):
        day_test_ids = day_rows.index
        if model_preds is None:
            # Target never seen in training: fall back to the default call
            # instead of failing on the missing predictions entry.
            print(f"Warning: No model predictions for ID_TARGET {target} on ID_DAY {day}. Using default prediction.")
            missing_day_targets.append((target, day))
            for idx in day_test_ids:
                final_predictions[idx] = -1
            continue
        lgbm_preds = np.array([lgbm_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        xgb_preds = np.array([xgb_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
        # Groups are never empty, so the last EWM value always exists.
        smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1]
        day_call = 1 if smoothed_preds >= 0.5 else -1
        for idx in day_test_ids:
            final_predictions[idx] = day_call

if missing_day_targets:
    print(f"Missing day-target pairs: {missing_day_targets}")

# Create output CSV
# One row per test ID; any ID that received no prediction is appended with the
# default value -1 before the file is written.
output_df = (
    pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET'])
    .rename_axis('ID')
    .reset_index()
    .sort_values('ID')
)
expected_ids = set(X_test.index)
submission_ids = set(output_df['ID'])
if submission_ids != expected_ids:
    missing_ids = expected_ids - submission_ids
    print(f"Warning: Submission missing {len(missing_ids)} IDs: {missing_ids}")
    missing_df = pd.DataFrame({'ID': list(missing_ids), 'RET_TARGET': -1})
    output_df = pd.concat([output_df, missing_df], ignore_index=True).sort_values('ID')
output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Validate output
# Read the written file back and print a few sanity figures.
test_csv = pd.read_csv(output_path)
id_column = test_csv['ID']
missing_value_count = test_csv.isna().sum().sum()
print(f"Output CSV shape: {test_csv.shape}", flush=True)
print(f"Output CSV ID range: {id_column.min()} to {id_column.max()}", flush=True)
print(f"Unique RET_TARGET values: {test_csv['RET_TARGET'].unique()}", flush=True)
print(f"Missing values: {missing_value_count}", flush=True)

# Download the file
files.download(output_path)

# Print final results
print("\nValidation Results:", flush=True)
for model_name in results:
    acc = results[model_name]
    print(f"{model_name}: {acc:.4f}", flush=True)

XGBoost version: 3.0.2
Selected top return columns: ['RET_262', 'RET_88', 'RET_172', 'RET_259', 'RET_150', 'RET_118', 'RET_216', 'RET_268', 'RET_115', 'RET_261', 'RET_59', 'RET_238', 'RET_121', 'RET_97', 'RET_30']

Training LightGBM...


LightGBM Targets: 100%|██████████| 100/100 [16:39<00:00,  9.99s/it]


LightGBM Weighted Accuracy: 0.7572

LightGBM Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'bagging_fraction': 0.5, 'learning_rate': 0.01, 'min_child_samples': 25, 'num_leaves': 10, 'reg_alpha': 0.05, 'reg_lambda': 0.05} | Mean Accuracy: 0.9697 | Fold Accuracies: [np.float64(0.8340425531914893), np.float64(0.9375), np.float64(0.7250996015936255), np.float64(0.9069767441860465), np.float64(0.8433734939759037)]
ID_TARGET: 129.0 | Best Params: {'bagging_fraction': 0.5, 'learning_rate': 0.01, 'min_child_samples': 25, 'num_leaves': 10, 'reg_alpha': 0.05, 'reg_lambda': 0.05} | Mean Accuracy: 0.6779 | Fold Accuracies: [np.float64(0.6040816326530613), np.float64(0.6219512195121951), np.float64(0.6174242424242424), np.float64(0.5949367088607594), np.float64(0.7127272727272728)]
ID_TARGET: 136.0 | Best Params: {'bagging_fraction': 0.5, 'learning_rate': 0.01, 'min_child_samples': 25, 'num_leaves': 10, 'reg_alpha': 0.05, 'reg_lambda': 0.05} | Mean Accuracy: 0.6441 | Fold Accuracies: [np.floa

XGBoost Targets: 100%|██████████| 100/100 [13:51<00:00,  8.32s/it]


XGBoost Weighted Accuracy: 0.7645

XGBoost Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'reg_alpha': 0.3, 'reg_lambda': 1.0, 'subsample': 0.8} | Mean Accuracy: 0.7133 | Fold Accuracies: [np.float64(0.7574468085106383), np.float64(0.67578125), np.float64(0.7729083665338645), np.float64(0.6937984496124031), np.float64(0.6666666666666666)]
ID_TARGET: 129.0 | Best Params: {'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'reg_alpha': 0.3, 'reg_lambda': 1.0, 'subsample': 0.8} | Mean Accuracy: 0.6275 | Fold Accuracies: [np.float64(0.6040816326530613), np.float64(0.6869918699186992), np.float64(0.6060606060606061), np.float64(0.6075949367088608), np.float64(0.6327272727272727)]
ID_TARGET: 136.0 | Best Params: {'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'reg_alpha': 0.3, 'reg_lambda': 1.0, 'subsample': 0.8} | Mean Accuracy: 1.0000 |

LightGBM Targets: 100%|██████████| 100/100 [00:54<00:00,  1.83it/s]


Second-pass XGBoost...



XGBoost Targets: 100%|██████████| 100/100 [00:53<00:00,  1.85it/s]


Performing third-pass training for low-performing targets...

Third-pass LightGBM...



LightGBM Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 44150.57it/s]


Third-pass XGBoost...



XGBoost Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 57888.76it/s]

Saved hyperparameter log to: /content/hyperparameters.json





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved predictions to: /content/predictions_ensemble.csv
Output CSV shape: (114468, 2)
Output CSV ID range: 0 to 114467
Unique RET_TARGET values: [ 1 -1]
Missing values: 0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Validation Results:
LightGBM: 0.7572
XGBoost: 0.7645


New method: keep the 74.22 LightGBM setup but improve XGBoost -- 74.219

In [None]:
import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid

# Check xgboost version for compatibility
print(f"XGBoost version: {xgboost.__version__}")
xgb_major_version = int(xgboost.__version__.split('.')[0])
if xgb_major_version < 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Set up data directory
DATA_DIR = '/content'

# Load datasets
# The three modelling files share the same read options: 'ID' becomes the
# index and every column is parsed as float32.
numeric_csv_options = {'index_col': 'ID', 'dtype': np.float32}
try:
    X_train = pd.read_csv(os.path.join(DATA_DIR, 'X_train_itDkypA.csv'), **numeric_csv_options)
    y_train = pd.read_csv(os.path.join(DATA_DIR, 'y_train_3LeeT2g.csv'), **numeric_csv_options)
    X_test = pd.read_csv(os.path.join(DATA_DIR, 'X_test_Beg4ey3.csv'), **numeric_csv_options)
    supplementary = pd.read_csv(os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv'))
except FileNotFoundError as e:
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.")

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights=None):
    """Return the weighted share of correctly predicted directions.

    Args:
        y_true: Ground truth, either signed values (e.g. returns) or 0/1
            labels. Values greater than 0 count as the positive direction and
            everything else as the negative direction.
        y_pred: Predicted labels; ``1`` is the positive direction and any
            other value the negative direction.
        weights: Non-negative importance of each sample. When omitted,
            ``|y_true|`` is used.

    Returns:
        ``sum(weights * correct) / sum(weights)`` as a float, or ``0.0`` when
        the total weight is zero (including empty input).
    """
    # Plain arrays avoid any index alignment between pandas inputs.
    truth = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred)
    sample_weights = np.abs(truth) if weights is None else np.abs(np.asarray(weights, dtype=float))

    # np.sign would map a 0 label to 0, which can never equal a -1 prediction;
    # treat every non-positive truth value as the negative direction instead.
    true_direction = np.where(truth > 0, 1, -1)
    pred_direction = np.where(predicted == 1, 1, -1)

    total_weight = sample_weights.sum()
    if total_weight == 0:
        return 0.0
    return float(np.sum(sample_weights * (true_direction == pred_direction)) / total_weight)

# Preprocess data with enhanced feature selection
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Build the model feature frame from the raw return columns.

    Steps, in order: choose the return columns to engineer from, fill their
    NaNs, add sector averages and target-sector dummies (when ``supplementary``
    is given), add row mean/std, per-day cross-sectional ranks, pairwise
    products of the most correlated columns (training only), two PCA
    components, and finally standardise every feature column.

    Args:
        X: Frame with ``ID_DAY``, ``ID_TARGET`` and ``RET_*`` columns. NaN
            filling assigns into this frame before it is rebound.
        y: Frame with a ``RET_TARGET`` column; required when ``is_train`` is
            true. A ``SIGN_TARGET`` column is added to it in place.
        supplementary: Optional frame with ``ID_asset`` and ``CLASS_LEVEL_1``.
        is_train: Whether ``X`` is the training frame.
        top_cols: Return columns to engineer from. When ``None`` in training
            mode, up to 15 columns are selected and returned for reuse.

    Returns:
        ``(X, y, feature_cols, top_cols)`` in training mode and
        ``(X, None, feature_cols, top_cols)`` otherwise.

    Raises:
        ValueError: If ``y`` is missing in training mode, or if no return
            column remains after applying ``top_cols``.

    Note:
        The PCA and the scaler are fit on whichever frame is passed in, so the
        training and test frames are transformed independently of each other.
    """
    if is_train and y is None:
        raise ValueError("y is required when is_train=True.")

    ret_cols = [col for col in X.columns if col.startswith('RET_')]
    if is_train and top_cols is None:
        if len(ret_cols) > 15:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            valid_cols = variances[variances > 1e-5].index
            if len(valid_cols) < 15:
                top_cols = valid_cols.tolist()
            else:
                top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(15).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")
    if top_cols is not None:
        # Narrow in every mode, including right after the training selection.
        # Otherwise the training frame derives its aggregate features from all
        # return columns while the test frame derives the same-named features
        # from the selected subset only.
        selected = set(top_cols)
        ret_cols = [col for col in ret_cols if col in selected]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    sector_medians = {}
    if supplementary is not None:
        sector_map = supplementary.set_index('ID_asset')['CLASS_LEVEL_1']
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            if pd.notna(sector):
                sector_assets = supplementary[supplementary['CLASS_LEVEL_1'] == sector]['ID_asset']
                sector_cols = [f'RET_{a}' for a in sector_assets if f'RET_{a}' in X.columns]
                if sector_cols:
                    sector_medians[col] = X[sector_cols].median(axis=1)
            # Prefer the same-row median of the asset's sector; fall back to
            # the column median when no sector information is available.
            if col in sector_medians:
                X[col] = X[col].fillna(sector_medians[col])
            else:
                X[col] = X[col].fillna(X[col].median())

    new_cols = {}
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        sector_groups = supplementary.groupby(level)['ID_asset'].apply(list).to_dict()
        for sector, assets in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in assets if f'RET_{asset}' in ret_cols]
            if asset_cols:
                # Keep the values positionally: the merge below replaces the
                # frame's index, and new_cols is later assembled against that
                # new index, so a Series carrying the old index would be
                # re-aligned by label rather than by row.
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = (
                    X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0).to_numpy()
                )
        X = X.merge(supplementary[['ID_asset', level]].rename(columns={'ID_asset': 'ID_TARGET', level: f'TARGET_{level}'}),
                    on='ID_TARGET', how='left')
        X = pd.concat([X, pd.get_dummies(X[f'TARGET_{level}'], prefix=f'TARGET_{level}', dummy_na=True)], axis=1)
        X = X.drop(f'TARGET_{level}', axis=1)

    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    new_cols.update({f'CS_RANK_{col}': X.groupby('ID_DAY')[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0) for col in ret_cols})

    if is_train:
        # Products of the five most correlated column pairs; only built for
        # the training frame.
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:5]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    if ret_cols:
        pca = PCA(n_components=min(2, len(ret_cols)), random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        for i in range(pca_features.shape[1]):
            new_cols[f'PCA_{i}'] = pca_features[:, i]

    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)

    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))

    if is_train:
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Preprocess data
# The training call selects the return columns; the test call reuses them.
try:
    X_train, y_train, feature_cols, top_cols = preprocess_data(
        X_train, y=y_train, supplementary=supplementary, is_train=True
    )
    X_test, _, test_feature_cols, _ = preprocess_data(
        X_test, supplementary=supplementary, is_train=False, top_cols=top_cols
    )
except ValueError as e:
    print(f"Preprocessing error: {e}")
    raise

# Align columns
# Keep only the columns present in both frames, in the training frame's order.
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]
non_feature_cols = ('ID_DAY', 'ID_TARGET')
feature_cols = [col for col in common_cols if col not in non_feature_cols]

# Align y_train
y_train = y_train.loc[X_train.index]

# Define models with updated XGBoost hyperparameter grid
# Each entry holds the base estimator, the per-target tuning grid and the
# callbacks handed to the training function.
models = {
    'LightGBM': {
        # Regularisation is given as reg_alpha / reg_lambda, the names the grid
        # below tunes. They are aliases of LightGBM's lambda_l1 / lambda_l2;
        # when both spellings are supplied the primary name takes precedence,
        # so setting lambda_l1 / lambda_l2 here would override the grid values.
        # The subsample alias of bagging_fraction is omitted for the same
        # reason: bagging_fraction is the spelling the grid tunes.
        'model': LGBMClassifier(num_leaves=31, min_child_samples=15, reg_alpha=0.3, reg_lambda=0.3,
                                colsample_bytree=0.8, bagging_freq=5, bagging_fraction=0.7,
                                random_state=42, n_jobs=1, verbosity=-1, class_weight='balanced', force_row_wise=True),
        'param_grid': [
            {'learning_rate': [0.02], 'num_leaves': [15], 'min_child_samples': [20], 'bagging_fraction': [0.6], 'reg_alpha': [0.1], 'reg_lambda': [0.1]},
            {'learning_rate': [0.05], 'num_leaves': [31], 'min_child_samples': [10], 'bagging_fraction': [0.7], 'reg_alpha': [0.2], 'reg_lambda': [0.2]},
            {'learning_rate': [0.07], 'num_leaves': [47], 'min_child_samples': [15], 'bagging_fraction': [0.8], 'reg_alpha': [0.3], 'reg_lambda': [0.3]}
        ],
        'callbacks': [lightgbm.early_stopping(stopping_rounds=10, verbose=False)]
    },
    'XGBoost': {
        # scale_pos_weight is the negative/positive label ratio over the whole
        # training set, not per target.
        'model': XGBClassifier(max_depth=4, min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
                               random_state=42, n_jobs=1, eval_metric='logloss', verbosity=0,
                               scale_pos_weight=len(y_train[y_train['SIGN_TARGET'] == 0]) / len(y_train[y_train['SIGN_TARGET'] == 1])),
        'param_grid': [
            {'learning_rate': [0.02], 'max_depth': [2], 'min_child_weight': [1], 'subsample': [0.5], 'n_estimators': [150], 'reg_alpha': [0.05], 'reg_lambda': [0.5]},
            {'learning_rate': [0.03], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [150], 'reg_alpha': [0.1], 'reg_lambda': [1.0]},
            {'learning_rate': [0.05], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [200], 'reg_alpha': [0.2], 'reg_lambda': [1.5]},
            {'learning_rate': [0.07], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [150], 'reg_alpha': [0.3], 'reg_lambda': [1.0]},
            {'learning_rate': [0.04], 'max_depth': [3], 'min_child_weight': [2], 'subsample': [0.65], 'n_estimators': [175], 'reg_alpha': [0.15], 'reg_lambda': [1.2]}
        ],
        # NOTE(review): the training code appears to use this list only as a
        # switch for passing eval_set; confirm whether early stopping is meant
        # to be active for XGBoost.
        'callbacks': [xgboost.callback.EarlyStopping(rounds=10, metric_name='logloss', save_best=True)]
    }
}

# Function to train and predict for a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks):
    """Tune, cross-validate and fit ``model`` on the rows of a single ID_TARGET.

    Every combination in ``param_grid`` is scored with ``kf`` (grouped by ID_DAY);
    the best-scoring combination is refit on all rows of the target and used to
    predict the matching test rows.

    Args:
        target: ID_TARGET value selecting the train/test rows.
        X_train: Training frame containing 'ID_TARGET', 'ID_DAY' and ``feature_cols``.
        y_train: Frame sharing ``X_train``'s index, with 'RET_TARGET' and 'SIGN_TARGET'.
        X_test: Test frame containing 'ID_TARGET' and ``feature_cols``.
        feature_cols: Columns fed to the model.
        model: Estimator exposing set_params/get_params/fit/predict/predict_proba
            and ``feature_importances_``. It is mutated via set_params.
        model_name: 'LightGBM' or 'XGBoost'; selects the fit() call signature.
        kf: Splitter whose ``split(X, y, groups)`` yields positional indices.
        param_grid: Grid specification accepted by ParameterGrid.
        callbacks: Callback list. Forwarded to fit() for LightGBM only; for XGBoost
            a non-empty list merely switches on the eval_set.

    Returns:
        Tuple ``(str(target), best_params, best_score, param_results, test_ids,
        preds, top_features)``. When the target has fewer than 100 training rows,
        no test rows, or no parameter set scores above 0, the tuple carries empty
        params/results, a score of 0 and all-zero predictions.
    """
    train_mask = X_train['ID_TARGET'] == target
    test_mask = X_test['ID_TARGET'] == target

    X_target = X_train.loc[train_mask, feature_cols]
    y_target = y_train.loc[train_mask, 'SIGN_TARGET']
    # Signed returns drive the validation metric; their magnitude is the sample weight.
    ret_target = y_train.loc[train_mask, 'RET_TARGET']
    weights = ret_target.abs()
    low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]
    weight_scale = 1.5 if target in low_performing_targets else 1.0
    if len(X_target) < 500:
        weight_scale *= 1.5
    weights = weights * weight_scale
    groups = X_train.loc[train_mask, 'ID_DAY']
    X_test_target = X_test.loc[test_mask, feature_cols]
    test_ids = X_test_target.index

    if len(X_target) < 100 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    def _fit(estimator, X_fit, y_fit, w_fit, X_eval, y_eval):
        # Single place for the per-library fit() signature.
        if model_name == 'LightGBM' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)],
                          eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
        else:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit)

    param_results = []
    best_params = None
    best_score = 0
    best_model = None

    for params in ParameterGrid(param_grid):
        print(f"Testing params for {model_name} ID_TARGET {target}: {params}", flush=True)
        try:
            model.set_params(**params)
        except ValueError as e:
            print(f"Invalid params for {model_name} ID_TARGET {target}: {e}")
            continue
        fold_accuracies = []
        for train_idx, val_idx in kf.split(X_target, y_target, groups):
            X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
            y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
            w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
            _fit(model, X_tr, y_tr, w_tr, X_val, y_val)
            y_pred = model.predict(X_val)
            # Score against the signed returns. Passing the 0/1 labels would give every
            # negative sample zero weight and reduce the metric to positive-class recall.
            acc = weighted_accuracy(ret_target.iloc[val_idx], y_pred, w_val)
            fold_accuracies.append(acc)
        mean_acc = np.mean(fold_accuracies)
        print(f"{model_name} | ID_TARGET: {target} | Params: {params} | Mean Accuracy: {mean_acc:.4f} | Fold Accuracies: {fold_accuracies}", flush=True)
        param_results.append({
            'params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies
        })
        if mean_acc > best_score:
            best_score = mean_acc
            best_params = params
            # Fresh, unfitted estimator carrying the configuration that just won.
            best_model = type(model)(**model.get_params())

    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    best_model.set_params(**best_params)
    # NOTE(review): the final fit evaluates on the training rows themselves, so any
    # early-stopping callback monitors training loss here - confirm that is intended.
    _fit(best_model, X_target, y_target, weights, X_target, y_target)
    preds = best_model.predict_proba(X_test_target)[:, 1].astype(np.float32)
    importance = best_model.feature_importances_
    feature_importance = dict(zip(feature_cols, importance))
    top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:30]
    gc.collect()
    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training
# First pass: tune and fit every model on every ID_TARGET in parallel, then collect
# the per-target predictions, tuning logs and top features.
kf = GroupKFold(n_splits=5)
results = {}
predictions = {}
ensemble_weights = {'LightGBM': 0.7, 'XGBoost': 0.3}
param_log = {}
top_features_all = {}
param_summary = []
low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]

for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train, y_train, X_test, feature_cols, model_info['model'], model_name, kf,
                             model_info['param_grid'], model_info['callbacks'])
        for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets")
    )
    accuracies = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))
        # Report the folds of the winning parameter set, not of whichever set was
        # tried last, so the fold list is consistent with mean_accuracy.
        best_entry = next((entry for entry in param_results if entry['params'] == params), None)
        param_summary.append({
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_entry['fold_accuracies'] if best_entry else []
        })
    # Skipped targets report an accuracy of 0 and are left out of the average.
    valid_accuracies = [acc for acc in accuracies if acc > 0]
    results[model_name] = np.mean(valid_accuracies) if valid_accuracies else float('nan')
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)
    print(f"\n{model_name} Parameter Summary:", flush=True)
    for summary in param_summary:
        if summary['model'] == model_name:
            print(f"ID_TARGET: {summary['ID_TARGET']} | Best Params: {summary['best_params']} | Mean Accuracy: {summary['mean_accuracy']:.4f} | Fold Accuracies: {summary['fold_accuracies']}", flush=True)

# Second-pass training with top 30 features
# Refit each model per target on the top features found in the first pass and
# replace that model's first-pass test predictions.
print("\nPerforming second-pass training with top 30 features...", flush=True)
for model_name, model_info in models.items():
    print(f"\nSecond-pass {model_name}...", flush=True)
    for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        train_mask = X_train['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        weights = y_train.loc[train_mask, 'RET_TARGET'].abs()
        weight_scale = 1.5 if target in low_performing_targets else 1.0
        if len(X_target) < 500:
            weight_scale *= 1.5
        weights = weights * weight_scale
        X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
        test_ids = X_test_target.index
        model = model_info['model']
        # Build a separate dict: writing n_estimators into the logged best_params would
        # alter param_log (and the saved hyperparameter file) after the fact.
        second_pass_params = {**param_log[model_name][target_str]['best_params'], 'n_estimators': 200}
        model.set_params(**second_pass_params)
        # NOTE(review): eval_set is the training data itself, so early stopping (where
        # callbacks are forwarded) monitors training loss - confirm that is intended.
        if model_name == 'LightGBM' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=model_info['callbacks'])
        elif model_name == 'XGBoost' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Third-pass training for low-performing targets
# For listed targets whose logged accuracy is below 0.75, try three variations of the
# logged best parameters with grouped CV on the top features, keep a variation only if
# it beats the logged score, then refit and replace that model's test predictions.
print("\nPerforming third-pass training for low-performing targets...", flush=True)
for model_name, model_info in models.items():
    print(f"\nThird-pass {model_name}...", flush=True)
    # param_log is keyed by str() of the ID_TARGET values taken from X_train (e.g. '139.0'),
    # whereas low_performing_targets holds ints; match them numerically, since str(139)
    # would never be found and the whole pass would be skipped silently.
    log_key_by_id = {float(key): key for key in param_log[model_name]}
    for target in tqdm(low_performing_targets, desc=f"{model_name} Low-Performing Targets"):
        target_str = log_key_by_id.get(float(target))
        if target_str is None or param_log[model_name][target_str]['mean_accuracy'] >= 0.75:
            continue
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        train_mask = X_train['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        ret_target = y_train.loc[train_mask, 'RET_TARGET']
        weights = ret_target.abs() * 1.5
        if len(X_target) < 500:
            weights = weights * 1.5
        # CV groups must be built here; they are not defined at module level.
        groups = X_train.loc[train_mask, 'ID_DAY']
        X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
        test_ids = X_test_target.index
        model = model_info['model']
        # Copy so the logged dict is only replaced when a candidate actually wins.
        best_params = dict(param_log[model_name][target_str]['best_params'])
        if not {'learning_rate', 'reg_alpha', 'reg_lambda'} <= best_params.keys():
            print(f"Warning: Incomplete best params for ID_TARGET {target}. Skipping.")
            continue
        base_lr = best_params['learning_rate']
        base_alpha = best_params['reg_alpha']
        base_lambda = best_params['reg_lambda']
        # Explicit candidates with scalar values; ParameterGrid only accepts sequences.
        fine_tune_candidates = [
            {'learning_rate': base_lr * 0.8, 'n_estimators': 200, 'reg_alpha': base_alpha + 0.1, 'reg_lambda': base_lambda + 0.1},
            {'learning_rate': base_lr * 1.0, 'n_estimators': 250, 'reg_alpha': base_alpha, 'reg_lambda': base_lambda},
            {'learning_rate': base_lr * 1.2, 'n_estimators': 200, 'reg_alpha': base_alpha + 0.1, 'reg_lambda': base_lambda + 0.1}
        ]
        best_score = param_log[model_name][target_str]['mean_accuracy']
        # Fallback when no candidate wins: the logged params with the second pass's tree count.
        best_fine_params = {'n_estimators': 200, **best_params}
        for overrides in fine_tune_candidates:
            # Start from this target's own best params: the shared model instance still
            # carries whatever the previously processed target left behind.
            params = {**best_params, **overrides}
            print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
            model.set_params(**params)
            fold_accuracies = []
            for train_idx, val_idx in kf.split(X_target, y_target, groups):
                X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
                y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
                w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
                if model_name == 'LightGBM' and model_info['callbacks']:
                    model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_val, y_val)], eval_metric='logloss', callbacks=model_info['callbacks'])
                elif model_name == 'XGBoost' and model_info['callbacks']:
                    model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_val, y_val)], verbose=False)
                else:
                    model.fit(X_tr, y_tr, sample_weight=w_tr)
                y_pred = model.predict(X_val)
                # Score against signed returns; 0/1 labels would zero-weight every negative sample.
                acc = weighted_accuracy(ret_target.iloc[val_idx], y_pred, w_val)
                fold_accuracies.append(acc)
            mean_acc = np.mean(fold_accuracies)
            print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
            if mean_acc > best_score:
                best_score = mean_acc
                best_fine_params = params
                param_log[model_name][target_str]['best_params'] = best_fine_params
                param_log[model_name][target_str]['mean_accuracy'] = best_score
                param_summary.append({
                    'model': model_name,
                    'ID_TARGET': target_str,
                    'best_params': best_fine_params,
                    'mean_accuracy': best_score,
                    'fold_accuracies': fold_accuracies
                })
        model.set_params(**best_fine_params)
        if model_name == 'LightGBM' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=model_info['callbacks'])
        elif model_name == 'XGBoost' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Save parameter log
# Write the per-model, per-target tuning log as indented JSON inside DATA_DIR,
# then hand the file to the Colab download helper.
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
with open(param_log_path, 'w') as log_file:
    json.dump(param_log, log_file, indent=4)
print(f"Saved hyperparameter log to: {param_log_path}", flush=True)
files.download(param_log_path)

# Ensemble predictions
# Blend the two models' probabilities per (ID_TARGET, ID_DAY) group and turn the
# blended value into a +1 / -1 label for every test row of that group.
final_predictions = {}
missing_day_targets = []
for target in X_test['ID_TARGET'].unique():
    target_str = str(target)
    # Slice the target once and group by day, instead of rebuilding a full-frame
    # boolean mask for every (target, day) pair.
    target_rows = X_test[X_test['ID_TARGET'] == target]
    # A target without stored predictions falls back to the neutral 0.5 below
    # rather than raising KeyError.
    model_preds = predictions.get(target_str, {})
    lgbm_lookup = model_preds.get('LightGBM', {})
    xgb_lookup = model_preds.get('XGBoost', {})

    # groupby drops NaN days; give those rows the default label explicitly, without
    # touching the other rows of the target.
    nan_day_ids = target_rows.index[target_rows['ID_DAY'].isna().to_numpy()]
    if len(nan_day_ids) > 0:
        day = np.nan
        print(f"Warning: No test samples for ID_TARGET {target} on ID_DAY {day}. Using default prediction.")
        missing_day_targets.append((target, day))
        for idx in nan_day_ids:
            final_predictions[idx] = -1

    for day, day_rows in target_rows.groupby('ID_DAY', sort=False):
        day_test_ids = day_rows.index
        lgbm_preds = np.array([lgbm_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        xgb_preds = np.array([xgb_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
        # The last value of the exponentially weighted mean decides the label of the whole group.
        smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1]
        label = 1 if smoothed_preds >= 0.5 else -1
        for idx in day_test_ids:
            final_predictions[idx] = label

if missing_day_targets:
    print(f"Missing day-target pairs: {missing_day_targets}")

# Create output CSV
# One row per test ID with its predicted label; any test ID that received no
# prediction is appended with the default label -1 before the file is written.
output_df = (
    pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET'])
    .reset_index()
)
output_df.columns = ['ID', 'RET_TARGET']
output_df = output_df.sort_values('ID')

expected_ids = set(X_test.index)
submission_ids = set(output_df['ID'])
if submission_ids != expected_ids:
    missing_ids = expected_ids - submission_ids
    print(f"Warning: Submission missing {len(missing_ids)} IDs: {missing_ids}")
    missing_df = pd.DataFrame({'ID': list(missing_ids), 'RET_TARGET': -1})
    output_df = pd.concat([output_df, missing_df], ignore_index=True)
    output_df = output_df.sort_values('ID')

output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Validate output
# Read the written file back and print a few sanity figures about it.
test_csv = pd.read_csv(output_path)
id_column = test_csv['ID']
print(f"Output CSV shape: {test_csv.shape}", flush=True)
print(f"Output CSV ID range: {id_column.min()} to {id_column.max()}", flush=True)
print(f"Unique RET_TARGET values: {test_csv['RET_TARGET'].unique()}", flush=True)
print(f"Missing values: {test_csv.isna().sum().sum()}", flush=True)

# Download the file
files.download(output_path)

# Print final results
# One line per model with its averaged first-pass validation score.
print("\nValidation Results:", flush=True)
for model_name in results:
    acc = results[model_name]
    print(f"{model_name}: {acc:.4f}", flush=True)

XGBoost version: 3.0.2
Selected top return columns: ['RET_262', 'RET_88', 'RET_172', 'RET_259', 'RET_150', 'RET_118', 'RET_216', 'RET_268', 'RET_115', 'RET_261', 'RET_59', 'RET_238', 'RET_121', 'RET_97', 'RET_30']

Training LightGBM...


LightGBM Targets: 100%|██████████| 100/100 [10:32<00:00,  6.33s/it]


LightGBM Weighted Accuracy: 0.7327

LightGBM Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.02, 'min_child_samples': 20, 'num_leaves': 15, 'reg_alpha': 0.1, 'reg_lambda': 0.1} | Mean Accuracy: 0.9173 | Fold Accuracies: [np.float64(0.7531914893617021), np.float64(0.83203125), np.float64(0.7330677290836654), np.float64(0.8255813953488372), np.float64(0.8112449799196787)]
ID_TARGET: 129.0 | Best Params: {'bagging_fraction': 0.6, 'learning_rate': 0.02, 'min_child_samples': 20, 'num_leaves': 15, 'reg_alpha': 0.1, 'reg_lambda': 0.1} | Mean Accuracy: 0.6481 | Fold Accuracies: [np.float64(0.5959183673469388), np.float64(0.6504065040650406), np.float64(0.6363636363636364), np.float64(0.6118143459915611), np.float64(0.7090909090909091)]
ID_TARGET: 136.0 | Best Params: {'bagging_fraction': 0.8, 'learning_rate': 0.07, 'min_child_samples': 15, 'num_leaves': 47, 'reg_alpha': 0.3, 'reg_lambda': 0.3} | Mean Accuracy: 0.6080 | Fold Accuracies: [np.float6

XGBoost Targets: 100%|██████████| 100/100 [21:42<00:00, 13.03s/it]


XGBoost Weighted Accuracy: 0.7672

XGBoost Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'learning_rate': 0.04, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 175, 'reg_alpha': 0.15, 'reg_lambda': 1.2, 'subsample': 0.65} | Mean Accuracy: 0.7161 | Fold Accuracies: [np.float64(0.7914893617021277), np.float64(0.70703125), np.float64(0.7649402390438247), np.float64(0.6705426356589147), np.float64(0.6465863453815262)]
ID_TARGET: 129.0 | Best Params: {'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'reg_alpha': 0.3, 'reg_lambda': 1.0, 'subsample': 0.8} | Mean Accuracy: 0.6275 | Fold Accuracies: [np.float64(0.6163265306122448), np.float64(0.6260162601626016), np.float64(0.5909090909090909), np.float64(0.5780590717299579), np.float64(0.5854545454545454)]
ID_TARGET: 136.0 | Best Params: {'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'reg_alpha': 0.3, 'reg_lambda': 1.0, 'subsample': 0.8} | Mean Accuracy: 1.0000

LightGBM Targets: 100%|██████████| 100/100 [01:01<00:00,  1.62it/s]


Second-pass XGBoost...



XGBoost Targets: 100%|██████████| 100/100 [00:54<00:00,  1.83it/s]


Performing third-pass training for low-performing targets...

Third-pass LightGBM...



LightGBM Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 55721.43it/s]


Third-pass XGBoost...



XGBoost Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 55320.56it/s]

Saved hyperparameter log to: /content/hyperparameters.json





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved predictions to: /content/predictions_ensemble.csv
Output CSV shape: (114468, 2)
Output CSV ID range: 0 to 114467
Unique RET_TARGET values: [ 1 -1]
Missing values: 0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Validation Results:
LightGBM: 0.7327
XGBoost: 0.7672


Keep XGBoost original, but tune down LightGBM.

In [None]:
import numpy as np
import pandas as pd
import os
import gc
from google.colab import files
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
import lightgbm
from xgboost import XGBClassifier
import xgboost
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import json
from sklearn.model_selection import ParameterGrid

# Check xgboost version for compatibility
# Only the major component of the version string is inspected.
print(f"XGBoost version: {xgboost.__version__}")
xgb_major_version = int(xgboost.__version__.split('.')[0])
if xgb_major_version < 1:
    print("Warning: XGBoost version < 1.0.0 detected. Ensure compatibility with eval_metric in constructor.")

# Set up data directory
DATA_DIR = '/content'

# Load datasets
# The train/test tables are indexed by 'ID' and read as float32; the supplementary
# table is read with pandas defaults. A missing file aborts with a hint about DATA_DIR.
indexed_float32 = {'index_col': 'ID', 'dtype': np.float32}
try:
    X_train = pd.read_csv(os.path.join(DATA_DIR, 'X_train_itDkypA.csv'), **indexed_float32)
    y_train = pd.read_csv(os.path.join(DATA_DIR, 'y_train_3LeeT2g.csv'), **indexed_float32)
    X_test = pd.read_csv(os.path.join(DATA_DIR, 'X_test_Beg4ey3.csv'), **indexed_float32)
    supplementary = pd.read_csv(os.path.join(DATA_DIR, 'supplementary_data_Vkoyn8z.csv'))
except FileNotFoundError as e:
    raise FileNotFoundError(f"Dataset not found: {e}. Please ensure files are uploaded to {DATA_DIR}.")

# Custom weighted accuracy metric
def weighted_accuracy(y_true, y_pred, weights):
    """Return the weight-averaged fraction of correctly predicted directions.

    Args:
        y_true: Ground truth, either signed returns or 0/1 class labels. Values
            greater than 0 count as the positive class, everything else as negative.
        y_pred: Predicted class labels; 1 means positive, any other value negative.
        weights: Per-sample weights (absolute values are used). If None, the
            magnitude of ``y_true`` is used instead.

    Returns:
        ``sum(w * correct) / sum(w)`` as a float, or 0.0 when the weights sum to zero.
    """
    # Work on plain arrays so pandas inputs with different indexes are compared
    # positionally rather than being re-aligned by label.
    true_values = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred).ravel()
    if weights is None:
        sample_weights = np.abs(true_values)
    else:
        sample_weights = np.abs(np.asarray(weights, dtype=float)).ravel()

    # `> 0` instead of np.sign keeps 0/1 labels usable: sign(0) == 0 would match
    # neither +1 nor -1 and mark every negative sample as wrong.
    true_direction = np.where(true_values > 0, 1, -1)
    predicted_direction = np.where(predicted == 1, 1, -1)

    total_weight = sample_weights.sum()
    if total_weight == 0:
        return 0.0
    correct = true_direction == predicted_direction
    return float(np.sum(sample_weights * correct) / total_weight)

# Preprocess data with enhanced feature selection
def preprocess_data(X, y=None, supplementary=None, is_train=True, top_cols=None):
    """Impute, engineer and standardise the features of one data split.

    Args:
        X: Frame with 'ID_DAY', 'ID_TARGET' and 'RET_<asset id>' columns.
        y: Frame with 'RET_TARGET'; required when ``is_train`` is True.
        supplementary: Optional frame with 'ID_asset' and 'CLASS_LEVEL_1', used for
            sector-based imputation, sector averages and target-sector dummies.
        is_train: Whether this is the training split (selects ``top_cols`` when
            none are given, adds CORR_* features and builds 'SIGN_TARGET').
        top_cols: Return columns the engineered features are built from. When None
            on the training split, up to 15 are chosen by variance * summed |corr|.

    Returns:
        ``(X, y, feature_cols, top_cols)``; ``y`` is None for non-training splits.
        The inputs are not modified.

    Raises:
        ValueError: If ``y`` is missing for a training split, no return column is
            selected, or the supplementary merge changes the number of rows.
    """
    if is_train and y is None:
        raise ValueError("y is required when is_train=True.")

    # Work on a copy: columns are overwritten below and the caller's frame should stay intact.
    X = X.copy()
    ret_cols = [col for col in X.columns if col.startswith('RET_')]
    if is_train and top_cols is None:
        if len(ret_cols) > 15:
            variances = X[ret_cols].var()
            corr_sum = X[ret_cols].corr().abs().sum()
            combined_score = variances * corr_sum
            valid_cols = variances[variances > 1e-5].index
            if len(valid_cols) < 15:
                top_cols = valid_cols.tolist()
            else:
                top_cols = combined_score.loc[valid_cols].sort_values(ascending=False).head(15).index.tolist()
        else:
            top_cols = ret_cols
        print(f"Selected top return columns: {top_cols}")
    if top_cols is not None:
        # Apply the selection to the training split as well, so that train and test
        # features (mean/std, ranks, PCA, sector averages) come from the same columns.
        selected = set(top_cols)
        ret_cols = [col for col in ret_cols if col in selected]
        if not ret_cols:
            raise ValueError("No return columns selected. Check top_cols consistency.")

    # Fill gaps with the row-wise median of the asset's sector, else the column median.
    sector_medians = {}
    if supplementary is not None:
        sector_map = supplementary.set_index('ID_asset')['CLASS_LEVEL_1']
        for col in ret_cols:
            asset_id = int(col.replace('RET_', ''))
            sector = sector_map.get(asset_id, np.nan)
            if pd.notna(sector):
                sector_assets = supplementary[supplementary['CLASS_LEVEL_1'] == sector]['ID_asset']
                sector_cols = [f'RET_{a}' for a in sector_assets if f'RET_{a}' in X.columns]
                if sector_cols:
                    sector_medians[col] = X[sector_cols].median(axis=1)
            if col in sector_medians:
                X[col] = X[col].fillna(sector_medians[col])
            else:
                X[col] = X[col].fillna(X[col].median())

    new_cols = {}
    if supplementary is not None:
        level = 'CLASS_LEVEL_1'
        sector_groups = supplementary.groupby(level)['ID_asset'].apply(list).to_dict()
        for sector, assets in sector_groups.items():
            asset_cols = [f'RET_{asset}' for asset in assets if f'RET_{asset}' in ret_cols]
            if asset_cols:
                # Stored as a plain array: the merge below replaces X's index, and an
                # index-aligned Series would turn into NaN wherever the old labels
                # differ from the new positional ones.
                new_cols[f'SECTOR_AVG_{level}_{sector}'] = (
                    X[asset_cols].mean(axis=1).replace([np.inf, -np.inf], 0).fillna(0).to_numpy()
                )
        rows_before_merge = len(X)
        # NOTE(review): merging on a column discards X's index, so the returned frame
        # is indexed 0..n-1 rather than by ID - confirm callers expect that.
        X = X.merge(supplementary[['ID_asset', level]].rename(columns={'ID_asset': 'ID_TARGET', level: f'TARGET_{level}'}),
                    on='ID_TARGET', how='left')
        if len(X) != rows_before_merge:
            raise ValueError("Supplementary merge changed the number of rows; check ID_asset for duplicates.")
        X = pd.concat([X, pd.get_dummies(X[f'TARGET_{level}'], prefix=f'TARGET_{level}', dummy_na=True)], axis=1)
        X = X.drop(f'TARGET_{level}', axis=1)

    new_cols['MEAN_RET'] = X[ret_cols].mean(axis=1)
    new_cols['STD_RET'] = X[ret_cols].std(axis=1)
    # Percentile rank of each selected return within its day.
    new_cols.update({f'CS_RANK_{col}': X.groupby('ID_DAY')[col].rank(pct=True).replace([np.inf, -np.inf], 0).fillna(0) for col in ret_cols})

    if is_train:
        # NOTE(review): these interaction columns exist for the training split only; a
        # caller that intersects train/test columns will drop them again.
        corr_matrix = X[ret_cols].corr()
        corr_pairs = corr_matrix.unstack().sort_values(ascending=False)
        top_pairs = [(i, j) for i, j in corr_pairs.index if i < j][:5]
        for i, j in top_pairs:
            new_cols[f'CORR_{i}_{j}'] = X[i] * X[j]

    if ret_cols:
        # NOTE(review): PCA (and the scaler below) are refit on every call, so train and
        # test splits are transformed with independently fitted objects.
        pca = PCA(n_components=min(2, len(ret_cols)), random_state=42)
        pca_features = pca.fit_transform(X[ret_cols].fillna(0))
        for i in range(pca_features.shape[1]):
            new_cols[f'PCA_{i}'] = pca_features[:, i]

    new_cols_df = pd.DataFrame(new_cols, index=X.index, dtype=np.float32)
    X = pd.concat([X, new_cols_df], axis=1)

    feature_cols = [col for col in X.columns if col not in ['ID_DAY', 'ID_TARGET']]
    scaler = StandardScaler()
    X[feature_cols] = scaler.fit_transform(X[feature_cols].replace([np.inf, -np.inf], 0).fillna(0))

    if is_train:
        # Binary label on a copy of y: 1 for a strictly positive return, else 0.
        y = y.copy()
        y['SIGN_TARGET'] = np.where(y['RET_TARGET'] > 0, 1, 0).astype(np.int32)
        return X, y, feature_cols, top_cols
    return X, None, feature_cols, top_cols

# Preprocess data
# The column selection made on the training split is handed to the test split.
try:
    X_train, y_train, feature_cols, top_cols = preprocess_data(
        X_train, y_train, supplementary, is_train=True
    )
    X_test, _, test_feature_cols, _ = preprocess_data(
        X_test, supplementary=supplementary, is_train=False, top_cols=top_cols
    )
except ValueError as e:
    print(f"Preprocessing error: {e}")
    raise

# Align columns
# Keep only the columns present in both splits; the two id columns are not features.
common_cols = X_train.columns.intersection(X_test.columns)
X_train, X_test = X_train[common_cols], X_test[common_cols]
id_columns = ('ID_DAY', 'ID_TARGET')
feature_cols = [col for col in common_cols if col not in id_columns]

# Align y_train
# Select the label rows by the training frame's index labels.
y_train = y_train.loc[X_train.index]

# Define models with aggressively tuned-down LightGBM hyperparameter grid and original XGBoost
# Each entry holds the base estimator, the parameter sets to try per target, and a
# callback list that is handed to the training function.
models = {
    'LightGBM': {
        # Regularisation is configured through reg_alpha / reg_lambda only. These are
        # aliases of LightGBM's lambda_l1 / lambda_l2, and when both spellings are
        # supplied the lambda_* value wins, which would silently disable the
        # reg_alpha / reg_lambda values of the grid below. The redundant `subsample`
        # alias is dropped for the same reason; bagging_fraction is the value in effect.
        'model': LGBMClassifier(num_leaves=31, min_child_samples=15, reg_alpha=0.3, reg_lambda=0.3,
                                colsample_bytree=0.8, bagging_freq=5, bagging_fraction=0.7,
                                random_state=42, n_jobs=1, verbosity=-1, force_row_wise=True),
        'param_grid': [
            {'learning_rate': [0.001], 'num_leaves': [3], 'min_child_samples': [100], 'bagging_fraction': [0.3], 'colsample_bytree': [0.5], 'reg_alpha': [3.0], 'reg_lambda': [3.0], 'n_estimators': [20]},
            {'learning_rate': [0.005], 'num_leaves': [5], 'min_child_samples': [75], 'bagging_fraction': [0.4], 'colsample_bytree': [0.6], 'reg_alpha': [2.0], 'reg_lambda': [2.0], 'n_estimators': [30]},
            {'learning_rate': [0.007], 'num_leaves': [6], 'min_child_samples': [60], 'bagging_fraction': [0.3], 'colsample_bytree': [0.5], 'reg_alpha': [1.5], 'reg_lambda': [1.5], 'n_estimators': [40]},
            {'learning_rate': [0.01], 'num_leaves': [8], 'min_child_samples': [50], 'bagging_fraction': [0.4], 'colsample_bytree': [0.6], 'reg_alpha': [1.0], 'reg_lambda': [1.0], 'n_estimators': [50]}
        ],
        'callbacks': [lightgbm.early_stopping(stopping_rounds=5, verbose=False)]
    },
    'XGBoost': {
        # scale_pos_weight is the negative/positive label ratio over the whole training set.
        'model': XGBClassifier(max_depth=4, min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
                               random_state=42, n_jobs=1, eval_metric='logloss', verbosity=0,
                               scale_pos_weight=len(y_train[y_train['SIGN_TARGET'] == 0]) / len(y_train[y_train['SIGN_TARGET'] == 1])),
        'param_grid': [
            {'learning_rate': [0.03], 'max_depth': [3], 'min_child_weight': [1], 'subsample': [0.6], 'n_estimators': [150], 'reg_alpha': [0.1], 'reg_lambda': [1.0]},
            {'learning_rate': [0.05], 'max_depth': [4], 'min_child_weight': [2], 'subsample': [0.7], 'n_estimators': [200], 'reg_alpha': [0.2], 'reg_lambda': [1.5]},
            {'learning_rate': [0.07], 'max_depth': [5], 'min_child_weight': [3], 'subsample': [0.8], 'n_estimators': [150], 'reg_alpha': [0.3], 'reg_lambda': [1.0]}
        ],
        # NOTE(review): the training code appears to use this list only as a flag for
        # XGBoost (it is not forwarded to fit), so no early stopping takes place - confirm.
        'callbacks': [xgboost.callback.EarlyStopping(rounds=10, metric_name='logloss', save_best=True)]
    }
}

# Function to train and predict for a single target
def train_target(target, X_train, y_train, X_test, feature_cols, model, model_name, kf, param_grid, callbacks):
    """Tune, refit and predict one classifier for a single ``ID_TARGET``.

    Every parameter combination in ``param_grid`` is cross-validated with ``kf``
    (folds grouped by ``ID_DAY``).  The best combination is refit on all rows of
    the target and used to score that target's test rows.

    Fold scores are the ``|RET_TARGET|``-weighted share of validation rows whose
    predicted label equals ``SIGN_TARGET``.

    Args:
        target: Value of ``ID_TARGET`` selecting the rows to model.
        X_train: Training frame containing ``ID_TARGET``, ``ID_DAY`` and ``feature_cols``.
        y_train: Frame sharing ``X_train``'s index, with ``SIGN_TARGET`` and ``RET_TARGET``.
        X_test: Test frame containing ``ID_TARGET`` and ``feature_cols``.
        feature_cols: Names of the feature columns fed to the model.
        model: Estimator exposing ``set_params``/``get_params``/``fit``/``predict``/
            ``predict_proba``/``feature_importances_``.  It is mutated by ``set_params``.
        model_name: ``'LightGBM'`` or ``'XGBoost'``; selects the ``fit`` keyword set.
        kf: Splitter whose ``split(X, y, groups)`` yields positional index pairs.
        param_grid: Grid specification accepted by ``ParameterGrid``.
        callbacks: Callback list; when falsy the plain ``fit`` call is used.

    Returns:
        Tuple ``(str(target), best_params, best_score, param_results, test_ids,
        preds, top_features)``.  When the target has fewer than 100 training rows,
        no test rows, or no parameter set scores above 0, the tuple is
        ``(str(target), {}, 0, [], test_ids, zeros, [])``.
    """

    def _fit(estimator, X_fit, y_fit, w_fit, X_eval, y_eval):
        # Same keyword sets for the CV fits and the final refit.
        if model_name == 'LightGBM' and callbacks:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)],
                          eval_metric='logloss', callbacks=callbacks)
        elif model_name == 'XGBoost' and callbacks:
            # Only the eval_set is supplied here; `callbacks` is not forwarded to XGBoost.
            estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
        else:
            estimator.fit(X_fit, y_fit, sample_weight=w_fit)

    def _weighted_hit_rate(labels, predicted, sample_weights):
        # Compare labels directly.  SIGN_TARGET is expected to hold 0/1 classes, so a
        # sign-based comparison would never credit a correct class-0 prediction and
        # would give those rows zero weight.
        w = np.asarray(sample_weights, dtype=np.float64)
        total = w.sum()
        if total <= 0:
            return 0.0
        hits = np.asarray(predicted).ravel() == np.asarray(labels).ravel()
        return float(np.sum(w * hits) / total)

    train_mask = X_train['ID_TARGET'] == target
    test_mask = X_test['ID_TARGET'] == target

    X_target = X_train.loc[train_mask, feature_cols]
    y_target = y_train.loc[train_mask, 'SIGN_TARGET']
    weights = y_train.loc[train_mask, 'RET_TARGET'].abs()
    low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]
    # The scale is a constant for the whole target, applied to every row's weight.
    weight_scale = 1.5 if target in low_performing_targets else 1.0
    if len(X_target) < 500:
        weight_scale *= 1.5
    weights = weights * weight_scale
    groups = X_train.loc[train_mask, 'ID_DAY']
    X_test_target = X_test.loc[test_mask, feature_cols]
    test_ids = X_test_target.index

    if len(X_target) < 100 or len(X_test_target) == 0:
        print(f"Warning: Insufficient data for ID_TARGET {target} (train: {len(X_target)}, test: {len(X_test_target)}). Skipping.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    param_results = []
    best_params = None
    best_score = 0
    best_model = None

    for params in ParameterGrid(param_grid):
        print(f"Testing params for {model_name} ID_TARGET {target}: {params}", flush=True)
        try:
            model.set_params(**params)
        except ValueError as e:
            print(f"Invalid params for {model_name} ID_TARGET {target}: {e}")
            continue
        fold_accuracies = []
        for train_idx, val_idx in kf.split(X_target, y_target, groups):
            X_tr, X_val = X_target.iloc[train_idx], X_target.iloc[val_idx]
            y_tr, y_val = y_target.iloc[train_idx], y_target.iloc[val_idx]
            w_tr, w_val = weights.iloc[train_idx], weights.iloc[val_idx]
            _fit(model, X_tr, y_tr, w_tr, X_val, y_val)
            y_pred = model.predict(X_val)
            fold_accuracies.append(_weighted_hit_rate(y_val, y_pred, w_val))
        mean_acc = np.mean(fold_accuracies)
        print(f"{model_name} | ID_TARGET: {target} | Params: {params} | Mean Accuracy: {mean_acc:.4f} | Fold Accuracies: {fold_accuracies}", flush=True)
        param_results.append({
            'params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': fold_accuracies
        })
        if mean_acc > best_score:
            best_score = mean_acc
            best_params = params
            # Fresh, unfitted estimator carrying the winning configuration.
            best_model = type(model)(**model.get_params())

    if best_model is None:
        print(f"No valid model for {model_name} ID_TARGET {target}. Using default.")
        return str(target), {}, 0, [], test_ids, np.zeros(len(test_ids), dtype=np.float32), []

    best_model.set_params(**best_params)
    # The refit monitors the training rows themselves as the eval set.
    _fit(best_model, X_target, y_target, weights, X_target, y_target)
    preds = best_model.predict_proba(X_test_target)[:, 1].astype(np.float32)
    feature_importance = dict(zip(feature_cols, best_model.feature_importances_))
    top_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:30]
    gc.collect()
    return str(target), best_params, best_score, param_results, test_ids, preds, top_features

# Cross-validation and training
# First pass: tune and fit every model on every ID_TARGET, then collect the
# per-target parameters, scores, feature rankings and test predictions.
kf = GroupKFold(n_splits=5)
results = {}
predictions = {}
ensemble_weights = {'LightGBM': 0.7, 'XGBoost': 0.3}
param_log = {}
top_features_all = {}
param_summary = []
low_performing_targets = [139, 8, 131, 269, 157, 249, 54, 130, 136, 3, 129]

for model_name, model_info in models.items():
    print(f"\nTraining {model_name}...", flush=True)
    param_log[model_name] = {}
    top_features_all[model_name] = {}
    # One joblib task per target.
    results_parallel = Parallel(n_jobs=-1)(
        delayed(train_target)(target, X_train, y_train, X_test, feature_cols, model_info['model'], model_name, kf,
                             model_info['param_grid'], model_info['callbacks'])
        for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets")
    )
    accuracies = []
    model_summaries = []
    for target, params, mean_acc, param_results, test_ids, preds, top_features in results_parallel:
        param_log[model_name][target] = {
            'best_params': params,
            'mean_accuracy': mean_acc,
            'all_params': param_results
        }
        top_features_all[model_name][target] = top_features
        accuracies.append(mean_acc)
        predictions.setdefault(target, {})[model_name] = dict(zip(test_ids, preds))
        # Report the folds of the winning parameter set, not of whichever grid
        # entry happened to be evaluated last.
        best_entry = next((entry for entry in param_results if entry['params'] == params), None)
        summary = {
            'model': model_name,
            'ID_TARGET': target,
            'best_params': params,
            'mean_accuracy': mean_acc,
            'fold_accuracies': best_entry['fold_accuracies'] if best_entry is not None else []
        }
        param_summary.append(summary)
        model_summaries.append(summary)
    # Skipped targets report an accuracy of 0 and are left out of the average.
    scored = [acc for acc in accuracies if acc > 0]
    results[model_name] = np.mean(scored) if scored else float('nan')
    print(f"{model_name} Weighted Accuracy: {results[model_name]:.4f}", flush=True)
    print(f"\n{model_name} Parameter Summary:", flush=True)
    for summary in model_summaries:
        print(f"ID_TARGET: {summary['ID_TARGET']} | Best Params: {summary['best_params']} | Mean Accuracy: {summary['mean_accuracy']:.4f} | Fold Accuracies: {summary['fold_accuracies']}", flush=True)

# Second-pass training with top 30 features
# Refit each target's model on its highest-importance features (as ranked by the
# first pass) with 200 estimators and overwrite that model's stored test predictions.
print("\nPerforming second-pass training with top 30 features...", flush=True)
for model_name, model_info in models.items():
    print(f"\nSecond-pass {model_name}...", flush=True)
    model = model_info['model']
    for target in tqdm(X_train['ID_TARGET'].unique(), desc=f"{model_name} Targets"):
        target_str = str(target)
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        train_mask = X_train['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        weights = y_train.loc[train_mask, 'RET_TARGET'].abs()
        weight_scale = 1.5 if target in low_performing_targets else 1.0
        if len(X_target) < 500:
            weight_scale *= 1.5
        weights = weights * weight_scale
        X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
        test_ids = X_test_target.index
        # Work on a copy: assigning n_estimators on the logged dict would overwrite
        # the cross-validated value recorded in param_log.
        best_params = {**param_log[model_name][target_str]['best_params'], 'n_estimators': 200}
        model.set_params(**best_params)
        if model_name == 'LightGBM' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], eval_metric='logloss', callbacks=model_info['callbacks'])
        elif model_name == 'XGBoost' and model_info['callbacks']:
            model.fit(X_target, y_target, sample_weight=weights, eval_set=[(X_target, y_target)], verbose=False)
        else:
            model.fit(X_target, y_target, sample_weight=weights)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Third-pass training for low-performing targets
# For each low-performing target whose logged accuracy is below 0.75, try three
# small perturbations of its best parameters on the top-feature subset, keep
# whichever configuration cross-validates best, refit and overwrite predictions.


def _third_pass_fit(estimator, estimator_name, fit_callbacks, X_fit, y_fit, w_fit, X_eval, y_eval):
    """Fit ``estimator`` with the per-library keyword set used by the earlier passes."""
    if estimator_name == 'LightGBM' and fit_callbacks:
        estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], eval_metric='logloss', callbacks=fit_callbacks)
    elif estimator_name == 'XGBoost' and fit_callbacks:
        estimator.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=[(X_eval, y_eval)], verbose=False)
    else:
        estimator.fit(X_fit, y_fit, sample_weight=w_fit)


def _third_pass_cv(estimator, estimator_name, fit_callbacks, splitter, X, y, w, day_groups):
    """Return ``(mean, per_fold)`` weighted label accuracy of ``estimator`` under grouped CV.

    Each fold score is the share of validation weight whose predicted label
    equals the true label.
    """
    fold_scores = []
    for train_idx, val_idx in splitter.split(X, y, day_groups):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        w_tr, w_val = w.iloc[train_idx], w.iloc[val_idx]
        _third_pass_fit(estimator, estimator_name, fit_callbacks, X_tr, y_tr, w_tr, X_val, y_val)
        hits = np.asarray(estimator.predict(X_val)).ravel() == y_val.to_numpy()
        fold_weights = w_val.to_numpy(dtype=np.float64)
        total = fold_weights.sum()
        fold_scores.append(float(np.sum(fold_weights * hits) / total) if total > 0 else 0.0)
    return float(np.mean(fold_scores)), fold_scores


print("\nPerforming third-pass training for low-performing targets...", flush=True)
for model_name, model_info in models.items():
    print(f"\nThird-pass {model_name}...", flush=True)
    model = model_info['model']
    # param_log is keyed by str() of the ID_TARGET values taken from X_train
    # (logged as e.g. '139.0'), while low_performing_targets holds ints, so the
    # keys are matched numerically rather than by str(target).
    logged_keys = {float(key): key for key in param_log[model_name]}
    for target in tqdm(low_performing_targets, desc=f"{model_name} Low-Performing Targets"):
        target_str = logged_keys.get(float(target))
        if target_str is None or param_log[model_name][target_str]['mean_accuracy'] >= 0.75:
            continue
        top_features = [f[0] for f in top_features_all[model_name][target_str]]
        if not top_features:
            print(f"Warning: No features for ID_TARGET {target}. Skipping.")
            continue
        train_mask = X_train['ID_TARGET'] == target
        X_target = X_train.loc[train_mask, top_features]
        if len(X_target) < 100:
            print(f"Warning: Insufficient training data for ID_TARGET {target} ({len(X_target)} samples). Skipping.")
            continue
        y_target = y_train.loc[train_mask, 'SIGN_TARGET']
        weights = y_train.loc[train_mask, 'RET_TARGET'].abs() * 1.5
        if len(X_target) < 500:
            weights = weights * 1.5
        groups = X_train.loc[train_mask, 'ID_DAY']
        X_test_target = X_test.loc[X_test['ID_TARGET'] == target, top_features]
        test_ids = X_test_target.index

        # best_params maps each name to a scalar (one ParameterGrid combination).
        best_params = dict(param_log[model_name][target_str]['best_params'])
        base_lr = best_params['learning_rate']
        base_alpha = best_params['reg_alpha']
        base_lambda = best_params['reg_lambda']
        # Candidates are complete parameter sets: the estimator is shared across
        # targets, so every tuned parameter has to be set explicitly.
        fine_tune_grid = [
            {**best_params, 'learning_rate': base_lr * 0.8, 'n_estimators': 200, 'reg_alpha': base_alpha + 0.1, 'reg_lambda': base_lambda + 0.1},
            {**best_params, 'learning_rate': base_lr * 1.0, 'n_estimators': 250, 'reg_alpha': base_alpha, 'reg_lambda': base_lambda},
            {**best_params, 'learning_rate': base_lr * 1.2, 'n_estimators': 200, 'reg_alpha': base_alpha + 0.1, 'reg_lambda': base_lambda + 0.1}
        ]

        # Score the unchanged parameters on the same features, folds and metric so
        # the fine-tuned candidates are compared against a like-for-like baseline.
        model.set_params(**best_params)
        best_score, _ = _third_pass_cv(model, model_name, model_info['callbacks'], kf, X_target, y_target, weights, groups)
        best_fine_params = best_params
        for params in fine_tune_grid:
            print(f"Fine-tuning {model_name} ID_TARGET {target}: {params}", flush=True)
            model.set_params(**params)
            mean_acc, fold_accuracies = _third_pass_cv(model, model_name, model_info['callbacks'], kf, X_target, y_target, weights, groups)
            print(f"{model_name} | ID_TARGET: {target} | Fine-tune Params: {params} | Mean Accuracy: {mean_acc:.4f}", flush=True)
            if mean_acc > best_score:
                best_score = mean_acc
                best_fine_params = params
                param_log[model_name][target_str]['best_params'] = best_fine_params
                param_log[model_name][target_str]['mean_accuracy'] = best_score
                param_summary.append({
                    'model': model_name,
                    'ID_TARGET': target_str,
                    'best_params': best_fine_params,
                    'mean_accuracy': best_score,
                    'fold_accuracies': fold_accuracies
                })
        model.set_params(**best_fine_params)
        _third_pass_fit(model, model_name, model_info['callbacks'], X_target, y_target, weights, X_target, y_target)
        preds = model.predict_proba(X_test_target)[:, 1].astype(np.float32)
        predictions[target_str][model_name] = dict(zip(test_ids, preds))
        gc.collect()

# Save parameter log
# Persist the per-model, per-target tuning record as indented JSON and offer it
# as a browser download.
param_log_path = os.path.join(DATA_DIR, 'hyperparameters.json')
with open(param_log_path, 'w') as log_file:
    log_file.write(json.dumps(param_log, indent=4))
print(f"Saved hyperparameter log to: {param_log_path}", flush=True)
files.download(param_log_path)

# Ensemble predictions
# Blend the two models' probabilities per (ID_TARGET, ID_DAY) group and turn the
# blended value into a +1 / -1 label for every test ID in the group.
final_predictions = {}
missing_day_targets = []
for target in X_test['ID_TARGET'].unique():
    target_str = str(target)
    # One pass over X_test per target; the per-day split happens on this subset.
    target_days = X_test.loc[X_test['ID_TARGET'] == target, 'ID_DAY']
    # .get: a target that only occurs in the test set has no stored predictions
    # and falls back to the neutral 0.5 probability below.
    model_preds = predictions.get(target_str, {})
    lgbm_lookup = model_preds.get('LightGBM', {})
    xgb_lookup = model_preds.get('XGBoost', {})

    # Rows without a day cannot be grouped; give only those rows the default
    # label instead of overwriting the whole target.
    undated_ids = target_days[target_days.isna()].index
    if len(undated_ids) > 0:
        print(f"Warning: No test samples for ID_TARGET {target} on ID_DAY {np.nan}. Using default prediction.")
        missing_day_targets.append((target, np.nan))
        for idx in undated_ids:
            final_predictions[idx] = -1

    for _, day_values in target_days.groupby(target_days, sort=False):
        day_test_ids = day_values.index
        lgbm_preds = np.array([lgbm_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        xgb_preds = np.array([xgb_lookup.get(idx, 0.5) for idx in day_test_ids], dtype=np.float32)
        ensemble_preds = ensemble_weights['LightGBM'] * lgbm_preds + ensemble_weights['XGBoost'] * xgb_preds
        # The last EWM value of the group decides the label for every ID in it.
        smoothed_preds = pd.Series(ensemble_preds).ewm(span=3).mean().values[-1]
        label = 1 if smoothed_preds >= 0.5 else -1
        for idx in day_test_ids:
            final_predictions[idx] = label

if missing_day_targets:
    print(f"Missing day-target pairs: {missing_day_targets}")

# Create output CSV
# Turn the {ID: label} mapping into a two-column frame ordered by ID.
output_df = (
    pd.DataFrame.from_dict(final_predictions, orient='index', columns=['RET_TARGET'])
    .rename_axis('ID')
    .reset_index()
    .sort_values('ID')
)

# Any test ID that received no prediction is appended with the default label -1.
expected_ids = set(X_test.index)
submission_ids = set(output_df['ID'])
if submission_ids != expected_ids:
    missing_ids = expected_ids - submission_ids
    print(f"Warning: Submission missing {len(missing_ids)} IDs: {missing_ids}")
    filler_df = pd.DataFrame({'ID': list(missing_ids), 'RET_TARGET': -1})
    output_df = pd.concat([output_df, filler_df], ignore_index=True).sort_values('ID')

output_path = os.path.join(DATA_DIR, 'predictions_ensemble.csv')
output_df.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}", flush=True)

# Validate output
# Re-read the written file and report its shape, ID span, label set and NaN count.
test_csv = pd.read_csv(output_path)
validation_lines = (
    f"Output CSV shape: {test_csv.shape}",
    f"Output CSV ID range: {test_csv['ID'].min()} to {test_csv['ID'].max()}",
    f"Unique RET_TARGET values: {test_csv['RET_TARGET'].unique()}",
    f"Missing values: {test_csv.isna().sum().sum()}",
)
for line in validation_lines:
    print(line, flush=True)

# Download the file
files.download(output_path)

# Print final results
# One line per model with its first-pass mean score.
print("\nValidation Results:", flush=True)
for model_name in results:
    print(f"{model_name}: {results[model_name]:.4f}", flush=True)

XGBoost version: 3.0.2
Selected top return columns: ['RET_262', 'RET_88', 'RET_172', 'RET_259', 'RET_150', 'RET_118', 'RET_216', 'RET_268', 'RET_115', 'RET_261', 'RET_59', 'RET_238', 'RET_121', 'RET_97', 'RET_30']

Training LightGBM...




LightGBM Targets:   0%|          | 0/100 [00:00<?, ?it/s][A[A

LightGBM Targets:   4%|▍         | 4/100 [00:12<05:04,  3.17s/it][A[A

LightGBM Targets:   6%|▌         | 6/100 [00:19<05:03,  3.23s/it][A[A

LightGBM Targets:   8%|▊         | 8/100 [00:26<05:10,  3.38s/it][A[A

LightGBM Targets:  10%|█         | 10/100 [00:32<04:50,  3.22s/it][A[A

LightGBM Targets:  12%|█▏        | 12/100 [00:40<05:04,  3.47s/it][A[A

LightGBM Targets:  14%|█▍        | 14/100 [00:46<04:47,  3.34s/it][A[A

LightGBM Targets:  16%|█▌        | 16/100 [00:54<04:56,  3.53s/it][A[A

LightGBM Targets:  18%|█▊        | 18/100 [01:00<04:36,  3.37s/it][A[A

LightGBM Targets:  20%|██        | 20/100 [01:08<04:40,  3.51s/it][A[A

LightGBM Targets:  22%|██▏       | 22/100 [01:15<04:44,  3.64s/it][A[A

LightGBM Targets:  24%|██▍       | 24/100 [01:22<04:32,  3.59s/it][A[A

LightGBM Targets:  26%|██▌       | 26/100 [01:28<04:13,  3.43s/it][A[A

LightGBM Targets:  28%|██▊       | 28/100 [01:36

LightGBM Weighted Accuracy: 0.9045

LightGBM Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'bagging_fraction': 0.3, 'colsample_bytree': 0.5, 'learning_rate': 0.001, 'min_child_samples': 100, 'n_estimators': 20, 'num_leaves': 3, 'reg_alpha': 3.0, 'reg_lambda': 3.0} | Mean Accuracy: 1.0000 | Fold Accuracies: [np.float64(0.8808510638297873), np.float64(1.0), np.float64(0.9482071713147411), np.float64(1.0), np.float64(0.4738955823293173)]
ID_TARGET: 129.0 | Best Params: {'bagging_fraction': 0.3, 'colsample_bytree': 0.5, 'learning_rate': 0.001, 'min_child_samples': 100, 'n_estimators': 20, 'num_leaves': 3, 'reg_alpha': 3.0, 'reg_lambda': 3.0} | Mean Accuracy: 1.0000 | Fold Accuracies: [np.float64(0.673469387755102), np.float64(0.6178861788617886), np.float64(0.6477272727272727), np.float64(0.5991561181434599), np.float64(0.6509090909090909)]
ID_TARGET: 136.0 | Best Params: {'bagging_fraction': 0.3, 'colsample_bytree': 0.5, 'learning_rate': 0.001, 'min_child_samples': 100, 'n_estimator



XGBoost Targets:   0%|          | 0/100 [00:00<?, ?it/s][A[A

XGBoost Targets:   4%|▍         | 4/100 [00:18<07:12,  4.51s/it][A[A

XGBoost Targets:   6%|▌         | 6/100 [00:34<09:36,  6.13s/it][A[A

XGBoost Targets:   8%|▊         | 8/100 [00:51<10:35,  6.90s/it][A[A

XGBoost Targets:  10%|█         | 10/100 [01:06<10:42,  7.14s/it][A[A

XGBoost Targets:  12%|█▏        | 12/100 [01:26<11:55,  8.13s/it][A[A

XGBoost Targets:  14%|█▍        | 14/100 [01:46<12:22,  8.64s/it][A[A

XGBoost Targets:  16%|█▌        | 16/100 [02:08<13:08,  9.38s/it][A[A

XGBoost Targets:  18%|█▊        | 18/100 [02:26<12:38,  9.25s/it][A[A

XGBoost Targets:  20%|██        | 20/100 [02:45<12:35,  9.44s/it][A[A

XGBoost Targets:  22%|██▏       | 22/100 [03:04<12:19,  9.48s/it][A[A

XGBoost Targets:  24%|██▍       | 24/100 [03:20<11:20,  8.96s/it][A[A

XGBoost Targets:  26%|██▌       | 26/100 [03:39<11:17,  9.15s/it][A[A

XGBoost Targets:  28%|██▊       | 28/100 [04:00<11:23,  9.49s

XGBoost Weighted Accuracy: 0.7645

XGBoost Parameter Summary:
ID_TARGET: 139.0 | Best Params: {'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'reg_alpha': 0.3, 'reg_lambda': 1.0, 'subsample': 0.8} | Mean Accuracy: 0.7133 | Fold Accuracies: [np.float64(0.7574468085106383), np.float64(0.67578125), np.float64(0.7729083665338645), np.float64(0.6937984496124031), np.float64(0.6666666666666666)]
ID_TARGET: 129.0 | Best Params: {'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'reg_alpha': 0.3, 'reg_lambda': 1.0, 'subsample': 0.8} | Mean Accuracy: 0.6275 | Fold Accuracies: [np.float64(0.6040816326530613), np.float64(0.6869918699186992), np.float64(0.6060606060606061), np.float64(0.6075949367088608), np.float64(0.6327272727272727)]
ID_TARGET: 136.0 | Best Params: {'learning_rate': 0.07, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 150, 'reg_alpha': 0.3, 'reg_lambda': 1.0, 'subsample': 0.8} | Mean Accuracy: 1.0000 |











































































































































































































LightGBM Targets: 100%|██████████| 100/100 [00:45<00:00,  2.19it/s]


Second-pass XGBoost...





XGBoost Targets:   0%|          | 0/100 [00:00<?, ?it/s][A[A

XGBoost Targets:   1%|          | 1/100 [00:00<00:58,  1.69it/s][A[A

XGBoost Targets:   2%|▏         | 2/100 [00:01<00:58,  1.69it/s][A[A

XGBoost Targets:   3%|▎         | 3/100 [00:01<00:57,  1.69it/s][A[A

XGBoost Targets:   4%|▍         | 4/100 [00:02<00:56,  1.70it/s][A[A

XGBoost Targets:   5%|▌         | 5/100 [00:02<00:54,  1.75it/s][A[A

XGBoost Targets:   6%|▌         | 6/100 [00:03<00:54,  1.74it/s][A[A

XGBoost Targets:   7%|▋         | 7/100 [00:04<00:52,  1.78it/s][A[A

XGBoost Targets:   8%|▊         | 8/100 [00:04<00:54,  1.70it/s][A[A

XGBoost Targets:   9%|▉         | 9/100 [00:05<00:53,  1.70it/s][A[A

XGBoost Targets:  10%|█         | 10/100 [00:05<00:52,  1.70it/s][A[A

XGBoost Targets:  11%|█         | 11/100 [00:06<00:52,  1.71it/s][A[A

XGBoost Targets:  12%|█▏        | 12/100 [00:07<01:01,  1.42it/s][A[A

XGBoost Targets:  13%|█▎        | 13/100 [00:08<01:07,  1.28it/s]


Performing third-pass training for low-performing targets...

Third-pass LightGBM...





LightGBM Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 52488.45it/s]


Third-pass XGBoost...





XGBoost Low-Performing Targets: 100%|██████████| 11/11 [00:00<00:00, 100517.09it/s]

Saved hyperparameter log to: /content/hyperparameters.json





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved predictions to: /content/predictions_ensemble.csv
Output CSV shape: (114468, 2)
Output CSV ID range: 0 to 114467
Unique RET_TARGET values: [ 1 -1]
Missing values: 0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Validation Results:
LightGBM: 0.9045
XGBoost: 0.7645
