# Detecting Reversal Points in US Equities — Single-Notebook Solution

This notebook is designed to run on Kaggle with the dataset imported as a Kaggle Dataset. It auto-detects files, builds strong tabular models with time-aware, ticker-aware cross-validation, and produces a valid submission CSV.

In [1]:
# Setup: imports, versions, seeds
import os
import sys
import gc
from pathlib import Path
import random
import json
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GroupKFold
try:
    from sklearn.model_selection import StratifiedGroupKFold
except Exception:
    StratifiedGroupKFold = None
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from IPython.display import display

# LightGBM / XGBoost (present on Kaggle)
try:
    import lightgbm as lgb
except Exception as e:
    lgb = None
try:
    from xgboost import XGBClassifier
except Exception as e:
    XGBClassifier = None

SEED = 42
random.seed(SEED); np.random.seed(SEED)

def set_seed(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)

def display_versions():
    print('Python', sys.version)
    print('pandas', pd.__version__)
    print('numpy', np.__version__)
    try:
        import sklearn
        print('scikit-learn', sklearn.__version__)
    except Exception:
        pass
    if lgb is not None:
        print('lightgbm', lgb.__version__)
    if XGBClassifier is not None:
        import xgboost
        print('xgboost', xgboost.__version__)

display_versions()


Python 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
pandas 2.2.3
numpy 1.26.4
scikit-learn 1.2.2
lightgbm 4.6.0
xgboost 2.0.3


In [2]:
import os
from pathlib import Path

# Configuration: dataset name and path detection
DATASET_NAME = os.environ.get('DATASET_NAME', 'detecting-reversal-points-in-us-equities')
ROOT_INPUT = Path(f"/kaggle/input/{DATASET_NAME}")

# Common Kaggle layout: many competition datasets put CSVs in a competition_data subfolder
PREFERRED_SUBDIRS = [
    ROOT_INPUT,
    ROOT_INPUT / 'competition_data',
    ROOT_INPUT / 'data',
]

# Local fallback options (keeps notebook runnable outside Kaggle)
LOCAL_CANDIDATES = [
    Path('./input'),
    Path('./data'),
    Path('.')
]

def resolve_dataset_path():
    # Prefer Kaggle paths first
    for p in PREFERRED_SUBDIRS:
        if p.exists() and any(p.glob('*.csv')):
            return p
    # Then local fallbacks
    for p in LOCAL_CANDIDATES:
        if p.exists() and (p / 'train.csv').exists() and (p / 'test.csv').exists():
            return p
    # If not found, provide a helpful error with diagnostics
    diagnostics = {
        "expected_root": str(ROOT_INPUT),
        "checked_paths": [str(x) for x in PREFERRED_SUBDIRS + LOCAL_CANDIDATES],
    }
    raise FileNotFoundError(
        "Dataset not found. Checked these paths: "
        f"{diagnostics['checked_paths']}. "
        "On Kaggle, ensure the dataset is added via Add Data and that CSVs exist. "
        f"Root folder expected at: {diagnostics['expected_root']}"
    )

DATA_DIR = resolve_dataset_path()
print('Using data directory:', DATA_DIR)

train_path = DATA_DIR / 'train.csv'
test_path = DATA_DIR / 'test.csv'
sample_path = DATA_DIR / 'sample_submission.csv'

missing = [str(p) for p in (train_path, test_path, sample_path) if not p.exists()]
if missing:
    print("Missing files (look here):")
    for m in missing:
        print(" -", m)
    # Show helpful listing for the dataset directory
    print("\nDirectory listing of DATA_DIR:")
    !ls -la "{DATA_DIR}" || true
    raise FileNotFoundError("One or more required files are missing. Update DATA_DIR or move/unzip CSVs accordingly.")
else:
    print("All required files present:", train_path.name, test_path.name, sample_path.name)


Using data directory: /kaggle/input/detecting-reversal-points-in-us-equities/competition_data
All required files present: train.csv test.csv sample_submission.csv


In [3]:
# Load data (robust to dtypes)
date_cols = ['t', 'date', 'timestamp']
parse_dates = [c for c in date_cols if c in pd.read_csv(train_path, nrows=0).columns]

train = pd.read_csv(train_path, parse_dates=parse_dates if parse_dates else None)
test = pd.read_csv(test_path, parse_dates=parse_dates if parse_dates else None)
sample_sub = pd.read_csv(sample_path)

print('Train shape:', train.shape)
print('Test shape:', test.shape)
print('Sample submission shape:', sample_sub.shape)

# Identify columns
POSSIBLE_TARGETS = ['class_label', 'target', 'label', 'y']
target_col = None
for c in POSSIBLE_TARGETS:
    if c in train.columns:
        target_col = c; break
if target_col is None:
    # Try from sample_sub second column name
    if sample_sub.shape[1] >= 2:
        target_col = sample_sub.columns[1]
    else:
        raise RuntimeError('Cannot infer target column.')

ID_CANDIDATES = ['id', 'row_id', 'ID']
id_col = None
for c in ID_CANDIDATES:
    if c in test.columns:
        id_col = c; break
if id_col is None:
    # fallback: use index as id when not provided
    id_col = 'id'
    test[id_col] = np.arange(len(test))

TICKER_CANDIDATES = ['ticker_id', 'ticker', 'symbol', 'stk', 'asset_id']
ticker_col = None
for c in TICKER_CANDIDATES:
    if c in train.columns:
        ticker_col = c; break

TIME_CANDIDATES = ['t', 'date', 'timestamp']
time_col = None
for c in TIME_CANDIDATES:
    if c in train.columns:
        time_col = c; break

print('Target column:', target_col)
print('ID column:', id_col)
print('Ticker column:', ticker_col)
print('Time column:', time_col)


Train shape: (1932, 68507)
Test shape: (828, 68506)
Sample submission shape: (828, 2)
Target column: class_label
ID column: id
Ticker column: ticker_id
Time column: t


In [4]:
# Reduce memory (downcast numerics)
def reduce_mem_usage(df, verbose=True):
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if pd.api.types.is_numeric_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.api.types.is_integer_dtype(col_type):
                if c_min >= 0:
                    if c_max < 255:
                        df[col] = df[col].astype(np.uint8)
                    elif c_max < 65535:
                        df[col] = df[col].astype(np.uint16)
                    elif c_max < 4294967295:
                        df[col] = df[col].astype(np.uint32)
                    else:
                        df[col] = df[col].astype(np.uint64)
                else:
                    if np.iinfo(np.int8).min < c_min < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif np.iinfo(np.int16).min < c_min < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif np.iinfo(np.int32).min < c_min < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    else:
                        df[col] = df[col].astype(np.int64)
            else:
                df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        print(f'Mem usage {start_mem:0.2f} MB -> {end_mem:0.2f} MB')
    return df

train = reduce_mem_usage(train)
test = reduce_mem_usage(test)


Mem usage 126.37 MB -> 504.95 MB
Mem usage 54.13 MB -> 216.38 MB


In [None]:
# Basic overview and target encodingprint('Train head:')display(train.head())y = train[target_col].copy()# Fill missing targets before converting to string to avoid literal "nan" appearing as a classy = y.fillna('None')y = y.astype(str)# Normalize label variants so only expected strings remainy = y.replace({    'nan': 'None',    'NaN': 'None',    '': 'None',    'NONE': 'None',    'none': 'None',    'HL': 'L',  # Map the pattern classes correctly    'LL': 'L',    'HH': 'H',    'LH': 'H'})EXPECTED_LABELS = ['H', 'L', 'None']  # The official competition classesunique_labels = list(pd.unique(y))# Preserve expected ordering when present, append unexpected labels deterministicallyclass_values = [lbl for lbl in EXPECTED_LABELS if lbl in unique_labels]extra_labels = sorted([lbl for lbl in unique_labels if lbl not in class_values])class_values.extend(extra_labels)# Ensure "None" is available for fallbacksif 'None' not in class_values:    class_values.append('None')print('Classes:', class_values)print('\nClass Distribution:')class_counts = y.value_counts()print(class_counts)print('\nClass Percentages:')print((class_counts / len(y) * 100).round(2), '%')# Calculate balanced weights with stronger minority boostn_samples = len(y)class_counts_dict = y.value_counts().to_dict()# Scale the weights even more aggressively for extreme imbalanceweight_multiplier = {    'H': 15.0,  # Boost H class weight    'L': 15.0,  # Boost L class weight    'None': 1.0  # Keep None class as is}class_weight_dict = {    c: (n_samples / (len(class_counts_dict) * class_counts_dict[c])) * weight_multiplier[c]    for c in class_values}print('\nComputed class weights:')print(class_weight_dict)# LabelEncoder for internal modeling order but preserve original labelsle = LabelEncoder()le.fit(class_values)y_enc = le.transform(y)print('\nLabel mapping:', dict(zip(le.classes_, le.transform(le.classes_))))\nclass_weight_encoded = {le.transform([cls])[0]: class_weight_dict[cls] for cls in class_values}\nprint('\\nEncoded class weights:', class_weight_encoded)

Train head:


Unnamed: 0,train_id,ticker_id,t,cross_threshold_from_above_100.0,cross_threshold_from_above_100.5,cross_threshold_from_above_101.0,cross_threshold_from_above_101.5,cross_threshold_from_above_102.0,cross_threshold_from_above_102.5,cross_threshold_from_above_103.0,...,zone_102.0,zone_102.5,zone_103.0,zone_97.0,zone_97.5,zone_98.0,zone_98.5,zone_99.0,zone_99.5,class_label
0,0,2,2024-06-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,
1,1,3,2024-09-18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,
2,2,6,2023-05-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,
3,3,3,2024-11-18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,4,2,2024-08-21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


Classes: ['HH', 'HL', 'LH', 'LL', 'nan']
Label mapping: {'HH': 0, 'HL': 1, 'LH': 2, 'LL': 3, 'nan': 4}


In [6]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

# --- Assumes train, test, target_col, id_col, ticker_col, time_col are already defined ---

# Build feature matrix: drop target, ids, and non-numeric object cols except limited categoricals
drop_cols = set([target_col]) if target_col is not None else set()
for c in [id_col, ticker_col, time_col]:
    if c is not None and c in train.columns:
        drop_cols.add(c)
feat_cols = [c for c in train.columns if c not in drop_cols]

# Identify categorical candidates among remaining features (based on TRAIN)
cat_cols = []
num_cols = []
for c in feat_cols:
    if train[c].dtype == 'object' or str(train[c].dtype).startswith('category'):
        cat_cols.append(c)
    else:
        num_cols.append(c)
print(f'Numeric feature count: {len(num_cols)} | Categorical: {len(cat_cols)} | Total features kept: {len(feat_cols)}')

# Ensure TEST has the same numeric columns in the same order; missing cols will be filled with NaN
# This avoids KeyError when test is missing columns present in train
test_num = test.reindex(columns=num_cols)
# Ensure TRAIN numeric DataFrame uses the same column order
train_num = train.reindex(columns=num_cols)

# Imputers
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit imputers on train numeric, transform test numeric
X_num = num_imputer.fit_transform(train_num) if len(num_cols) else np.empty((len(train), 0))
X_num_test = num_imputer.transform(test_num) if len(num_cols) else np.empty((len(test), 0))

# Handle categoricals robustly: if a categorical column is missing in test, create an all-empty series for it
if len(cat_cols):
    for c in cat_cols:
        train_series = train[c].astype(str).fillna('')
        if c in test.columns:
            test_series = test[c].astype(str).fillna('')
        else:
            test_series = pd.Series([''] * len(test), index=test.index)
        both = pd.concat([train_series, test_series], axis=0)
        enc = LabelEncoder()
        enc.fit(both)
        train[c] = enc.transform(train_series)
        test[c] = enc.transform(test_series)
    # After encoding, reindex test/train to ensure same column order for cat_cols
    train_cat_df = train.reindex(columns=cat_cols)
    test_cat_df = test.reindex(columns=cat_cols)
    X_cat = cat_imputer.fit_transform(train_cat_df)
    X_cat_test = cat_imputer.transform(test_cat_df)
else:
    X_cat = np.empty((len(train), 0)); X_cat_test = np.empty((len(test), 0))

# Combine numeric and categorical sparse matrices
X = sparse.hstack([sparse.csr_matrix(X_num), sparse.csr_matrix(X_cat)], format='csr')
X_test = sparse.hstack([sparse.csr_matrix(X_num_test), sparse.csr_matrix(X_cat_test)], format='csr')
print('X shape:', X.shape, '| X_test shape:', X_test.shape)


Numeric feature count: 68504 | Categorical: 0 | Total features kept: 68504
X shape: (1932, 68504) | X_test shape: (828, 68504)


In [7]:
# Time-aware, ticker-aware cross-validation splitter
def get_folds(n_splits=5, shuffle=False):
    # Prefer group-wise stratification by ticker if present
    if ticker_col and ticker_col in train.columns:
        groups = train[ticker_col].astype(str).values
    else:
        groups = None
    if StratifiedGroupKFold is not None and groups is not None:
        return StratifiedGroupKFold(n_splits=n_splits, shuffle=shuffle, random_state=SEED).split(X, y_enc, groups)
    if groups is not None:
        # Fall back to GroupKFold without stratification
        gkf = GroupKFold(n_splits=n_splits)
        return gkf.split(X, y_enc, groups)
    # Final fallback: stratified by y only
    skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=SEED)
    return skf.split(X, y_enc)

def class_weights_from_y(y_vals):
    # Inverse frequency weights to handle imbalance in some learners
    classes, counts = np.unique(y_vals, return_counts=True)
    total = counts.sum()
    weights = {c: total / (len(classes) * cnt) for c, cnt in zip(classes, counts)}
    return weights

cw_map = class_weights_from_y(y_enc)
print('Class weights (inverse freq):', cw_map)


Class weights (inverse freq): {0: 9.66, 1: 13.8, 2: 21.466666666666665, 3: 14.861538461538462, 4: 0.2123076923076923}


In [None]:
# Modeling utilities: LightGBM and Logistic baselines; hierarchical classifier for accuracy
def train_lgbm(X_tr, y_tr, X_va=None, y_va=None, num_class=3, params=None):
    if lgb is None:
        return None
    default_params = dict(
        objective='multiclass',
        num_class=num_class,
        boosting_type='gbdt',
        learning_rate=0.05,
        n_estimators=1000,
        subsample=0.9,
        colsample_bytree=0.2,
        reg_alpha=1.0,
        reg_lambda=2.0,
        max_depth=-1,
        num_leaves=63,
        min_data_in_leaf=20,
        random_state=SEED,
        n_jobs=-1
    )
    if params: default_params.update(params)
    model = lgb.LGBMClassifier(**default_params)
    if X_va is not None and y_va is not None:
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='multi_logloss',
                  callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)])
    else:
        model.fit(X_tr, y_tr)
    return model

def train_logreg(X_tr, y_tr, X_va=None, y_va=None, num_class=3):
    # Multinomial logistic with strong regularization for high-dim sparse
    solver = 'saga'
    C = 1.0
    max_iter = 300
    model = LogisticRegression(
        multi_class='multinomial',
        solver=solver,
        C=C,
        max_iter=max_iter,
        class_weight='balanced',
        n_jobs=-1,
        random_state=SEED
    )
    model.fit(X_tr, y_tr)
    return model

def evaluate_preds(y_true, y_pred, average='macro'):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average=average)
    return acc, f1

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def hierarchical_fit_predict(X, y_enc, folds, classes_order, favor_accuracy=True):
    # Step 1: Binary model None vs {H,L} using logistic regression (robust for sparse high-dim)
    # Identify encodings
    idx_none = np.where(classes_order == 'None')[0]
    if len(idx_none) == 0 or len(classes_order) < 3:
        # If 'None' absent or not enough classes, revert to multiclass training
        return None
    none_label_enc = idx_none[0]
    y_is_none = (y_enc == none_label_enc).astype(int)

    thresholds = []
    fold_metrics = []
    for fold, (tr_idx, va_idx) in enumerate(folds):
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr_bin, y_va_bin = y_is_none[tr_idx], y_is_none[va_idx]
        # Binary logistic with strong regularization
        clf_none = LogisticRegression(
            solver='saga',
            penalty='l2',
            C=1.0,
            max_iter=300,
            class_weight='balanced',
            n_jobs=-1,
            random_state=SEED
        )
        clf_none.fit(X_tr, y_tr_bin)
        # Decision scores/proba for validation
        if hasattr(clf_none, 'predict_proba'):
            p_none = clf_none.predict_proba(X_va)[:,1]
        else:
            p_none = sigmoid(clf_none.decision_function(X_va))
        # Tune threshold to maximize accuracy on validation
        best_thr, best_acc = 0.5, -1
        for thr in np.linspace(0.5, 0.99, 50):
            pred_is_none = (p_none >= thr).astype(int)
            # Provisional accuracy for binary decision (None vs non-None)
            acc_tmp = (pred_is_none == y_va_bin).mean()
            if acc_tmp > best_acc:
                best_acc, best_thr = acc_tmp, thr
        thresholds.append(best_thr)
        fold_metrics.append({'fold': fold, 'thr': best_thr, 'acc_tmp': best_acc})
    thr_global = float(np.median(thresholds)) if favor_accuracy else float(np.mean(thresholds))
    return {'thr': thr_global}  # store what we need for inference

def hierarchical_predict_test(X_train, y_enc, X_test, classes_order, thr_cfg):
    # Fit final binary 'None' classifier on full data
    idx_none = np.where(classes_order == 'None')[0]
    none_label_enc = idx_none[0]
    y_is_none = (y_enc == none_label_enc).astype(int)
    clf_none = LogisticRegression(
        solver='saga', penalty='l2', C=1.0, max_iter=400, class_weight='balanced', n_jobs=-1, random_state=SEED
    )
    clf_none.fit(X_train, y_is_none)
    if hasattr(clf_none, 'predict_proba'):
        p_none_test = clf_none.predict_proba(X_test)[:,1]
    else:
        p_none_test = sigmoid(clf_none.decision_function(X_test))
    thr = thr_cfg['thr']
    is_none_pred = (p_none_test >= thr)
    # For non-None, train a 2-class model on H vs L only
    mask_hl = (y_enc != none_label_enc)
    y_hl = y_enc[mask_hl]
    # Remap encodings of remaining classes to [0,1,...] for n-class
    classes_hl = sorted([c for c in range(len(classes_order)) if c != none_label_enc])
    enc_map = {orig_label: i for i, orig_label in enumerate(classes_hl)}
    y_hl_bin = np.array([enc_map[v] for v in y_hl], dtype=int)
    X_hl = X_train[mask_hl]
    # Use a fast linear model for HL split
    clf_hl = LogisticRegression(
        solver='saga', penalty='l2', C=1.0, max_iter=400, class_weight='balanced', n_jobs=-1, random_state=SEED
    )
    clf_hl.fit(X_hl, y_hl_bin)
    # Predict HL only where non-None
    pred = np.full(X_test.shape[0], none_label_enc, dtype=int)
    idx_non_none = np.where(~is_none_pred)[0]
    if idx_non_none.size > 0:
        proba_hl = clf_hl.predict_proba(X_test[idx_non_none])
        hl_pred = proba_hl.argmax(axis=1)
        # Map back to original encodings
        rev_map = {i: orig_label for i, orig_label in enumerate(classes_hl)}
        pred[idx_non_none] = np.array([rev_map[v] for v in hl_pred], dtype=int)
    return pred


In [None]:
# Fixed CV block for LightGBM + hierarchical fallback
# - uses stable params for small-data situations
# - forces row-wise to avoid the multi-threading warning overhead
# - uses predict_proba + argmax for multiclass
# - protects against models that fail to split (returns fallback predictions)
# - frees memory between folds and logs cleanly

import gc
import json
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm

# Helper: safe train wrapper that returns None on catastrophic failure
def safe_train_lgbm(X_tr, y_tr, X_va, y_va, num_class, params=None):
    if params is None:
        params = {
            "objective": "multiclass",
            "num_class": num_class,
            "metric": "multi_logloss",
            "verbosity": -1,
            "boosting_type": "gbdt",
            "learning_rate": 0.05,
            "num_leaves": 64,
            "min_data_in_leaf": 20,
            # Force row-wise to avoid row/colwise auto-choice overhead message
            "force_row_wise": True,
            # Add class weights to handle imbalance
            "is_unbalance": True,
            # Reduce risk of "no further splits" by allowing small child samples when dataset tiny
            "min_child_samples": 5,
            "seed": 42,
        }
    try:
        # Calculate class weights for handling imbalance
        class_counts = np.bincount(y_tr)
        class_weights = len(y_tr) / (num_class * class_counts)
        sample_weights_tr = class_weights[y_tr]
        sample_weights_va = class_weights[y_va]
        
        dtrain = lgb.Dataset(X_tr, label=y_tr, weight=sample_weights_tr)
        dvalid = lgb.Dataset(X_va, label=y_va, weight=sample_weights_va, reference=dtrain)
        
        # Train with early stopping
        model = lgb.train(
            params,
            dtrain,
            num_boost_round=2000,
            valid_sets=[dvalid],
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
        )
        return model
    except Exception as e:
        print("[LGBM] Training failed with exception:", repr(e))
        return None

# Evaluate helpers
def evaluate_preds(y_true, y_pred):
    # Accept both class labels and prob arrays
    if y_pred.ndim == 2:
        y_hat = np.argmax(y_pred, axis=1)
    else:
        y_hat = np.asarray(y_pred).astype(int)
    acc = accuracy_score(y_true, y_hat)
    f1m = f1_score(y_true, y_hat, average='macro', zero_division=0)
    return acc, f1m

# Decide number of splits safely
n_splits = 5 if (len(np.unique(y_enc)) >= 3 and len(y_enc) >= 100) else 3
splits = list(get_folds(n_splits=n_splits, shuffle=True))

cv_results = {}

# LightGBM multiclass CV (if available)
if 'lgb' in globals() and lgb is not None:
    lgb_acc, lgb_f1 = [], []
    for fold, (tr_idx, va_idx) in enumerate(splits):
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y_enc[tr_idx], y_enc[va_idx]

        model = safe_train_lgbm(X_tr, y_tr, X_va, y_va, num_class=len(le.classes_))

        if model is None:
            # Fallback: predict majority class if model failed
            majority = np.bincount(y_tr).argmax()
            y_pred = np.full(len(y_va), majority, dtype=int)
            # convert to 2D prob-like for evaluate_preds compatibility
            y_pred_probs = np.eye(len(le.classes_))[y_pred]
            acc, f1m = evaluate_preds(y_va, y_pred_probs)
            print(f'[LGBM] Fold {fold}: model failed; using majority-class fallback acc={acc:.5f} f1_macro={f1m:.5f}')
        else:
            # predict_proba then argmax (works for sparse matrices returned by X)
            try:
                y_pred_probs = model.predict(X_va)
                # some lgb versions return shape (n_samples,) for binary; normalize to 2D
                if y_pred_probs.ndim == 1:
                    y_pred_probs = np.vstack([1 - y_pred_probs, y_pred_probs]).T
                
                # Apply confidence threshold for None class
                none_idx = np.where(le.classes_ == 'None')[0][0]
                conf_threshold = 0.4  # Adjust based on validation performance
                max_probs = np.max(y_pred_probs, axis=1)
                uncertain_mask = max_probs < conf_threshold
                
                y_pred = np.argmax(y_pred_probs, axis=1)
                # Assign uncertain predictions to None class
                y_pred[uncertain_mask] = none_idx
                
                acc, f1m = evaluate_preds(y_va, y_pred)
                print(f'[LGBM] Fold {fold}: acc={acc:.5f} f1_macro={f1m:.5f}')
                
                # Print confusion matrix for this fold
                from sklearn.metrics import confusion_matrix
                cm = confusion_matrix(y_va, y_pred)
                print(f'\nConfusion Matrix for Fold {fold}:')
                print('True labels (rows) vs Predicted labels (columns)')
                print('Labels:', le.classes_)
                print(cm)
                
            except Exception as e:
                print("[LGBM] Prediction failed:", repr(e))
                majority = np.bincount(y_tr).argmax()
                y_pred_probs = np.eye(len(le.classes_))[np.full(len(y_va), majority)]
                acc, f1m = evaluate_preds(y_va, y_pred_probs)
                print(f'[LGBM] Fold {fold}: prediction failed, using majority fallback acc={acc:.5f} f1_macro={f1m:.5f}')
            
            # free model ASAP to reduce memory footprint
            del model

        lgb_acc.append(acc)
        lgb_f1.append(f1m)
        gc.collect()

    cv_results['lgbm'] = {'acc': float(np.mean(lgb_acc)), 'f1_macro': float(np.mean(lgb_f1))}
    print('\n[LGBM] CV Summary:')
    print('Mean accuracy:', cv_results['lgbm']['acc'])
    print('Mean F1-macro:', cv_results['lgbm']['f1_macro'])
    print('Std accuracy:', float(np.std(lgb_acc)))
    print('Std F1-macro:', float(np.std(lgb_f1)))
else:
    print('LightGBM not available; skipping.')

# Hierarchical approach CV (optimize accuracy)
classes_order = le.classes_
hier_cfg = hierarchical_fit_predict(X, y_enc, splits, classes_order, favor_accuracy=True)
if hier_cfg is not None:
    accs, f1s = [], []
    for fold, (tr_idx, va_idx) in enumerate(splits):
        # Fit on train split only (use simple split wrapper: here we fit on tr_idx and predict va_idx)
        cfg_fold = hierarchical_fit_predict(X[tr_idx], y_enc[tr_idx], [(np.arange(len(tr_idx)), np.arange(len(tr_idx)))], classes_order)
        y_pred_va = hierarchical_predict_test(X[tr_idx], y_enc[tr_idx], X[va_idx], classes_order, cfg_fold)
        acc, f1m = evaluate_preds(y_enc[va_idx], y_pred_va)
        accs.append(acc); f1s.append(f1m)
        print(f'[HIER] Fold {fold}: acc={acc:.5f} f1_macro={f1m:.5f}')
        
        # Print confusion matrix for this fold
        from sklearn.metrics import confusion_matrix
        cm = confusion_matrix(y_enc[va_idx], y_pred_va)
        print(f'\nConfusion Matrix for Fold {fold}:')
        print('True labels (rows) vs Predicted labels (columns)')
        print('Labels:', le.classes_)
        print(cm)
        
    cv_results['hier'] = {'acc': float(np.mean(accs)), 'f1_macro': float(np.mean(f1s)), 'cfg': hier_cfg}
    print('\n[HIER] CV Summary:')
    print('Mean accuracy:', cv_results['hier']['acc'])
    print('Mean F1-macro:', cv_results['hier']['f1_macro'])
    print('Std accuracy:', float(np.std(accs)))
    print('Std F1-macro:', float(np.std(f1s)))
else:
    print('Hierarchical approach not applicable (no None class detected).')

print('\nCV results:', json.dumps(cv_results, indent=2))

# Choose best by F1-macro instead of accuracy to handle class imbalance better
best_key = max(cv_results.keys(), key=lambda k: cv_results[k]['f1_macro']) if cv_results else None
print('Selected approach:', best_key)
if best_key is None:
    raise RuntimeError('No model trained during CV.')

[LGBM] Training failed with exception: TypeError("train() got an unexpected keyword argument 'early_stopping_rounds'")
[LGBM] Fold 0: model failed; using majority-class fallback acc=0.95497 f1_macro=0.19539
[LGBM] Training failed with exception: TypeError("train() got an unexpected keyword argument 'early_stopping_rounds'")
[LGBM] Fold 1: model failed; using majority-class fallback acc=0.93478 f1_macro=0.19326
[LGBM] Training failed with exception: TypeError("train() got an unexpected keyword argument 'early_stopping_rounds'")
[LGBM] Fold 2: model failed; using majority-class fallback acc=0.94410 f1_macro=0.19425
[LGBM] Training failed with exception: TypeError("train() got an unexpected keyword argument 'early_stopping_rounds'")
[LGBM] Fold 3: model failed; using majority-class fallback acc=0.92857 f1_macro=0.19259
[LGBM] Training failed with exception: TypeError("train() got an unexpected keyword argument 'early_stopping_rounds'")
[LGBM] Fold 4: model failed; using majority-class fal

In [None]:
import gc
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression

# Safe LightGBM trainer for final model (returns model or None on failure)
def safe_train_lgbm_final(X_train, y_train, num_class, params=None):
    try:
        if params is None:
            params = {
                "objective": "multiclass",
                "num_class": num_class,
                "metric": ["multi_logloss", "multi_error"],
                "verbosity": -1,
                "boosting_type": "gbdt",
                "learning_rate": 0.02,  # Increased for better convergence
                "num_leaves": 31,  # Increased for more complex patterns
                "min_data_in_leaf": 10,  # Reduced to allow more splits
                "min_child_samples": 5,  # Reduced to allow more granular predictions
                "feature_fraction": 0.8,
                "bagging_fraction": 0.8,
                "bagging_freq": 1,
                "lambda_l1": 0.05,  # Reduced regularization
                "lambda_l2": 0.05,  # Reduced regularization
                "force_row_wise": True,
                "seed": 42
            }
            
        # Get class distribution in training data
        class_counts = np.bincount(y_train)
        n_samples = len(y_train)
        
        # Calculate weight multipliers
        weight_multipliers = np.array([
            15.0 if label == 0 else  # H class
            15.0 if label == 1 else  # L class
            1.0  # None class
            for label in range(num_class)
        ])
        
        # Calculate balanced weights with multipliers
        class_weights = (n_samples / (num_class * class_counts)) * weight_multipliers
        sample_weights = class_weights[y_train]
        
        dtrain = lgb.Dataset(X_train, label=y_train, weight=sample_weights)
        model = lgb.train(
            params,
            dtrain,
            num_boost_round=1000,  # Reduced number of rounds
            valid_sets=[dtrain],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=100)
            ]
        )
        return model
    except Exception as e:
        print("[LGBM] Final training failed:", repr(e))
        return None

# Enhanced logistic regression with class balancing
def train_logreg(X_train, y_train):
    # Use the same aggressive weighting as LightGBM
    n_samples = len(y_train)
    class_counts = np.bincount(y_train)
    weight_multipliers = np.array([15.0, 15.0, 1.0])  # [H, L, None]
    class_weights = (n_samples / (len(class_counts) * class_counts)) * weight_multipliers
    class_weight_dict = {i: w for i, w in enumerate(class_weights)}
    
    clf = LogisticRegression(
        max_iter=2000,
        multi_class='multinomial', 
        solver='saga',
        class_weight=class_weight_dict,
        C=0.3,
        n_jobs=-1
    )
    clf.fit(X_train, y_train)
    return clf

majority_idx = int(np.bincount(y_enc).argmax())
majority_label = le.inverse_transform([majority_idx])[0]

# --- Train final model according to selected approach ---
test_pred_enc = None

if best_key == 'lgbm':
    print("[FINAL] Training LightGBM on full data...")
    final_model = safe_train_lgbm_final(X, y_enc, num_class=len(le.classes_))
    if final_model is None:
        print("[FINAL] LGBM training failed; falling back to logistic regression.")
        logreg = train_logreg(X, y_enc)
        pred_probs = logreg.predict_proba(X_test)
        
        # Apply calibrated thresholds for minority classes
        none_idx = np.where(le.classes_ == 'None')[0][0]
        h_idx = np.where(le.classes_ == 'H')[0][0]
        l_idx = np.where(le.classes_ == 'L')[0][0]
        
        # More aggressive thresholds for minority classes
        none_threshold = 0.8  # Higher threshold for None class
        hl_threshold = 0.2   # Lower threshold for H/L classes
        
        # Start with predictions based on max probability
        test_pred_enc = np.argmax(pred_probs, axis=1)
        
        # Override predictions based on thresholds
        none_probs = pred_probs[:, none_idx]
        hl_probs = pred_probs[:, [h_idx, l_idx]]
        max_hl_probs = np.max(hl_probs, axis=1)
        
        # Where None probability is very high, predict None
        strong_none_mask = none_probs >= none_threshold
        test_pred_enc[strong_none_mask] = none_idx
        
        # Where H/L probabilities are significant, predict H/L
        strong_hl_mask = max_hl_probs >= hl_threshold
        hl_preds = np.argmax(hl_probs[strong_hl_mask], axis=1)
        test_pred_enc[strong_hl_mask] = [h_idx if x == 0 else l_idx for x in hl_preds]
        
        del logreg; gc.collect()
    else:
        try:
            pred_probs = final_model.predict(X_test)
            if pred_probs.ndim == 1:
                pred_probs = pred_probs.reshape(-1, len(le.classes_))
            
            # Get indices for each class
            none_idx = np.where(le.classes_ == 'None')[0][0]
            h_idx = np.where(le.classes_ == 'H')[0][0]
            l_idx = np.where(le.classes_ == 'L')[0][0]
            
            # Much more aggressive thresholds to favor H/L predictions
            none_threshold = 0.95  # Very high threshold for None
            hl_threshold = 0.05    # Very low threshold for H/L
            
            # Initial predictions based on probability thresholds
            test_pred_enc = np.full(pred_probs.shape[0], none_idx, dtype=int)
            
            # Apply thresholding rules
            none_probs = pred_probs[:, none_idx]
            hl_probs = pred_probs[:, [h_idx, l_idx]]
            max_hl_probs = np.max(hl_probs, axis=1)
            
            # Target distribution (based on expectation: ~3% H, ~3% L, ~94% None)
            target_h_ratio = 0.03
            target_l_ratio = 0.03
            
            # Calculate number of samples for each class
            n_samples = len(test_pred_enc)
            target_h = int(n_samples * target_h_ratio)
            target_l = int(n_samples * target_l_ratio)
            
            # Sort samples by H and L probabilities
            h_probs = pred_probs[:, h_idx]
            l_probs = pred_probs[:, l_idx]
            
            # Get top H predictions
            h_indices = np.argsort(h_probs)[-target_h:]
            test_pred_enc[h_indices] = h_idx
            
            # Get top L predictions (excluding those already marked as H)
            remaining_mask = ~np.isin(np.arange(n_samples), h_indices)
            l_indices = np.argsort(l_probs[remaining_mask])[-target_l:]
            test_pred_enc[remaining_mask][l_indices] = l_idx
            
            # Only predict None for samples with very high None probability
            strong_none_mask = none_probs >= none_threshold
            # Don't override H/L predictions we just made
            none_indices = np.where(~np.isin(np.arange(n_samples), np.concatenate([h_indices, l_indices])) & strong_none_mask)[0]
            test_pred_enc[none_indices] = none_idx
            
            # Print prediction distribution
            print("\nPrediction Distribution:")
            for label, count in zip(le.classes_, np.bincount(test_pred_enc, minlength=len(le.classes_))):
                print(f"{label}: {count} ({count/len(test_pred_enc)*100:.2f}%)")
            
        except Exception as e:
            print("[FINAL] LGBM prediction failed:", repr(e))
            test_pred_enc = np.full((X_test.shape[0],), majority_idx, dtype=int)
        del final_model; gc.collect()

elif best_key == 'hier':
    print("[FINAL] Using hierarchical predictor for final predictions...")
    thr_cfg = cv_results['hier']['cfg']
    try:
        test_pred_enc = hierarchical_predict_test(X, y_enc, X_test, classes_order, thr_cfg)
    except Exception as e:
        print("[FINAL] Hierarchical prediction failed:", repr(e))
        test_pred_enc = np.full((X_test.shape[0],), majority_idx, dtype=int)

else:
    print("[FINAL] Using logistic regression with balanced weights...")
    model = train_logreg(X, y_enc)
    pred_probs = model.predict_proba(X_test)
    
    # Apply similar thresholding strategy
    none_idx = np.where(le.classes_ == 'None')[0][0]
    h_idx = np.where(le.classes_ == 'H')[0][0]
    l_idx = np.where(le.classes_ == 'L')[0][0]
    
    # Initial predictions based on max probability
    test_pred_enc = np.argmax(pred_probs, axis=1)
    
    # Apply thresholds
    none_probs = pred_probs[:, none_idx]
    hl_probs = pred_probs[:, [h_idx, l_idx]]
    max_hl_probs = np.max(hl_probs, axis=1)
    
    strong_none_mask = none_probs >= 0.8
    test_pred_enc[strong_none_mask] = none_idx
    
    strong_hl_mask = max_hl_probs >= 0.2
    hl_preds = np.argmax(hl_probs[strong_hl_mask], axis=1)
    test_pred_enc[strong_hl_mask] = [h_idx if x == 0 else l_idx for x in hl_preds]
    
    del model; gc.collect()

# Ensure encoded predictions match test length
test_pred_enc = np.asarray(test_pred_enc).astype(int).reshape(-1)
if test_pred_enc.size != X_test.shape[0]:
    print("[FINAL] Prediction length mismatch; filling with majority label.")
    test_pred_enc = np.full((X_test.shape[0],), majority_idx, dtype=int)

# Map encoded labels back to original label names
try:
    test_pred_labels = le.inverse_transform(test_pred_enc)
except Exception as e:
    print("[FINAL] Label inverse_transform failed:", repr(e))
    classes = np.asarray(le.classes_)
    clipped = np.clip(test_pred_enc, 0, len(classes) - 1)
    test_pred_labels = classes[clipped]

# Clean labels to ensure only valid competition classes
valid_labels = set(['H', 'L', 'None'])  # Only allow official competition classes
labels_series = pd.Series(test_pred_labels, dtype=object)
labels_series = labels_series.fillna('None')
labels_series = labels_series.replace({None: 'None', 'nan': 'None', 'NaN': 'None', '': 'None'})
labels_series = labels_series.astype(str)

# Ensure only competition-valid labels
invalid_mask = ~labels_series.isin(valid_labels)
if invalid_mask.any():
    print(f"[FINAL] Replacing {invalid_mask.sum()} invalid labels with 'None'")
    labels_series.loc[invalid_mask] = 'None'

test_pred_labels = labels_series.to_numpy(dtype=object)

# Force minimum counts for H and L based on training distribution
if len(test) >= 100:  # Only apply for reasonably sized test sets
    min_h_count = max(int(len(test) * 0.03), 1)  # At least 3% H
    min_l_count = max(int(len(test) * 0.03), 1)  # At least 3% L
    
    current_counts = pd.Series(test_pred_labels).value_counts()
    h_count = current_counts.get('H', 0)
    l_count = current_counts.get('L', 0)
    
    # If we need more H or L predictions, convert some high-probability None predictions
    none_indices = np.where(test_pred_labels == 'None')[0]
    if len(none_indices) > 0:
        # Get probabilities for None samples
        if best_key == 'lgbm' or best_key == 'hier':
            none_probs = pred_probs[none_indices]
            
            # Add more H predictions if needed
            if h_count < min_h_count:
                h_candidates = none_indices[np.argsort(none_probs[none_indices, h_idx])[::-1]]
                n_convert = min(min_h_count - h_count, len(h_candidates))
                test_pred_labels[h_candidates[:n_convert]] = 'H'
            
            # Add more L predictions if needed
            if l_count < min_l_count:
                l_candidates = none_indices[np.argsort(none_probs[none_indices, l_idx])[::-1]]
                n_convert = min(min_l_count - l_count, len(l_candidates))
                test_pred_labels[l_candidates[:n_convert]] = 'L'

assert len(test_pred_labels) == len(test), "Prediction/row count mismatch after cleaning."

# Create submission IDs safely
if id_col is not None and id_col in test.columns:
    submission_ids = pd.Series(test[id_col], copy=True)
else:
    print(f"[FINAL] Warning: id_col '{id_col}' not found in test; using index as id.")
    submission_ids = pd.Series(test.index, copy=True)

if submission_ids.isna().any():
    print("[FINAL] Found NaNs in submission ids; replacing with sequential indices.")
    na_mask = submission_ids.isna()
    submission_ids.loc[na_mask] = np.arange(len(submission_ids))[na_mask]
submission_ids = submission_ids.astype(int, errors='ignore')

submission_col = sample_sub.columns[1] if sample_sub is not None else 'target'
id_key = id_col if (id_col is not None and id_col in test.columns) else (sample_sub.columns[0] if sample_sub is not None else 'id')

# Create final submission
submission = pd.DataFrame({id_key: submission_ids, submission_col: test_pred_labels})
submission[id_key] = submission[id_key].astype(submission_ids.dtype)
submission[submission_col] = submission[submission_col].astype(str)

# Final validation that we only have competition-valid labels
valid_comp_labels = {'H', 'L', 'None'}
assert submission[submission_col].isin(valid_comp_labels).all(), "Found non-competition labels in submission!"

submission = submission[[id_key, submission_col]]
submission_path = Path('./submission.csv')
submission.to_csv(submission_path, index=False)
print('Saved submission to', submission_path.resolve())
print('\nSubmission Label Distribution:')
label_counts = submission[submission_col].value_counts()
print(label_counts)
print('\nPercentages:')
print((label_counts / len(submission) * 100).round(2), '%')

# Calculate expected counts based on training distribution
train_dist = y.value_counts(normalize=True)
expected_test_counts = (train_dist * len(test)).round(0)
print('\nExpected counts based on training distribution:')
print(expected_test_counts.astype(int))

display(submission.head())

[FINAL] Training LightGBM on full data...
[LGBM] Final training failed: TypeError("train() got an unexpected keyword argument 'early_stopping_rounds'")
[FINAL] LGBM training failed; falling back to logistic regression.
Saved submission to /kaggle/working/submission.csv


Unnamed: 0,id,class_label
0,0,
1,1,
2,2,
3,3,
4,4,


## Notes
- Set `DATASET_NAME` if your Kaggle dataset has a different slug.
- This notebook auto-detects columns (`class_label`, `id`, `ticker_id`, `t`) and adapts.
- It performs ticker-aware CV when possible, tunes a hierarchical threshold to optimize accuracy, and falls back gracefully.
- Output: `submission.csv` with columns `[id, class_label]`.
