# Alpha Radar CatBoost Solution v3

Robust feature engineering and CatBoost ensemble tailored for the Alpha Radar Solana Sprint competition. This notebook focuses on maximising recall and F1 while keeping precision steady through extensive aggregation, directional statistics, and cross-validated threshold optimisation.


In [None]:
import gc
import math
import warnings
from pathlib import Path
from typing import Dict, Iterable, List, Sequence, Tuple

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from IPython.display import display
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    jaccard_score,
    precision_recall_fscore_support,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold

# Suppress UserWarning-category warnings for the remainder of the session.
warnings.filterwarnings("ignore", category=UserWarning)
# Show every column and print floats with four decimals when frames are displayed.
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda v: f"{v:0.4f}")
# Seed NumPy's global random generator.
np.random.seed(2025)


In [None]:
# --- Paths -----------------------------------------------------------------
# Everything is resolved relative to the current working directory.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "Dataset" / "alpha-radar-solana-sprint"
TARGET_PATH = BASE_DIR / "Dataset" / "target_tokens.csv"
TRAIN_FILENAME = "Sample_Dataset.csv"
# Glob pattern (relative to DATA_DIR) matching the evaluation chunk files.
EVALUATION_PATTERN = "evaluation_set_30s_chunk_*.csv"
OUTPUT_DIR = BASE_DIR / "outputs"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Experiment settings ---------------------------------------------------
# Row count the evaluation feature matrix is required to have before inference.
EXPECTED_EVAL_ROWS = 64208
# One stratified K-fold pass (and one final model) is run per seed.
SEEDS: Tuple[int, ...] = (42, 2025, 31415)
N_FOLDS = 7

# --- Feature-engineering settings -------------------------------------------
# Statistics applied to every numeric column in aggregate_numeric().
AGG_NUMERIC_FUNCS = ["mean", "std", "min", "max", "sum", "median", "last"]
# Per-token quantiles computed in compute_quantiles().
QUANTILE_LEVELS = (0.10, 0.25, 0.50, 0.75, 0.90)
# "First n events" windows used by derive_first_n_features().
EARLY_EVENT_WINDOWS: Tuple[int, ...] = (3, 5, 10, 20)
# Upper bounds on relative_time used by compute_time_window_features().
EARLY_TIME_WINDOWS: Tuple[int, ...] = (5, 10, 15, 20, 25, 30)
# 40 evenly spaced candidate decision thresholds in [0.05, 0.995], rounded to 3 decimals.
THRESHOLD_GRID = np.round(np.linspace(0.05, 0.995, 40), 3)

# Replacement for NaN / +-inf in the final feature matrix.
NUMERIC_FILL_VALUE = 0.0
# Columns load_event_data() leaves untouched; every other column is coerced to float32.
NON_NUMERIC_COLUMNS = {"timestamp", "mint_token_id", "holder", "trade_mode", "creator", "event_type"}
# Number of most frequent values kept per categorical column; the rest become '__OTHER__'.
CATEGORICAL_TOP_K = 15

# --- Fail fast when the expected inputs are missing --------------------------
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Expected data directory at {DATA_DIR}")
train_path = DATA_DIR / TRAIN_FILENAME
if not train_path.exists():
    raise FileNotFoundError(f"Training CSV missing at {train_path}")
if not TARGET_PATH.exists():
    raise FileNotFoundError(f"Target token file missing at {TARGET_PATH}")

print(f"Using data directory: {DATA_DIR}")
print(f"Training CSV: {train_path}")
print(f"Target token path: {TARGET_PATH}")
print(f"Outputs will be written to: {OUTPUT_DIR}")


In [None]:
def parse_timestamp_series(series: pd.Series) -> pd.Series:
    """Convert ``"<minutes>:<seconds>"`` strings into seconds.

    Each value is cast to ``str`` and split on the first ``':'``. The part
    before the colon is read as minutes and the part after it as seconds.

    Args:
        series: Timestamp values; anything castable to ``str``.

    Returns:
        A float32 Series (same index) holding ``minutes * 60 + seconds``.
        Values that lack a colon, or whose parts are not numeric, become NaN.
    """
    # When no value contains ':' (or the series is empty) the expanded split has
    # fewer than two columns; reindexing guarantees columns 0 and 1 always exist
    # so such input yields NaN instead of raising KeyError.
    parts = series.astype(str).str.split(':', n=1, expand=True).reindex(columns=[0, 1])
    minutes = pd.to_numeric(parts[0], errors='coerce')
    seconds = pd.to_numeric(parts[1], errors='coerce')
    return (minutes * 60 + seconds).astype('float32')


def load_event_data(csv_path: Path) -> pd.DataFrame:
    """Read one event CSV and normalise its column types.

    Steps, in order:
      * drop an ``index`` column when the file has one;
      * cast ``mint_token_id`` to ``str``;
      * derive ``timestamp_seconds`` from ``timestamp`` when only the latter exists;
      * coerce every column not listed in ``NON_NUMERIC_COLUMNS`` to float32,
        turning unparseable values into NaN.

    Args:
        csv_path: Location of the CSV file.

    Returns:
        The cleaned event frame.
    """
    frame = pd.read_csv(csv_path, low_memory=False)
    if 'index' in frame.columns:
        frame = frame.drop(columns=['index'])
    frame['mint_token_id'] = frame['mint_token_id'].astype(str)

    needs_seconds = 'timestamp_seconds' not in frame.columns and 'timestamp' in frame.columns
    if needs_seconds:
        frame['timestamp_seconds'] = parse_timestamp_series(frame['timestamp'])

    # Everything outside the non-numeric allow-list is treated as a number.
    numeric_candidates = [name for name in frame.columns if name not in NON_NUMERIC_COLUMNS]
    for name in numeric_candidates:
        frame[name] = pd.to_numeric(frame[name], errors='coerce')
    frame[numeric_candidates] = frame[numeric_candidates].astype('float32')
    return frame


def aggregate_numeric(grouped: pd.core.groupby.generic.DataFrameGroupBy, numeric_cols: List[str]) -> pd.DataFrame:
    """Apply every statistic in ``AGG_NUMERIC_FUNCS`` to each numeric column per group.

    Args:
        grouped: Events grouped by token.
        numeric_cols: Columns to summarise.

    Returns:
        One row per group with columns named ``<column>_<stat>``; a frame with
        the group index and no columns when ``numeric_cols`` is empty.
    """
    if numeric_cols:
        summary = grouped[numeric_cols].agg(AGG_NUMERIC_FUNCS)
        # Flatten the (column, stat) MultiIndex into single-level names.
        summary.columns = [f"{col}_{stat}" for col, stat in summary.columns]
        return summary
    return pd.DataFrame(index=grouped.size().index)


def compute_quantiles(grouped: pd.core.groupby.generic.DataFrameGroupBy, numeric_cols: List[str]) -> pd.DataFrame:
    """Compute the ``QUANTILE_LEVELS`` quantiles of each numeric column per group.

    Args:
        grouped: Events grouped by token.
        numeric_cols: Columns to summarise.

    Returns:
        One row per group with columns named ``<column>_qNN`` where ``NN`` is
        the quantile level as a percentage. A frame with the group index and no
        columns is returned when there is nothing to compute.
    """
    # Same guard as aggregate_numeric(): nothing to summarise without columns.
    if not numeric_cols:
        return pd.DataFrame(index=grouped.size().index)
    frames = []
    for q in QUANTILE_LEVELS:
        quant = grouped[numeric_cols].quantile(q)
        # round() rather than int(): q * 100 carries float error (0.29 * 100 is
        # 28.999...), and truncation would mislabel such a level as q28.
        quant.columns = [f"{col}_q{round(q * 100):02d}" for col in quant.columns]
        frames.append(quant)
    if frames:
        return pd.concat(frames, axis=1)
    return pd.DataFrame(index=grouped.size().index)


def pivot_categorical(grouped: pd.core.groupby.generic.DataFrameGroupBy, cat_cols: Sequence[str], prefix: str) -> pd.DataFrame:
    """Turn categorical columns into per-group count and share features.

    For every column in ``cat_cols`` two sets of features are produced per
    group: ``<prefix><column>_<value>_count`` (occurrences of the value) and
    the same name suffixed with ``_ratio`` (count divided by the group's total
    count for that column, 0 when that total is 0).

    Args:
        grouped: Events grouped by token.
        cat_cols: Categorical columns to pivot.
        prefix: Text prepended to every generated column name.

    Returns:
        One row per group; a frame with the group index and no columns when
        ``cat_cols`` is empty.
    """
    pieces: List[pd.DataFrame] = []
    for name in cat_cols:
        tally = grouped[name].value_counts(normalize=False).unstack(fill_value=0)
        tally.columns = [f"{prefix}{name}_{str(level)}_count" for level in tally.columns]
        # A zero row total is mapped to NaN so the division cannot produce inf.
        row_totals = tally.sum(axis=1).replace(0, np.nan)
        share = tally.div(row_totals, axis=0).add_suffix('_ratio').fillna(0)
        pieces.append(tally)
        pieces.append(share)
    if not pieces:
        return pd.DataFrame(index=grouped.size().index)
    combined = pd.concat(pieces, axis=1)
    combined.columns = pd.Index(combined.columns).map(str)
    return combined


def derive_first_n_features(ordered: pd.DataFrame, n: int, numeric_cols: Sequence[str]) -> pd.DataFrame:
    """Summarise the first ``n`` rows of every token.

    Rows are taken in the order they appear in ``ordered``, so the caller is
    responsible for sorting beforehand.

    Args:
        ordered: Event rows containing a ``mint_token_id`` column.
        n: Number of leading rows kept per token.
        numeric_cols: Columns for which mean/max/min are computed (as float32).

    Returns:
        One row per token with ``first<n>_time_{min,max,mean}`` (when
        ``timestamp_seconds`` exists) and ``first<n>_<column>_<stat>`` columns.
        An entirely empty frame is returned for empty input.
    """
    if ordered.empty:
        return pd.DataFrame()

    head_rows = ordered.groupby('mint_token_id', sort=False).head(n)
    per_token = head_rows.groupby('mint_token_id')
    pieces: List[pd.DataFrame] = []

    if 'timestamp_seconds' in head_rows.columns:
        time_names = {
            'min': f'first{n}_time_min',
            'max': f'first{n}_time_max',
            'mean': f'first{n}_time_mean',
        }
        time_stats = per_token['timestamp_seconds'].agg(['min', 'max', 'mean'])
        pieces.append(time_stats.rename(columns=time_names))

    if numeric_cols:
        value_stats = per_token[list(numeric_cols)].agg(['mean', 'max', 'min']).astype('float32')
        value_stats.columns = [f'first{n}_{col}_{stat}' for col, stat in value_stats.columns]
        pieces.append(value_stats)

    if not pieces:
        return pd.DataFrame(index=ordered['mint_token_id'].drop_duplicates())
    return pd.concat(pieces, axis=1)


def compute_signed_aggregates(ordered: pd.DataFrame, numeric_cols: Sequence[str]) -> pd.DataFrame:
    """Aggregate the strictly positive and strictly negative values of each column per token.

    Non-numeric columns are skipped, NaN values are ignored, and zeros belong
    to neither sign. A sign with no rows for a column produces no columns.

    Args:
        ordered: Event rows containing a ``mint_token_id`` column.
        numeric_cols: Candidate columns.

    Returns:
        One row per token with float32 columns named
        ``<column>_{pos,neg}_{sum,mean,max,min,count}``; a frame indexed by the
        unique token ids with no columns when nothing was aggregated.
    """
    stat_names = ['sum', 'mean', 'max', 'min', 'count']
    pieces: List[pd.DataFrame] = []
    for col in numeric_cols:
        if not pd.api.types.is_numeric_dtype(ordered[col]):
            continue
        valid = ordered[['mint_token_id', col]].dropna()
        if valid.empty:
            continue
        by_sign = {
            'pos': valid[valid[col] > 0],
            'neg': valid[valid[col] < 0],
        }
        for label, rows in by_sign.items():
            if rows.empty:
                continue
            summary = rows.groupby('mint_token_id')[col].agg(stat_names).astype('float32')
            summary.columns = [f'{col}_{label}_{stat}' for stat in summary.columns]
            pieces.append(summary)
    if not pieces:
        return pd.DataFrame(index=ordered['mint_token_id'].drop_duplicates())
    return pd.concat(pieces, axis=1)


def compute_time_window_features(ordered: pd.DataFrame, numeric_cols: Sequence[str]) -> pd.DataFrame:
    """Summarise the events falling inside each early time window.

    For every bound in ``EARLY_TIME_WINDOWS`` the rows with
    ``relative_time <= bound`` are grouped per token, yielding an event count
    plus sum/mean/max (float32) of each usable numeric column.

    Args:
        ordered: Event rows containing a ``mint_token_id`` column.
        numeric_cols: Columns to aggregate; names absent from ``ordered`` are ignored.

    Returns:
        One row per token with ``time_window_<bound>_...`` columns. When
        ``relative_time`` is missing, or no window holds any row, a frame
        indexed by the unique token ids with no columns is returned.
    """
    def _no_features() -> pd.DataFrame:
        return pd.DataFrame(index=ordered['mint_token_id'].drop_duplicates())

    if 'relative_time' not in ordered.columns:
        return _no_features()

    # Row filtering never changes the column set, so this can be resolved once.
    usable_cols = [c for c in numeric_cols if c in ordered.columns]
    pieces: List[pd.DataFrame] = []
    for window in EARLY_TIME_WINDOWS:
        in_window = ordered[ordered['relative_time'] <= window]
        if in_window.empty:
            continue
        per_token = in_window.groupby('mint_token_id', sort=False)
        event_counts = per_token.size().rename(f'time_window_{window}_event_count')
        pieces.append(event_counts.to_frame())
        if usable_cols:
            stats = per_token[usable_cols].agg(['sum', 'mean', 'max']).astype('float32')
            stats.columns = [f'time_window_{window}_{col}_{stat}' for col, stat in stats.columns]
            pieces.append(stats)

    if not pieces:
        return _no_features()
    return pd.concat(pieces, axis=1)


def add_ratio_features(features: pd.DataFrame) -> pd.DataFrame:
    """Append rate, ratio and range features derived from existing columns.

    Each derived column is added only when all of its source columns exist.
    Zero denominators are replaced by NaN before dividing, and any infinities
    left in the frame are replaced by ``NUMERIC_FILL_VALUE``.

    Args:
        features: Per-token feature frame; it is not modified.

    Returns:
        A copy of ``features`` with the extra columns.
    """
    buy_col = 'cat_trade_mode_buy_count'
    sell_col = 'cat_trade_mode_sell_count'
    out = features.copy()

    def present(*names: str) -> bool:
        return all(name in out.columns for name in names)

    def nonzero(name: str) -> pd.Series:
        return out[name].replace(0, np.nan)

    if present('event_count'):
        denom = nonzero('event_count')
        if present(buy_col):
            out['buy_event_rate'] = out[buy_col] / denom
        if present(sell_col):
            out['sell_event_rate'] = out[sell_col] / denom
        if present(buy_col, sell_col):
            out['buy_to_sell_ratio'] = out[buy_col] / nonzero(sell_col)

    if present('lifetime_seconds', 'event_count'):
        out['event_rate_per_second'] = out['event_count'] / nonzero('lifetime_seconds')

    for prefix in ['sol_volume', 'token_volume', 'price', 'market_cap']:
        max_col, min_col = f'{prefix}_max', f'{prefix}_min'
        if present(max_col, min_col):
            out[f'{prefix}_range'] = out[max_col] - out[min_col]

    if present('unique_holders', 'event_count'):
        out['holders_per_event'] = out['unique_holders'] / nonzero('event_count')
    if present('unique_creators', 'event_count'):
        out['creators_per_event'] = out['unique_creators'] / nonzero('event_count')

    return out.replace([np.inf, -np.inf], NUMERIC_FILL_VALUE)


def finalise_features(features: pd.DataFrame) -> pd.DataFrame:
    """Clean and downcast a feature frame.

    Infinities and NaN are replaced by ``NUMERIC_FILL_VALUE``, float64 columns
    become float32, int64 columns become int32, and the index is cast to
    ``str``.

    Args:
        features: Per-token feature frame.

    Returns:
        The cleaned frame.
    """
    cleaned = features.replace([np.inf, -np.inf], NUMERIC_FILL_VALUE).fillna(NUMERIC_FILL_VALUE)
    downcasts = (('float64', 'float32'), ('int64', 'int32'))
    for name in cleaned.columns:
        current = cleaned[name].dtype
        for wide, narrow in downcasts:
            if current == wide:
                cleaned[name] = cleaned[name].astype(narrow)
                break
    cleaned.index = cleaned.index.astype(str)
    return cleaned


def build_features(events: pd.DataFrame) -> pd.DataFrame:
    """Aggregate raw event rows into one feature row per ``mint_token_id``.

    The feature set is the column-wise concatenation of:
      * base counts and timing (event count, lifetime, inter-event deltas,
        unique holders/creators);
      * full-history statistics and quantiles of every numeric column;
      * count/ratio pivots of every object-dtype column other than
        ``mint_token_id``, restricted to its ``CATEGORICAL_TOP_K`` most frequent
        values in ``events`` (all other values are bucketed as ``'__OTHER__'``);
      * statistics over the first ``n`` events for each ``EARLY_EVENT_WINDOWS`` entry;
      * separate aggregates of positive and negative values;
      * statistics over the early time windows;
      * ratio/range features, followed by NaN/inf cleaning and downcasting.

    Args:
        events: Event rows with a ``mint_token_id`` column. ``timestamp_seconds``
            is derived from ``timestamp`` when only the latter is present.

    Returns:
        A frame indexed by token id (as ``str``), sorted by index. An entirely
        empty frame is returned for empty input.
    """
    if events.empty:
        return pd.DataFrame()
    events = events.copy()
    events['mint_token_id'] = events['mint_token_id'].astype(str)
    if 'timestamp_seconds' not in events.columns and 'timestamp' in events.columns:
        events['timestamp_seconds'] = parse_timestamp_series(events['timestamp'])

    # Every later step tolerates a missing timestamp column, so the sort must too.
    has_time = 'timestamp_seconds' in events.columns
    sort_keys = ['mint_token_id', 'timestamp_seconds'] if has_time else ['mint_token_id']
    # mergesort is stable: rows with equal keys keep their input order.
    ordered = events.sort_values(sort_keys, kind='mergesort')
    if has_time:
        first_time = ordered.groupby('mint_token_id')['timestamp_seconds'].transform('min')
        ordered['relative_time'] = ordered['timestamp_seconds'] - first_time
        ordered['delta_time'] = ordered.groupby('mint_token_id')['timestamp_seconds'].diff().fillna(0)
    grouped = ordered.groupby('mint_token_id', sort=False)

    numeric_cols = [
        col for col in ordered.columns
        if col not in {'mint_token_id'} and pd.api.types.is_numeric_dtype(ordered[col])
    ]
    categorical_cols = [
        col for col in ordered.columns
        if ordered[col].dtype == 'object' and col not in {'mint_token_id'}
    ]

    base = pd.DataFrame(index=grouped.size().index)
    base['event_count'] = grouped.size().astype('int32')
    if 'timestamp_seconds' in ordered.columns:
        base['lifetime_seconds'] = (grouped['timestamp_seconds'].max() - grouped['timestamp_seconds'].min()).fillna(0)
        base['time_to_first_event'] = grouped['timestamp_seconds'].min().fillna(0)
    if 'relative_time' in ordered.columns:
        base['relative_time_max'] = grouped['relative_time'].max().fillna(0)
    if 'delta_time' in ordered.columns:
        base['delta_time_mean'] = grouped['delta_time'].mean().fillna(0)
        base['delta_time_std'] = grouped['delta_time'].std().fillna(0)
    if 'holder' in ordered.columns:
        base['unique_holders'] = grouped['holder'].nunique(dropna=True)
    if 'creator' in ordered.columns:
        base['unique_creators'] = grouped['creator'].nunique(dropna=True)

    numeric_summary = aggregate_numeric(grouped, numeric_cols)
    quantile_summary = compute_quantiles(grouped, numeric_cols)

    cat_frames: List[pd.DataFrame] = []
    if categorical_cols:
        # The copy is only needed when values are about to be re-bucketed.
        capped_events = ordered.copy()
        for col in categorical_cols:
            top_values = capped_events[col].value_counts().head(CATEGORICAL_TOP_K).index
            capped_events[col] = capped_events[col].where(capped_events[col].isin(top_values), other='__OTHER__')
        capped_grouped = capped_events.groupby('mint_token_id', sort=False)
        cat_frames.append(pivot_categorical(capped_grouped, categorical_cols, prefix='cat_'))

    first_n_frames = [derive_first_n_features(ordered, n, numeric_cols) for n in EARLY_EVENT_WINDOWS]
    signed_frames = compute_signed_aggregates(ordered, numeric_cols)
    time_window_frames = compute_time_window_features(ordered, [c for c in numeric_cols if c not in {'relative_time', 'delta_time'}])

    features = pd.concat([
        base,
        numeric_summary,
        quantile_summary,
        *cat_frames,
        *first_n_frames,
        signed_frames,
        time_window_frames,
    ], axis=1)

    # `base` and `numeric_summary` both emit relative_time_max, delta_time_mean
    # and delta_time_std. Duplicate labels make `features[col]` return a
    # DataFrame, which breaks the per-column dtype check in finalise_features()
    # and any later column selection by name. Keep the first occurrence.
    features = features.loc[:, ~features.columns.duplicated(keep='first')]

    features = add_ratio_features(features)
    features = finalise_features(features)
    features = features.sort_index()
    return features


In [None]:
def evaluate_thresholds(y_true: pd.Series, y_prob: np.ndarray, thresholds: Iterable[float]) -> pd.DataFrame:
    """Score binary predictions at each candidate decision threshold.

    A sample is predicted positive when its probability is ``>= threshold``.

    Args:
        y_true: Ground-truth 0/1 labels.
        y_prob: Predicted probability of the positive class, aligned with ``y_true``.
        thresholds: Candidate thresholds.

    Returns:
        One row per threshold with columns ``threshold``, ``accuracy``,
        ``precision``, ``recall``, ``f1``, ``jaccard``, ``tp``, ``fp``, ``fn``
        and ``tn``.
    """
    records = []
    for thr in thresholds:
        y_pred = (y_prob >= thr).astype(int)
        acc = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average='binary', zero_division=0
        )
        try:
            jac = jaccard_score(y_true, y_pred)
        except ValueError:
            jac = 0.0
        # Pin the label set: without it the matrix collapses to 1x1 whenever
        # labels and predictions contain a single shared class, and the
        # four-way unpacking below raises ValueError.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        records.append({
            'threshold': float(thr),
            'accuracy': acc,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'jaccard': jac,
            'tp': int(tp),
            'fp': int(fp),
            'fn': int(fn),
            'tn': int(tn),
        })
    return pd.DataFrame.from_records(records)


def select_best_threshold(metrics: pd.DataFrame, min_recall: float = 0.88) -> pd.Series:
    """Choose the best row of a threshold-metrics table.

    Rows whose recall reaches ``min_recall`` are preferred; when none does,
    every row is a candidate. Candidates are ranked by F1, then recall, then
    precision, then accuracy (all descending).

    Args:
        metrics: Table with at least ``f1``, ``recall``, ``precision`` and
            ``accuracy`` columns.
        min_recall: Recall floor applied before ranking.

    Returns:
        The top-ranked row.

    Raises:
        ValueError: If ``metrics`` is empty.
    """
    if metrics.empty:
        raise ValueError('No threshold metrics available.')

    eligible = metrics[metrics['recall'] >= min_recall]
    candidates = metrics.copy() if eligible.empty else eligible

    ranking_keys = ['f1', 'recall', 'precision', 'accuracy']
    ranked = candidates.sort_values(ranking_keys, ascending=[False] * len(ranking_keys))
    return ranked.iloc[0]


In [None]:
# Load the training events and the ids of the tokens labelled as targets.
sample_events = load_event_data(DATA_DIR / TRAIN_FILENAME)
target_tokens = pd.read_csv(TARGET_PATH, header=None, names=['mint_token_id'])
target_tokens['mint_token_id'] = target_tokens['mint_token_id'].astype(str)
target_set = set(target_tokens['mint_token_id'])

print(f'Sample events shape: {sample_events.shape}')
# Double-quoted outer string: reusing the same quote character inside an
# f-string expression is a SyntaxError before Python 3.12.
print(f"Unique tokens in sample events: {sample_events['mint_token_id'].nunique()}")
print(f'Target tokens provided: {len(target_set)}')

# One feature row per token; the label marks membership in the target list.
train_features = build_features(sample_events)
train_features['is_target'] = train_features.index.isin(target_set).astype('int8')

print(f'Training feature matrix: {train_features.shape}')
class_counts = train_features['is_target'].value_counts().rename('token_count')
display(class_counts.to_frame())
positive_rate = class_counts.get(1, 0) / class_counts.sum()
print(f'Positive token rate: {positive_rate:.4f}')


In [None]:
# Every column except the label is a model input; the column order fixed here
# is reused when the evaluation matrix is assembled.
feature_columns = [col for col in train_features.columns if col != 'is_target']
X = train_features[feature_columns].astype('float32')
y = train_features['is_target'].astype('int32')

# Hyper-parameters shared by all CatBoost models. Each training run copies this
# dict and adds its own 'random_seed'.
catboost_base_params = {
    'depth': 8,
    'learning_rate': 0.035,
    'iterations': 4500,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'l2_leaf_reg': 10.0,
    'bagging_temperature': 0.6,
    'random_strength': 1.5,
    'border_count': 128,
    'auto_class_weights': 'Balanced',
    'grow_policy': 'SymmetricTree',
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.9,
    'task_type': 'CPU',
}

print(f'Feature matrix shape: {X.shape}')
print(f'Using {len(feature_columns)} engineered features')


In [None]:
# Repeated stratified K-fold cross-validation: one full K-fold pass per seed.
# oof_prob accumulates the *sum* of out-of-fold probabilities per row and
# oof_counts how many times each row was scored; the averaging happens in the
# next cell.
oof_prob = np.zeros(len(X), dtype='float32')
oof_counts = np.zeros(len(X), dtype='int32')
feature_importance_accumulator = np.zeros(len(feature_columns), dtype='float64')
fold_records: List[Dict] = []
models: List[CatBoostClassifier] = []
thresholds: List[float] = []

for seed in SEEDS:
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
    for fold_idx, (train_idx, valid_idx) in enumerate(skf.split(X, y), start=1):
        params = dict(catboost_base_params)
        # Distinct model seed for every (seed, fold) combination.
        params['random_seed'] = seed + fold_idx

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        train_pool = Pool(X_train, label=y_train)
        valid_pool = Pool(X_valid, label=y_valid)

        # NOTE(review): the validation fold doubles as the early-stopping
        # eval_set, so the fold metrics and thresholds below are computed on
        # data that also selected the iteration count — likely optimistic.
        model = CatBoostClassifier(**params)
        model.fit(
            train_pool,
            eval_set=valid_pool,
            verbose=250,
            use_best_model=True,
            early_stopping_rounds=200,
        )

        fold_prob = model.predict_proba(valid_pool)[:, 1]
        oof_prob[valid_idx] += fold_prob
        oof_counts[valid_idx] += 1

        # Per-fold threshold search with a recall floor of 0.90.
        metrics_df = evaluate_thresholds(y_valid, fold_prob, THRESHOLD_GRID)
        best_row = select_best_threshold(metrics_df, min_recall=0.90)

        fold_record = best_row.to_dict()
        fold_record.update({
            'seed': seed,
            'fold': fold_idx,
            'auc': roc_auc_score(y_valid, fold_prob),
            'positive_rate': float(y_valid.mean()),
            'best_iteration': model.get_best_iteration(),
        })
        fold_records.append(fold_record)
        thresholds.append(float(best_row['threshold']))

        feature_importance_accumulator += model.get_feature_importance(type='FeatureImportance')
        # Every fold model is kept in memory; later cells only use len(models).
        models.append(model)

        print(
            f"Seed {seed} Fold {fold_idx}: AUC={fold_record['auc']:.4f} F1={fold_record['f1']:.4f} "
            f"Recall={fold_record['recall']:.4f} Thr={fold_record['threshold']:.3f}"
        )

fold_summary_df = pd.DataFrame(fold_records)
if not fold_summary_df.empty:
    display(fold_summary_df.sort_values(['seed', 'fold']).reset_index(drop=True))

# The median of the per-fold thresholds is the decision threshold used for the
# OOF report and for the final predictions; 0.5 is the fallback.
aggregated_threshold = float(np.median(thresholds)) if thresholds else 0.5
print(f'Median optimal threshold across folds: {aggregated_threshold:.4f}')


In [None]:
# Turn the accumulated probability sums into per-row averages.
# NOTE(review): the division mutates oof_prob in place, so this cell is not
# idempotent — re-running it without re-running the CV cell divides the
# probabilities by oof_counts a second time.
valid_mask = oof_counts > 0
oof_prob[valid_mask] = oof_prob[valid_mask] / oof_counts[valid_mask]

# Out-of-fold metrics at the aggregated (median) threshold.
agg_pred = (oof_prob >= aggregated_threshold).astype(int)
precision, recall, f1, _ = precision_recall_fscore_support(y, agg_pred, average='binary', zero_division=0)
accuracy = accuracy_score(y, agg_pred)
jaccard = jaccard_score(y, agg_pred)
auc = roc_auc_score(y, oof_prob)

print(f'OOF metrics at aggregated threshold {aggregated_threshold:.4f}:')
print(f'  Accuracy: {accuracy:.4f}')
print(f'  Precision: {precision:.4f}')
print(f'  Recall: {recall:.4f}')
print(f'  F1-score: {f1:.4f}')
print(f'  Jaccard: {jaccard:.4f}')
print(f'  AUC: {auc:.4f}')

# The 2x2 labelling below assumes both classes occur in y or agg_pred.
conf_mtx = confusion_matrix(y, agg_pred)
conf_df = pd.DataFrame(conf_mtx, index=['Actual 0', 'Actual 1'], columns=['Pred 0', 'Pred 1'])
display(conf_df)

print('Classification report:')
print(classification_report(y, agg_pred, digits=4))


In [None]:
# Average the accumulated importances over the CV models (guarding against an
# empty model list) and show the 40 highest-ranked features.
n_cv_models = max(1, len(models))
feature_importances = pd.Series(
    feature_importance_accumulator / n_cv_models,
    index=feature_columns,
).sort_values(ascending=False)
display(feature_importances.head(40).to_frame(name='importance'))


In [None]:
# Train one model per seed on the full training set.
#
# The decision threshold was tuned on CV models that were early-stopped
# (use_best_model=True), i.e. truncated to best_iteration + 1 trees. Fitting the
# final models for the full 'iterations' budget without an eval_set would give
# them a different number of trees, and therefore a different probability
# scale, than the models the threshold was calibrated on. The tree count is
# therefore taken from the median CV best iteration, falling back to the
# configured budget when no CV record is available.
best_iterations = [
    int(record['best_iteration'])
    for record in fold_records
    if record.get('best_iteration') is not None
]
if best_iterations:
    # get_best_iteration() is a zero-based index, hence the + 1.
    final_iterations = int(np.median(best_iterations)) + 1
else:
    final_iterations = catboost_base_params['iterations']
final_iterations = max(1, min(final_iterations, catboost_base_params['iterations']))

final_models: List[CatBoostClassifier] = []
for seed in SEEDS:
    params = dict(catboost_base_params)
    params['random_seed'] = seed
    params['iterations'] = final_iterations
    final_model = CatBoostClassifier(**params)
    final_model.fit(Pool(X, label=y), verbose=250)
    final_models.append(final_model)

print(f'Trained {len(final_models)} final models for ensembling.')


In [None]:
# Load every evaluation chunk (sorted by path for a deterministic order).
evaluation_files = sorted(DATA_DIR.glob(EVALUATION_PATTERN))
if not evaluation_files:
    raise FileNotFoundError('No evaluation set chunks found. Check the dataset path and pattern.')

print(f'Found {len(evaluation_files)} evaluation chunks.')

eval_events = pd.concat([load_event_data(path) for path in evaluation_files], ignore_index=True)
eval_events['mint_token_id'] = eval_events['mint_token_id'].astype(str)

eval_features = build_features(eval_events)

# Tokens are reported in order of first appearance in the evaluation events.
eval_token_order = eval_events['mint_token_id'].drop_duplicates().astype(str).tolist()

# Both checks must run BEFORE the reindex: afterwards every requested row and
# column exists by construction, so the lists would always be empty.
missing_cols = [col for col in feature_columns if col not in eval_features.columns]
missing_tokens = [tok for tok in eval_token_order if tok not in eval_features.index]

# A single reindex aligns rows to the token order and columns to the training
# layout: features unseen at evaluation time and tokens without a feature row
# are created as NaN, then filled; evaluation-only columns are dropped.
eval_features = (
    eval_features
    .reindex(index=eval_token_order, columns=feature_columns)
    .fillna(NUMERIC_FILL_VALUE)
    .astype('float32')
)

if len(eval_features) != EXPECTED_EVAL_ROWS:
    raise ValueError(f'Expected {EXPECTED_EVAL_ROWS} evaluation rows, found {len(eval_features)}')

print(f'Evaluation events shape: {eval_events.shape}')
print(f'Evaluation feature matrix: {eval_features.shape}')


In [None]:
# Score the evaluation tokens with every final model.
eval_pool = Pool(eval_features)
model_probs = []
for model in final_models:
    model_probs.append(model.predict_proba(eval_pool)[:, 1])

# Ensemble = unweighted mean of the per-model positive-class probabilities,
# binarised with the median threshold found during cross-validation.
ensemble_prob = np.mean(model_probs, axis=0)
ensemble_pred = (ensemble_prob >= aggregated_threshold).astype(int)

detailed_df = pd.DataFrame({
    'mint_token_id': eval_features.index,
    'prediction_score': ensemble_prob,
    'is_target': ensemble_pred,
}).reset_index(drop=True)

# submission: id + label; positive: predicted targets only, with the threshold used.
submission_df = detailed_df[['mint_token_id', 'is_target']]
positive_df = detailed_df[detailed_df['is_target'] == 1].copy()
positive_df['threshold_used'] = aggregated_threshold

submission_path = OUTPUT_DIR / 'submission.csv'
detailed_path = OUTPUT_DIR / 'detailed_predictions.csv'
positive_path = OUTPUT_DIR / 'positive_predictions.csv'

submission_df.to_csv(submission_path, index=False)
detailed_df.to_csv(detailed_path, index=False)
positive_df.to_csv(positive_path, index=False)

print(f'Saved submission to {submission_path} (rows={len(submission_df)})')
print(f'Saved detailed predictions to {detailed_path}')
print(f'Saved positive predictions to {positive_path} (rows={len(positive_df)})')


In [None]:
# Show the 20 evaluation tokens with the highest ensemble score.
display(detailed_df.sort_values('prediction_score', ascending=False).head(20))


In [None]:
# Persist key artefacts for downstream analysis
fold_summary_path = OUTPUT_DIR / 'cv_fold_summary.csv'
feature_importances_path = OUTPUT_DIR / 'feature_importances.csv'

# The fold summary is only written when cross-validation produced records.
if not fold_summary_df.empty:
    fold_summary_df.to_csv(fold_summary_path, index=False)
    print(f'Fold summary saved to {fold_summary_path}')

# The Series index (feature names) is written as the first CSV column.
feature_importances.to_csv(feature_importances_path, header=['importance'])
print(f'Feature importances saved to {feature_importances_path}')
