# Alpha Radar Solana Sprint — High-Recall CatBoost Solution

This notebook rebuilds the end-to-end pipeline for the Alpha Radar Solana Sprint competition. It focuses on
feature engineering that captures ultra-early token behaviours, stratified cross-validation with F1-driven threshold
tuning, and a final CatBoost model that up-weights the positive class to favour recall while the F1-tuned decision
threshold keeps precision in check.


## 1. Environment and configuration


In [None]:
import json
from pathlib import Path
from typing import Dict, Iterable, List, Tuple

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    jaccard_score,
    precision_recall_curve,
    precision_recall_fscore_support,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold

# Show every column when a DataFrame is displayed in the notebook.
pd.set_option('display.max_columns', None)
# Seed NumPy's global RNG; CatBoost and StratifiedKFold receive their own seeds further down.
np.random.seed(2025)

# All paths are resolved relative to the current working directory.
BASE_DIR = Path('.')
DATA_DIR = BASE_DIR / 'Dataset' / 'alpha-radar-solana-sprint'
TARGET_PATH = BASE_DIR / 'Dataset' / 'target_tokens.csv'
OUTPUT_DIR = BASE_DIR / 'outputs'
# Create the output folder up front; exist_ok makes re-running the cell harmless.
OUTPUT_DIR.mkdir(exist_ok=True)
# Number of evaluation tokens the inference cell requires before it writes a submission.
EXPECTED_EVAL_ROWS = 64208


## 2. Utility helpers


In [None]:
def parse_timestamp_to_seconds(value) -> float:
    """Convert a timestamp into float seconds.

    Accepted inputs:

    * missing values (``None`` / ``NaN``) -> ``NaN``
    * ``int`` / ``float`` -> returned as ``float`` unchanged
    * plain numeric text such as ``'12.5'`` -> ``12.5``
    * ``'mm:ss.s'`` text such as ``'01:30.5'`` -> ``90.5``
    * ``'hh:mm:ss.s'`` text such as ``'1:02:03.5'`` -> ``3723.5``

    Anything that cannot be parsed (including text with more than three
    colon-separated fields) yields ``NaN`` rather than raising.
    """
    if pd.isna(value):
        return np.nan
    if isinstance(value, (int, float)):
        return float(value)
    text = str(value).strip()
    if not text:
        return np.nan

    # The last field is (possibly fractional) seconds; the fields before it,
    # read right-to-left, are whole minutes and then whole hours.
    *leading, seconds = text.split(':')
    if len(leading) > 2:
        return np.nan
    try:
        total = float(seconds)
        for scale, part in zip((60.0, 3600.0), reversed(leading)):
            total += int(part) * scale
    except ValueError:
        return np.nan
    return total


def coerce_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Convert object columns that mostly hold number-like text to numeric dtype.

    For every object column the non-null values are stringified and stripped
    of thousands separators (``,``).  When at least 85% of them look like a
    plain integer or decimal, the column is replaced by its numeric version;
    entries that still fail to parse become ``NaN``.  The frame is modified
    in place and also returned.
    """
    number_pattern = r'-?\d+(?:\.\d+)?'
    object_columns = df.select_dtypes(include=['object']).columns
    for name in object_columns:
        text_values = df[name].dropna().astype(str)
        if text_values.empty:
            # Nothing to inspect in an all-null column.
            continue
        text_values = text_values.str.replace(',', '', regex=False)
        share_numeric = text_values.str.fullmatch(number_pattern).mean()
        if share_numeric < 0.85:
            continue
        # Assignment aligns on the index, so rows dropped above end up as NaN.
        df[name] = pd.to_numeric(text_values, errors='coerce')
    return df


def load_event_csv(path: Path) -> pd.DataFrame:
    """Read one event CSV and normalise it for feature building.

    Drops an ``index`` column if the file has one, derives
    ``timestamp_seconds`` (float32) from ``timestamp`` when it is not already
    present, and soft-converts number-like object columns.
    """
    frame = pd.read_csv(path)
    # errors='ignore' makes the drop a no-op when the column is absent.
    frame = frame.drop(columns=['index'], errors='ignore')

    needs_seconds = 'timestamp' in frame.columns and 'timestamp_seconds' not in frame.columns
    if needs_seconds:
        parsed = frame['timestamp'].apply(parse_timestamp_to_seconds)
        frame['timestamp_seconds'] = parsed.astype('float32')

    return coerce_numeric(frame)


def load_events(paths: Iterable[Path]) -> pd.DataFrame:
    """Load every event CSV in ``paths`` and stack them into one frame.

    Each file is read with ``load_event_csv``; the results are concatenated
    with a fresh ``RangeIndex``.  Progress is printed per file.

    Raises:
        ValueError: if ``paths`` yields no files, so the caller gets an
            explicit message instead of pandas' generic concat error.
    """
    frames = []
    for path in paths:
        print(f'Loading {path.name} ...')
        frames.append(load_event_csv(path))
    if not frames:
        raise ValueError('No event files were provided to load_events.')
    data = pd.concat(frames, ignore_index=True)
    print(f'Loaded {len(data):,} rows across {len(frames)} file(s).')
    return data


def load_target_tokens(path: Path) -> pd.Index:
    """Read the positive-token list and return the unique ids as a string index.

    A single-column file is used as is.  Otherwise the first of
    ``mint_token_id``, ``token`` or ``mint`` that exists is used, falling back
    to the first column.  Nulls are dropped and the values are cast to ``str``.
    """
    table = pd.read_csv(path)

    chosen = None
    if table.shape[1] != 1:
        preferred = ('mint_token_id', 'token', 'mint')
        chosen = next((name for name in preferred if name in table.columns), None)
    column = table.iloc[:, 0] if chosen is None else table[chosen]

    unique_tokens = column.dropna().astype(str).unique()
    return pd.Index(unique_tokens, name='mint_token_id')


## 3. Feature engineering


In [None]:
# Statistics computed for every numeric column within each token group.
AGG_NUMERIC_FUNCS = ['mean', 'std', 'min', 'max', 'sum', 'median', 'last']


def aggregate_numeric(grouped: pd.core.groupby.generic.DataFrameGroupBy, numeric_cols: List[str]) -> pd.DataFrame:
    """Summarise ``numeric_cols`` per group with ``AGG_NUMERIC_FUNCS``.

    Returns one row per group with flattened ``<column>_<stat>`` column
    names, or an empty frame indexed by the group keys when there is nothing
    to aggregate.
    """
    if numeric_cols:
        summary = grouped[numeric_cols].agg(AGG_NUMERIC_FUNCS)
        # Flatten the (column, stat) MultiIndex into plain strings.
        summary.columns = [f'{column}_{func}' for column, func in summary.columns]
        return summary
    return pd.DataFrame(index=grouped.size().index)


def pivot_categorical(grouped: pd.core.groupby.generic.DataFrameGroupBy, cat_cols: List[str], prefix: str) -> pd.DataFrame:
    """Build per-group count and share columns for each categorical column.

    For every column in ``cat_cols`` the result holds
    ``<prefix><col>_<value>_count`` and the matching ``..._count_ratio``
    (count divided by the group's total for that column; 0 when the total is
    0).  With no categorical columns an empty frame indexed by the group keys
    is returned.
    """
    pieces = []
    for name in cat_cols:
        tally = grouped[name].value_counts(normalize=False).unstack(fill_value=0)
        tally.columns = [f'{prefix}{name}_{str(level)}_count' for level in tally.columns]
        # Swap zero totals for NaN so the division cannot hit 0/0; filled back with 0.
        row_totals = tally.sum(axis=1).replace(0, np.nan)
        share = tally.div(row_totals, axis=0).add_suffix('_ratio').fillna(0)
        pieces.extend([tally, share])

    if not pieces:
        return pd.DataFrame(index=grouped.size().index)

    combined = pd.concat(pieces, axis=1)
    combined.columns = pd.Index(combined.columns).map(str)
    return combined


def derive_first_n_features(ordered: pd.DataFrame, grouped: pd.core.groupby.generic.DataFrameGroupBy, n: int) -> pd.DataFrame:
    """Summarise the first ``n`` rows of every token in ``ordered``.

    Produces, where the source columns exist:

    * ``first{n}_time_min/max/mean`` from ``timestamp_seconds``
    * ``first{n}_event_<type>_count`` from ``event_type``
    * ``first{n}_<col>_mean/max/min`` (float32) for every numeric column

    ``grouped`` only supplies the token index for the empty fallback frame
    returned when none of these can be computed.
    """
    head_rows = ordered.groupby('mint_token_id', sort=False).head(n)
    per_token = head_rows.groupby('mint_token_id')
    label = f'first{n}'
    blocks = []

    if 'timestamp_seconds' in head_rows.columns:
        timing = per_token['timestamp_seconds'].agg(['min', 'max', 'mean'])
        timing.columns = [f'{label}_time_{stat}' for stat in timing.columns]
        blocks.append(timing)

    if 'event_type' in head_rows.columns:
        event_counts = head_rows.groupby(['mint_token_id', 'event_type']).size().unstack(fill_value=0)
        event_counts.columns = [f'{label}_event_{kind}_count' for kind in event_counts.columns]
        blocks.append(event_counts)

    numeric_candidates = [
        name for name in head_rows.columns
        if name != 'mint_token_id' and pd.api.types.is_numeric_dtype(head_rows[name])
    ]
    if numeric_candidates:
        stats = per_token[numeric_candidates].agg(['mean', 'max', 'min']).astype('float32')
        stats.columns = [f'{label}_{col}_{stat}' for col, stat in stats.columns]
        blocks.append(stats)

    if not blocks:
        return pd.DataFrame(index=grouped.size().index)
    return pd.concat(blocks, axis=1)


def build_token_features(events: pd.DataFrame, *, top_categories: int = 10, first_n_events: Tuple[int, ...] = (5, 10, 20)) -> pd.DataFrame:
    """Collapse an event-level frame into one feature row per ``mint_token_id``.

    The result combines:

    * base counts (``event_count``, lifetime / first timestamp, unique
      holders and trade signatures when those columns exist),
    * per-token statistics for every numeric column,
    * count / ratio pivots of every object column, with values outside the
      ``top_categories`` most frequent ones bucketed as ``'__OTHER__'``,
    * summaries of the first ``n`` events for each ``n`` in ``first_n_events``.

    Missing values in the assembled matrix are filled with 0, floats are
    stored as float32 and integers as int32.  The input frame is not modified.

    Args:
        events: Event rows; must contain ``mint_token_id``.
        top_categories: Number of most frequent values kept per object column.
        first_n_events: Window sizes for the early-event summaries.

    Raises:
        KeyError: if ``events`` has no ``mint_token_id`` column.
    """
    if 'mint_token_id' not in events.columns:
        raise KeyError('Expected mint_token_id column to group events by token.')

    events = events.copy()
    events['mint_token_id'] = events['mint_token_id'].astype(str)

    if 'timestamp_seconds' not in events.columns and 'timestamp' in events.columns:
        events['timestamp_seconds'] = events['timestamp'].apply(parse_timestamp_to_seconds).astype('float32')

    # Only sort on the timestamp when it exists: the rest of this function
    # already treats it as optional, and sorting on a missing column raises.
    # The stable sort keeps the input order within a token in that case.
    sort_keys = ['mint_token_id']
    if 'timestamp_seconds' in events.columns:
        sort_keys.append('timestamp_seconds')
    ordered = events.sort_values(sort_keys, kind='mergesort')
    grouped = ordered.groupby('mint_token_id', sort=False)

    numeric_cols = [
        col for col in ordered.columns
        if col not in {'mint_token_id'} and pd.api.types.is_numeric_dtype(ordered[col])
    ]
    categorical_cols = [
        col for col in ordered.columns
        if ordered[col].dtype == 'object' and col not in {'mint_token_id'}
    ]

    base = pd.DataFrame(index=grouped.size().index)
    base['event_count'] = grouped.size().astype('int32')
    if 'timestamp_seconds' in ordered.columns:
        base['lifetime_seconds'] = (grouped['timestamp_seconds'].max() - grouped['timestamp_seconds'].min()).fillna(0)
        base['time_to_first_trade'] = grouped['timestamp_seconds'].min().fillna(0)

    if 'holder' in ordered.columns:
        base['unique_holders'] = grouped['holder'].nunique(dropna=True)

    if 'trade_signature' in ordered.columns:
        base['unique_trades'] = grouped['trade_signature'].nunique(dropna=True)

    numeric_summary = aggregate_numeric(grouped, numeric_cols)

    pivot_frames = []
    if categorical_cols:
        # Cap cardinality: keep the globally most frequent values per column
        # and fold everything else (including nulls) into '__OTHER__'.
        capped_cols = {}
        for col in categorical_cols:
            top_values = ordered[col].value_counts().head(top_categories).index
            capped_cols[col] = ordered[col].where(ordered[col].isin(top_values), other='__OTHER__')
        capped = ordered.assign(**{col: capped_cols[col] for col in capped_cols})
        capped_grouped = capped.groupby('mint_token_id', sort=False)
        pivot_frames.append(pivot_categorical(capped_grouped, list(capped_cols.keys()), prefix='cat_'))

    first_n_frames = [derive_first_n_features(ordered, grouped, n) for n in first_n_events]

    # All pieces are indexed by token id, so concat aligns them row-wise.
    features = pd.concat([base, numeric_summary, *pivot_frames, *first_n_frames], axis=1).fillna(0)

    # Downcast to 32-bit types to keep the feature matrix compact.
    for col in features.columns:
        if pd.api.types.is_float_dtype(features[col]):
            features[col] = features[col].astype('float32')
        elif pd.api.types.is_integer_dtype(features[col]):
            features[col] = features[col].astype('int32')

    return features


## 4. Build the training matrix


In [None]:
# Build one feature row per token from the sample event file.
training_sources = [DATA_DIR / 'Sample_Dataset.csv']
train_events = load_events(training_sources)
train_features = build_token_features(train_events)

# Label a token 1 when its id appears in the target-token list, 0 otherwise.
positive_tokens = load_target_tokens(TARGET_PATH)
train_features['is_target'] = train_features.index.isin(positive_tokens).astype('int8')

# Report the number of tokens and the class balance.
print(f'Training tokens: {len(train_features):,}')
print(train_features['is_target'].value_counts())


## 5. Stratified cross-validation with F1-driven threshold search


In [None]:
from dataclasses import dataclass


@dataclass
class FoldResult:
    """Validation metrics recorded for one cross-validation fold.

    Instances are turned into rows of ``fold_df`` via ``__dict__``, so the
    field order below is also the column order of that frame.
    """
    # 1-based fold number (the CV loop enumerates from 1).
    fold: int
    # Probability cut-off returned by find_best_threshold for this fold.
    best_threshold: float
    # ROC AUC of the validation probabilities.
    auc: float
    # The remaining metrics are computed on predictions made at best_threshold.
    accuracy: float
    precision: float
    recall: float
    f1: float
    jaccard: float


def find_best_threshold(y_true: np.ndarray, y_prob: np.ndarray) -> Tuple[float, Dict[str, float]]:
    """Pick the probability cut-off that maximises F1 and score it.

    The precision-recall curve supplies the candidate thresholds.  The
    returned metrics (accuracy, precision, recall, f1, jaccard) are computed
    on ``y_prob >= threshold``; ``auc`` is computed on the raw probabilities.

    Returns:
        ``(best_threshold, metrics)``.
    """
    prec_curve, rec_curve, cutoffs = precision_recall_curve(y_true, y_prob)
    # The curve has one more point than thresholds; pad so indices line up.
    cutoffs = np.append(cutoffs, 1.0)
    denominator = np.maximum(prec_curve + rec_curve, 1e-8)
    f1_curve = 2 * prec_curve * rec_curve / denominator
    best_thr = float(cutoffs[np.nanargmax(f1_curve)])

    preds = (y_prob >= best_thr).astype(int)
    precision_val, recall_val, f1_val, _ = precision_recall_fscore_support(
        y_true, preds, average='binary', zero_division=0
    )
    metrics = {
        'accuracy': accuracy_score(y_true, preds),
        'precision': precision_val,
        'recall': recall_val,
        'f1': f1_val,
        'jaccard': jaccard_score(y_true, preds, zero_division=0),
        'auc': roc_auc_score(y_true, y_prob),
    }
    return best_thr, metrics


# Split the training matrix into the feature frame and the binary label vector.
features = train_features.drop(columns=['is_target'])
labels = train_features['is_target'].astype(int)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2025)
fold_results: List[FoldResult] = []
# Per-fold out-of-fold predictions, stacked later for the aggregate metrics.
validation_rows = []

# Negative-to-positive ratio, floored at 1.0; the inner max() guards against zero positives.
class_weight = max(1.0, (len(labels) - labels.sum()) / max(labels.sum(), 1))

# Hyper-parameters shared by every fold model and reused for the final model.
params = dict(
    iterations=2500,
    learning_rate=0.035,
    depth=8,
    l2_leaf_reg=3.0,
    random_seed=2025,
    loss_function='Logloss',
    eval_metric='AUC',
    bootstrap_type='Bernoulli',
    subsample=0.9,
    rsm=0.6,
    grow_policy='Lossguide',
    min_data_in_leaf=32,
    scale_pos_weight=class_weight,
    early_stopping_rounds=200,
    allow_writing_files=False,
    verbose=250,
    custom_metric=['F1', 'Recall'],
)

for fold, (train_idx, valid_idx) in enumerate(skf.split(features, labels), start=1):
    X_train, X_valid = features.iloc[train_idx], features.iloc[valid_idx]
    y_train, y_valid = labels.iloc[train_idx], labels.iloc[valid_idx]

    train_pool = Pool(X_train, y_train)
    valid_pool = Pool(X_valid, y_valid)

    # The validation fold doubles as the eval set for early stopping / best-model selection.
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

    valid_prob = model.predict_proba(X_valid)[:, 1]
    # NOTE(review): the threshold is tuned on the same fold the metrics are reported on,
    # so the per-fold precision/recall/F1 are optimistic estimates.
    best_thr, fold_metrics = find_best_threshold(y_valid.to_numpy(), valid_prob)

    fold_results.append(FoldResult(
        fold=fold,
        best_threshold=best_thr,
        auc=fold_metrics['auc'],
        accuracy=fold_metrics['accuracy'],
        precision=fold_metrics['precision'],
        recall=fold_metrics['recall'],
        f1=fold_metrics['f1'],
        jaccard=fold_metrics['jaccard'],
    ))

    # Keep the raw probabilities so a single global threshold can be evaluated afterwards.
    validation_rows.append(pd.DataFrame({
        'fold': fold,
        'mint_token_id': features.index[valid_idx],
        'y_true': y_valid.to_numpy(),
        'y_prob': valid_prob,
    }))

# One row per fold; the trailing bare expression displays the frame in the notebook.
fold_df = pd.DataFrame([fr.__dict__ for fr in fold_results])
fold_df


In [None]:
# Use the median of the per-fold F1-optimal thresholds as the single decision threshold.
median_threshold = float(fold_df['best_threshold'].median())
print(f'Median threshold (F1-optimal): {median_threshold:.4f}')
print(fold_df[['fold', 'auc', 'accuracy', 'precision', 'recall', 'f1', 'jaccard', 'best_threshold']])

# Pool the out-of-fold predictions from every fold and apply the median threshold to all of them.
stacked_val = pd.concat(validation_rows, ignore_index=True)
stacked_val['y_pred'] = (stacked_val['y_prob'] >= median_threshold).astype(int)

overall_precision, overall_recall, overall_f1, _ = precision_recall_fscore_support(
    stacked_val['y_true'], stacked_val['y_pred'], average='binary', zero_division=0
)
overall_acc = accuracy_score(stacked_val['y_true'], stacked_val['y_pred'])
overall_jaccard = jaccard_score(stacked_val['y_true'], stacked_val['y_pred'], zero_division=0)
# AUC is threshold-free, so it is computed on the pooled probabilities.
overall_auc = roc_auc_score(stacked_val['y_true'], stacked_val['y_prob'])

print(json.dumps({
    'accuracy': round(overall_acc, 6),
    'precision': round(overall_precision, 6),
    'recall': round(overall_recall, 6),
    'f1': round(overall_f1, 6),
    'jaccard': round(overall_jaccard, 6),
    'auc': round(overall_auc, 6),
}, indent=2))

# Confusion matrix of the pooled predictions; rows are actual classes, columns predicted.
conf_mat = confusion_matrix(stacked_val['y_true'], stacked_val['y_pred'])
conf_df = pd.DataFrame(conf_mat, index=['Actual 0', 'Actual 1'], columns=['Pred 0', 'Pred 1'])
conf_df


## 6. Train the final CatBoost model


In [None]:
# Start from the cross-validation hyper-parameters for the fit on all training tokens.
final_params = params.copy()
# NOTE(review): fit() below receives no eval_set, so early_stopping_rounds presumably has
# no effect here and all `iterations` rounds are trained — confirm against the CatBoost docs.
final_params.update(dict(verbose=250, early_stopping_rounds=150))

final_model = CatBoostClassifier(**final_params)
full_pool = Pool(features, labels)
final_model.fit(full_pool)

# Show the 30 highest-importance features of the final model.
feature_importance = pd.Series(final_model.get_feature_importance(), index=features.columns)
feature_importance.sort_values(ascending=False).head(30)


## 7. Inference on the evaluation split


In [None]:
# Collect every evaluation chunk in a deterministic (sorted) order.
eval_paths = sorted(DATA_DIR.glob('evaluation_set_30s_chunk_*.csv'))
if not eval_paths:
    raise FileNotFoundError('Evaluation chunks were not found. Make sure the updated evaluation set is present.')

eval_events = load_events(eval_paths)
# Unique token ids in order of first appearance; this fixes the row order of the submission.
eval_token_index = pd.Index(
    eval_events['mint_token_id'].astype(str).drop_duplicates(),
    name='mint_token_id'
)
# Early warning only; the same count is enforced with an exception further down.
if len(eval_token_index) != EXPECTED_EVAL_ROWS:
    print(f"Warning: expected {EXPECTED_EVAL_ROWS:,} evaluation tokens but found {len(eval_token_index):,} in the loaded events.")

eval_features = build_token_features(eval_events)

# Align the evaluation columns with the training matrix: add columns the evaluation set
# did not produce (filled with 0) and drop columns the model was never trained on.
missing_cols = set(features.columns) - set(eval_features.columns)
for col in missing_cols:
    eval_features[col] = 0.0
extra_cols = set(eval_features.columns) - set(features.columns)
if extra_cols:
    eval_features = eval_features.drop(columns=list(extra_cols))

# Match training column order, then row order; tokens without features get all-zero rows.
eval_features = eval_features[features.columns]
eval_features = eval_features.reindex(eval_token_index).fillna(0.0)
# Cast every column to the dtype it had in the training matrix.
for col, dtype in features.dtypes.items():
    eval_features[col] = eval_features[col].astype(dtype)

if len(eval_features) != EXPECTED_EVAL_ROWS:
    raise ValueError(f"Submission must contain {EXPECTED_EVAL_ROWS:,} rows, but prepared {len(eval_features):,}.")

# Score with the final model and binarise with the median cross-validation threshold.
probabilities = final_model.predict_proba(eval_features)[:, 1]
predictions = (probabilities >= median_threshold).astype(int)

submission = pd.DataFrame({
    'mint_token_id': eval_features.index,
    'is_target': predictions,
})
submission_path = OUTPUT_DIR / 'submission.csv'
submission.to_csv(submission_path, index=False)
print(f'Saved submission to {submission_path}')

# Companion file that also keeps the raw probability and the threshold that was applied.
report = pd.DataFrame({
    'token': eval_features.index,
    'threshold': median_threshold,
    'prediction_value': probabilities,
    'isTargetToken': predictions,
})
report_path = OUTPUT_DIR / 'detailed_predictions.csv'
report.to_csv(report_path, index=False)
print(f'Saved detailed predictions to {report_path}')

submission.head()


## 8. Export a competition-ready metric summary


In [None]:
# Persist the per-fold metrics and the pooled metrics at the median threshold as JSON.
summary_path = OUTPUT_DIR / 'validation_metrics.json'
summary_payload = {
    # One dict per fold, taken straight from fold_df.
    'fold_metrics': fold_df.to_dict(orient='records'),
    # Metrics computed on the stacked out-of-fold predictions.
    'aggregated': {
        'threshold': median_threshold,
        'accuracy': overall_acc,
        'precision': overall_precision,
        'recall': overall_recall,
        'f1': overall_f1,
        'jaccard': overall_jaccard,
        'auc': overall_auc,
    },
}
with open(summary_path, 'w') as fp:
    json.dump(summary_payload, fp, indent=2)
print(f'Validation summary stored at {summary_path}')
# Trailing bare expression displays the payload in the notebook.
summary_payload
