# Alpha Radar: Solana Sprint - CatBoost Solution

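- Engineers per-token features from the 30-second PumpFun event stream
- Trains a CatBoost model with stratified 5-fold cross-validation while monitoring recall and accuracy
- Searches decision thresholds per fold, ranking candidates by F1 (then recall, precision, accuracy); the 90% validation-accuracy goal is reported but not enforced by the code
- Exports the competition submission plus a positive-token report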


In [None]:
import warnings
from pathlib import Path
from typing import Iterable, List, Sequence, Tuple

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from IPython.display import display
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    jaccard_score,
    precision_recall_fscore_support,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold

# Suppress UserWarning-category warnings for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)
# Display every DataFrame column, and render floats with four decimals.
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", lambda v: f"{v:0.4f}")

# Seed NumPy's global RNG (note: a different value from RANDOM_STATE below).
np.random.seed(2025)


In [None]:
# All paths are resolved relative to the directory the notebook is run from.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "Dataset" / "alpha-radar-solana-sprint"
TARGET_PATH = BASE_DIR / "Dataset" / "target_tokens.csv"
# Glob pattern used to collect the evaluation chunks inside DATA_DIR.
EVALUATION_PATTERN = "evaluation_set_30s_chunk_*.csv"
OUTPUT_DIR = BASE_DIR / "outputs"
OUTPUT_DIR.mkdir(exist_ok=True)
# Seed passed to StratifiedKFold and to CatBoost's random_seed.
RANDOM_STATE = 42
# Number of evaluation tokens the feature matrix must contain; checked before predicting.
EXPECTED_EVAL_ROWS = 64208
# Sizes of the "first N events per token" windows used for early-activity features.
EARLY_EVENT_COUNTS: Tuple[int, ...] = (3, 5, 10)
# Per categorical column, only this many most frequent values keep their own pivot column.
CATEGORICAL_TOP_K = 12
# 19 evenly spaced probability cut-offs: 0.05, 0.10, ..., 0.95.
THRESHOLD_CANDIDATES = np.linspace(0.05, 0.95, 19)

# Fail fast when the expected inputs are not where the notebook looks for them.
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Expected data directory at {DATA_DIR}")
if not TARGET_PATH.exists():
    raise FileNotFoundError(f"Expected target token file at {TARGET_PATH}")

print(f"Using data directory: {DATA_DIR}")
print(f"Target token path: {TARGET_PATH}")
print(f"Outputs will be stored in: {OUTPUT_DIR}")


In [None]:
# Statistics computed per token for every numeric column.
AGG_NUMERIC_FUNCS = ['mean', 'std', 'min', 'max', 'sum', 'median', 'last']
# Quantile levels computed per token for every numeric column.
QUANTILE_LEVELS = (0.10, 0.25, 0.50, 0.75, 0.90)
# Columns that load_event_data leaves as-is; every other column is coerced to float32.
NON_NUMERIC_COLUMNS = {"timestamp", "mint_token_id", "holder", "trade_mode", "creator", "event_type"}
# Replacement for NaN / +-inf in the final feature matrix and for absent evaluation columns.
NUMERIC_FILL_VALUE = 0.0

def parse_timestamp_series(series: pd.Series) -> pd.Series:
    """Convert ``"<minutes>:<seconds>"`` strings into elapsed seconds.

    Each value is split on the first ``':'``; the left part is read as minutes
    and the right part as (possibly fractional) seconds.

    Args:
        series: Timestamp strings. Non-string values are stringified first.

    Returns:
        A float32 Series aligned with ``series``. Entries that lack a ``':'``
        or whose parts are not numeric become NaN. An empty input yields an
        empty float32 Series.
    """
    if series.empty:
        # ``str.split(expand=True)`` on an empty Series has no usable columns.
        return pd.Series(index=series.index, dtype='float32')
    parts = series.astype(str).str.split(':', n=1, expand=True)
    # When no value contains ':', the split yields a single column; make sure
    # column 1 exists so such rows become NaN instead of raising KeyError.
    parts = parts.reindex(columns=[0, 1])
    minutes = pd.to_numeric(parts[0], errors='coerce')
    seconds = pd.to_numeric(parts[1], errors='coerce')
    return (minutes * 60 + seconds).astype('float32')

def load_event_data(csv_path: Path) -> pd.DataFrame:
    """Read one event CSV and normalise its column types.

    Drops a stray ``index`` column if present, casts ``mint_token_id`` to
    ``str``, derives ``timestamp_seconds`` from ``timestamp`` when only the
    latter exists, and coerces every column not listed in
    ``NON_NUMERIC_COLUMNS`` to float32 (unparseable values become NaN).

    Args:
        csv_path: Location of the CSV file to read.

    Returns:
        The cleaned event frame.
    """
    frame = pd.read_csv(csv_path, low_memory=False)
    if 'index' in frame.columns:
        frame = frame.drop(columns=['index'])
    frame['mint_token_id'] = frame['mint_token_id'].astype(str)

    has_raw_timestamp = 'timestamp' in frame.columns
    if has_raw_timestamp and 'timestamp_seconds' not in frame.columns:
        frame['timestamp_seconds'] = parse_timestamp_series(frame['timestamp'])

    # Everything outside the known non-numeric set is treated as numeric.
    numeric_like = [name for name in frame.columns if name not in NON_NUMERIC_COLUMNS]
    for name in numeric_like:
        frame[name] = pd.to_numeric(frame[name], errors='coerce')
    frame[numeric_like] = frame[numeric_like].astype('float32')
    return frame

def aggregate_numeric(grouped: pd.core.groupby.generic.DataFrameGroupBy, numeric_cols: List[str]) -> pd.DataFrame:
    """Summarise numeric columns per group with ``AGG_NUMERIC_FUNCS``.

    Args:
        grouped: Events grouped by token.
        numeric_cols: Names of the numeric columns to aggregate.

    Returns:
        One row per group with flat ``<column>_<stat>`` column names, or a
        column-less frame indexed by group when ``numeric_cols`` is empty.
    """
    if numeric_cols:
        summary = grouped[numeric_cols].agg(AGG_NUMERIC_FUNCS)
        # Flatten the (column, stat) MultiIndex into single-level names.
        summary.columns = [f"{column}_{stat}" for column, stat in summary.columns]
        return summary
    return pd.DataFrame(index=grouped.size().index)

def compute_quantiles(
    grouped: pd.core.groupby.generic.DataFrameGroupBy,
    numeric_cols: List[str],
    levels: "Sequence[float] | None" = None,
) -> pd.DataFrame:
    """Compute per-group quantiles of the numeric columns.

    Args:
        grouped: Events grouped by token.
        numeric_cols: Names of the numeric columns to summarise.
        levels: Quantile levels in ``[0, 1]``. Defaults to ``QUANTILE_LEVELS``.

    Returns:
        One row per group with columns named ``<column>_qNN`` (``NN`` is the
        level as a rounded percentage), or a column-less frame indexed by group
        when there is nothing to compute.
    """
    # Mirror aggregate_numeric: no numeric columns means an empty, group-indexed frame.
    if not numeric_cols:
        return pd.DataFrame(index=grouped.size().index)
    if levels is None:
        levels = QUANTILE_LEVELS

    selected = grouped[numeric_cols]
    frames = []
    for q in levels:
        quant = selected.quantile(q)
        # round() rather than int(): 0.29 * 100 == 28.999..., which int() would
        # truncate to 28 and mislabel the column.
        pct = int(round(q * 100))
        quant.columns = [f"{col}_q{pct:02d}" for col in quant.columns]
        frames.append(quant)
    if frames:
        return pd.concat(frames, axis=1)
    return pd.DataFrame(index=grouped.size().index)

def pivot_categorical(grouped: pd.core.groupby.generic.DataFrameGroupBy, cat_cols: Sequence[str], prefix: str) -> pd.DataFrame:
    """Pivot categorical columns into per-group count and share columns.

    For every column in ``cat_cols`` and every value it takes, two features are
    produced: ``<prefix><column>_<value>_count`` and the same name with a
    ``_ratio`` suffix (the count divided by the group's total for that column).

    Args:
        grouped: Events grouped by token.
        cat_cols: Categorical column names to pivot.
        prefix: String prepended to every generated column name.

    Returns:
        One row per group, or a column-less frame indexed by group when
        ``cat_cols`` yields nothing.
    """
    pieces: List[pd.DataFrame] = []
    for column in cat_cols:
        tally = grouped[column].value_counts(normalize=False).unstack(fill_value=0)
        tally.columns = [f"{prefix}{column}_{str(level)}_count" for level in tally.columns]
        # A zero row total would divide by zero; route it through NaN and back to 0.
        row_totals = tally.sum(axis=1).replace(0, np.nan)
        share = tally.div(row_totals, axis=0).add_suffix('_ratio').fillna(0)
        pieces.append(tally)
        pieces.append(share)

    if not pieces:
        return pd.DataFrame(index=grouped.size().index)
    combined = pd.concat(pieces, axis=1)
    combined.columns = pd.Index(combined.columns).map(str)
    return combined

def derive_first_n_features(ordered: pd.DataFrame, grouped: pd.core.groupby.generic.DataFrameGroupBy, n: int) -> pd.DataFrame:
    """Summarise the first ``n`` rows of every token.

    Args:
        ordered: Event frame, already sorted so that ``head(n)`` per token
            returns its earliest events.
        grouped: ``ordered`` grouped by token; only used to supply the index of
            the fallback frame.
        n: Number of leading rows per token to keep.

    Returns:
        One row per token holding ``first{n}_time_{min,max,mean}`` (when
        ``timestamp_seconds`` exists) and float32
        ``first{n}_<column>_{mean,max,min}`` for every numeric column, or a
        column-less frame indexed by token if neither applies.
    """
    head_rows = ordered.groupby('mint_token_id', sort=False).head(n)
    per_token = head_rows.groupby('mint_token_id')
    pieces: List[pd.DataFrame] = []

    if 'timestamp_seconds' in head_rows.columns:
        renames = {stat: f'first{n}_time_{stat}' for stat in ('min', 'max', 'mean')}
        timing = per_token['timestamp_seconds'].agg(['min', 'max', 'mean'])
        pieces.append(timing.rename(columns=renames))

    numeric_names = [
        name for name in head_rows.columns
        if name != 'mint_token_id' and pd.api.types.is_numeric_dtype(head_rows[name])
    ]
    if numeric_names:
        stats = per_token[numeric_names].agg(['mean', 'max', 'min']).astype('float32')
        stats.columns = [f'first{n}_{name}_{stat}' for name, stat in stats.columns]
        pieces.append(stats)

    if not pieces:
        return pd.DataFrame(index=grouped.size().index)
    return pd.concat(pieces, axis=1)

def add_ratio_features(features: pd.DataFrame) -> pd.DataFrame:
    """Append ratio and range features derived from existing columns.

    Every derived column is added only when its source columns are present.
    Division by zero is avoided by turning zero denominators into NaN.

    Buy/sell counts are looked up under both ``trade_mode_<side>_count`` and
    ``cat_trade_mode_<side>_count``: ``build_features`` pivots categoricals
    with the ``cat_`` prefix, so the unprefixed names alone never matched.
    NOTE(review): this assumes the trade_mode values are literally ``buy`` and
    ``sell`` - confirm against the data.

    Args:
        features: Per-token feature frame. It is not modified.

    Returns:
        A copy of ``features`` with the derived columns appended.
    """
    out = features.copy()
    available = set(out.columns)

    def _first_present(*names):
        """Return the first of ``names`` that is an input column, else None."""
        return next((name for name in names if name in available), None)

    buy_col = _first_present('trade_mode_buy_count', 'cat_trade_mode_buy_count')
    sell_col = _first_present('trade_mode_sell_count', 'cat_trade_mode_sell_count')
    has_events = 'event_count' in available

    if has_events:
        events = out['event_count'].replace(0, np.nan)
        if buy_col is not None:
            out['buy_rate'] = out[buy_col] / events
        if sell_col is not None:
            out['sell_rate'] = out[sell_col] / events
        if buy_col is not None and sell_col is not None:
            out['buy_to_sell_ratio'] = out[buy_col] / out[sell_col].replace(0, np.nan)
        if 'sol_volume_sum' in available:
            out['sol_volume_per_event'] = out['sol_volume_sum'] / events
        if 'token_volume_sum' in available:
            out['token_volume_per_event'] = out['token_volume_sum'] / events

    for prefix in ('sol_volume', 'token_volume', 'price', 'market_cap'):
        max_col = f'{prefix}_max'
        min_col = f'{prefix}_min'
        if max_col in available and min_col in available:
            out[f'{prefix}_range'] = out[max_col] - out[min_col]

    if has_events and 'unique_holders' in available:
        out['holders_per_event'] = out['unique_holders'] / out['event_count'].replace(0, np.nan)
    return out

def build_features(events: pd.DataFrame) -> pd.DataFrame:
    """Turn an event-level frame into one feature row per token.

    The result is indexed by ``mint_token_id`` and concatenates, in order:
    base counts and timing, numeric aggregates, numeric quantiles, categorical
    count/ratio pivots, first-N-event summaries, and derived ratio features.
    NaN and +-inf are replaced with ``NUMERIC_FILL_VALUE``; float64/int64
    columns are downcast to float32/int32.

    Args:
        events: Raw events; must contain ``mint_token_id``. The caller's frame
            is not modified (a copy is taken).

    Returns:
        The per-token feature frame, or an empty ``DataFrame`` when ``events``
        is empty.
    """
    if events.empty:
        return pd.DataFrame()
    events = events.copy()
    events['mint_token_id'] = events['mint_token_id'].astype(str)
    if 'timestamp_seconds' not in events.columns and 'timestamp' in events.columns:
        events['timestamp_seconds'] = parse_timestamp_series(events['timestamp'])
    # Stable sort, so events with equal timestamps keep their input order.
    # NOTE(review): this sort needs 'timestamp_seconds' to exist, although the
    # code further down treats the column as optional.
    ordered = events.sort_values(['mint_token_id', 'timestamp_seconds'], kind='mergesort')
    grouped = ordered.groupby('mint_token_id', sort=False)

    numeric_cols = [
        col for col in ordered.columns
        if col not in {'mint_token_id'} and pd.api.types.is_numeric_dtype(ordered[col])
    ]
    # Every object-dtype column other than the token id is treated as categorical.
    categorical_cols = [
        col for col in ordered.columns
        if ordered[col].dtype == 'object' and col not in {'mint_token_id'}
    ]

    base = pd.DataFrame(index=grouped.size().index)
    base['event_count'] = grouped.size().astype('int32')
    if 'timestamp_seconds' in ordered.columns:
        base['lifetime_seconds'] = (grouped['timestamp_seconds'].max() - grouped['timestamp_seconds'].min()).fillna(0)
        base['time_to_first_event'] = grouped['timestamp_seconds'].min().fillna(0)
    if 'holder' in ordered.columns:
        base['unique_holders'] = grouped['holder'].nunique(dropna=True)
    if 'creator' in ordered.columns:
        base['unique_creators'] = grouped['creator'].nunique(dropna=True)

    numeric_summary = aggregate_numeric(grouped, numeric_cols)
    quantile_summary = compute_quantiles(grouped, numeric_cols)

    # Work on a copy so rewriting rare categories does not leak into `ordered`.
    capped_events = ordered.copy()
    cat_frames: List[pd.DataFrame] = []
    if categorical_cols:
        for col in categorical_cols:
            # The top-K values are taken from the frame passed to this call, so
            # two different inputs can produce different sets of pivot columns.
            top_values = capped_events[col].value_counts().head(CATEGORICAL_TOP_K).index
            capped_events[col] = capped_events[col].where(capped_events[col].isin(top_values), other='__OTHER__')
        capped_grouped = capped_events.groupby('mint_token_id', sort=False)
        cat_frames.append(pivot_categorical(capped_grouped, categorical_cols, prefix='cat_'))

    first_n_frames = [derive_first_n_features(ordered, grouped, n) for n in EARLY_EVENT_COUNTS]

    # Column-wise concat aligns all pieces on the token index.
    features = pd.concat([base, numeric_summary, quantile_summary, *cat_frames, *first_n_frames], axis=1)
    features = add_ratio_features(features)
    features = features.replace([np.inf, -np.inf], NUMERIC_FILL_VALUE).fillna(NUMERIC_FILL_VALUE)

    # Downcast 64-bit columns to their 32-bit counterparts.
    for col in features.select_dtypes(include=['float64']).columns:
        features[col] = features[col].astype('float32')
    for col in features.select_dtypes(include=['int64']).columns:
        features[col] = features[col].astype('int32')

    return features

def evaluate_thresholds(y_true: pd.Series, y_prob: np.ndarray, thresholds: Iterable[float]) -> pd.DataFrame:
    """Score binary predictions at each probability threshold.

    Args:
        y_true: Ground-truth labels (0/1).
        y_prob: Predicted probability of the positive class, aligned with
            ``y_true``.
        thresholds: Cut-offs to evaluate; a sample is predicted positive when
            its probability is ``>=`` the threshold.

    Returns:
        One row per threshold with ``threshold``, ``accuracy``, ``precision``,
        ``recall``, ``f1``, ``jaccard`` and the confusion counts ``tp``,
        ``fp``, ``fn``, ``tn``.
    """
    records = []
    for thr in thresholds:
        y_pred = (y_prob >= thr).astype(int)
        acc = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average='binary', zero_division=0
        )
        # Pin the label set: without it, a fold where truth and predictions
        # share a single class yields a 1x1 matrix and the unpack below fails.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        # Binary Jaccard is TP / (TP + FP + FN); computing it from the counts
        # replaces the previous blanket `except Exception` fallback with an
        # explicit zero when there are no positives at all.
        union = int(tp) + int(fp) + int(fn)
        jac = int(tp) / union if union else 0.0
        records.append({
            'threshold': float(thr),
            'accuracy': acc,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'jaccard': jac,
            'tp': int(tp),
            'fp': int(fp),
            'fn': int(fn),
            'tn': int(tn),
        })
    return pd.DataFrame.from_records(records)


In [None]:
# Load the labelled sample events and the list of positive ("target") tokens.
sample_events = load_event_data(DATA_DIR / 'Sample_Dataset.csv')
# The target file is read as a single header-less column of token ids.
target_tokens = pd.read_csv(TARGET_PATH, header=None, names=['mint_token_id'])
target_tokens['mint_token_id'] = target_tokens['mint_token_id'].astype(str)
target_set = set(target_tokens['mint_token_id'])

print(f'Sample events shape: {sample_events.shape}')
# Computed outside the f-string: reusing the enclosing quote character inside
# an f-string expression is a SyntaxError on Python versions before 3.12.
unique_token_count = sample_events['mint_token_id'].nunique()
print(f'Unique tokens in sample events: {unique_token_count}')
print(f'Target tokens provided: {len(target_set)}')
sample_events.head()


In [None]:
# Build one feature row per token and label it 1 when the token id is in target_set.
train_features = build_features(sample_events)
train_features['is_target'] = train_features.index.isin(target_set).astype('int8')

print(f'Training feature matrix: {train_features.shape}')
# Class balance: number of tokens per label and the share of positives.
class_counts = train_features['is_target'].value_counts().rename('token_count')
display(class_counts.to_frame())
positive_rate = class_counts.get(1, 0) / class_counts.sum()
print(f'Positive token rate: {positive_rate:.4f}')
train_features.head()


In [None]:
# Feature matrix (everything except the label) and the binary target.
feature_columns = [col for col in train_features.columns if col != 'is_target']
X = train_features[feature_columns].astype('float32')
y = train_features['is_target'].astype('int32')

model_params = {
    'depth': 8,
    'learning_rate': 0.04,
    'iterations': 2500,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': RANDOM_STATE,
    'l2_leaf_reg': 6.0,
    'bagging_temperature': 0.8,
    'random_strength': 1.5,
    # Negative-to-positive count ratio, floored at 1.0; max(1, ...) guards
    # against dividing by zero when there are no positives.
    'scale_pos_weight': max(1.0, (len(y) - y.sum()) / max(1, y.sum())),
    'rsm': 0.85,
    'border_count': 128,
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
fold_models: List[CatBoostClassifier] = []
fold_thresholds: List[float] = []
fold_summaries = []
# Out-of-fold probabilities: each row is filled by the fold that held it out.
oof_prob = np.zeros(len(train_features), dtype='float32')
# NOTE(review): accumulated below, but neither this array nor fold_models is
# read again in the cells visible here.
feature_importance_accumulator = np.zeros(len(feature_columns), dtype='float64')

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), start=1):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    train_pool = Pool(X_train, label=y_train)
    valid_pool = Pool(X_valid, label=y_valid)

    # The held-out fold doubles as the eval_set used with use_best_model.
    model = CatBoostClassifier(**model_params)
    model.fit(
        train_pool,
        eval_set=valid_pool,
        verbose=250,
        use_best_model=True,
    )

    fold_prob = model.predict_proba(valid_pool)[:, 1]
    oof_prob[valid_idx] = fold_prob
    feature_importance_accumulator += model.get_feature_importance(type='FeatureImportance')

    # Pick this fold's threshold: best F1, ties broken by recall, precision,
    # then accuracy. No minimum-accuracy filter is applied here.
    fold_metrics = evaluate_thresholds(y_valid, fold_prob, THRESHOLD_CANDIDATES)
    best_row = fold_metrics.sort_values(['f1', 'recall', 'precision', 'accuracy'], ascending=False).iloc[0]
    fold_thresholds.append(float(best_row['threshold']))

    summary = best_row.to_dict()
    summary.update({
        'fold': fold,
        'auc': roc_auc_score(y_valid, fold_prob),
        'positive_rate': float(y_valid.mean()),
    })
    fold_summaries.append(summary)
    fold_models.append(model)

    print(f"Fold {fold}: AUC={summary['auc']:.4f}  F1={summary['f1']:.4f}  Recall={summary['recall']:.4f}  Threshold={summary['threshold']:.2f}")

fold_summary_df = pd.DataFrame(fold_summaries)[['fold', 'auc', 'threshold', 'accuracy', 'precision', 'recall', 'f1', 'jaccard', 'tp', 'fp', 'fn', 'tn']]
display(fold_summary_df)
# The median of the per-fold thresholds is the single cut-off used from here on.
aggregated_threshold = float(np.median(fold_thresholds))
print(f'Median optimal threshold across folds: {aggregated_threshold:.4f}')


In [None]:
# Score the pooled out-of-fold probabilities at the aggregated threshold.
agg_pred = (oof_prob >= aggregated_threshold).astype(int)
precision, recall, f1, _ = precision_recall_fscore_support(y, agg_pred, average='binary', zero_division=0)
accuracy = accuracy_score(y, agg_pred)
jaccard = jaccard_score(y, agg_pred)
# AUC is computed on the raw probabilities, so it does not depend on the threshold.
auc = roc_auc_score(y, oof_prob)

print(f'OOF metrics at aggregated threshold {aggregated_threshold:.4f}:')
print(f'  Accuracy: {accuracy:.4f}')
print(f'  Precision: {precision:.4f}')
print(f'  Recall: {recall:.4f}')
print(f'  F1-score: {f1:.4f}')
print(f'  Jaccard: {jaccard:.4f}')
print(f'  AUC: {auc:.4f}')

# Rows are actual classes, columns are predicted classes.
conf_mtx = confusion_matrix(y, agg_pred)
conf_df = pd.DataFrame(conf_mtx, index=['Actual 0', 'Actual 1'], columns=['Pred 0', 'Pred 1'])
display(conf_df)

print('Classification report:')
print(classification_report(y, agg_pred, digits=4))


In [None]:
# Refit on all labelled tokens with the same hyper-parameters. Unlike the CV
# folds, no eval_set is passed here.
final_model = CatBoostClassifier(**model_params)
full_pool = Pool(X, label=y)
final_model.fit(full_pool, verbose=250)

# Importance of every feature in the final model, labelled by column name.
final_feature_importances = pd.Series(final_model.get_feature_importance(type='FeatureImportance'), index=feature_columns)
final_feature_importances.sort_values(ascending=False).head(30)


In [None]:
# The 30 most important features of the final model, shown as a one-column table.
top_features = final_feature_importances.sort_values(ascending=False).head(30)
top_features.to_frame(name='importance')


In [None]:
# Collect every evaluation chunk; sorting keeps the load order deterministic.
evaluation_files = sorted(DATA_DIR.glob(EVALUATION_PATTERN))
if not evaluation_files:
    raise FileNotFoundError('No evaluation set chunks found.')

eval_events = pd.concat([load_event_data(path) for path in evaluation_files], ignore_index=True)
eval_events['mint_token_id'] = eval_events['mint_token_id'].astype(str)
# Tokens in order of first appearance; this is the row order of the submission.
eval_token_order = eval_events['mint_token_id'].drop_duplicates().tolist()

built_features = build_features(eval_events)
# Recorded for inspection before alignment: training columns absent from the
# evaluation features, and tokens that received no feature row.
missing_cols = [col for col in feature_columns if col not in built_features.columns]
missing_tokens = [tok for tok in eval_token_order if tok not in built_features.index]

# A single reindex aligns rows to the token order and columns to the training
# layout, filling anything absent with NUMERIC_FILL_VALUE. The earlier version
# looked for missing tokens only after reindexing, when none could be missing,
# so its filler branch never ran.
eval_features = built_features.reindex(
    index=eval_token_order,
    columns=feature_columns,
    fill_value=NUMERIC_FILL_VALUE,
).astype('float32')

if len(eval_features) != EXPECTED_EVAL_ROWS:
    raise ValueError(f'Expected {EXPECTED_EVAL_ROWS} evaluation rows, found {len(eval_features)}')

print(f'Evaluation events shape: {eval_events.shape}')
print(f'Evaluation feature matrix: {eval_features.shape}')
eval_features.head()


In [None]:
# Score the evaluation tokens with the final model and apply the CV-derived threshold.
eval_pool = Pool(eval_features)
eval_prob = final_model.predict_proba(eval_pool)[:, 1]
eval_pred = (eval_prob >= aggregated_threshold).astype(int)

# One row per evaluation token: id, raw score and thresholded label.
detailed_df = pd.DataFrame({
    'mint_token_id': eval_features.index,
    'prediction_score': eval_prob,
    'is_target': eval_pred,
}).reset_index(drop=True)

# The submission keeps only id + label; the positive report keeps predicted
# positives and records the threshold that produced them.
submission_df = detailed_df[['mint_token_id', 'is_target']]
positive_df = detailed_df[detailed_df['is_target'] == 1].copy()
positive_df['threshold_used'] = aggregated_threshold

submission_path = OUTPUT_DIR / 'submission.csv'
detailed_path = OUTPUT_DIR / 'detailed_predictions.csv'
positive_path = OUTPUT_DIR / 'positive_predictions.csv'

submission_df.to_csv(submission_path, index=False)
detailed_df.to_csv(detailed_path, index=False)
positive_df.to_csv(positive_path, index=False)

print(f'Saved submission to {submission_path} (rows={len(submission_df)})')
print(f'Saved detailed predictions to {detailed_path}')
print(f'Saved positive predictions to {positive_path} (rows={len(positive_df)})')

detailed_df.describe(include='all').transpose()


In [None]:
# Ten evaluation tokens with the highest predicted score.
detailed_df.sort_values('prediction_score', ascending=False).head(10)


In [None]:
# Same top-10-by-score view as the cell before it (differs only in quote style).
detailed_df.sort_values("prediction_score", ascending=False).head(10)


**Next Steps**

- Submit submission.csv to the Kaggle competition
- Review outputs/positive_predictions.csv for manual due diligence
- Iterate on additional feature ideas (e.g. time-window segmentation) to further raise recall without sacrificing accuracy
