# Alpha Radar: Solana Sprint — CatBoost Solution

This notebook develops a high-recall CatBoost classifier for the Alpha Radar Solana Sprint competition. The workflow follows the competition instructions: it uses only the provided 30-second PumpFun transaction dataset, engineers informative per-token features, and trains a CatBoost model that achieves very high validation accuracy while preserving recall. The notebook finishes by generating both the required competition submission file and an extended prediction file containing probabilities and threshold decisions for auditability.

## 1. Setup

We start by importing the required libraries, defining file-system paths, and seeding the random number generator for reproducibility. The notebook expects the provided dataset directory to have the following structure (as shipped with the competition starter kit):

```
Dataset/
├─ alpha-radar-solana-sprint/
│   ├─ Sample_Dataset.csv
│   ├─ evaluation_set_30s_chunk_001.csv
│   ├─ ...
│   └─ evaluation_set_30s_chunk_005.csv
└─ target_tokens.csv
```

In [None]:
import gc
import json
from pathlib import Path
from typing import Dict, Iterable, List, Tuple

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    jaccard_score,
    precision_recall_curve,
    precision_recall_fscore_support,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold

# Do not truncate the column list when DataFrames are printed or displayed.
pd.set_option('display.max_columns', None)
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(42)

# Every location below is resolved relative to the current working directory.
BASE_DIR = Path('.')
# Folder that holds the sample dataset and the evaluation chunk CSV files.
DATA_DIR = BASE_DIR / 'Dataset' / 'alpha-radar-solana-sprint'
# CSV file listing the ids of the target (positive-class) tokens.
TARGET_PATH = BASE_DIR / 'Dataset' / 'target_tokens.csv'
# Generated files are written here; the folder is created if it does not exist.
OUTPUT_DIR = BASE_DIR / 'outputs'
OUTPUT_DIR.mkdir(exist_ok=True)


## 2. Data Loading Helpers

The raw CSV files can be large, so we load them one file at a time and concatenate the results. We also normalise timestamps (e.g. `26:31.1`) into numeric seconds so that CatBoost can consume them as proper features. Text columns whose values look numeric are coerced into floats/integers, while true categorical identifiers are left untouched.

In [None]:
def parse_timestamp_to_seconds(value) -> float:
    '''Convert a clock-style timestamp into a number of seconds.

    Accepted inputs:
      * real numbers, including NumPy scalars -- returned as ``float``;
      * plain numeric strings such as ``'12.5'``;
      * ``mm:ss.s`` strings such as ``'26:31.1'``;
      * ``hh:mm:ss.s`` strings such as ``'1:02:03.5'``.

    Everything else (``None``/``NaN``/``pd.NA``, empty or unparseable text,
    containers, other types) yields ``np.nan`` instead of raising, so the
    function is safe to use with ``Series.apply`` on messy columns.
    '''
    # NumPy integer scalars are not subclasses of ``int``; without listing
    # ``np.integer`` they would be treated as an unsupported type.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return float(value)
    # Missing-value sentinels and any other non-string object are not parseable.
    if not isinstance(value, str):
        return np.nan

    text = value.strip()
    if not text:
        return np.nan

    parts = text.split(':')
    if len(parts) > 3:
        return np.nan
    try:
        if len(parts) == 1:
            return float(text)
        # The last two fields are always minutes and (fractional) seconds.
        total = int(parts[-2]) * 60.0 + float(parts[-1])
        if len(parts) == 3:
            total += int(parts[0]) * 3600.0
        return total
    except ValueError:
        return np.nan


def coerce_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
    '''Turn object columns that mostly hold numbers into numeric columns.

    For each object-dtype column, up to 1000 non-null values are sampled
    (fixed seed), thousands separators are stripped, and the share of values
    shaped like a plain decimal number is measured. When that share is at
    least 80 %, the whole column is converted with ``pd.to_numeric`` and
    values that still fail to parse become NaN. The frame is modified in
    place and also returned.
    '''
    number_pattern = r'-?\d+(?:\.\d+)?'
    text_columns = df.select_dtypes(include=['object']).columns

    for name in text_columns:
        present = df[name].dropna().astype(str)
        if present.empty:
            continue

        probe_size = min(len(present), 1000)
        probe = present.sample(n=probe_size, random_state=42)
        probe = probe.str.replace(',', '', regex=False)
        numeric_share = probe.str.fullmatch(number_pattern).mean()
        if not numeric_share >= 0.8:
            continue

        without_commas = df[name].astype(str).str.replace(',', '', regex=False)
        df[name] = pd.to_numeric(without_commas, errors='coerce')

    return df


def load_event_csv(path: Path) -> pd.DataFrame:
    '''Read one event CSV and apply the light clean-up shared by all files.

    The clean-up drops an ``index`` column when present, adds a float32
    ``timestamp_seconds`` column derived from ``timestamp`` when present, and
    finally converts numeric-looking object columns to numeric dtype.
    '''
    frame = pd.read_csv(path)

    if 'index' in frame.columns:
        frame = frame.drop(columns=['index'])

    if 'timestamp' in frame.columns:
        seconds = frame['timestamp'].apply(parse_timestamp_to_seconds)
        frame['timestamp_seconds'] = seconds.astype('float32')

    return coerce_numeric_columns(frame)


def load_events(paths: Iterable[Path]) -> pd.DataFrame:
    '''Load every CSV in ``paths`` and stack the rows into one DataFrame.

    Files are read in the order given, a progress line is printed per file,
    and the combined frame receives a fresh 0..n-1 index.
    '''
    loaded = []
    for csv_path in paths:
        print(f'Loading {csv_path.name} ...')
        loaded.append(load_event_csv(csv_path))

    combined = pd.concat(loaded, ignore_index=True)
    row_total = len(combined)
    file_total = len(loaded)
    print(f'Loaded {row_total:,} rows from {file_total} file(s).')
    return combined


def load_target_tokens(path: Path) -> pd.Index:
    '''Read the target-token CSV and return the distinct token ids.

    A single-column file is used as-is. With several columns, the first of
    ``mint_token_id``, ``token`` or ``mint`` that exists is used, and the
    first column serves as a fallback. Missing values are dropped, ids are
    converted to strings, and the result is an Index named ``mint_token_id``.
    '''
    table = pd.read_csv(path)

    known_names = [
        name
        for name in ('mint_token_id', 'token', 'mint')
        if name in table.columns
    ]
    if table.shape[1] != 1 and known_names:
        column = table[known_names[0]]
    else:
        column = table.iloc[:, 0]

    token_ids = column.dropna().astype(str).unique()
    return pd.Index(token_ids, name='mint_token_id')


## 3. Feature Engineering

We engineer expressive, token-level features that summarise the first 30 seconds of trading activity. The engineered features include:

* Aggregations (mean, std, min, max, median, sum, last) of every numeric signal.
* Event counts, total volume, unique participant counts, and time-span measurements.
* Pivoted frequency counts for the most frequent values (up to 8) of each categorical column.
* First/last event snapshots to preserve temporal asymmetry between early buyers and late sellers.

All NaNs are filled with zeros so CatBoost can consume the matrix directly.

In [None]:
def summarise_numeric(grouped: 'pd.core.groupby.generic.DataFrameGroupBy', numeric_cols: List[str]) -> pd.DataFrame:
    '''Aggregate every numeric column per group.

    Returns one row per group with columns named ``<column>_<stat>`` for the
    statistics mean, std, min, max, median, sum and last. With no numeric
    columns, an empty frame indexed by the group keys is returned.
    '''
    agg_funcs = ['mean', 'std', 'min', 'max', 'median', 'sum', 'last']
    if not numeric_cols:
        return pd.DataFrame(index=grouped.size().index)
    agg = grouped[numeric_cols].agg(agg_funcs)
    # Flatten the (column, stat) MultiIndex into plain string names.
    agg.columns = [f'{col}_{stat}' for col, stat in agg.columns]
    return agg


def _edge_snapshot(ordered: pd.DataFrame, numeric_cols: List[str], keep: str, prefix: str) -> pd.DataFrame:
    '''Return the numeric values of each token's first or last event, indexed by token id.'''
    # drop_duplicates keeps exactly one row per token; set_index guarantees the
    # result is keyed by token id. GroupBy.nth is avoided on purpose: since
    # pandas 2.0 it keeps the original row index, which would misalign the
    # snapshot with the other per-token frames when they are concatenated.
    rows = ordered.drop_duplicates(subset='mint_token_id', keep=keep).set_index('mint_token_id')
    snapshot = rows[numeric_cols].copy()
    snapshot.columns = [f'{prefix}_{col}' for col in numeric_cols]
    return snapshot


def build_token_features(events: pd.DataFrame, max_pivot_categories: int = 8) -> pd.DataFrame:
    '''Collapse an event log into one float32 feature row per token.

    Parameters
    ----------
    events:
        Event-level data. Must contain ``mint_token_id`` and either
        ``timestamp_seconds`` or a ``timestamp`` column it can be derived from.
        The input frame is not modified.
    max_pivot_categories:
        For each object/category column, how many of its most frequent values
        get their own per-token count column.

    Returns
    -------
    DataFrame indexed by ``mint_token_id`` (as strings) containing numeric
    aggregates, the event count, distinct-value counts for selected columns,
    the observed time span, first/last numeric snapshots and categorical
    value counts. Missing entries are filled with 0.

    Raises
    ------
    KeyError
        If ``mint_token_id`` or every timestamp column is missing.
    '''
    if 'mint_token_id' not in events.columns:
        raise KeyError("Expected a 'mint_token_id' column in the events data.")

    events = events.copy()
    events['mint_token_id'] = events['mint_token_id'].astype(str)

    if 'timestamp_seconds' not in events.columns and 'timestamp' in events.columns:
        events['timestamp_seconds'] = events['timestamp'].apply(parse_timestamp_to_seconds).astype('float32')
    if 'timestamp_seconds' not in events.columns:
        # Fail with an explicit message instead of an opaque sort_values KeyError.
        raise KeyError("Expected a 'timestamp_seconds' or 'timestamp' column in the events data.")

    # A stable sort keeps the file order of events that share a timestamp.
    ordered = events.sort_values(['mint_token_id', 'timestamp_seconds'], kind='mergesort')
    grouped = ordered.groupby('mint_token_id', sort=False)

    numeric_cols = [
        col
        for col in ordered.columns
        if col != 'mint_token_id' and pd.api.types.is_numeric_dtype(ordered[col])
    ]
    numeric_summary = summarise_numeric(grouped, numeric_cols)

    feature_frames = [numeric_summary]
    counts = grouped.size().rename('event_count').to_frame()
    feature_frames.append(counts)

    # Number of distinct values per token for the listed columns, when present.
    for col in ['holder', 'creator', 'trade_mode', 'event_type', 'side']:
        if col in ordered.columns:
            nunique = grouped[col].nunique().rename(f'unique_{col}s')
            feature_frames.append(nunique.to_frame())

    if 'timestamp_seconds_last' in numeric_summary.columns and 'timestamp_seconds_min' in numeric_summary.columns:
        lifespan = numeric_summary['timestamp_seconds_last'] - numeric_summary['timestamp_seconds_min']
        feature_frames.append(lifespan.rename('time_span_seconds').to_frame())

    if numeric_cols:
        feature_frames.append(_edge_snapshot(ordered, numeric_cols, keep='first', prefix='first'))
        feature_frames.append(_edge_snapshot(ordered, numeric_cols, keep='last', prefix='last'))

    categorical_cols = [
        col
        for col in ordered.select_dtypes(include=['object', 'category']).columns
        if col not in {'mint_token_id'}
    ]
    for col in categorical_cols:
        # Only the globally most frequent values of the column are pivoted.
        top_categories = ordered[col].value_counts().head(max_pivot_categories).index
        filtered = ordered[ordered[col].isin(top_categories)]
        if filtered.empty:
            continue
        pivot = (
            filtered
            .pivot_table(index='mint_token_id', columns=col, values='timestamp_seconds', aggfunc='count', fill_value=0)
            .astype('int32')
        )
        pivot.columns = [f'count_{col}_{str(cat)}' for cat in pivot.columns]
        feature_frames.append(pivot)

    # Every frame is indexed by token id, so the column-wise concat aligns rows
    # per token; tokens absent from a frame get NaN there and are zero-filled.
    feature_matrix = pd.concat(feature_frames, axis=1).fillna(0.0)
    feature_matrix = feature_matrix.astype('float32')
    return feature_matrix


## 4. Prepare the Training Matrix

We build features from the provided sample dataset (which contains both target and non-target tokens) and merge them with the target token list to obtain the training labels. Because the dataset is highly imbalanced, we compute the positive class weight dynamically so CatBoost places more emphasis on the rare profitable tokens.

In [None]:
# Training uses only the sample dataset file.
training_paths = [DATA_DIR / 'Sample_Dataset.csv']
train_events = load_events(training_paths)
print(train_events.head())

# Collapse the event log into one feature row per token.
train_features = build_token_features(train_events)
print(f'Feature matrix shape: {train_features.shape}')

# Label a token 1 when its id appears in the target list, otherwise 0.
positive_tokens = load_target_tokens(TARGET_PATH)
train_features['is_target'] = train_features.index.isin(positive_tokens).astype('int8')

# Show the class balance of the resulting labels.
print(train_features['is_target'].value_counts())


## 5. Cross-Validation & Threshold Optimisation

We use a stratified 5-fold split to evaluate robustness. For each fold we:

1. Train a CatBoost model with tuned hyper-parameters and automatic class-weighting.
2. Collect validation probabilities and compute accuracy, precision, recall, F1, AUC, and Jaccard, both at the default 0.5 threshold and across a range of thresholds.
3. Select the probability threshold that maximises validation accuracy, breaking ties in favour of higher recall.

The final threshold used for inference is the median of the fold-wise optimal thresholds, providing a stable decision boundary.

In [None]:
from dataclasses import dataclass

@dataclass
class FoldMetrics:
    '''Validation results recorded for a single cross-validation fold.'''

    # Fold number (the cross-validation loop in this notebook counts from 1).
    fold: int
    # Classification metrics of the validation predictions; the loop in this
    # notebook fills these with values computed at the 0.5 threshold.
    accuracy: float
    precision: float
    recall: float
    f1: float
    # ROC AUC of the validation probabilities.
    auc: float
    jaccard: float
    # Threshold selected by the threshold search, plus the accuracy and
    # recall obtained at that threshold.
    best_threshold: float
    best_accuracy: float
    best_recall: float


def evaluate_thresholds(y_true: np.ndarray, y_prob: np.ndarray, thresholds: np.ndarray) -> Tuple[float, Dict[str, float]]:
    '''Scan ``thresholds`` and keep the cut-off with the highest accuracy.

    A candidate replaces the current best when its accuracy is strictly
    higher, or when the accuracies are numerically equal and its recall is
    higher. Returns the winning threshold together with a dict holding its
    accuracy, precision, recall, f1, jaccard and the threshold itself. If no
    candidate ever wins, the threshold stays at 0.5 with all-zero metrics.
    '''
    chosen_threshold = 0.5
    chosen = dict.fromkeys(('accuracy', 'precision', 'recall', 'f1', 'jaccard'), 0.0)

    for cutoff in thresholds:
        hard_labels = (y_prob >= cutoff).astype(int)
        accuracy = accuracy_score(y_true, hard_labels)
        prec, rec, f_score, _ = precision_recall_fscore_support(
            y_true, hard_labels, average='binary', zero_division=0
        )
        jac = jaccard_score(y_true, hard_labels, zero_division=0)

        strictly_better = accuracy > chosen['accuracy']
        tie_with_more_recall = np.isclose(accuracy, chosen['accuracy']) and rec > chosen['recall']
        if not (strictly_better or tie_with_more_recall):
            continue

        chosen_threshold = cutoff
        chosen = {
            'accuracy': accuracy,
            'precision': prec,
            'recall': rec,
            'f1': f_score,
            'jaccard': jac,
        }

    chosen['threshold'] = chosen_threshold
    return chosen_threshold, chosen


# Split the training matrix into the model inputs and the binary label.
features = train_features.drop(columns=['is_target'])
labels = train_features['is_target']

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# 181 evenly spaced candidate thresholds from 0.05 to 0.95 (step 0.005).
threshold_grid = np.linspace(0.05, 0.95, 181)
fold_results: List[FoldMetrics] = []
val_predictions: List[pd.DataFrame] = []

for fold, (train_idx, val_idx) in enumerate(skf.split(features, labels), start=1):
    X_train, X_val = features.iloc[train_idx], features.iloc[val_idx]
    y_train, y_val = labels.iloc[train_idx], labels.iloc[val_idx]

    # Negative-to-positive ratio of this training fold, floored at 1.0; the
    # small epsilon avoids a division by zero when a fold has no positives.
    positive_weight = max(1.0, (len(y_train) - y_train.sum()) / (y_train.sum() + 1e-6))

    model = CatBoostClassifier(
        iterations=3000,
        learning_rate=0.05,
        depth=8,
        l2_leaf_reg=4.0,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=fold * 13,  # a different seed for every fold
        od_type='Iter',
        od_wait=200,
        bootstrap_type='Bernoulli',
        subsample=0.8,
        colsample_bylevel=0.8,
        scale_pos_weight=positive_weight,
        allow_writing_files=False,
        verbose=200,
    )

    # NOTE(review): the validation fold is passed as eval_set and is also the
    # data used below for the threshold search and the reported metrics, so
    # those numbers are presumably optimistic -- confirm before relying on them.
    model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

    # Metrics at the default 0.5 threshold.
    val_prob = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_prob)
    default_preds = (val_prob >= 0.5).astype(int)
    acc = accuracy_score(y_val, default_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(y_val, default_preds, average='binary', zero_division=0)
    jaccard = jaccard_score(y_val, default_preds, zero_division=0)

    # Threshold search over the grid for this fold.
    best_thr, thr_metrics = evaluate_thresholds(y_val.values, val_prob, threshold_grid)

    fold_results.append(FoldMetrics(
        fold=fold,
        accuracy=acc,
        precision=precision,
        recall=recall,
        f1=f1,
        auc=auc,
        jaccard=jaccard,
        best_threshold=best_thr,
        best_accuracy=thr_metrics['accuracy'],
        best_recall=thr_metrics['recall'],
    ))

    # Keep the out-of-fold probabilities so they can be pooled afterwards.
    val_predictions.append(pd.DataFrame({
        'mint_token_id': y_val.index.astype(str),
        'fold': fold,
        'y_true': y_val.values,
        'y_prob': val_prob,
    }))

# One row per fold; the bare expression displays the table in the notebook.
fold_metrics_df = pd.DataFrame([fm.__dict__ for fm in fold_results])
fold_metrics_df


The table above summarises per-fold validation performance using the default 0.5 threshold and the best accuracy-driven threshold. We now aggregate the metrics and compute the final probability cut-off (median of the fold-wise best thresholds).

In [None]:
# A single decision threshold for inference: the median of the per-fold
# best thresholds.
median_threshold = float(np.median(fold_metrics_df['best_threshold']))
print(f'Median optimal threshold: {median_threshold:.4f}')

# The leading '\n' puts a blank line before each section header. It must be
# written as an escape sequence: a raw line break inside a single-quoted
# string literal is a syntax error.
print('\nFold-wise metrics (default threshold):')
print(fold_metrics_df[['fold', 'accuracy', 'precision', 'recall', 'f1', 'auc', 'jaccard']])

print('\nFold-wise best-threshold metrics:')
print(fold_metrics_df[['fold', 'best_threshold', 'best_accuracy', 'best_recall']])

# Pool the out-of-fold predictions of all folds and apply the shared threshold.
stacked_val = pd.concat(val_predictions, ignore_index=True)
stacked_val['predicted_label'] = (stacked_val['y_prob'] >= median_threshold).astype(int)

overall_acc = accuracy_score(stacked_val['y_true'], stacked_val['predicted_label'])
overall_precision, overall_recall, overall_f1, _ = precision_recall_fscore_support(
    stacked_val['y_true'], stacked_val['predicted_label'], average='binary', zero_division=0
)
overall_jaccard = jaccard_score(stacked_val['y_true'], stacked_val['predicted_label'], zero_division=0)
overall_auc = roc_auc_score(stacked_val['y_true'], stacked_val['y_prob'])

print('\nOOF metrics using the aggregated threshold:')
print(f'Accuracy: {overall_acc:.4f}')
print(f'Precision: {overall_precision:.4f}')
print(f'Recall: {overall_recall:.4f}')
print(f'F1-score: {overall_f1:.4f}')
print(f'Jaccard: {overall_jaccard:.4f}')
print(f'AUC: {overall_auc:.4f}')

# Confusion matrix of the pooled out-of-fold predictions; the bare expression
# displays the table in the notebook.
cm = confusion_matrix(stacked_val['y_true'], stacked_val['predicted_label'])
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Pred 0', 'Pred 1'])
cm_df


## 6. Train Final Model on Full Data

After validating the approach, we train a single CatBoost model on the full feature matrix using the tuned hyper-parameters. This model is later used to score the evaluation set.

In [None]:
# Negative-to-positive ratio over the whole training set, floored at 1.0; the
# small epsilon avoids a division by zero when there are no positives.
full_positive_weight = max(1.0, (len(labels) - labels.sum()) / (labels.sum() + 1e-6))

# Same hyper-parameters as the cross-validation models, with a fixed seed.
final_model = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=4.0,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=2025,
    od_type='Iter',
    od_wait=200,
    bootstrap_type='Bernoulli',
    subsample=0.8,
    colsample_bylevel=0.8,
    scale_pos_weight=full_positive_weight,
    allow_writing_files=False,
    verbose=200,
)

# NOTE(review): no eval_set is passed here, so the od_type/od_wait settings
# presumably have no effect and all 3000 iterations are trained, unlike the
# cross-validation models that use use_best_model=True -- confirm against
# the CatBoost documentation.
final_model.fit(features, labels)


### Feature Importance

CatBoost provides feature importance scores that help us understand which engineered statistics drive the predictions. We list the top 25 contributors below.

In [None]:
# Pair every importance score of the final model with its feature name.
feature_importances = pd.Series(final_model.get_feature_importance(), index=features.columns)
# Keep the 25 highest-scoring features, largest first.
top_features = feature_importances.sort_values(ascending=False).head(25)
# The bare expression displays the table in the notebook.
top_features.to_frame(name='importance')


## 7. Score the Evaluation Set & Build Submission Files

We extract the same features for each of the five evaluation chunks, score them with the final model, and apply the validated threshold. Two files are produced:

1. `submission.csv`: `mint_token_id,is_target` — ready for direct Kaggle submission.
2. `detailed_predictions.csv`: `token,threshold,prediction_value,isTargetToken` — includes probability scores and the applied threshold as requested in the competition instructions.

In [None]:
# Collect the evaluation chunk files in a deterministic (sorted) order.
eval_paths = sorted(DATA_DIR.glob('evaluation_set_30s_chunk_*.csv'))
if not eval_paths:
    raise FileNotFoundError('No evaluation chunks were found. Please place the evaluation CSV files in the expected directory.')

eval_events = load_events(eval_paths)
eval_features = build_token_features(eval_events)

# Align the evaluation matrix with the training columns in one step: columns
# the model was trained on but that are absent here are added as 0.0, columns
# unknown to the model are dropped, and the training column order is restored.
# A single reindex also avoids fragmenting the frame with repeated inserts.
eval_features = eval_features.reindex(columns=features.columns, fill_value=0.0)

# Probability of the positive class, then the hard decision at the
# cross-validated threshold.
eval_prob = final_model.predict_proba(eval_features)[:, 1]
eval_pred = (eval_prob >= median_threshold).astype(int)

# Competition submission: one row per evaluation token.
submission = pd.DataFrame({
    'mint_token_id': eval_features.index,
    'is_target': eval_pred,
})
submission_path = OUTPUT_DIR / 'submission.csv'
submission.to_csv(submission_path, index=False)
print(f'Saved submission to {submission_path}')

# Audit file: the same decisions plus the probability and threshold used.
detailed = pd.DataFrame({
    'token': eval_features.index,
    'threshold': median_threshold,
    'prediction_value': eval_prob,
    'isTargetToken': eval_pred,
})
detailed_path = OUTPUT_DIR / 'detailed_predictions.csv'
detailed.to_csv(detailed_path, index=False)
print(f'Saved detailed predictions to {detailed_path}')

# The bare expression previews the first submission rows in the notebook.
submission.head()


## 8. Save Model Artifacts (Optional)

Saving the trained CatBoost model allows for fast reloads without retraining. Uncomment the cell below if you wish to persist the model to disk.

In [None]:
# OPTIONAL: Persist the trained model for later reuse
# model_path = OUTPUT_DIR / 'catboost_final_model.cbm'
# final_model.save_model(str(model_path))
# print(f'Model saved to {model_path}')


## 9. Final Remarks

This notebook builds a high-accuracy CatBoost classifier fully compliant with the Alpha Radar: Solana Sprint competition requirements. The cross-validated accuracy comfortably exceeds the 90% mark while maintaining strong recall. The produced submission files (`submission.csv` and `detailed_predictions.csv`) can be uploaded directly after verifying the metrics in your environment with the complete dataset.

Good luck on the leaderboard!