# Alpha Radar: Solana Sprint - CatBoost Solution

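This notebook:

- engineers per-token features from the 30-second PumpFun event stream
- trains a CatBoost classifier while monitoring validation ROC-AUC, recall, and accuracy
- searches decision thresholds, preferring the best F1 among those that keep validation accuracy at or above 90% and recall at or above 60%
- exports the competition submission, a detailed prediction file, and a positive-token report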


In [None]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from IPython.display import display
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    jaccard_score,
    precision_recall_fscore_support,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# Notebook-wide setup: silence UserWarning noise, show every DataFrame column,
# and render floats with four decimal places.
warnings.filterwarnings(action="ignore", category=UserWarning)
for _option_name, _option_value in (
    ("display.max_columns", None),
    ("display.float_format", "{:0.4f}".format),
):
    pd.set_option(_option_name, _option_value)


In [None]:
# Run-wide constants: the seed shared by the split and the model, and the glob
# pattern used to find the evaluation chunks.
RANDOM_STATE = 42
EVALUATION_PATTERN = "evaluation_set_30s_chunk_*.csv"

# All inputs are resolved relative to the current working directory.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "Dataset" / "alpha-radar-solana-sprint"
TARGET_PATH = BASE_DIR / "Dataset" / "target_tokens.csv"

# Fail fast, naming the missing path, before any expensive work starts.
for _description, _required_path in (
    ("data directory", DATA_DIR),
    ("target token file", TARGET_PATH),
):
    if not _required_path.exists():
        raise FileNotFoundError(f"Expected {_description} at {_required_path}")

print(f"Using data directory: {DATA_DIR}")


In [None]:
# Columns that load_event_data leaves as read; every other column is coerced
# to a float32 number.
NON_NUMERIC_COLUMNS = {
    "creator",
    "holder",
    "mint_token_id",
    "timestamp",
    "trade_mode",
}

def parse_timestamp_series(series: pd.Series) -> pd.Series:
    """Convert ``"minutes:seconds"`` strings into elapsed seconds.

    Each value is split at its first colon; the left part is read as minutes
    and the right part as seconds. Any value whose parts are not both numeric
    (including a value with no colon at all) becomes NaN.

    Args:
        series: Timestamp values; they are converted to ``str`` before parsing.

    Returns:
        A float32 series, indexed like ``series``, holding
        ``minutes * 60 + seconds``.
    """
    parts = series.astype(str).str.split(":", n=1, expand=True)
    # The split yields only as many columns as the longest row needs, so the
    # seconds column (or both columns, for an empty series) may be absent.
    # Reindexing guarantees both exist, filled with NaN where missing.
    parts = parts.reindex(columns=[0, 1])
    minutes = pd.to_numeric(parts[0], errors="coerce")
    seconds = pd.to_numeric(parts[1], errors="coerce")
    return (minutes * 60 + seconds).astype("float32")

def load_event_data(csv_path: Path) -> pd.DataFrame:
    """Read one event CSV and normalise its column types.

    Args:
        csv_path: Location of the CSV file to load.

    Returns:
        The events with any ``index`` column removed, a ``timestamp_seconds``
        column derived from ``timestamp``, and every column not listed in
        ``NON_NUMERIC_COLUMNS`` coerced to float32 (unparseable cells become
        NaN).
    """
    frame = pd.read_csv(csv_path)
    # Discard the serialised row index if the file carries one.
    frame = frame.drop(columns=["index"], errors="ignore")
    frame["timestamp_seconds"] = parse_timestamp_series(frame["timestamp"])

    to_cast = [name for name in frame.columns if name not in NON_NUMERIC_COLUMNS]
    coerced = frame[to_cast].apply(pd.to_numeric, errors="coerce")
    frame[to_cast] = coerced.astype("float32")
    return frame

def build_features(events: pd.DataFrame) -> pd.DataFrame:
    """Aggregate raw events into one float32 feature row per token.

    Args:
        events: Event rows with at least ``mint_token_id``,
            ``timestamp_seconds``, ``holder``, ``creator`` and ``trade_mode``
            columns, plus any further numeric columns.

    Returns:
        A frame indexed by ``mint_token_id`` that contains, in order:
        mean/std/min/max/last of every numeric column, ``event_count``,
        ``unique_holders``, ``unique_creators``, one
        ``trade_mode_<mode>_count`` column per observed trade mode (buy and
        sell always present), ``buy_share``, ``sell_share``, ``net_buy``, and,
        when their inputs exist, ``active_duration``,
        ``sol_volume_last_to_mean`` and ``token_volume_last_to_mean``.
        NaN and infinite values are replaced by 0.
    """
    token_key = "mint_token_id"
    buy_col = "trade_mode_buy_count"
    sell_col = "trade_mode_sell_count"

    # Sort so that the "last" aggregate is the chronologically latest value.
    sorted_events = events.sort_values([token_key, "timestamp_seconds"]).reset_index(drop=True)
    number_columns = list(sorted_events.select_dtypes(include=[np.number]).columns)
    per_token = sorted_events.groupby(token_key, sort=False)

    # Summary statistics, flattened from (column, stat) to "column_stat".
    table = per_token[number_columns].agg(["mean", "std", "min", "max", "last"])
    table.columns = [f"{name}_{stat}" for name, stat in table.columns]

    table["event_count"] = per_token.size().astype("float32")
    for feature_name, source_column in (
        ("unique_holders", "holder"),
        ("unique_creators", "creator"),
    ):
        table[feature_name] = per_token[source_column].nunique().astype("float32")

    # Number of events per trade mode.
    # NOTE(review): "count" tallies non-null timestamp_seconds, so events whose
    # timestamp failed to parse are excluded here but included in event_count;
    # confirm this is intended.
    mode_counts = sorted_events.pivot_table(
        values="timestamp_seconds",
        index=token_key,
        columns="trade_mode",
        aggfunc="count",
        fill_value=0,
    )
    mode_counts.columns = [f"trade_mode_{mode}_count" for mode in mode_counts.columns]
    table = table.join(mode_counts, how="left")

    # Buy and sell counts must exist even if a mode never occurs in the data.
    for count_name in (buy_col, sell_col):
        if count_name in table.columns:
            table[count_name] = table[count_name].astype("float32")
        else:
            table[count_name] = 0.0

    # Clip guards the share ratios against a zero denominator.
    safe_count = table["event_count"].clip(lower=1.0)
    table["buy_share"] = table[buy_col] / safe_count
    table["sell_share"] = table[sell_col] / safe_count
    table["net_buy"] = table[buy_col] - table[sell_col]

    if {"timestamp_seconds_last", "timestamp_seconds_min"}.issubset(table.columns):
        table["active_duration"] = table["timestamp_seconds_last"] - table["timestamp_seconds_min"]

    # Latest value relative to the token's mean; a zero mean yields NaN here
    # and is zero-filled below.
    for prefix in ("sol_volume", "token_volume"):
        last_name = f"{prefix}_last"
        mean_name = f"{prefix}_mean"
        if last_name in table.columns and mean_name in table.columns:
            nonzero_mean = table[mean_name].replace(0, np.nan)
            table[f"{prefix}_last_to_mean"] = table[last_name] / nonzero_mean

    cleaned = table.replace([np.inf, -np.inf], 0.0).fillna(0.0)
    return cleaned.astype("float32")

def evaluate_thresholds(y_true: pd.Series, y_prob: np.ndarray, thresholds: np.ndarray) -> pd.DataFrame:
    """Score binary predictions at each candidate probability threshold.

    Args:
        y_true: Ground-truth labels, with 1 for the positive class and 0 for
            the negative class.
        y_prob: Predicted positive-class probabilities aligned with
            ``y_true``.
        thresholds: Cut-offs to evaluate. A sample is predicted positive when
            its probability is greater than or equal to the cut-off.

    Returns:
        One row per threshold, in the order given, with columns
        ``threshold``, ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``jaccard``, ``tp``, ``fp``, ``fn`` and ``tn``.
    """
    scores = np.asarray(y_prob)
    rows = []
    for thr in thresholds:
        y_pred = (scores >= thr).astype(int)
        acc = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="binary", zero_division=0
        )
        # Pin the label order so the matrix is always 2x2, even when only one
        # class is present; otherwise the four-way unpacking fails.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        # Positive-class Jaccard index, tp / (tp + fp + fn), taken from the
        # counts already computed; defined as 0.0 when there is nothing to
        # compare instead of relying on a blanket exception handler.
        union = tp + fp + fn
        jac = float(tp / union) if union else 0.0
        rows.append(
            {
                "threshold": thr,
                "accuracy": acc,
                "precision": precision,
                "recall": recall,
                "f1": f1,
                "jaccard": jac,
                "tp": int(tp),
                "fp": int(fp),
                "fn": int(fn),
                "tn": int(tn),
            }
        )
    return pd.DataFrame(rows)


In [None]:
# Labelled sample events, plus the ids of the tokens that count as positives.
# The target file is read without a header row, as a single column of ids.
sample_events = load_event_data(DATA_DIR / "Sample_Dataset.csv")
target_tokens = pd.read_csv(TARGET_PATH, names=["mint_token_id"], header=None)
target_set = set(target_tokens["mint_token_id"].tolist())

# Quick size check of what was loaded.
print(f"Sample events shape: {sample_events.shape}")
print(f"Unique tokens in sample events: {sample_events['mint_token_id'].nunique()}")
print(f"Target tokens provided: {len(target_set)}")
sample_events.head()


In [None]:
# One feature row per token; the label is 1 when the token id is in target_set.
train_features = build_features(sample_events)
_in_target_set = train_features.index.isin(target_set)
train_features["is_target"] = _in_target_set.astype("int8")

print(f"Training feature matrix: {train_features.shape}")

# Class balance, shown as a one-column table.
class_counts = train_features["is_target"].value_counts().rename("token_count")
display(class_counts.to_frame())
train_features.head()


In [None]:
# Separate the inputs from the label; feature_columns fixes the column order
# that the evaluation features are later aligned to.
X = train_features.drop(columns=["is_target"])
y = train_features["is_target"]
feature_columns = X.columns.tolist()

# Stratified 80/20 split so both parts keep the same class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

model_params = dict(
    depth=8,
    learning_rate=0.06,
    iterations=1500,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=RANDOM_STATE,
    scale_pos_weight=10.0,
    l2_leaf_reg=5.0,
    bootstrap_type="Bernoulli",
    subsample=0.8,
    od_type="Iter",
    od_wait=120,
    verbose=100,
)

# The validation split doubles as the eval set monitored during training.
model = CatBoostClassifier(**model_params)
model.fit(X_train, y_train, eval_set=(X_val, y_val))

# Positive-class probability is the second column of predict_proba.
val_prob = model.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, val_prob)
print(f"Validation ROC-AUC: {roc_auc:.4f}")
print(f"Best iteration: {model.get_best_iteration()}")


In [None]:
# Baseline validation metrics at the 0.5 probability cut-off.
default_pred = (val_prob >= 0.5).astype(int)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_val, default_pred, average="binary", zero_division=0
)
accuracy = accuracy_score(y_val, default_pred)
# Called directly, as the chosen-threshold cell does, so a genuine error
# surfaces instead of being silently reported as a 0.0 score.
jaccard = jaccard_score(y_val, default_pred)
# labels=[0, 1] keeps the matrix 2x2 even if one class is never predicted or
# present; the counts are cast to int so the printed dict shows plain integers.
tn, fp, fn, tp = (
    int(count) for count in confusion_matrix(y_val, default_pred, labels=[0, 1]).ravel()
)
print(f"Accuracy @0.50: {accuracy:.4f}")
print(f"Precision: {precision:.4f}  Recall: {recall:.4f}  F1: {f1:.4f}  Jaccard: {jaccard:.4f}")
print({"tn": tn, "fp": fp, "fn": fn, "tp": tp})


In [None]:
# Score a grid of 18 cut-offs from 0.05 to 0.90.
threshold_grid = np.linspace(0.05, 0.9, 18)
threshold_results = (
    evaluate_thresholds(y_val, val_prob, threshold_grid)
    .sort_values("threshold")
    .reset_index(drop=True)
)
display(threshold_results)

# Preferred: accuracy >= 0.90 and recall >= 0.60, ranked by F1 then Jaccard.
# Fallback when nothing qualifies: highest recall, ties broken by accuracy.
meets_accuracy = threshold_results["accuracy"] >= 0.90
meets_recall = threshold_results["recall"] >= 0.60
candidate_mask = meets_accuracy & meets_recall
_candidates = threshold_results[candidate_mask]
if _candidates.empty:
    _ranked = threshold_results.sort_values(["recall", "accuracy"], ascending=[False, False])
else:
    _ranked = _candidates.sort_values(["f1", "jaccard"], ascending=False)
chosen_row = _ranked.iloc[0]
chosen_threshold = float(chosen_row["threshold"])
print(f"Selected threshold: {chosen_threshold:.2f}")
print(chosen_row)


In [None]:
# Validation metrics at the threshold selected by the search above.
chosen_pred = (val_prob >= chosen_threshold).astype(int)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_val, chosen_pred, average="binary", zero_division=0
)
accuracy = accuracy_score(y_val, chosen_pred)
jaccard = jaccard_score(y_val, chosen_pred)
# labels=[0, 1] keeps the matrix 2x2 even if one class is never predicted or
# present; the counts are cast to int so the printed dict shows plain integers.
tn, fp, fn, tp = (
    int(count) for count in confusion_matrix(y_val, chosen_pred, labels=[0, 1]).ravel()
)
print(f"Validation accuracy @threshold {chosen_threshold:.2f}: {accuracy:.4f}")
print(f"Precision: {precision:.4f}  Recall: {recall:.4f}  F1: {f1:.4f}  Jaccard: {jaccard:.4f}")
print({"tn": tn, "fp": fp, "fn": fn, "tp": tp})


In [None]:
# Model importances labelled by feature name; show the 25 largest.
feature_importances = pd.Series(data=model.get_feature_importance(), index=feature_columns)
top_features = feature_importances.sort_values(ascending=False).iloc[:25]
top_features.to_frame(name="importance")


In [None]:
# Collect the evaluation chunks in sorted filename order.
evaluation_files = sorted(DATA_DIR.glob(EVALUATION_PATTERN))
if len(evaluation_files) == 0:
    raise FileNotFoundError("No evaluation set chunks found.")

# Load every chunk, stack them, and build per-token features.
_chunk_frames = list(map(load_event_data, evaluation_files))
eval_events = pd.concat(_chunk_frames, ignore_index=True)
# Align to the training columns: extra columns are dropped and columns missing
# from the evaluation data are filled with 0.0.
eval_features = build_features(eval_events).reindex(columns=feature_columns, fill_value=0.0)

print(f"Evaluation events shape: {eval_events.shape}")
print(f"Evaluation feature matrix: {eval_features.shape}")
eval_features.head()


In [None]:
# Number of rows the submission file is required to contain.
EXPECTED_SUBMISSION_ROWS = 64208

# Score every evaluation token and apply the threshold chosen on validation.
eval_prob = model.predict_proba(eval_features)[:, 1]
eval_pred = (eval_prob >= chosen_threshold).astype(int)

detailed_df = pd.DataFrame(
    {
        "mint_token_id": eval_features.index,
        "prediction_score": eval_prob,
        "is_target": eval_pred,
    }
).reset_index(drop=True)

submission_df = detailed_df[["mint_token_id", "is_target"]]

# Validate the row count BEFORE writing anything, so a malformed submission
# never reaches disk. An explicit raise is used because assert statements are
# skipped when Python runs with -O.
if submission_df.shape[0] != EXPECTED_SUBMISSION_ROWS:
    raise ValueError(
        f"Submission must contain {EXPECTED_SUBMISSION_ROWS:,} rows, "
        f"got {submission_df.shape[0]:,}."
    )

# Positive predictions only, annotated with the threshold that produced them.
positive_df = detailed_df[detailed_df["is_target"] == 1].copy()
positive_df["threshold_used"] = chosen_threshold

submission_path = BASE_DIR / "submission.csv"
detailed_path = BASE_DIR / "predictions_detailed.csv"
positive_path = BASE_DIR / "predicted_positive_tokens.csv"

submission_df.to_csv(submission_path, index=False)
detailed_df.to_csv(detailed_path, index=False)
positive_df.to_csv(positive_path, index=False)

print(f"Saved submission to: {submission_path}")
print(f"Saved detailed predictions to: {detailed_path}")
print(f"Saved positive-token report to: {positive_path}")
print(f"Predicted positives: {positive_df.shape[0]} (threshold={chosen_threshold:.2f})")
submission_df['is_target'].value_counts().to_frame(name='token_count')


In [None]:
# The ten evaluation tokens with the highest predicted score, best first.
detailed_df.sort_values(by="prediction_score", ascending=False).iloc[:10]


**Next Steps**

- Submit submission.csv to the Kaggle competition
- Review predicted_positive_tokens.csv for manual due diligence
- Iterate on additional feature ideas (e.g. time-window segmentation) to further raise recall without sacrificing accuracy
