# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# === Load Dataset ===
df = pd.read_csv("deduplicated_filtered_output.csv")  # Replace with actual filename

# === Standardize column names ===
# Normalise the headers *before* any step refers to a column by name, so a raw
# header such as " Tx_ID" or "Token Balance Change" still resolves to the
# snake_case names used by the dedup and rename steps below.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# === Deduplicate based on tx_id and token_balance_change ===
# Rows that share both the transaction id and the balance change are treated
# as the same record; the first occurrence is kept.
df = df.drop_duplicates(subset=["tx_id", "token_balance_change"])

# === Rename and parse columns ===
# Map the source schema onto the short names the rest of the script uses.
df = df.rename(columns={
    "address": "wallet",
    "token_mint_address": "token",
    "token_balance_change": "amount",
    "block_time": "timestamp",
    "price_usd": "usd_price",
})

# === Parse datetime ===
# NOTE(review): if block_time is stored as a Unix-epoch integer, pd.to_datetime
# needs an explicit unit= (it defaults to nanoseconds) -- confirm against the CSV.
df["timestamp"] = pd.to_datetime(df["timestamp"])
# A DatetimeIndex is required by the resample / time-based rolling calls later.
df = df.set_index("timestamp")

# === Remove extreme transactions (> 1 billion tokens moved) ===
# The comparison is False for NaN, so rows without an amount are dropped too.
df = df[df["amount"].abs() <= 1e9]


# === Token name mapping ===
# Mint address -> short human-readable label. Tokens missing from this map
# fall back to the first 8 characters of their mint address.
TOKEN_NAMES = {
    "FUAfBo2jgks6gB4Z4LfZkqSZgzNucisEHqnNebaRxM1P": "MELANIA",
    "6p6xgHyF7AeE6TZkSmFsko444wqoP15icUSqi2jfGiPN": "TRUMP",
    "Bo9jh3wsmcC2AjakLWzNmKJ3SgtZmXEcSaW7L2FAvUsU": "LIBRA",
}

# === Settings ===
# Bucket width for every resampled metric. Lower-case "h" is the hourly
# alias; the upper-case "H" spelling is deprecated (FutureWarning since
# pandas 2.2), while "h" is accepted by older releases as well.
interval = "h"
# Directory that receives the per-token CSV files and PNG plots.
output_dir = "token_analysis_outputs2"
os.makedirs(output_dir, exist_ok=True)

def _find_wash_traders(token_trades):
    """Return the set of wallets whose trades look like wash trading.

    ``token_trades`` is one token's transactions, indexed by timestamp, with
    ``wallet``, ``amount`` and ``usd_volume`` columns.

    A wallet is flagged when some trailing 5-minute window holds at least 6
    buy/sell direction changes and at least 100 USD of volume. A positive
    amount is a buy, anything else a sell. A wallet's first trade has no
    predecessor and is counted as a change, so six strictly alternating
    trades are enough to reach the threshold.
    """
    flagged = set()
    for wallet, trades in token_trades.groupby("wallet"):
        if len(trades) < 6:
            # Fewer than six trades can never reach six direction changes.
            continue
        # Time-based rolling windows need a monotonic DatetimeIndex.
        trades = trades.sort_index()
        is_buy = trades["amount"] > 0
        # diff() is NaN on the first row and +/-1 on a flip, so ne(0) marks
        # the first trade and every change of direction.
        flips = is_buy.astype(int).diff().ne(0).astype(float)
        flips_5m = flips.rolling("5min").sum()
        usd_5m = trades["usd_volume"].rolling("5min").sum()
        if ((flips_5m >= 6) & (usd_5m >= 100)).any():
            flagged.add(wallet)
    return flagged


# === Process each token ===
for token, token_df in df.groupby("token"):
    token_df = token_df.sort_index()
    # Use the friendly name when known, otherwise a short mint-address prefix.
    token_label = TOKEN_NAMES.get(token, token[:8])

    # === USD Volume Calculation ===
    abs_amount = token_df["amount"].abs()
    token_df["usd_volume"] = abs_amount * token_df["usd_price"]

    # === Metric calculations ===
    usd_volume = token_df["usd_volume"].resample(interval).sum().rename("token_volume_usd")
    raw_volume = abs_amount.resample(interval).sum().rename("token_volume_tokens")
    avg_tx_size = abs_amount.resample(interval).mean().rename("avg_tx_size")

    unique_wallets = token_df.groupby(pd.Grouper(freq=interval))["wallet"].nunique().rename("unique_wallets")
    tx_count = token_df.resample(interval)["wallet"].count().rename("transaction_count")

    # A wallet is "new" on the first row where it appears for this token.
    token_df["is_new_wallet"] = ~token_df["wallet"].duplicated()
    new_wallets = token_df[token_df["is_new_wallet"]].resample(interval)["wallet"].count().rename("new_wallets")

    # Sniper detection: every transaction within 3 minutes of the token's
    # first observed transaction.
    sniper_cutoff = token_df.index.min() + pd.Timedelta(minutes=3)
    token_df["is_sniper"] = token_df.index <= sniper_cutoff
    sniper_activity = token_df[token_df["is_sniper"]].resample(interval)["wallet"].count().rename("sniper_tx_count")

    # Wash trading detection: count all transactions of flagged wallets.
    wash_traders = _find_wash_traders(token_df)
    token_df["is_wash_trader"] = token_df["wallet"].isin(wash_traders)
    wash_trading_activity = token_df[token_df["is_wash_trader"]].resample(interval)["wallet"].count().rename("wash_tx_count")

    # === Combine metrics ===
    # The series cover different time spans; concat aligns them on the union
    # of buckets and the gaps become 0.
    summary = pd.concat([
        usd_volume,
        raw_volume,
        tx_count,
        unique_wallets,
        new_wallets,
        sniper_activity,
        wash_trading_activity,
        avg_tx_size,
    ], axis=1).fillna(0)

    # Ensure all columns are float type for clean CSV output
    summary = summary.astype(float)

    # === Save full CSV with clean numeric formatting ===
    summary.to_csv(
        os.path.join(output_dir, f"metrics_{token_label}.csv"),
        float_format="%.5f",
    )

    # === Clip outliers for plotting (99.5%) ===
    # The cap is taken from the non-zero buckets only. Sparse metrics such as
    # sniper_tx_count are zero in almost every bucket, so a percentile over
    # all buckets is 0 and would flatten the whole plot.
    summary_clipped = summary.copy()
    for col in summary_clipped.columns:
        active = summary_clipped.loc[summary_clipped[col] > 0, col]
        if active.empty:
            continue
        summary_clipped[col] = summary_clipped[col].clip(upper=active.quantile(0.995))

    # === Save individual plots for each metric ===
    for column in summary_clipped.columns:
        plt.figure(figsize=(8, 4))
        summary_clipped[column].plot(title=column.replace('_', ' ').title())
        plt.xlabel("Time")
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f"{token_label}_{column}.png"))
        # Close each figure so open figures do not accumulate across the loop.
        plt.close()

print("✅ Done. Metrics and plots saved to:", output_dir)
