# Task 2: Jito Tip Sandwich MEV Detection


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json

DATA_PATH = '/Users/aileen/Downloads/pamm/pamm_clean_final.parquet'
df = pd.read_parquet(DATA_PATH)

# Extract addresses from account_updates (same as task 1)
def extract_addresses(row):
    # Handle already-parsed data (lists, dicts) or strings
    addresses = []
    
    # Check if row is null/NaN - handle scalar and array cases
    if row is None:
        return []
    try:
        # For scalar values, pd.isna works fine
        if not isinstance(row, (list, dict, str)):
            if pd.isna(row):
                return []
    except (ValueError, TypeError):
        # If pd.isna fails (e.g., for arrays), skip the check
        pass
    
    # If already a list, extract addresses directly
    if isinstance(row, list):
        for item in row:
            if isinstance(item, dict) and 'account' in item:
                addr = item.get('account', '')
                if addr and isinstance(addr, str):
                    addresses.append(addr)
            elif isinstance(item, str) and len(item) >= 32:
                # Direct address string in list
                addresses.append(item)
    
    # If already a dict, extract account
    elif isinstance(row, dict):
        if 'account' in row:
            addr = row.get('account', '')
            if addr and isinstance(addr, str):
                addresses.append(addr)
    
    # If string, try to parse or extract with regex
    elif isinstance(row, str):
        # Try to parse as JSON first
        try:
            cleaned = row.replace("'", '"').replace('None', 'null').replace('True', 'true').replace('False', 'false')
            parsed = json.loads(cleaned)
            if isinstance(parsed, list):
                for item in parsed:
                    if isinstance(item, dict) and 'account' in item:
                        addr = item.get('account', '')
                        if addr:
                            addresses.append(addr)
            elif isinstance(parsed, dict) and 'account' in parsed:
                addr = parsed.get('account', '')
                if addr:
                    addresses.append(addr)
        except (json.JSONDecodeError, ValueError, AttributeError):
            pass
        # Fallback to regex extraction
        if not addresses:
            addresses = re.findall(r'[A-Za-z0-9]{32,44}', row)
    
    return addresses

if 'account_updates' in df.columns:
    # Extract addresses - function handles already-parsed lists/dicts or strings
    df['parsed_accounts'] = df['account_updates'].apply(extract_addresses)
else:
    df['parsed_accounts'] = [[] for _ in range(len(df))]

jito_tips = [
    '96gYZGLnJYVFmbjzopPSU6QiEV5fGqZNyN9nmNhvrZU5',
    'HFqU5x63VTqvQss8hp11i4wVV8bD44PvwucfZ2bU7gRe',
    'Cw8CFyM9FkoMi7K7Crf6HNQqf4uEMzpKw6QNghXLvLkY',
    'ADaUMid9yfUytqMBgopwjb2DTLSokTSzL1zt6iGPaS49',
    'DfXygSm4jCyNCybVYYK6DwvWqjKee8pbDmJGcLWNDXjh',
    'ADuUkR4vqLUMWXxW9gh6D6L8pMSawimctcNZ5pGwDcEt',
    'DttWaMuVvTiduZRnguLF7jNxTgiMBZ1hyAumKUiL2KRL',
    '3AVi9Tg9Uo68tJfuvoKvqKNWKkC5wPdSSdeBnizKZ6jT'
]

# Build filter conditions, handling missing columns properly
tip_filter = df['parsed_accounts'].apply(lambda x: any(t in x for t in jito_tips))

if 'signer' in df.columns:
    tip_filter = tip_filter | df['signer'].isin(jito_tips)

if 'validator' in df.columns:
    tip_filter = tip_filter | df['validator'].isin(jito_tips)

tip_matches = df[tip_filter]

print("Jito Tip Matches:")
if not tip_matches.empty:
    # Select only columns that exist
    available_cols = ['time', 'datetime', 'kind', 'amm', 'validator', 'signer', 'kind', 'parsed_accounts']
    cols_to_show = [col for col in available_cols if col in tip_matches.columns]
    print(tip_matches[cols_to_show].head(10))
else:
    print("No matches.")

if not tip_matches.empty and 'kind' in tip_matches.columns:
    trades = tip_matches[tip_matches['kind'] == 'TRADE']
    if not trades.empty and 'validator' in trades.columns and 'signer' in trades.columns:
        grouped = trades.groupby('validator')
        sandwiches = []
        for val, group in grouped:
            if len(group) >= 3 and 'signer' in group.columns and 'time' in group.columns:
                signers = group['signer'].tolist()
                for i in range(len(signers) - 2):
                    if signers[i] == signers[i+2] and signers[i] != signers[i+1]:
                        sandwiches.append({'validator': val, 'times': group.iloc[i:i+3]['time'].tolist()})
        print(f"Found {len(sandwiches)} potential sandwich A-B-A patterns near tips.")
        print("Samples:", sandwiches[:3] if sandwiches else "None")

# Visualization
if not tip_matches.empty and 'amm' in tip_matches.columns and 'kind' in tip_matches.columns:
    plt.figure(figsize=(10,6))
    sns.countplot(data=tip_matches, x='amm', hue='kind')
    plt.title('Tip-Related Events by pAMM')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('task2_tip_events_by_amm.png')
    print("Visualization saved.")
elif not tip_matches.empty:
    print("Cannot create visualization: missing 'amm' or 'kind' columns.")

print("\nInference: Tip txs suggest bundling; A-B-A indicates sandwich. High tips imply attack.")

Jito Tip Matches:
No matches.

Inference: Tip txs suggest bundling; A-B-A indicates sandwich. High tips imply attack.
