# Isolation Forest: Insider Detection with Rules-Based Features
## Using Volume/Trade Data, Financial Data, and Regex Features

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA

# Plot defaults for the whole notebook: seaborn's white-grid theme and a
# wide 12x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Confirmation that the setup cell ran to completion
print("Libraries loaded successfully")

## 2. Load Data

In [None]:
# Read the user-level table that already carries the regex-derived features
df = pd.read_csv('outputs/output_with_signals.csv')

# Report the table dimensions; the row count is printed as the user count
n_rows = df.shape[0]
print(f"Dataset shape: {df.shape}")
print(f"\nTotal users: {n_rows}")

# Preview the first rows (rendered as the cell output)
df.head()

## 3. Feature Selection

In [None]:
# Columns used as model inputs, grouped by theme. FEATURES keeps the groups
# in a fixed order because later cells index the dataframe with it.
VOLUME_TRADE_FEATURES = [
    'trades_count',
    'buy_trades_count',
    'sell_trades_count',
    'total_volume',
    'avg_trade_size',
    'total_cash_volume',
]

FINANCIAL_FEATURES = [
    'realized_pnl',
    'total_position_value',
    'win_rate',
    'winning_positions_count',
    'closed_positions_count',
]

BEHAVIORAL_FEATURES = [
    'positions_count',
    'active_markets_count',
    'days_since_first_trade',
    'total_markets_count',
]

# Regex-derived columns (insider signals)
REGEX_FEATURES = [
    'earnings_count',
    'earnings_distinct_issuers',
    'other_earnings_markets',
    'traded_crypto',
]

FEATURES = [
    *VOLUME_TRADE_FEATURES,
    *FINANCIAL_FEATURES,
    *BEHAVIORAL_FEATURES,
    *REGEX_FEATURES,
]

# Work on a copy so the imputation below never modifies `df`
df_features = df.loc[:, FEATURES].copy()

# Fill each column's missing entries with that column's median
column_medians = df_features.median()
df_features = df_features.fillna(column_medians)

print(f"Total features: {len(FEATURES)}")
print("\nFeature statistics:")

# Summary statistics of the imputed features (rendered as the cell output)
df_features.describe()

## 4. Scale Features

In [None]:
# RobustScaler centres on the median and scales by the interquartile range,
# so extreme values distort the scaling less than mean/std standardisation.
scaler = RobustScaler().fit(df_features)
features_scaled = scaler.transform(df_features)

# Wrap the scaled array back into a DataFrame that keeps the original
# column names and row index.
df_scaled = pd.DataFrame(
    data=features_scaled,
    index=df_features.index,
    columns=df_features.columns,
)

print("Features scaled using RobustScaler")
print(f"Scaled data shape: {df_scaled.shape}")

## 5. Run Isolation Forest

In [None]:
# Model hyperparameters, named once so the fitted model and the printed
# summary below cannot drift apart when the values are tuned.
CONTAMINATION = 0.10  # expected proportion of anomalies (0.10 = 10%)
N_ESTIMATORS = 100    # number of trees in the forest
RANDOM_STATE = 42     # fixed seed so repeated runs give the same labels

iso_forest = IsolationForest(
    contamination=CONTAMINATION,
    random_state=RANDOM_STATE,
    n_estimators=N_ESTIMATORS,
)

# Fit on the scaled features and label every row: -1 = anomaly, 1 = normal
predictions = iso_forest.fit_predict(df_scaled)

# Raw anomaly scores from the fitted model (lower = more anomalous)
anomaly_scores = iso_forest.score_samples(df_scaled)

# Attach labels and scores to the main dataframe for the later cells
df['anomaly_label'] = predictions
df['anomaly_score'] = anomaly_scores

# Counts per label for the summary
n_anomalies = (predictions == -1).sum()
n_normal = (predictions == 1).sum()

print(f"\nIsolation Forest Results:")
print(f"  Contamination: {CONTAMINATION:.0%}")
print(f"  N Estimators: {N_ESTIMATORS}")
print(f"  Total users: {len(df)}")
print(f"  Normal users: {n_normal} ({100*n_normal/len(df):.1f}%)")
print(f"  Anomalies: {n_anomalies} ({100*n_anomalies/len(df):.1f}%)")
print(f"\nAnomaly score range: [{anomaly_scores.min():.3f}, {anomaly_scores.max():.3f}]")
print(f"  Lower scores = more anomalous")

## 6. Visualize with PCA

In [None]:
# Project the scaled features onto their first two principal components so
# the model's labels can be drawn in 2D. The seed keeps the projection (and
# the saved figure) reproducible when scikit-learn's automatic solver choice
# uses randomized SVD; 42 matches the seed given to the Isolation Forest.
pca = PCA(n_components=2, random_state=42)
features_2d = pca.fit_transform(df_scaled)

plt.figure(figsize=(12, 6))

# Boolean masks over the model's labels: 1 = normal, -1 = anomaly
normal_mask = predictions == 1
anomaly_mask = predictions == -1

# Normal users: translucent blue dots
plt.scatter(features_2d[normal_mask, 0], features_2d[normal_mask, 1],
            c='blue', alpha=0.5, s=30, label='Normal')

# Anomalies: larger red crosses, drawn second so they sit on top
plt.scatter(features_2d[anomaly_mask, 0], features_2d[anomaly_mask, 1],
            c='red', marker='x', s=100, label='Anomalies', linewidths=2)

# Axis labels report how much variance each component explains
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('Isolation Forest Results (PCA Projection)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()

# Save before show() so the written file contains the rendered figure
plt.savefig('outputs/isolation_forest_pca.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\nVisualization saved to outputs/isolation_forest_pca.png")

## 7. Analyze Anomalies

In [None]:
# Keep only the rows labelled -1 (anomalous); copy so later edits to this
# subset do not touch `df`.
anomalies_df = df.loc[df['anomaly_label'] == -1].copy()

print(f"Total anomalies detected: {len(anomalies_df)}")

if anomalies_df.empty:
    print("\nNo anomalies detected - try adjusting contamination parameter")
else:
    # Aggregate a few headline features across the flagged users.
    # NOTE(review): the first label says "2+ earnings issuers" but the value
    # is the sum of `other_earnings_markets` - confirm that column is a 0/1
    # flag with that meaning.
    earnings_flag_total = anomalies_df['other_earnings_markets'].sum()
    crypto_flag_total = anomalies_df['traded_crypto'].sum()
    mean_earnings_count = anomalies_df['earnings_count'].mean()
    mean_win_rate = anomalies_df['win_rate'].mean()
    mean_realized_pnl = anomalies_df['realized_pnl'].mean()

    print("\nAnomalies summary:")
    print(f"  Users with 2+ earnings issuers: {earnings_flag_total}")
    print(f"  Users with crypto trades: {crypto_flag_total}")
    print(f"  Avg earnings count: {mean_earnings_count:.1f}")
    print(f"  Avg win rate: {mean_win_rate:.2f}")
    print(f"  Avg realized P&L: ${mean_realized_pnl:.2f}")

    # The ten lowest scores are the most anomalous users
    print("\nTop 10 most anomalous users (by anomaly score):")
    display_cols = [
        'username',
        'anomaly_score',
        'earnings_count',
        'win_rate',
        'realized_pnl',
        'total_volume',
        'other_earnings_markets',
        'traded_crypto',
    ]
    top_anomalies = anomalies_df.nsmallest(10, 'anomaly_score')
    print(top_anomalies[display_cols].to_string(index=False))

## 8. Anomaly Score Distribution

In [None]:
# Two side-by-side views of the anomaly scores: an overall histogram with
# the model's cut-off, and a per-label boxplot.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: histogram of every user's score
axes[0].hist(df['anomaly_score'], bins=50, alpha=0.7, edgecolor='black')

# The fitted model labels a sample -1 when its score_samples value falls
# below `offset_`, so that attribute is the exact decision threshold. It is
# also defined when no row was flagged, unlike the maximum anomalous score.
axes[0].axvline(x=iso_forest.offset_,
                color='red', linestyle='--', label='Anomaly Threshold')
axes[0].set_xlabel('Anomaly Score')
axes[0].set_ylabel('Count')
axes[0].set_title('Distribution of Anomaly Scores')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Right panel: score spread for normal (1) versus anomalous (-1) users
data_to_plot = [df[df['anomaly_label'] == 1]['anomaly_score'],
                df[df['anomaly_label'] == -1]['anomaly_score']]
axes[1].boxplot(data_to_plot)
# Tick labels are set separately because boxplot's `labels=` keyword is
# deprecated in recent Matplotlib; this form works on old and new versions.
axes[1].set_xticklabels(['Normal', 'Anomalies'])
axes[1].set_ylabel('Anomaly Score')
axes[1].set_title('Anomaly Scores: Normal vs Anomalies')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()

# Save before show() so the written file contains the rendered figure
plt.savefig('outputs/isolation_forest_scores.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\nScore distribution saved to outputs/isolation_forest_scores.png")

## 9. Feature Importance for Anomalies

In [None]:
if not anomalies_df.empty:
    # Per-feature means for the flagged users and for everyone labelled normal
    normal_df = df.loc[df['anomaly_label'] == 1]
    anomaly_means = anomalies_df[FEATURES].mean()
    normal_means = normal_df[FEATURES].mean()

    # One row per feature: both means, their difference and their ratio.
    # The 0.001 added to the denominator guards against dividing by a zero mean.
    # NOTE(review): for features whose normal mean is negative or close to
    # -0.001 the ratio is hard to interpret - confirm this ranking is intended.
    comparison = pd.DataFrame({'Anomalies': anomaly_means, 'Normal': normal_means})
    comparison = comparison.assign(
        Difference=comparison['Anomalies'] - comparison['Normal'],
        Ratio=comparison['Anomalies'] / (comparison['Normal'] + 0.001),
    )
    comparison = comparison.sort_values('Ratio', ascending=False)

    # The ten features with the largest anomaly-to-normal ratio
    top_features = comparison.head(10)

    print("\nFeature comparison (Anomalies vs Normal):")
    print(top_features.to_string())

    # Horizontal bar chart of those ratios, one bar per feature
    fig, ax = plt.subplots(figsize=(10, 6))
    x = range(len(top_features))

    ax.barh(x, top_features['Ratio'], alpha=0.7)
    ax.set_yticks(x)
    ax.set_yticklabels(top_features.index)
    ax.set(
        xlabel='Ratio (Anomalies / Normal)',
        title='Top 10 Distinguishing Features for Anomalies',
    )
    # Reference line at ratio 1.0, where both groups have the same mean
    ax.axvline(x=1.0, color='red', linestyle='--', alpha=0.5)
    ax.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.savefig('outputs/isolation_forest_feature_importance.png', dpi=150, bbox_inches='tight')
    plt.show()

    print("\nFeature importance saved to outputs/isolation_forest_feature_importance.png")

## 10. Export Results

In [None]:
# Columns written to both CSVs: identifiers, model outputs, then the model inputs
export_cols = ['wallet', 'username', 'anomaly_label', 'anomaly_score'] + FEATURES

if len(anomalies_df) > 0:
    # Anomalies only, most anomalous (lowest score) first
    output_path = 'outputs/isolation_forest_anomalies.csv'
    anomalies_df.sort_values('anomaly_score')[export_cols].to_csv(output_path, index=False)
    print(f"Saved {len(anomalies_df)} anomalies to: {output_path}")
else:
    print("No anomalies to export")

# Every user with label and score. Written unconditionally so a run that
# flags nobody does not leave a stale file from an earlier run behind.
output_path_all = 'outputs/isolation_forest_all_users.csv'
df[export_cols].to_csv(output_path_all, index=False)
print(f"Saved all {len(df)} users with anomaly labels/scores to: {output_path_all}")