# DAO Price Movement Prediction - Data Preprocessing

This notebook implements a comprehensive data preprocessing pipeline for predicting cryptocurrency price movements based on DAO governance activity, social sentiment, and technical indicators.

## Objectives
- Clean and prepare raw data for machine learning
- Engineer relevant features for price movement prediction
- Create the binary target variable (`price_trend`: fall = 0, rise = 1) for classification tasks
- Implement robust data validation and quality checks

## Key Features
- **Technical Indicators**: Moving average (MA), exponential moving average (EMA), relative strength index (RSI)
- **Governance Features**: Voting activity, proposal metrics
- **Social Features**: Social media sentiment and activity
- **Market Features**: Price data, volume, market cap

In [None]:
# Import required libraries
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '..', 'src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Import custom modules
from config import *
from utils.data_preprocessing import *
from utils.evaluation import log_memory_usage

# Give every figure produced later in the notebook the same look.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Smoke-test the environment. PROJECT_ROOT and FEATURE_COLUMNS are presumably
# supplied by the star-imports; a NameError here means they did not load.
print("✓ Libraries imported successfully")
print(f"✓ Project root: {PROJECT_ROOT}")
configured_feature_count = len(FEATURE_COLUMNS)
print(f"✓ Available feature columns: {configured_feature_count}")

In [None]:
# Load raw data
# Reads the raw CSV into `raw_data` and prints a short summary of what arrived.
print("Loading raw data...")
file_path = os.path.join(DATA_PATH, "regression_df_v24cc_v6.csv")

try:
    raw_data = pd.read_csv(file_path)
except FileNotFoundError:
    # Show the path that was tried before propagating the failure.
    print(f"❌ File not found: {file_path}")
    raise
else:
    # Only reached when the read succeeded; a missing column still raises.
    print("✓ Data loaded successfully")
    print(f"  - Shape: {raw_data.shape}")
    vote_dates = raw_data['vote_date']
    print(f"  - Date range: {vote_dates.min()} to {vote_dates.max()}")
    dao_count = raw_data['Slug_Santiment'].nunique()
    print(f"  - Unique DAOs: {dao_count}")

# Record the memory footprint right after loading.
log_memory_usage("After data loading")

# Column names, dtypes and non-null counts.
raw_data.info()

In [None]:
# Data quality assessment
# Reports missing values and infinite values, then fails fast if a column the
# rest of the notebook relies on is absent.
print("=== Data Quality Assessment ===")

# Missing values: absolute count and share of rows per column, worst first.
missing_summary = raw_data.isnull().sum()
missing_pct = missing_summary.div(len(raw_data)).mul(100)
missing_df = pd.DataFrame(
    {
        'Missing_Count': missing_summary,
        'Missing_Percentage': missing_pct,
    }
).sort_values('Missing_Percentage', ascending=False)

print("\nMissing values summary:")
has_missing = missing_df['Missing_Count'] > 0
print(missing_df[has_missing].head(10))

# Infinite values: names of numeric columns holding at least one +/-inf.
numeric_cols = raw_data.select_dtypes(include=[np.number]).columns
inf_cols = [name for name in numeric_cols if np.isinf(raw_data[name]).any()]

print(f"\nColumns with infinite values: {inf_cols}")

# Required columns: stop here rather than fail later with an opaque KeyError.
required_cols = ['Slug_Santiment', 'vote_date', 'marketcap_usd_cleaned', 'price_usd']
missing_required = [name for name in required_cols if name not in raw_data.columns]
if missing_required:
    print(f"❌ Missing required columns: {missing_required}")
    raise ValueError("Required columns are missing from the dataset")

print("✓ All required columns present")

In [None]:
# Step 1: Encode categorical features
# Produces `processed_data` plus the fitted `encoders`, and persists the
# encoders so later stages can reuse the same mapping.
print("Step 1: Encoding categorical features...")

try:
    processed_data, encoders = encode_categorical_features(raw_data)

    print("✓ Categorical encoding completed")
    slug_classes = encoders['Slug_Santiment'].classes_
    print(f"  - Slug_Santiment: {len(slug_classes)} unique values")
    segment_classes = encoders['marketSegment'].classes_
    print(f"  - Market segments: {len(segment_classes)} unique values")

    # Write the encoders under MODEL_PATH/encoders, creating it on first run.
    encoder_dir = os.path.join(MODEL_PATH, 'encoders')
    os.makedirs(encoder_dir, exist_ok=True)
    import joblib
    joblib.dump(encoders, os.path.join(encoder_dir, 'label_encoders.pkl'))
    print("✓ Encoders saved for future use")

except Exception as e:
    # Report which step failed, then let the error stop the notebook.
    print(f"❌ Error in categorical encoding: {e}")
    raise

log_memory_usage("After categorical encoding")

In [None]:
# Step 2: Scale large values and handle data types
# Divides the two large-magnitude USD columns by one billion, in place.
# NOTE: re-running this cell divides the already-scaled columns again.
print("Step 2: Scaling large numerical values...")

try:
    for billions_col in ('marketcap_usd_cleaned', 'transaction_volume'):
        processed_data[billions_col] = processed_data[billions_col] / 1_000_000_000

    print("✓ Values scaled to appropriate ranges (billions)")
    market_cap = processed_data['marketcap_usd_cleaned']
    print(f"  - Market cap range: {market_cap.min():.2f} - {market_cap.max():.2f}B")
    tx_volume = processed_data['transaction_volume']
    print(f"  - Transaction volume range: {tx_volume.min():.2f} - {tx_volume.max():.2f}B")

except Exception as e:
    # Report which step failed, then let the error stop the notebook.
    print(f"❌ Error in scaling: {e}")
    raise

log_memory_usage("After scaling")

In [None]:
# Step 3: Calculate returns and create target variables
# Delegates to calculate_returns_and_targets, then reports how the resulting
# `price_trend` labels split between fall (0) and rise (1).
print("Step 3: Calculating returns and define the target variable...")
try:
    processed_data = calculate_returns_and_targets(processed_data)

    print("✓ Returns calculated and target variable created")

    # value_counts() skips NaN, so the total covers labelled rows only.
    trend_counts = processed_data['price_trend'].value_counts()
    total_valid = trend_counts.sum()

    print("  - Price trend distribution:")
    fall_count = trend_counts.get(0, 0)
    fall_share = fall_count / total_valid
    print(f"    Fall (0): {fall_count:,} ({fall_share:.1%})")
    rise_count = trend_counts.get(1, 0)
    print(f"    Rise (1): {rise_count:,} ({rise_count / total_valid:.1%})")

    # Flag a split worse than 70/30 in either direction.
    if abs(fall_share - 0.5) > 0.2:
        print("⚠️  Warning: Significant class imbalance detected. Consider adding more training data or taking a smaller sample size")

except Exception as e:
    # Report which step failed, then let the error stop the notebook.
    print(f"❌ Error in return calculation: {e}")
    raise

log_memory_usage("After return calculation")

In [None]:
# Step 4: Calculate technical indicators
# Delegates to calculate_technical_indicators with the configured windows,
# then sanity-checks that the RSI column stays inside [0, 100].
print("Step 4: Calculating technical indicators...")
try:
    processed_data = calculate_technical_indicators(processed_data, TECHNICAL_INDICATORS)

    ma_window = TECHNICAL_INDICATORS['MA_WINDOW']
    ema_window = TECHNICAL_INDICATORS['EMA_WINDOW']
    rsi_window = TECHNICAL_INDICATORS['RSI_WINDOW']

    print("✓ Technical indicators calculated")
    print(f"  - MA_{ma_window}: Moving Average ({ma_window} days)")
    print(f"  - EMA_{ema_window}: Exponential Moving Average ({ema_window} days)")
    print(f"  - RSI_{rsi_window}: Relative Strength Index ({rsi_window} days)")

    # Validate indicator ranges.
    # The RSI column name is derived from the configured window instead of
    # hard-coding 'RSI_14', so changing RSI_WINDOW does not break this check.
    # NOTE(review): assumes the helper names the column f"RSI_{window}", as the
    # labels printed above suggest; 'RSI_14' is kept as a fallback — confirm.
    rsi_candidates = (f"RSI_{rsi_window}", 'RSI_14')
    rsi_col = next(
        (name for name in rsi_candidates if name in processed_data.columns),
        None,
    )
    if rsi_col is None:
        # Keep failing fast, as the original direct column lookup did.
        raise KeyError(f"RSI column not found; tried {rsi_candidates}")

    rsi_values = processed_data[rsi_col].dropna()
    if len(rsi_values) > 0:
        print(f"  - RSI range: {rsi_values.min():.1f} - {rsi_values.max():.1f}")
        if rsi_values.min() < 0 or rsi_values.max() > 100:
            print("⚠️  Warning: RSI values outside expected range [0, 100]")

except Exception as e:
    # Report which step failed, then let the error stop the notebook.
    print(f"❌ Error in technical indicators: {e}")
    raise

log_memory_usage("After technical indicators")

In [None]:
# Step 5: Calculate activity features
# Delegates to calculate_activity_features, then reports how many non-null
# values a few representative output columns contain.
print("Step 5: Calculating governance and social activity features...")
try:
    # NOTE(review): SOCIAL_WINDOWS is reported below but only GOVERNANCE_WINDOWS
    # is passed in — confirm the helper picks up the social windows itself.
    processed_data = calculate_activity_features(processed_data, GOVERNANCE_WINDOWS)

    print("✓ Activity features calculated")
    print(f"  - Governance activity windows: {GOVERNANCE_WINDOWS}")
    print(f"  - Social media activity windows: {SOCIAL_WINDOWS}")
    print(f"  - Network activity windows: {GOVERNANCE_WINDOWS}")

    # Spot-check a handful of expected columns; absent ones are skipped quietly.
    key_features = ['14_day_gov_activity', '30_social_media_activity', '14_day_network_ewma']
    present_features = [name for name in key_features if name in processed_data.columns]
    for feature in present_features:
        non_null_count = processed_data[feature].notna().sum()
        print(f"  - {feature}: {non_null_count:,} non-null values")

except Exception as e:
    # Report which step failed, then let the error stop the notebook.
    print(f"❌ Error in activity features: {e}")
    raise

log_memory_usage("After activity features")

In [None]:
# Step 6: Create lagged features
# Delegates to create_lagged_features, then counts how many of the expected
# "<feature>_lag_<k>" columns are actually present afterwards.
print("Step 6: Creating lagged features...")
try:
    processed_data = create_lagged_features(processed_data, LAG_FEATURES, LAG_PERIODS)

    print("✓ Lagged features created")
    print(f"  - Features: {LAG_FEATURES}")
    print(f"  - Lag periods: 1 to {LAG_PERIODS}")
    expected_total = len(LAG_FEATURES) * LAG_PERIODS
    print(f"  - Total lagged features: {expected_total}")

    # Build every expected column name (lags are 1-based) and keep those found.
    lagged_cols = []
    for feature in LAG_FEATURES:
        for lag in range(1, LAG_PERIODS + 1):
            lagged_cols.append(f'{feature}_lag_{lag}')
    existing_lagged = [name for name in lagged_cols if name in processed_data.columns]
    print(f"  - Successfully created: {len(existing_lagged)} lagged features")

except Exception as e:
    # Report which step failed, then let the error stop the notebook.
    print(f"❌ Error in lagged features: {e}")
    raise

log_memory_usage("After lagged features")

In [None]:
# Step 7: Final data cleaning and validation
# Replaces infinities, imputes missing numeric values, checks the configured
# feature list against the frame, then writes the processed CSV and a JSON
# data-quality report into PROCESSED_DATA_PATH.
print("Step 7: Final data cleaning and validation...")

try:
    # Turn +/-inf into NaN so the imputation below treats them as missing.
    processed_data = processed_data.replace([np.inf, -np.inf], np.nan)

    # Handle missing values using the utility function.
    # NOTE(review): every numeric column is handed to the imputer; that
    # presumably includes the `price_trend` target and the encoded
    # identifiers — confirm impute_missing_values is meant to touch those.
    numeric_columns = processed_data.select_dtypes(include=[np.number]).columns.tolist()
    processed_data = impute_missing_values(processed_data, numeric_columns)

    # Computed once and reused by both the console summary and the report.
    date_range = f"{processed_data['vote_date'].min()} to {processed_data['vote_date'].max()}"
    unique_daos = processed_data['Slug_Santiment'].nunique()

    # Final statistics
    print("\n=== Final Dataset Statistics ===")
    print(f"Shape: {processed_data.shape}")
    print(f"Date range: {date_range}")
    print(f"Number of DAOs: {unique_daos}")

    # Cast to a plain int: the pandas reduction yields a NumPy integer, which
    # json cannot serialise natively, so `default=str` would otherwise write
    # the count into the report as a string.
    final_missing = int(processed_data.isnull().sum().sum())
    print(f"Total missing values: {final_missing:,}")

    # Validate feature columns exist
    available_features = [col for col in FEATURE_COLUMNS if col in processed_data.columns]
    missing_features = [col for col in FEATURE_COLUMNS if col not in processed_data.columns]

    print(f"Available configured features: {len(available_features)}/{len(FEATURE_COLUMNS)}")
    if missing_features:
        print(f"Missing configured features: {missing_features}")

    # Create the output directory up front, as the encoder and plot cells do;
    # otherwise the first run on a clean checkout fails inside to_csv().
    os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

    # Save processed data
    output_path = os.path.join(PROCESSED_DATA_PATH, 'processed_dao_data.csv')
    processed_data.to_csv(output_path, index=False)
    print(f"✓ Processed data saved to: {output_path}")

    # Save data quality report
    quality_report = {
        'total_rows': len(processed_data),
        'total_columns': processed_data.shape[1],
        'date_range': date_range,
        'unique_daos': unique_daos,
        'missing_values': final_missing,
        'available_features': len(available_features),
        'missing_features': missing_features
    }

    import json
    with open(os.path.join(PROCESSED_DATA_PATH, 'data_quality_report.json'), 'w') as f:
        # default=str stays as a safety net for any other non-JSON-native value.
        json.dump(quality_report, f, indent=2, default=str)

except Exception as e:
    # Report which step failed, then let the error stop the notebook.
    print(f"❌ Error in final cleaning: {e}")
    raise

log_memory_usage("After final cleaning")

In [None]:
# Data visualization and validation
# Draws a 2x2 overview figure (target balance, market-cap distribution,
# monthly governance activity, feature correlations), saves it under
# RESULTS_PATH and prints the final processing summary. Plotting is
# best-effort: a failure here is reported but does not stop the notebook.
print("Creating data visualizations...")

try:
    # Create subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 1. Price trend distribution
    trend_counts = processed_data['price_trend'].value_counts()
    fall_count = trend_counts.get(0, 0)
    rise_count = trend_counts.get(1, 0)
    axes[0, 0].bar(['Fall', 'Rise'], [fall_count, rise_count],
                   color=['red', 'green'], alpha=0.7)
    axes[0, 0].set_title('Price Trend Distribution')
    axes[0, 0].set_ylabel('Frequency')

    # Add percentage labels.
    # `Series.values` is an array attribute, not a method: calling it raised
    # TypeError and sent every run into the except branch. Sum the Series.
    total = trend_counts.sum()
    if total > 0:  # nothing to label (and no valid share) without any labels
        for i, v in enumerate([fall_count, rise_count]):
            axes[0, 0].text(i, v + total*0.01, f'{v:,}\n({v/total:.1%})',
                            ha='center', va='bottom')

    # 2. Market cap distribution (log scale); log10 needs strictly positive values
    market_cap_clean = processed_data['marketcap_usd_cleaned'].dropna()
    market_cap_clean = market_cap_clean[market_cap_clean > 0]
    if len(market_cap_clean) > 0:
        axes[0, 1].hist(np.log10(market_cap_clean), bins=50, alpha=0.7, color='blue')
        axes[0, 1].set_title('Market Cap Distribution (Log10)')
        axes[0, 1].set_xlabel('Log10(Market Cap in Billions)')
        axes[0, 1].set_ylabel('Frequency')

    # 3. Governance activity over time
    if 'total_votes' in processed_data.columns:
        # Group by a standalone month key rather than adding a helper column:
        # the frame was already saved, and an extra column would inflate the
        # feature count reported in the summary below.
        vote_month = pd.to_datetime(processed_data['vote_date']).dt.to_period('M')
        monthly_gov = processed_data.groupby(vote_month)['total_votes'].sum()

        if len(monthly_gov) > 0:
            axes[1, 0].plot(range(len(monthly_gov)), monthly_gov.values, color='purple')
            axes[1, 0].set_title('Monthly Governance Activity')
            axes[1, 0].set_xlabel('Month (Index)')
            axes[1, 0].set_ylabel('Total Votes')
            axes[1, 0].tick_params(axis='x', rotation=45)

    # 4. Correlation heatmap of key features (only those present in the frame)
    key_features = ['marketcap_usd_cleaned', 'total_votes', 'unique_sv_total_1h',
                    'transaction_volume', 'price_trend']
    available_key_features = [f for f in key_features if f in processed_data.columns]

    if len(available_key_features) >= 2:
        corr_data = processed_data[available_key_features].corr()
        im = axes[1, 1].imshow(corr_data, cmap='coolwarm', vmin=-1, vmax=1)
        axes[1, 1].set_xticks(range(len(available_key_features)))
        axes[1, 1].set_yticks(range(len(available_key_features)))
        axes[1, 1].set_xticklabels(available_key_features, rotation=45, ha='right')
        axes[1, 1].set_yticklabels(available_key_features)
        axes[1, 1].set_title('Feature Correlation Matrix')

        # Add colorbar
        plt.colorbar(im, ax=axes[1, 1], fraction=0.046, pad=0.04)

    plt.tight_layout()

    # Save plots (create the results directory on first run)
    os.makedirs(RESULTS_PATH, exist_ok=True)
    plot_path = os.path.join(RESULTS_PATH, 'data_preprocessing_plots.png')
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"✓ Plots saved to: {plot_path}")
    print("✓ Data preprocessing completed successfully!")
    print(f"✓ Dataset ready for model training with {processed_data.shape[1]} features")

    # Final summary; `output_path` is set by the Step 7 cell.
    print(f"\n=== Processing Summary ===")
    print(f"✓ Processed {len(processed_data):,} records")
    print(f"✓ Created {processed_data.shape[1]} features")
    print(f"✓ Target variable: {processed_data['price_trend'].notna().sum():,} valid labels")
    print(f"✓ Data saved to: {output_path}")

except Exception as e:
    # Deliberately non-fatal: the processed data is already on disk.
    print(f"⚠️  Warning: Error in visualization: {e}")
    print("Data processing completed but visualization failed")

log_memory_usage("Final memory usage")