# Semi-Supervised Price Movement Predictor

**Date:** 18.03.25

This notebook implements a comprehensive semi-supervised learning framework for DAO price movement prediction using LSTM models with Monte Carlo attention mechanisms.

## Key Features
- **Monte Carlo Attention**: Advanced attention mechanism with uncertainty quantification
- **Semi-Supervised Learning**: Iterative pseudo-labeling with confidence thresholds
- **Multi-Model Architecture**: LSTM + XGBoost ensemble
- **Comprehensive Evaluation**: Performance tracking across iterations

## Methodology
1. **Baseline Training**: Train LSTM classifier on labeled data
2. **Pseudo-Labeling**: Generate high-confidence labels for unlabeled data
3. **Iterative Refinement**: Progressively improve model with augmented data
4. **Ensemble Prediction**: Fit an XGBoost classifier on the LSTM outputs to produce the final predictions

In [None]:
# Import required libraries
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '..', 'src'))

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
import matplotlib.pyplot as plt

# Import custom modules
from config import *
from utils.data_preprocessing import create_sequences, prepare_preprocessor
from utils.pseudo_labeling import (batch_pseudo_labeling_monte_carlo, 
                                  batch_pseudo_labeling_regular,
                                  split_labeled_unlabeled, update_training_data)

from utils.evaluation import (compute_classification_metrics, 
                             plot_metrics_over_iterations, 
                             save_results, log_memory_usage)

from models.monte_carlo_attention import SingleInputLSTMClassifier as MCLSTMClassifier
from models.lstm_classifier import SingleInputLSTMClassifier as RegularLSTMClassifier

# Confirm the imports above succeeded, then select the compute device:
# CUDA when PyTorch reports it as available, otherwise the CPU.
print("✓ All modules imported successfully")
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"✓ Using device: {device}")

In [None]:
# Load preprocessed data
print("Loading preprocessed data...")
_processed_csv = os.path.join(PROCESSED_DATA_PATH, "processed_dao_data.csv")
try:
    preprocessed_data = pd.read_csv(_processed_csv)
except FileNotFoundError:
    # Tell the user how to recover, then let the error stop the notebook
    print("❌ Processed data not found. Please run data preprocessing first.")
    raise
else:
    print(f"✓ Data loaded successfully: {preprocessed_data.shape}")

    # Optional: keep only 40 randomly chosen organizations for faster experimentation
    # NOTE(review): this draws from NumPy's global RNG and no seed is set in this
    # cell, so the selected groups may differ between runs unless seeded elsewhere.
    _all_slugs = preprocessed_data['Slug_Santiment'].unique()
    if len(_all_slugs) > 40:
        selected_groups = np.random.choice(_all_slugs, size=40, replace=False)
        _keep_rows = preprocessed_data['Slug_Santiment'].isin(selected_groups)
        preprocessed_data = preprocessed_data[_keep_rows]
        print(f"✓ Sampled to 40 groups: {preprocessed_data.shape}")

log_memory_usage("After data loading")

In [None]:
# Prepare features and preprocessing pipeline
print("Setting up feature preprocessing...")

# Base (non-lagged) feature columns
feature_columns_return = [
    'Slug_Santiment_Encoded',
    'vp_gini',
    'transaction_volume',
    'velocity',
    'unique_sv_total_1h',
    'dev_activity',
    'price_usd',
    'price_btc_usd',
    'dao_age',
    'marketSegment_Encoded',
    'Total TVL in USD',
    'SP500',
    'MA_14',
    'EMA_14',
    'RSI_14',
    '14_social_media_activity',
    '30_social_media_activity',
    '14_day_social_ewma',
    '30_day_social_ewma',
    '60_day_social_ewma',
    '90_day_social_ewma',
]

# Lagged features: for each entry of LAG_FEATURES, one column per lag
# 1..LAG_PERIODS, ordered feature by feature
lagged_feature_columns = [
    f'{base}_lag_{lag}'
    for base in LAG_FEATURES
    for lag in range(1, LAG_PERIODS + 1)
]
feature_columns_return += lagged_feature_columns

# Build the preprocessing pipeline for the full feature list
preprocessor = prepare_preprocessor(feature_columns_return, LOG_TRANSFORM_COLUMNS)

print("✓ Feature preprocessing configured")
print(f"  - Total features: {len(feature_columns_return)}")
print(f"  - Log transform features: {len(LOG_TRANSFORM_COLUMNS)}")
print(f"  - Lagged features: {len(lagged_feature_columns)}")

log_memory_usage("After feature setup")

In [None]:
# Data preprocessing and sequence creation
print("Processing data for LSTM training...")

# Order rows by organization and then by date; continue on a copy
preprocessed_data.sort_values(by=['Slug_Santiment_Encoded', 'vote_date'], inplace=True)
train_data = preprocessed_data.copy()

# Feature matrix: infinities become NaN, then every NaN is filled with 0
X_train = train_data[feature_columns_return]
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_train = X_train.fillna(0)

# Target vector: NaN / infinite entries are replaced with 0
y_train = train_data['price_trend'].values
y_train = np.where(~np.isfinite(y_train), 0, y_train)

# Fit the preprocessor and transform the features in one step
X_train_scaled = preprocessor.fit_transform(X_train)

# Column names for the transformed matrix.
# NOTE(review): this assumes the preprocessor emits log-transformed columns first,
# then the scaled ones, then the passthrough ones — confirm against prepare_preprocessor.
passthrough_features = ['Slug_Santiment_Encoded', 'marketSegment_Encoded', 'dao_age']
log_transformed_features = [f"log_{col}" for col in LOG_TRANSFORM_COLUMNS]
_not_scaled = LOG_TRANSFORM_COLUMNS + passthrough_features
scaled_features = [col for col in feature_columns_return if col not in _not_scaled]
columns = log_transformed_features + scaled_features + passthrough_features

# Rebuild a DataFrame on the original index and attach date and target
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=columns, index=train_data.index)
X_train_scaled_df['vote_date'] = train_data['vote_date']
X_train_scaled_df['price_trend'] = y_train

# One group per organization (keyed by Slug_Santiment) for sequence creation
grouped_train = X_train_scaled_df.groupby(train_data['Slug_Santiment'])

print("✓ Data preprocessing completed")
print(f"  - Training samples: {len(X_train_scaled_df)}")
print(f"  - Organizations: {len(grouped_train)}")

log_memory_usage("After data preprocessing")

In [None]:
# Create sequences and train/test split
print("Creating sequences and splitting data...")

# The test period is the last 30 days before the latest vote_date
train_data['vote_date'] = pd.to_datetime(train_data['vote_date'], errors='coerce')
max_date = train_data['vote_date'].max()
test_start_date = max_date - pd.Timedelta(days=30)

# Per-organization sequence containers
train_input_streams, train_target_streams = {}, {}
test_input_streams, test_target_streams = {}, {}

for name, group in grouped_train:
    X_seq, y_seq, dates = create_sequences(group, WINDOW_SIZE)
    dates = np.array(dates, dtype='datetime64')

    # Two explicit comparisons (not a negation) so the masks stay exactly
    # "before the cut-off" and "on or after the cut-off"
    test_mask = dates >= test_start_date
    train_mask = dates < test_start_date

    train_input_streams[name], train_target_streams[name] = X_seq[train_mask], y_seq[train_mask]
    test_input_streams[name], test_target_streams[name] = X_seq[test_mask], y_seq[test_mask]

# Feature dimension is read from axis 2 of the first organization's sequences
_first_org = list(train_input_streams)[0]
num_features = train_input_streams[_first_org].shape[2]
print("✓ Sequences created")
print(f"  - Feature dimensions: {num_features}")
print(f"  - Train organizations: {len(train_input_streams)}")
print(f"  - Test organizations: {len(test_input_streams)}")

log_memory_usage("After sequence creation")

## Experiment Configuration

Configure different experimental settings for comparison:
- **Baseline Model**: Regular LSTM without Monte Carlo attention
- **Monte Carlo Model**: LSTM with Monte Carlo attention mechanism
- **Semi-Supervised Variants**: Both model types trained with 90%, 80%, and 60% of the data labeled; the remainder is pseudo-labeled

In [None]:
# Experiment configuration
# The fully supervised baseline comes first. It is followed, for each labeled
# share (90%, 80%, 60%), by a regular and then a Monte Carlo pseudo-labeling variant.
EXPERIMENTS = [
    {
        'name': 'baseline_100',
        'labeled_percentage': 1.0,
        'model_type': 'regular',
        'use_monte_carlo': False,
        'description': 'Baseline with 100% labeled data'
    }
]
EXPERIMENTS += [
    {
        'name': f'semi_supervised_{percent}_{suffix}',
        'labeled_percentage': fraction,
        'model_type': model_type,
        'use_monte_carlo': model_type == 'monte_carlo',
        'description': f'Semi-supervised with {percent}% labeled data ({label} pseudo-labeling)'
    }
    for percent, fraction in ((90, 0.9), (80, 0.8), (60, 0.6))
    for suffix, model_type, label in (
        ('regular', 'regular', 'regular'),
        ('mc', 'monte_carlo', 'Monte Carlo'),
    )
]

# List the configured experiments
print("Experiment Configuration:")
for exp in EXPERIMENTS:
    print("  - {}: {}".format(exp['name'], exp['description']))

In [None]:
# Semi-supervised training function
def run_semi_supervised_experiment(experiment_config, train_input_streams, train_target_streams):
    """Run a single semi-supervised learning experiment.

    For each of NUM_ITERATIONS rounds and each organization, the labeled part
    of the data trains one shared LSTM under a 3-fold TimeSeriesSplit. On every
    fold an XGBoost classifier is fitted on the LSTM outputs, and its
    validation predictions are pooled into per-iteration metrics. When the
    experiment keeps less than 100% of the labels, the unlabeled part is
    pseudo-labeled and merged back in for the next round.

    Args:
        experiment_config: Dict with the keys 'name', 'description',
            'model_type' ('monte_carlo' selects MCLSTMClassifier, anything
            else RegularLSTMClassifier), 'labeled_percentage' and
            'use_monte_carlo'.
        train_input_streams: Dict mapping organization name to its input
            sequences. Entries are reassigned in place after pseudo-labeling,
            so pass a copy if the original mapping must stay untouched.
        train_target_streams: Dict mapping organization name to its targets;
            reassigned in place together with train_input_streams.

    Returns:
        Tuple (results_df, model): a DataFrame with one row of metrics per
        iteration that produced validation predictions, and the trained model.
        The metrics are also written under RESULTS_PATH and the model weights
        under MODEL_PATH, both named after experiment_config['name'].
    """
    print(f"\n{'='*60}")
    print(f"Running Experiment: {experiment_config['name']}")
    print(f"Description: {experiment_config['description']}")
    print(f"{'='*60}")

    # Initialize model
    if experiment_config['model_type'] == 'monte_carlo':
        model = MCLSTMClassifier(input_dim=num_features).to(device)
    else:
        model = RegularLSTMClassifier(input_dim=num_features).to(device)

    # Initialize metrics tracking
    iteration_metrics = []
    criterion = nn.BCELoss()

    # Run iterative training
    for iteration in range(NUM_ITERATIONS):
        print(f"\n=== Iteration {iteration + 1}/{NUM_ITERATIONS} ===")

        all_y_true, all_y_pred, all_y_prob = [], [], []

        for name in train_input_streams.keys():
            X = train_input_streams[name]
            y = train_target_streams[name]

            # Skip organizations with too few sequences
            if len(X) < WINDOW_SIZE + 1:
                continue

            # Split into labeled and unlabeled
            X_labeled, y_labeled, X_unlabeled, _ = split_labeled_unlabeled(
                X, y, experiment_config['labeled_percentage']
            )

            # Time series cross-validation
            tscv = TimeSeriesSplit(n_splits=3)
            for split_id, (train_idx, val_idx) in enumerate(tscv.split(X_labeled)):
                print(f"Training {name}, split {split_id + 1}")

                X_train_fold, X_val_fold = X_labeled[train_idx], X_labeled[val_idx]
                y_train_fold, y_val_fold = y_labeled[train_idx], y_labeled[val_idx]

                # Build the fold tensors once; they do not change between epochs
                X_train_tensor = torch.tensor(X_train_fold, dtype=torch.float32)
                y_train_tensor = torch.tensor(y_train_fold, dtype=torch.float32)
                X_val_tensor = torch.tensor(X_val_fold, dtype=torch.float32).to(device)
                y_val_tensor = torch.tensor(y_val_fold, dtype=torch.float32).to(device).unsqueeze(1)

                train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
                train_loader = torch.utils.data.DataLoader(
                    train_dataset, batch_size=BATCH_SIZE, shuffle=True
                )

                # A fresh optimizer and scheduler per fold; the model itself is shared
                optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
                scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min', patience=PATIENCE, factor=0.1, verbose=True
                )

                # Training loop
                for epoch in range(NUM_EPOCHS):
                    # Re-enter training mode every epoch: the validation step
                    # below switches to eval mode, and without this call all
                    # epochs after the first would train in eval mode.
                    model.train()
                    epoch_loss = 0.0
                    for X_batch, y_batch in train_loader:
                        X_batch = X_batch.to(device)
                        y_batch = y_batch.to(device).unsqueeze(1)

                        optimizer.zero_grad()
                        output = model(X_batch)
                        loss = criterion(output, y_batch)
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                        optimizer.step()
                        epoch_loss += loss.item()

                    # Validation
                    model.eval()
                    with torch.no_grad():
                        val_loss = criterion(model(X_val_tensor), y_val_tensor).item()

                    scheduler.step(val_loss)

                    if epoch % 10 == 0:
                        print(f"Epoch {epoch}, Train Loss: {epoch_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}")

                # XGBoost ensemble: the LSTM outputs are the only input features
                model.eval()
                with torch.no_grad():
                    train_features = model(X_train_tensor.to(device)).cpu().numpy()
                    val_features = model(X_val_tensor).cpu().numpy()

                gbm = xgb.XGBClassifier(**XGBOOST_PARAMS)
                gbm.fit(train_features, y_train_fold)
                val_prob = gbm.predict_proba(val_features)[:, 1]
                val_pred = (val_prob > 0.5).astype(int)

                all_y_true.extend(y_val_fold)
                all_y_pred.extend(val_pred)
                all_y_prob.extend(val_prob)

            # Pseudo-labeling for next iteration
            if len(X_unlabeled) > 0 and experiment_config['labeled_percentage'] < 1.0:
                if experiment_config['use_monte_carlo']:
                    pseudo_labels = batch_pseudo_labeling_monte_carlo(
                        model, X_unlabeled, BATCH_SIZE, CONFIDENCE_THRESHOLD,
                        VARIANCE_THRESHOLD, NUM_MC_SAMPLES, device
                    )
                else:
                    pseudo_labels = batch_pseudo_labeling_regular(
                        model, X_unlabeled, BATCH_SIZE, CONFIDENCE_THRESHOLD, device
                    )

                # Update training data (replaces this organization's streams in place)
                train_input_streams[name], train_target_streams[name] = update_training_data(
                    X_labeled, y_labeled, X_unlabeled, pseudo_labels
                )

        # Compute metrics for this iteration (skipped when nothing was validated)
        if all_y_true:
            metrics = compute_classification_metrics(
                np.array(all_y_true), np.array(all_y_pred), np.array(all_y_prob)
            )
            metrics['iteration'] = iteration + 1
            iteration_metrics.append(metrics)

            print(f"\nIteration {iteration + 1} Results:")
            print(f"Accuracy: {metrics['Accuracy']:.4f}")
            print(f"ROC AUC: {metrics['ROC_AUC']:.4f}")
            print(f"PR AUC: {metrics['PR_AUC']:.4f}")

        log_memory_usage(f"After iteration {iteration + 1}")

    # Save results
    results_df = pd.DataFrame(iteration_metrics)
    results_path = os.path.join(RESULTS_PATH, f"{experiment_config['name']}_results.csv")
    save_results(results_df, results_path)

    # Save model; create the target directory first so a missing folder does
    # not discard the trained weights at the very end of the run
    if MODEL_PATH:
        os.makedirs(MODEL_PATH, exist_ok=True)
    model_path = os.path.join(MODEL_PATH, f"{experiment_config['name']}_model.pth")
    torch.save(model.state_dict(), model_path)

    return results_df, model

print("✓ Semi-supervised training function loaded")

In [None]:
# Run all experiments
# Results and trained models are collected per experiment name; a failing
# experiment is reported and skipped so the remaining ones still run.
all_results = {}
all_models = {}

for experiment in EXPERIMENTS:
    try:
        # Shallow copies: the training function reassigns entries of these dicts
        results_df, model = run_semi_supervised_experiment(
            experiment,
            train_input_streams.copy(),
            train_target_streams.copy(),
        )
    except Exception as e:
        print(f"❌ Error in experiment {experiment['name']}: {e}")
    else:
        all_results[experiment['name']] = results_df
        all_models[experiment['name']] = model

        print(f"✓ Completed experiment: {experiment['name']}")

_banner = "=" * 60
print("\n" + _banner)
print("ALL EXPERIMENTS COMPLETED")
print(_banner)

In [None]:
# Comparative analysis and visualization
# Builds a table of each experiment's final-iteration metrics, saves it, and
# draws (1) a bar chart per metric and (2) one accuracy / ROC AUC curve per
# experiment over the iterations.
print("Generating comparative analysis...")

try:
    # Combine all results for comparison
    comparison_data = []

    for exp_name, results_df in all_results.items():
        if not results_df.empty:
            # Get final iteration metrics
            final_metrics = results_df.iloc[-1].to_dict()
            final_metrics['Experiment'] = exp_name
            comparison_data.append(final_metrics)

    if comparison_data:
        comparison_df = pd.DataFrame(comparison_data)

        # Save comparison results
        comparison_path = os.path.join(RESULTS_PATH, 'experiment_comparison.csv')
        save_results(comparison_df, comparison_path)

        # Create comparison plots (one panel per metric; missing metrics stay blank)
        metrics_to_plot = ['Accuracy', 'ROC_AUC', 'PR_AUC', 'F1_Score']

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        axes = axes.ravel()

        for i, metric in enumerate(metrics_to_plot):
            if metric in comparison_df.columns:
                bars = axes[i].bar(comparison_df['Experiment'], comparison_df[metric])
                axes[i].set_title(f'{metric} Comparison')
                axes[i].set_ylabel(metric)
                axes[i].tick_params(axis='x', rotation=45)

                # Add value labels on bars
                for bar in bars:
                    height = bar.get_height()
                    axes[i].text(bar.get_x() + bar.get_width()/2., height,
                               f'{height:.3f}', ha='center', va='bottom')

        plt.tight_layout()
        plt.savefig(os.path.join(RESULTS_PATH, 'experiment_comparison.png'),
                   dpi=300, bbox_inches='tight')
        plt.show()

        # Plot iteration evolution for each experiment.
        # The grid is sized from the number of experiments that can actually be
        # plotted: a fixed 2x3 grid has only six slots, so a seventh experiment
        # would request an out-of-range subplot index and abort this whole cell.
        plottable_results = [
            (plot_name, plot_df)
            for plot_name, plot_df in all_results.items()
            if not plot_df.empty and 'iteration' in plot_df.columns
        ]
        n_cols = 3
        # Ceiling division; never fewer than the two rows of the original layout
        n_rows = max(2, -(-len(plottable_results) // n_cols))

        plt.figure(figsize=(15, 5 * n_rows))

        for i, (exp_name, results_df) in enumerate(plottable_results):
            plt.subplot(n_rows, n_cols, i + 1)
            plt.plot(results_df['iteration'], results_df['Accuracy'], 'o-', label='Accuracy')
            plt.plot(results_df['iteration'], results_df['ROC_AUC'], 's-', label='ROC AUC')
            plt.xlabel('Iteration')
            plt.ylabel('Metric Value')
            plt.title(f'{exp_name}')
            plt.legend()
            plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(os.path.join(RESULTS_PATH, 'iteration_evolution.png'),
                   dpi=300, bbox_inches='tight')
        plt.show()

        print(f"✓ Comparative analysis completed")
        print(f"  - Results saved to: {comparison_path}")
        print(f"  - Plots saved to: {RESULTS_PATH}")

        # Print summary table, restricted to the columns that are present so a
        # missing metric does not raise KeyError (same guard as the bar charts)
        print("\n" + "="*80)
        print("FINAL RESULTS SUMMARY")
        print("="*80)
        summary_cols = ['Experiment', 'Accuracy', 'ROC_AUC', 'PR_AUC', 'F1_Score']
        available_cols = [col for col in summary_cols if col in comparison_df.columns]
        print(comparison_df[available_cols].to_string(index=False, float_format='%.4f'))

    else:
        print("❌ No successful experiments to compare")

except Exception as e:
    # Best-effort cell: report the failure instead of stopping the notebook
    print(f"❌ Error in comparative analysis: {e}")

log_memory_usage("Final memory usage")

In [None]:
# Test set evaluation
# Scores every trained model on the held-out sequences of all organizations
# and writes one row of metrics per experiment to test_results.csv.
print("\nEvaluating on test set...")

try:
    test_results = {}

    for exp_name, model in all_models.items():
        print(f"\nEvaluating {exp_name} on test set...")

        all_test_true, all_test_pred, all_test_prob = [], [], []

        # Inference mode is a property of the model, so set it once per model
        # rather than once per organization
        model.eval()

        for name in test_input_streams.keys():
            X_test = test_input_streams[name]
            y_test = test_target_streams[name]

            # Organizations without sequences in the test period are skipped
            if len(X_test) == 0:
                continue

            with torch.no_grad():
                X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
                test_features = model(X_test_tensor).cpu().numpy()

            # Use a simple threshold for final prediction (could use saved XGBoost)
            # NOTE(review): the per-iteration training metrics come from the XGBoost
            # stage, while these use the raw LSTM output, so the two sets of
            # numbers are not directly comparable.
            test_pred = (test_features > 0.5).astype(int).flatten()
            test_prob = test_features.flatten()

            all_test_true.extend(y_test)
            all_test_pred.extend(test_pred)
            all_test_prob.extend(test_prob)

        if all_test_true:
            test_metrics = compute_classification_metrics(
                np.array(all_test_true), np.array(all_test_pred), np.array(all_test_prob)
            )
            test_results[exp_name] = test_metrics

            print(f"Test Accuracy: {test_metrics['Accuracy']:.4f}")
            print(f"Test ROC AUC: {test_metrics['ROC_AUC']:.4f}")

    # Save test results (experiments as rows, metrics as columns)
    if test_results:
        test_df = pd.DataFrame(test_results).T
        test_df.to_csv(os.path.join(RESULTS_PATH, 'test_results.csv'))

        print("\n" + "="*60)
        print("TEST SET RESULTS")
        print("="*60)
        # Show only the metrics that are present so a missing column cannot raise KeyError
        report_cols = ['Accuracy', 'ROC_AUC', 'PR_AUC', 'F1_Score']
        available_cols = [col for col in report_cols if col in test_df.columns]
        print(test_df[available_cols].to_string(float_format='%.4f'))

except Exception as e:
    # Best-effort cell: report the failure instead of stopping the notebook
    print(f"❌ Error in test evaluation: {e}")

print("\nSemi-supervised learning analysis completed!")
print(f"All results saved to: {RESULTS_PATH}")
print(f"All models saved to: {MODEL_PATH}")