In [4]:
import pandas as pd
import numpy as np

# ============================================================================
# STEP 1: CREATE COMPLETE MERGED DATASET (2020-2025)
# ============================================================================

def create_full_merged_dataset():
    """
    Build the paragraph-level train (2020-2024) and test (2025) datasets.

    Reads 'fed_sentiment_results.csv', 'sp500_features.csv' and
    'sp500_2025_buckets_and_onehot.csv' from the working directory, joins
    sentiment rows to market rows on calendar month, and labels each statement
    with the ret_regime found at the *next* statement's market DATE.

    Derived columns:
        target_regime  - ret_regime at the following statement's DATE
        target_binary  - 1 if target_regime > 0, else 0
        target_5class  - target_regime (-2..2) remapped to 0..4

    Rows that end up without a target (for example the last statement of each
    split, which has no successor) are dropped. Writes
    'merged_train_2020_2024.csv' and 'merged_test_2025.csv', prints row counts
    and class distributions, and returns None.
    """

    # Load sentiment
    sentiment = pd.read_csv('fed_sentiment_results.csv')

    # Parse dates
    # Only the text before the first '_' is used. It must start with one of the
    # month spellings below; the remainder is parsed as the year.
    # Returns a first-of-month Timestamp, or None when no month prefix matches.
    # NOTE(review): presumably labels look like 'Jan2020_...' - confirm against the CSV.
    def parse_date(s):
        s = s.split('_')[0]
        months = {'Jan': 1, 'Feb': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
                  'July': 7, 'Aug': 8, 'Sept': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
        for m, num in months.items():
            if s.startswith(m):
                year = int(s.replace(m, ''))
                return pd.Timestamp(year=year, month=num, day=1)
        return None

    sentiment['date'] = sentiment['statement_date'].apply(parse_date)
    # Monthly period used as the join key against the market data
    sentiment['year_month'] = sentiment['date'].dt.to_period('M')

    # Load market features (rows before 2020-01-01 are discarded)
    market = pd.read_csv('sp500_features.csv')
    market['DATE'] = pd.to_datetime(market['DATE'])
    market = market[market['DATE'] >= '2020-01-01'].copy()
    market['year_month'] = market['DATE'].dt.to_period('M')

    # Load 2025 buckets data (rows before 2025-01-01 are discarded)
    market_2025 = pd.read_csv('sp500_2025_buckets_and_onehot.csv')
    market_2025['Date'] = pd.to_datetime(market_2025['Date'])
    market_2025 = market_2025[market_2025['Date'] >= '2025-01-01'].copy()

    # Map 2025 bucket to ret_regime format
    # This is a pass-through: missing values become NaN, anything else is returned unchanged.
    def map_bucket_to_regime(bucket):
        if pd.isna(bucket):
            return np.nan
        return bucket  # Already in -2, -1, 0, 1, 2 format

    market_2025['ret_regime'] = market_2025['bucket'].apply(map_bucket_to_regime)
    market_2025['year_month'] = market_2025['Date'].dt.to_period('M')

    # Rename Date to DATE for consistency
    market_2025 = market_2025.rename(columns={'Date': 'DATE'})

    # For 2025, we only have limited features, so we'll use what's available
    # Keep only common columns between datasets
    common_cols = ['DATE', 'year_month', 'ret_regime']

    market_2025_minimal = market_2025[common_cols].copy()
    market_full = market[['DATE', 'year_month', 'ret_regime', 'ret_1m', 'vol_3m', 'vol_6m',
                          'ret_1m_z', 'vol_3m_z', 'vol_6m_z',
                          'ret_high_negative', 'ret_negative', 'ret_flat',
                          'ret_positive', 'ret_high_positive']].copy()

    # Merge sentiment with market (2020-2024 with full features)
    # Inner join: sentiment rows whose month has no market row are dropped.
    merged_2020_2024 = pd.merge(
        sentiment[sentiment['date'] < '2025-01-01'],
        market_full,
        on='year_month',
        how='inner'
    )

    # Merge sentiment with 2025 (limited features)
    merged_2025 = pd.merge(
        sentiment[sentiment['date'] >= '2025-01-01'],
        market_2025_minimal,
        on='year_month',
        how='inner'
    )

    print("2020-2024 merged:", len(merged_2020_2024), "paragraphs")
    print("2025 merged:", len(merged_2025), "paragraphs")

    # NOTE: the 2025 frame carries only DATE / year_month / ret_regime from the
    # market side; the missing market feature columns are neither filled nor
    # added here.

    # Get next month's regime as target
    # Sort by date
    merged_2020_2024 = merged_2020_2024.sort_values(['DATE', 'statement_date', 'paragraph_num']).reset_index(drop=True)
    merged_2025 = merged_2025.sort_values(['DATE', 'statement_date', 'paragraph_num']).reset_index(drop=True)

    # For each statement, get next statement's regime
    # Adds a 'target_regime' column to df in place and returns the same frame.
    def get_next_regime(df, market_all):
        # One row per statement, ordered by its market DATE
        unique_stmts = df[['statement_date', 'DATE', 'year_month']].drop_duplicates().sort_values('DATE').reset_index(drop=True)

        stmt_to_next_regime = {}

        # Stops one short of the end: the last statement has no successor and
        # therefore receives no entry (its target_regime will be NaN).
        for i in range(len(unique_stmts) - 1):
            current_stmt = unique_stmts.iloc[i]
            next_stmt = unique_stmts.iloc[i + 1]

            # Get next month's regime from market data
            # Exact match on the next statement's DATE
            next_month_data = market_all[market_all['DATE'] == next_stmt['DATE']]

            if len(next_month_data) > 0:
                next_regime = next_month_data.iloc[0]['ret_regime']
                stmt_to_next_regime[current_stmt['statement_date']] = next_regime

        # Statements absent from the mapping get NaN
        df['target_regime'] = df['statement_date'].map(stmt_to_next_regime)
        return df

    # Combine market data
    market_all = pd.concat([market[['DATE', 'ret_regime']], market_2025[['DATE', 'ret_regime']]], ignore_index=True)

    # Each split is labelled separately, so the last 2020-2024 statement is not
    # linked to the first 2025 statement.
    merged_2020_2024 = get_next_regime(merged_2020_2024, market_all)
    merged_2025 = get_next_regime(merged_2025, market_all)

    # Create binary target from ret_regime
    # ret_regime: -2,-1,0,1,2 → binary: 0 (down/flat: -2,-1,0), 1 (up: 1,2)
    # A NaN target compares as False and yields 0.0 here; those rows are
    # removed below by the target_5class filter.
    merged_2020_2024['target_binary'] = (merged_2020_2024['target_regime'] > 0).astype(float)
    merged_2025['target_binary'] = (merged_2025['target_regime'] > 0).astype(float)

    # Map ret_regime to 5-class (0,1,2,3,4)
    # Values outside the map (and NaN) become NaN.
    def map_regime_to_class(regime):
        if pd.isna(regime):
            return np.nan
        regime_map = {-2.0: 0, -1.0: 1, 0.0: 2, 1.0: 3, 2.0: 4}
        return regime_map.get(regime, np.nan)

    merged_2020_2024['target_5class'] = merged_2020_2024['target_regime'].apply(map_regime_to_class)
    merged_2025['target_5class'] = merged_2025['target_regime'].apply(map_regime_to_class)

    # Remove rows without target
    merged_2020_2024 = merged_2020_2024[merged_2020_2024['target_5class'].notna()].copy()
    merged_2025 = merged_2025[merged_2025['target_5class'].notna()].copy()

    # With the NaN rows gone, both targets can be stored as integers
    merged_2020_2024['target_5class'] = merged_2020_2024['target_5class'].astype(int)
    merged_2025['target_5class'] = merged_2025['target_5class'].astype(int)
    merged_2020_2024['target_binary'] = merged_2020_2024['target_binary'].astype(int)
    merged_2025['target_binary'] = merged_2025['target_binary'].astype(int)

    # Save
    merged_2020_2024.to_csv('merged_train_2020_2024.csv', index=False)
    merged_2025.to_csv('merged_test_2025.csv', index=False)

    print(f"\n✅ Saved:")
    print(f"   Train (2020-2024): {len(merged_2020_2024)} paragraphs")
    print(f"   Test (2025): {len(merged_2025)} paragraphs")

    # Class i corresponds to regime [-2,-1,0,1,2][i]
    print(f"\nTrain class distribution (5-class):")
    for i in range(5):
        count = (merged_2020_2024['target_5class'] == i).sum()
        print(f"  Class {i} (regime {[-2,-1,0,1,2][i]}): {count} ({100*count/len(merged_2020_2024):.1f}%)")

    print(f"\nTest class distribution (5-class):")
    for i in range(5):
        count = (merged_2025['target_5class'] == i).sum()
        print(f"  Class {i} (regime {[-2,-1,0,1,2][i]}): {count} ({100*count/len(merged_2025):.1f}%)")

    print(f"\nTrain class distribution (binary):")
    for i in range(2):
        count = (merged_2020_2024['target_binary'] == i).sum()
        print(f"  Class {i}: {count} ({100*count/len(merged_2020_2024):.1f}%)")

    print(f"\nTest class distribution (binary):")
    for i in range(2):
        count = (merged_2025['target_binary'] == i).sum()
        print(f"  Class {i}: {count} ({100*count/len(merged_2025):.1f}%)")

# Entry point: build and save the merged train/test CSVs when run as a script.
if __name__ == '__main__':
    create_full_merged_dataset()

2020-2024 merged: 390 paragraphs
2025 merged: 59 paragraphs

✅ Saved:
   Train (2020-2024): 380 paragraphs
   Test (2025): 50 paragraphs

Train class distribution (5-class):
  Class 0 (regime -2): 92 (24.2%)
  Class 1 (regime -1): 39 (10.3%)
  Class 2 (regime 0): 18 (4.7%)
  Class 3 (regime 1): 133 (35.0%)
  Class 4 (regime 2): 98 (25.8%)

Test class distribution (5-class):
  Class 0 (regime -2): 11 (22.0%)
  Class 1 (regime -1): 0 (0.0%)
  Class 2 (regime 0): 0 (0.0%)
  Class 3 (regime 1): 0 (0.0%)
  Class 4 (regime 2): 39 (78.0%)

Train class distribution (binary):
  Class 0: 149 (39.2%)
  Class 1: 231 (60.8%)

Test class distribution (binary):
  Class 0: 11 (22.0%)
  Class 1: 39 (78.0%)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# Seed the PyTorch and NumPy global RNGs with a fixed value for repeatable runs.
torch.manual_seed(42)
np.random.seed(42)

# ============================================================================
# DATASET
# ============================================================================

class FedDataset(Dataset):
    """Map-style dataset pairing a float feature tensor with a long label tensor."""

    def __init__(self, X, y):
        # Convert once up front so indexing later is a plain tensor lookup.
        features, targets = torch.FloatTensor(X), torch.LongTensor(y)
        self.X = features
        self.y = targets

    def __len__(self):
        # Number of samples is the size of the leading feature dimension.
        return self.X.size(0)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample

# ============================================================================
# MODELS
# ============================================================================

class MLP(nn.Module):
    """Feed-forward classifier.

    Two batch-normalised hidden layers with dropout (128 and 64 units), one
    plain hidden layer (32 units), then a linear layer emitting class logits.
    """

    def __init__(self, input_dim, n_classes):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(128, 64), nn.BatchNorm1d(64), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, n_classes),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.network(x)
        return logits

class CNN1D(nn.Module):
    """1-D convolutional classifier that treats the feature vector as a one-channel sequence.

    `input_dim` is accepted for a uniform constructor signature but is not
    used: adaptive max pooling collapses whatever length arrives.
    """

    def __init__(self, input_dim, n_classes):
        super().__init__()
        conv_stack = [
            nn.Conv1d(1, 64, kernel_size=3, padding=1), nn.ReLU(), nn.BatchNorm1d(64),
            nn.Conv1d(64, 32, kernel_size=3, padding=1), nn.ReLU(), nn.BatchNorm1d(32),
            nn.AdaptiveMaxPool1d(1),
        ]
        head = [nn.Linear(32, 32), nn.ReLU(), nn.Dropout(0.3), nn.Linear(32, n_classes)]
        self.conv_layers = nn.Sequential(*conv_stack)
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        # Insert a channel axis, convolve, then drop the pooled length-1 axis.
        pooled = self.conv_layers(x.unsqueeze(1)).squeeze(-1)
        return self.fc(pooled)

class AttentionMLP(nn.Module):
    """MLP whose 64-d embedding is scaled by a learned per-sample gate.

    Args:
        input_dim: number of input features.
        n_classes: number of output logits.

    forward() takes a (batch, input_dim) tensor and returns (batch, n_classes)
    logits.
    """

    def __init__(self, input_dim, n_classes):
        super(AttentionMLP, self).__init__()
        self.embedding = nn.Sequential(nn.Linear(input_dim, 64), nn.ReLU())
        self.attention = nn.Sequential(nn.Linear(64, 32), nn.Tanh(), nn.Linear(32, 1))
        self.classifier = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Dropout(0.3), nn.Linear(32, n_classes))

    def forward(self, x):
        embedded = self.embedding(x)
        # The attention head emits one score per sample. Normalising those
        # scores with softmax over dim=0 (the batch axis) made every sample's
        # output depend on which other samples shared its batch. A sigmoid
        # gate keeps each sample independent and leaves parameter shapes
        # untouched.
        attention_weights = torch.sigmoid(self.attention(embedded))
        weighted = embedded * attention_weights
        return self.classifier(weighted)

class ResNetMLP(nn.Module):
    """MLP with one residual block (64 -> 64) followed by a plain 64 -> 32 block."""

    def __init__(self, input_dim, n_classes):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, 64)
        self.block1 = nn.Sequential(
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 64),
            nn.ReLU(),
        )
        self.block2 = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 32),
            nn.ReLU(),
        )
        self.fc = nn.Linear(32, n_classes)

    def forward(self, x):
        hidden = self.input_proj(x)
        # Skip connection around the first block only; block2 changes width.
        hidden = hidden + self.block1(hidden)
        return self.fc(self.block2(hidden))

class AutoencoderClassifier(nn.Module):
    """Encoder + classifier head; a decoder is built but not used by forward()."""

    def __init__(self, input_dim, n_classes, latent_dim=8):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, latent_dim),
        )
        # Mirror of the encoder; registered as a submodule but never called in forward().
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, input_dim),
        )
        self.classifier = nn.Sequential(
            nn.Linear(latent_dim, 16),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(16, n_classes),
        )

    def forward(self, x):
        latent = self.encoder(x)
        logits = self.classifier(latent)
        return logits

# ============================================================================
# LOAD DATA
# ============================================================================

def load_data(task='binary'):
    """
    Load train (2020-2024) and test (2025) data and prepare it for training.

    Reads 'merged_train_2020_2024.csv' and 'merged_test_2025.csv', holds out a
    stratified 20% of the training rows for validation, standardises the
    features with a scaler fitted on the training split only, and pickles that
    scaler to '<task>_scaler.pkl'.

    Args:
        task: 'binary' selects the 'target_binary' column (2 classes); any
            other value selects 'target_5class' (5 classes).

    Returns:
        (train_dataset, val_dataset, test_dataset, n_classes, class_weights,
        n_features, test_df). class_weights is a float tensor of normalised
        inverse class frequencies computed on the training split.
    """

    train_df = pd.read_csv('merged_train_2020_2024.csv')
    test_df = pd.read_csv('merged_test_2025.csv')

    # Features - only sentiment (since 2025 doesn't have market features)
    feature_cols = ['positive_score', 'negative_score', 'neutral_score', 'paragraph_num']

    target_col = 'target_binary' if task == 'binary' else 'target_5class'
    n_classes = 2 if task == 'binary' else 5

    print(f"\n{'='*60}")
    print(f"LOADING DATA - TASK: {task}")
    print(f"{'='*60}")
    print(f"Train: {len(train_df)} paragraphs (2020-2024)")
    print(f"Test: {len(test_df)} paragraphs (2025)")
    print(f"Features: {feature_cols}")

    X_train = train_df[feature_cols].values
    y_train = train_df[target_col].values
    X_test = test_df[feature_cols].values
    y_test = test_df[target_col].values

    # Split train into train/val
    # NOTE(review): the split is per paragraph, so paragraphs of one statement
    # (which share a target) can land on both sides - consider a grouped split.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    # Scale (fit on the training split only, then apply to val/test)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    with open(f'{task}_scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)

    # Datasets
    train_dataset = FedDataset(X_train, y_train)
    val_dataset = FedDataset(X_val, y_val)
    test_dataset = FedDataset(X_test, y_test)

    # Class weights: normalised inverse frequency.
    # minlength pads the counts to n_classes so the weight vector always
    # matches the model's output size; classes with no training samples get
    # weight 0 instead of a division by zero.
    class_counts = np.bincount(y_train, minlength=n_classes).astype(np.float64)
    class_weights = np.zeros_like(class_counts)
    present = class_counts > 0
    class_weights[present] = 1.0 / class_counts[present]
    class_weights = class_weights / class_weights.sum()
    class_weights = torch.FloatTensor(class_weights)

    print(f"\nTrain: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

    return train_dataset, val_dataset, test_dataset, n_classes, class_weights, len(feature_cols), test_df

# ============================================================================
# TRAIN
# ============================================================================

def train_model(model, train_loader, val_loader, criterion, device, epochs=100, lr=0.001, model_name='Model', patience=20):
    """
    Train `model` with Adam and keep the checkpoint with the best validation accuracy.

    Args:
        model: network to train; moved to `device`.
        train_loader, val_loader: iterables of (features, labels) batches that
            also support len().
        criterion: loss taking (logits, labels).
        device: torch device used for the model and batches.
        epochs: maximum number of epochs.
        lr: Adam learning rate; halved by ReduceLROnPlateau after 10 epochs
            without validation-loss improvement.
        model_name: prefix of the checkpoint file '<model_name>_best.pth'.
        patience: epochs without validation-accuracy improvement before
            stopping early.

    Returns:
        Best validation accuracy observed (0.0 if no epoch ran).
    """
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)

    # Start below any attainable accuracy so the first epoch always writes a
    # checkpoint. Starting at 0.0 with a strict '>' meant a model stuck at 0%
    # validation accuracy never saved one, and the caller then loaded a
    # missing or stale file.
    best_val_acc = -1.0
    patience_counter = 0

    for epoch in range(epochs):
        # Train
        model.train()
        train_correct, train_total = 0, 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            train_correct += (outputs.argmax(dim=1) == y_batch).sum().item()
            train_total += y_batch.size(0)

        train_acc = train_correct / max(train_total, 1)

        # Val
        model.eval()
        val_loss = 0.0
        val_correct, val_total = 0, 0

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)

                val_loss += loss.item()
                val_correct += (outputs.argmax(dim=1) == y_batch).sum().item()
                val_total += y_batch.size(0)

        # max(..., 1) guards against an empty validation loader
        val_loss /= max(len(val_loader), 1)
        val_acc = val_correct / max(val_total, 1)

        # The LR schedule follows validation loss; checkpointing follows accuracy.
        scheduler.step(val_loss)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), f'{model_name}_best.pth')
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f'Early stop at epoch {epoch+1}')
            break

        if (epoch + 1) % 20 == 0:
            print(f'Epoch {epoch+1}: Train={train_acc:.4f}, Val={val_acc:.4f}')

    best_val_acc = max(best_val_acc, 0.0)
    print(f'Best Val Acc: {best_val_acc:.4f}')
    return best_val_acc

# ============================================================================
# TEST
# ============================================================================

def test_model(model, test_loader, device):
    """Run `model` in eval mode over `test_loader`.

    Returns (predictions, labels, accuracy): the first two are flat lists
    collected batch by batch, the third comes from accuracy_score.
    """
    model.eval()
    predicted, actual = [], []

    with torch.no_grad():
        for features, targets in test_loader:
            features = features.to(device)
            targets = targets.to(device)
            # Index of the largest logit is the predicted class.
            batch_pred = model(features).max(dim=1).indices
            predicted.extend(batch_pred.cpu().numpy())
            actual.extend(targets.cpu().numpy())

    return predicted, actual, accuracy_score(actual, predicted)

# ============================================================================
# MAIN
# ============================================================================

def main():
    """
    Train every model on 2020-2024 data and evaluate it on 2025.

    For each architecture: train, reload the best checkpoint, report test
    accuracy and a classification report, and save a confusion-matrix PNG.
    Afterwards prints a summary with a majority-class baseline, writes
    '<TASK>_2025_predictions.csv', and offers the artefacts for download when
    running inside Google Colab.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Device: {device}')

    TASK = 'binary'  # Change to '5class' for 5-class
    target_col = 'target_binary' if TASK == 'binary' else 'target_5class'

    # Load
    train_dataset, val_dataset, test_dataset, n_classes, class_weights, input_dim, test_df = load_data(TASK)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))

    models_dict = {
        'MLP': MLP,
        'CNN1D': CNN1D,
        'AttentionMLP': AttentionMLP,
        'ResNetMLP': ResNetMLP,
        'AutoencoderClassifier': AutoencoderClassifier
    }

    results = {}

    # Train and test each model
    for model_name, model_class in models_dict.items():
        print(f'\n{"="*60}')
        print(f'TRAINING: {model_name}')
        print(f'{"="*60}')

        model = model_class(input_dim, n_classes)
        best_val = train_model(model, train_loader, val_loader, criterion, device,
                               epochs=100, lr=0.001, model_name=f'{TASK}_{model_name}')

        # Load best and test on 2025
        # map_location keeps the load working if the checkpoint was written on another device.
        model.load_state_dict(torch.load(f'{TASK}_{model_name}_best.pth', map_location=device))
        model = model.to(device)

        test_preds, test_labels, test_acc = test_model(model, test_loader, device)

        print(f'\n2025 Test Accuracy: {test_acc:.4f}')
        print('\nClassification Report:')
        print(classification_report(test_labels, test_preds))

        # Confusion matrix
        cm = confusion_matrix(test_labels, test_preds)
        plt.figure(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'{model_name} - 2025 Test')
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        plt.savefig(f'{TASK}_{model_name}_cm.png', dpi=150, bbox_inches='tight')
        plt.close()

        results[model_name] = {
            'val_acc': best_val,
            'test_acc': test_acc,
            'predictions': test_preds
        }

    # Summary
    print('\n' + '='*60)
    print(f'FINAL RESULTS - {TASK.upper()}')
    print('='*60)
    for name, res in results.items():
        print(f'{name:25s} Val: {res["val_acc"]:.4f} | Test 2025: {res["test_acc"]:.4f}')

    # Baseline: always predict the most frequent test label. Labels are taken
    # from the test frame rather than from a variable left over from the loop.
    y_true = test_df[target_col].to_numpy()
    baseline = np.bincount(y_true).argmax()
    baseline_acc = (y_true == baseline).mean()
    print(f'\n{"Baseline (most common)":25s} Test 2025: {baseline_acc:.4f}')

    # Save predictions
    for name, res in results.items():
        test_df[f'pred_{name}'] = res['predictions']

    test_df.to_csv(f'{TASK}_2025_predictions.csv', index=False)
    print(f'\n✅ Saved: {TASK}_2025_predictions.csv')

    # Download (Colab only). Outside Colab the import fails and this is
    # skipped quietly; a failed download is reported instead of being swallowed.
    try:
        from google.colab import files
    except ImportError:
        files = None

    if files is not None:
        try:
            files.download(f'{TASK}_2025_predictions.csv')
            for name in results:
                files.download(f'{TASK}_{name}_best.pth')
                files.download(f'{TASK}_{name}_cm.png')
            files.download(f'{TASK}_scaler.pkl')
        except Exception as exc:
            print(f'Download skipped: {exc}')

# Entry point: run the full train/evaluate pipeline when executed as a script.
if __name__ == '__main__':
    main()

Device: cuda

LOADING DATA - TASK: binary
Train: 380 paragraphs (2020-2024)
Test: 50 paragraphs (2025)
Features: ['positive_score', 'negative_score', 'neutral_score', 'paragraph_num']

Train: 304, Val: 76, Test: 50

TRAINING: MLP
Epoch 20: Train=0.5461, Val=0.6316
Early stop at epoch 33
Best Val Acc: 0.6447

2025 Test Accuracy: 0.7000

Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.27      0.29        11
           1       0.80      0.82      0.81        39

    accuracy                           0.70        50
   macro avg       0.55      0.55      0.55        50
weighted avg       0.69      0.70      0.69        50


TRAINING: CNN1D
Epoch 20: Train=0.6118, Val=0.6447
Early stop at epoch 28
Best Val Acc: 0.6579

2025 Test Accuracy: 0.8000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.09      0.17        11
           1       0.80      1.00      0.89        39



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

# Seed the PyTorch and NumPy global RNGs with a fixed value for repeatable runs.
torch.manual_seed(42)
np.random.seed(42)

# ============================================================================
# DATASET
# ============================================================================

class FedDataset(Dataset):
    """Map-style dataset pairing a float feature tensor with a long label tensor."""

    def __init__(self, X, y):
        # Convert once up front so indexing later is a plain tensor lookup.
        features, targets = torch.FloatTensor(X), torch.LongTensor(y)
        self.X = features
        self.y = targets

    def __len__(self):
        # Number of samples is the size of the leading feature dimension.
        return self.X.size(0)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample

# ============================================================================
# MODELS
# ============================================================================

class MLP(nn.Module):
    """Feed-forward classifier.

    Two batch-normalised hidden layers with dropout (128 and 64 units), one
    plain hidden layer (32 units), then a linear layer emitting class logits.
    """

    def __init__(self, input_dim, n_classes):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(0.4),
            nn.Linear(128, 64), nn.BatchNorm1d(64), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, n_classes),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.network(x)
        return logits

class CNN1D(nn.Module):
    """1-D convolutional classifier that treats the feature vector as a one-channel sequence.

    `input_dim` is accepted for a uniform constructor signature but is not
    used: adaptive max pooling collapses whatever length arrives.
    """

    def __init__(self, input_dim, n_classes):
        super().__init__()
        conv_stack = [
            nn.Conv1d(1, 64, kernel_size=3, padding=1), nn.ReLU(), nn.BatchNorm1d(64),
            nn.Conv1d(64, 32, kernel_size=3, padding=1), nn.ReLU(), nn.BatchNorm1d(32),
            nn.AdaptiveMaxPool1d(1),
        ]
        head = [nn.Linear(32, 32), nn.ReLU(), nn.Dropout(0.3), nn.Linear(32, n_classes)]
        self.conv_layers = nn.Sequential(*conv_stack)
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        # Insert a channel axis, convolve, then drop the pooled length-1 axis.
        pooled = self.conv_layers(x.unsqueeze(1)).squeeze(-1)
        return self.fc(pooled)

class AttentionMLP(nn.Module):
    """MLP whose 64-d embedding is scaled by a learned per-sample gate.

    Args:
        input_dim: number of input features.
        n_classes: number of output logits.

    forward() takes a (batch, input_dim) tensor and returns (batch, n_classes)
    logits.
    """

    def __init__(self, input_dim, n_classes):
        super(AttentionMLP, self).__init__()
        self.embedding = nn.Sequential(nn.Linear(input_dim, 64), nn.ReLU())
        self.attention = nn.Sequential(nn.Linear(64, 32), nn.Tanh(), nn.Linear(32, 1))
        self.classifier = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Dropout(0.3), nn.Linear(32, n_classes))

    def forward(self, x):
        embedded = self.embedding(x)
        # The attention head emits one score per sample. Normalising those
        # scores with softmax over dim=0 (the batch axis) made every sample's
        # output depend on which other samples shared its batch. A sigmoid
        # gate keeps each sample independent and leaves parameter shapes
        # untouched.
        attention_weights = torch.sigmoid(self.attention(embedded))
        weighted = embedded * attention_weights
        return self.classifier(weighted)

class ResNetMLP(nn.Module):
    """MLP with one residual block (64 -> 64) followed by a plain 64 -> 32 block."""

    def __init__(self, input_dim, n_classes):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, 64)
        self.block1 = nn.Sequential(
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 64),
            nn.ReLU(),
        )
        self.block2 = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 32),
            nn.ReLU(),
        )
        self.fc = nn.Linear(32, n_classes)

    def forward(self, x):
        hidden = self.input_proj(x)
        # Skip connection around the first block only; block2 changes width.
        hidden = hidden + self.block1(hidden)
        return self.fc(self.block2(hidden))

class AutoencoderClassifier(nn.Module):
    """Encoder + classifier head; a decoder is built but not used by forward()."""

    def __init__(self, input_dim, n_classes, latent_dim=8):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, latent_dim),
        )
        # Mirror of the encoder; registered as a submodule but never called in forward().
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, input_dim),
        )
        self.classifier = nn.Sequential(
            nn.Linear(latent_dim, 16),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(16, n_classes),
        )

    def forward(self, x):
        latent = self.encoder(x)
        logits = self.classifier(latent)
        return logits

class TransformerClassifier(nn.Module):
    """Transformer-encoder classifier over a single-token sequence.

    Each feature vector is projected to d_model, given a learnable offset,
    passed through a stack of encoder layers as a length-1 sequence, and
    classified by a small MLP head.
    """

    def __init__(self, input_dim, n_classes, d_model=64, nhead=4, num_layers=2, dropout=0.3):
        super().__init__()

        # Feature vector -> d_model embedding
        self.input_projection = nn.Linear(input_dim, d_model)

        # Learnable offset added to the single token
        self.pos_embedding = nn.Parameter(torch.randn(1, 1, d_model))

        # Encoder stack
        layer_kwargs = dict(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=d_model * 4,
            dropout=dropout,
            batch_first=True,
        )
        encoder_layer = nn.TransformerEncoderLayer(**layer_kwargs)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Head producing class logits
        half_width = d_model // 2
        self.classifier = nn.Sequential(
            nn.Linear(d_model, half_width),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(half_width, n_classes),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # (batch, input_dim) -> (batch, 1, d_model)
        token = self.input_projection(x).unsqueeze(1)
        token = self.dropout(token + self.pos_embedding)
        # (batch, 1, d_model) -> (batch, d_model)
        encoded = self.transformer_encoder(token).squeeze(1)
        return self.classifier(encoded)

# ============================================================================
# LOAD DATA
# ============================================================================

def load_data(task='binary'):
    """
    Load train (2020-2024) and test (2025) data and prepare it for training.

    Reads 'merged_train_2020_2024.csv' and 'merged_test_2025.csv', holds out a
    stratified 20% of the training rows for validation, standardises the
    features with a scaler fitted on the training split only, and pickles that
    scaler to '<task>_scaler.pkl'.

    Args:
        task: 'binary' selects the 'target_binary' column (2 classes); any
            other value selects 'target_5class' (5 classes).

    Returns:
        (train_dataset, val_dataset, test_dataset, n_classes, class_weights,
        n_features, test_df). class_weights is a float tensor of normalised
        inverse class frequencies computed on the training split.
    """
    train_df = pd.read_csv('merged_train_2020_2024.csv')
    test_df = pd.read_csv('merged_test_2025.csv')

    feature_cols = ['positive_score', 'negative_score', 'neutral_score', 'paragraph_num']
    target_col = 'target_binary' if task == 'binary' else 'target_5class'
    n_classes = 2 if task == 'binary' else 5

    print(f"\n{'='*60}")
    print(f"LOADING DATA - TASK: {task}")
    print(f"{'='*60}")
    print(f"Train: {len(train_df)} paragraphs (2020-2024)")
    print(f"Test: {len(test_df)} paragraphs (2025)")

    X_train = train_df[feature_cols].values
    y_train = train_df[target_col].values
    X_test = test_df[feature_cols].values
    y_test = test_df[target_col].values

    # Split train into train/val
    # NOTE(review): the split is per paragraph, so paragraphs of one statement
    # (which share a target) can land on both sides - consider a grouped split.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    # Scale (fit on the training split only, then apply to val/test)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    with open(f'{task}_scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)

    train_dataset = FedDataset(X_train, y_train)
    val_dataset = FedDataset(X_val, y_val)
    test_dataset = FedDataset(X_test, y_test)

    # Class weights: normalised inverse frequency.
    # minlength pads the counts to n_classes so the weight vector always
    # matches the model's output size; classes with no training samples get
    # weight 0 instead of a division by zero.
    class_counts = np.bincount(y_train, minlength=n_classes).astype(np.float64)
    class_weights = np.zeros_like(class_counts)
    present = class_counts > 0
    class_weights[present] = 1.0 / class_counts[present]
    class_weights = class_weights / class_weights.sum()
    class_weights = torch.FloatTensor(class_weights)

    print(f"\nTrain: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

    return train_dataset, val_dataset, test_dataset, n_classes, class_weights, len(feature_cols), test_df

# ============================================================================
# TRAIN
# ============================================================================

def train_model(model, train_loader, val_loader, criterion, device, epochs=100, lr=0.001, model_name='Model', patience=20):
    """
    Train `model` with Adam (gradient norm clipped to 1.0) and keep the
    checkpoint with the best validation accuracy.

    Args:
        model: network to train; moved to `device`.
        train_loader, val_loader: iterables of (features, labels) batches that
            also support len().
        criterion: loss taking (logits, labels).
        device: torch device used for the model and batches.
        epochs: maximum number of epochs.
        lr: Adam learning rate; halved by ReduceLROnPlateau after 10 epochs
            without validation-loss improvement.
        model_name: prefix of the checkpoint file '<model_name>_best.pth'.
        patience: epochs without validation-accuracy improvement before
            stopping early.

    Returns:
        Best validation accuracy observed (0.0 if no epoch ran).
    """
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)

    # Start below any attainable accuracy so the first epoch always writes a
    # checkpoint. Starting at 0.0 with a strict '>' meant a model stuck at 0%
    # validation accuracy never saved one, and the caller then loaded a
    # missing or stale file.
    best_val_acc = -1.0
    patience_counter = 0

    for epoch in range(epochs):
        # Train
        model.train()
        train_correct, train_total = 0, 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()

            # Gradient clipping for transformer stability
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            train_correct += (outputs.argmax(dim=1) == y_batch).sum().item()
            train_total += y_batch.size(0)

        train_acc = train_correct / max(train_total, 1)

        # Val
        model.eval()
        val_loss = 0.0
        val_correct, val_total = 0, 0

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)

                val_loss += loss.item()
                val_correct += (outputs.argmax(dim=1) == y_batch).sum().item()
                val_total += y_batch.size(0)

        # max(..., 1) guards against an empty validation loader
        val_loss /= max(len(val_loader), 1)
        val_acc = val_correct / max(val_total, 1)

        # The LR schedule follows validation loss; checkpointing follows accuracy.
        scheduler.step(val_loss)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), f'{model_name}_best.pth')
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f'Early stop at epoch {epoch+1}')
            break

        if (epoch + 1) % 20 == 0:
            print(f'Epoch {epoch+1}: Train={train_acc:.4f}, Val={val_acc:.4f}')

    best_val_acc = max(best_val_acc, 0.0)
    print(f'Best Val Acc: {best_val_acc:.4f}')
    return best_val_acc

# ============================================================================
# TEST
# ============================================================================

def test_model(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and score its predictions.

    The model is switched to eval mode and every batch is evaluated with
    gradient tracking disabled. The predicted class for a sample is the index
    of the largest value along dimension 1 of the model output.

    Args:
        model: Network to evaluate; called as ``model(batch)``.
        test_loader: Iterable yielding ``(inputs, targets)`` tensor pairs.
        device: Device both tensors of each batch are moved to before the
            forward pass.

    Returns:
        A ``(predictions, labels, accuracy)`` tuple: two lists with one entry
        per sample, and the ``accuracy_score`` of predictions against labels.
    """
    model.eval()
    predicted, actual = [], []

    # Pure inference: no autograd bookkeeping is needed.
    with torch.no_grad():
        for features, targets in test_loader:
            features = features.to(device)
            targets = targets.to(device)
            logits = model(features)
            batch_pred = torch.max(logits, 1).indices
            # Bring results back to host memory before accumulating them.
            predicted.extend(batch_pred.cpu().numpy())
            actual.extend(targets.cpu().numpy())

    return predicted, actual, accuracy_score(actual, predicted)

# ============================================================================
# MAIN
# ============================================================================

def _download_colab_artifacts(paths):
    """Best-effort download of ``paths`` through the Google Colab file helper.

    Outside Colab (``google.colab`` cannot be imported) this does nothing.
    A failure on one file is reported and does not stop the remaining
    downloads.
    """
    try:
        from google.colab import files
    except ImportError:
        return

    for path in paths:
        try:
            files.download(path)
        except Exception as exc:  # best-effort: report and keep going
            print(f'Could not download {path}: {exc}')


def main(task='binary'):
    """Train every model, evaluate each on the test split and save the results.

    For each architecture in the model table this trains with ``train_model``,
    reloads the best checkpoint it saved, evaluates on the test loader, prints
    a classification report and writes a confusion-matrix PNG. Afterwards it
    prints a summary with a majority-class baseline, writes all predictions to
    ``{task}_2025_predictions.csv`` and, when running in Google Colab, offers
    the produced files for download.

    Args:
        task: Task name passed to ``load_data`` and used as the prefix of every
            output file. Defaults to ``'binary'``; pass ``'5class'`` for the
            5-class task.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Device: {device}')

    # Load
    (train_dataset, val_dataset, test_dataset,
     n_classes, class_weights, input_dim, test_df) = load_data(task)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    # Not shuffled, so predictions line up with the rows of test_df and the
    # label order is the same for every model.
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))

    models_dict = {
        'MLP': MLP,
        'CNN1D': CNN1D,
        'AttentionMLP': AttentionMLP,
        'ResNetMLP': ResNetMLP,
        'AutoencoderClassifier': AutoencoderClassifier,
        'TransformerClassifier': TransformerClassifier,
    }

    results = {}

    # Train and test each model
    for model_name, model_class in models_dict.items():
        print(f'\n{"="*60}')
        print(f'TRAINING: {model_name}')
        print(f'{"="*60}')

        run_name = f'{task}_{model_name}'
        model = model_class(input_dim, n_classes)
        best_val = train_model(model, train_loader, val_loader, criterion, device,
                               epochs=100, lr=0.001, model_name=run_name)

        # Load best and test on 2025. map_location keeps the load working even
        # if the checkpoint tensors were saved from a different device.
        model.load_state_dict(
            torch.load(f'{run_name}_best.pth', map_location=device))
        model = model.to(device)

        test_preds, test_labels, test_acc = test_model(model, test_loader, device)

        print(f'\n2025 Test Accuracy: {test_acc:.4f}')
        print('\nClassification Report:')
        print(classification_report(test_labels, test_preds))

        # Confusion matrix
        cm = confusion_matrix(test_labels, test_preds)
        plt.figure(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'{model_name} - 2025 Test')
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        plt.savefig(f'{run_name}_cm.png', dpi=150, bbox_inches='tight')
        plt.close()

        results[model_name] = {
            'val_acc': best_val,
            'test_acc': test_acc,
            'predictions': test_preds,
        }

    # Summary
    print('\n' + '='*60)
    print(f'FINAL RESULTS - {task.upper()}')
    print('='*60)
    for name, res in results.items():
        print(f'{name:25s} Val: {res["val_acc"]:.4f} | Test 2025: {res["test_acc"]:.4f}')

    # Baseline: always predict the most frequent class among the test labels.
    # test_labels comes from the last model evaluated; converting the list to
    # an array makes the element-wise comparison explicit.
    labels_arr = np.asarray(test_labels)
    baseline = np.bincount(labels_arr).argmax()
    baseline_acc = (labels_arr == baseline).mean()
    print(f'\n{"Baseline (most common)":25s} Test 2025: {baseline_acc:.4f}')

    # Save predictions
    for name, res in results.items():
        test_df[f'pred_{name}'] = res['predictions']

    predictions_path = f'{task}_2025_predictions.csv'
    test_df.to_csv(predictions_path, index=False)
    print(f'\n✅ Saved: {predictions_path}')

    # Download
    artifacts = [predictions_path]
    for name in results:
        artifacts.append(f'{task}_{name}_best.pth')
        artifacts.append(f'{task}_{name}_cm.png')
    # NOTE(review): the scaler file is presumably written by load_data —
    # confirm; if it is missing the download is reported and skipped.
    artifacts.append(f'{task}_scaler.pkl')
    _download_colab_artifacts(artifacts)

# Run the full pipeline only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()

Device: cuda

LOADING DATA - TASK: binary
Train: 380 paragraphs (2020-2024)
Test: 50 paragraphs (2025)

Train: 304, Val: 76, Test: 50

TRAINING: MLP
Epoch 20: Train=0.5461, Val=0.6184
Early stop at epoch 34
Best Val Acc: 0.6711

2025 Test Accuracy: 0.7800

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.09      0.15        11
           1       0.79      0.97      0.87        39

    accuracy                           0.78        50
   macro avg       0.65      0.53      0.51        50
weighted avg       0.73      0.78      0.72        50


TRAINING: CNN1D
Epoch 20: Train=0.6118, Val=0.6316
Early stop at epoch 25
Best Val Acc: 0.6711

2025 Test Accuracy: 0.7800

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.09      0.15        11
           1       0.79      0.97      0.87        39

    accuracy                           0.78        50
   macro avg       0.65    

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>