# Colab 3/3 — PPO Adversarial Training (GPU)

**Rolle:** KI-Labor — Deep Reinforcement Learning mit GPU

**Dieses Notebook tut NUR:**
- Vorbereitete Features von Drive laden (von Notebook 1)
- PPO Trader + Adversary auf GPU trainieren
- Checkpoint alle N Iterationen auf Drive sichern
- Bestes Modell fuer den Local Master bereitstellen

**GPU PFLICHT! Wähle: Runtime > Change runtime type > T4 GPU**

Warum GPU hier gut genutzt wird:
- GRU-Backpropagation = parallelisierbare Matrix-Multiplikation
- batch_size=256 auf GPU: 10x schneller als CPU
- Der RAM bleibt stabil weil nur vorberechnete Features geladen werden

---
**Voraussetzung:** Notebook 1 muss vorher gelaufen sein
(Dateien auf Drive in `MyDrive/BITCOIN4Traders/processed/`: `train_price.parquet`, `train_feat.parquet`, `val_price.parquet`, `val_feat.parquet`)

---

## Schritt 1: GPU pruefen

In [None]:
import torch

# Abort right away when no CUDA device is visible; DEVICE is later handed to
# the trainer, so continuing on CPU is not supported by this notebook.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    raise RuntimeError(
        'KEINE GPU!\n'
        'Gehe zu: Runtime > Change runtime type > Hardware accelerator: GPU (T4)\n'
        'Dann diese Zelle erneut ausfuehren.'
    )

# Name and total memory (in units of 1e9 bytes) of CUDA device 0.
# `vram` is reused by a later status printout.
gpu = torch.cuda.get_device_name(0)
vram = torch.cuda.get_device_properties(0).total_memory / 1e9

# One print call, one line per fact.
print(
    f'GPU:   {gpu}',
    f'VRAM:  {vram:.1f} GB',
    f'PyTorch: {torch.__version__}',
    f'CUDA:    {torch.version.cuda}',
    sep='\n',
)

## Schritt 2: Repo & Dependencies

In [None]:
import os, shutil

# Where the repository is checked out inside the Colab VM, and its origin.
PROJECT_DIR = '/content/BITCOIN4Traders'
REPO_URL    = 'https://github.com/juancarlosrial76-code/BITCOIN4Traders.git'

# A directory without .git metadata is not a usable checkout: it cannot be
# pulled and would make `git clone` fail, so it is removed first.
if os.path.exists(PROJECT_DIR) and not os.path.exists(f'{PROJECT_DIR}/.git'):
    shutil.rmtree(PROJECT_DIR)

# Lines starting with `!` are IPython shell escapes; {NAME} is expanded from
# the Python namespace before the command runs.
if not os.path.exists(PROJECT_DIR):
    !git clone {REPO_URL} {PROJECT_DIR} --quiet
    print('Repo geklont.')
else:
    !git -C {PROJECT_DIR} pull --quiet
    print('Repo aktualisiert.')

# Later cells use paths relative to the repo root (config/..., data/models/...).
os.chdir(PROJECT_DIR)

In [None]:
%%time
# Torch is already preinstalled on Colab - only the extra packages are needed
!pip install -q loguru pyarrow pandas numpy scikit-learn pyyaml python-dotenv tqdm gymnasium
print('Dependencies bereit.')

## Schritt 3: Drive mounten & Konfiguration

In [None]:
from google.colab import drive
# Mount Google Drive; the processed features are read from it and the model
# checkpoints are written to it.
drive.mount('/content/drive')

DRIVE_DIR   = '/content/drive/MyDrive/BITCOIN4Traders'
DRIVE_PROC  = f'{DRIVE_DIR}/processed'
DRIVE_MODEL = f'{DRIVE_DIR}/models'

import os
os.makedirs(DRIVE_MODEL, exist_ok=True)

# ===== TRAINING SETTINGS =====
# GPU-oriented values (larger batches than on CPU)
N_ITERATIONS    = 500     # number of PPO iterations
STEPS_PER_ITER  = 2048    # steps per iteration (trajectory length)
BATCH_SIZE      = 256     # larger batches on GPU (CPU: 64)
HIDDEN_DIM      = 256     # larger network on GPU (CPU: 128)
SAVE_EVERY      = 25      # checkpoint interval in iterations (passed as save_frequency in Step 6)
RESUME_FROM     = None    # e.g. 'checkpoint_iter_100' for resumed training; read in Step 7

# Echo the effective settings; DEVICE comes from the GPU-check cell.
print(f'Device:        {DEVICE}')
print(f'Iterationen:   {N_ITERATIONS}')
print(f'Batch-Size:    {BATCH_SIZE}  (GPU-optimiert)')
print(f'Hidden-Dim:    {HIDDEN_DIM} (GPU-optimiert)')
print(f'Drive-Sync:    alle {SAVE_EVERY} Iterationen')

## Schritt 4: Features von Drive laden

In [None]:
import sys, gc
import pandas as pd
import numpy as np
from pathlib import Path
from loguru import logger

# Put the repo's src/ directory and the repo root at the front of the import
# path (src/ first, then the root).
sys.path[:0] = [f'{PROJECT_DIR}/src', PROJECT_DIR]

proc = Path(DRIVE_PROC)

# The four parquet files this notebook expects from Notebook 1.
required = [
    'train_price.parquet',
    'train_feat.parquet',
    'val_price.parquet',
    'val_feat.parquet',
]
missing = list(filter(lambda fname: not (proc / fname).exists(), required))

if len(missing) > 0:
    raise FileNotFoundError(
        f'Fehlende Dateien auf Drive: {missing}\n'
        f'Bitte zuerst Colab_1_Daten.ipynb ausfuehren!'
    )

logger.info('Lade vorbereitete Features von Drive...')

# Read the frames in the same order as `required` lists them.
# NOTE(review): the original comment says Notebook 1 already stored the data
# as float32 - not verified here.
train_price, train_feat, val_price, val_feat = (
    pd.read_parquet(proc / fname) for fname in required
)

logger.success(f'Train: {len(train_price):,} Bars | Val: {len(val_price):,} Bars')
logger.success(f'Features: {train_feat.shape[1]} Spalten')

# Memory status after loading (host RAM in GiB, CUDA memory in 1e9 bytes).
import psutil
_vm = psutil.virtual_memory()
ram_used, ram_total = _vm.used / 1024**3, _vm.total / 1024**3
print(f'\nRAM nach Laden: {ram_used:.1f}/{ram_total:.1f} GB')
print(f'GPU VRAM:       {torch.cuda.memory_allocated()/1e9:.2f}/{vram:.1f} GB')

## Schritt 5: Environment erstellen

In [None]:
from environment.config_integrated_env import ConfigIntegratedTradingEnv
from environment.config_system import EnvironmentConfig, load_environment_config_from_yaml

# Use the repo's YAML environment config when present, otherwise the
# EnvironmentConfig defaults. The path is relative to the repo root.
cfg_path = Path('config/environment/realistic_env.yaml')
if not cfg_path.exists():
    env_config = EnvironmentConfig()
    logger.warning('Standard-Config (YAML nicht gefunden)')
else:
    env_config = load_environment_config_from_yaml(str(cfg_path))
    logger.info('Environment-Config aus YAML geladen')

# Training environment over the training price/feature frames.
env = ConfigIntegratedTradingEnv(train_price, train_feat, env_config)

# Network input/output sizes are read off the environment's spaces.
STATE_DIM, N_ACTIONS = env.observation_space.shape[0], env.action_space.n

logger.success(f'Environment: State={STATE_DIM} | Actions={N_ACTIONS}')
print(f'Observation Space: {env.observation_space.shape}')

## Schritt 6: PPO Trainer erstellen (GPU-optimiert)

In [None]:
from agents.ppo_agent import PPOConfig
from training.adversarial_trainer import AdversarialTrainer, AdversarialConfig

# Hyperparameters that are identical for trader and adversary.
_shared_ppo = dict(
    state_dim=STATE_DIM,
    n_actions=N_ACTIONS,
    gamma=0.99,
    gae_lambda=0.95,
    clip_epsilon=0.2,
    n_epochs=10,
    batch_size=BATCH_SIZE,
    use_recurrent=True,
    rnn_type='GRU',
)

# Trader: the larger network (HIDDEN_DIM). Per the original author's notes the
# actor learning rate was lowered from 3e-4 and entropy_coef chosen to avoid
# entropy collapse.
trader_cfg = PPOConfig(
    **_shared_ppo,
    hidden_dim=HIDDEN_DIM,
    actor_lr=1e-4,
    critic_lr=3e-4,
    entropy_coef=0.08,
    value_loss_coef=0.5,
    max_grad_norm=0.5,
    target_kl=0.015,
)

# Adversary: smaller network (hidden_dim=128) and lower learning rates than
# the trader; the remaining PPOConfig fields keep their defaults.
adversary_cfg = PPOConfig(
    **_shared_ppo,
    hidden_dim=128,
    actor_lr=5e-5,
    critic_lr=1e-4,
    entropy_coef=0.05,
)

# Schedule and checkpointing; checkpoint_dir is relative to the repo root and
# is the directory the later checkpoint cells read from and sync to Drive.
training_cfg = AdversarialConfig(
    trader_config=trader_cfg,
    adversary_config=adversary_cfg,
    n_iterations=N_ITERATIONS,
    steps_per_iteration=STEPS_PER_ITER,
    adversary_start_iteration=100,
    adversary_strength=0.1,
    save_frequency=SAVE_EVERY,
    log_frequency=10,
    checkpoint_dir='data/models/adversarial',
)

trainer = AdversarialTrainer(env, training_cfg, device=DEVICE)

logger.success(f'Trainer auf {DEVICE} erstellt')
print(
    f'Trader-Parameter: {sum(p.numel() for p in trainer.trader.actor.parameters()):,}',
    f'VRAM nach Modell: {torch.cuda.memory_allocated()/1e9:.3f} GB',
    sep='\n',
)

## Schritt 7: [Optional] Von Checkpoint weitermachen

In [None]:
import glob
import os
import re
import shutil

# Local checkpoint directory (relative to the repo root); the same path is
# configured as `checkpoint_dir` for the trainer.
_CKPT_DIR = 'data/models/adversarial'
os.makedirs(_CKPT_DIR, exist_ok=True)


def _natural_key(path):
    """Return a sort key that compares digit runs in the file name as numbers.

    Plain string sorting puts 'checkpoint_iter_100' before
    'checkpoint_iter_25'; with this key 25 < 100 as intended.
    """
    parts = re.split(r'([0-9]+)', os.path.basename(path))
    # re.split with one capturing group alternates text (even index) and
    # digits (odd index), so equal positions always hold comparable types.
    return [int(tok) if idx % 2 else tok for idx, tok in enumerate(parts)]


# Copy all checkpoints from Drive into the local checkpoint directory.
drive_ckpts = sorted(glob.glob(f'{DRIVE_MODEL}/*.pth'))
for cp in drive_ckpts:
    shutil.copy2(cp, _CKPT_DIR)
if drive_ckpts:
    print(f'{len(drive_ckpts)} Checkpoint(s) von Drive geladen')

if RESUME_FROM is not None:
    # An explicitly requested checkpoint must exist and must load; silently
    # training from scratch instead would hide the problem.
    wanted = os.path.basename(RESUME_FROM)
    if not wanted.endswith('.pth'):
        wanted += '.pth'
    resume_path = os.path.join(_CKPT_DIR, wanted)
    if not os.path.exists(resume_path):
        raise FileNotFoundError(
            f'RESUME_FROM={RESUME_FROM!r}: {resume_path} nicht gefunden '
            f'(weder lokal noch auf Drive)'
        )
    trainer.load_checkpoint(resume_path)
    logger.success(f'Weitertraining ab: {wanted}')
else:
    # Auto-resume: pick the last "main" checkpoint, i.e. one whose file name
    # is not a per-agent '_trader' / '_adversary' file.
    main_ckpts = [
        c for c in drive_ckpts
        if '_trader' not in os.path.basename(c)
        and '_adversary' not in os.path.basename(c)
    ]
    if main_ckpts:
        latest = max(main_ckpts, key=_natural_key)
        local = os.path.join(_CKPT_DIR, os.path.basename(latest))
        try:
            trainer.load_checkpoint(local)
            logger.success(f'Weitertraining ab: {os.path.basename(latest)}')
        except Exception as e:
            # Best effort: an unreadable or incompatible checkpoint must not
            # block a fresh training run.
            logger.warning(f'Checkpoint-Laden fehlgeschlagen: {e}')
    else:
        logger.info('Kein Checkpoint -> Training startet von Anfang an')

## Schritt 8: Auto-Sync zu Drive einrichten

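Definiert `sync_to_drive()`: kopiert alle lokalen Checkpoints (`*.pth`) nach Drive, damit sie einen Colab-Timeout ueberstehen. Die Funktion wird hier einmal getestet und in Schritt 9 aufgerufen.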

In [None]:
import glob, shutil, os

def sync_to_drive(local_dir='data/models/adversarial', remote_dir=None):
    """Copy every ``*.pth`` file from the local checkpoint folder to Drive.

    Args:
        local_dir: Directory holding the checkpoints to back up. Defaults to
            the notebook's local checkpoint folder.
        remote_dir: Destination directory. ``None`` (default) means the
            notebook-level ``DRIVE_MODEL`` folder, looked up at call time.

    Returns:
        Number of files copied (0 when ``local_dir`` is missing or holds no
        ``*.pth`` files).

    Existing files of the same name in ``remote_dir`` are overwritten.
    """
    if remote_dir is None:
        remote_dir = DRIVE_MODEL
    # The destination may not exist yet (fresh Drive or custom target).
    os.makedirs(remote_dir, exist_ok=True)

    copied = 0
    for cp in sorted(glob.glob(os.path.join(local_dir, '*.pth'))):
        shutil.copy2(cp, os.path.join(remote_dir, os.path.basename(cp)))
        copied += 1
    if copied:
        print(f'Drive sync: {copied} Datei(en) gesichert')
    return copied

# Run the sync once right away as a smoke test
sync_to_drive()
print('Auto-Sync bereit.')

## Schritt 9: TRAINING STARTEN

RAM-Verbrauch sollte STABIL bleiben:
- history begrenzt auf 200 Eintraege (memory_management.yaml)
- adversary_states nach jedem Training geloescht
- torch.cuda.empty_cache() alle 10 Iterationen
- gc.collect() integriert

In [None]:
import time, gc
import psutil

def ram_status():
    """Return a one-line summary of host RAM and allocated CUDA memory.

    Host RAM (used/total) is reported in GiB (bytes / 1024**3); the CUDA
    figure is ``torch.cuda.memory_allocated()`` divided by 1e9.
    """
    mem = psutil.virtual_memory()
    gib = 1024**3
    ram_used, ram_total = mem.used / gib, mem.total / gib
    vram_used = torch.cuda.memory_allocated() / 1e9
    return f'RAM {ram_used:.1f}/{ram_total:.1f}GB | VRAM {vram_used:.2f}GB'

import threading

logger.info('=' * 70)
logger.info('PPO ADVERSARIAL TRAINING GESTARTET')
logger.info(f'Device: {DEVICE} | Batch: {BATCH_SIZE} | Hidden: {HIDDEN_DIM}')
logger.info(f'Start-RAM: {ram_status()}')
logger.info('=' * 70)

# The trainer is configured with a checkpoint directory on the Colab VM's
# local disk, which is lost when the runtime is recycled. A background thread
# therefore mirrors that directory to Drive while trainer.train() blocks the
# main thread.
SYNC_INTERVAL_SEC = 300

_stop_sync = threading.Event()


def _periodic_drive_sync():
    """Call sync_to_drive() every SYNC_INTERVAL_SEC until _stop_sync is set."""
    # Event.wait returns False on timeout (-> sync) and True once stopped.
    while not _stop_sync.wait(SYNC_INTERVAL_SEC):
        try:
            sync_to_drive()
        except OSError as e:
            # A transient Drive hiccup must not kill the backup thread.
            logger.warning(f'Drive-Sync fehlgeschlagen: {e}')


# NOTE: a checkpoint that is being written at the moment of a sync may be
# copied incomplete; the next sync (or the final one below) overwrites it.
_sync_thread = threading.Thread(
    target=_periodic_drive_sync, name='drive-sync', daemon=True
)

start_time = time.time()
_done = False
_sync_thread.start()

try:
    trainer.train()
    _done = True

except KeyboardInterrupt:
    logger.warning('Unterbrochen - speichere...')

except RuntimeError as e:
    # Only CUDA out-of-memory is handled here; anything else propagates.
    if 'out of memory' in str(e).lower():
        logger.error(f'GPU OOM! {e}')
        logger.info('Tipp: BATCH_SIZE auf 128 reduzieren und neu starten')
        torch.cuda.empty_cache()
    else:
        raise

finally:
    # Stop the background sync first so it cannot overlap the final sync.
    _stop_sync.set()
    _sync_thread.join()
    elapsed = (time.time() - start_time) / 3600
    logger.info(f'Dauer: {elapsed:.1f}h | End-RAM: {ram_status()}')
    sync_to_drive()
    logger.success('Checkpoints auf Drive gesichert')
    if _done:
        logger.success('Training vollstaendig abgeschlossen!')

## Schritt 10: Modell evaluieren

In [None]:
import os
import shutil

# Evaluation on the validation set
from environment.config_integrated_env import ConfigIntegratedTradingEnv

# A separate trainer wraps the validation environment so the evaluation does
# not run on the training data.
# NOTE(review): assumes AdversarialTrainer.evaluate() uses the environment
# passed to the constructor - confirm against the trainer implementation.
val_env = ConfigIntegratedTradingEnv(val_price, val_feat, env_config)
val_trainer = AdversarialTrainer(val_env, training_cfg, device=DEVICE)

# Load the best model into the validation trainer.
best_path = 'data/models/adversarial/best_model_trader.pth'
if os.path.exists(best_path):
    val_trainer.trader.load(best_path)
    logger.success(f'Bestes Modell geladen: {best_path}')
else:
    # Without a best-model file the fresh val_trainer would hold untrained
    # weights; fall back to the agent trained in this session.
    val_trainer.trader = trainer.trader
    logger.warning(f'{best_path} nicht gefunden - evaluiere das aktuelle Trainingsmodell')

try:
    metrics = val_trainer.evaluate(n_episodes=50)
except Exception as e:
    logger.error(f'Evaluation fehlgeschlagen: {e}')
else:
    print('\n=== Evaluationsergebnisse (Validation-Set) ===')
    for k, v in metrics.items():
        if isinstance(v, float):
            print(f'  {k}: {v:.4f}')

    # Also store the evaluated model as ppo_best.pt (for run.py). Export
    # errors are reported separately so they are not mistaken for a failed
    # evaluation.
    # NOTE(review): relies on the trader object exposing state_dict(), as the
    # original cell did - confirm against the PPO agent class.
    try:
        import torch
        torch.save(val_trainer.trader.state_dict(), 'data/models/ppo_best.pt')
        shutil.copy2('data/models/ppo_best.pt', f'{DRIVE_MODEL}/ppo_best.pt')
        logger.success('Modell als ppo_best.pt gespeichert (fuer Live-Trading mit run.py)')
    except Exception as e:
        logger.error(f'Export von ppo_best.pt fehlgeschlagen: {e}')

## Fertig!

Das trainierte PPO-Modell liegt jetzt auf:
- `MyDrive/BITCOIN4Traders/models/best_model_trader.pth`
- `MyDrive/BITCOIN4Traders/models/ppo_best.pt`

**Naechste Schritte:**
1. `ppo_best.pt` in dein Repo kopieren: `data/models/ppo_best.pt`
2. In `run.py` den StubAgent durch den echten PPOAgent ersetzen
3. `pm2 start ecosystem.config.js` auf dem Linux Local Master