# Colab 1/3 — Daten-Aufbereitung

**Rolle in der 3-Saeulen-Architektur:** KI-Labor (Daten-Vorbereitung)

**Dieses Notebook tut NUR:**
- BTC/USDT OHLCV-Daten von Binance oder Yahoo Finance laden
- Features berechnen (FeatureEngine)
- Scaler fitten und speichern
- Alles auf Google Drive sichern fuer Notebook 2 und 3

**Keine GPU noetig! Waehle bei Runtime: `None` (CPU)**

---
**Warum 3 getrennte Notebooks?**

Ein einzelnes Notebook, das alles macht (Daten + Evolution + PPO), verbraucht nach
~1h den gesamten RAM (12 GB), weil die Daten im RAM bleiben, waehrend das Training laeuft.
Mit 3 Notebooks wird jede Aufgabe in einer frischen Session gestartet - ohne RAM-Altlasten.

---

## Schritt 1: Repo klonen & Dependencies installieren

In [None]:
import os, shutil
import subprocess

PROJECT_DIR = '/content/BITCOIN4Traders'
REPO_URL    = 'https://github.com/juancarlosrial76-code/BITCOIN4Traders.git'


def _run_git(*args):
    """Run ``git`` with the given arguments and return the CompletedProcess.

    Output is captured (not streamed) so the caller can decide whether to
    surface stderr; the return code is left for the caller to inspect.
    """
    return subprocess.run(['git', *args], capture_output=True, text=True)


# A project directory without .git metadata cannot be updated with
# `git pull`, so it is removed and cloned from scratch below.
if os.path.exists(PROJECT_DIR) and not os.path.exists(f'{PROJECT_DIR}/.git'):
    shutil.rmtree(PROJECT_DIR)

if not os.path.exists(PROJECT_DIR):
    result = _run_git('clone', '--quiet', REPO_URL, PROJECT_DIR)
    if result.returncode != 0:
        # Without a checkout nothing below can work - fail here with git's
        # own message instead of a later FileNotFoundError from os.chdir.
        raise RuntimeError(f'git clone fehlgeschlagen: {result.stderr.strip()}')
    print('Repo geklont.')
else:
    result = _run_git('-C', PROJECT_DIR, 'pull', '--quiet')
    if result.returncode != 0:
        # Updating is best-effort: the existing checkout is still usable.
        print('Warnung: git pull fehlgeschlagen - vorhandener Stand wird genutzt.')
        print(result.stderr.strip())
    else:
        print('Repo aktualisiert.')

# All later cells assume the repository root as working directory.
os.chdir(PROJECT_DIR)
print(f'Verzeichnis: {os.getcwd()}')

In [None]:
%%time
# Install only the data-preparation dependencies - no torch, no gymnasium.
# This is faster and needs less RAM than the full project install.
!pip install -q ccxt loguru pyarrow pandas numpy ta yfinance numba joblib pyyaml scikit-learn python-dotenv tqdm
print('Dependencies installiert.')

## Schritt 2: Google Drive mounten

In [None]:
import os

from google.colab import drive

# Attach the user's Google Drive to the Colab file system.
drive.mount('/content/drive')

# Project root on Drive plus one sub-folder for raw data and one for
# processed outputs.
DRIVE_DIR   = '/content/drive/MyDrive/BITCOIN4Traders'
DRIVE_DATA  = os.path.join(DRIVE_DIR, 'data')
DRIVE_PROC  = os.path.join(DRIVE_DIR, 'processed')

# Create any folder that does not exist yet; existing ones are left alone.
for folder in (DRIVE_DIR, DRIVE_DATA, DRIVE_PROC):
    os.makedirs(folder, exist_ok=True)

print(f'Drive bereit: {DRIVE_DIR}')

## Schritt 3: Konfiguration

In [None]:
# ===== SETTINGS =====
SYMBOL      = 'BTC/USDT'
TIMEFRAME   = '1h'
START_DATE  = '2021-01-01'   # more history = better, but more RAM
END_DATE    = None            # None = up to today

# Data type: float32 needs half the memory of float64
DTYPE       = 'float32'

# Upper bound on the number of candles kept in RAM (Colab: 12 GB limit).
# Author's estimate: 1h candles since 2021 are ~35,000 bars (~50 MB as
# float32), which fits comfortably.
MAX_CANDLES = 40_000

# Echo the active configuration, one setting per line.
print(
    f'Symbol:     {SYMBOL}',
    f'Zeitraum:   {START_DATE} bis {END_DATE if END_DATE else "heute"}',
    f'Max Bars:   {MAX_CANDLES:,}',
    sep='\n',
)

## Schritt 4: Daten laden (Binance oder Yahoo Finance Fallback)

In [None]:
import sys, gc
import time
import pandas as pd
import numpy as np
from pathlib import Path
from loguru import logger

# Make the cloned repository (and its src/ directory) importable.
sys.path.insert(0, '/content/BITCOIN4Traders')
sys.path.insert(0, '/content/BITCOIN4Traders/src')

CACHE_FILE = Path(DRIVE_DATA) / 'BTC_USDT_1h_raw.parquet'

OHLCV_COLUMNS = ['open', 'high', 'low', 'close', 'volume']

# Yahoo Finance only serves intraday (1h) bars for roughly the last 730
# days; one day of headroom keeps the request inside that window.
YAHOO_INTRADAY_MAX_DAYS = 729

# ── Already cached on Drive? ────────────────────────────────────────
if CACHE_FILE.exists():
    logger.info(f'Lade gecachte Rohdaten von Drive: {CACHE_FILE}')
    price_data = pd.read_parquet(CACHE_FILE)
    # Keep only the newest MAX_CANDLES rows.
    if len(price_data) > MAX_CANDLES:
        price_data = price_data.iloc[-MAX_CANDLES:]
    logger.success(f'Gecachte Daten geladen: {len(price_data):,} Bars')

else:
    price_data = None
    # Exclusive upper bound, mirroring the semantics of yfinance's `end`.
    end_ts = pd.Timestamp(END_DATE) if END_DATE else None

    # ── Attempt 1: Binance via CCXT ─────────────────────────────────
    try:
        import ccxt
        exchange = ccxt.binance({'enableRateLimit': True})
        logger.info('Lade von Binance...')

        since_ms = exchange.parse8601(f'{START_DATE}T00:00:00Z')
        # Timestamp.value is nanoseconds since the epoch -> milliseconds.
        end_ms = end_ts.value // 10**6 if end_ts is not None else None
        all_ohlcv = []
        limit = 1000

        # Page forward through history, `limit` candles per request.
        while True:
            ohlcv = exchange.fetch_ohlcv(SYMBOL, TIMEFRAME, since=since_ms, limit=limit)
            if not ohlcv:
                break
            all_ohlcv.extend(ohlcv)
            # Resume just after the last candle received.
            since_ms = ohlcv[-1][0] + 1
            if len(ohlcv) < limit:
                break  # short page: reached the newest candle
            if end_ms is not None and since_ms >= end_ms:
                break  # everything up to END_DATE has been fetched
            time.sleep(0.3)  # stay below the rate limit

        if not all_ohlcv:
            # Raise so that the Yahoo fallback below is tried.
            raise ValueError('keine Candles erhalten')

        binance_df = pd.DataFrame(all_ohlcv, columns=['timestamp', *OHLCV_COLUMNS])
        binance_df['timestamp'] = pd.to_datetime(binance_df['timestamp'], unit='ms')
        binance_df = binance_df.set_index('timestamp').sort_index()
        if end_ts is not None:
            binance_df = binance_df[binance_df.index < end_ts]
        if binance_df.empty:
            raise ValueError('keine Candles im angefragten Zeitraum')

        price_data = binance_df.astype(DTYPE)
        logger.success(f'Binance: {len(price_data):,} Bars geladen')

    except Exception as e:
        logger.warning(f'Binance fehlgeschlagen: {e}')

    # ── Attempt 2: Yahoo Finance (fallback, no geo-block) ───────────
    if price_data is None:
        try:
            import yfinance as yf
            yf_symbol = 'BTC-USD'
            logger.info(f'Lade {yf_symbol} von Yahoo Finance...')

            # Clamp the start date to the window Yahoo serves for 1h bars;
            # an older start makes the whole request fail.
            today = pd.Timestamp.now(tz='UTC').tz_localize(None).normalize()
            earliest = today - pd.Timedelta(days=YAHOO_INTRADAY_MAX_DAYS)
            requested_start = pd.Timestamp(START_DATE)
            yf_start = max(requested_start, earliest)
            if yf_start > requested_start:
                logger.warning(
                    f'Yahoo liefert 1h-Daten nur fuer ~730 Tage - Start auf {yf_start.date()} gesetzt'
                )

            df = yf.download(yf_symbol, start=yf_start.strftime('%Y-%m-%d'), end=END_DATE,
                             interval='1h', progress=False, auto_adjust=True)
            if df is None or df.empty:
                # yfinance reports many failures by returning an empty frame.
                raise ValueError('keine Daten erhalten')

            # Newer yfinance versions return (field, ticker) MultiIndex
            # columns even for one ticker - keep only the field level.
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = df.columns.get_level_values(0)
            df.columns = [str(c).lower() for c in df.columns]
            df = df[OHLCV_COLUMNS]

            # Normalise to tz-naive UTC so the index matches the Binance path.
            idx = pd.to_datetime(df.index)
            if idx.tz is not None:
                idx = idx.tz_convert('UTC')
            df.index = idx.tz_localize(None)

            df = df.astype(DTYPE)
            price_data = df
            logger.success(f'Yahoo Finance: {len(price_data):,} Bars geladen')
        except Exception as e:
            logger.error(f'Yahoo Finance fehlgeschlagen: {e}')

    if price_data is None:
        raise RuntimeError('Keine Daten geladen! Netzwerk pruefen.')

    # Keep only the newest MAX_CANDLES rows.
    if len(price_data) > MAX_CANDLES:
        price_data = price_data.iloc[-MAX_CANDLES:]

    # Cache on Drive so later sessions skip the download.
    logger.info(f'Speichere Rohdaten auf Drive: {CACHE_FILE}')
    price_data.to_parquet(CACHE_FILE, engine='pyarrow', compression='snappy')

# Drop rows containing NaN.
price_data = price_data.dropna()
if price_data.empty:
    # Fail with a clear message instead of an IndexError in the summary below.
    raise RuntimeError('Keine gueltigen Bars vorhanden! Cache-Datei loeschen oder Netzwerk pruefen.')

# RAM check
mem_mb = price_data.memory_usage(deep=True).sum() / 1024**2
print(f'\nDaten: {len(price_data):,} Bars | RAM: {mem_mb:.1f} MB | dtype: {price_data.dtypes.iloc[0]}')
print(f'Zeitraum: {price_data.index[0]} bis {price_data.index[-1]}')
print(price_data.tail(3))

## Schritt 5: Features berechnen & Scaler speichern

In [None]:
import gc
from features.feature_engine import FeatureEngine, FeatureConfig

PROC_DIR = Path(DRIVE_PROC)

# ── Positional split: 70% train, 15% validation, 15% test ───────────
n         = len(price_data)
train_end = int(n * 0.70)
val_end   = int(n * 0.85)

train_raw, val_raw, test_raw = (
    price_data.iloc[:train_end],
    price_data.iloc[train_end:val_end],
    price_data.iloc[val_end:],
)

logger.info(f'Split: Train={len(train_raw):,} | Val={len(val_raw):,} | Test={len(test_raw):,}')

# ── Feature engineering ─────────────────────────────────────────────
feat_cfg = FeatureConfig(
    # rolling-window lengths
    volatility_window=20,
    ou_window=20,
    rolling_mean_window=20,
    use_log_returns=True,
    # scaler settings; the scaler path points at the Drive output folder
    scaler_type='standard',
    save_scaler=True,
    scaler_path=PROC_DIR,
    # NaN handling
    dropna_strategy='rolling',
    min_valid_rows=500,
)

engine = FeatureEngine(feat_cfg)

# fit_transform is called on the training split only; validation and
# test go through transform.
logger.info('Fit FeatureEngine auf Trainingsdaten (KEIN Leakage)...')
train_feat = engine.fit_transform(train_raw)
val_feat   = engine.transform(val_raw)
test_feat  = engine.transform(test_raw)

# Keep only the timestamps present in both the raw prices and the features.
idx_train = train_raw.index.intersection(train_feat.index)
idx_val   = val_raw.index.intersection(val_feat.index)
idx_test  = test_raw.index.intersection(test_feat.index)

logger.success(f'Features: {train_feat.shape[1]} Spalten | Train-Samples: {len(idx_train):,}')

# ── Persist to Drive (snappy-compressed float32) ────────────────────
def save_split(price, feat, idx, name):
    """Write one split's price and feature rows to Parquet files in PROC_DIR.

    Args:
        price: Raw price frame of the split.
        feat: Feature frame of the split.
        idx: Index labels to select from both frames.
        name: Split name used as file prefix
            (``<name>_price.parquet`` / ``<name>_feat.parquet``).
    """
    outputs = {
        PROC_DIR / f'{name}_price.parquet': price,
        PROC_DIR / f'{name}_feat.parquet': feat,
    }
    written_bytes = 0
    for target, frame in outputs.items():
        frame.loc[idx].astype('float32').to_parquet(target, compression='snappy')
        written_bytes += target.stat().st_size
    size_mb = written_bytes / 1024**2
    logger.success(f'Gespeichert: {name} ({size_mb:.1f} MB)')

save_split(train_raw, train_feat, idx_train, 'train')
save_split(val_raw,   val_feat,   idx_val,   'val')
save_split(test_raw,  test_feat,  idx_test,  'test')

# ── Release RAM ─────────────────────────────────────────────────────
del train_feat, val_feat, test_feat
del train_raw, val_raw, test_raw
del price_data
gc.collect()

print('\nDaten-Aufbereitung abgeschlossen!')
print(f'Dateien auf Drive: {PROC_DIR}')
print('Weiter mit: Colab_2_Evolution.ipynb oder Colab_3_PPO_Training.ipynb')

## Schritt 6: Ergebnis pruefen

Zeigt alle gespeicherten Dateien auf Drive.

In [None]:
import os
from pathlib import Path

print('=== Gespeicherte Dateien auf Drive ===')

# Size in MB of every entry in the processed-data folder, in sorted order.
sizes_mb = {
    entry.name: entry.stat().st_size / 1024**2
    for entry in sorted(Path(DRIVE_PROC).iterdir())
}
for file_name, mb in sizes_mb.items():
    print(f'  {file_name:35s}  {mb:.1f} MB')

total = sum(sizes_mb.values())
print(f'\nGesamt: {total:.1f} MB')
print('\nNotebook 1 fertig. Starte jetzt Notebook 2 oder 3.')