In [5]:
# -*- coding: utf-8 -*-
"""
LSTM + Dueling Double DQN 股票交易（低買高賣導向）- 修正版（完整可直接跑）

你原始版本的主要問題我已一起修掉：
1) reset() 把 balance 固定成 4500，和 initial_balance/終止報酬基準 100000 不一致 → 改為一致
2) Sell 邏輯：原本算 sold_shares = shares/2 但實際賣出卻把全部賣掉 → 改成真的「賣一半」
3) reward 方向：原本買在 >MA 反而鼓勵追高 → 改成鼓勵「價格低於 MA + RSI 偏低」時買；「價格高於 MA + RSI 偏高」時賣
4) epsilon 探索：原本 epsilon 分支仍用 model 計算 softmax → 改成標準：epsilon 隨機、否則 greedy（另保留 temperature 做 Boltzmann 可選）
5) DQN 更新：改成 Double DQN（online 選 action、target 評估）
6) 尺度/資料洩漏：state 的正規化改用「視窗內」與當前參考值（避免用全期間 max）
7) 輸出路徑：移除硬編碼桌面路徑，改成自動存到當前資料夾 ./outputs/
8) 其他：加入 device、梯度裁剪、訓練過程的 checkpoint / best model、較完整的紀錄

注意：這份是「教育/專題」用途，不構成投資建議。
"""

import os
import random
from collections import deque
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import yfinance as yf

# ========== 全域設定 ==========
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

OUTPUT_DIR = os.path.join(os.getcwd(), "outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)


# ========== 下載資料 ==========
def fetch_data(ticker: str, start_date: str, end_date: str):
    data = yf.download(ticker, start=start_date, end=end_date, interval="1d", auto_adjust=False)
    if data is None or data.empty:
        raise ValueError("無法抓取數據，請檢查網路 / ticker / 日期範圍")

    data = data.ffill().dropna()
    prices = np.asarray(data["Close"].values, dtype=np.float64).flatten()
    volumes = np.asarray(data["Volume"].values, dtype=np.float64).flatten()
    dates = pd.to_datetime(data.index).values

    mask = ~np.isnan(prices) & ~np.isnan(volumes)
    idx = np.where(mask)[0]
    return prices[idx], volumes[idx], dates[idx]


# ========== 技術指標 ==========
def calculate_technical_indicators(prices, volumes, ma_window=20, rsi_window=14):
    prices = np.asarray(prices, dtype=np.float64).flatten()
    volumes = np.asarray(volumes, dtype=np.float64).flatten()

    n = len(prices)
    if n < max(ma_window, rsi_window) + 2:
        raise ValueError("資料太短，無法計算 MA/RSI。請拉長日期區間。")

    # MA（簡單移動平均）
    ma = np.convolve(prices, np.ones(ma_window) / ma_window, mode="valid")
    ma = np.concatenate([np.full(ma_window - 1, ma[0]), ma])

    # RSI（簡化版）
    diff = np.diff(prices, prepend=prices[0])
    gain = np.where(diff > 0, diff, 0.0)
    loss = np.where(diff < 0, -diff, 0.0)

    avg_gain = np.convolve(gain, np.ones(rsi_window) / rsi_window, mode="valid")
    avg_loss = np.convolve(loss, np.ones(rsi_window) / rsi_window, mode="valid")
    avg_gain = np.concatenate([np.full(rsi_window - 1, avg_gain[0]), avg_gain])
    avg_loss = np.concatenate([np.full(rsi_window - 1, avg_loss[0]), avg_loss])

    rs = np.where(avg_loss > 1e-12, avg_gain / (avg_loss + 1e-12), 100.0)
    rsi = 100.0 - (100.0 / (1.0 + rs))  # 0~100

    # Volume normalize（全段最大值）
    v_max = np.max(volumes) if np.max(volumes) > 0 else 1.0
    volume_norm = volumes / v_max

    return ma, rsi, volume_norm


# ========== 環境（非 Gym 版，專題可用） ==========
class StockTradingEnv:
    """
    動作:
      0 Hold
      1 Buy  (買一半資金)
      2 Sell (賣一半持股)

    狀態:
      sequence: (window_size, 4)  [price_norm, ma_norm, rsi_norm, volume_norm]
      extra_state: (2,)           [position_ratio, cash_ratio]
    """

    def __init__(
        self,
        prices,
        volumes,
        dates,
        window_size=20,
        initial_balance=100000.0,
        trading_cost=0.001,
        cooldown_steps=2,
        ma_window=20,
        rsi_window=14,
    ):
        self.prices = np.asarray(prices, dtype=np.float64).flatten()
        self.volumes = np.asarray(volumes, dtype=np.float64).flatten()
        self.dates = dates

        self.window_size = int(window_size)
        self.initial_balance = float(initial_balance)
        self.trading_cost = float(trading_cost)
        self.cooldown_steps = int(cooldown_steps)

        self.ma, self.rsi, self.volume = calculate_technical_indicators(
            self.prices, self.volumes, ma_window=ma_window, rsi_window=rsi_window
        )

        self.n = len(self.prices)
        self.max_steps = self.n - self.window_size - 2  # 保留 next_price
        if self.max_steps <= 1:
            raise ValueError("資料長度不足以跑環境，請增加資料。")

        self.reset()

    def reset(self):
        self.current_step = 0
        self.balance = self.initial_balance
        self.shares = 0.0

        self.trade_cooldown = 0
        self.last_buy_price = None
        self.last_buy_step = None

        self.action_log = []
        self.hold_streak = 0

        self.prev_portfolio_value = self.initial_balance  # 用於增量報酬
        return self._get_state()

    def _portfolio_value(self, price):
        return float(self.balance + self.shares * price)

    def step(self, action: int):
        # 推進一步
        self.current_step += 1
        idx = self.current_step + self.window_size - 1

        current_price = float(self.prices[idx])
        current_ma = float(self.ma[idx])
        current_rsi = float(self.rsi[idx])  # 0~100
        current_date = self.dates[idx]

        # 下一天價格用於增量 PV（避免 look-ahead 的話，你也可以改成用當天 PV 差）
        next_price = float(self.prices[idx + 1])

        reward = 0.0
        info_trade_profit = 0.0
        sold_shares = 0.0

        # cooldown
        self.trade_cooldown = max(0, self.trade_cooldown - 1)

        # --------- 動作處理 ----------
        if action == 1 and self.trade_cooldown == 0:
            # Buy: 用一半現金買入
            if self.balance >= current_price:
                budget = self.balance * 0.5
                shares_to_buy = int(budget // current_price)
                if shares_to_buy > 0:
                    cost = shares_to_buy * current_price * (1.0 + self.trading_cost)
                    if self.balance >= cost:
                        self.balance -= cost
                        self.shares += shares_to_buy
                        self.last_buy_price = current_price
                        self.last_buy_step = self.current_step
                        self.action_log.append(f"Buy {shares_to_buy} @ {current_price:.2f}")
                        self.hold_streak = 0
                        self.trade_cooldown = self.cooldown_steps

                        # 低買導向 shaping：低於 MA、RSI 偏低 -> 更高獎勵
                        if current_price < current_ma:
                            reward += 0.5
                        if current_rsi < 35:
                            reward += 0.5
                    else:
                        self.action_log.append("Failed Buy (cost)")
                        reward -= 0.2
                else:
                    self.action_log.append("Buy 0 (budget too small)")
                    reward -= 0.05
            else:
                self.action_log.append("Failed Buy (insufficient cash)")
                reward -= 0.2

        elif action == 2 and self.shares > 0 and self.trade_cooldown == 0:
            # Sell: 賣一半持股
            sell_qty = int(self.shares * 0.5)
            sell_qty = max(1, sell_qty)  # 至少賣 1 股（若 shares>0）
            sell_qty = min(sell_qty, int(self.shares))

            revenue = sell_qty * current_price * (1.0 - self.trading_cost)
            self.balance += revenue
            self.shares -= sell_qty

            sold_shares = float(sell_qty)
            self.action_log.append(f"Sell {sell_qty} @ {current_price:.2f}")
            self.hold_streak = 0
            self.trade_cooldown = self.cooldown_steps

            # 低買高賣：若有 last_buy_price，計算已實現獲利
            if self.last_buy_price is not None:
                info_trade_profit = (current_price - self.last_buy_price) * sold_shares

                # 高賣導向 shaping：高於 MA、RSI 偏高 -> 更高獎勵
                if current_price > current_ma:
                    reward += 0.5
                if current_rsi > 65:
                    reward += 0.5

                # 真正「賺錢」才給大獎勵，「虧錢」給懲罰
                if info_trade_profit > 0:
                    reward += 1.0 + (info_trade_profit / (self.initial_balance + 1e-12)) * 200.0
                else:
                    reward -= 0.5 + (abs(info_trade_profit) / (self.initial_balance + 1e-12)) * 50.0

            # 如果賣到剩很少（或清倉），可選擇重置 last_buy_price
            if self.shares <= 0:
                self.shares = 0.0
                self.last_buy_price = None
                self.last_buy_step = None

        else:
            # Hold
            self.action_log.append("Hold")
            self.hold_streak += 1
            # 太久不動給小懲罰（避免永遠 Hold）
            reward += 0.05
            if self.hold_streak >= 10:
                reward -= 0.05

        # --------- 增量資產報酬（核心 reward） ----------
        pv_now = self._portfolio_value(current_price)
        pv_next = self._portfolio_value(next_price)

        # 用「下一步 PV - 本步 PV」做主要獎勵，避免只靠 shaping
        delta = (pv_next - pv_now) / (pv_now + 1e-12)
        reward += delta * 500.0

        done = self.current_step >= self.max_steps

        # episode 結束時加總回報（讓模型更重視長期）
        if done:
            total_return = (pv_now - self.initial_balance) / (self.initial_balance + 1e-12)
            reward += total_return * 300.0

        info = {
            "date": current_date,
            "action": self.action_log[-1],
            "price": current_price,
            "shares": float(self.shares),
            "balance": float(self.balance),
            "portfolio_value": float(pv_now),
            "sold_shares": float(sold_shares),
            "trade_profit": float(info_trade_profit),
        }
        return self._get_state(), float(reward), bool(done), info

    def _get_state(self):
        s = self.current_step
        e = s + self.window_size

        price_window = self.prices[s:e].astype(np.float64)
        ma_window = self.ma[s:e].astype(np.float64)
        rsi_window = self.rsi[s:e].astype(np.float64) / 100.0  # 0~1
        volume_window = self.volume[s:e].astype(np.float64)     # 0~1

        # 正規化：用視窗末端價格作基準，避免用全域 max（降低資料洩漏與尺度不穩）
        ref_price = price_window[-1] if price_window[-1] > 0 else 1.0
        price_norm = price_window / ref_price
        ma_norm = ma_window / ref_price

        sequence = np.stack([price_norm, ma_norm, rsi_window, volume_window], axis=1).astype(np.float32)

        # 額外狀態：持股/現金比例（簡化）
        # position_ratio：用「持股市值 / 初始資金」近似
        current_price = ref_price
        position_value = self.shares * current_price
        position_ratio = np.clip(position_value / (self.initial_balance + 1e-12), 0.0, 10.0)

        cash_ratio = np.clip(self.balance / (self.initial_balance + 1e-12), 0.0, 10.0)
        extra_state = np.array([position_ratio, cash_ratio], dtype=np.float32)

        return sequence, extra_state


# ========== LSTM + Dueling DQN ==========
class LSTM_DuelingDQN(nn.Module):
    def __init__(self, feature_size: int, action_size: int, lstm_hidden=64, lstm_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=feature_size,
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            dropout=0.2 if lstm_layers > 1 else 0.0,
        )

        combined_size = lstm_hidden + 2  # + extra_state(2)

        self.feature = nn.Sequential(
            nn.Linear(combined_size, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        self.value = nn.Sequential(
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

        self.advantage = nn.Sequential(
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, action_size),
        )

    def forward(self, sequence, extra_state):
        # sequence: (B, T, F)
        lstm_out, _ = self.lstm(sequence)
        lstm_feat = lstm_out[:, -1, :]  # (B, H)

        combined = torch.cat([lstm_feat, extra_state], dim=1)  # (B, H+2)
        x = self.feature(combined)

        v = self.value(x)                 # (B, 1)
        a = self.advantage(x)             # (B, A)
        q = v + a - a.mean(dim=1, keepdim=True)
        return q


# ========== DQN Agent（Double DQN + 目標網路） ==========
class DQNAgent:
    def __init__(
        self,
        action_size=3,
        feature_size=4,
        gamma=0.99,
        lr=1e-3,
        memory_size=50000,
        batch_size=64,
        epsilon_start=1.0,
        epsilon_min=0.05,
        epsilon_decay=0.995,
        target_update_steps=500,
        temperature=1.0,  # 可選：Boltzmann
        use_boltzmann=False,
    ):
        self.action_size = action_size
        self.feature_size = feature_size

        self.gamma = float(gamma)
        self.batch_size = int(batch_size)

        self.epsilon = float(epsilon_start)
        self.epsilon_min = float(epsilon_min)
        self.epsilon_decay = float(epsilon_decay)

        self.temperature = float(temperature)
        self.use_boltzmann = bool(use_boltzmann)

        self.memory = deque(maxlen=int(memory_size))

        self.model = LSTM_DuelingDQN(feature_size=feature_size, action_size=action_size).to(DEVICE)
        self.target_model = LSTM_DuelingDQN(feature_size=feature_size, action_size=action_size).to(DEVICE)
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()

        self.optimizer = optim.Adam(self.model.parameters(), lr=float(lr))
        self.criterion = nn.SmoothL1Loss()  # Huber loss

        self.learn_step = 0
        self.target_update_steps = int(target_update_steps)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, int(action), float(reward), next_state, float(done)))

    def act(self, state):
        sequence, extra_state = state

        # epsilon 探索：直接隨機
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)

        seq_t = torch.tensor(sequence, dtype=torch.float32, device=DEVICE).unsqueeze(0)
        extra_t = torch.tensor(extra_state, dtype=torch.float32, device=DEVICE).unsqueeze(0)

        with torch.no_grad():
            q = self.model(seq_t, extra_t)  # (1, A)

        if self.use_boltzmann:
            prob = torch.softmax(q / max(self.temperature, 1e-6), dim=1).detach().cpu().numpy()[0]
            return int(np.random.choice(self.action_size, p=prob))

        return int(torch.argmax(q, dim=1).item())

    def replay(self):
        if len(self.memory) < self.batch_size:
            return None

        minibatch = random.sample(self.memory, self.batch_size)

        sequences = torch.tensor([t[0][0] for t in minibatch], dtype=torch.float32, device=DEVICE)
        extra_states = torch.tensor([t[0][1] for t in minibatch], dtype=torch.float32, device=DEVICE)
        actions = torch.tensor([t[1] for t in minibatch], dtype=torch.long, device=DEVICE)
        rewards = torch.tensor([t[2] for t in minibatch], dtype=torch.float32, device=DEVICE)
        next_sequences = torch.tensor([t[3][0] for t in minibatch], dtype=torch.float32, device=DEVICE)
        next_extra_states = torch.tensor([t[3][1] for t in minibatch], dtype=torch.float32, device=DEVICE)
        dones = torch.tensor([t[4] for t in minibatch], dtype=torch.float32, device=DEVICE)

        # Q(s,a)
        q_values = self.model(sequences, extra_states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Double DQN：
        # online 網路選下一步 action，target 網路估值
        with torch.no_grad():
            next_actions = torch.argmax(self.model(next_sequences, next_extra_states), dim=1)
            next_q = self.target_model(next_sequences, next_extra_states).gather(1, next_actions.unsqueeze(1)).squeeze(1)
            target_q = rewards + (1.0 - dones) * self.gamma * next_q

        loss = self.criterion(q_values, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optimizer.step()

        # epsilon decay
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        # target update
        self.learn_step += 1
        if self.learn_step % self.target_update_steps == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        # temperature（如果用 boltzmann）
        if self.use_boltzmann:
            self.temperature = max(0.1, self.temperature * 0.999)

        return float(loss.item())

    def save(self, path):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        torch.save(
            {
                "model": self.model.state_dict(),
                "target": self.target_model.state_dict(),
                "epsilon": self.epsilon,
                "learn_step": self.learn_step,
            },
            path,
        )

    def load(self, path):
        ckpt = torch.load(path, map_location=DEVICE)
        self.model.load_state_dict(ckpt["model"])
        self.target_model.load_state_dict(ckpt["target"])
        self.epsilon = float(ckpt.get("epsilon", 0.0))
        self.learn_step = int(ckpt.get("learn_step", 0))


# ========== 訓練 + 測試 ==========
def train_and_test_dqn(
    ticker="SPMO",
    end_date=datetime(2024, 5, 9),
    train_years=8,
    test_years=1,
    window_size=20,
    initial_balance=100000.0,
    episodes=200,
):
    end_str = end_date.strftime("%Y-%m-%d")
    train_start = (end_date - timedelta(days=int(train_years * 365))).strftime("%Y-%m-%d")
    test_start = (end_date - timedelta(days=int(test_years * 365))).strftime("%Y-%m-%d")

    prices, volumes, dates = fetch_data(ticker, train_start, end_str)

    train_mask = dates < np.datetime64(test_start)
    test_mask = ~train_mask

    train_env = StockTradingEnv(
        prices[train_mask], volumes[train_mask], dates[train_mask],
        window_size=window_size, initial_balance=initial_balance, trading_cost=0.001
    )

    agent = DQNAgent(
        action_size=3,
        feature_size=4,
        gamma=0.99,
        lr=1e-3,
        memory_size=50000,
        batch_size=64,
        epsilon_start=1.0,
        epsilon_min=0.05,
        epsilon_decay=0.995,
        target_update_steps=500,
        use_boltzmann=False,
    )

    model_path = os.path.join(OUTPUT_DIR, "lstm_dueling_double_dqn_best.pth")
    best_pv = -1e18

    print(f"Device: {DEVICE}")
    print(f"Train range: {train_start} ~ {test_start} (exclusive)")
    print(f"Test  range: {test_start} ~ {end_str}")
    print(f"Save model to: {model_path}")

    for ep in range(1, episodes + 1):
        state = train_env.reset()
        total_reward = 0.0
        losses = []
        action_cnt = [0, 0, 0]

        for _ in range(train_env.max_steps):
            action = agent.act(state)
            action_cnt[action] += 1

            next_state, reward, done, info = train_env.step(action)
            agent.remember(state, action, reward, next_state, done)

            loss = agent.replay()
            if loss is not None:
                losses.append(loss)

            state = next_state
            total_reward += reward

            if done:
                break

        pv = train_env.balance + train_env.shares * float(train_env.prices[-1])
        avg_loss = float(np.mean(losses)) if losses else 0.0

        if pv > best_pv:
            best_pv = pv
            agent.save(model_path)

        if ep % 10 == 0 or ep == 1:
            print(
                f"Ep {ep:4d}/{episodes} | Reward {total_reward:8.2f} | PV {pv:10.2f} | "
                f"Loss {avg_loss:8.4f} | eps {agent.epsilon:5.3f} | "
                f"Cnt(H/B/S)={action_cnt}"
            )
            print("Recent actions:", train_env.action_log[-8:])

    # ========== 測試 ==========
    test_env = StockTradingEnv(
        prices[test_mask], volumes[test_mask], dates[test_mask],
        window_size=window_size, initial_balance=initial_balance, trading_cost=0.001
    )

    agent.load(model_path)
    agent.epsilon = 0.0  # 測試不探索

    state = test_env.reset()
    results = []

    for _ in range(test_env.max_steps):
        action = agent.act(state)
        next_state, reward, done, info = test_env.step(action)

        results.append(
            {
                "Date": info["date"],
                "Action": info["action"],
                "Price": info["price"],
                "Shares": info["shares"],
                "Balance": info["balance"],
                "Portfolio_Value": info["portfolio_value"],
                "Trade_Profit": info["trade_profit"],
            }
        )
        state = next_state
        if done:
            break

    df = pd.DataFrame(results)
    df["Date"] = pd.to_datetime(df["Date"])

    out_csv = os.path.join(OUTPUT_DIR, f"{ticker}_lstm_dqn_test_results.csv")
    df.to_csv(out_csv, index=False, encoding="utf-8-sig")

    final_pv = float(results[-1]["Portfolio_Value"]) if results else float("nan")
    hold_cnt = sum(r["Action"] == "Hold" for r in results)
    buy_cnt = sum(str(r["Action"]).startswith("Buy") for r in results)
    sell_cnt = sum(str(r["Action"]).startswith("Sell") for r in results)

    print("\n==== 測試完成 ====")
    print(f"測試結果已儲存：{out_csv}")
    print(f"最終資產：{final_pv:.2f}")
    print(f"動作統計：Hold={hold_cnt}, Buy={buy_cnt}, Sell={sell_cnt}")
    print(f"Best PV in training：{best_pv:.2f}")

    return model_path, out_csv


def predict_today_action(
    model_path,
    ticker="SPMO",
    window_size=20,
    initial_balance=100000.0,
    lookback_days=120,
):
    agent = DQNAgent(action_size=3, feature_size=4)
    agent.load(model_path)
    agent.epsilon = 0.0

    today = datetime.today()
    start_date = (today - timedelta(days=int(lookback_days))).strftime("%Y-%m-%d")
    end_date = today.strftime("%Y-%m-%d")

    prices, volumes, dates = fetch_data(ticker, start_date, end_date)
    env = StockTradingEnv(
        prices, volumes, dates,
        window_size=window_size, initial_balance=initial_balance, trading_cost=0.001
    )
    state = env.reset()

    info = None
    action = 0
    for _ in range(env.max_steps):
        action = agent.act(state)
        state, _, done, info = env.step(action)
        if done:
            break

    action_str = ["Hold", "Buy", "Sell"][action]
    print(f"\n[{pd.to_datetime(info['date']).date()}] 建議動作：{action_str}")
    print(
        f"價格：{info['price']:.2f} | 持股：{info['shares']:.0f} | 現金：{info['balance']:.2f} | 資產：{info['portfolio_value']:.2f}"
    )

    # 存檔紀錄
    daily_log = os.path.join(OUTPUT_DIR, "daily_signal_log.csv")
    row = pd.DataFrame(
        [
            {
                "Date": info["date"],
                "Action": action_str,
                "Price": info["price"],
                "Shares": info["shares"],
                "Balance": info["balance"],
                "Portfolio_Value": info["portfolio_value"],
            }
        ]
    )
    if os.path.exists(daily_log):
        row.to_csv(daily_log, mode="a", header=False, index=False, encoding="utf-8-sig")
    else:
        row.to_csv(daily_log, index=False, encoding="utf-8-sig")

    print(f"今日訊號已紀錄：{daily_log}")


if __name__ == "__main__":
    # 你可以改 ticker：
    # - 若你真的是要「0050」：yfinance 通常是 "0050.TW"
    # - 你原碼是 "SPMO"：我保留但你可自行改
    ticker = "SPMO"   # 例如改成 "0050.TW"

    best_model_path, test_csv_path = train_and_test_dqn(
        ticker=ticker,
        end_date=datetime(2024, 5, 9),
        train_years=8,
        test_years=1,
        window_size=20,
        initial_balance=100000.0,
        episodes=200,
    )

    predict_today_action(
        model_path=best_model_path,
        ticker=ticker,
        window_size=20,
        initial_balance=100000.0,
        lookback_days=180,
    )


[*********************100%***********************]  1 of 1 completed


Device: cpu
Train range: 2016-05-11 ~ 2023-05-10 (exclusive)
Test  range: 2023-05-10 ~ 2024-05-09
Save model to: c:\Users\L-JC (Harry)\Downloads\ppo\outputs\lstm_dueling_double_dqn_best.pth
Ep    1/200 | Reward  1126.70 | PV  119310.97 | Loss   0.9219 | eps 0.050 | Cnt(H/B/S)=[156, 454, 1129]
Recent actions: ['Hold', 'Buy 542 @ 55.74', 'Hold', 'Buy 268 @ 56.17', 'Hold', 'Sell 946 @ 54.88', 'Hold', 'Buy 605 @ 55.29']


KeyboardInterrupt: 