In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import backtrader as bt
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common import env_checker
import gymnasium as gym
from gymnasium import spaces

In [10]:
data = yf.download('AAPL', start='2020-01-01', end='2025-03-20', auto_adjust=False)
data.reset_index(inplace=True)
data['Date'] = pd.to_datetime(data['Date']) 
data = data[['Date', 'Open', 'High', 'Low', 'Close', 'Volume']]
data.set_index('Date', inplace=True)

[*********************100%***********************]  1 of 1 completed


In [11]:
data['Open'] = data['Open'].astype('float32')
data['High'] = data['High'].astype('float32')
data['Low'] = data['Low'].astype('float32')
data['Close'] = data['Close'].astype('float32')
data['Volume'] = data['Volume'].astype('float32')

In [7]:
def train_agent(env, model_class, model_name, timesteps=10000):
    model = model_class('MlpPolicy', env, verbose=0)
    model.learn(total_timesteps=timesteps)
    model.save(f"{model_name}_model")
    return model

In [30]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class TradingEnv(gym.Env):
    """自訂的交易環境，用於強化學習模型訓練"""

    def __init__(self, data, cash=10000, commission=0.001):
        super(TradingEnv, self).__init__()
        self.data = data.reset_index()
        self.cash = cash
        self.initial_cash = cash
        self.commission = commission
        self.current_step = 0

        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(5,), dtype=np.float32
        )
        self.action_space = spaces.Discrete(3)  # 0: 持有, 1: 買入, 2: 賣出

        self.position = 0
        self.net_worth = self.cash
        self.prev_net_worth = self.cash

        self.trades = []

    def _get_obs(self):
        obs = np.array([
            self.data.loc[self.current_step, 'Open'],
            self.data.loc[self.current_step, 'High'],
            self.data.loc[self.current_step, 'Low'],
            self.data.loc[self.current_step, 'Close'],
            self.data.loc[self.current_step, 'Volume'],
        ], dtype=np.float32).flatten()  # ✅ 確保是 shape=(5,)
        return obs


    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        if seed is not None:
            np.random.seed(seed)

        self.current_step = 0
        self.position = 0
        self.cash = self.initial_cash
        self.net_worth = self.cash
        self.prev_net_worth = self.cash
        self.trades = []

        obs = self._get_obs()
        info = {}
        return obs, info

    def step(self, action):
        current_price = self.data.loc[self.current_step, 'Close']
        date = self.data.loc[self.current_step, 'Date']
        self.trades.append({'Date': date, 'Action': action})

        commission = 0

        if action == 1:  # 買入
            max_shares = int(self.cash / (current_price * (1 + self.commission)))
            if max_shares > 0:
                cost = max_shares * current_price * (1 + self.commission)
                self.cash -= cost
                self.position += max_shares
                commission = cost * self.commission
        elif action == 2:  # 賣出
            if self.position > 0:
                revenue = self.position * current_price * (1 - self.commission)
                self.cash += revenue
                commission = self.position * current_price * self.commission
                self.position = 0

        self.current_step += 1
        self.net_worth = self.cash + self.position * current_price
        reward = float(self.net_worth - self.prev_net_worth - commission)
        self.prev_net_worth = self.net_worth

        terminated = self.current_step >= len(self.data) - 1
        truncated = False

        obs = self._get_obs()
        info = {}

        return obs, reward, terminated, truncated, info

    def render(self):
        pass


In [31]:
env = TradingEnv(data)

In [32]:
env_checker.check_env(env)

  max_shares = int(self.cash / (current_price * (1 + self.commission)))
  reward = float(self.net_worth - self.prev_net_worth - commission)


In [33]:
env = DummyVecEnv([lambda: env])

In [34]:
model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)

Using cuda device


  max_shares = int(self.cash / (current_price * (1 + self.commission)))
  reward = float(self.net_worth - self.prev_net_worth - commission)


-----------------------------
| time/              |      |
|    fps             | 207  |
|    iterations      | 1    |
|    time_elapsed    | 9    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 202          |
|    iterations           | 2            |
|    time_elapsed         | 20           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0057431827 |
|    clip_fraction        | 0.0909       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.09        |
|    explained_variance   | 0            |
|    learning_rate        | 0.0003       |
|    loss                 | 1.12e+05     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00364     |
|    value_loss           | 3.02e+05     |
------------------------------------------
----------------

<stable_baselines3.ppo.ppo.PPO at 0x7f22ff5813f0>

In [None]:
env.envs[0].reset()
for i in range(len(data) - 1):
    obs = env.envs[0]._get_obs()
    action, _states = model.predict(obs)
    obs, rewards, dones, truncated, info = env.envs[0].step(action)
    if dones:
        break