In [2]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
class StockTradingEnv(gym.Env):
    def __init__(self, data):
        super(StockTradingEnv, self).__init__()
        self.data = data
        self.current_step = 0
        self.balance = 10000
        self.shares_held = 0
        self.net_worth = self.balance
        self.max_net_worth = self.balance
        self.done = False

        self.action_space = gym.spaces.Discrete(3)

        self.observation_space = gym.spaces.Box(low=0, high=np.inf, shape=(6,), dtype=np.float32)

    def reset(self):
        self.balance = 10000
        self.shares_held = 0
        self.net_worth = self.balance
        self.max_net_worth = self.balance
        self.current_step = 0
        self.done = False
        return self._next_observation()
    
    def _next_observation(self):
        obs = np.array([
            self.data.iloc[self.current_step]['Open'],
            self.data.iloc[self.current_step]['High'],
            self.data.iloc[self.current_step]['Low'],
            self.data.iloc[self.current_step]['Close'],
            self.balance,
            self.shares_held
        ])
        return obs
    
    def step(self, action):
        current_price = self.data.iloc[self.current_step]['Close']
        self.current_step += 1
        
        if action == 1:  
            max_shares = int(self.balance / current_price)
            shares_bought = max_shares
            self.balance -= shares_bought * current_price
            self.shares_held += shares_bought
            
        elif action == 2:  
            self.balance += self.shares_held * current_price
            self.shares_held = 0
        
        self.net_worth = self.balance + self.shares_held * current_price
        self.max_net_worth = max(self.max_net_worth, self.net_worth)
        
        reward = self.net_worth - self.max_net_worth
        
        if self.current_step >= len(self.data) - 1:
            self.done = True
        
        obs = self._next_observation()
        return obs, reward, self.done, {}

In [4]:
import yfinance as yf

data = yf.download('AAPL', start='2020-01-01', end='2025-03-19', auto_adjust=False)
data.reset_index(inplace=True)
data = data[['Open', 'High', 'Low', 'Close', 'Volume']]

[*********************100%***********************]  1 of 1 completed


In [9]:
import random  # Import the random module

class DQN(nn.Module):
      def __init__(self, input_dim, action_dim):
          super(DQN, self).__init__()
          self.fc1 = nn.Linear(input_dim, 64)
          self.fc2 = nn.Linear(64, 64)
          self.out = nn.Linear(64, action_dim)

      def forward(self, x):
          x = torch.relu(self.fc1(x))
          x = torch.relu(self.fc2(x))
          return self.out(x)

class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size  # 狀態空間的維度
        self.action_size = action_size  # 行動空間的維度
        self.memory = []  # 經驗回放記憶庫
        self.gamma = 0.99  # 折扣因子
        self.epsilon = 1.0  # 初始探索率
        self.epsilon_min = 0.01  # 最小探索率
        self.epsilon_decay = 0.995  # 探索率衰減率
        self.model = DQN(state_size, action_size)  # 當前 Q 網絡
        self.target_model = DQN(state_size, action_size)  # 目標 Q 網絡
        self.update_target_model()  # 初始化目標網絡
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)  # 優化器
        self.criterion = nn.MSELoss()  # 損失函數

    def update_target_model(self):
        # 將當前模型的權重複製到目標模型
        self.target_model.load_state_dict(self.model.state_dict())

    def act(self, state):
        # 根據當前狀態選擇行動
        if np.random.rand() <= self.epsilon:
            # 隨機選擇行動（探索）
            return np.random.choice(self.action_size)
        # 使用模型預測行動（利用）
        state = torch.FloatTensor(state).unsqueeze(0)
        act_values = self.model(state)
        return torch.argmax(act_values, dim=1).item()

    def remember(self, state, action, reward, next_state, done):
        # 將經驗存入記憶庫
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        # 從記憶庫中取樣進行訓練
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
                target = reward + self.gamma * torch.max(self.target_model(next_state_tensor)).item()
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            target_f = self.model(state_tensor)
            target_f = target_f.clone().detach()
            target_f[0][action] = target
            self.optimizer.zero_grad()
            output = self.model(state_tensor)
            loss = self.criterion(output, target_f)
            loss.backward()
            self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        # 加載模型權重
        self.model.load_state_dict(torch.load(name))

    def save(self, name):
        # 保存模型權重
        torch.save(self.model.state_dict(), name)


        

In [10]:
n_episodes = 50
batch_size = 32
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = Agent(state_size, action_size)

In [11]:
import random
total_reward_list = []
for e in range(n_episodes):
      state = env.reset()
      total_reward = 0
      for time in range(len(data) - 1):
          action = agent.act(state)
          next_state, reward, done, _ = env.step(action)
          agent.remember(state, action, reward, next_state, done)
          state = next_state
          total_reward += reward
          if done:
              agent.update_target_model()
              print(f"Episode: {e+1}/{n_episodes}, Total Reward: {total_reward:.2f}, Epsilon: {agent.epsilon:.2f}")
              total_reward_list.append(total_reward)
              break
          agent.replay(batch_size)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (6,) + inhomogeneous part.