In [16]:
!pip install ta



In [17]:
import numpy as np
import pandas as pd
import random
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import torch
from collections import deque
import matplotlib.pyplot as plt


In [18]:
import yfinance as yf

# 종목 티커 설정 (예: 애플)
ticker = 'AAPL'

# 데이터 불러오기 (기간 및 텀 설정)
data = yf.Ticker(ticker)
df_origin = data.history(interval='1d', period='3y', auto_adjust=False)

df_origin

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-04-05 00:00:00-04:00,177.500000,178.300003,174.419998,175.059998,172.268646,73401800,0.0,0.0
2022-04-06 00:00:00-04:00,172.360001,173.630005,170.130005,171.830002,169.090118,89058800,0.0,0.0
2022-04-07 00:00:00-04:00,171.160004,173.360001,169.850006,172.139999,169.395218,77594700,0.0,0.0
2022-04-08 00:00:00-04:00,171.779999,171.779999,169.199997,170.089996,167.377869,76575500,0.0,0.0
2022-04-11 00:00:00-04:00,168.710007,169.029999,165.500000,165.750000,163.107117,72246700,0.0,0.0
...,...,...,...,...,...,...,...,...
2025-03-31 00:00:00-04:00,217.009995,225.619995,216.229996,222.130005,222.130005,65299300,0.0,0.0
2025-04-01 00:00:00-04:00,219.809998,223.679993,218.899994,223.190002,223.190002,36412700,0.0,0.0
2025-04-02 00:00:00-04:00,221.320007,225.190002,221.020004,223.889999,223.889999,35905900,0.0,0.0
2025-04-03 00:00:00-04:00,205.539993,207.490005,201.250000,203.190002,203.190002,103419000,0.0,0.0


In [19]:
from ta.momentum import RSIIndicator, StochasticOscillator
from ta.trend import MACD, CCIIndicator, EMAIndicator
import pandas as pd

df = pd.DataFrame()

# 2) 기본적인 변화량(시가/고가/저가/종가/거래량)
df['Open_Change']   = df_origin['Open'].diff(1)
df['High_Change']   = df_origin['High'].diff(1)
df['Low_Change']    = df_origin['Low'].diff(1)
df['Close_Change']  = df_origin['Close'].diff(1)
df['Volume_Change'] = df_origin['Volume'].diff(1)

# 3) 20일 지수 이동평균 (EWM20)과 그 변화량
#    ta 라이브러리에도 EMAIndicator가 있지만, pandas ewm()을 직접 써도 무방합니다.
ema20 = EMAIndicator(close=df_origin['Close'], window=20)
df_origin['EWM20'] = ema20.ema_indicator()
df['EWM20_Change'] = df_origin['EWM20'].diff(1)

# 4) KDJ (Fast%K, Slow%D, Slow%J)
#    ta의 StochasticOscillator로 %K, %D를 구한 뒤, J는 3K - 2D 공식을 사용합니다.
stoch = StochasticOscillator(
    high=df_origin['High'],
    low=df_origin['Low'],
    close=df_origin['Close'],
    window=5,         # Fast%K 기간
    smooth_window=3   # Fast%D 기간
)
df['FastK'] = stoch.stoch()          # Fast%K
df['SlowD'] = stoch.stoch_signal()   # Slow%D (StochasticOscillator에서 %K의 이동평균)
df['SlowJ'] = 3 * df['FastK'] - 2 * df['SlowD']  # KDJ의 J 계산

# 5) MACD (MACD, MACDS, MACDO)
macd = MACD(
    close=df_origin['Close'],
    window_slow=26,
    window_fast=12,
    window_sign=9
)
df['MACD']  = macd.macd()
df['MACDS'] = macd.macd_signal()
# 질문에서 MACDO = MACD - MACDS 로 정의했으므로
df['MACDO'] = df['MACD'] - df['MACDS']

# 6) CCI (Commodity Channel Index)
cci = CCIIndicator(
    high=df_origin['High'],
    low=df_origin['Low'],
    close=df_origin['Close'],
    window=14,    # 보통 14 또는 20
    constant=0.015
)
df['CCI'] = cci.cci()

# 7) RSI (Relative Strength Index)
rsi = RSIIndicator(
    close=df_origin['Close'],
    window=14
)
df['RSI'] = rsi.rsi()
df['Close'] = df_origin['Close']

df

# 8) 결과 확인
df.head(10)

Unnamed: 0_level_0,Open_Change,High_Change,Low_Change,Close_Change,Volume_Change,EWM20_Change,FastK,SlowD,SlowJ,MACD,MACDS,MACDO,CCI,RSI,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-04-05 00:00:00-04:00,,,,,,,,,,,,,,,175.059998
2022-04-06 00:00:00-04:00,-5.139999,-4.669998,-4.289993,-3.229996,15657000.0,,,,,,,,,,171.830002
2022-04-07 00:00:00-04:00,-1.199997,-0.270004,-0.279999,0.309998,-11464100.0,,,,,,,,,,172.139999
2022-04-08 00:00:00-04:00,0.619995,-1.580002,-0.650009,-2.050003,-1019200.0,,,,,,,,,,170.089996
2022-04-11 00:00:00-04:00,-3.069992,-2.75,-3.699997,-4.339996,-4328800.0,,1.953125,,,,,,,,165.75
2022-04-12 00:00:00-04:00,-0.690002,0.839996,1.139999,1.910004,7018500.0,,26.568295,,,,,,,,167.660004
2022-04-13 00:00:00-04:00,-0.630005,1.169998,0.130005,2.73999,-8646300.0,,62.340884,30.287435,126.447784,,,,,,170.399994
2022-04-14 00:00:00-04:00,3.229996,0.230011,-1.730011,-5.110001,4710500.0,,3.709196,30.872792,-50.617996,,,,,,165.289993
2022-04-18 00:00:00-04:00,-6.699997,-4.669998,-1.469986,-0.219986,-6305500.0,,19.480527,28.510202,1.421177,,,,,,165.070007
2022-04-19 00:00:00-04:00,1.100006,1.220001,0.339996,2.329987,-1300100.0,,49.740105,24.309943,100.60043,,,,,,167.399994


In [20]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class StockTradingEnv(gym.Env):
    metadata = {'render.modes': ['human']}  # Gym 환경이 지원하는 렌더링 모드 설정 (human 모드는 텍스트 형태로 렌더링).

    # 환경 초기화
    # 클래스 초기화. 매개변수로는 주식 데이터가 담긴 DataFrame(df)을 받음.
    def __init__(self, df):
        super(StockTradingEnv, self).__init__()     # 부모 클래스 (gym.Env) 초기화.

        self.df = df                                # 주가 데이터(DataFrame)를 클래스 내부 변수에 저장.
        self.max_steps = len(df) - 1                # 환경 내 최대 스텝 수를 데이터 길이에 맞춰 설정 (데이터 인덱스를 벗어나지 않기 위해 -1)
        self.current_step = 0                       # 현재 진행 중인 스텝 번호 (에피소드 시작은 항상 0에서부터 시작)
        self.initial_balance = 1000000              # 초기 자산을 100만원으로 설정
        self.balance = self.initial_balance         # 현재 보유 중인 현금을 초기 자산과 동일하게 설정
        self.shares_held = 0                        # 처음에는 주식을 전혀 보유하고 있지 않음
        self.avg_buy_price = 0                      # 평균 매수가격 초기화 (주식을 구매할 때 업데이트됨)

        # 상태 공간 정의 (14개 상태 변수)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(14,), dtype=np.float32)

        # 행동 공간 정의 (0: hold, 1: action)
        self.action_space = spaces.Discrete(2)

    # 다음 상태 가져오기 메서드
    def _next_observation(self):
        obs = self.df.iloc[self.current_step].values.astype(np.float32)   # 현재 스텝에 해당하는 데이터의 지표들을 numpy 배열로 변환하고, float32로 설정
        return obs                                                        # 관측된 상태값 반환

    # 에피소드 초기화
    def reset(self, seed=None, options=None):
        super().reset(seed=seed)                # Gym 환경의 리셋 메서드 호출 (시드값 설정 가능)
        self.balance = self.initial_balance     # 보유 현금을 초기화
        self.shares_held = 0                    # 보유 주식 수 초기화
        self.avg_buy_price = 0                  # 평균 매수가 초기화
        self.current_step = 0                   # 환경의 스텝 번호를 초기화하여 첫 번째 데이터에서 시작
        return self._next_observation(), {}     # 초기 상태를 반환하고, 추가 정보를 담는 dict는 비어있는 상태로 반환

    # 행동 수행 메서드
    def step(self, action):
      done = False                              # 종료 여부를 기본적으로 False로 설정
      self.current_step += 1                    # 스텝을 한 칸 진행시킴 (다음 날로 이동)

      current_price = self.df['Close'].iloc[self.current_step]      # 현재 스텝의 종가를 현재 가격으로 설정
      next_price = self.df['Close'].iloc[self.current_step + 1] if self.current_step + 1 < len(self.df) else current_price

      reward = 0        # 보상을 기본값으로 0으로 설정

      # 현재 현금이 주가보다 많다면 Buy Agent가 행동 결정
      if self.balance >= current_price:
          # Buy agent (0: Buy, 1: Hold)
          if action == 0:  # Buy
              shares_bought = self.balance // current_price   # 살 수 있는 최대 주식 수를 계산 (소수점은 제외)
              self.avg_buy_price = current_price              # 평균 매수가를 현재 가격으로 설정
              self.shares_held += shares_bought               # 보유한 주식 수를 증가
              self.balance -= shares_bought * current_price   # 주식을 사면서 잔고에서 구매한 금액을 차감
              reward = next_price - current_price             # 보상은 내일 가격에서 오늘 가격을 뺀 값으로 설정 (가격이 오르면 양수 보상)

          elif action == 1:  # BuyHold
              reward = (current_price - next_price) / current_price    # 유지 보상은 오늘 가격과 내일 가격의 차이를 비율로 계산

      # 주식을 보유 중이라면 Sell Agent
      elif self.shares_held > 0:
          # Sell agent (0: Sell, 1: Hold)
          if action == 0:  # Sell
              profit = (current_price - self.avg_buy_price) * self.shares_held    # 총 수익을 계산 (현재 가격 - 매수가격) × 보유 주식 수
              reward = profit / self.avg_buy_price                                # 수익을 평균 매수가로 나눠 수익률을 계산하여 보상으로 줌
              self.balance += self.shares_held * current_price                    # 매도한 금액을 현금에 추가
              self.shares_held = 0                                                # 보유 주식 수를 0으로 초기화
              self.avg_buy_price = 0                                              # 평균 매수가도 초기화

          elif action == 1:  # SellHold
              profit = (next_price - self.avg_buy_price) * self.shares_held       # 내일 가격을 바탕으로 예상 수익을 계산
              reward = profit / self.avg_buy_price                                # 예상 수익률을 보상으로 설정

      # 주식도 없고, 현금도 부족하면 행동불가 (reward 0)
      else:
          reward = 0

      # 스텝이 마지막 날에 도달하면 에피소드 종료
      if self.current_step >= self.max_steps - 1:
          done = True

      obs = self._next_observation()  # 새로운 상태 관측값 반환 준비

      return obs, reward, done, False, {} # 다음 상태, 보상, 종료 여부, 그리고 추가 정보(빈 dict)를 반환

    # 렌더링 메서드
    def render(self, mode='human', close=False):
        profit = self.balance + self.shares_held * self.df['Close'].iloc[self.current_step] - self.initial_balance    # 현재 자산과 초기 자산을 비교하여 현재 수익을 계산
        print(f'Step: {self.current_step}, Balance: {self.balance:.2f}, Shares: {self.shares_held}, Profit: {profit:.2f}')    # 스텝, 현금 잔고, 보유 주식 수, 누적 수익을 출력하여 현황을 표시



In [21]:
env = StockTradingEnv(df)

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 128)
        self.fc4 = nn.Linear(128, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)


In [23]:
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))

class ReplayBuffer:
    def __init__(self, capacity=1000):
        self.memory = deque(maxlen=capacity)

    def add(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


In [24]:
import torch.optim as optim
import numpy as np

class Agent:
    def __init__(self, state_size, action_size, lr=0.001, gamma=0.001, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma

        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        self.policy_net = DQN(state_size, action_size)
        self.target_net = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)

        self.buffer = ReplayBuffer()
        self.update_target_net()

    def update_target_net(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def act(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            return self.policy_net(state).argmax().item()

    def remember(self, *args):
        self.buffer.add(*args)

    def replay(self, batch_size):
        if len(self.buffer) < batch_size:
            return

        transitions = self.buffer.sample(batch_size)
        batch = Transition(*zip(*transitions))

        state_batch = torch.FloatTensor(batch.state)
        action_batch = torch.LongTensor(batch.action).unsqueeze(1)
        reward_batch = torch.FloatTensor(batch.reward).unsqueeze(1)
        next_state_batch = torch.FloatTensor(batch.next_state)
        done_batch = torch.FloatTensor(batch.done).unsqueeze(1)

        current_q = self.policy_net(state_batch).gather(1, action_batch)
        next_q = self.target_net(next_state_batch).max(1)[0].detach().unsqueeze(1)
        expected_q = reward_batch + self.gamma * next_q * (1 - done_batch)

        loss = F.mse_loss(current_q, expected_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ε decay
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [28]:
state_size = 15
action_size = 3  # [0: action, 1: hold]

buy_agent = Agent(state_size, action_size)
sell_agent = Agent(state_size, action_size)

In [29]:
def train(env, buy_agent, sell_agent, num_episodes=20, batch_size=32, target_update_freq=4):
    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0
        step = 0

        while not done:
            current_price = env.df['Close'].iloc[env.current_step]

            # 어떤 agent를 쓸지 결정
            if env.balance >= current_price:
                agent = buy_agent
            elif env.shares_held > 0:
                agent = sell_agent
            else:
                # 아무 행동도 할 수 없는 경우
                next_state, reward, done, _, _ = env.step(1)
                state = next_state
                continue

            # 행동 선택 및 환경 적용
            action = agent.act(state)
            next_state, reward, done, _, _ = env.step(action)

            # 메모리에 기록
            agent.remember(state, action, reward, next_state, done)

            # 학습
            agent.replay(batch_size)

            state = next_state
            total_reward += reward
            step += 1

        # 일정 주기로 target network 업데이트
        if episode % target_update_freq == 0:
            buy_agent.update_target_net()
            sell_agent.update_target_net()

        print(f"Episode {episode+1}/{num_episodes} - Total reward: {total_reward:.2f} - Steps: {step}")


In [30]:
# 예시 실행
train(env, buy_agent, sell_agent, num_episodes=50)

Episode 1/50 - Total reward: 2359.15 - Steps: 751
Episode 2/50 - Total reward: 3045.07 - Steps: 751
Episode 3/50 - Total reward: -839.08 - Steps: 751
Episode 4/50 - Total reward: 1504.03 - Steps: 751
Episode 5/50 - Total reward: 803.91 - Steps: 751
Episode 6/50 - Total reward: 447.40 - Steps: 751
Episode 7/50 - Total reward: 206.21 - Steps: 751
Episode 8/50 - Total reward: 3592.35 - Steps: 751
Episode 9/50 - Total reward: 24.09 - Steps: 751
Episode 10/50 - Total reward: 930.77 - Steps: 751
Episode 11/50 - Total reward: 1910.47 - Steps: 751
Episode 12/50 - Total reward: -810.87 - Steps: 751
Episode 13/50 - Total reward: -298.63 - Steps: 751
Episode 14/50 - Total reward: 185.19 - Steps: 751
Episode 15/50 - Total reward: 977.29 - Steps: 751
Episode 16/50 - Total reward: 2314.81 - Steps: 751
Episode 17/50 - Total reward: 1757.13 - Steps: 751
Episode 18/50 - Total reward: 179.15 - Steps: 751
Episode 19/50 - Total reward: 1029.97 - Steps: 751
Episode 20/50 - Total reward: 1326.96 - Steps: 75

In [31]:
def evaluate_agent(env, buy_agent, sell_agent, render=True):
    state, _ = env.reset()
    done = False
    total_reward = 0

    while not done:
        current_price = env.df['Close'].iloc[env.current_step]

        # 자산 조건에 따라 에이전트 선택
        if env.balance >= current_price:
            agent = buy_agent
        elif env.shares_held > 0:
            agent = sell_agent
        else:
            # 아무것도 못할 때는 그냥 넘어감
            next_state, reward, done, _, _ = env.step(1)
            state = next_state
            continue

        # ε 없이 행동 선택 (탐색 x)
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            action = agent.policy_net(state_tensor).argmax().item()

        next_state, reward, done, _, _ = env.step(action)

        if render:
            env.render()

        state = next_state
        total_reward += reward

    print(f"\n[Evaluation Complete] Final Balance: {env.balance:.2f}, Total Profit: {env.balance + env.shares_held * env.df['Close'].iloc[env.current_step] - env.initial_balance:.2f}")
    return total_reward


In [32]:
# 학습이 끝난 후 evaluate
reward = evaluate_agent(env, buy_agent, sell_agent)


Step: 1, Balance: 121.22, Shares: 5819.0, Profit: 0.00
Step: 2, Balance: 1001803.88, Shares: 0, Profit: 1803.88
Step: 3, Balance: 143.89, Shares: 5889.0, Profit: 1803.88
Step: 4, Balance: 976245.64, Shares: 0, Profit: -23754.36
Step: 5, Balance: 129.10, Shares: 5822.0, Profit: -23754.36
Step: 6, Balance: 992197.86, Shares: 0, Profit: -7802.14
Step: 7, Balance: 127.32, Shares: 6002.0, Profit: -7802.14
Step: 8, Balance: 990877.50, Shares: 0, Profit: -9122.50
Step: 9, Balance: 36.94, Shares: 5919.0, Profit: -9122.50
Step: 10, Balance: 989871.29, Shares: 0, Profit: -10128.71
Step: 11, Balance: 5.14, Shares: 5948.0, Profit: -10128.71
Step: 12, Balance: 962332.02, Shares: 0, Profit: -37667.98
Step: 13, Balance: 36.95, Shares: 5908.0, Profit: -37667.98
Step: 14, Balance: 926411.37, Shares: 0, Profit: -73588.63
Step: 15, Balance: 143.20, Shares: 5916.0, Profit: -73588.63
Step: 16, Balance: 968237.44, Shares: 0, Profit: -31762.56
Step: 17, Balance: 108.83, Shares: 6141.0, Profit: -31762.56
Step