In [5]:
df = pd.read_csv('./000660.csv',encoding='cp949')


In [6]:
df

Unnamed: 0.1,Unnamed: 0,날짜,시간,시가,고가,저가,종가,거래량,code,MA5,MA20,MA60,MA120,Upper_Band,Lower_Band,day_name,매도량,매수량
0,2022-08-19 09:01:00,20220819,901,97000,97000,96500,96800,99206,660,,,,,,,Friday,11919.0,87281.0
1,2022-08-19 09:02:00,20220819,902,96800,97200,96700,97000,55648,660,,,,,,,Friday,15861.0,39787.0
2,2022-08-19 09:03:00,20220819,903,97100,97100,96200,96200,37596,660,,,,,,,Friday,17740.0,19856.0
3,2022-08-19 09:04:00,20220819,904,96200,96400,96100,96300,28194,660,,,,,,,Friday,15239.0,12955.0
4,2022-08-19 09:05:00,20220819,905,96300,96500,96100,96400,19232,660,96540.0,,,,,,Friday,8216.0,11016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186835,2024-08-19 15:16:00,20240819,1516,193100,193200,193100,193200,11360,660,193140.0,192995.0,193810.00,194504.17,193370.44,192619.56,Monday,7343.0,4017.0
186836,2024-08-19 15:17:00,20240819,1517,193200,193200,192900,193000,32191,660,193120.0,193005.0,193778.33,194476.67,193369.05,192640.95,Monday,26959.0,5232.0
186837,2024-08-19 15:18:00,20240819,1518,192900,193000,192800,192800,9585,660,193040.0,193000.0,193746.67,194448.33,193372.77,192627.23,Monday,6745.0,2840.0
186838,2024-08-19 15:19:00,20240819,1519,192800,193100,192800,193000,16917,660,193020.0,193000.0,193713.33,194421.67,193372.77,192627.23,Monday,4646.0,12271.0


In [3]:
import numpy as np
import pandas as pd
import gym
from gym import spaces
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
from sklearn.preprocessing import StandardScaler
from torch_geometric.nn import GCNConv
import matplotlib.pyplot as plt
from torch.nn import BatchNorm1d

# 데이터 로드 및 기술적 지표 계산
df = pd.read_csv('./000660.csv', encoding='cp949')
df.rename(columns={'종가': 'Close'}, inplace=True)
df['MA10'] = df['Close'].rolling(window=10).mean()
df['MA50'] = df['Close'].rolling(window=50).mean()
df['RSI'] = 100 - (100 / (1 + df['Close'].diff().apply(lambda x: np.maximum(x, 0))
                            .rolling(window=14).mean() / df['Close'].diff()
                            .apply(lambda x: np.abs(np.minimum(x, 0)))
                            .rolling(window=14).mean()))
df = df.dropna().reset_index()

# 입력 데이터 정규화
scaler = StandardScaler()
feature_cols = ['Close', 'MA10', 'MA50', 'RSI']
scaler.fit(df[feature_cols])

# 주식 트레이딩 환경 정의
class StockTradingEnv(gym.Env):
    def __init__(self, df, scaler):
        super(StockTradingEnv, self).__init__()
        self.df = df
        self.scaler = scaler
        self.max_steps = len(df) - 1
        self.current_step = 0

        self.action_space = spaces.Discrete(3)  # 매도(0), 보유(1), 매수(2)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(5,), dtype=np.float32
        )

        self.initial_balance = 10000000
        self.balance = self.initial_balance
        self.position = 0  # 0: 현금, 1: 주식 보유
        self.asset_value = self.initial_balance

        self.balance_history = []
        self.asset_value_history = []
        self.position_history = []
        self.action_history = []
        self.price_history = []

    def reset(self):
        self.balance = self.initial_balance
        self.position = 0
        self.asset_value = self.initial_balance
        self.current_step = 0

        self.balance_history = [self.balance]
        self.asset_value_history = [self.asset_value]
        self.position_history = [self.position]
        self.action_history = []
        self.price_history = [self.df.loc[self.current_step, 'Close']]

        return self._next_observation()

    def _next_observation(self):
        raw_state = self.df.loc[self.current_step, ['Close', 'MA10', 'MA50', 'RSI']]
        raw_state_df = pd.DataFrame([raw_state])
        scaled_state = self.scaler.transform(raw_state_df)[0]
        frame = np.append(scaled_state, self.position)
        return frame.astype(np.float32)

    def step(self, action):
        done = False
        price = self.df.loc[self.current_step, 'Close']
        prev_asset_value = self.asset_value

        if action == 0:  # 매도
            if self.position == 1:
                self.balance += price
                self.position = 0
        elif action == 1:  # 보유
            pass
        elif action == 2:  # 매수
            if self.position == 0:
                self.balance -= price
                self.position = 1

        self.asset_value = self.balance + self.position * price

        holding_cost = 0
        if self.position == 1:
            holding_cost = 0.001 * price
            self.asset_value -= holding_cost

        reward = (self.asset_value - prev_asset_value) * 10

        self.current_step += 1
        if self.current_step >= self.max_steps:
            done = True

        self.balance_history.append(self.balance)
        self.asset_value_history.append(self.asset_value)
        self.position_history.append(self.position)
        self.action_history.append(action)
        self.price_history.append(price)

        obs = self._next_observation()
        return obs, reward, done, {}

    def create_edge_index(self, num_nodes):
        edge_index = []
        for i in range(num_nodes):
            if i > 0:
                edge_index.append([i, i - 1])  # 이전 노드와 연결
            if i < num_nodes - 1:
                edge_index.append([i, i + 1])  # 다음 노드와 연결
        return torch.tensor(edge_index, dtype=torch.long).t().contiguous()

# 환경 생성
env = StockTradingEnv(df, scaler)

# 엣지 인덱스 생성
num_nodes = len(df)
edge_index = env.create_edge_index(num_nodes)

# PPO를 위한 액터-크리틱 신경망 정의
class ActorCritic(nn.Module):
    def __init__(self, input_dim, action_dim):
        super(ActorCritic, self).__init__()
        self.conv1 = GCNConv(input_dim, 32)
        self.bn1 = BatchNorm1d(32)
        self.conv2 = GCNConv(32, 64)
        self.bn2 = BatchNorm1d(64)
        self.conv3 = GCNConv(64, 128)
        self.fc = nn.Sequential(
            nn.Linear(128, 128), 
            nn.ReLU()
        )
        self.policy_head = nn.Linear(128, action_dim)
        self.value_head = nn.Linear(128, 1)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)
            nn.init.zeros_(module.bias)

    def forward(self, x, edge_index):
        # GCN 레이어에 입력
        x = self.conv1(x, edge_index)
        x = self.bn1(x)  # 첫 번째 배치 정규화
        x = torch.relu(x)
        
        x = self.conv2(x, edge_index)
        x = self.bn2(x)  # 두 번째 배치 정규화
        x = torch.relu(x)

        x = self.conv3(x, edge_index)  # 세 번째 GCN 레이어
        
        x = self.fc(x)  # FC 레이어로 전달
        policy_logits = self.policy_head(x)  # 정책 헤드
        value = self.value_head(x)  # 가치 헤드

        return policy_logits, value

    def act(self, state, edge_index):
        state_tensor = torch.FloatTensor(state).unsqueeze(0)  # (1, num_features) 형태로 변환
        policy_logits, _ = self.forward(state_tensor, edge_index)
        dist = Categorical(logits=policy_logits)
        action = dist.sample()
        action_logprob = dist.log_prob(action)
        return action.item(), action_logprob

    def evaluate(self, state, action):
        state_tensor = torch.FloatTensor(state)
        policy_logits, value = self.forward(state_tensor, edge_index)
        dist = Categorical(logits=policy_logits)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        return action_logprobs, value.squeeze(-1), dist_entropy

# 하이퍼파라미터 설정
learning_rate = 1e-3
gamma = 0.99
epsilon = 0.1
epochs = 10
entropy_coef = 0.05

# 정책 및 옵티마이저 초기화
input_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
policy = ActorCritic(input_dim, action_dim)
optimizer = optim.AdamW(policy.parameters(), lr=learning_rate)

# 메모리 클래스 정의
class Memory:
    def __init__(self):
        self.states = []
        self.actions = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear(self):
        self.states = []
        self.actions = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

memory = Memory()

# PPO 업데이트 함수 정의
def ppo_update():
    states = torch.tensor(memory.states, dtype=torch.float32)
    actions = torch.tensor(memory.actions, dtype=torch.int64)
    old_logprobs = torch.tensor(memory.logprobs, dtype=torch.float32)
    rewards = memory.rewards
    is_terminals = memory.is_terminals

    returns = []
    discounted_reward = 0
    for reward, is_terminal in zip(reversed(rewards), reversed(is_terminals)):
        if is_terminal:
            discounted_reward = 0
        discounted_reward = reward + (gamma * discounted_reward)
        returns.insert(0, discounted_reward)
    returns = torch.tensor(returns, dtype=torch.float32)

    with torch.no_grad():
        _, state_values = policy.forward(states, edge_index)
        advantages = returns - state_values.squeeze(-1)

    for _ in range(epochs):
        logprobs, state_values, dist_entropy = policy.evaluate(states, actions)
        ratios = torch.exp(logprobs - old_logprobs)
        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1 - epsilon, 1 + epsilon) * advantages
        loss = -torch.min(surr1, surr2) + 0.5 * advantages.pow(2) - entropy_coef * dist_entropy

        optimizer.zero_grad()
        loss.mean().backward()
        torch.nn.utils.clip_grad_norm_(policy.parameters(), max_norm=0.5)
        optimizer.step()

        print(f"Loss: {loss.mean().item():.4f}")

# 학습 루프
max_episodes = 10  # 에피소드 수를 줄여 빠른 테스트
update_interval = 1

# 행동 분포 추적을 위한 리스트
action_counts = []
best_asset_value = 0
best_model_path = 'best_actor_critic_model.pth'

for episode in range(max_episodes):
    print(f'episode{episode+1} 시작')
    state = env.reset()
    done = False
    episode_actions = []
    while not done:
        action, action_logprob = policy.act(state,edge_index)
        next_state, reward, done, _ = env.step(action)
        # 메모리에 데이터 저장
        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(action_logprob.item())
        memory.rewards.append(reward)
        memory.is_terminals.append(done)
        state = next_state
        episode_actions.append(action)

    # 정책 업데이트 및 메모리 초기화
    ppo_update()
    memory.clear()

    # 행동 분포 추적
    action_counts.append(np.bincount(episode_actions, minlength=3))

    # 에피소드별 성과 시각화
    plt.figure(figsize=(12, 10))

    # 포트폴리오 가치 변화 시각화
    plt.subplot(5, 1, 1)
    plt.plot(env.asset_value_history)
    plt.title(f'Episode {episode+1} - Asset Value Over Time')
    plt.ylabel('Asset Value')

    # 포지션 변화 시각화
    plt.subplot(5, 1, 2)
    plt.plot(env.position_history)
    plt.title('Position Over Time')
    plt.ylabel('Position')

    # 주가 변화 시각화
    plt.subplot(5, 1, 3)
    plt.plot(env.price_history)
    plt.title('Price Over Time')
    plt.ylabel('Price')

    # 행동 시각화
    plt.subplot(5, 1, 4)
    plt.plot(env.action_history)
    plt.title('Actions Over Time')
    plt.ylabel('Action')
    plt.xlabel('Time Step')
    plt.yticks([0, 1, 2], ['Sell', 'Hold', 'Buy'])

    # 행동 분포 시각화
    plt.subplot(5, 1, 5)
    counts = np.array(action_counts).sum(axis=0)
    plt.bar(['Sell', 'Hold', 'Buy'], counts)
    plt.title('Action Distribution')
    plt.ylabel('Counts')

    plt.tight_layout()
    plt.show()

    final_asset_value = env.asset_value_history[-1]
    print(f"Episode {episode+1} completed. Final Asset Value: {env.asset_value_history[-1]:.2f}")
    
    # 베스트 모델 저장 로직
    if final_asset_value > best_asset_value:
        best_asset_value = final_asset_value
        torch.save(policy.state_dict(), best_model_path)  # 모델 저장
        print(f"Best model saved with asset value: {best_asset_value:.2f}")

# 전체 행동 분포 시각화
total_counts = np.array(action_counts).sum(axis=0)
plt.figure(figsize=(6, 4))
plt.bar(['Sell', 'Hold', 'Buy'], total_counts)
plt.title('Total Action Distribution')
plt.ylabel('Counts')
plt.show()

# PPO 알고리즘 적용한 트레이딩 예제

episode1 시작


RuntimeError: index 1 is out of bounds for dimension 0 with size 1