In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from gymnasium import spaces
import tensorflow as tf
from tensorflow.keras import layers
import torch
import torch.nn as nn
import torch.optim as optim
import random
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from stable_baselines3 import DDPG
from stable_baselines3.common.vec_env import DummyVecEnv
import ta
import shimmy
from collections import deque, namedtuple
import copy
import gymnasium as gym
import matplotlib.pyplot as plt
import torch.nn.functional as F
import tqdm



In [26]:
# Ensure reproducibility
random.seed(1234)
np.random.seed(1234)
torch.manual_seed(1234)

<torch._C.Generator at 0x1c78fbf03f0>

In [None]:
# Define parameters such as:
START_DATE = '2010-01-01'
END_DATE = '2020-12-31'
STOCKS = ['AAPL']  # AAPL for testing
TRAIN_TEST_SPLIT = 0.8
WINDOW_SIZE = 60  # Number of past days to consider
INITIAL_BALANCE = 100000  # Starting cash
data = {}

In [16]:
def download_data(tickers, start, end):
    data = {}
    for ticker in tickers:
        data[ticker] = yf.download(ticker, start=start, end=end)
    return data

def clean_data(data):
    cleaned_data = {}
    for ticker, df in data.items():
        df = df.dropna()
        cleaned_data[ticker] = df
    return cleaned_data


In [17]:
data = download_data(STOCKS, START_DATE, END_DATE)
data = clean_data(data)

[*********************100%***********************]  1 of 1 completed


In [None]:
print(data)

In [37]:
# Define the Replay Buffer
class ReplayMemory:
    """Experience Replay Buffer"""
    def __init__(self, max_size, state_size):
        self.max_size = max_size
        self.state_size = state_size
        self.buffer = deque(maxlen=max_size)
    
    def add(self, state, action, reward, next_state, done):
        """Add a transition to the buffer"""
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample_batch(self, batch_size):
        """Sample a batch of transitions"""
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return Batch(states=np.array(states),
                     actions=np.array(actions),
                     rewards=np.array(rewards),
                     next_states=np.array(next_states),
                     dones=np.array(dones))
    
    def __len__(self):
        return len(self.buffer)

# Define a namedtuple for batches
Batch = namedtuple('Batch', ('states', 'actions', 'rewards', 'next_states', 'dones'))

In [39]:
# Define the Deep Q-Network
class DeepQNet(nn.Module):
    def __init__(self, input_dim, num_hidden_layer, dim_hidden_layer, output_dim):
        super(DeepQNet, self).__init__()
        layers = []
        layers.append(nn.Linear(input_dim, dim_hidden_layer))
        layers.append(nn.ReLU())
        for _ in range(num_hidden_layer - 1):
            layers.append(nn.Linear(dim_hidden_layer, dim_hidden_layer))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(dim_hidden_layer, output_dim))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x)

# Initialize weights using Xavier Uniform
def customized_weights_init(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.constant_(m.bias, 0)

# Define the Epsilon Schedule
class ExponentialSchedule:
    """Exponential schedule for epsilon-greedy policy"""
    def __init__(self, value_from, value_to, num_steps):
        """
        value(t) = a * exp(b * t)
        """
        self.value_from = value_from
        self.value_to = value_to
        self.num_steps = num_steps
        # Solve for a and b such that:
        # value(0) = a = value_from
        # value(num_steps) = a * exp(b * num_steps) = value_to
        self.a = self.value_from
        self.b = np.log(self.value_to / self.value_from) / self.num_steps
    
    def value(self, step) -> float:
        if step >= self.num_steps:
            return self.value_to
        elif step < 0:
            return self.value_from
        else:
            return self.a * np.exp(self.b * step)


In [40]:
# Define the DQN Agent
class DQNAgent:
    def __init__(self, state_size, action_size, hidden_layers=2, hidden_dim=64, learning_rate=1e-3, gamma=0.95, device='cpu'):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.device = torch.device(device)
        
        # Initialize the Q-Network and Target Network
        self.q_network = DeepQNet(input_dim=state_size,
                                  num_hidden_layer=hidden_layers,
                                  dim_hidden_layer=hidden_dim,
                                  output_dim=action_size).to(self.device)
        self.q_network.apply(customized_weights_init)
        
        self.target_network = DeepQNet(input_dim=state_size,
                                       num_hidden_layer=hidden_layers,
                                       dim_hidden_layer=hidden_dim,
                                       output_dim=action_size).to(self.device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()
        
        # Optimizer
        self.optimizer = torch.optim.Adam(self.q_network.parameters(), lr=learning_rate)
        
        # Replay Buffer
        self.replay_buffer = ReplayMemory(max_size=100000, state_size=state_size)
        
        # Exploration parameters
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995  # Exponential decay factor
    
    def get_action(self, state, eps):
        """Select an action using epsilon-greedy policy"""
        if random.random() < eps:
            return random.randint(0, self.action_size -1)
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_values = self.q_network(state_tensor)
        return torch.argmax(q_values, dim=1).item()
    
    def remember(self, state, action, reward, next_state, done):
        """Store experience in replay buffer"""
        self.replay_buffer.add(state, action, reward, next_state, done)
    
    def replay(self, batch_size):
        """Sample a batch and perform a training step"""
        if len(self.replay_buffer) < batch_size:
            return None  # Not enough samples
        batch = self.replay_buffer.sample_batch(batch_size)
        states = torch.FloatTensor(batch.states).to(self.device)
        actions = torch.LongTensor(batch.actions).unsqueeze(1).to(self.device)
        rewards = torch.FloatTensor(batch.rewards).unsqueeze(1).to(self.device)
        next_states = torch.FloatTensor(batch.next_states).to(self.device)
        dones = torch.FloatTensor(batch.dones).unsqueeze(1).to(self.device)
        
        # Current Q values
        current_q = self.q_network(states).gather(1, actions)
        
        # Target Q values
        with torch.no_grad():
            max_next_q = self.target_network(next_states).max(1)[0].unsqueeze(1)
            target_q = rewards + (self.gamma * max_next_q * (1 - dones))
        
        # Compute loss
        loss = F.mse_loss(current_q, target_q)
        
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        # Epsilon decay
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        
        return loss.item()
    
    def update_target_network(self):
        """Update the target network to match the Q-Network"""
        self.target_network.load_state_dict(self.q_network.state_dict())


In [34]:
# Define the Trading Environment (Assuming it's already defined and compatible)
# from your current project code, ensure that the TradingEnv class excludes non-numerical data.

# Example:
# env = TradingEnv(data[ticker])  # Replace 'ticker' with actual stock symbol

# Define the Training Loop
def train_dqn_trading_agent(env, agent, exploration_schedule, num_steps=10000, batch_size=64, target_update_freq=1000):
    """
    Train the DQN agent in the given environment.

    Args:
        env: TradingEnv instance
        agent: DQNAgent instance
        exploration_schedule: ExponentialSchedule instance
        num_steps: Total number of training steps
        batch_size: Size of training batches
        target_update_freq: Frequency (in steps) to update the target network

    Returns:
        rewards_history: List of total rewards per episode
        loss_history: List of loss values per training step
    """
    rewards_history = []
    loss_history = []
    episode_rewards = 0
    episode = 0
    state, _ = env.reset()
    state = state.flatten()
    done = False
    
    pbar = tqdm.trange(num_steps, desc="Training Progress")
    for t in pbar:
        # Get current epsilon
        epsilon = exploration_schedule.value(t)
        agent.epsilon = epsilon  # Update agent's epsilon
        
        # Select action
        action = agent.get_action(state, eps=epsilon)
        
        # Execute action
        next_state, reward, done, _, _ = env.step(action)
        next_state = next_state.flatten()
        
        # Store experience
        agent.remember(state, action, reward, next_state, done)
        
        # Update state
        state = next_state
        episode_rewards += reward
        
        # Perform a training step
        loss = agent.replay(batch_size)
        if loss is not None:
            loss_history.append(loss)
            pbar.set_postfix({'Loss': f"{loss:.4f}", 'Epsilon': f"{epsilon:.4f}"})
        
        # Update target network
        if t % target_update_freq == 0 and t != 0:
            agent.update_target_network()
            pbar.write(f"Updated Target Network at step {t}")
        
        # Check if episode is done
        if done:
            rewards_history.append(episode_rewards)
            pbar.write(f"Episode {episode+1} finished with total reward {episode_rewards}")
            # Reset environment
            state, _ = env.reset()
            state = state.flatten()
            episode_rewards = 0
            episode += 1
    
    return rewards_history, loss_history

In [35]:
# Define a function to plot training metrics
def plot_training_metrics(rewards, losses, window=100):
    """
    Plot the training rewards and losses.

    Args:
        rewards: List of total rewards per episode
        losses: List of loss values per training step
        window: Window size for moving average
    """
    # Plot Rewards
    plt.figure(figsize=(12,5))
    plt.subplot(1,2,1)
    plt.plot(rewards, label='Total Reward per Episode')
    if len(rewards) >= window:
        moving_avg = np.convolve(rewards, np.ones(window)/window, mode='valid')
        plt.plot(range(window-1, len(rewards)), moving_avg, label=f'{window}-Episode Moving Average')
    plt.xlabel('Episodes')
    plt.ylabel('Total Reward')
    plt.title('DQN Training Rewards')
    plt.legend()
    
    # Plot Losses
    plt.subplot(1,2,2)
    plt.plot(losses, label='Loss per Training Step')
    if len(losses) >= window:
        moving_avg_loss = np.convolve(losses, np.ones(window)/window, mode='valid')
        plt.plot(range(window-1, len(losses)), moving_avg_loss, label=f'{window}-Step Moving Average')
    plt.xlabel('Training Steps')
    plt.ylabel('Loss')
    plt.title('DQN Training Loss')
    plt.legend()
    
    plt.tight_layout()
    plt.show()

In [36]:
# Example Integration and Training
if __name__ == "__main__":
    # Initialize your trading environment
    # Replace 'AAPL' with your actual stock ticker and ensure 'data' is correctly loaded
    ticker = 'AAPL'  # Example ticker
    env = TradingEnv(data[ticker])  # Replace 'data' with your actual data structure
    
    # Define state and action sizes
    state_size = env.observation_space.shape[0] * env.observation_space.shape[1]
    action_size = env.action_space.n  # e.g., 3 actions: Hold, Buy, Sell
    
    # Initialize the DQN Agent
    agent = DQNAgent(state_size=state_size,
                     action_size=action_size,
                     hidden_layers=2,
                     hidden_dim=64,
                     learning_rate=1e-3,
                     gamma=0.95,
                     device='cpu')  # Change to 'cuda' if using GPU
    
    # Define the exploration schedule
    exploration_schedule = ExponentialSchedule(value_from=1.0,
                                               value_to=0.01,
                                               num_steps=100000)  # Adjust num_steps based on training
    
    # Define training parameters
    num_steps = 500000  # Total training steps
    batch_size = 64
    target_update_freq = 1000  # Update target network every 1000 steps
    
    # Train the agent
    rewards_history, loss_history = train_dqn_trading_agent(env=env,
                                                            agent=agent,
                                                            exploration_schedule=exploration_schedule,
                                                            num_steps=num_steps,
                                                            batch_size=batch_size,
                                                            target_update_freq=target_update_freq)
    
    # Plot training metrics
    plot_training_metrics(rewards_history, loss_history, window=100)
    
    # Save the trained Q-Network
    torch.save(agent.q_network.state_dict(), f'dqn_trading_agent_{ticker}.pt')
    print(f"Trained Q-Network saved as dqn_trading_agent_{ticker}.pt")

ValueError: too many values to unpack (expected 2)

In [19]:
def compute_RSI(series, period=14):
    delta = series.diff()

    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)

    avg_gain = gain.rolling(window=period, min_periods=period).mean()[:period+1]
    avg_loss = loss.rolling(window=period, min_periods=period).mean()[:period+1]

    rsi = pd.Series(index=series.index, dtype='float64')

    # Initialize RSI values
    for i in range(period, len(series)):
        if i == period:
            current_avg_gain = avg_gain.iloc[-1]
            current_avg_loss = avg_loss.iloc[-1]
        else:
            current_avg_gain = (current_avg_gain * (period - 1) + gain.iloc[i]) / period
            current_avg_loss = (current_avg_loss * (period - 1) + loss.iloc[i]) / period

        if current_avg_loss == 0:
            rs = np.inf
            rsi.iloc[i] = 100
        else:
            rs = current_avg_gain / current_avg_loss
            rsi.iloc[i] = 100 - (100 / (1 + rs))

    return rsi

def compute_MACD(series, span_short=12, span_long=26, span_signal=9):
    ema_short = series.ewm(span=span_short, adjust=False).mean()
    ema_long = series.ewm(span=span_long, adjust=False).mean()
    macd = ema_short - ema_long
    signal = macd.ewm(span=span_signal, adjust=False).mean()
    macd_diff = macd - signal
    return macd, signal, macd_diff

In [20]:
def add_technical_indicators(df):
    close = df['Close']
    
    # Ensure 'close' is a Series
    if isinstance(close, pd.DataFrame):
        close = close.squeeze()
    
    # Ensure 'close' is 1D
    if len(close.shape) > 1:
        close = close.flatten()
    
    # Compute indicators
    df['MA50'] = ta.trend.sma_indicator(close, window=50)
    df['MA200'] = ta.trend.sma_indicator(close, window=200)
    df['RSI'] = ta.momentum.RSIIndicator(close, window=14).rsi()
    df['MACD'] = ta.trend.MACD(close).macd()
    
    df = df.dropna()
    return df

In [21]:
for ticker in STOCKS:
    if ticker in data:
        df = data[ticker].copy()
        data[ticker] = add_technical_indicators(df)
    else:
        print(f"Data for {ticker} not available.")

# Verify Indicators
for ticker in STOCKS:
    if ticker in data:
        print(f"{ticker} DataFrame with indicators:")
        print(data[ticker].head())

AAPL DataFrame with indicators:
Price      Adj Close      Close       High        Low       Open      Volume  \
Ticker          AAPL       AAPL       AAPL       AAPL       AAPL        AAPL   
Date                                                                           
2010-10-18  9.580286  11.357143  11.392857  11.224643  11.373929  1093010800   
2010-10-19  9.323907  11.053214  11.206071  10.715000  10.835714  1232784000   
2010-10-20  9.355236  11.090357  11.223214  10.959643  11.035714   721624400   
2010-10-21  9.324810  11.054286  11.240714  10.957143  11.155714   551460000   
2010-10-22  9.263053  10.981071  11.072857  10.939286  11.038214   372778000   

Price           MA50     MA200        RSI      MACD  
Ticker                                               
Date                                                 
2010-10-18  9.659357  8.767198  83.388449  0.382372  
2010-10-19  9.693457  8.784248  70.061930  0.382695  
2010-10-20  9.729971  8.801418  70.678640  0.381549  
201

In [22]:
train_data = {}
test_data = {}
for ticker in STOCKS:
    df = data[ticker]
    split_idx = int(len(df) * TRAIN_TEST_SPLIT)
    train_data[ticker] = df[:split_idx]
    test_data[ticker] = df[split_idx:]


In [23]:
class TradingEnv(gym.Env):
    def __init__(self, df, initial_balance=100000, window_size=60):
        super(TradingEnv, self).__init__()
        self.df = df.reset_index()
        self.initial_balance = initial_balance
        self.window_size = window_size
        self.current_step = self.window_size
        self.balance = self.initial_balance
        self.net_worth = self.initial_balance
        self.shares_held = 0
        self.max_steps = len(self.df) - 1

        # Define action and observation space
        # Actions: 0 = Hold, 1 = Buy, 2 = Sell
        self.action_space = spaces.Discrete(3)
        
        # Observation: window_size days of data + portfolio info
        # Assuming df has N columns, observation shape = (window_size, N + 3)
        self.observation_space = spaces.Box(
            low=-np.inf, 
            high=np.inf, 
            shape=(window_size, len(df.columns) + 3), 
            dtype=np.float32
        )

    def reset(self):
        self.balance = self.initial_balance
        self.net_worth = self.initial_balance
        self.shares_held = 0
        self.current_step = self.window_size
        return self._next_observation()

    def _next_observation(self):
        frame = self.df.iloc[self.current_step - self.window_size:self.current_step]

        obs = []
        for col in self.df.columns:
            col_values = frame[col].values
            
            # Convert timestamps to Unix timestamps
            if isinstance(col_values[0], pd.Timestamp):
                col_values = col_values.astype('int64') // 10**9  # Convert to seconds
                
            # Convert integers to float
            elif col_values.dtype == np.int64:
                col_values = col_values.astype(np.float32)
                
            # Ensure other values are float
            else:
                col_values = pd.to_numeric(col_values, errors='coerce').astype(np.float32)
                
            # Replace any remaining NaN values
            col_values = np.nan_to_num(col_values, nan=0.0)
            obs.append(col_values)
        
        concatenated = np.concatenate(obs)
        return concatenated.astype(np.float32)

    def step(self, action):
        # Get current price as scalar
        current_price = float(self.df.iloc[self.current_step]['Close'])
        
        # Calculate reward based on previous action
        prev_value = self.balance + self.shares_held * current_price
        
        # Execute trading action
        if action == 1:  # Buy
            shares_to_buy = self.balance // current_price
            # Convert to scalar value before comparison
            if float(shares_to_buy) > 0:
                cost = shares_to_buy * current_price
                self.balance -= cost
                self.shares_held += shares_to_buy
                
        elif action == 2:  # Sell
            if self.shares_held > 0:
                self.balance += self.shares_held * current_price
                self.shares_held = 0
                
        # Move to next timestep
        self.current_step += 1
        done = self.current_step >= len(self.df) - 1
        
        # Calculate reward
        current_value = self.balance + self.shares_held * current_price
        reward = current_value - prev_value
        
        # Get next observation
        next_state = self._next_observation()
        
        return next_state, reward, done, {}

    def render(self, mode='human', close=False):
        profit = self.net_worth - self.initial_balance
        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(f'Shares held: {self.shares_held}')
        print(f'Net worth: {self.net_worth}')
        print(f'Profit: {profit}')

In [11]:
envs = {}
for ticker in STOCKS:
    envs[ticker] = TradingEnv(train_data[ticker])


In [12]:


class DQNAgent:
    def __init__(self, state_size, action_size, learning_rate=0.001, gamma=0.95, 
                 epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, 
                 batch_size=64, memory_size=100000):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.batch_size = batch_size
        self.model = self._build_model(learning_rate)

    def _build_model(self, learning_rate):
        model = tf.keras.Sequential()
        model.add(layers.Dense(64, input_dim=self.state_size, activation='relu'))
        model.add(layers.Dense(64, activation='relu'))
        model.add(layers.Dense(self.action_size, activation='linear'))
        # Updated the parameter name from 'lr' to 'learning_rate'
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def replay(self):
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)
        states = np.array([m[0] for m in minibatch]).reshape(self.batch_size, self.state_size)
        actions = np.array([m[1] for m in minibatch])
        rewards = np.array([m[2] for m in minibatch])
        next_states = np.array([m[3] for m in minibatch]).reshape(self.batch_size, self.state_size)
        dones = np.array([m[4] for m in minibatch])

        # Predict Q-values for current states and next states
        target = rewards + self.gamma * np.amax(self.model.predict(next_states), axis=1) * (~dones)
        target_f = self.model.predict(states)

        # Update the Q-values for the actions taken
        for i in range(self.batch_size):
            target_f[i][actions[i]] = target[i]

        # Train the model on the updated Q-values
        self.model.fit(states, target_f, epochs=1, verbose=0)

        # Decay epsilon to reduce exploration over time
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [14]:
def train_dqn(agent, env, episodes=1000):
    # Verify dimensions
    state = env.reset()
    
    for e in range(episodes):
        state = env.reset()
        # Reshape to (1, 660) for batch processing
        state = np.reshape(state, (1, -1))  # Using -1 lets numpy calculate correct dimension
        
        done = False
        total_reward = 0
        
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, (1, -1))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            agent.replay()
            
        print(f"Episode {e+1}/{episodes}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.2f}")

In [None]:
dqn_agents = {}
for ticker in STOCKS:
    env = envs[ticker]
    test_state = env.reset()
    state_size = len(test_state)
    
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    
    train_dqn(agent, env, episodes=1000)
    dqn_agents[ticker] = agent



  current_price = float(self.df.iloc[self.current_step]['Close'])




In [50]:
class PPOAgent:
    def __init__(self, state_dim, action_dim, lr=3e-4, gamma=0.99, eps_clip=0.2, K_epochs=80):
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.action_dim = action_dim

        self.policy = ActorCritic(state_dim, action_dim).to(device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.policy_old = ActorCritic(state_dim, action_dim).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        self.MseLoss = nn.MSELoss()

    def select_action(self, state):
        state = torch.FloatTensor(state).to(device)
        action, _ = self.policy_old.act(state)
        return action.detach().cpu().numpy()

    def update(self, memory):
        # Implement PPO update logic
        pass

class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorCritic, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Softmax(dim=-1)
        )
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def act(self, state):
        action_probs = self.actor(state)
        dist = torch.distributions.Categorical(action_probs)
        action = dist.sample()
        return action, dist.log_prob(action)

    def evaluate(self, state, action):
        action_probs = self.actor(state)
        dist = torch.distributions.Categorical(action_probs)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.critic(state)
        return action_logprobs, state_values, dist_entropy

In [78]:
# Example initialization
env = DummyVecEnv([lambda: TradingEnv(train_data['AAPL'])])
ddpg_model = DDPG('MlpPolicy', env, verbose=1)



NotImplementedError: Cannot convert space of type Box(-inf, inf, (60, 13), float32). Please upgrade your code to gymnasium.