In [None]:
!pip install gym
!pip install stable-baselines3[extra]

In [216]:
# Dependencies
import os 
import gym
import gymnasium
import numpy as np
import pandas as pd
from stable_baselines3 import PPO
from vmdpy import VMD
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env

Defining a stock class to keep track of revenue and expenditures of stock over time. 
This class was generated by ChatGPT to have a better mental organization on how to keep track of profits over time in the reinforcement learning training. 

In [194]:
class Portfolio:
    def __init__(self, initial_cash):
        self.cash = initial_cash
        self.stock_quantity = 0
        #add the current portfolio value

        #consider adding a crashed flag(went below 0) or a can't buy so skip the buy action, same with sell

    def buy_stock(self, price, quantity):
        cost = price * quantity
        if cost <= self.cash:
            self.cash -= cost
            self.stock_quantity += quantity
            print(f"Bought {quantity} shares at ${price} each.")
        else:
            #skip the buy action
            print("Insufficient funds to buy the stock.")

    def sell_stock(self, price, quantity):
        if self.stock_quantity >= quantity:
            revenue = price * quantity
            self.cash += revenue
            self.stock_quantity -= quantity
            print(f"Sold {quantity} shares at ${price} each.")
        else:
            #skip the sell action
            print("Insufficient shares to sell.")

    def get_portfolio_value(self, current_price):
        return self.cash + current_price * self.stock_quantity


# Example usage:
initial_cash = 10000
portfolio = Portfolio(initial_cash)

# Buying stocks
portfolio.buy_stock( 150.0, 10)

# Selling some stocks
portfolio.sell_stock( 160.0, 5)

# Getting the current portfolio value
current_price = 170
portfolio_value = portfolio.get_portfolio_value(current_price)
print(f"Current portfolio value: ${portfolio_value}")


Bought 10 shares at $150.0 each.
Sold 5 shares at $160.0 each.
Current portfolio value: $10150.0


Testing the dimensions and gymnasium spaces

In [195]:
Discrete(3)
Dict({0:Box(0,initial_cash, shape=(1,)), 1:Box(0,initial_cash, shape=(1,))}).sample()[0][0]

9081.06

Defining the RL environment. Must inherit from the gymnasium environment class. Checkout online documentation 

In [250]:
# Define Environment
class StockTradingEnvironment(gymnasium.Env):
    def __init__(self, actual_price_data, predicted_price_data, sentiment_data, initial_cash):
        super(StockTradingEnvironment, self).__init__()

        # Initialize environment variables
        self.initial_cash = initial_cash
        self.actual_price_data = actual_price_data
        self.predicted_price_data = predicted_price_data                
        self.sentiment_data = sentiment_data
        self.price_index, self.sentiment_index = 0, 0
        self.action_space = Discrete(3)  # buy, sell, hold
        self.action_range = Dict({"Buy":Box(0,initial_cash, shape=(1,)), "Sell":Box(0,initial_cash, shape=(1,))}) #actual stock ranges
        self.observation_space = Dict({'actual_price':Box(0, 500, shape=(1,)),'predicted_prices':Box(0, 500, shape=(1,)), "sentiment":Box(-1, 1, shape=(1,))})

        self.state = {
        'actual_price' : np.array([self.actual_price_data[0]], dtype=np.float32),
        'predicted_prices': np.array([self.predicted_price_data[0]], dtype=np.float32),  # Ensure it's a float32 numpy array
        'sentiment': np.array([self.sentiment_data[0]], dtype=np.float32)     # Ensure it's a float32 numpy array
        }
        # predicted stock states
        self.cash = initial_cash
        self.stock_quantity = 0
        self.current_value = 0
        self.seed = 0
        

    def buy_stock(self, price, desired_amount):
        quantity = round(desired_amount / price, 1)
        cost = price * quantity
        if cost <= self.cash:
            self.cash -= cost
            self.stock_quantity += quantity
            #print(f"Bought {quantity} shares at ${price} each.")
        else:
            #print("Insufficient funds to buy the stock.")
            return False
        #get portfolio value based off real price
        self.current_value = self.get_portfolio_value(self.state['actual_price'][0])
        return True
    def sell_stock(self, price, desired_amount):
        quantity = round(desired_amount / price, 1)
        if self.stock_quantity >= quantity:
            revenue = price * quantity
            self.cash += revenue
            self.stock_quantity -= quantity
            #print(f"Sold {quantity} shares at ${price} each.")
        else:
            #print("Insufficient shares to sell.")
            return False
        self.current_value = self.get_portfolio_value(self.state['actual_price'][0])
        return True

    def get_portfolio_value(self, current_price):
        return self.cash + current_price * self.stock_quantity
        
    def reset(self, seed=None):
        super().reset(seed=4)
        # Reset environment to initial state
        self.cash = self.initial_cash
        self.stock_quantity = 0
        self.current_value = 0
        self.price_index = 0
        self.sentiment_index = 0
        self.state = {
        'actual_price' : np.array([self.actual_price_data[0]], dtype=np.float32),
        'predicted_prices': np.array([self.predicted_price_data[0]], dtype=np.float32),  # Ensure it's a float32 numpy array
        'sentiment': np.array([self.sentiment_data[0]], dtype=np.float32)     # Ensure it's a float32 numpy array
        }

        info = {}
        return self.state, info
    
    def step(self, action):
        # Execute one step within the environment
        # Update state based on action taken
        # Calculate reward
        # Return next observation, reward, done flag, and additional info
        old_value = self.current_value
        if action == 0:
            #we buy based on predicted stock
            buy_successful = self.buy_stock(self.state['predicted_prices'][0], self.action_range.sample()["Buy"][0])
            
        elif action == 1:
            sell_successful = self.sell_stock(self.state['predicted_prices'][0], self.action_range.sample()["Sell"][0])
        #else the action is a "Hold" so we do nothing (we can adjust to reduce the reward if we hold)
        #now we calculate the reward based on portfolio value

        #if we sell more than we have or buy more than we can afford, punish the action
        if (action == 0 and not buy_successful) or (action == 1 and not sell_successful):
            reward = -10
        else:
            reward = self.current_value - old_value
        
        done = self.price_index >= len(self.predicted_price_data) - 1
        self.price_index += 1
        self.sentiment_index += 1
        truncated = False
        if not done:
            self.state = {
                'actual_price':np.array([self.actual_price_data[self.price_index]], dtype=np.float32),
                'predicted_prices': np.array([self.predicted_price_data[self.price_index]], dtype=np.float32), 
                'sentiment': np.array([self.sentiment_data[self.sentiment_index]], dtype=np.float32)
            }
        info = {}
        return self.state, reward, done, truncated, info
    

In [55]:
# get data and do VMD
frames = []
for i in range(1, 13):
    i_str = str(i)
    if i % 10 == i:
        i_str = '0'+i_str
    frames.append(pd.read_csv(f'train_data/60min_MSFT_2016_{i_str}.csv', index_col='timestamp').iloc[::-1])
for i in range(1, 13):
    i_str = str(i)
    if i % 10 == i:
        i_str = '0'+i_str
    frames.append(pd.read_csv(f'train_data/60min_MSFT_2017_{i_str}.csv', index_col='timestamp').iloc[::-1])

for i in range(1, 2):
    i_str = str(i)
    if i % 10 == i:
        i_str = '0'+i_str
    frames.append(pd.read_csv(f'train_data/60min_MSFT_2018_{i_str}.csv', index_col='timestamp').iloc[::-1])
stockprices = pd.concat(frames)

alpha = 5000      # moderate bandwidth constraint  
tau = 0           # noise-tolerance (no strict fidelity enforcement)  
K = 5              # 5 modes  
DC = 0             # no DC part imposed  
init = 1           # initialize omegas uniformly  
tol = 1e-7

signals, u_hat, omega = VMD(stockprices['close'].to_numpy(), alpha, tau, K, DC, init, tol)
stockprices.drop(stockprices.tail(1).index,inplace=True)
for i, signal in enumerate(signals):
    stockprices[f'signal{i}'] = signal

In [None]:
# load model
model = tf.keras.models.load_model('./trained_models/LSTM_Price_Predictor_Trial.keras')
scaler = StandardScaler()
window_size = 50
future_steps = 5

def preprocess_data(data=stockprices, scaler=scaler, window_size=window_size):
    raw = data[['close','signal0', 'signal1', 'signal2', 'signal3', 'signal4']][-window_size:].values
    raw = scaler.fit_transform(raw)
    X_test = [raw[i-window_size:i, 1:] for i in range(window_size, raw.shape[0])]
    X_test = np.array(X_test)
    print(X_test.shape)
    
    return X_test

X_test = preprocess_data()
predicted_price_ = model.predict(X_test)
# Post-processing to fit scaler expectations
full_dummy_features = np.zeros((predicted_price_.shape[0], 6))  # Create a dummy array with the same number of columns as the scaler expects
full_dummy_features[:, 0] = predicted_price_.ravel()  # Assuming 'close' is the first column

# Inverse transform
predicted_price = scaler.inverse_transform(full_dummy_features)[:, 0]  # Inverse transform and select only the 'close' column

stockprices.loc[:future_steps-1, "Predictions_lstm"] = None
stockprices.loc[future_steps:, "Predictions_lstm"] = predicted_price[:-future_steps]

Action space is discrete and of size 3 (buy, sell, hold), so we can use A2C, PPo, and DQN algorithms. 

In [None]:
# Define RL Agent
class DQNAgent:
    def __init__(self, state_size, action_size):
        # Initialize agent variables
        pass
    
    def act(self, state):
        # Choose action based on current state
        pass
    
    def train(self, state, action, reward, next_state, done):
        # Update agent's Q-values based on experience
        pass

In [None]:
# Define hyperparameters
state_size = len(price_data.columns) + len(sentiment_data.columns)
action_size = 3
batch_size = 32
num_episodes = 1000

In [None]:
# Initialize environment and agent
# env = StockTradingEnvironment(price_data, sentiment_data)
# agent = DQNAgent(state_size, action_size)

# # Training loop
# for episode in range(num_episodes):
#     state = env.reset()
#     done = False
#     total_reward = 0
#     while not done:
#         action = agent.act(state)
#         next_state, reward, done, _ = env.step(action)
#         agent.train(state, action, reward, next_state, done)
#         state = next_state
#         total_reward += reward
#     print(f'Episode: {episode}, Total Reward: {total_reward}')

# Evaluate the trained agent
# Evaluate performance on validation dataset


Testing the environment

In [234]:
#CALL CSV FILE TO LOAD PRICE PREDICTION AND SENTIMENT   
data = pd.read_csv("all_data.csv")
predicted_price_data = data.iloc[5:, 7].values
sentiment_data = data.iloc[:-5, 6].values
actual_price_data = data.iloc[:-5, 4].values


In [258]:
env = StockTradingEnvironment(actual_price_data[:3000], predicted_price_data[:3000], sentiment_data[:3000], 10000)
check_env(env, warn=True)

In [259]:
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        
        action = env.action_space.sample()
        n_state, reward, done, truncated, info = env.step(action)
        score+=reward
        #print("state : ", n_state)
    print('EPISODE:{} Score:{}\n'.format(episode, score))
env.close()

EPISODE:1 Score:310.4903264244531

EPISODE:2 Score:1677.2659228619068

EPISODE:3 Score:689.4063104679626

EPISODE:4 Score:1824.3748923028907

EPISODE:5 Score:2239.437879041736

EPISODE:6 Score:1745.225472805516

EPISODE:7 Score:1485.9139710785694

EPISODE:8 Score:681.7843872061949

EPISODE:9 Score:753.389840590833

EPISODE:10 Score:996.5993059100638



In [262]:
log_path = os.path.join('Training', 'Logs')
model = PPO("MultiInputPolicy", env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=50000)
model.save('PPO2')
#evaluate_policy(model, env, n_eval_episodes=10, render=False)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\PPO_12
-----------------------------
| time/              |      |
|    fps             | 1241 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3e+03       |
|    ep_rew_mean          | 2.28e+03    |
| time/                   |             |
|    fps                  | 783         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.006915222 |
|    clip_fraction        | 0.109       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -1e-05      |
|    learning_rate        | 0.0003

Found existing installation: tensorflow 2.16.1
Uninstalling tensorflow-2.16.1:
  Would remove:
    c:\users\jorge tomaylla\anaconda3\lib\site-packages\tensorflow-2.16.1.dist-info\*
Proceed (Y/n)? 
