In [10]:
import pandas as pd
df = pd.read_csv('/data/AAPL.csv')

In [11]:
df.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1980-12-12,0.128348,0.128906,0.128348,0.128348,0.100323,469033600
1,1980-12-15,0.12221,0.12221,0.121652,0.121652,0.095089,175884800
2,1980-12-16,0.113281,0.113281,0.112723,0.112723,0.08811,105728000
3,1980-12-17,0.115513,0.116071,0.115513,0.115513,0.090291,86441600
4,1980-12-18,0.118862,0.11942,0.118862,0.118862,0.092908,73449600


In [12]:
df.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

seems we have no null values

In [13]:
! pip install swig > /dev/null 2>&1
! pip install gym[box2d] > /dev/null 2>&1

In [14]:
import random
import json
import gym
from gym import spaces
import pandas as pd
import numpy as np
import datetime as dt
# from stable_baselines3.common.policies import MlpPolicy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO

In [15]:
# Intialising variables
MAX_ACCOUNT_BALANCE = 2147483647
MAX_NUM_SHARES = 2147483647
MAX_SHARE_PRICE = 5000
MAX_STEPS = 20000
INITIAL_ACCOUNT_BALANCE = 10000

In [16]:
class StockTradingEnv(gym.Env):
    """A stock trading environment for OpenAI gym"""

    def __init__(self, df):
        super(StockTradingEnv, self).__init__()

        self.df = df
        self.reward_range = (0, MAX_ACCOUNT_BALANCE)

        # Actions of the format Buy x%, Sell x%, Hold, etc.
        self.action_space = spaces.Box(
            low=np.array([0, 0]), high=np.array([3, 1]), dtype=np.float16)

        # Prices contains the OHCL values for the last five prices
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(6, 6), dtype=np.float16)

    def _next_observation(self):
        # Get the stock data points for the last 5 days and scale to between 0-1
        frame = np.array([
            self.df.loc[self.current_step: self.current_step +
                        5, 'Open'].values / MAX_SHARE_PRICE,
            self.df.loc[self.current_step: self.current_step +
                        5, 'High'].values / MAX_SHARE_PRICE,
            self.df.loc[self.current_step: self.current_step +
                        5, 'Low'].values / MAX_SHARE_PRICE,
            self.df.loc[self.current_step: self.current_step +
                        5, 'Close'].values / MAX_SHARE_PRICE,
            self.df.loc[self.current_step: self.current_step +
                        5, 'Volume'].values / MAX_NUM_SHARES
        ])

        # Append additional data and scale each value to between 0-1
        # print(self.current_step)
        # print(frame.shape)
        obs1 = np.append(frame, [[
            self.balance / MAX_ACCOUNT_BALANCE,
            self.max_net_worth / MAX_ACCOUNT_BALANCE,
            self.shares_held / MAX_NUM_SHARES,
            self.cost_basis / MAX_SHARE_PRICE,
            self.total_shares_sold / MAX_NUM_SHARES,
            self.total_sales_value / (MAX_NUM_SHARES * MAX_SHARE_PRICE),
        ]], axis=0)
        obs =obs1
        return obs

    def _take_action(self, action):
        # Set the current price to a random price within the time step
        current_price = random.uniform(
            self.df.loc[self.current_step, "Open"], self.df.loc[self.current_step, "Close"])

        action_type = action[0]
        amount = action[1]

        if action_type < 1:
            # Buy amount % of balance in shares
            total_possible = int(self.balance / current_price)
            shares_bought = int(total_possible * amount)
            prev_cost = self.cost_basis * self.shares_held
            additional_cost = shares_bought * current_price

            self.balance -= additional_cost
            self.cost_basis = (
                prev_cost + additional_cost) / (self.shares_held + shares_bought)
            self.shares_held += shares_bought

        elif action_type < 2:
            # Sell amount % of shares held
            shares_sold = int(self.shares_held * amount)
            self.balance += shares_sold * current_price
            self.shares_held -= shares_sold
            self.total_shares_sold += shares_sold
            self.total_sales_value += shares_sold * current_price

        self.net_worth = self.balance + self.shares_held * current_price

        if self.net_worth > self.max_net_worth:
            self.max_net_worth = self.net_worth

        if self.shares_held == 0:
            self.cost_basis = 0

    def step(self, action):
        # Execute one time step within the environment
        self._take_action(action)

        self.current_step += 1

        if self.current_step > len(self.df.loc[:, 'Open'].values) - 6:
            self.current_step = 0

        delay_modifier = (self.current_step / MAX_STEPS)

        reward = self.balance * delay_modifier
        done = self.net_worth <= 0

        obs = self._next_observation()

        return obs, reward, done, {}

    def reset(self):
        # Reset the state of the environment to an initial state
        self.balance = INITIAL_ACCOUNT_BALANCE
        self.net_worth = INITIAL_ACCOUNT_BALANCE
        self.max_net_worth = INITIAL_ACCOUNT_BALANCE
        self.shares_held = 0
        self.cost_basis = 0
        self.total_shares_sold = 0
        self.total_sales_value = 0

        # Set the current step to a random point within the data frame
        self.current_step = random.randint(
            0, len(self.df.loc[:, 'Open'].values) - 6)

        return self._next_observation()

    def render(self, mode='human', close=False):
        # Render the environment to the screen
        profit = self.net_worth - INITIAL_ACCOUNT_BALANCE

        print(f'Step: {self.current_step}')
        print(f'Balance: {self.balance}')
        print(
            f'Shares held: {self.shares_held} (Total sold: {self.total_shares_sold})')
        print(
            f'Avg cost for held shares: {self.cost_basis} (Total sales value: {self.total_sales_value})')
        print(
            f'Net worth: {self.net_worth} (Max net worth: {self.max_net_worth})')
        print(f'Profit: {profit}')



In [17]:
df.dropna(inplace=True)
df = df.sort_values('Date') # sort by dates

In [18]:
df['VWAP'] = (df['Close']*df['Volume'])

In [19]:
df = df.reset_index()

In [20]:
env = DummyVecEnv([lambda: StockTradingEnv(df)])

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [21]:
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=20000)

Using cpu device


  self.cost_basis = (


-----------------------------
| time/              |      |
|    fps             | 661  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 523           |
|    iterations           | 2             |
|    time_elapsed         | 7             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00022223071 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -2.84         |
|    explained_variance   | 1.61e-06      |
|    learning_rate        | 0.0003        |
|    loss                 | 6.93e+07      |
|    n_updates            | 10            |
|    policy_gradient_loss | -0.00148      |
|    std                  | 1             |
|    value_loss           | 1.3e+08       

<stable_baselines3.ppo.ppo.PPO at 0x7eae0a07a260>

In [22]:
obs = env.reset()
for i in range(2000):
#     print('hi')
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
    env.get_attr('current_step')[0]  # Using get_attr to access attributes from environments within VecEnv
    print(f"Step: {env.get_attr('current_step')[0]}")
    print(f"Balance: {env.get_attr('balance')[0]}")
    print(f"Net Worth: {env.get_attr('net_worth')[0]}")

  self.cost_basis = (


Step: 10143
Balance: 10000.0
Net Worth: 10000.0
Step: 10144
Balance: 5938.971419429211
Net Worth: 10000.0
Step: 10145
Balance: 5938.971419429211
Net Worth: 9987.549963697202
Step: 10146
Balance: 5938.971419429211
Net Worth: 10069.79446518462
Step: 10147
Balance: 4962.93800883146
Net Worth: 10087.113414469652
Step: 10148
Balance: 4962.93800883146
Net Worth: 10026.750849737798
Step: 10149
Balance: 4962.93800883146
Net Worth: 10064.636756133556
Step: 10150
Balance: 4962.93800883146
Net Worth: 10240.317635945077
Step: 10151
Balance: 4962.93800883146
Net Worth: 10199.9021691947
Step: 10152
Balance: 4962.93800883146
Net Worth: 10036.63081845551
Step: 10153
Balance: 1485.221414171338
Net Worth: 9999.631007994394
Step: 10154
Balance: 1485.221414171338
Net Worth: 10060.388684041904
Step: 10155
Balance: 1485.221414171338
Net Worth: 10194.988817568854
Step: 10156
Balance: 1485.221414171338
Net Worth: 10202.859825864538
Step: 10157
Balance: 6512.343347438433
Net Worth: 9983.451348979997
Step: 1015