# Portfolio Optimization using Deep Reinforcement Learning
---

## 8.0 Deep Reinforcement Learning Portfolios

### 8.1 Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pylab as plt
matplotlib.use('Agg')
import datetime

from pypfopt.efficient_frontier import efficient_frontier
from pypfopt.efficient_frontier.efficient_frontier import EfficientFrontier
from pypfopt import efficient_frontier
from pypfopt import risk_models
from pypfopt import expected_returns

In [2]:
import finrl

In [3]:
# Imports from the FinRL Library

from finrl import config
from backtest import BackTestStats, BaselineStats, BackTestPlot, backtest_strat, baseline_strat
from backtest import backtest_strat, baseline_strat



### 8.2 Load Data

In [4]:
# Reload the train/test DataFrames that were previously persisted with `%store`
# (presumably by an upstream data-preparation notebook — confirm the producer).
%store -r train_df
%store -r test_df

In [5]:
# Feature columns f01..f04, later handed to the environment as its indicator list.
tech_indicator_list = [f"f{idx:02d}" for idx in range(1, 5)]

In [6]:
# Preview the first rows of the training frame (one row per date/ticker pair).
train_df.head()

Unnamed: 0,date,tic,close,high,low,open,volume,cov_list,f01,f02,f03,f04
0,2009-01-13,ASIANPAINT.NS,91.699997,88.5,91.235001,88.5,65800,"[[0.0005821350723573744, 0.0001385649017777150...",0.646335,0.708096,0.085455,2.997396
0,2009-01-13,CIPLA.NS,189.649994,184.0,185.350006,185.0,901712,"[[0.0005821350723573744, 0.0001385649017777150...",0.646335,0.708096,0.085455,2.997396
0,2009-01-13,DRREDDY.NS,478.0,448.0,452.75,465.75,544994,"[[0.0005821350723573744, 0.0001385649017777150...",0.646335,0.708096,0.085455,2.997396
0,2009-01-13,GAIL.NS,39.375019,37.875019,38.756268,38.60627,9334277,"[[0.0005821350723573744, 0.0001385649017777150...",0.646335,0.708096,0.085455,2.997396
0,2009-01-13,GRASIM.NS,209.852203,202.908554,204.891357,205.570282,1994905,"[[0.0005821350723573744, 0.0001385649017777150...",0.646335,0.708096,0.085455,2.997396


In [7]:
# (rows, columns) of the training frame.
train_df.shape

(59660, 12)

### 8.4 Implement DRL Algorithms

In [8]:
from finrl.agents.elegantrl.models import DRLAgent
from finrl.meta.env_portfolio_allocation.env_portfolio import StockPortfolioEnv

In [9]:
import env_portfolio
from env_portfolio import StockPortfolioEnv

import train_models
from train_models import DRLAgents

In [10]:
# The number of distinct tickers in the training frame sets both the stock
# dimension and the state-space size.
stock_dimension = state_space = len(train_df["tic"].unique())
print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")


Stock Dimension: 20, State Space: 20


In [11]:
# Equal-weight allocation: every ticker gets 1/stock_dimension.
weights_initial = [1 / stock_dimension for _ in range(stock_dimension)]

In [12]:
# Keyword arguments shared by every StockPortfolioEnv built in this notebook.
env_kwargs = dict(
    hmax=500,
    initial_amount=1000000,
    transaction_cost_pct=0.001,
    state_space=state_space,
    stock_dim=stock_dimension,
    tech_indicator_list=tech_indicator_list,
    action_space=stock_dimension,
    # NOTE(review): a scaling factor of 0 would zero out any reward that is
    # multiplied by it — confirm how env_portfolio uses this value.
    reward_scaling=0,
    # Equal-weight starting allocation across all tickers.
    initial_weights=[1 / stock_dimension] * stock_dimension,
)

In [13]:
# Training environment built from the training frame and the shared settings.
# StockPortfolioEnv here is the class from the local `env_portfolio` module, which
# was imported after (and therefore shadows) the FinRL class of the same name.
e_train_gym = StockPortfolioEnv(df = train_df, **env_kwargs)

In [14]:
# get_sb_env() returns a pair; only the first element (the environment handed to
# the agents for training) is kept, the second is discarded.
env_train, _ = e_train_gym.get_sb_env()
print(type(env_train))

<class 'stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv'>




#### 8.4.1 Model 1: A2C: Advantage Actor-Critic

In [15]:
import env_portfolio
from env_portfolio import StockPortfolioEnv

import train_models
from train_models import DRLAgents

In [16]:
# Re-display the first rows of the training frame before exporting/training.
train_df.head()

Unnamed: 0,date,tic,close,high,low,open,volume,cov_list,f01,f02,f03,f04
0,2009-01-13,ASIANPAINT.NS,91.699997,88.5,91.235001,88.5,65800,"[[0.0005821350723573744, 0.0001385649017777150...",0.646335,0.708096,0.085455,2.997396
0,2009-01-13,CIPLA.NS,189.649994,184.0,185.350006,185.0,901712,"[[0.0005821350723573744, 0.0001385649017777150...",0.646335,0.708096,0.085455,2.997396
0,2009-01-13,DRREDDY.NS,478.0,448.0,452.75,465.75,544994,"[[0.0005821350723573744, 0.0001385649017777150...",0.646335,0.708096,0.085455,2.997396
0,2009-01-13,GAIL.NS,39.375019,37.875019,38.756268,38.60627,9334277,"[[0.0005821350723573744, 0.0001385649017777150...",0.646335,0.708096,0.085455,2.997396
0,2009-01-13,GRASIM.NS,209.852203,202.908554,204.891357,205.570282,1994905,"[[0.0005821350723573744, 0.0001385649017777150...",0.646335,0.708096,0.085455,2.997396


In [22]:
# Export the training frame to 'train.xlsx' (sheet "train_df"), header kept, index dropped.
# NOTE(review): this cell carries execution count 22, out of sequence with its
# neighbours — confirm it is still needed and re-run the notebook top to bottom.
train_df.to_excel('train.xlsx', sheet_name="train_df", index=False, header=True)


In [17]:
from train_models import DRLAgents

In [18]:
# Wrap the training environment in the agent factory.
agent = DRLAgents(env=env_train)

# A2C hyper-parameters, passed to get_model() as `model_kwargs`.
A2C_PARAMS = {
    "n_steps": 5,
    "ent_coef": 0.005,
    "learning_rate": 0.0002,
}
model_a2c = agent.get_model(model_name="a2c", model_kwargs=A2C_PARAMS)


{'n_steps': 5, 'ent_coef': 0.005, 'learning_rate': 0.0002}
Using cpu device


In [19]:
# Train the A2C model for 50,000 timesteps under the TensorBoard log name 'a2c'.
# NOTE(review): the saved output of this cell is a ValueError (concatenation of
# arrays of size 20 and 19), so `trained_a2c` is not produced by this run and the
# later cells that use it rely on state from another session. The root cause is
# not visible in this notebook — check the environment/observation construction.
trained_a2c = agent.train_model(model=model_a2c, 
                                tb_log_name='a2c',
                                total_timesteps=50000)

Logging to tensorboard_log/a2c/a2c_104


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 20 and the array at index 1 has size 19

#### 8.4.2 Model 2: PPO : Proximal Policy Optimization

In [None]:
# Fresh agent factory bound to the training environment.
agent = DRLAgents(env=env_train)

# PPO hyper-parameters, passed to get_model() as `model_kwargs`.
PPO_PARAMS = dict(
    n_steps=2048,
    ent_coef=0.005,
    learning_rate=0.0001,
    batch_size=128,
)
model_ppo = agent.get_model("ppo", model_kwargs=PPO_PARAMS)

{'n_steps': 2048, 'ent_coef': 0.005, 'learning_rate': 0.0001, 'batch_size': 128}
Using cpu device


In [None]:
# Train the PPO model for 50,000 timesteps under the TensorBoard log name 'ppo'.
# NOTE(review): the saved log shows end_total_asset equal to begin_total_asset and
# loss/value_loss around 1e14 — verify the environment's portfolio update and the
# reward scale before trusting this model.
trained_ppo = agent.train_model(model=model_ppo, 
                             tb_log_name='ppo',
                             total_timesteps=50000)

Logging to tensorboard_log/ppo/ppo_6
-----------------------------
| time/              |      |
|    fps             | 3629 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
begin_total_asset:1000000
end_total_asset:1000000.0
--------------------------------------
| time/                   |          |
|    fps                  | 2761     |
|    iterations           | 2        |
|    time_elapsed         | 1        |
|    total_timesteps      | 4096     |
| train/                  |          |
|    approx_kl            | 0.0      |
|    clip_fraction        | 0        |
|    clip_range           | 0.2      |
|    entropy_loss         | -28.4    |
|    explained_variance   | 0        |
|    learning_rate        | 0.0001   |
|    loss                 | 1.41e+14 |
|    n_updates            | 10       |
|    policy_gradient_loss | 9.55e-09 |
|    std                  | 1        |
|    value_loss           | 2.79e+14 |

#### 8.4.3 Model 3: DDPG : Deep Deterministic Policy Gradient

In [None]:
# Fresh agent factory bound to the training environment.
agent = DRLAgents(env=env_train)

# DDPG hyper-parameters, passed to get_model() as `model_kwargs`.
DDPG_PARAMS = {
    "batch_size": 128,
    "buffer_size": 50000,
    "learning_rate": 0.001,
}
model_ddpg = agent.get_model("ddpg", model_kwargs=DDPG_PARAMS)

{'batch_size': 128, 'buffer_size': 50000, 'learning_rate': 0.001}
Using cpu device


In [None]:
# Train the DDPG model for 50,000 timesteps under the TensorBoard log name 'ddpg'.
# NOTE(review): the saved output ends in KeyboardInterrupt (around 12k timesteps
# logged); when interrupted, this cell does not assign `trained_ddpg`. The log
# also shows end_total_asset equal to begin_total_asset in every episode.
trained_ddpg = agent.train_model(model=model_ddpg, 
                             tb_log_name='ddpg',
                             total_timesteps=50000)

Logging to tensorboard_log/ddpg/ddpg_4
begin_total_asset:1000000
end_total_asset:1000000.0
begin_total_asset:1000000
end_total_asset:1000000.0
begin_total_asset:1000000
end_total_asset:1000000.0
begin_total_asset:1000000
end_total_asset:1000000.0
----------------------------------
| time/              |           |
|    episodes        | 4         |
|    fps             | 182       |
|    time_elapsed    | 65        |
|    total_timesteps | 11940     |
| train/             |           |
|    actor_loss      | -4.32e+07 |
|    critic_loss     | 2.4e+09   |
|    learning_rate   | 0.001     |
|    n_updates       | 11839     |
----------------------------------
begin_total_asset:1000000
end_total_asset:1000000.0
begin_total_asset:1000000
end_total_asset:1000000.0


KeyboardInterrupt: 

#### 8.4.4 Model 4: SAC : Soft Actor-Critic

In [None]:
# Fresh agent factory bound to the training environment.
agent = DRLAgents(env=env_train)

# SAC hyper-parameters, passed to get_model() as `model_kwargs`.
SAC_PARAMS = dict(
    batch_size=128,
    buffer_size=100000,
    learning_rate=0.0003,
    learning_starts=100,
    ent_coef="auto_0.1",
)
model_sac = agent.get_model("sac", model_kwargs=SAC_PARAMS)

{'batch_size': 128, 'buffer_size': 100000, 'learning_rate': 0.0003, 'learning_starts': 100, 'ent_coef': 'auto_0.1'}
Using cpu device


In [None]:
# Train the SAC model for 50,000 timesteps under the TensorBoard log name 'sac'.
# NOTE(review): the saved output only shows the "Logging to ..." line, so it does
# not confirm that training ran to completion.
trained_sac = agent.train_model(model=model_sac, 
                             tb_log_name='sac',
                             total_timesteps=50000)

Logging to tensorboard_log/sac/sac_3


#### 8.4.5 Model 5: TD3 : Twin Delayed Deep Deterministic Policy Gradient

In [None]:
# Fresh agent factory bound to the training environment.
agent = DRLAgents(env=env_train)

# TD3 hyper-parameters, passed to get_model() as `model_kwargs`.
TD3_PARAMS = dict(batch_size=100, buffer_size=1000000, learning_rate=0.001)
model_td3 = agent.get_model("td3", model_kwargs=TD3_PARAMS)

{'batch_size': 100, 'buffer_size': 1000000, 'learning_rate': 0.001}
Using cpu device


In [None]:
# Train the TD3 model under the TensorBoard log name 'td3'.
# Note the budget is 30,000 timesteps here, versus 50,000 for the other four models.
trained_td3 = agent.train_model(model=model_td3, 
                             tb_log_name='td3',
                             total_timesteps=30000)

Logging to tensorboard_log/td3/td3_3


### 8.5 Fitting Model on Training Data

In [None]:
# A2C Train Model
# Fresh environment over the training frame for in-sample evaluation.
e_trade_gym = StockPortfolioEnv(df = train_df, **env_kwargs)
# The second element of the pair is passed on below as the starting observation.
env_trade, obs_trade = e_trade_gym.get_sb_env()

# DRL_prediction is called on the class (not an instance); its two results are
# unpacked as the daily-return frame and the portfolio-weights frame.
# NOTE(review): the saved A2C training cell ended in a ValueError, so
# `trained_a2c` may be undefined on a clean run.
a2c_train_daily_return, a2c_train_weights = DRLAgents.DRL_prediction(model=trained_a2c,
                        test_data = train_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



In [None]:
# PPO Train Model
# Fresh environment over the training frame for in-sample evaluation.
e_trade_gym = StockPortfolioEnv(df = train_df, **env_kwargs)
# The second element of the pair is passed on below as the starting observation.
env_trade, obs_trade = e_trade_gym.get_sb_env()

# DRL_prediction is called on the class (not an instance); its two results are
# unpacked as the daily-return frame and the portfolio-weights frame.
ppo_train_daily_return, ppo_train_weights = DRLAgents.DRL_prediction(model=trained_ppo,
                        test_data = train_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



In [None]:
# DDPG Train Model
# Fresh environment over the training frame for in-sample evaluation.
e_trade_gym = StockPortfolioEnv(df = train_df, **env_kwargs)
# The second element of the pair is passed on below as the starting observation.
env_trade, obs_trade = e_trade_gym.get_sb_env()

# DRL_prediction is called on the class (not an instance); its two results are
# unpacked as the daily-return frame and the portfolio-weights frame.
# NOTE(review): the saved DDPG training cell was interrupted, so `trained_ddpg`
# may be undefined (or stale) on a clean run.
ddpg_train_daily_return, ddpg_train_weights = DRLAgents.DRL_prediction(model=trained_ddpg,
                        test_data = train_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



In [None]:
# SAC Train Model
# Fresh environment over the training frame for in-sample evaluation.
e_trade_gym = StockPortfolioEnv(df = train_df, **env_kwargs)
# The second element of the pair is passed on below as the starting observation.
env_trade, obs_trade = e_trade_gym.get_sb_env()

# DRL_prediction is called on the class (not an instance); its two results are
# unpacked as the daily-return frame and the portfolio-weights frame.
sac_train_daily_return, sac_train_weights = DRLAgents.DRL_prediction(model=trained_sac,
                        test_data = train_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



In [None]:
# TD3 Train Model
# Fresh environment over the training frame for in-sample evaluation.
e_trade_gym = StockPortfolioEnv(df = train_df, **env_kwargs)
# The second element of the pair is passed on below as the starting observation.
env_trade, obs_trade = e_trade_gym.get_sb_env()

# DRL_prediction is called on the class (not an instance); its two results are
# unpacked as the daily-return frame and the portfolio-weights frame.
td3_train_daily_return, td3_train_weights = DRLAgents.DRL_prediction(model=trained_td3,
                        test_data = train_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



In [None]:
# Persist the in-sample daily-return DataFrames with %store so they can be
# restored in another kernel. Only the return frames are stored here — neither
# the trained models nor the *_train_weights frames.
%store a2c_train_daily_return
%store ppo_train_daily_return
%store ddpg_train_daily_return
%store sac_train_daily_return
%store td3_train_daily_return

Stored 'a2c_train_daily_return' (DataFrame)
Stored 'ppo_train_daily_return' (DataFrame)
Stored 'ddpg_train_daily_return' (DataFrame)
Stored 'sac_train_daily_return' (DataFrame)
Stored 'td3_train_daily_return' (DataFrame)


### 8.6 Trading
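Assume that we start with 1,000,000 of initial capital (`initial_amount` in `env_kwargs`) on the first day of the test period (2021-02-23). We use each trained DRL model (A2C, PPO, DDPG, SAC and TD3) to trade the 20 NSE-listed stocks in `test_df` through 2024-02-27.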

In [None]:
# Show the last five rows of the test frame (i.e. the final date in the test period).
test_df.tail(5)

Unnamed: 0,date,tic,close,high,low,open,volume,cov_list,f01,f02,f03,f04
744,2024-02-27,SUNPHARMA.NS,1585.449951,1549.050049,1582.75,1556.75,2140963,"[[9.860392541309073e-05, -1.08803881131486e-05...",1.158366,1.36625,0.0,4.018352
744,2024-02-27,TATACHEM.NS,975.0,958.75,961.799988,972.0,427306,"[[9.860392541309073e-05, -1.08803881131486e-05...",1.158366,1.36625,0.0,4.018352
744,2024-02-27,TCS.NS,4124.0,3999.0,4104.399902,3999.0,2960475,"[[9.860392541309073e-05, -1.08803881131486e-05...",1.158366,1.36625,0.0,4.018352
744,2024-02-27,ULTRACEMCO.NS,10151.0,9881.450195,9951.099609,9930.049805,214186,"[[9.860392541309073e-05, -1.08803881131486e-05...",1.158366,1.36625,0.0,4.018352
744,2024-02-27,WIPRO.NS,537.400024,527.549988,531.450012,534.0,5199630,"[[9.860392541309073e-05, -1.08803881131486e-05...",1.158366,1.36625,0.0,4.018352


In [None]:
# A2C Test Model
# Fresh environment over the test frame.
e_trade_gym = StockPortfolioEnv(df = test_df, **env_kwargs)
# The second element of the pair is passed on below as the starting observation.
env_trade, obs_trade = e_trade_gym.get_sb_env()

# Run the trained A2C model on the test frame; the two results are unpacked as
# the daily-return frame and the portfolio-weights frame.
# NOTE(review): the saved A2C training cell ended in a ValueError, so
# `trained_a2c` may be undefined on a clean run.
a2c_test_daily_return, a2c_test_weights = DRLAgents.DRL_prediction(model=trained_a2c,
                        test_data = test_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



In [None]:
# Preview the A2C test-period daily returns.
# NOTE(review): the saved output shows the same date (2021-02-23) and a 0.0 return
# on every displayed row — check how dates/returns are recorded by the
# environment or by DRL_prediction.
a2c_test_daily_return.head()

Unnamed: 0,date,daily_return
0,2021-02-23,0.0
1,2021-02-23,0.0
2,2021-02-23,0.0
3,2021-02-23,0.0
4,2021-02-23,0.0


In [None]:
# Write the A2C test-period portfolio weights to a CSV file in the working directory.
a2c_test_weights.to_csv('a2c_test_weights.csv')

In [None]:
# Preview the A2C weights: one column per ticker, indexed by date.
a2c_test_weights.head()

Unnamed: 0_level_0,ASIANPAINT.NS,CIPLA.NS,DRREDDY.NS,GAIL.NS,GRASIM.NS,HDFCBANK.NS,HEROMOTOCO.NS,HINDUNILVR.NS,INFY.NS,ITC.NS,LT.NS,M&M.NS,MARUTI.NS,NTPC.NS,POWERGRID.NS,SUNPHARMA.NS,TATACHEM.NS,TCS.NS,ULTRACEMCO.NS,WIPRO.NS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2021-02-23,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
2021-02-23,0.027919,0.027919,0.052119,0.027919,0.075892,0.067281,0.048798,0.075892,0.027919,0.075892,0.028771,0.041693,0.027919,0.075892,0.033142,0.027919,0.075892,0.074999,0.075892,0.030332
2021-02-23,0.024985,0.024985,0.067917,0.067917,0.067917,0.030266,0.065212,0.062506,0.025282,0.067917,0.067917,0.067917,0.024985,0.067917,0.026439,0.024985,0.054113,0.067917,0.067917,0.024985
2021-02-23,0.027938,0.027938,0.028315,0.075943,0.075943,0.027938,0.027938,0.027938,0.027938,0.034865,0.075943,0.075943,0.075943,0.029832,0.075943,0.027938,0.075943,0.075943,0.075943,0.027938
2021-02-23,0.028909,0.060095,0.078584,0.067483,0.077145,0.028909,0.028909,0.078584,0.039706,0.028909,0.072342,0.028909,0.037682,0.058924,0.035741,0.028909,0.034183,0.078584,0.078584,0.028909


In [None]:
# PPO Test Model
# Fresh environment over the test frame.
e_trade_gym = StockPortfolioEnv(df = test_df, **env_kwargs)
# The second element of the pair is passed on below as the starting observation.
env_trade, obs_trade = e_trade_gym.get_sb_env()

# Run the trained PPO model on the test frame; the two results are unpacked as
# the daily-return frame and the portfolio-weights frame.
ppo_test_daily_return, ppo_test_weights = DRLAgents.DRL_prediction(model=trained_ppo,
                        test_data = test_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



In [None]:
# Write the PPO test-period portfolio weights to disk in CSV format.
# NOTE(review): the file name has no '.csv' extension, unlike 'a2c_test_weights.csv';
# align the names once it is confirmed that nothing reads this exact path.
ppo_test_weights.to_csv('ppo_test_weights')

In [None]:
# DDPG Test Model
# Fresh environment over the test frame.
e_trade_gym = StockPortfolioEnv(df = test_df, **env_kwargs)
# The second element of the pair is passed on below as the starting observation.
env_trade, obs_trade = e_trade_gym.get_sb_env()

# Run the trained DDPG model on the test frame; the two results are unpacked as
# the daily-return frame and the portfolio-weights frame.
# NOTE(review): the saved DDPG training cell was interrupted, so `trained_ddpg`
# may be undefined (or stale) on a clean run.
ddpg_test_daily_return, ddpg_test_weights = DRLAgents.DRL_prediction(model=trained_ddpg,
                        test_data = test_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



In [None]:
# Write the DDPG test-period portfolio weights to disk in CSV format.
# NOTE(review): the file name has no '.csv' extension, unlike 'a2c_test_weights.csv';
# align the names once it is confirmed that nothing reads this exact path.
ddpg_test_weights.to_csv('ddpg_test_weights')

In [None]:
# SAC Test Model
# Fresh environment over the test frame.
e_trade_gym = StockPortfolioEnv(df = test_df, **env_kwargs)
# The second element of the pair is passed on below as the starting observation.
env_trade, obs_trade = e_trade_gym.get_sb_env()

# Run the trained SAC model on the test frame; the two results are unpacked as
# the daily-return frame and the portfolio-weights frame.
sac_test_daily_return, sac_test_weights = DRLAgents.DRL_prediction(model=trained_sac,
                        test_data = test_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



In [None]:
# Write the SAC test-period portfolio weights to disk in CSV format.
# NOTE(review): the file name has no '.csv' extension, unlike 'a2c_test_weights.csv';
# align the names once it is confirmed that nothing reads this exact path.
sac_test_weights.to_csv('sac_test_weights')

In [None]:
# TD3 Test Model
# Fresh environment over the test frame.
e_trade_gym = StockPortfolioEnv(df = test_df, **env_kwargs)
# The second element of the pair is passed on below as the starting observation.
env_trade, obs_trade = e_trade_gym.get_sb_env()

# Evaluate the TD3 policy. This cell previously passed `trained_sac`, which made
# the "TD3" returns and weights a second SAC evaluation; the in-sample TD3 cell
# already uses `trained_td3`, so this now matches it.
td3_test_daily_return, td3_test_weights = DRLAgents.DRL_prediction(model=trained_td3,
                        test_data = test_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



In [None]:
# Write the TD3 test-period portfolio weights to disk in CSV format.
# NOTE(review): the file name has no '.csv' extension, unlike 'a2c_test_weights.csv';
# align the names once it is confirmed that nothing reads this exact path.
td3_test_weights.to_csv('td3_test_weights')

### 8.7 Save the Portfolios

In [None]:
# For each algorithm, "<algo>_test_portfolio" and "<algo>_test_returns" are two
# independent copies of that algorithm's test-period daily-return frame.
a2c_test_portfolio, a2c_test_returns = (
    a2c_test_daily_return.copy(),
    a2c_test_daily_return.copy(),
)
ppo_test_portfolio, ppo_test_returns = (
    ppo_test_daily_return.copy(),
    ppo_test_daily_return.copy(),
)
ddpg_test_portfolio, ddpg_test_returns = (
    ddpg_test_daily_return.copy(),
    ddpg_test_daily_return.copy(),
)
sac_test_portfolio, sac_test_returns = (
    sac_test_daily_return.copy(),
    sac_test_daily_return.copy(),
)
td3_test_portfolio, td3_test_returns = (
    td3_test_daily_return.copy(),
    td3_test_daily_return.copy(),
)

In [None]:
# Persist the test-period portfolio/return frames of every algorithm with %store
# so they can be restored in another kernel via `%store -r`.
%store a2c_test_portfolio
%store a2c_test_returns 

%store ppo_test_portfolio
%store ppo_test_returns 

%store ddpg_test_portfolio
%store ddpg_test_returns 

%store sac_test_portfolio
%store sac_test_returns

%store td3_test_portfolio
%store td3_test_returns

Stored 'a2c_test_portfolio' (DataFrame)
Stored 'a2c_test_returns' (DataFrame)
Stored 'ppo_test_portfolio' (DataFrame)
Stored 'ppo_test_returns' (DataFrame)
Stored 'ddpg_test_portfolio' (DataFrame)
Stored 'ddpg_test_returns' (DataFrame)
Stored 'sac_test_portfolio' (DataFrame)
Stored 'sac_test_returns' (DataFrame)
Stored 'td3_test_portfolio' (DataFrame)
Stored 'td3_test_returns' (DataFrame)
