# Portfolio Optimization using Deep Reinforcement Learning
---

## 8.0 Deep Reinforcement Learning Portfolios

### 8.1 Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pylab as plt
matplotlib.use('Agg')
import datetime

from pypfopt.efficient_frontier import efficient_frontier
from pypfopt.efficient_frontier.efficient_frontier import EfficientFrontier
from pypfopt import efficient_frontier
from pypfopt import risk_models
from pypfopt import expected_returns

In [2]:
import finrl

In [3]:
# Imports from the FinRL Library

from finrl import config
from backtest import BackTestStats, BaselineStats, BackTestPlot, backtest_strat, baseline_strat
from backtest import backtest_strat, baseline_strat



### 8.2 Load Data

In [4]:
%store -r train_df
%store -r test_df

In [5]:
tech_indicator_list = ['f01','f02','f03','f04']

In [6]:
train_df.head()

Unnamed: 0,date,tic,close,high,low,open,volume,cov_list,f01,f02,f03,f04
0,2019-01-09,ASIANPAINT.NS,1414.0,1397.150024,1402.0,1402.5,973687,"[[0.00015852269746834745, 1.720325697910041e-0...",1.47337,0.150646,0.0,4.06108
0,2019-01-09,BAJAJ-AUTO.NS,2710.0,2672.5,2696.899902,2702.5,285560,"[[0.00015852269746834745, 1.720325697910041e-0...",1.47337,0.150646,0.0,4.06108
0,2019-01-09,GRASIM.NS,840.956177,823.226685,831.991821,836.673218,4021049,"[[0.00015852269746834745, 1.720325697910041e-0...",1.47337,0.150646,0.0,4.06108
0,2019-01-09,HCLTECH.NS,474.274994,466.149994,469.200012,473.5,2471720,"[[0.00015852269746834745, 1.720325697910041e-0...",1.47337,0.150646,0.0,4.06108
0,2019-01-09,HDFCBANK.NS,1060.675049,1051.300049,1058.400024,1059.0,4284314,"[[0.00015852269746834745, 1.720325697910041e-0...",1.47337,0.150646,0.0,4.06108


In [7]:
train_df.shape

(20300, 12)

### 8.4 Implement DRL Algorithms

In [8]:
from finrl.agents.elegantrl.models import DRLAgent
from finrl.meta.env_portfolio_allocation.env_portfolio import StockPortfolioEnv

In [9]:
import env_portfolio
from env_portfolio import StockPortfolioEnv

import train_models
from train_models import DRLAgents

In [10]:
stock_dimension = len(train_df.tic.unique())
state_space = stock_dimension
print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")


Stock Dimension: 20, State Space: 20


In [11]:
weights_initial = [1/stock_dimension]*stock_dimension

In [12]:
env_kwargs = {
    "hmax": 500, 
    "initial_amount": 1000000, 
    "transaction_cost_pct": 0.001, 
    "state_space": state_space, 
    "stock_dim": stock_dimension, 
    "tech_indicator_list": tech_indicator_list, 
    "action_space": stock_dimension, 
    "reward_scaling": 0,
    'initial_weights': [1/stock_dimension]*stock_dimension
}

In [13]:
e_train_gym = StockPortfolioEnv(df = train_df, **env_kwargs)

In [14]:
env_train, _ = e_train_gym.get_sb_env()
print(type(env_train))

<class 'stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv'>




#### 8.4.1 Model 1: A2C: Advantage Actor-Critic

In [15]:
import env_portfolio
from env_portfolio import StockPortfolioEnv

import train_models
from train_models import DRLAgents

In [16]:
train_df.head()

Unnamed: 0,date,tic,close,high,low,open,volume,cov_list,f01,f02,f03,f04
0,2019-01-09,ASIANPAINT.NS,1414.0,1397.150024,1402.0,1402.5,973687,"[[0.00015852269746834745, 1.720325697910041e-0...",1.47337,0.150646,0.0,4.06108
0,2019-01-09,BAJAJ-AUTO.NS,2710.0,2672.5,2696.899902,2702.5,285560,"[[0.00015852269746834745, 1.720325697910041e-0...",1.47337,0.150646,0.0,4.06108
0,2019-01-09,GRASIM.NS,840.956177,823.226685,831.991821,836.673218,4021049,"[[0.00015852269746834745, 1.720325697910041e-0...",1.47337,0.150646,0.0,4.06108
0,2019-01-09,HCLTECH.NS,474.274994,466.149994,469.200012,473.5,2471720,"[[0.00015852269746834745, 1.720325697910041e-0...",1.47337,0.150646,0.0,4.06108
0,2019-01-09,HDFCBANK.NS,1060.675049,1051.300049,1058.400024,1059.0,4284314,"[[0.00015852269746834745, 1.720325697910041e-0...",1.47337,0.150646,0.0,4.06108


In [17]:
train_df.to_excel('train.xlsx', sheet_name="train_df", index=False, header=True)


In [18]:
from train_models import DRLAgents

In [19]:
# initialize
agent = DRLAgents(env=env_train)

A2C_PARAMS = {"n_steps": 5, "ent_coef": 0.005, "learning_rate": 0.0002}
model_a2c = agent.get_model(model_name="a2c",model_kwargs = A2C_PARAMS)


{'n_steps': 5, 'ent_coef': 0.005, 'learning_rate': 0.0002}
Using cpu device


In [20]:
trained_a2c = agent.train_model(model=model_a2c, 
                                tb_log_name='a2c',
                                total_timesteps=50000)

Logging to tensorboard_log/a2c/a2c_127
-------------------------------------
| time/                 |           |
|    fps                | 1550      |
|    iterations         | 100       |
|    time_elapsed       | 0         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -28.3     |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.0002    |
|    n_updates          | 99        |
|    policy_loss        | 1.29e+08  |
|    std                | 0.997     |
|    value_loss         | 2.31e+13  |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1615     |
|    iterations         | 200      |
|    time_elapsed       | 0        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -28.3    |
|    explained_variance | 0        |
|    learning_rate      | 0.0002   |
|    n_updates       

#### 8.4.2 Model 2: PPO : Proximal Policy Optimization

In [21]:
agent = DRLAgents(env = env_train)
PPO_PARAMS = {
    "n_steps": 2048,
    "ent_coef": 0.005,
    "learning_rate": 0.0001,
    "batch_size": 128,
}
model_ppo = agent.get_model("ppo",model_kwargs = PPO_PARAMS)

{'n_steps': 2048, 'ent_coef': 0.005, 'learning_rate': 0.0001, 'batch_size': 128}
Using cpu device


In [22]:
trained_ppo = agent.train_model(model=model_ppo, 
                             tb_log_name='ppo',
                             total_timesteps=50000)

Logging to tensorboard_log/ppo/ppo_8
begin_total_asset:1000000
end_total_asset:1735992.762826523
Sharpe:  0.967328971383132
begin_total_asset:1000000
end_total_asset:1782458.9134549478
Sharpe:  1.0132703318306215
-----------------------------
| time/              |      |
|    fps             | 2437 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
begin_total_asset:1000000
end_total_asset:1848063.5215638904
Sharpe:  1.0695004672535804
begin_total_asset:1000000
end_total_asset:2024231.015839269
Sharpe:  1.2187809726347039
---------------------------------------
| time/                   |           |
|    fps                  | 2136      |
|    iterations           | 2         |
|    time_elapsed         | 1         |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |


#### 8.4.3 Model 3: DDPG : Deep Deterministic Policy Gradient

In [23]:
agent = DRLAgents(env = env_train)
DDPG_PARAMS = {"batch_size": 128, "buffer_size": 50000, "learning_rate": 0.001}


model_ddpg = agent.get_model("ddpg",model_kwargs = DDPG_PARAMS)

{'batch_size': 128, 'buffer_size': 50000, 'learning_rate': 0.001}
Using cpu device


In [24]:
trained_ddpg = agent.train_model(model=model_ddpg, 
                             tb_log_name='ddpg',
                             total_timesteps=50000)

Logging to tensorboard_log/ddpg/ddpg_6
begin_total_asset:1000000
end_total_asset:1906019.2483081822
Sharpe:  1.1458286025228233
begin_total_asset:1000000
end_total_asset:1874216.9866449942
Sharpe:  1.1163149272650246
begin_total_asset:1000000
end_total_asset:1874216.9866449942
Sharpe:  1.1163149272650246
begin_total_asset:1000000
end_total_asset:1874216.9866449942
Sharpe:  1.1163149272650246
----------------------------------
| time/              |           |
|    episodes        | 4         |
|    fps             | 150       |
|    time_elapsed    | 26        |
|    total_timesteps | 4060      |
| train/             |           |
|    actor_loss      | -2.27e+07 |
|    critic_loss     | 7.96e+11  |
|    learning_rate   | 0.001     |
|    n_updates       | 3959      |
----------------------------------
begin_total_asset:1000000
end_total_asset:1874216.9866449942
Sharpe:  1.1163149272650246
begin_total_asset:1000000
end_total_asset:1874216.9866449942
Sharpe:  1.1163149272650246
begin_t

#### 8.4.4 Model 4: SAC : Soft Actor-Critic

In [25]:
agent = DRLAgents(env = env_train)
SAC_PARAMS = {
    "batch_size": 128,
    "buffer_size": 100000,
    "learning_rate": 0.0003,
    "learning_starts": 100,
    "ent_coef": "auto_0.1",
}

model_sac = agent.get_model("sac",model_kwargs = SAC_PARAMS)

{'batch_size': 128, 'buffer_size': 100000, 'learning_rate': 0.0003, 'learning_starts': 100, 'ent_coef': 'auto_0.1'}
Using cpu device


In [26]:
trained_sac = agent.train_model(model=model_sac, 
                             tb_log_name='sac',
                             total_timesteps=50000)

Logging to tensorboard_log/sac/sac_5
begin_total_asset:1000000
end_total_asset:1868735.1478745686
Sharpe:  1.0901952475838916
begin_total_asset:1000000
end_total_asset:1826751.7565411686
Sharpe:  1.0550710004492392
begin_total_asset:1000000
end_total_asset:1826751.5464523996
Sharpe:  1.055070361193
begin_total_asset:1000000
end_total_asset:1826752.0876221845
Sharpe:  1.0550709743703564
----------------------------------
| time/              |           |
|    episodes        | 4         |
|    fps             | 131       |
|    time_elapsed    | 30        |
|    total_timesteps | 4060      |
| train/             |           |
|    actor_loss      | -8.53e+06 |
|    critic_loss     | 7.7e+11   |
|    ent_coef        | 0.379     |
|    ent_coef_loss   | 199       |
|    learning_rate   | 0.0003    |
|    n_updates       | 3959      |
----------------------------------
begin_total_asset:1000000
end_total_asset:1826750.494671965
Sharpe:  1.055069577735394
begin_total_asset:1000000
end_tota

#### 8.4.5 Model 5: TD3 : Twin Delayed Deep Deterministic Policy Gradien

In [27]:
agent = DRLAgents(env = env_train)
TD3_PARAMS = {"batch_size": 100, 
              "buffer_size": 1000000, 
              "learning_rate": 0.001}

model_td3 = agent.get_model("td3",model_kwargs = TD3_PARAMS)

{'batch_size': 100, 'buffer_size': 1000000, 'learning_rate': 0.001}
Using cpu device


In [28]:
trained_td3 = agent.train_model(model=model_td3, 
                             tb_log_name='td3',
                             total_timesteps=30000)

Logging to tensorboard_log/td3/td3_5
begin_total_asset:1000000
end_total_asset:1834581.0840704073
Sharpe:  1.0406735944713668
begin_total_asset:1000000
end_total_asset:1868097.3487184967
Sharpe:  1.0684380898678556
begin_total_asset:1000000
end_total_asset:1868097.3487184967
Sharpe:  1.0684380898678556
begin_total_asset:1000000
end_total_asset:1868097.3487184967
Sharpe:  1.0684380898678556
----------------------------------
| time/              |           |
|    episodes        | 4         |
|    fps             | 122       |
|    time_elapsed    | 33        |
|    total_timesteps | 4060      |
| train/             |           |
|    actor_loss      | -1.05e+07 |
|    critic_loss     | 4.75e+11  |
|    learning_rate   | 0.001     |
|    n_updates       | 3959      |
----------------------------------
begin_total_asset:1000000
end_total_asset:1868097.3487184967
Sharpe:  1.0684380898678556
begin_total_asset:1000000
end_total_asset:1868097.3487184967
Sharpe:  1.0684380898678556
begin_tot

### 8.5 Fittng Model on Training Data

In [29]:
# A2C Train Model
e_trade_gym = StockPortfolioEnv(df = train_df, **env_kwargs)
env_trade, obs_trade = e_trade_gym.get_sb_env()

a2c_train_daily_return, a2c_train_weights = DRLAgents.DRL_prediction(model=trained_a2c,
                        test_data = train_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



begin_total_asset:1000000
end_total_asset:1790541.9756475668
Sharpe:  1.0206933131400584


In [30]:
# PPO Train Model
e_trade_gym = StockPortfolioEnv(df = train_df, **env_kwargs)
env_trade, obs_trade = e_trade_gym.get_sb_env()

ppo_train_daily_return, ppo_train_weights = DRLAgents.DRL_prediction(model=trained_ppo,
                        test_data = train_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



begin_total_asset:1000000
end_total_asset:2055045.5600348732
Sharpe:  1.2531702186291702


In [31]:
# DDPG Train Model
e_trade_gym = StockPortfolioEnv(df = train_df, **env_kwargs)
env_trade, obs_trade = e_trade_gym.get_sb_env()

ddpg_train_daily_return, ddpg_train_weights = DRLAgents.DRL_prediction(model=trained_ddpg,
                        test_data = train_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



begin_total_asset:1000000
end_total_asset:1874216.9866449942
Sharpe:  1.1163149272650246


In [32]:
# SAC Train Model
e_trade_gym = StockPortfolioEnv(df = train_df, **env_kwargs)
env_trade, obs_trade = e_trade_gym.get_sb_env()

sac_train_daily_return, sac_train_weights = DRLAgents.DRL_prediction(model=trained_sac,
                        test_data = train_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



begin_total_asset:1000000
end_total_asset:1846167.3147196728
Sharpe:  1.0751296092512685


In [33]:
# TD3 Train Model
e_trade_gym = StockPortfolioEnv(df = train_df, **env_kwargs)
env_trade, obs_trade = e_trade_gym.get_sb_env()

td3_train_daily_return, td3_train_weights = DRLAgents.DRL_prediction(model=trained_td3,
                        test_data = train_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



begin_total_asset:1000000
end_total_asset:1868097.3487184967
Sharpe:  1.0684380898678556


In [34]:
# Store the Training Models
%store a2c_train_daily_return
%store ppo_train_daily_return
%store ddpg_train_daily_return
%store sac_train_daily_return
%store td3_train_daily_return

Stored 'a2c_train_daily_return' (DataFrame)
Stored 'ppo_train_daily_return' (DataFrame)
Stored 'ddpg_train_daily_return' (DataFrame)
Stored 'sac_train_daily_return' (DataFrame)
Stored 'td3_train_daily_return' (DataFrame)


### 8.6 Trading
Assume that we have Rs1,000,000 initial capital at 2024-01-01. We use the DDPG model to trade NIFTY50 stocks

In [35]:
test_df.tail(5)

Unnamed: 0,date,tic,close,high,low,open,volume,cov_list,f01,f02,f03,f04
251,2024-02-26,RELIANCE.NS,2989.050049,2965.0,2974.649902,2987.100098,3756553,"[[9.755062055703997e-05, 4.4546950611781485e-0...",0.510949,0.005566,0.0,3.781142
251,2024-02-26,SBILIFE.NS,1553.400024,1525.099976,1549.150024,1531.150024,1283074,"[[9.755062055703997e-05, 4.4546950611781485e-0...",0.510949,0.005566,0.0,3.781142
251,2024-02-26,TCS.NS,4050.0,3982.75,4001.050049,4036.0,1379284,"[[9.755062055703997e-05, 4.4546950611781485e-0...",0.510949,0.005566,0.0,3.781142
251,2024-02-26,TITAN.NS,3690.0,3610.850098,3622.5,3690.0,546842,"[[9.755062055703997e-05, 4.4546950611781485e-0...",0.510949,0.005566,0.0,3.781142
251,2024-02-26,WIPRO.NS,536.5,526.049988,532.900024,536.25,4410318,"[[9.755062055703997e-05, 4.4546950611781485e-0...",0.510949,0.005566,0.0,3.781142


In [36]:
# A2C Test Model
e_trade_gym = StockPortfolioEnv(df = test_df, **env_kwargs)
env_trade, obs_trade = e_trade_gym.get_sb_env()

a2c_test_daily_return, a2c_test_weights = DRLAgents.DRL_prediction(model=trained_a2c,
                        test_data = test_df,
                        test_env = env_trade,
                        test_obs = obs_trade)

begin_total_asset:1000000
end_total_asset:1296811.6370321312
Sharpe:  3.1430460632422927




In [37]:
a2c_test_daily_return.head()

Unnamed: 0,date,daily_return
0,2023-02-16,0.0
1,2023-02-17,-0.005276
2,2023-02-20,-0.001997
3,2023-02-21,-0.003438
4,2023-02-22,-0.010396


In [38]:
a2c_test_weights.to_csv('a2c_test_weights.csv')

In [39]:
a2c_test_weights.head()

Unnamed: 0_level_0,ASIANPAINT.NS,BAJAJ-AUTO.NS,GRASIM.NS,HCLTECH.NS,HDFCBANK.NS,HDFCLIFE.NS,HINDUNILVR.NS,ICICIBANK.NS,ITC.NS,KOTAKBANK.NS,LT.NS,MARUTI.NS,NESTLEIND.NS,NTPC.NS,POWERGRID.NS,RELIANCE.NS,SBILIFE.NS,TCS.NS,TITAN.NS,WIPRO.NS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2023-02-16,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
2023-02-17,0.078287,0.078287,0.0288,0.0288,0.078287,0.078287,0.062078,0.078287,0.063451,0.029875,0.0288,0.0288,0.0288,0.0288,0.0288,0.0288,0.030733,0.078287,0.042089,0.071648
2023-02-20,0.055541,0.053976,0.029339,0.040607,0.079752,0.029339,0.069721,0.03744,0.029339,0.079036,0.079752,0.029339,0.029339,0.029339,0.030206,0.079752,0.029339,0.079752,0.079752,0.029339
2023-02-21,0.095044,0.073877,0.034965,0.034965,0.095044,0.035297,0.056106,0.095044,0.034965,0.034965,0.034965,0.034965,0.034965,0.034965,0.034965,0.034965,0.034965,0.095044,0.034965,0.034965
2023-02-22,0.087752,0.087752,0.032282,0.032282,0.087752,0.041471,0.032282,0.087752,0.032282,0.032282,0.032282,0.032282,0.032282,0.032282,0.044105,0.043077,0.032282,0.077482,0.087752,0.032282


In [40]:
# PPO Test Model
e_trade_gym = StockPortfolioEnv(df = test_df, **env_kwargs)
env_trade, obs_trade = e_trade_gym.get_sb_env()

ppo_test_daily_return, ppo_test_weights = DRLAgents.DRL_prediction(model=trained_ppo,
                        test_data = test_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



begin_total_asset:1000000
end_total_asset:1291875.697714956
Sharpe:  3.179998740437339


In [41]:
ppo_test_weights.to_csv('ppo_test_weights')

In [42]:
# DDPG Test Model
e_trade_gym = StockPortfolioEnv(df = test_df, **env_kwargs)
env_trade, obs_trade = e_trade_gym.get_sb_env()

ddpg_test_daily_return, ddpg_test_weights = DRLAgents.DRL_prediction(model=trained_ddpg,
                        test_data = test_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



begin_total_asset:1000000
end_total_asset:1285002.5452271781
Sharpe:  3.1333540864650433


In [43]:
ddpg_test_weights.to_csv('ddpg_test_weights')

In [44]:
# SAC Test Model
e_trade_gym = StockPortfolioEnv(df = test_df, **env_kwargs)
env_trade, obs_trade = e_trade_gym.get_sb_env()

sac_test_daily_return, sac_test_weights = DRLAgents.DRL_prediction(model=trained_sac,
                        test_data = test_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



begin_total_asset:1000000
end_total_asset:1334937.5633620594
Sharpe:  3.4159674591370908


In [45]:
sac_test_weights.to_csv('sac_test_weights')

In [46]:
# TD3 Test Model
e_trade_gym = StockPortfolioEnv(df = test_df, **env_kwargs)
env_trade, obs_trade = e_trade_gym.get_sb_env()

td3_test_daily_return, td3_test_weights = DRLAgents.DRL_prediction(model=trained_sac,
                        test_data = test_df,
                        test_env = env_trade,
                        test_obs = obs_trade)



begin_total_asset:1000000
end_total_asset:1336283.8886596966
Sharpe:  3.421345262391123


In [47]:
td3_test_weights.to_csv('td3_test_weights')

### 8.7 Save the Portfolios

In [48]:
a2c_test_portfolio = a2c_test_daily_return.copy()
a2c_test_returns = a2c_test_daily_return.copy()

ppo_test_portfolio = ppo_test_daily_return.copy()
ppo_test_returns = ppo_test_daily_return.copy()

ddpg_test_portfolio = ddpg_test_daily_return.copy()
ddpg_test_returns = ddpg_test_daily_return.copy()

sac_test_portfolio = sac_test_daily_return.copy()
sac_test_returns = sac_test_daily_return.copy()

td3_test_portfolio = td3_test_daily_return.copy()
td3_test_returns = td3_test_daily_return.copy()

In [49]:
%store a2c_test_portfolio
%store a2c_test_returns 

%store ppo_test_portfolio
%store ppo_test_returns 

%store ddpg_test_portfolio
%store ddpg_test_returns 

%store sac_test_portfolio
%store sac_test_returns

%store td3_test_portfolio
%store td3_test_returns

Stored 'a2c_test_portfolio' (DataFrame)
Stored 'a2c_test_returns' (DataFrame)
Stored 'ppo_test_portfolio' (DataFrame)
Stored 'ppo_test_returns' (DataFrame)
Stored 'ddpg_test_portfolio' (DataFrame)
Stored 'ddpg_test_returns' (DataFrame)
Stored 'sac_test_portfolio' (DataFrame)
Stored 'sac_test_returns' (DataFrame)
Stored 'td3_test_portfolio' (DataFrame)
Stored 'td3_test_returns' (DataFrame)
