In [3]:
## install required packages
!pip install swig
!pip install wrds
!pip install pyportfolioopt
## install finrl library
!pip install git+https://github.com/AI4Finance-Foundation/FinRL.git

Collecting git+https://github.com/AI4Finance-Foundation/FinRL.git
  Cloning https://github.com/AI4Finance-Foundation/FinRL.git to c:\users\admin\appdata\local\temp\pip-req-build-_ow2_5vt
  Resolved https://github.com/AI4Finance-Foundation/FinRL.git to commit dcf6bde8d64e1a747673d24dde30dce195bc3cf1
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting elegantrl@ git+https://github.com/AI4Finance-Foundation/ElegantRL.git (from finrl==0.3.8)
  Cloning https://github.com/AI4Finance-Foundation/ElegantRL.git to c:\users\admin\appdata\local\temp\pip-install-9z1w5efd\elegantrl_3ad6fb0bfbb94d7a81291d32c54c57a9
  Resolved https://github.com/AI4Finance-Foundation/ElegantRL.git to commit 37aac1f592e1add9f9fd37ae8db

  Running command git clone --filter=blob:none --quiet https://github.com/AI4Finance-Foundation/FinRL.git 'C:\Users\Admin\AppData\Local\Temp\pip-req-build-_ow2_5vt'
  Running command git clone --filter=blob:none --quiet https://github.com/AI4Finance-Foundation/ElegantRL.git 'C:\Users\Admin\AppData\Local\Temp\pip-install-9z1w5efd\elegantrl_3ad6fb0bfbb94d7a81291d32c54c57a9'


In [4]:
# Import our Deep SARSA Agent
from deep_sarsa_agent import DeepSARSAAgent

In [5]:
import finrl
import pandas as pd
from finrl.agents.stablebaselines3.models import DRLAgent
from finrl.config import INDICATORS, TRAINED_MODEL_DIR, RESULTS_DIR
from finrl.main import check_and_make_directories
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv

In [6]:
# Load the pre-processed training data.
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# the previous backslash-separated literal only resolved on Windows.
train = pd.read_csv('SARSA_FinancialRL/data/train_data.csv')

# Do not promote the first CSV column to the index (the original author notes
# that FinRL needs an integer index). If a previously saved index came back as
# an 'Unnamed: 0' column, discard it.
if 'Unnamed: 0' in train.columns:
    train = train.drop(columns=['Unnamed: 0'])

# Rebuild the index as a plain 0..n-1 RangeIndex.
train = train.reset_index(drop=True)

# Quick sanity report: row count, column names and index type.
print(f"Loaded {len(train)} rows of training data")
print(f"Columns: {train.columns.tolist()}")
print(f"Index type: {type(train.index)}")
train.head()

Loaded 2556 rows of training data
Columns: ['date', 'open', 'high', 'low', 'close', 'volume', 'tic', 'day', 'macd', 'rsi', 'cci', 'dx', 'atr']
Index type: <class 'pandas.core.indexes.range.RangeIndex'>


Unnamed: 0,date,open,high,low,close,volume,tic,day,macd,rsi,cci,dx,atr
0,2015-01-13,29.16,29.74,29.01,29.74,84550,vnm,1,0.013013,100.0,66.666667,100.0,0.797407
1,2015-01-14,30.03,30.03,29.16,29.45,87550,vnm,2,0.007588,65.0,100.0,38.095238,0.823419
2,2015-01-15,29.45,29.45,29.01,29.16,86090,vnm,3,-0.005536,47.206704,-133.333333,2.204929,0.716661
3,2015-01-16,29.16,29.16,29.01,29.16,33740,vnm,4,-0.01255,47.206704,-103.109656,2.204929,0.585941
4,2015-01-19,29.16,29.16,28.87,29.01,60950,vnm,0,-0.022945,40.548192,-99.904489,28.230517,0.527051


In [7]:
# Size of the observation vector handed to the trading environment.
# The indicator list is spelled out here (these are the indicator columns this
# notebook uses) rather than taken from FinRL's INDICATORS constant.
tech_indicator_list = ["macd", "rsi", "cci", "dx", "atr"]

# Number of distinct tickers in the training data.
stock_dimension = len(train["tic"].unique())

# One global slot, plus (2 + number of indicators) slots per ticker.
# NOTE(review): presumably the global slot is the cash balance and the two
# per-ticker slots are price and shares held (FinRL state layout) — confirm.
slots_per_stock = 2 + len(tech_indicator_list)
state_space = 1 + stock_dimension * slots_per_stock
print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")

Stock Dimension: 1, State Space: 8


In [None]:
# Build the StockTradingEnv configuration and create the training environment.

# Per-ticker cost rates. Two independent lists are built on purpose: the
# chained assignment used before bound both names to the SAME list object, so
# changing one side's costs in place would silently change the other side too.
buy_cost_list = [0.001] * stock_dimension
sell_cost_list = [0.001] * stock_dimension
# Start every episode holding zero shares of each ticker.
num_stock_shares = [0] * stock_dimension

env_kwargs = {
    "hmax": 100,
    "initial_amount": 1000000,
    "num_stock_shares": num_stock_shares,
    "buy_cost_pct": buy_cost_list,
    "sell_cost_pct": sell_cost_list,
    "state_space": state_space,
    "stock_dim": stock_dimension,
    "tech_indicator_list": tech_indicator_list,
    "action_space": stock_dimension,
    "reward_scaling": 1e-3  # raised from 1e-4 to 1e-3 so the reward signal is more meaningful
}

# Echo the settings that matter most when reading the training logs.
print("Environment configuration:")
print(f"  Initial amount: {env_kwargs['initial_amount']:,}")
print(f"  Max shares per trade: {env_kwargs['hmax']}")
print(f"  Reward scaling: {env_kwargs['reward_scaling']}")

e_train_gym = StockTradingEnv(df = train, **env_kwargs)
print("\n✓ Environment created successfully!")

## ⚠️ Critical Fix: Loss Instability

### Vấn đề phát hiện:
Loss tăng liên tục từ 9.5M → 16.6M cho thấy **training không ổn định**

### Root Causes:
1. **Reward scaling quá nhỏ (1e-4)** → Rewards gần như = 0 → TD targets không có signal
2. **LR_UPDATE quá cao (0.8)** → TD update quá mạnh → destabilize learning
3. **Network learning rate quá cao (5e-4)** → Overshooting optimal values
4. **Không có gradient clipping** → Exploding gradients

### Solutions Applied:

**1. Tăng Reward Scaling: 1e-4 → 1e-3 (×10)**
   - Rewards có magnitude lớn hơn → meaningful signal cho learning
   
**2. Giảm LR_UPDATE: 0.8 → 0.3 (÷2.7)**
   - TD update nhẹ nhàng hơn: `target = 0.7 * old_Q + 0.3 * TD_target`
   - Giảm variance trong training
   
**3. Giảm Network Learning Rate: 5e-4 → 1e-4 (÷5)**
   - Slow and steady learning
   - Tránh overshooting
   
**4. Tăng Network Capacity: [128,64] → [256,128]**
   - Với 8-dimensional state, network lớn hơn capture patterns tốt hơn
   
**5. Giảm Gamma: 0.95 → 0.90**
   - Giảm variance từ long-term rewards
   - Focus vào immediate profits
   
**6. Giảm Batch Size & Epochs: batch size 64 → 32, epochs per trajectory 8 → 5**
   - Training steps nhỏ hơn → ổn định hơn
   - Tránh overfitting trên single trajectory

**7. Thêm Gradient Clipping (trong code)**
   - Clip gradient norm ≤ 1.0
   - Ngăn exploding gradients

### Expected Results:
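- ✅ Loss giảm hoặc ít nhất ổn định (không tăng)
- ✅ Sharpe ratio dương và cải thiện dần
- ✅ Rewards tăng qua các episodes

> **Lưu ý:** Trong lần chạy được ghi lại bên dưới (LR_UPDATE = 0.3), loss vẫn tăng từ ~6.44M lên ~7.76M sau 80 episodes, nên các kỳ vọng này chưa được xác nhận.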

In [None]:
# --- Q-network / agent hyperparameters ---
HIDDEN_SIZES = [256, 128]   # increased network capacity
LEARNING_RATE = 1e-4        # lowered learning rate for stability
GAMMA = 0.90                # lowered discount factor to reduce variance
ACTION_DIM = 10             # discrete action dimension passed to the agent

# Collect the constructor arguments in one place, then build the agent on the
# training environment.
# NOTE(review): the recorded output of this cell still reports
# "8 -> [128, 64] -> 10"; re-run and confirm that hidden_sizes is honoured.
agent_config = {
    "action_dim": ACTION_DIM,
    "hidden_sizes": HIDDEN_SIZES,
    "learning_rate": LEARNING_RATE,
    "gamma": GAMMA,
}
agent = DeepSARSAAgent(env=e_train_gym, **agent_config)

# One print call, one line per item, so the text output matches the
# line-by-line version.
print(
    "\n✓ Agent created successfully!",
    f"   Learning Rate: {LEARNING_RATE}",
    f"   Gamma: {GAMMA}",
    f"   Network: {HIDDEN_SIZES}",
    sep="\n",
)

DeepSARSAAgent initialized:
  State dimension: 8
  Discrete action dimension: 10
  Continuous action dimension: 1
  Device: cpu
  Network architecture: 8 -> [128, 64] -> 10

✓ Agent created successfully!


In [19]:
# --- Training-loop hyperparameters ---
NUM_EPISODES = 80               # reduced for a quick test run
EPSILON_START = 1.0             # start with 100% exploration
EPSILON_END = 0.1               # raised minimum exploration
EPSILON_DECAY = 0.96            # slow decay
LR_UPDATE = 0.3                 # strongly reduced TD learning rate for stability
BATCH_SIZE = 32                 # smaller batch size for more stable learning
EPOCHS_PER_TRAJECTORY = 5       # fewer epochs to avoid overfitting
# NOTE(review): the recorded progress bar ends at eps=0.038, below EPSILON_END;
# verify how DeepSARSAAgent.train applies epsilon_end.

# Announce the run configuration before the (slow) training starts.
rule = "=" * 60
print(
    rule,
    "Starting Stabilized Training",
    rule,
    "Configuration:",
    f"  Episodes: {NUM_EPISODES}",
    f"  Epsilon: {EPSILON_START} -> {EPSILON_END} (decay={EPSILON_DECAY})",
    f"  Batch size: {BATCH_SIZE}",
    f"  Epochs per trajectory: {EPOCHS_PER_TRAJECTORY}",
    f"  TD update lr: {LR_UPDATE} CRITICAL: Giảm để ổn định loss",
    rule,
    sep="\n",
)
print()

# Run training; the returned history is consumed by the analysis cells below.
train_settings = {
    "num_episodes": NUM_EPISODES,
    "epsilon_start": EPSILON_START,
    "epsilon_end": EPSILON_END,
    "epsilon_decay": EPSILON_DECAY,
    "lr_update": LR_UPDATE,
    "batch_size": BATCH_SIZE,
    "epochs_per_trajectory": EPOCHS_PER_TRAJECTORY,
}
training_history = agent.train(**train_settings, verbose=True)

print("\n" + rule, "✓ Training Complete!", rule, sep="\n")

Starting Stabilized Training
Configuration:
  Episodes: 80
  Epsilon: 1.0 -> 0.1 (decay=0.96)
  Batch size: 32
  Epochs per trajectory: 5
  TD update lr: 0.3 CRITICAL: Giảm để ổn định loss



Training Deep SARSA:   0%|          | 0/80 [00:00<?, ?it/s]

Training Deep SARSA:   8%|▊         | 6/80 [00:35<07:27,  6.04s/it, reward=0.15, loss=6440266.8006, eps=0.783, steps=2556] 

day: 2555, episode: 340
begin_total_asset: 1000000.00
end_total_asset: 1001974.98
total_reward: 1974.98
total_cost: 1007.52
total_trades: 1532
Sharpe: 0.298


Training Deep SARSA:  20%|██        | 16/80 [01:40<06:54,  6.48s/it, reward=-0.03, loss=6667325.4725, eps=0.520, steps=2556]

day: 2555, episode: 350
begin_total_asset: 1000000.00
end_total_asset: 1000322.98
total_reward: 322.98
total_cost: 1068.62
total_trades: 1635
Sharpe: 0.030


Training Deep SARSA:  32%|███▎      | 26/80 [02:49<06:24,  7.12s/it, reward=-0.07, loss=6902791.1069, eps=0.346, steps=2556]

day: 2555, episode: 360
begin_total_asset: 1000000.00
end_total_asset: 999698.86
total_reward: -301.14
total_cost: 1060.54
total_trades: 1630
Sharpe: -0.058


Training Deep SARSA:  45%|████▌     | 36/80 [03:56<05:11,  7.08s/it, reward=0.69, loss=7121674.2663, eps=0.230, steps=2556] 

day: 2555, episode: 370
begin_total_asset: 1000000.00
end_total_asset: 998692.83
total_reward: -1307.17
total_cost: 1078.47
total_trades: 1664
Sharpe: -0.134


Training Deep SARSA:  57%|█████▊    | 46/80 [05:05<04:08,  7.32s/it, reward=-0.04, loss=7166990.5163, eps=0.153, steps=2556]

day: 2555, episode: 380
begin_total_asset: 1000000.00
end_total_asset: 999845.46
total_reward: -154.54
total_cost: 1030.34
total_trades: 1587
Sharpe: -0.021


Training Deep SARSA:  70%|███████   | 56/80 [06:14<02:55,  7.31s/it, reward=-0.02, loss=7318920.6500, eps=0.102, steps=2556]

day: 2555, episode: 390
begin_total_asset: 1000000.00
end_total_asset: 1002506.27
total_reward: 2506.27
total_cost: 1042.53
total_trades: 1604
Sharpe: 0.327


Training Deep SARSA:  82%|████████▎ | 66/80 [07:24<01:40,  7.19s/it, reward=-0.05, loss=7534918.6087, eps=0.068, steps=2556]

day: 2555, episode: 400
begin_total_asset: 1000000.00
end_total_asset: 999920.05
total_reward: -79.95
total_cost: 1033.25
total_trades: 1596
Sharpe: -0.012


Training Deep SARSA:  95%|█████████▌| 76/80 [08:39<00:29,  7.27s/it, reward=-0.06, loss=7729153.8706, eps=0.045, steps=2556]

day: 2555, episode: 410
begin_total_asset: 1000000.00
end_total_asset: 998933.08
total_reward: -1066.92
total_cost: 999.32
total_trades: 1537
Sharpe: -0.233


Training Deep SARSA: 100%|██████████| 80/80 [09:07<00:00,  6.84s/it, reward=0.12, loss=7757001.7050, eps=0.038, steps=2556] 



Training history plot saved to training_history.pdf

✓ Training Complete!


  plt.show()


In [None]:
# Save model
model_path = r'D:\SARSA_finRL\SARSA_FinancialRL\Agent'
agent.save_model(model_path)

print(f"\n✓ Model saved to: {model_path}")


RuntimeError: Parent directory Agent does not exist.

In [None]:
# Analyze training results
import matplotlib.pyplot as plt

print("\n📊 Training Summary:")
print("="*60)
if training_history:
    # Find best episode
    best_episode = max(training_history, key=lambda x: x['total_reward'])
    worst_episode = min(training_history, key=lambda x: x['total_reward'])
    
    print(f"Best Episode #{best_episode['episode']}:")
    print(f"  Total Reward: {best_episode['total_reward']:.2f}")
    print(f"  Sharpe Ratio: {best_episode['sharpe']:.3f}")
    print(f"  Total Trades: {best_episode['total_trades']}")
    print(f"  Final Asset: {best_episode['end_total_asset']:.2f}")
    print()
    print(f"Worst Episode #{worst_episode['episode']}:")
    print(f"  Total Reward: {worst_episode['total_reward']:.2f}")
    print(f"  Sharpe Ratio: {worst_episode['sharpe']:.3f}")
    print()
    
    # Calculate average metrics
    avg_reward = sum(ep['total_reward'] for ep in training_history) / len(training_history)
    avg_sharpe = sum(ep['sharpe'] for ep in training_history) / len(training_history)
    avg_trades = sum(ep['total_trades'] for ep in training_history) / len(training_history)
    
    print(f"Average Metrics:")
    print(f"  Avg Reward: {avg_reward:.2f}")
    print(f"  Avg Sharpe: {avg_sharpe:.3f}")
    print(f"  Avg Trades: {avg_trades:.0f}")
    
print("="*60)

In [None]:
# Plot loss trend to diagnose training
import matplotlib.pyplot as plt
import numpy as np

if training_history:
    episodes = [ep['episode'] for ep in training_history]
    rewards = [ep['total_reward'] for ep in training_history]
    sharpes = [ep['sharpe'] for ep in training_history]
    
    fig, axes = plt.subplots(1, 2, figsize=(14, 4))
    
    # Plot 1: Reward trend
    axes[0].plot(episodes, rewards, marker='o', markersize=3, linewidth=1.5, alpha=0.7)
    axes[0].axhline(y=0, color='r', linestyle='--', alpha=0.5, label='Break-even')
    axes[0].set_xlabel('Episode')
    axes[0].set_ylabel('Total Reward')
    axes[0].set_title('📈 Reward Progression')
    axes[0].grid(True, alpha=0.3)
    axes[0].legend()
    
    # Plot 2: Sharpe ratio trend
    axes[1].plot(episodes, sharpes, marker='s', markersize=3, linewidth=1.5, alpha=0.7, color='green')
    axes[1].axhline(y=0, color='r', linestyle='--', alpha=0.5, label='Zero Sharpe')
    axes[1].set_xlabel('Episode')
    axes[1].set_ylabel('Sharpe Ratio')
    axes[1].set_title('📊 Sharpe Ratio Progression')
    axes[1].grid(True, alpha=0.3)
    axes[1].legend()
    
    plt.tight_layout()
    plt.savefig('training_metrics.png', dpi=150, bbox_inches='tight')
    print("✓ Training metrics plot saved to training_metrics.png")
    plt.show()
    
    # Performance analysis
    print("\n" + "="*60)
    print("🎯 Performance Diagnosis:")
    print("="*60)
    
    positive_rewards = [r for r in rewards if r > 0]
    negative_rewards = [r for r in rewards if r < 0]
    
    print(f"Profitable episodes: {len(positive_rewards)}/{len(rewards)} ({len(positive_rewards)/len(rewards)*100:.1f}%)")
    print(f"Loss-making episodes: {len(negative_rewards)}/{len(rewards)} ({len(negative_rewards)/len(rewards)*100:.1f}%)")
    
    if positive_rewards:
        print(f"Avg profit when winning: {np.mean(positive_rewards):.2f}")
    if negative_rewards:
        print(f"Avg loss when losing: {np.mean(negative_rewards):.2f}")
    
    # Trend analysis
    first_half_reward = np.mean(rewards[:len(rewards)//2])
    second_half_reward = np.mean(rewards[len(rewards)//2:])
    improvement = second_half_reward - first_half_reward
    
    print(f"\nLearning Progress:")
    print(f"  First half avg reward: {first_half_reward:.2f}")
    print(f"  Second half avg reward: {second_half_reward:.2f}")
    print(f"  Improvement: {improvement:+.2f} ({'✅ LEARNING' if improvement > 0 else '⚠️ NOT IMPROVING'})")
    print("="*60)