<a href="https://colab.research.google.com/github/isuba-ian-k/Finance/blob/master/fin_algo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### ============================================================================
# **REINFORCEMENT LEARNING ALGO AGENT on FX-EUR/USD FIN TRADING**
###### 500k+ steps/hour
###### Built on i5 3rd-gen/4GB DDR3
### =============================================================================

## 0.0 project setup - lib imports

In [23]:
# for data handling
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# for data visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# for the RL-Algo Trading Agent
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# other utils
from collections import deque
import hashlib
import random

## 1.0 data handling

In [17]:
# Load the daily EUR/USD candles and lower-case the column names.
df_unc = pd.read_csv("/content/sample_data/eurusd_5yrs_daily_cleaned.csv")
df_unc.columns = df_unc.columns.str.lower()

# Feature table: every column except the date (drop() returns a new frame,
# so df_unc itself is left untouched).
features = df_unc.drop(columns=['date'])

# Candle-geometry spreads. Each entry is (new column, minuend, subtrahend);
# the order below is the column order of the resulting frame.
for spread_name, minuend, subtrahend in (
    ('c - o', 'close', 'open'),
    ('h - o', 'high', 'open'),
    ('o - l', 'open', 'low'),
    ('h - l', 'high', 'low'),
    ('h - c', 'high', 'close'),
    ('c - l', 'close', 'low'),
):
    features[spread_name] = features[minuend] - features[subtrahend]

# Rescale every column independently with MinMaxScaler's default settings.
# NOTE(review): the scaler is fitted on the full history, and with the default
# range each column's minimum maps to exactly 0 -- so `df['close']` contains a
# zero. Anything that divides by, or takes logs of, the scaled close should
# read `features` instead.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

# Wrap the scaled array back into a DataFrame with the original column names.
df = pd.DataFrame(scaled_features, columns=features.columns)
df.head()

Unnamed: 0,open,high,low,close,c - o,h - o,o - l,h - l,h - c,c - l
0,1.076473,1.08885,1.075338,1.076461,-1.2e-05,0.012378,0.001134,0.013512,0.012389,0.001123
1,1.080124,1.084952,1.077018,1.080264,0.00014,0.004828,0.003106,0.007934,0.004688,0.003246
2,1.088755,1.102548,1.087654,1.088957,0.000202,0.013793,0.001101,0.014894,0.013591,0.001303
3,1.104618,1.108647,1.095518,1.104826,0.000207,0.004029,0.0091,0.013129,0.003822,0.009308
4,1.113536,1.113784,1.101225,1.113908,0.000372,0.000248,0.012312,0.01256,-0.000124,0.012684


## 2.0 environment design

Create a State

In [24]:
window = 30   # look-back length in rows (reduced from 60)


def create_state(df, idx, window=30):
    """Build the observation vector for row ``idx`` from its trailing window.

    The vector is the window's log returns of ``close`` (left-padded with
    zeros to length ``window``) followed by six values, in this order:
    RSI / 100, MACD, relative distance of the last close from two
    exponentially weighted means (``com=20`` and ``com=50``), the standard
    deviation of the last 20 returns, and the most recent return.

    Args:
        df: DataFrame with a ``close`` column.
        idx: Positional index of the last row of the look-back window.
        window: Number of rows in the look-back window.

    Returns:
        1-D NumPy array (length ``window + 6`` assuming ``compute_rsi`` and
        ``compute_macd`` return scalars -- TODO confirm), or ``None`` when
        fewer than ``window`` rows exist up to and including ``idx``.
    """
    # Not enough history yet for a full window.
    if idx + 1 < window:
        return None

    prices = df.iloc[idx - window + 1:idx + 1]['close'].values

    if len(prices) > 1:
        log_returns = np.log(prices[1:] / prices[:-1])
    else:
        log_returns = np.zeros(window - 1)

    # Indicator helpers are defined outside this block.
    rsi = compute_rsi(prices)
    macd = compute_macd(prices)

    recent_vol = 0 if len(log_returns) < 20 else np.std(log_returns[-20:])

    last_price = prices[-1]
    price_series = pd.Series(prices)
    # NOTE(review): the first positional argument of ewm() is `com`, not
    # `span`, so these are centre-of-mass 20 / 50 means rather than classic
    # 20- / 50-period EMAs -- confirm which one is intended.
    ema20_dist = (last_price - price_series.ewm(com=20).mean().iloc[-1]) / last_price
    ema50_dist = (last_price - price_series.ewm(com=50).mean().iloc[-1]) / last_price

    latest_return = log_returns[-1] if len(log_returns) > 0 else 0

    # Left-pad so the returns segment always has exactly `window` entries.
    lead_zeros = window - len(log_returns)
    padded = np.pad(log_returns, (lead_zeros, 0), mode='constant')
    extras = [rsi / 100, macd, ema20_dist, ema50_dist, recent_vol, latest_return]
    return np.concatenate([padded, extras])

In [None]:
# ------------------------------
# 2. FAST REPLAY BUFFER (still needed)
# ------------------------------
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Once ``capacity`` transitions are stored, pushing a new one evicts the
    oldest (``deque(maxlen=capacity)``).

    Args:
        capacity: Maximum number of transitions kept.
        device: Optional torch device (or device string) on which sampled
            batches are created. When omitted, a module-level ``device``
            variable is used if one exists, otherwise the CPU; this keeps
            ``ReplayBuffer()`` working with code that relies on a global
            ``device`` while no longer failing when none is defined.
    """

    def __init__(self, capacity=50_000, device=None):
        self.buffer = deque(maxlen=capacity)
        self.device = device

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.buffer.append((state, action, reward, next_state, done))

    def _target_device(self):
        """Return the device on which sampled tensors should be created."""
        if self.device is not None:
            return self.device
        # Looked up at call time so a `device` defined after this class (e.g.
        # in a later notebook cell) is still honoured.
        return globals().get("device", "cpu")

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of
            tensors; ``actions`` is int64, all others are float32.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        target = self._target_device()

        # Stack through NumPy first: one contiguous array per field converts
        # far faster than a tuple of per-step arrays or NumPy scalars.
        def as_float(values):
            return torch.as_tensor(np.asarray(values, dtype=np.float32), device=target)

        return (
            as_float(states),
            torch.as_tensor(np.asarray(actions, dtype=np.int64), device=target),
            as_float(rewards),
            as_float(next_states),
            as_float(dones),
        )

    def __len__(self):
        return len(self.buffer)

## 3.0 rl-algo architecture

The Neural Network

In [19]:
# ------------------------------
# 1. SIMPLE NEURAL NET Q-FUNCTION
# ------------------------------
class DQN(nn.Module):
    """Fully connected Q-network mapping a state vector to one value per action.

    Layer widths are ``state_size -> hidden -> hidden -> hidden // 2 ->
    action_size`` with a ReLU after every layer except the last.
    """

    def __init__(self, state_size, action_size=3, hidden=256):
        super().__init__()
        half = hidden // 2
        widths = [state_size, hidden, hidden, half]

        # Hidden stack: a Linear + ReLU pair for each consecutive width pair.
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())

        # Output head: raw Q-values, no activation.
        layers.append(nn.Linear(half, action_size))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values produced by the network for input ``x``."""
        q_values = self.net(x)
        return q_values

Training loop

In [None]:
## ------------------------------
# 5. FINAL TRAINING LOOP
# ------------------------------
# Device for both networks. ReplayBuffer.sample reads this module-level name
# too, so it has to exist before the first batch is drawn.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# some initializations
buffer = ReplayBuffer()
target_update = 100  # NOTE(review): unused -- the target net is synced per episode below

batch_size = 64
state_size = window + 6   # `window` padded returns + 6 indicator values
action_size = 3           # 0 = Flat, 1 = Long, 2 = Short

# The Neural Nets
policy_net = DQN(state_size, action_size).to(device)
target_net = DQN(state_size, action_size).to(device)
target_net.load_state_dict(policy_net.state_dict())
optimizer = optim.Adam(policy_net.parameters(), lr=0.0005)

gamma = 0.99
epsilon = 1.0
epsilon_min = 0.01
# NOTE(review): decay is applied once per episode, so after 15 episodes
# epsilon is still ~0.93 and the agent acts almost purely at random.
# Confirm whether a per-step decay (or a faster rate) was intended.
epsilon_decay = 0.995

episodes = 15
all_rewards = []

pip_cost = 0.5   # transaction cost in pips, charged only when a position is taken

# States and rewards are built from the RAW prices in `features`, not the
# MinMax-scaled `df`: the `* 10000` pip conversion only holds for real quotes,
# and the scaled close contains an exact 0, which turns the log returns and
# EMA distances inside create_state into inf / NaN.
price_df = features
closes = price_df['close'].to_numpy()
n_rows = len(price_df)

# The state for a given row does not change between episodes (assumes
# create_state is a pure function of its arguments), so build each one once
# instead of twice per step in every episode.
state_cache = {i: create_state(price_df, i, window) for i in range(window, n_rows)}

for ep in range(episodes):
    ep_reward = 0.0
    for idx in range(window, n_rows - 1):
        state = state_cache[idx]
        if state is None:
            continue

        # Epsilon-greedy action
        if random.random() < epsilon:
            action = random.randint(0, action_size - 1)
        else:
            with torch.no_grad():
                state_t = torch.as_tensor(state, dtype=torch.float32, device=device)
                action = policy_net(state_t.unsqueeze(0)).argmax().item()

        # === REWARD ===
        # Next-day close change expressed in pips (0.0001 -> 1 pip).
        price_diff_pips = float(closes[idx + 1] - closes[idx]) * 10000

        if action == 1:      # Long
            reward = price_diff_pips - pip_cost
        elif action == 2:    # Short
            reward = -price_diff_pips - pip_cost
        else:                # Flat: no position, so no P&L and no cost
            reward = 0.0

        # === NEXT STATE & DONE ===
        next_state = state_cache[idx + 1]
        if next_state is None:
            next_state = state  # fallback
        done = 1.0 if idx >= n_rows - 2 else 0.0

        # Store transition
        buffer.push(state, action, reward, next_state, done)
        ep_reward += reward

        # Train when buffer is warm
        if len(buffer) > 1000:
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)

            current_q = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
            # Bootstrapped target from the frozen network; no gradient needed.
            with torch.no_grad():
                next_q = target_net(next_states).max(1)[0]
                target_q = rewards + gamma * next_q * (1 - dones)

            loss = F.mse_loss(current_q, target_q)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # === End of episode ===
    if (ep + 1) % 5 == 0:  # Update target net every 5 episodes
        target_net.load_state_dict(policy_net.state_dict())

    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    all_rewards.append(ep_reward)
    print(f"Episode {ep+1:2d}/{episodes} | Total Reward: {ep_reward:8,.1f} pips | Epsilon: {epsilon:.3f}")



## 4.0 evaluation

In [None]:
# === FINAL REWARD PLOT ===
# One marker per episode: total reward collected in that episode.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(all_rewards, marker='o')
ax.set_title("DQN Training Progress (Pips)")
ax.set_xlabel("Episode")
ax.set_ylabel("Total Reward (pips)")
ax.grid(True)
plt.show()

# Highest single-episode total seen during training.
print(f"Best DQN episode: {max(all_rewards):,.0f} pips")

In [None]:
# Plot rewards
# Plain line version of the per-episode reward curve (same data as
# `all_rewards`, without markers).
curve_fig, curve_ax = plt.subplots(figsize=(10, 4))
curve_ax.plot(all_rewards)
curve_ax.set_title("DQN Training Reward Curve")
curve_ax.set_xlabel("Episode")
curve_ax.set_ylabel("Total Reward")
curve_ax.grid()
plt.show()

## The End.