# Step 1: Environment Setup

In [1]:
import numpy as np
import gym
from gym import spaces
import pandas as pd

class StockTradingEnv(gym.Env):
    """
    A single-stock trading environment for OpenAI gym.

    Actions are discrete: 0 = sell every share held, 1 = hold, 2 = buy as
    many whole shares as the cash balance allows. Trades execute at the
    ``'Close'`` value of the row at the current step of ``stock_data``.

    Each observation is a 1-D array laid out as::

        [flattened lookback window, balance, holdings, technical indicators]
    """

    # Number of values produced by _get_technical_indicators(); the
    # observation-space shape is derived from it so the two cannot drift apart.
    NUM_INDICATORS = 20

    def __init__(self, stock_data, initial_balance=10000, lookback_window_size=10):
        """
        Args:
            stock_data: pandas DataFrame with one row per time step and a
                ``'Close'`` column. Assumes every column is numeric, because
                all columns are flattened into the observation.
            initial_balance: Cash available at the start of each episode.
            lookback_window_size: Number of rows included in each observation.

        Raises:
            ValueError: If ``stock_data`` has no rows.
        """
        super().__init__()

        if len(stock_data) == 0:
            raise ValueError("stock_data must contain at least one row")

        self.stock_data = stock_data
        self.initial_balance = initial_balance
        self.lookback_window_size = lookback_window_size

        self.action_space = spaces.Discrete(3)  # Sell, Hold, Buy

        # Window rows * columns, plus balance and holdings, plus indicators.
        obs_size = (
            lookback_window_size * stock_data.shape[1] + 2 + self.NUM_INDICATORS
        )
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float64
        )

        self.reset()

    def reset(self):
        """Restore the initial balance, clear holdings and return the first observation."""
        self.balance = self.initial_balance
        self.current_step = 0
        self.holdings = 0  # Number of shares held
        return self._next_observation()

    def _price_at(self, step):
        """Return the 'Close' value at ``step``, clamped to the last available row."""
        row = min(step, len(self.stock_data) - 1)
        return self.stock_data.iloc[row]['Close']

    def _next_observation(self):
        """
        Build the observation for the current step.

        The window covers the ``lookback_window_size`` rows ending at the
        current step (inclusive), so the agent never sees rows that lie after
        the price it trades at. When fewer rows are available (start of the
        data) the earliest row is repeated so the observation length is fixed.
        """
        last_row = min(self.current_step, len(self.stock_data) - 1)
        first_row = max(0, last_row + 1 - self.lookback_window_size)
        frame = self.stock_data.iloc[first_row:last_row + 1]

        window = frame.values
        missing = self.lookback_window_size - len(window)
        if missing > 0:
            padding = np.repeat(window[:1], missing, axis=0)
            window = np.concatenate([padding, window])

        # Append additional features like balance, holdings and technical indicators
        additional_features = self._get_technical_indicators(frame)
        obs = np.append(window.flatten(), [self.balance, self.holdings, *additional_features])
        return obs

    def _get_technical_indicators(self, frame):
        """
        Return the technical indicators for ``frame`` as a list of floats.

        Dummy implementation: always ``NUM_INDICATORS`` zeros. A real
        implementation (moving average, RSI, ...) must keep that length.
        """
        return [0.0] * self.NUM_INDICATORS

    def step(self, action):
        """
        Execute ``action`` at the current price and advance one time step.

        Returns:
            Tuple ``(observation, reward, done, info)``; ``info`` is always an
            empty dict and ``done`` becomes True once every row has been used.
        """
        current_price = self._price_at(self.current_step)

        if action == 0:  # Sell
            self.balance += self.holdings * current_price
            self.holdings = 0
        elif action == 2 and 0 < current_price <= self.balance:  # Buy
            # Only the newly bought shares are paid for; shares already held
            # were paid for when they were acquired.
            shares_bought = self.balance // current_price
            self.holdings += shares_bought
            self.balance -= shares_bought * current_price

        self.current_step += 1
        done = self.current_step >= len(self.stock_data)
        reward = self._calculate_reward()
        obs = self._next_observation()
        return obs, reward, done, {}

    def _calculate_reward(self):
        """
        Return the portfolio value minus the initial balance.

        Holdings are valued at the price of the current step; on the terminal
        step the last available price is used instead of indexing past the data.
        """
        current_portfolio_value = self.balance + self.holdings * self._price_at(self.current_step)
        return current_portfolio_value - self.initial_balance

    def render(self, mode='human', close=False):
        """Rendering hook; intentionally a no-op."""
        pass


# Step 2: Policy Network

In [2]:
import torch
import torch.nn as nn

class PolicyNetwork(nn.Module):
    """
    Feed-forward policy: Linear -> ReLU -> Linear -> Softmax.

    The softmax is taken over the last dimension, so the output is a
    probability distribution over ``output_size`` actions.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # The stack is kept under the attribute name ``layers`` so that
        # state_dict keys stay ``layers.<index>.*``.
        stages = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
            nn.Softmax(dim=-1),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Return the action probabilities for input ``x``."""
        action_probabilities = self.layers(x)
        return action_probabilities
    
# Module-level policy instance: 30 input features, one hidden layer of 128
# units, 3 outputs (one per action).
# NOTE(review): input_size must equal the length of the observation vector
# passed to this network — confirm that 30 matches the environment's
# observations.
policy_net = PolicyNetwork(input_size=30, hidden_size=128, output_size=3)

# Step 3: MAML Implementation

In [7]:
import torch.optim as optim
from copy import deepcopy

def inner_loop(policy, optimizer, env, num_steps):
    """
    Adapt ``policy`` to a single task with ``num_steps`` policy-gradient updates.

    Every update samples one fresh trajectory from ``env`` with the current
    policy, converts its rewards to discounted returns, and takes one
    ``optimizer`` step on the resulting policy loss. ``policy`` is modified
    in place; nothing is returned.
    """
    for _update in range(num_steps):
        observations, chosen_actions, step_rewards = sample_trajectory(env, policy)
        discounted = compute_returns(step_rewards)
        loss = compute_policy_loss(observations, chosen_actions, discounted, policy)

        # Clear stale gradients before back-propagating this update's loss.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
def maml_update(initial_policy, tasks, alpha, beta, num_inner_steps):
    """
    Perform one first-order meta-update and return the new meta-policy.

    For every task a copy of the meta-policy is adapted with ``inner_loop``.
    The meta-parameters are then moved a fraction ``beta`` of the way towards
    the average of the adapted parameters::

        theta <- theta + beta * mean_over_tasks(theta'_task - theta)

    No second-order gradients are taken through the inner loop, so this is a
    first-order approximation of MAML.

    Args:
        initial_policy: ``nn.Module`` holding the current meta-parameters.
            It is not modified.
        tasks: Iterable of task environments.
        alpha: Learning rate of the per-task (inner) Adam optimizer.
        beta: Step size of the meta-update.
        num_inner_steps: Number of inner-loop updates per task.

    Returns:
        A new policy with the updated meta-parameters. If ``tasks`` is empty
        an unchanged copy of ``initial_policy`` is returned.
    """
    meta_policy = deepcopy(initial_policy)

    tasks = list(tasks)
    if not tasks:
        return meta_policy

    meta_params = list(meta_policy.parameters())
    # One accumulator per parameter for the summed adaptation direction
    # (theta'_task - theta). Separate tensors are used so that no entry
    # aliases another or the parameters' .grad fields.
    adaptation_sums = [torch.zeros_like(param) for param in meta_params]

    for task_env in tasks:
        task_policy = deepcopy(meta_policy)
        task_optimizer = optim.Adam(task_policy.parameters(), lr=alpha)

        # Perform task-specific updates
        inner_loop(task_policy, task_optimizer, task_env, num_inner_steps)

        with torch.no_grad():
            for total, param, task_param in zip(
                adaptation_sums, meta_params, task_policy.parameters()
            ):
                total.add_(task_param - param)

    # Meta-update: step towards the adapted parameters, averaged over tasks.
    with torch.no_grad():
        for param, total in zip(meta_params, adaptation_sums):
            param.add_(total, alpha=beta / len(tasks))

    return meta_policy

def sample_trajectory(env, policy, max_trajectory_length=1000):
    """
    Roll out ``policy`` in ``env`` for one episode.

    The episode starts from ``env.reset()`` and ends when the environment
    reports ``done`` or after ``max_trajectory_length`` steps. At each step
    the action is sampled from the probabilities returned by ``policy``.

    Returns:
        Three parallel lists ``(states, actions, rewards)`` where
        ``states[i]`` is the observation in which ``actions[i]`` was taken.
    """
    visited, chosen, earned = [], [], []
    observation = env.reset()

    for _step in range(max_trajectory_length):
        probabilities = policy(torch.from_numpy(observation).float())
        action = torch.multinomial(probabilities, 1).item()

        successor, reward, finished, _info = env.step(action)

        visited.append(observation)
        chosen.append(action)
        earned.append(reward)

        if finished:
            break
        observation = successor

    return visited, chosen, earned


def compute_returns(rewards, gamma=0.99):
    """
    Compute the discounted return for every step of a trajectory.

    ``returns[t] = rewards[t] + gamma * returns[t + 1]``, with the return
    after the final step taken as 0.

    Args:
        rewards: Sequence of per-step rewards, in time order.
        gamma: Discount factor.

    Returns:
        List of the same length as ``rewards``; empty when ``rewards`` is empty.
    """
    R = 0
    returns = []
    # Accumulate from the last step backwards, appending in O(1), and flip
    # once at the end; inserting at the front each time would be quadratic.
    for r in reversed(rewards):
        R = r + gamma * R
        returns.append(R)
    returns.reverse()
    return returns


def compute_policy_loss(states, actions, returns, policy):
    """
    Return the policy-gradient loss for one trajectory.

    The loss is the negated sum over steps of ``log pi(action | state) * R``,
    so minimizing it raises the probability of actions followed by high
    returns. For an empty trajectory the plain integer 0 is returned.
    """
    def weighted_log_prob(observation, chosen_action, discounted_return):
        # Log-probability of the action actually taken, scaled by its return.
        probabilities = policy(torch.from_numpy(observation).float())
        return torch.log(probabilities[chosen_action]) * discounted_return

    total = 0
    for step in zip(states, actions, returns):
        # Subtract because the objective is maximized while the loss is minimized.
        total = total - weighted_log_prob(*step)
    return total


# Other needed functions

In [6]:
import random
def step(self, action):
    """
    Advance a trading environment by one time step (stand-alone variant).

    ``self`` is any object exposing ``stock_data``, ``current_step``,
    ``balance``, ``holdings``, ``_calculate_reward()`` and
    ``_next_observation()``.

    Args:
        action: 0 = sell every share held, 2 = buy as many whole shares as
            the balance allows, anything else = hold.

    Returns:
        Tuple ``(observation, reward, done, info)``; ``info`` is always an
        empty dict.
    """
    # A DataFrame needs positional ``.iloc`` lookup; a plain sequence of row
    # mappings is indexed directly.
    rows = getattr(self.stock_data, 'iloc', self.stock_data)
    current_price = rows[self.current_step]['Close']
    self.current_step += 1

    if action == 0:  # Sell
        self.balance += self.holdings * current_price
        self.holdings = 0
    elif action == 2 and 0 < current_price <= self.balance:  # Buy
        # Pay only for the shares bought now, not for those already held.
        shares_bought = self.balance // current_price
        self.holdings += shares_bought
        self.balance -= shares_bought * current_price
    # Hold does nothing

    reward = self._calculate_reward()
    done = self.current_step >= len(self.stock_data)
    return self._next_observation(), reward, done, {}

def sample_tasks(stock_data, num_tasks):
    """
    Draw ``num_tasks`` distinct stocks at random and wrap each in an environment.

    Args:
        stock_data: Mapping from stock identifier to that stock's data.
        num_tasks: Number of stocks to sample without replacement.

    Returns:
        List with one ``StockTradingEnv`` per sampled stock.
    """
    available = list(stock_data.keys())
    chosen = random.sample(available, num_tasks)

    environments = []
    for stock in chosen:
        environments.append(StockTradingEnv(stock_data[stock]))
    return environments


# Step 4 : Training Loop

In [None]:
# Hyperparameters
num_iterations = 100  # Number of meta-updates performed by the loop below
num_inner_steps = 5  # Inner-loop updates per task, forwarded to maml_update
alpha = 0.01  # Inner loop learning rate
beta = 0.001  # Meta learning rate

# Policy network setup
# NOTE(review): input_size must equal the length of the observation vector
# returned by the task environments — confirm 20 against the environment.
input_size = 20  # Define according to your state representation
hidden_size = 64
output_size = 3  # One output per action; StockTradingEnv.step uses 0 = sell, 1 = hold, 2 = buy

initial_policy = PolicyNetwork(input_size, hidden_size, output_size)

# Main training loop
# NOTE(review): `stock_list` and `num_tasks_per_iteration` are not assigned in
# the visible part of this file; they must be defined before this cell runs.
# sample_tasks() calls `.keys()` on its first argument, so `stock_list`
# presumably has to be a mapping from stock id to data — confirm.
for iteration in range(num_iterations):
    tasks = sample_tasks(stock_list, num_tasks_per_iteration)  # Sample tasks

    # maml_update returns a new policy object, which becomes the starting
    # point of the next iteration.
    initial_policy = maml_update(initial_policy, tasks, alpha, beta, num_inner_steps)
    # Evaluate the updated policy


In [None]:

# Optional: Evaluate the updated policy on validation tasks
    
# ... rest of the functions used above
def initialize_policy_parameters():
    """
    Create a fresh ``PolicyNetwork`` and return its parameters.

    The network is sized by the module-level ``input_size``, ``hidden_size``
    and ``output_size``. The return value is whatever
    ``nn.Module.parameters()`` yields (an iterator over the parameters).
    """
    return PolicyNetwork(input_size, hidden_size, output_size).parameters()
def sample_trajectories(task_env, policy_params, num_trajectories=K):
    """
    Collect ``num_trajectories`` complete episodes from ``task_env``.

    ``policy_params`` is loaded into the module-level ``policy_net`` via
    ``load_state_dict`` before sampling.

    NOTE(review): this relies on ``policy_net.act(state)`` returning an
    action for the given state; confirm that the policy class provides it.

    Args:
        task_env: Environment exposing ``reset()`` and ``step(action)``.
        policy_params: State dict to load into ``policy_net``.
        num_trajectories: Number of episodes to run.

    Returns:
        List of trajectories; each trajectory is a list of
        ``(state, action, reward)`` tuples in time order.
    """
    trajectories = []
    policy_net.load_state_dict(policy_params)  # Assuming policy_net is a global variable for the policy network

    for _ in range(num_trajectories):
        state = task_env.reset()
        trajectory = []
        # The termination flag must start False for every episode; otherwise
        # the loop condition reads an unset (or stale) value.
        done = False
        while not done:
            action = policy_net.act(state)  # Assuming a method that decides an action based on the current state
            next_state, reward, done, _ = task_env.step(action)
            trajectory.append((state, action, reward))
            state = next_state
        trajectories.append(trajectory)
    return trajectories
def compute_loss(trajectories, policy_params):
    """
    Return the summed policy loss over every step of every trajectory.

    ``policy_params`` is first loaded into the module-level ``policy_net``.
    Each step contributes ``-log_prob * reward``, where ``log_prob`` comes
    from ``policy_net.evaluate_actions(state, action)``.

    NOTE(review): assumes ``policy_net`` provides ``evaluate_actions`` and
    that it returns a tensor — confirm against the policy class.
    """
    policy_net.load_state_dict(policy_params)  # Load the current policy parameters

    # Negated because the optimizer minimizes while the objective is maximized.
    step_losses = [
        -policy_net.evaluate_actions(state, action) * reward
        for trajectory in trajectories
        for state, action, reward in trajectory
    ]
    return torch.stack(step_losses).sum()
def compute_meta_gradient(tasks, D_prime, theta, theta_i_prime, task_gradients):
    """
    Average, over tasks, the post-adaptation gradient minus the task gradient.

    For task ``i`` the adapted parameters ``theta_i_prime[i]`` are loaded into
    the module-level ``policy_net``, new trajectories are sampled with them,
    and the gradient of the resulting loss with respect to those parameters
    is taken. ``task_gradients[i]`` is subtracted from the first element of
    that gradient and the differences are averaged over ``len(tasks)``.

    ``D_prime`` is accepted but not used: trajectories are re-sampled here.

    Returns:
        Tensor shaped like ``theta``.
    """
    accumulated = torch.zeros_like(theta)

    for index, task_env in enumerate(tasks):
        adapted_params = theta_i_prime[index]

        # Load the adapted parameters for the current task
        policy_net.load_state_dict(adapted_params)

        # Fresh rollouts and loss under the adapted parameters
        fresh_rollouts = sample_trajectories(task_env, adapted_params, num_trajectories=K)
        post_adaptation_loss = compute_loss(fresh_rollouts, adapted_params)

        # Gradient w.r.t. the adapted parameters
        adapted_gradients = torch.autograd.grad(post_adaptation_loss, adapted_params)

        accumulated += adapted_gradients[0] - task_gradients[index]

    return accumulated / len(tasks)

# Initialization
# NOTE(review): this cell reads as a pseudo-code sketch of the MAML outer loop
# and does not look runnable as written; the notes below mark the specific
# mismatches with the definitions earlier in this file.
# NOTE(review): initialize_policy_parameters() returns the result of
# `.parameters()`, which is an iterator rather than a tensor; wrapping it in
# a list for Adam, deep-copying it and doing arithmetic on it below all
# presumably need a tensor or a list of tensors — confirm the intended type.
theta = initialize_policy_parameters()
meta_optimizer = torch.optim.Adam([theta], lr=beta)

# Meta-training loop
# NOTE(review): `done` is not assigned anywhere before this loop in the
# visible file, and nothing inside the loop body sets it.
while not done:
    task_gradients = []  # To store gradients for each task
    
    # Task sampling
    # NOTE(review): sample_tasks is defined above as
    # sample_tasks(stock_data, num_tasks); this call passes a single argument,
    # and `num_tasks` is not defined in the visible file.
    tasks = sample_tasks(num_tasks)
    
    # Inner loop
    for task in tasks:
        # Copy initial policy parameters for task-specific learning
        theta_i = deepcopy(theta)
        
        # Sample K trajectories using the current policy parameters theta
        # (`K` is not defined in the visible file)
        D = sample_trajectories(task, theta_i, num_trajectories=K)
        
        # Evaluate gradients of the loss w.r.t. theta_i
        task_loss = compute_loss(D, theta_i)
        task_gradients.append(torch.autograd.grad(task_loss, theta_i))
        
        # Compute adapted parameters theta'_i using gradient descent
        # NOTE(review): torch.autograd.grad returns a tuple, so
        # `alpha * task_gradients[-1]` multiplies a tuple by a float.
        theta_i_prime = theta_i - alpha * task_gradients[-1]
        
        # Sample new trajectories D'_i using the adapted parameters theta'_i
        D_prime = sample_trajectories(task, theta_i_prime, num_trajectories=K)
    
    # Outer loop
    # NOTE(review): theta_i_prime and D_prime are overwritten on every
    # iteration of the inner loop, so only the last task's values reach
    # compute_meta_gradient, which indexes theta_i_prime[i] once per task.
    meta_gradient = compute_meta_gradient(tasks, D_prime, theta, theta_i_prime, task_gradients)
    meta_optimizer.zero_grad()
    # The meta-gradient is assigned directly to .grad so the optimizer step applies it.
    theta.grad = meta_gradient
    meta_optimizer.step()
