<a href="https://colab.research.google.com/github/jcmachicao/pucp_energy__erasmus_modulos/blob/main/pucp_erasmus__RL_for_EnergyStorage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reinformcement Learning Modeling for Energy Storage System

## 1. Modeling

In [1]:
import numpy as np
import gym
from gym import spaces

In [2]:
class BatteryStorageEnv(gym.Env):
    def __init__(self, capacity=100, max_charge_rate=30, efficiency=0.95):
        super(BatteryStorageEnv, self).__init__()

        self.capacity = capacity
        self.max_charge_rate = max_charge_rate
        self.efficiency = efficiency

        self.action_space = spaces.Discrete(3)  # -1: discharge, 0: idle, 1: charge
        self.observation_space = spaces.Box(
            low=np.array([0, 0, 0, 0]),
            high=np.array([capacity, 1000, 24, 1000]),
            dtype=np.float32
        )

        self.reset()

    def step(self, action):
        action_map = {0: -1, 1: 0, 2: 1}
        action = action_map[action]

        current_charge = self.state[0]
        current_price = self.state[1]

        # Calculate charge amount
        charge_amount = action * self.max_charge_rate
        if action > 0:  # Charging
            charge_amount *= self.efficiency

        # Don't allow discharge if battery is empty or charge if battery is full
        if (current_charge <= 0 and action < 0) or (current_charge >= self.capacity and action > 0):
            charge_amount = 0

        new_charge = np.clip(current_charge + charge_amount, 0, self.capacity)

        # Simple reward structure:
        # Scale trading reward down
        trading_reward = -charge_amount * current_price * 0.1

        # Progressive boundary penalty
        if new_charge <= 0:
            boundary_penalty = -500
        elif new_charge >= self.capacity:
            boundary_penalty = -500
        elif new_charge < 0.2 * self.capacity:
            boundary_penalty = -100 * (0.2 - new_charge/self.capacity)
        elif new_charge > 0.8 * self.capacity:
            boundary_penalty = -100 * (new_charge/self.capacity - 0.8)
        else:
            boundary_penalty = 0

        # Larger range reward
        if 0.4 <= new_charge/self.capacity <= 0.8:
            range_reward = 200
        else:
            range_reward = 0

        reward = trading_reward + boundary_penalty + range_reward

        # 2. Fixed penalty for being at 0% or 100%
        if new_charge <= 0 or new_charge >= self.capacity:
            boundary_penalty = -100  # High penalty for being at boundaries, it was -1000 at the beginning
        else:
            boundary_penalty = 0

        # 3. Reward for being in good range (40-80%)
        if 40 <= new_charge <= 80:
            range_reward = 100
        else:
            range_reward = 0

        reward = trading_reward + boundary_penalty + range_reward

        # Update time and get new price
        self.hour = (self.hour + 1) % 24
        new_price = self._get_price()
        new_demand = self._get_demand()

        self.state = np.array([
            new_charge,
            new_price,
            self.hour,
            new_demand
        ])

        done = self.hour == 0
        return self.state, reward, done, {}

    def reset(self):
        self.hour = 0
        self.state = np.array([
            self.capacity * 0.6,  # Start at 60% charge
            self._get_price(),
            self.hour,
            self._get_demand()
        ])
        return self.state

    def _get_price(self):
        """Simulated electricity price based on time of day"""
        base_price = 50
        time_factor = np.sin(self.hour * np.pi / 12) * 30
        return max(base_price + time_factor + np.random.normal(0, 5), 0)

    def _get_demand(self):
        """Simulated demand based on time of day"""
        base_demand = 500
        time_factor = np.sin(self.hour * np.pi / 12) * 200
        return max(base_demand + time_factor + np.random.normal(0, 20), 0)

  and should_run_async(code)


#### Otras versiones de BatteryStorageEnv

In [28]:
class BatteryStorageEnv_v1(gym.Env):
    def __init__(self, capacity=100, max_charge_rate=30, efficiency=0.95):
        super(BatteryStorageEnv, self).__init__()

        self.capacity = capacity
        self.max_charge_rate = max_charge_rate
        self.efficiency = efficiency

        self.action_space = spaces.Discrete(3)  # -1: discharge, 0: idle, 1: charge
        self.observation_space = spaces.Box(
            low=np.array([0, 0, 0, 0]),
            high=np.array([capacity, 1000, 24, 1000]),
            dtype=np.float32
        )

        # Target range for battery level (40-80% is ideal operating range)
        self.target_min = 0.4 * capacity
        self.target_max = 0.8 * capacity

        self.reset()

    def step(self, action):
        action_map = {0: -1, 1: 0, 2: 1}
        action = action_map[action]

        current_charge = self.state[0]
        current_price = self.state[1]

        # Calculate charge amount
        charge_amount = action * self.max_charge_rate
        if action > 0:  # Charging
            charge_amount *= self.efficiency

        # Don't allow discharge if battery is empty or charge if battery is full
        if (current_charge <= 0 and action < 0) or (current_charge >= self.capacity and action > 0):
            charge_amount = 0

        new_charge = np.clip(current_charge + charge_amount, 0, self.capacity)

        # Calculate rewards

        # 1. Basic energy arbitrage reward (scaled down to be comparable with other rewards)
        energy_reward = -charge_amount * current_price * 0.1

        # 2. Penalty for being outside target range (quadratic penalty)
        if new_charge < self.target_min:
            level_penalty = -((self.target_min - new_charge) / self.capacity) ** 2 * 1000
        elif new_charge > self.target_max:
            level_penalty = -((new_charge - self.target_max) / self.capacity) ** 2 * 1000
        else:
            level_penalty = 0

        # 3. Reward for maintaining good charge level
        if self.target_min <= new_charge <= self.target_max:
            maintenance_reward = 50
        else:
            maintenance_reward = 0

        # 4. Severe penalty for complete discharge or overcharge
        if new_charge <= 0 or new_charge >= self.capacity:
            extreme_penalty = -2000
        else:
            extreme_penalty = 0

        # Combine all rewards
        reward = energy_reward + level_penalty + maintenance_reward + extreme_penalty

        # Update time and get new price
        self.hour = (self.hour + 1) % 24
        new_price = self._get_price()
        new_demand = self._get_demand()

        self.state = np.array([
            new_charge,
            new_price,
            self.hour,
            new_demand
        ])

        done = self.hour == 0

        return self.state, reward, done, {}

    def reset(self):
        self.hour = 0
        self.state = np.array([
            self.capacity * 0.6,  # Start at 60% charge
            self._get_price(),
            self.hour,
            self._get_demand()
        ])
        return self.state

    def _get_price(self):
        """Simulated electricity price based on time of day"""
        base_price = 50
        time_factor = np.sin(self.hour * np.pi / 12) * 30
        return max(base_price + time_factor + np.random.normal(0, 5), 0)

    def _get_demand(self):
        """Simulated demand based on time of day"""
        base_demand = 500
        time_factor = np.sin(self.hour * np.pi / 12) * 200
        return max(base_demand + time_factor + np.random.normal(0, 20), 0)

In [17]:
class BatteryStorageEnv_v0(gym.Env):
    """
    A simplified battery storage environment for RL
    State space: [current_charge, current_price, time_of_day, current_demand]
    Action space: [-1, 0, 1] (discharge, do nothing, charge)
    """
    def __init__(self, capacity=100, max_charge_rate=30, efficiency=0.95):
        super(BatteryStorageEnv, self).__init__()

        # Battery parameters
        self.capacity = capacity
        self.max_charge_rate = max_charge_rate
        self.efficiency = efficiency

        # Define action and observation spaces
        self.action_space = spaces.Discrete(3)  # -1: discharge, 0: idle, 1: charge
        self.observation_space = spaces.Box(
            low=np.array([0, 0, 0, 0]),  # charge, price, time, demand
            high=np.array([capacity, 1000, 24, 1000]),
            dtype=np.float32
        )

        # Initialize state
        self.reset()

    def step(self, action):
        # Decode action
        action_map = {0: -1, 1: 0, 2: 1}
        action = action_map[action]

        # Get current state values
        current_charge = self.state[0]
        current_price = self.state[1]

        # Update battery state
        charge_amount = action * self.max_charge_rate
        if action > 0:  # Charging
            charge_amount *= self.efficiency

        new_charge = np.clip(
            current_charge + charge_amount,
            0,
            self.capacity
        )

        # Calculate primary reward (from energy arbitrage)
        energy_reward = -charge_amount * current_price  # Negative when charging, positive when discharging

        # Add penalty for very low battery levels (below 20%)
        low_battery_penalty = 0
        if new_charge < (0.2 * self.capacity):
            low_battery_penalty = -50 * (0.2 - (new_charge / self.capacity))

        # Add small penalty for very high battery levels (above 90%) to encourage utilization
        high_battery_penalty = 0
        if new_charge > (0.9 * self.capacity):
            high_battery_penalty = -10 * ((new_charge / self.capacity) - 0.9)

        # Combine rewards
        reward = energy_reward + low_battery_penalty + high_battery_penalty

        # Update time and get new price
        self.hour = (self.hour + 1) % 24
        new_price = self._get_price()
        new_demand = self._get_demand()

        # Update state
        self.state = np.array([
            new_charge,
            new_price,
            self.hour,
            new_demand
        ])

        # Episode ends after 24 hours
        done = self.hour == 0

        return self.state, reward, done, {}

    def reset(self):
        self.hour = 0
        self.state = np.array([
            self.capacity * 0.5,  # Start at 50% charge
            self._get_price(),
            self.hour,
            self._get_demand()
        ])
        return self.state

    def _get_price(self):
        """Simulated electricity price based on time of day"""
        base_price = 50
        time_factor = np.sin(self.hour * np.pi / 12) * 30
        return max(base_price + time_factor + np.random.normal(0, 5), 0)

    def _get_demand(self):
        """Simulated demand based on time of day"""
        base_demand = 500
        time_factor = np.sin(self.hour * np.pi / 12) * 200
        return max(base_demand + time_factor + np.random.normal(0, 20), 0)

In [11]:
class BatteryStorageEnv_v3(gym.Env):
    def __init__(self, capacity=100, max_charge_rate=30, efficiency=0.95):
        super(BatteryStorageEnv, self).__init__()

        self.capacity = capacity
        self.max_charge_rate = max_charge_rate
        self.efficiency = efficiency

        self.action_space = spaces.Discrete(3)  # -1: discharge, 0: idle, 1: charge
        self.observation_space = spaces.Box(
            low=np.array([0, 0, 0, 0]),
            high=np.array([capacity, 1000, 24, 1000]),
            dtype=np.float32
        )

        self.reset()

    def step(self, action):
        action_map = {0: -1, 1: 0, 2: 1}
        action = action_map[action]

        current_charge = self.state[0]
        current_price = self.state[1]

        # Calculate charge amount
        charge_amount = action * self.max_charge_rate
        if action > 0:  # Charging
            charge_amount *= self.efficiency

        # Don't allow discharge if battery is empty or charge if battery is full
        if (current_charge <= 0 and action < 0) or (current_charge >= self.capacity and action > 0):
            charge_amount = 0

        new_charge = np.clip(current_charge + charge_amount, 0, self.capacity)

        # Simple reward structure:
        # 1. Profit/loss from energy trading
        trading_reward = -charge_amount * current_price

        # 2. Fixed penalty for being at 0% or 100%
        if new_charge <= 0 or new_charge >= self.capacity:
            boundary_penalty = -100  # High penalty for being at boundaries, it was -1000 at the beginning
        else:
            boundary_penalty = 0

        # 3. Reward for being in good range (40-80%)
        if 40 <= new_charge <= 80:
            range_reward = 100
        else:
            range_reward = 0

        reward = trading_reward + boundary_penalty + range_reward

        # Update time and get new price
        self.hour = (self.hour + 1) % 24
        new_price = self._get_price()
        new_demand = self._get_demand()

        self.state = np.array([
            new_charge,
            new_price,
            self.hour,
            new_demand
        ])

        done = self.hour == 0
        return self.state, reward, done, {}

    def reset(self):
        self.hour = 0
        self.state = np.array([
            self.capacity * 0.6,  # Start at 60% charge
            self._get_price(),
            self.hour,
            self._get_demand()
        ])
        return self.state

    def _get_price(self):
        """Simulated electricity price based on time of day"""
        base_price = 50
        time_factor = np.sin(self.hour * np.pi / 12) * 30
        return max(base_price + time_factor + np.random.normal(0, 5), 0)

    def _get_demand(self):
        """Simulated demand based on time of day"""
        base_demand = 500
        time_factor = np.sin(self.hour * np.pi / 12) * 200
        return max(base_demand + time_factor + np.random.normal(0, 20), 0)

## 2. Training the model

In [3]:
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt

In [12]:
# Modified Q-Learning agent with better exploration
class QLearningAgent:
    def __init__(self, action_space, learning_rate=0.01, discount_factor=0.9, initial_epsilon=0.9):
        self.q_table = defaultdict(lambda: np.zeros(action_space.n))

        self.lr = learning_rate
        self.gamma = discount_factor
        self.initial_epsilon = initial_epsilon
        self.epsilon = initial_epsilon
        self.action_space = action_space
        self.episode_count = 0

    def decay_epsilon(self):
        self.epsilon = max(0.01, self.initial_epsilon * (0.995 ** self.episode_count))
        self.episode_count += 1

    def get_action(self, state):
        if np.random.random() < self.epsilon:
            return self.action_space.sample()

        state_tuple = tuple(np.round(state, 1))  # Round state values for better discretization
        return np.argmax(self.q_table[state_tuple])

    def learn(self, state, action, reward, next_state):
        state_tuple = tuple(np.round(state, 1))
        next_state_tuple = tuple(np.round(next_state, 1))

        best_next_action = np.argmax(self.q_table[next_state_tuple])
        td_target = reward + self.gamma * self.q_table[next_state_tuple][best_next_action]
        td_error = td_target - self.q_table[state_tuple][action]
        self.q_table[state_tuple][action] += self.lr * td_error

#### Otras versiones QLearning

In [7]:
class QLearningAgent_v1:
    def __init__(self, action_space, learning_rate=0.1, discount_factor=0.95, epsilon=0.1):
        self.q_table = defaultdict(lambda: np.zeros(action_space.n))
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon
        self.action_space = action_space

    def get_action(self, state):
        # Epsilon-greedy action selection
        if np.random.random() < self.epsilon:
            return self.action_space.sample()

        # Convert state to tuple for dictionary key
        state_tuple = tuple(state.astype(np.float32))
        return np.argmax(self.q_table[state_tuple])

    def learn(self, state, action, reward, next_state):
        # Convert states to tuples for dictionary keys
        state_tuple = tuple(state.astype(np.float32))
        next_state_tuple = tuple(next_state.astype(np.float32))

        # Q-learning update
        best_next_action = np.argmax(self.q_table[next_state_tuple])
        td_target = reward + self.gamma * self.q_table[next_state_tuple][best_next_action]
        td_error = td_target - self.q_table[state_tuple][action]
        self.q_table[state_tuple][action] += self.lr * td_error

## Training

In [13]:
def train_agent(env, agent, episodes=10000):
    rewards_history = []

    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False

        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.learn(state, action, reward, next_state)
            state = next_state
            total_reward += reward

        rewards_history.append(total_reward)

        # Print progress every 500 episodes
        if (episode + 1) % 500 == 0:
            avg_reward = np.mean(rewards_history[-100:])
            print(f"Episode {episode + 1}, Average Reward: {avg_reward:.2f}")

    return rewards_history

In [14]:
def plot_training_results(rewards_history):
    plt.figure(figsize=(10, 5))
    plt.plot(rewards_history)
    plt.title('Training Progress')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.grid(True)
    plt.show()

In [15]:
# Create environment and agent
env = BatteryStorageEnv()
agent = QLearningAgent(env.action_space)

In [16]:
# Test the trained agent
def test_agent(env, agent, episodes=5):
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False
        actions_taken = []

        print(f"\nEpisode {episode + 1}")
        while not done:
            action = agent.get_action(state)
            actions_taken.append(action)
            state, reward, done, _ = env.step(action)
            total_reward += reward

            print(f"Hour {int(state[2])}: Battery Level: {state[0]:.1f}%, "
                  f"Price: ${state[1]:.2f}, Action: {['Discharge', 'Idle', 'Charge'][action]}")

        print(f"Total Reward: ${total_reward:.2f}")

In [17]:
# Test BEFORE training to show random behavior
print("BEFORE TRAINING (Random Actions):")
test_agent(env, agent, episodes=2)  # Show just 2 episodes of bad performance

print("\n" + "="*50 + "\n")

BEFORE TRAINING (Random Actions):

Episode 1
Hour 1: Battery Level: 60.0%, Price: $58.03, Action: Idle
Hour 2: Battery Level: 60.0%, Price: $64.81, Action: Idle
Hour 3: Battery Level: 30.0%, Price: $71.40, Action: Discharge
Hour 4: Battery Level: 0.0%, Price: $82.35, Action: Discharge
Hour 5: Battery Level: 28.5%, Price: $79.39, Action: Charge
Hour 6: Battery Level: 28.5%, Price: $77.51, Action: Idle
Hour 7: Battery Level: 57.0%, Price: $80.84, Action: Charge
Hour 8: Battery Level: 57.0%, Price: $75.74, Action: Idle
Hour 9: Battery Level: 85.5%, Price: $80.80, Action: Charge
Hour 10: Battery Level: 55.5%, Price: $62.90, Action: Discharge
Hour 11: Battery Level: 55.5%, Price: $64.86, Action: Idle
Hour 12: Battery Level: 55.5%, Price: $45.80, Action: Idle
Hour 13: Battery Level: 84.0%, Price: $39.52, Action: Charge
Hour 14: Battery Level: 100.0%, Price: $32.45, Action: Charge
Hour 15: Battery Level: 70.0%, Price: $21.75, Action: Discharge
Hour 16: Battery Level: 70.0%, Price: $21.87, Act

In [18]:
# Now train the agent
print("Training the agent...")
rewards_history = train_agent(env, agent, episodes=10000)
print("\n" + "="*50 + "\n")

Training the agent...
Episode 500, Average Reward: -159.72
Episode 1000, Average Reward: -86.14
Episode 1500, Average Reward: -198.05
Episode 2000, Average Reward: -75.37
Episode 2500, Average Reward: -80.80
Episode 3000, Average Reward: -128.45
Episode 3500, Average Reward: -124.34
Episode 4000, Average Reward: -125.64
Episode 4500, Average Reward: -192.91
Episode 5000, Average Reward: -232.58
Episode 5500, Average Reward: -120.62
Episode 6000, Average Reward: -28.35
Episode 6500, Average Reward: -168.50
Episode 7000, Average Reward: -26.00
Episode 7500, Average Reward: -84.86
Episode 8000, Average Reward: -157.68
Episode 8500, Average Reward: -57.23
Episode 9000, Average Reward: -150.52
Episode 9500, Average Reward: -250.68
Episode 10000, Average Reward: -228.42




In [19]:
# Test AFTER training to show improvement
print("AFTER TRAINING (Learned Strategy):")
test_agent(env, agent, episodes=2)  # Show same number of episodes for comparison

AFTER TRAINING (Learned Strategy):

Episode 1
Hour 1: Battery Level: 30.0%, Price: $61.14, Action: Discharge
Hour 2: Battery Level: 58.5%, Price: $63.96, Action: Charge
Hour 3: Battery Level: 87.0%, Price: $79.85, Action: Charge
Hour 4: Battery Level: 87.0%, Price: $87.49, Action: Idle
Hour 5: Battery Level: 100.0%, Price: $72.92, Action: Charge
Hour 6: Battery Level: 70.0%, Price: $80.29, Action: Discharge
Hour 7: Battery Level: 40.0%, Price: $83.48, Action: Discharge
Hour 8: Battery Level: 10.0%, Price: $78.64, Action: Discharge
Hour 9: Battery Level: 0.0%, Price: $76.59, Action: Discharge
Hour 10: Battery Level: 0.0%, Price: $63.92, Action: Discharge
Hour 11: Battery Level: 0.0%, Price: $54.29, Action: Discharge
Hour 12: Battery Level: 0.0%, Price: $53.19, Action: Discharge
Hour 13: Battery Level: 0.0%, Price: $42.91, Action: Discharge
Hour 14: Battery Level: 0.0%, Price: $35.56, Action: Idle
Hour 15: Battery Level: 28.5%, Price: $28.37, Action: Charge
Hour 16: Battery Level: 0.0%, 

## Comments

# prompt: why do you think the Env is choosing to discharge to 0% and keeps the reward too low?

The environment's tendency to discharge to 0% and maintain low rewards stems from a combination of factors in your reward structure and the agent's learning process.  Let's break down the likely culprits:

1. **Reward Structure Imbalance:**

   * **High Penalty for Boundaries:** The `boundary_penalty` of -1000 for hitting 0% or 100% is extremely severe.  This overwhelms other rewards. The agent learns very quickly to avoid these states *at all costs*, even if it means sacrificing potential profit.  It's prioritizing avoiding the massive penalty over maximizing profit.

   * **Trading Reward is too low:**  The `trading_reward`  might not be significant enough to outweigh the huge boundary penalty, especially in early stages of training. The agent's exploration might lead it to situations where small trading profits are overshadowed by the risk of hitting the boundaries.


2. **Exploration-Exploitation Dilemma:**

    * **Insufficient Exploration:** If the agent's exploration rate (`epsilon`) is too low, it might get stuck in a local optimum where discharging is a "safe" strategy, even if it's not the most profitable one.  Early on, it needs to explore the state space more thoroughly to find better strategies.

    * **Epsilon Decay:** Ensure your agent's `epsilon` decays gradually during training.  A constant epsilon might prevent the agent from exploiting its learned knowledge.

3. **Learning Rate and Discount Factor:**

    * **Learning Rate:** The learning rate (`lr`) might be too high or too low. Too high and the agent might oscillate and not converge. Too low and the agent might learn very slowly.
    * **Discount Factor:** The discount factor (`gamma`) is related to how much the agent values future rewards. A very high gamma might lead the agent to focus heavily on long-term rewards, potentially making it less sensitive to immediate penalties.


**Debugging and Improvement Strategies:**

1. **Reward Engineering:**
   * **Scale Down the Boundary Penalty:** Reduce `boundary_penalty` to something less drastic, perhaps -10 or -50.  The goal is to discourage boundary hits, not to make them catastrophic.
   * **Scale Up the Trading Reward:** Increase the magnitude of `trading_reward` so that the agent sees significant gains when making profitable energy trading decisions.  You might need to experiment with scaling factors or modifying how the trading reward is calculated.
   * **Smooth Boundary Penalty (Optional):** Instead of a sharp penalty, consider a quadratic penalty that increases as the charge level gets close to the boundaries.  This would create a smoother incentive for staying within the acceptable range.

2. **Exploration:**
   * **Higher Initial Epsilon:** Start with a higher `epsilon` (e.g., 0.5 or 0.7) to encourage more exploration in the initial training stages.
   * **Decaying Epsilon:** Implement an appropriate decay schedule for `epsilon`, so it gradually decreases over the training episodes, allowing the agent to exploit learned policies later on.

3. **Parameter Tuning:**
   * **Learning Rate:** Experiment with smaller values of `lr` (e.g., 0.01) to stabilize learning.
   * **Discount Factor:** A moderate discount factor (e.g., 0.95) is often a good starting point.


4. **State Discretization:**
    * You use `np.round(state, 1)` in your agent.  The granularity of this rounding impacts the discretization of your state space.  Consider reducing the rounding value or using another discretization technique if this is causing issues.

5. **Printing intermediate states:** Add printing to show the different components of your reward function during training, which would help in visualizing if your rewards are behaving correctly.

6. **Debug Initial State and Pricing:** Verify that the initial price and state of the battery are favorable, and that prices are indeed fluctuating.

7. **Alternative RL Algorithms:**  You could test other RL algorithms like DQN or SARSA.  They might find better strategies.



By carefully adjusting these parameters, you should be able to improve the agent's learning and prevent it from constantly discharging to 0%. Remember to test different combinations of values until you find what works well for your environment.


Added a target operating range (40-80% of capacity)
Created multiple reward components:

Scaled down the energy arbitrage reward
Added quadratic penalty for being outside target range
Added positive reward for staying within target range
Added severe penalty for complete discharge/overcharge


Made rewards more balanced relative to each other
Start battery at 60% charge instead of 50%
Prevented impossible actions (can't discharge empty battery or charge full battery)

In [None]:
def step(self, action):
    # ... existing code ...

    # Scale trading reward down
    trading_reward = -charge_amount * current_price * 0.1

    # Progressive boundary penalty
    if new_charge <= 0:
        boundary_penalty = -500
    elif new_charge >= self.capacity:
        boundary_penalty = -500
    elif new_charge < 0.2 * self.capacity:
        boundary_penalty = -100 * (0.2 - new_charge/self.capacity)
    elif new_charge > 0.8 * self.capacity:
        boundary_penalty = -100 * (new_charge/self.capacity - 0.8)
    else:
        boundary_penalty = 0

    # Larger range reward
    if 0.4 <= new_charge/self.capacity <= 0.8:
        range_reward = 200
    else:
        range_reward = 0

    reward = trading_reward + boundary_penalty + range_reward