# In [1]:
import numpy as np
from scipy.integrate import solve_ivp
import synthetic_data
import tensorflow as tf

class TrapEnvironment:
    """Grid environment in which traps are placed against a predator population.

    The state is a pair of density arrays, ``predator_density`` and
    ``prey_density``.  ``step`` writes them with shape ``(n, m, T)`` (time on
    the last axis); ``reset`` is assumed to produce the same layout from
    ``synthetic_data.generate()`` -- TODO confirm against that module.
    """

    def __init__(self, num_traps=20):
        """Set grid and simulation parameters, then load an initial state.

        Args:
            num_traps: Stored on the instance; no method of this class reads it.
        """
        self.num_traps = num_traps
        self.n = 25  # size of the first grid axis
        self.m = 25  # size of the second grid axis
        self.predator_density = None
        self.prey_density = None
        # Number of unit-time simulation windows integrated per call to step().
        self.trap_replacement_rate = 10
        # Number of time samples evaluated inside each unit-time window.
        self.pts_per_sec = 100
        self.reset()

    def reset(self):
        """Regenerate predator and prey densities and return them.

        Returns:
            Tuple ``(predator_density, prey_density)``.
        """
        # generate() is called with its defaults; per the original note these
        # are num_traj=200, len_traj=50, pts_per_sec=100,
        # save_loc='../Data/val.npy', prey_range=(1, 5), predator_range=(1, 3).
        data_init = synthetic_data.generate()
        # Index 0 on the third axis is taken as prey, index 1 as predator.
        self.prey_density, self.predator_density = data_init[:, :, 0, :], data_init[:, :, 1, :]
        return self.predator_density, self.prey_density

    def _trap_cells(self, action):
        """Normalise an action into a list of ``(i, j)`` grid cells.

        A bare integer (or each integer inside an iterable) is read as a
        row-major flat index into the ``n x m`` grid, matching the layout
        produced by ``ndarray.flatten``.  ``(i, j)`` pairs pass through as-is.
        """
        if isinstance(action, (int, np.integer)):
            action = [action]
        cells = []
        for item in action:
            if isinstance(item, (int, np.integer)):
                cells.append(divmod(int(item), self.m))
            else:
                i, j = item
                cells.append((i, j))
        return cells

    def step(self, action):
        """Place traps, simulate the system forward and return the new state.

        Args:
            action: Trap locations, either as an iterable of ``(i, j)`` pairs
                or as flat cell indices (a single integer or an iterable of
                integers).

        Returns:
            Tuple ``(predator_density, prey_density, reward, done)``.  The
            densities have shape ``(n, m, pts_per_sec * trap_replacement_rate)``,
            ``reward`` is the negative sum of the predator density over the
            simulated trajectory, and ``done`` is always ``False`` (the
            environment defines no termination condition).
        """
        cells = self._trap_cells(action)

        # Trap-system state: channel 0 carries predators, channel 1 carries
        # traps.  Start from the most recent predator frame so the traps act
        # on the current population rather than on an empty grid.
        trap_state = np.zeros((self.n, self.m, 2))
        trap_state[:, :, 0] = self.predator_density[:, :, -1]
        for i, j in cells:
            trap_state[i, j, 1] = 10  # trap strength placed in each chosen cell
        y0 = trap_state.flatten()

        # The prey solver needs a prey field from the very first window, so
        # seed it from the current state as well.
        prey_now = self.prey_density[:, :, -1]

        t_eval = np.linspace(0, 1, self.pts_per_sec)
        grid = (self.n, self.m)
        frame_shape = (self.n, self.m, 2, self.pts_per_sec)
        segments = []

        for _ in range(self.trap_replacement_rate):
            # 1) Predator/trap interaction over one unit-time window.
            sol = solve_ivp(synthetic_data.spatial_dynamics_traps, y0=y0, t_span=[0, 1], t_eval=t_eval, args=grid)
            trap_run = sol.y.reshape(frame_shape)
            predator_after_traps = trap_run[:, :, 0, -1]
            traps_after = trap_run[:, :, 1, -1]

            # 2) Prey/predator interaction: channel 0 = prey, channel 1 = predator.
            y_prey = np.zeros((self.n, self.m, 2))
            y_prey[:, :, 0] = prey_now
            y_prey[:, :, 1] = predator_after_traps
            sol_prey = solve_ivp(synthetic_data.spatial_dynamics, y0=y_prey.flatten(), t_span=[0, 1], t_eval=t_eval, args=grid)
            prey_run = sol_prey.y.reshape(frame_shape)
            prey_now = prey_run[:, :, 0, -1]

            # 3) Feed the final predator frame and the surviving traps into
            #    the next trap-solver window.
            next_trap_state = np.zeros((self.n, self.m, 2))
            next_trap_state[:, :, 0] = prey_run[:, :, 1, -1]
            next_trap_state[:, :, 1] = traps_after
            y0 = next_trap_state.flatten()

            # Collect windows in a list and join once at the end; this avoids
            # both an uninitialised seed array and repeated re-copying.
            segments.append(sol_prey.y)

        if segments:
            trajectory = np.concatenate(segments, axis=1)
        else:
            trajectory = np.empty((self.n * self.m * 2, 0))
        trajectory = trajectory.reshape(self.n, self.m, 2, -1)

        # Extract the prey and predator densities from the joined solution.
        self.prey_density, self.predator_density = trajectory[:, :, 0, :], trajectory[:, :, 1, :]

        # Reward is the negative total predator density over the trajectory.
        reward = -np.sum(self.predator_density)

        # No termination condition is defined, so done is always False.
        return self.predator_density, self.prey_density, reward, False

# Define the DQN
class QNetwork(tf.keras.Model):
    """Small fully connected network mapping a state to one Q-value per action.

    Inputs are flattened per sample before the dense layers, so the output is
    always ``(batch, num_actions)`` regardless of how many feature axes the
    state has.  For inputs that are already ``(batch, features)`` the flatten
    step leaves them unchanged.
    """

    def __init__(self, num_actions):
        """Build the layers.

        Args:
            num_actions: Number of units in the output layer.
        """
        super().__init__()
        # Dense layers only act on the last axis; flattening first keeps the
        # output at one row of Q-values per sample for multi-axis states.
        self.flatten = tf.keras.layers.Flatten()
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(64, activation='relu')
        self.output_layer = tf.keras.layers.Dense(num_actions)

    def call(self, state):
        """Return the Q-values for a batch of states."""
        x = self.flatten(state)
        x = self.dense1(x)
        x = self.dense2(x)
        return self.output_layer(x)

# Simple Q-learning algorithm with experience replay
class QLearningAgent:
    """Epsilon-greedy Q-learning agent backed by a ``QNetwork`` and a replay buffer."""

    def __init__(self, num_actions, learning_rate=0.001, gamma=0.9, epsilon=0.1, memory_size=10000):
        """Create the network, optimizer and replay buffer.

        Args:
            num_actions: Number of discrete actions (size of the network output).
            learning_rate: Step size passed to the Adam optimizer.
            gamma: Discount factor applied to the bootstrapped next-state value.
            epsilon: Probability of choosing a uniformly random action.
            memory_size: Maximum number of stored transitions; the oldest are
                dropped first.  ``None`` keeps the buffer unbounded.
        """
        # Local import so this class does not depend on the file's import block.
        from collections import deque

        self.num_actions = num_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_network = QNetwork(num_actions)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        # Bounded buffer: a plain list would grow without limit during training.
        self.memory = deque(maxlen=memory_size)

    def select_action(self, state):
        """Pick an action index with an epsilon-greedy policy.

        Returns:
            A Python ``int`` in ``[0, num_actions)`` when exploring, otherwise
            the argmax of the network output for ``state``.
        """
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.num_actions)  # Explore
        q_values = self.q_network(np.array([state], dtype=np.float32))
        return int(np.argmax(q_values[0].numpy()))  # Exploit

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def experience_replay(self, batch_size=32):
        """Run one gradient step on a random minibatch of stored transitions.

        Does nothing until at least ``batch_size`` transitions are stored.
        All sampled states must share one shape so they can be stacked.
        """
        if len(self.memory) < batch_size:
            return

        picks = np.random.choice(len(self.memory), batch_size, replace=False)
        batch = [self.memory[int(idx)] for idx in picks]

        states = np.array([t[0] for t in batch], dtype=np.float32)
        actions = np.array([t[1] for t in batch], dtype=np.int64)
        rewards = np.array([t[2] for t in batch], dtype=np.float32)
        next_states = np.array([t[3] for t in batch], dtype=np.float32)
        dones = np.array([t[4] for t in batch], dtype=bool)

        # Two batched forward passes replace two passes per sample.  np.array
        # copies, so the in-place target edit below cannot touch tensor memory.
        targets = np.array(self.q_network(states).numpy(), dtype=np.float32)
        next_q = self.q_network(next_states).numpy().reshape(batch_size, -1)

        # Terminal transitions use the raw reward; others add the discounted
        # best next-state value.
        bootstrap = np.where(dones, 0.0, self.gamma * next_q.max(axis=1))
        targets[np.arange(batch_size), actions] = rewards + bootstrap

        with tf.GradientTape() as tape:
            q_values = self.q_network(states)
            loss = tf.reduce_mean(tf.square(targets - q_values))

        grads = tape.gradient(loss, self.q_network.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.q_network.trainable_variables))

# Training loop
def _latest_frames_state(predator_density, prey_density):
    """Flatten the last-axis final slice of both densities into one float32 vector.

    Assumes the last axis of each array is time, so the result has a fixed
    length even when trajectories differ in duration -- TODO confirm for the
    arrays returned by ``env.reset()``.
    """
    frames = (np.asarray(predator_density)[..., -1], np.asarray(prey_density)[..., -1])
    return np.concatenate([frame.ravel() for frame in frames]).astype(np.float32)


def train_agent(env, agent, num_episodes=1000, max_steps=100):
    """Train ``agent`` on ``env`` with experience replay.

    Args:
        env: Environment whose ``reset()`` returns ``(predator, prey)`` and
            whose ``step(cells)`` returns ``(predator, prey, reward, done)``;
            it must expose the grid width as ``env.m``.
        agent: Object providing ``select_action``, ``remember`` and
            ``experience_replay``.
        num_episodes: Number of episodes to run.
        max_steps: Upper bound on steps per episode.  Needed because the
            environment may never report ``done``, which would otherwise loop
            forever.
    """
    for episode in range(num_episodes):
        predator, prey = env.reset()
        # Use the same observation encoding after reset and after every step
        # so the network always sees inputs of one shape.
        state = _latest_frames_state(predator, prey)
        total_reward = 0
        for _ in range(max_steps):
            action = agent.select_action(state)
            # The agent emits a flat cell index; the environment takes a list
            # of (i, j) trap locations, so convert row-major.
            predator, prey, reward, done = env.step([divmod(int(action), env.m)])
            next_state = _latest_frames_state(predator, prey)
            agent.remember(state, action, reward, next_state, done)
            agent.experience_replay()
            state = next_state
            total_reward += reward
            if done:
                break
        print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}")

# Build the simulation environment and an agent with one discrete action
# per grid cell (n * m cells).
env = TrapEnvironment()
agent = QLearningAgent(env.n * env.m)

# Run training with the default settings.
train_agent(env=env, agent=agent)