In [None]:
## Import required libraries

import os 
import torch
import torch.nn as nn
import torch.nn.functional as F 
import numpy as np

import warnings
from torch.distributions.categorical import Categorical
import matplotlib.pyplot as plt
from numpy import *
import numpy as np


from solver_sdmd_torch_gpu2 import KoopmanNNTorch, KoopmanSolverTorch
from sde_coefficients_estimator import SDECoefficientEstimator
from numpy import random as rrr
from time import time
from koopman_bandit import KoopmanBandit
import joblib
from collections import namedtuple, deque
import math
import random as rnd
from scipy import stats
from tqdm import tqdm
from numpy import linalg as la
import time
from fhn_system_2d import simulate_trajectory
warnings.simplefilter("ignore")

In [None]:
# Report the PyTorch / CUDA setup and pick the compute device used by the
# Koopman dictionary network (see do_koopman_step).
print (torch.__version__, torch.cuda.is_available())
print(torch.version.cuda)
# torch.cuda.get_device_name() raises when no CUDA device is usable, so it is
# only queried when CUDA is available; otherwise fall back to the CPU, which is
# the same fallback ActorNwk / CriticNwk apply for their own device.
if torch.cuda.is_available():
    print (torch.cuda.get_device_name())
    device= 'cuda'
else:
    device= 'cpu'

In [None]:
# Model coefficients. get_single_trajectory forwards beta, delta, epsilon,
# a1, b1, b2, DX and DY to simulate_trajectory; gamma is defined here but is
# not among the arguments that function passes on.
gamma, beta = 1, 1
delta, epsilon = 0.25, 0.05

a1, b1, b2 = 1/3, 0.5, 0

# Values handed over as the DX / DY arguments (0.2 was tried previously).
DX = DY = 0.05**2

In [None]:
def get_single_trajectory(x0, y0, T):
    """
    Simulate one SDE trajectory that starts at (x0, y0).

    Thin wrapper around ``simulate_trajectory`` (imported from
    ``fhn_system_2d``): it fixes ``h=1e-4`` and ``n_steps=100`` and fills in
    the model coefficients from the module-level constants.

    Args:
        x0: Initial x-coordinate.
        y0: Initial y-coordinate.
        T: Forwarded unchanged as the third positional argument of
           ``simulate_trajectory``.

    Returns:
        Whatever ``simulate_trajectory`` returns; callers in this notebook
        unpack it as ``(data_matrix_single, lag_time)``.
    """
    model_coefficients = dict(beta=beta, delta=delta, epsilon=epsilon,
                              a1=a1, b1=b1, b2=b2, DX=DX, DY=DY)
    return simulate_trajectory(x0, y0, T, h=1e-4, n_steps=100, **model_coefficients)

In [None]:
def format_time(seconds):
    """
    Render a duration given in seconds as ``HHh:MMm:SSs``.

    Each field is truncated to an integer and zero-padded to at least two
    digits; the hour field is not capped, so long durations widen it.
    """
    whole_hours = int(seconds // 3600)
    whole_minutes = int(seconds % 3600 // 60)
    leftover_seconds = int(seconds % 60)
    return f"{whole_hours:02d}h:{whole_minutes:02d}m:{leftover_seconds:02d}s"

In [None]:
def get_ee_reward (trajectory,next_initial_point):
    """
    Reward that is larger where the visited-state density is lower.

    A Gaussian KDE is fitted to ``trajectory.squeeze().T``. The density at
    ``next_initial_point`` is divided by the mean density over the fitted
    samples, and the reward is ``1 / (normalized_density + 0.1)``.

    Args:
        trajectory: Array whose ``squeeze().T`` is a valid ``gaussian_kde``
            dataset (one row per dimension, one column per sample).
        next_initial_point: Point at which the fitted density is evaluated.

    Returns:
        The array produced by evaluating the KDE, transformed as described
        above (so it keeps the KDE's output shape).
    """
    ee_epsilon = 0.1
    samples = trajectory.squeeze().T
    density = stats.gaussian_kde(samples)
    average_density = np.mean(density(samples))
    relative_density = density(next_initial_point) / average_density
    return 1.0 / (relative_density + ee_epsilon)

def do_koopman_step (state, action, k_grid= 4, state_len= 10, chunk_len= 100):
    """
    Perform one environment step: simulate data, fit a Koopman model, score it.

    The integer ``action`` selects a cell of a ``k_grid`` x ``k_grid`` grid over
    x in [-3, 3] and y in [-4, 4]; a new initial point is drawn uniformly from
    that cell. Trajectories are simulated from the first ``state_len - 1``
    columns of ``state`` and from the new point, a Koopman solver is trained on
    all of them, and the reward combines the eigenfunction consistency error
    with the density-based reward returned by ``get_ee_reward``.

    Reads the module-level names ``system`` (used in file names) and ``device``;
    both must be defined before this function is called.

    Args:
        state: Indexed as ``state[0, ii]`` / ``state[1, ii]`` for the x / y
            coordinate of the ii-th stored initial point.
        action: Integer index into the flattened grid
            (``action // k_grid`` -> x cell, ``action % k_grid`` -> y cell).
        k_grid: Number of grid cells per axis.
        state_len: Controls how many columns of ``state`` are simulated
            (``state_len - 1``). NOTE(review): this is the argument value, not
            ``state.shape[1]`` -- confirm callers pass a matching value.
        chunk_len: Passed to ``get_single_trajectory`` as ``T``.

    Returns:
        ``(next_state, reward)`` where ``next_state`` is ``state`` with its
        first column dropped and the new initial point appended as last column,
        and ``reward = 16 - consistency + 0.15 * ee_reward``.
    """
    chunk_list= []
    T= chunk_len
    # Decode the flat action index into (x-cell, y-cell) grid coordinates.
    action_x= action//k_grid
    action_y= action%k_grid
    # One simulated trajectory per stored initial point (first state_len-1 columns).
    for ii in arange(state_len-1):
        
        x0_single= state[0, ii]
        y0_single=  state[1, ii]
   
        data_matrix_single, lag_time= get_single_trajectory (x0_single, y0_single, T)
        chunk_list.append (data_matrix_single)
    # Cell borders of the action grid; the new initial point is drawn uniformly
    # inside the cell selected by the action.
    x_borders= linspace (-3,3, k_grid+1)
    y_borders= linspace (-4,4, k_grid+1)
    x_lo= x_borders[action_x]
    x_hi= x_borders[action_x+1]
    y_lo= y_borders[action_y]
    y_hi= y_borders[action_y+1]
    x0_single= rrr.uniform (x_lo, x_hi)
    y0_single= rrr.uniform (y_lo, y_hi)
    data_matrix_single, lag_time= get_single_trajectory (x0_single, y0_single, T)
    # `trajectory` holds only the chunks simulated from `state`; it is stacked
    # BEFORE the new chunk is appended, so the density reward below measures the
    # new point against previously generated data only.
    trajectory= hstack (chunk_list)
    chunk_list.append (data_matrix_single)
    # From here on data_matrix_single is the stack of ALL chunks (old + new).
    data_matrix_single= hstack (chunk_list)
    ee_reward= get_ee_reward (trajectory, [x0_single, y0_single])
    # Build one-step pairs by shifting along the second axis of the 3-D data matrix.
    # (presumably axis 1 is time and axis 2 the state dimension -- TODO confirm
    # against simulate_trajectory)
    data_X = data_matrix_single[:, :-1, :]
    data_Y = data_matrix_single[:, 1:, :]
    print(f"Shape of data_X: {data_X.shape}")
    print(f"Shape of data_Y: {data_Y.shape}")
    
    # Flatten the leading axes so that every row is one sample.
    X = data_X.reshape(-1, data_X.shape[2])  # 2D features
    Y = data_Y.reshape(-1, data_X.shape[2])  # 2D targets
    print(f"Shape of X: {X.shape}")
    print(f"Shape of Y: {Y.shape}")
    
    # Sequential (unshuffled) 70/30 train/validation split.
    # NOTE(review): the row at index int(0.7*len_all) falls in neither part
    # because the validation slice starts one past it.
    len_all = X.shape[0]
    data_x_train = X[:int(0.7*len_all)]
    data_x_valid = X[int(0.7*len_all)+1:]
    
    data_y_train = Y[:int(0.7*len_all)]
    data_y_valid = Y[int(0.7*len_all)+1:]
    
    data_train = [data_x_train, data_y_train]
    data_valid = [data_x_valid, data_y_valid]
    
    print(data_x_train.shape)
    
    # File names depend on the module-level `system` string.
    checkpoint_file= f'example_{system}_dqn_ckpt3.torch'
    basis_function = KoopmanNNTorch(input_size= 2, layer_sizes=[20], n_psi_train=17).to(device)  # basis number would be 20
    
    
    solver = KoopmanSolverTorch(dic=basis_function, # Replace 'koopman_nn' by 'dic' if you use the original solver_edmdvar
                           target_dim=np.shape(data_x_train)[-1],
                                                       reg=0.1,  checkpoint_file= checkpoint_file, fnn_checkpoint_file= f'example_{system}_fnn_dqn1.torch', 
                                a_b_file= f'sde_coefficients_example_{system}_dqn1.jbl', 
                            generator_batch_size= 2, fnn_batch_size= 32, delta_t= lag_time)
    # A fresh dictionary network and solver are built and trained on every call.
    solver.build_with_generator(
    data_train=data_train,
    data_valid=data_valid,
    epochs=4,
    batch_size=256,
    lr=1e-5,
    log_interval=10,
    lr_decay_factor=.8
    )
    # Consistency error: sum over eigenpairs of ||phi(Y) - lambda * phi(X)||^2.
    # solver.eigenfunctions is re-evaluated on X and on Y in every iteration.
    consistency= 0
    for ii in arange (len (solver.eigenvalues)):
        phi_x= solver.eigenfunctions (X)[ii]
        phi_y=  solver.eigenfunctions (Y)[ii]
        lmbd=  solver.eigenvalues[ii]
        consistency= consistency + la.norm (phi_y- lmbd*phi_x)**2
    print (consistency)
    # Lower consistency error and a rarer starting region both raise the reward.
    # ee_reward is whatever get_ee_reward returns, so `reward` inherits its type.
    reward= 16- consistency+  0.15*ee_reward
    # Sliding window over initial points: drop the first column, append the new point.
    next_state= hstack ([state[:, 1:], expand_dims (array ([x0_single, y0_single]), axis= 1)])
    return next_state, reward
        
    


In [None]:
############################# Data Store ####################################################
class PPOMemory():
    """
    Rollout storage for PPO.

    Keeps six parallel lists (one entry per stored transition) and returns
    them as NumPy arrays together with shuffled mini-batch index arrays.
    """

    def __init__(self, batch_size):
        self.batch_size = batch_size
        self.clear_memory()

    def generate_batches(self):
        """
        Return the stored data plus a random partition into mini-batches.

        Returns:
            ``(states, actions, action_probs, vals, rewards, dones, batches)``.
            The first six are ``np.array`` views of the stored lists; ``batches``
            is a list of int64 index arrays of length ``batch_size`` (the last
            one may be shorter) that together cover every stored index once.
        """
        n_states = len(self.states)
        # Shuffle all indices once, then cut them into consecutive slices.
        shuffled = np.arange(n_states, dtype=np.int64)
        np.random.shuffle(shuffled)
        batches = [
            shuffled[start:start + self.batch_size]
            for start in range(0, n_states, self.batch_size)
        ]
        stored = (
            self.states,
            self.actions,
            self.action_probs,
            self.vals,
            self.rewards,
            self.dones,
        )
        return tuple(np.array(column) for column in stored) + (batches,)

    def store_memory(self,state,action,action_prob,val,reward,done):
        """Append one transition; ``state`` is stored flattened via ``ravel()``."""
        record = (
            (self.states, state.ravel()),
            (self.actions, action),
            (self.action_probs, action_prob),
            (self.rewards, reward),
            (self.vals, val),
            (self.dones, done),
        )
        for buffer, item in record:
            buffer.append(item)

    def clear_memory(self):
        """Reset every per-transition list to empty (``batch_size`` is kept)."""
        (self.states, self.actions, self.action_probs,
         self.rewards, self.vals, self.dones) = ([] for _ in range(6))


In [None]:

## initialize actor network and critic network


class ActorNwk(nn.Module):
    """
    Policy network: two hidden ReLU layers followed by a softmax over actions.

    Args:
        input_dim: Tuple whose single entry is the input feature count
            (it is unpacked into ``nn.Linear``).
        out_dim: Number of discrete actions.
        adam_lr: Learning rate of the Adam optimizer owned by this network.
        chekpoint_file: Path used by ``save_checkpoint`` / ``load_checkpoint``.
        hidden1_dim: Width of the first hidden layer.
        hidden2_dim: Width of the second hidden layer.
    """
    def __init__(self,input_dim,out_dim,
                 adam_lr,
                 chekpoint_file,
                 hidden1_dim=256,
                 hidden2_dim=256
                 ):
        super().__init__()

        self.actor_nwk = nn.Sequential(
            nn.Linear(*input_dim,hidden1_dim),
            nn.ReLU(),
            nn.Linear(hidden1_dim,hidden2_dim),
            nn.ReLU(),
            nn.Linear(hidden2_dim,out_dim),
            nn.Softmax(dim=-1)
        )

        self.checkpoint_file = chekpoint_file
        self.optimizer = torch.optim.Adam(params=self.actor_nwk.parameters(),lr=adam_lr)

        # Use the first CUDA device when available, otherwise run on the CPU.
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self,state):
        """Return a ``Categorical`` distribution over actions for ``state``."""
        action_probs = self.actor_nwk(state)
        return Categorical(action_probs)

    def save_checkpoint(self):
        """Write this module's ``state_dict`` to ``self.checkpoint_file``."""
        torch.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Restore the ``state_dict`` stored at ``self.checkpoint_file``."""
        # map_location remaps the stored tensors onto this network's device, so
        # a checkpoint written on a GPU machine also loads on a CPU-only host.
        self.load_state_dict(torch.load(self.checkpoint_file, map_location=self.device))

############################### Critic Network ######################################

class CriticNwk(nn.Module):
    """
    State-value network: two hidden ReLU layers and a single linear output.

    Args:
        input_dim: Tuple whose single entry is the input feature count
            (it is unpacked into ``nn.Linear``).
        adam_lr: Learning rate of the Adam optimizer owned by this network.
        chekpoint_file: Path used by ``save_checkpoint`` / ``load_checkpoint``.
        hidden1_dim: Width of the first hidden layer.
        hidden2_dim: Width of the second hidden layer.
    """
    def __init__(self,input_dim,
                 adam_lr,
                 chekpoint_file,
                 hidden1_dim=256,
                 hidden2_dim=256
                 ):
        super().__init__()

        self.critic_nwk = nn.Sequential(
            nn.Linear(*input_dim,hidden1_dim),
            nn.ReLU(),
            nn.Linear(hidden1_dim,hidden2_dim),
            nn.ReLU(),
            nn.Linear(hidden2_dim,1),
        )

        self.checkpoint_file = chekpoint_file
        self.optimizer = torch.optim.Adam(params=self.critic_nwk.parameters(),lr=adam_lr)

        # Use the first CUDA device when available, otherwise run on the CPU.
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self,state):
        """Return the value estimate for ``state`` (last dimension has size 1)."""
        return self.critic_nwk(state)

    def save_checkpoint(self):
        """Write this module's ``state_dict`` to ``self.checkpoint_file``."""
        torch.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Restore the ``state_dict`` stored at ``self.checkpoint_file``."""
        # map_location remaps the stored tensors onto this network's device, so
        # a checkpoint written on a GPU machine also loads on a CPU-only host.
        self.load_state_dict(torch.load(self.checkpoint_file, map_location=self.device))

In [None]:
## Initialize an Agent that will be able to train the model

############################# Agent ########################################

## agent

class Agent():
    """
    PPO agent that owns the actor, the critic and the rollout memory.

    Args:
        gamma: Discount factor.
        policy_clip: PPO clipping range (the ratio is clamped to
            ``[1 - policy_clip, 1 + policy_clip]``).
        lamda: GAE smoothing parameter (lambda).
        adam_lr: Learning rate shared by the actor and critic optimizers.
        n_epochs: Number of passes over the stored rollout per ``learn`` call.
        batch_size: Mini-batch size used by the rollout memory.
        state_dim: Tuple with the flattened observation size.
        action_dim: Number of discrete actions.
    """
    def __init__(self, gamma, policy_clip,lamda, adam_lr,
                 n_epochs, batch_size, state_dim, action_dim):

        self.gamma = gamma
        self.policy_clip = policy_clip
        self.lamda  = lamda
        self.n_epochs = n_epochs

        # The checkpoint paths (including the 'ctitic' spelling) are kept
        # verbatim so that previously saved checkpoints can still be loaded.
        self.actor = ActorNwk(input_dim=state_dim,out_dim=action_dim,adam_lr=adam_lr,chekpoint_file='tmp/actor')
        self.critic = CriticNwk(input_dim=state_dim,adam_lr=adam_lr,chekpoint_file='tmp/ctitic')
        self.memory = PPOMemory(batch_size)

    def _network_dtype(self):
        """Floating dtype of the actor's parameters; inputs are cast to it."""
        return next(self.actor.parameters()).dtype

    def store_data(self,state,action,action_prob,val,reward,done):
        """Append one transition to the rollout memory."""
        self.memory.store_memory(state,action,action_prob,val,reward,done)

    def save_models(self):
        """Save the actor and critic checkpoints."""
        print('... Saving Models ......')
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()

    def load_models(self):
        """Load the actor and critic checkpoints."""
        print('... Loading models ...')
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()

    def choose_action(self, state):
        """
        Sample an action for a single flattened observation.

        Returns:
            ``(action, log_prob, value)`` as Python scalars: the sampled action
            index, its log-probability under the current policy and the
            critic's value estimate.
        """
        # Cast to the networks' own parameter dtype instead of a hard-coded
        # float64, so the call works whatever the torch default dtype is.
        state = torch.as_tensor(np.asarray(state), dtype=self._network_dtype(),
                                device=self.actor.device).unsqueeze(0)

        # Pure inference: no autograd graph is needed for acting.
        with torch.no_grad():
            dist = self.actor(state)
            ## sample the output action from a categorical distribution of predicted actions
            action = dist.sample()
            log_prob = dist.log_prob(action)
            ## value from critic model
            value = self.critic(state)

        return action.item(), log_prob.item(), value.item()

    def calculate_advanatage(self,reward_arr,value_arr,dones_arr):
        """
        Generalized Advantage Estimation over the stored rollout.

        With ``delta_t = r_t + gamma * V_{t+1} * (1 - done_t) - V_t`` the
        advantage is ``A_t = delta_t + gamma * lamda * (1 - done_t) * A_{t+1}``,
        evaluated backwards in a single pass. The last stored step has no
        successor value, so its advantage stays 0.

        Args:
            reward_arr: Rewards, one per step (a trailing singleton axis is
                accepted and flattened).
            value_arr: Critic value estimates, one per step.
            dones_arr: Episode-termination flags, one per step.

        Returns:
            float32 tensor of length ``len(reward_arr)`` on the actor's device.
        """
        time_steps = len(reward_arr)
        rewards = np.asarray(reward_arr, dtype=np.float64).reshape(time_steps)
        values = np.asarray(value_arr, dtype=np.float64).reshape(time_steps)
        not_done = 1.0 - np.asarray(dones_arr, dtype=np.float64).reshape(time_steps)

        advantage = np.zeros(time_steps, dtype=np.float32)
        running_advantage = 0.0
        for t in range(time_steps - 2, -1, -1):
            delta_t = rewards[t] + self.gamma * values[t + 1] * not_done[t] - values[t]
            # Add the discounted tail to the current TD residual; a terminal
            # step cuts both the bootstrap value and the accumulated tail.
            running_advantage = delta_t + self.gamma * self.lamda * not_done[t] * running_advantage
            advantage[t] = running_advantage
        return torch.tensor(advantage).to(self.actor.device)

    def learn(self):
        """
        Run ``n_epochs`` of clipped-PPO updates on the stored rollout, then
        clear the memory. Mini-batches are reshuffled at every epoch.
        """
        net_dtype = self._network_dtype()
        for _ in range(self.n_epochs):

            state_arr, action_arr, old_prob_arr, value_arr,\
            reward_arr, dones_arr, batches = \
                    self.memory.generate_batches()

            advantage_arr = self.calculate_advanatage(reward_arr,value_arr,dones_arr)
            values = torch.tensor(value_arr).to(self.actor.device)

            for batch in batches:
                states = torch.tensor(state_arr[batch], dtype=net_dtype).to(self.actor.device)
                old_probs = torch.tensor(old_prob_arr[batch]).to(self.actor.device)
                actions = torch.tensor(action_arr[batch]).to(self.actor.device)

                dist = self.actor(states)
                # Drop only the trailing size-1 axis so that a batch holding a
                # single sample keeps its batch dimension.
                critic_value = self.critic(states).squeeze(-1)

                new_probs = dist.log_prob(actions)
                # Ratio pi_new / pi_old computed in log space for stability.
                prob_ratio = (new_probs - old_probs).exp()
                weighted_probs = advantage_arr[batch] * prob_ratio
                weighted_clipped_probs = torch.clamp(prob_ratio, 1-self.policy_clip,
                        1+self.policy_clip)*advantage_arr[batch]
                actor_loss = -torch.min(weighted_probs, weighted_clipped_probs).mean()

                # Critic target: advantage plus the value recorded at rollout time.
                returns = advantage_arr[batch] + values[batch]
                critic_loss = ((returns-critic_value)**2).mean()

                total_loss = actor_loss + 0.5*critic_loss
                self.actor.optimizer.zero_grad()
                self.critic.optimizer.zero_grad()
                total_loss.backward()
                self.actor.optimizer.step()
                self.critic.optimizer.step()

        self.memory.clear_memory()

In [None]:
# Run configuration.
system = 'fhn'    # tag embedded in checkpoint / history / status file names

k_grid = 32       # grid cells per axis of the action grid
state_len = 12    # number of initial points kept in the state
chunk_len = 100

# One discrete action per grid cell.
n_actions = k_grid * k_grid
# Each stored point contributes an x and a y coordinate to the observation.
n_observations = 2 * state_len

In [None]:
# Agent writes its checkpoints to 'tmp/actor' and 'tmp/ctitic', so the
# directory has to exist. exist_ok=True replaces the separate existence check.
os.makedirs('tmp', exist_ok=True)


N = 20            # the training loop calls agent.learn() every N steps
batch_size = 5
n_epochs = 4
alpha = 0.0003    # handed to Agent as adam_lr
agent = Agent(state_dim=(n_observations, ),
              action_dim=n_actions,
              batch_size=batch_size,
              n_epochs=n_epochs,
              policy_clip=0.2,
              gamma=0.99,lamda=0.95,
              adam_lr=alpha)

In [None]:

# PPO training loop: at every step the agent picks a grid cell, do_koopman_step
# simulates data and fits a Koopman model, and the resulting reward is stored.
best_score = 0 # env.unwrapped.reward_range[0]

learn_iters = 0
avg_score = 0
total_score = 0   # sum of episode scores, used for avg_score
num_steps= 10001
num_episodes= 1
step_count = 0
reward_hist= []
start_time= time.time()
for i_episode in range(num_episodes):
    # 3 * (U[0, 1) - 1) draws every coordinate uniformly from [-3, 0).
    # NOTE(review): confirm that a one-sided range is intended here.
    initial_state = 3 * (np.random.uniform(size=(2, state_len)) - 1)
    current_state = initial_state
    # There is no terminal condition: every transition is stored with done=False.
    done = False
    score = 0
    for t in range(num_steps):
        elapsed_time = time.time() - start_time

        # Estimate the remaining time from the average duration of finished steps.
        if t > 0:
            avg_time_per_step = elapsed_time / t
            estimated_time_remaining = avg_time_per_step * (num_steps - t)
        else:
            estimated_time_remaining = 0

        action, prob, val = agent.choose_action(current_state.ravel())
        # Pass state_len / chunk_len explicitly: the function's own defaults
        # (10 / 100) would otherwise be used regardless of the configuration,
        # and the state built above has state_len columns.
        next_state, reward = do_koopman_step(current_state, action, k_grid=k_grid,
                                             state_len=state_len, chunk_len=chunk_len)
        step_count += 1
        score += reward
        agent.store_data(current_state, action, prob, val, reward, done)
        reward_hist.append([current_state, action, next_state, reward])
        if step_count % N == 0:
            agent.learn()
            learn_iters += 1
        current_state = next_state

        # Overwrite the status file with the latest progress.
        with open(f'ppo_state_{system}_test.txt', 'w') as f:
            f.write(f"System: {system}\n")
            f.write(f"Current RL step: {int(t)}/{num_steps} ({(t/num_steps)*100:.1f}%)\n")
            f.write(f"Elapsed time: {format_time(elapsed_time)}\n")
            f.write(f"Estimated time remaining: {format_time(estimated_time_remaining)}\n")

        # Periodically save results
        if (t % 100 == 0):
            joblib.dump(
                reward_hist,
                f'ppo_history{k_grid}x{k_grid}_{num_steps}steps_example_{system}_test.jbl'
            )
            print(f"Step {t}/{num_steps} completed. System: {system}. Elapsed: {format_time(elapsed_time)}")

    # Mean episode score so far (previously avg_score was printed but never updated).
    total_score = total_score + score
    avg_score = total_score / (i_episode + 1)

    agent.save_models()
    print('episode', i_episode, 'score %.1f' % score, 'avg score %.1f' % avg_score,
            'time_steps', step_count, 'learning_steps', learn_iters)
    
    

In [None]:
# Persist the complete reward history once training has finished.
filename = f'ppo_history{k_grid}x{k_grid}_{num_steps}steps_example_{system}_final.jbl'
joblib.dump(reward_hist, filename)