In [1]:
import os 
# Select the GPU: expose only the device with index 1 to CUDA.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

In [2]:
import pandas as pd
import numpy  as np
import gymnasium as gym
from gymnasium import spaces
from sb3_contrib.common.maskable.utils import get_action_masks

class UnintEnv(gym.Env):
    """Gymnasium environment for scheduling an uninterruptible load.

    An episode lasts ``EPISODE_LENGTH`` (96) steps. The agent chooses between
    two actions at every step. Action ``1`` starts a run of ``load_period``
    consecutive "on" steps, but only when no run is in progress and some
    demand remains; once started, a run continues until it is finished
    regardless of the actions chosen meanwhile.

    Observation (float32, 6 values)::

        [time_step, grid_price, remain_demand, remain_period, period_len, preference]

    Per-step reward::

        (-5 * cost + 5 * preference + 20 * end_of_episode_term) / 30

    where ``cost`` and ``preference`` are non-zero only while the load is on
    and ``end_of_episode_term`` is +1 / -1 on the final step depending on
    whether the demand was fully served.

    The price and preference tables are read from ``csv_data/`` relative to
    the current working directory when the environment is constructed.

    NOTE(review): some observations can leave the declared ``Box`` bounds:
    the terminal observation carries ``time_step == 96`` (upper bound 95) and
    the demand re-drawn in training mode can exceed 40. The bounds are left
    unchanged so that previously saved models remain loadable.
    """

    # Number of steps in one episode.
    EPISODE_LENGTH = 96

    def __init__(self, mode='train'):
        """Create the environment and load the price/preference tables.

        Args:
            mode: ``'train'`` randomises user, month, load size and adds noise
                on every ``reset``; any other value makes ``reset`` use the
                arguments it is given.
        """
        super().__init__()
        self.mode = mode  # 'train' or 'test'
        self.time_step = None
        self.execute_price = None
        self.execute_preference = None
        self.execute_remain_power = None
        self.preference_data_index = 0

        # Fixed load parameters of the current episode.
        self.plt_load = None
        self.load_power = None
        self.load_demand = None
        self.load_period = None

        # Load variables that evolve during an episode.
        self.load_switch = None
        self.load_remain_demand = None
        self.load_remain_period = None
        self.load_use_power = None

        # Attributes that step()/reset() assign; defined here so that they
        # can be read safely before the first call.
        self.load_is_on = 0
        self.load_preference = None
        self.reward = None
        self.state = None
        self.id = None
        self.month = None

        # Grid price table (columns 'summer_price' and 'not_summer_price').
        self.price_data = pd.read_csv("csv_data/grid_price.csv")

        # Preference tables keyed by user id; columns are month numbers as strings.
        self.preference_data = {
            1: pd.read_csv("csv_data/unIntPreference1.csv"),
            2: pd.read_csv("csv_data/unIntPreference2.csv"),
        }

        # Observation space ----------------------------------------------------
        # [ time_step, pgrid_price, remain_demand, remain_period, period_len, preference ]
        lower_limit = np.array([0, 0, 0, 0, 0, -4], dtype=np.float32)
        upper_limit = np.array([95, 7, 40, 7, 7, 4], dtype=np.float32)
        self.observation_space = spaces.Box(lower_limit, upper_limit, dtype=np.float32)

        # Action space ---------------------------------------------------------
        # 0: do nothing, 1: request the start of a new run.
        self.action_space = spaces.Discrete(2)

    ###########################################################################
    def step(self, action):
        """Advance the environment by one time step.

        Args:
            action: ``1`` requests the start of a new run; it only takes
                effect when no run is in progress and demand remains.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``truncated`` is always ``False`` and ``info`` is an empty dict.
        """
        # Current state --------------------------------------------------------
        # [ time_step, pgrid_price, remain_demand, remain_period, period_len, preference ]
        time_step, grid_price, state_remain, _, _, preference = self.state
        # The observation is float32; keep the step counter an int so it can be
        # used as an index and in integer arithmetic.
        self.time_step = int(time_step)

        not_enough_penalty = 0

        # Start a new run only when idle and there is still demand to serve.
        if action == 1 and self.load_remain_period == 0 and self.load_remain_demand > 0:
            self.load_remain_period = self.load_period

        if self.load_remain_period > 0:
            # A run is in progress: the load is on for this step.
            self.load_is_on = 1
            self.load_remain_demand -= 1
            self.load_remain_period -= 1
            # Price is scaled by 7 and preference by 4 (their declared upper
            # bounds), then spread over the episode's total demand.
            cost_reward = (grid_price / 7) / self.load_demand
            preference_reward = (preference / 4) / self.load_demand
        else:
            self.load_is_on = 0
            cost_reward = 0
            preference_reward = 0

        self.load_use_power = self.load_is_on * self.load_power
        self.load_preference = preference

        self.time_step += 1

        done = self.time_step >= self.EPISODE_LENGTH
        if done:
            # End of episode: penalise unserved demand, reward full service.
            not_enough_penalty = -1 if self.load_remain_demand > 0 else 1
            # No further data exists, so repeat the last price/preference.
            price_next = grid_price
            preference_next = preference
        else:
            price_next = self.execute_price[self.time_step]
            preference_next = self.execute_preference[self.time_step]

        self.reward = ((-5 * cost_reward + 5 * preference_reward) + 20 * not_enough_penalty) / 30

        # Training only: when the previous observation showed no demand left,
        # re-draw a new demand with probability 4/10 (integers(10) > 5).
        if state_remain == 0 and self.mode == 'train' and not done:
            if self.np_random.integers(10) > 5:
                max_num_of_period = max(
                    1, (self.EPISODE_LENGTH - self.time_step) // self.load_period - 1
                )
                # NOTE(review): the result is not checked against the
                # observation-space upper bound of 40.
                self.load_remain_demand = self.load_period * int(
                    self.np_random.integers(0, max_num_of_period)
                )

        # Next state -----------------------------------------------------------
        # [ time_step, pgrid_price, remain_demand, remain_period, period_len, preference ]
        self.state = np.array(
            [
                self.time_step,
                price_next,
                self.load_remain_demand,
                self.load_remain_period,
                self.load_period,
                preference_next,
            ],
            dtype=np.float32,
        )

        # truncated is always False.
        return (self.state, self.reward, done, False, {})

    ###########################################################################
    def reset(self, seed=None, options=None, id=1, month=6, demand=30, period=5, load_power=1.5):
        """Start a new episode and return the initial observation.

        In ``'train'`` mode the user id, month, period and demand are drawn at
        random from the environment's seeded generator and noise is added to
        the price and preference curves; ``id``, ``month``, ``demand`` and
        ``period`` are ignored. In any other mode the arguments are used as
        given and no noise is added.

        Args:
            seed: Seed forwarded to ``gym.Env.reset``; it seeds
                ``self.np_random``, which drives all randomness here.
            options: Unused.
            id: User id selecting the preference table (1 or 2).
            month: Month number (1-12) selecting the preference column and the
                summer / non-summer price curve.
            demand: Total number of "on" steps required in the episode.
            period: Length of one uninterruptible run, in steps.
            load_power: Power drawn while the load is on.

        Returns:
            ``(observation, info)`` with an empty ``info`` dict.
        """
        super().reset(seed=seed)
        # Use the environment's own generator so that `seed` is honoured.
        rng = self.np_random

        self.time_step = 0
        self.load_power = load_power
        self.load_is_on = 0
        self.load_use_power = 0

        if self.mode == 'train':
            self.id = int(rng.integers(1, 3))
            # Training samples months 6-9 only, i.e. the summer-price months.
            self.month = int(rng.integers(6, 10))
            self.load_period = int(rng.integers(5, 8))
            # Round the drawn demand down to a whole number of periods.
            self.load_demand = (int(rng.integers(30, 36)) // self.load_period) * self.load_period
            price_noise = rng.normal(0, 0.1, self.EPISODE_LENGTH)
            preference_noise = rng.integers(-1, 2, self.EPISODE_LENGTH)
            self.execute_preference = (
                self.preference_data[self.id][str(self.month)] + preference_noise
            )
        else:
            # Testing mode: use the caller-supplied scenario without noise.
            self.id = id
            self.month = month
            self.load_period = period
            self.load_demand = demand
            price_noise = np.zeros(self.EPISODE_LENGTH)
            self.execute_preference = self.preference_data[self.id][str(self.month)]

        self.load_remain_demand = self.load_demand
        self.load_remain_period = 0

        # Months 6-9 use the summer price curve, all others the non-summer one.
        if 6 <= self.month <= 9:
            self.execute_price = self.price_data['summer_price'] + price_noise
        else:
            self.execute_price = self.price_data['not_summer_price'] + price_noise

        # ----------------------------------------------------------------------
        price_next = self.execute_price[self.time_step]
        preference_next = self.execute_preference[self.time_step]

        # [ time_step, pgrid_price, remain_demand, remain_period, period_len, preference ]
        self.state = np.array(
            [
                self.time_step,
                price_next,
                self.load_remain_demand,
                self.load_remain_period,
                self.load_period,
                preference_next,
            ],
            dtype=np.float32,
        )

        return (self.state, {})

    def render(self):
        """Rendering is not implemented; this is a no-op."""
        pass

if __name__ == '__main__':
    # Smoke test: play one episode with random actions and print each transition.
    ################################
    mode = 0  # 0 selects training mode, any other value selects testing mode
    ################################
    if mode != 0:
        print('testing mode')
        env = UnintEnv(mode='test')
        state, _ = env.reset(id=1, month=10)
    else:
        print('training mode')
        env = UnintEnv(mode='train')
        state, _ = env.reset()

    # State layout:
    # [ time_step, pgrid_price, remain_demand, remain_period, period_len, preference ]
    print('[{:}, {:.3f}, {:.2f}, {:2.0f}, {:}, {:2.0f}]'.format(*state))

    total_reward = 0
    done = False
    while not done:
        # Sample uniformly from the two available actions.
        action = env.action_space.sample()
        state, reward, done, _, _ = env.step(action)
        print('action:', action, ' reward:', reward)
        print('[{:}, {:.3f}, {:.2f}, {:2.0f}, {:}, {:2.0f}]'.format(*state))
        total_reward += reward

    print(total_reward)

training mode
[0.0, 1.656, 28.00,  0, 7.0,  0]
action: 1  reward: -0.0014077694643111456
[1.0, 1.780, 27.00,  6, 7.0,  3]
action: 1  reward: 0.0029507526735059256
[2.0, 1.885, 26.00,  5, 7.0,  2]
action: 1  reward: 0.0013737037879269136
[3.0, 1.721, 25.00,  4, 7.0,  4]
action: 1  reward: 0.004488944399113557
[4.0, 1.539, 24.00,  3, 7.0,  3]
action: 1  reward: 0.0031556264072859367
[5.0, 1.766, 23.00,  2, 7.0,  4]
action: 1  reward: 0.004450598440202725
[6.0, 1.672, 22.00,  1, 7.0,  2]
action: 1  reward: 0.0015541082944999743
[7.0, 1.864, 21.00,  0, 7.0,  2]
action: 0  reward: 0.0
[8.0, 1.775, 21.00,  0, 7.0,  1]
action: 1  reward: -2.1094367617652728e-05
[9.0, 1.664, 20.00,  6, 7.0,  1]
action: 1  reward: 7.300727626904322e-05
[10.0, 1.895, 19.00,  5, 7.0,  1]
action: 1  reward: -0.00012300837607610813
[11.0, 1.674, 18.00,  4, 7.0, -1]
action: 1  reward: -0.002911504636816427
[12.0, 1.793, 17.00,  3, 7.0, -1]
action: 1  reward: -0.0030126913266928014
[13.0, 1.654, 16.00,  2, 7.0, -1]
a

In [3]:
import stable_baselines3
from stable_baselines3.common.env_checker import check_env
# Run the Stable-Baselines3 environment checker on a training-mode instance.
env = UnintEnv(mode = 'train')
check_env(env)

In [4]:
import torch as th
from stable_baselines3 import PPO

# Train a PPO agent on the training-mode environment and save it to disk.
env = UnintEnv(mode='train')

# Separate 64-64 Tanh networks for the policy (pi) and the value function (vf).
policy_kwargs = {
    "activation_fn": th.nn.Tanh,
    "net_arch": {"pi": [64, 64], "vf": [64, 64]},
}

model = PPO(
    "MlpPolicy",
    env,
    learning_rate=0.0003,
    n_steps=2048,
    batch_size=128,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    clip_range_vf=None,
    normalize_advantage=True,
    ent_coef=0.0,
    vf_coef=0.5,
    max_grad_norm=0.5,
    target_kl=None,
    tensorboard_log="./agent/tensorboard_unint",
    policy_kwargs=policy_kwargs,
    verbose=0,
    seed=0,
    device='cuda',
    _init_setup_model=True,
)
model.learn(total_timesteps=500_000)
# Author's note on this run: "nice 5 5 20 128"
# (presumably the reward weights 5/5/20 and batch size 128 - TODO confirm).
model.save('agent/unint_model_f')
del model

In [12]:
from stable_baselines3 import PPO

import plotly.graph_objects as go

# model =  PPO.load("agent/unint_model_1_nice")
# Load the trained agent, roll it out deterministically for months 6-9 in
# testing mode and plot the on/off decisions against price and preference.
model = PPO.load("agent/unint_model_f")

# NOTE: `id` shadows the builtin of the same name; the name is kept because it
# is a notebook-level variable that other cells may read.
id = 2
# Load power per user id (1 -> 1.6, 2 -> 1.5, anything else -> 1.3).
# Only ids 1 and 2 have a preference table in UnintEnv.
# NOTE(review): earlier comments mentioned demands of 30/40/20 per id, but the
# code has always passed demand=30 and period=5 for every id - confirm intent.
load_power_by_id = {1: 1.6, 2: 1.5}

step = np.arange(96)
env = UnintEnv(mode='test')

for month in range(6, 10):
    state, _ = env.reset(
        id=id,
        month=month,
        demand=30,
        period=5,
        load_power=load_power_by_id.get(id, 1.3),
    )

    plt_action = []
    plt_price = env.execute_price
    plt_preference = env.execute_preference

    total_rewards = 0

    terminated = False
    for _ in range(96):
        action, _states = model.predict(state, deterministic=True)
        state, reward, terminated, truncated, _info = env.step(action)
        total_rewards += reward
        # Record whether the load was actually on, not the raw action.
        plt_action.append(env.load_is_on)
        if terminated or truncated:
            break

    print('Left remain time = ',env.load_remain_demand)    
    print("Total_reward = ", total_rewards)

    # Plot: load on/off, grid price and user preference ------------------------
    fig = go.Figure()
    fig.add_trace(go.Bar    (x=step,y=plt_action    ,name="Action",marker=dict(color="rgb(204,204,204)")))
    fig.add_trace(go.Scatter(x=step,y=plt_price     ,name="Pgrid price"    ,yaxis="y2",line=dict(color="#00cc96")))
    fig.add_trace(go.Scatter(x=step,y=plt_preference,name="User Preference",yaxis="y3",line=dict(color="#ffa15a")))
    # Flat zero line drawn on the fourth axis.
    fig.add_trace(go.Scatter(x=step,y=np.zeros(96)  ,yaxis="y4",line=dict(color="rgb(204,204,204)")))

    # Axis titles use title=dict(text=..., font=...) instead of the deprecated
    # `titlefont` attribute, which newer Plotly releases no longer accept.
    fig.update_layout(
        xaxis = dict(domain=[0.1, 0.9],tick0 = 0,dtick = 5,showgrid=False),
        yaxis = dict(title=dict(text="action", font=dict(color="rgb(179,179,179)")),
                    tickfont=dict(color="rgb(179,179,179)"),
                    range=[  0, 5], position = 0.05, showgrid=False , anchor="free", side="left"),

        yaxis2= dict(title=dict(text="Pgrid price (TWD)", font=dict(color="#00cc96")),
                    tickfont=dict(color="#00cc96"),
                    range=[ -1, 7], position = 0.10, showgrid=False, overlaying="y", anchor="free", side="left" , autoshift=True),

        yaxis3= dict(title=dict(text="Preference ", font=dict(color="#ef553b")),
                    tickfont=dict(color="#ef553b"),
                    range=[  -2, 5], position = 0.9, showgrid=False, overlaying="y", anchor="free", side="right", autoshift=True),

        yaxis4= dict(title=dict(text="Power (kWh) ", font=dict(color="#636efa")),
                    tickfont=dict(color="#636efa"),
                    range=[  -10, 10], position = 0.95, showgrid=False, overlaying="y", anchor="free", side="right", autoshift=True),
        plot_bgcolor='rgba(0,0,0,0)',legend=dict(orientation="h",yanchor="bottom",y=-0.2,xanchor="center",x=0.5)
    )

    fig.update_xaxes(showline=True,linewidth=1,linecolor='black',mirror=True)
    fig.update_yaxes(showline=True,linewidth=1,linecolor='black',mirror=True)
    fig.show()

Left remain time =  0
Total_reward =  0.634349206704942


Left remain time =  0
Total_reward =  0.6750396830695016


Left remain time =  0
Total_reward =  0.6753095231358966


Left remain time =  0
Total_reward =  0.6339761904307774
