In [1]:
import gym
# Sanity check that gym works: create the stock CartPole task and
# display the first observation returned by reset().
env = gym.make("CartPole-v1")
obs = env.reset()
obs

array([-0.01122352,  0.01184797, -0.03377219,  0.00175344])

In [2]:
# Draw the current frame of the CartPole environment created in the first cell
# (the call returned True in the recorded run).
env.render()

True

In [1]:
from gym import Env # allows me to build a custom environment
from gym.spaces import Discrete, Box # allows me to define the actions to take in the environment and current state
import numpy as np
import random

The 4 things that are needed in every reinforcement learning model are: Action, Reward, Environment, Agent

In [2]:
# __init__ - initialization method that runs automatically when I create a new instance of my class
# step - runs whenever I take a step in the environment
# render - not really needed since I'm not going to be rendering anything
# reset - allows me to reset my environment

In [59]:
class ShowerEnv(Env):
    """Toy shower-temperature environment using the classic Gym API.

    Once per simulated second the agent nudges the water temperature by
    -2, -1, 0, +1 or +2 degrees Fahrenheit. It earns +1 when the temperature
    (after its action, before the random noise of that step) lies inside the
    comfort band and -1 otherwise. An episode ends after SHOWER_LENGTH steps.

    NOTE(review): `step` and `reset` return the temperature as a plain scalar,
    not as a shape-(1,) array like `observation_space.sample()` does. This is
    kept as-is because the model/agent cells of this notebook were run against
    scalar observations -- confirm before changing the observation format.
    """

    # Tunable constants, shared by __init__, step and reset.
    START_TEMP = 90       # centre of the starting temperature (Fahrenheit)
    START_JITTER = 3      # start within +/- this many degrees of START_TEMP
    COMFORT_LOW = 87      # inclusive lower bound of the rewarded band
    COMFORT_HIGH = 93     # inclusive upper bound of the rewarded band
    SHOWER_LENGTH = 60    # episode length in seconds (one step per second)
    SURGE_PERIOD = 5      # a larger disturbance may occur every N seconds
    SURGE_SIZE = 3        # magnitude of that larger disturbance

    def __init__(self):
        """Define the action/observation spaces and start a fresh episode."""
        # Five discrete actions:
        # 0 - go down two, 1 - go down one, 2 - stay the same,
        # 3 - go up one,   4 - go up two
        self.action_space = Discrete(5)

        # One-dimensional temperature range [0, 150]. The bounds are created
        # as float32 to match the space's dtype, which avoids the precision
        # cast that float64 bounds would need.
        self.observation_space = Box(
            low=np.array([0.0], dtype=np.float32),
            high=np.array([150.0], dtype=np.float32),
            dtype=np.float32,
        )

        # Initialise self.state and self.shower_length exactly as reset()
        # does, so the start logic lives in a single place.
        self.reset()

    def step(self, action):
        """Advance the simulation by one second.

        Args:
            action: Integer in 0..4 drawn from `action_space`; it is mapped to
                a temperature change of `action - 2` degrees.

        Returns:
            Tuple `(state, reward, done, info)`: the temperature after noise
            has been applied, the reward (+1 or -1), whether the shower is
            over, and an empty info dict.
        """
        # Map the action index onto a temperature delta:
        # 0 -> -2, 1 -> -1, 2 -> 0, 3 -> +1, 4 -> +2
        self.state += action - 2

        # One second of shower time has passed.
        self.shower_length -= 1

        # Reward is based on the temperature right after the agent's action,
        # i.e. before the random fluctuations below are added.
        if self.COMFORT_LOW <= self.state <= self.COMFORT_HIGH:
            reward = 1
        else:
            reward = -1

        # The episode is over once the shower time is used up.
        done = self.shower_length <= 0

        # Small fluctuation on every step, like a real shower.
        self.state += random.randint(-1, 1)
        # Every SURGE_PERIOD seconds there may be a larger swing, e.g. when a
        # toilet flushes and the water suddenly gets colder or hotter.
        if self.shower_length % self.SURGE_PERIOD == 0:
            self.state += self.SURGE_SIZE * random.randint(-1, 1)

        # Placeholder for diagnostics; part of the Gym step contract.
        info = {}

        return self.state, reward, done, info

    def render(self, mode="human"):
        """No-op: this environment has nothing to visualise.

        `mode` is accepted (and ignored) so that callers following the Gym
        convention of `env.render(mode=...)` do not raise a TypeError.
        """
        pass

    def reset(self):
        """Start a new episode and return the initial temperature.

        The temperature is drawn uniformly from START_TEMP +/- START_JITTER
        and the remaining shower time is set back to SHOWER_LENGTH.
        """
        self.state = self.START_TEMP + random.randint(-self.START_JITTER, self.START_JITTER)
        self.shower_length = self.SHOWER_LENGTH
        return self.state
        

In [60]:
# Testing it out by creating a new instance of the shower environment.
# This rebinds `env`, replacing the CartPole environment from the first cell.
env = ShowerEnv()

In [61]:
# Example of a random draw from the action space: with Discrete(5) this is
# an integer from 0 to 4.
env.action_space.sample()

2

In [62]:
# Example of a random draw from the observation space: a length-1 float32
# array within the Box bounds of 0 to 150 set in ShowerEnv.__init__.
env.observation_space.sample()

array([26.445942], dtype=float32)

In [64]:
# Baseline: play a few episodes with uniformly random actions and print the
# total reward of each one, to see how an untrained agent scores.
episodes = 10
for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0
    done = False

    # Keep acting randomly until the environment reports the episode is over.
    while not done:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:-40
Episode:2 Score:2
Episode:3 Score:-36
Episode:4 Score:-24
Episode:5 Score:-40
Episode:6 Score:-24
Episode:7 Score:-44
Episode:8 Score:16
Episode:9 Score:-50
Episode:10 Score:-46


## Create a Deep Learning Model with Keras

In [65]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

In [66]:
# Shape of a single observation, read from the Box observation space:
# (1,), because the only observed value is the temperature.
states = env.observation_space.shape
# Number of discrete actions available to the agent (5 for ShowerEnv).
actions = env.action_space.n

In [67]:
def build_model(states, actions):
    """Assemble the small fully connected network used by the agent.

    Args:
        states: Shape tuple of one observation; passed to the first layer
            as ``input_shape``.
        actions: Number of output units, one per available action.

    Returns:
        An uncompiled ``Sequential`` model with two 24-unit ReLU hidden
        layers followed by a linear output layer of size ``actions``.
    """
    layers = [
        Dense(24, activation='relu', input_shape=states),
        Dense(24, activation='relu'),
        Dense(actions, activation='linear'),
    ]
    return Sequential(layers)

In [68]:
# Build the network for the shower environment from the observation shape
# and the number of actions computed above.
model = build_model(states, actions)

In [69]:
# The model takes the temperature as input and produces one output per action:
# 5 outputs, one for each of the actions 0-4 (see the last Dense layer's output shape).
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 24)                48        
_________________________________________________________________
dense_10 (Dense)             (None, 24)                600       
_________________________________________________________________
dense_11 (Dense)             (None, 5)                 125       
Total params: 773
Trainable params: 773
Non-trainable params: 0
_________________________________________________________________


## Build Agent with Keras-RL

In [70]:
# there are a bunch of different agents within the keras rl environment: DQNAgent, NAFAgent, DDPGAgent, SARSAAgent, CEMAgent
# these are all different agents that I can use to train my rl model
# https://keras-rl.readthedocs.io/en/latest/
from rl.agents import DQNAgent
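# there are different styles: value based rl and policy based rl. DQN is value based (it learns Q-values);
# the "policy" imported here only decides how an action is picked from those Q-values (Boltzmann exploration)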
from rl.policy import BoltzmannQPolicy
# for the dqn agent I am going to need to maintain some memory. This class allows me to do that
from rl.memory import SequentialMemory

In [71]:
# pass in the model and the actions I can take in the environment
def build_agent(model, actions):
    """Wrap a Keras model in a keras-rl ``DQNAgent``.

    Args:
        model: The Keras model the agent should use.
        actions: Number of actions, forwarded as ``nb_actions``.

    Returns:
        A ``DQNAgent`` (not yet compiled) configured with a Boltzmann
        Q-policy, a ``SequentialMemory`` limited to 50000 entries with
        ``window_length=1``, ``nb_steps_warmup=10`` and
        ``target_model_update=1e-2``.
    """
    return DQNAgent(
        model=model,
        policy=BoltzmannQPolicy(),
        memory=SequentialMemory(limit=50000, window_length=1),
        nb_actions=actions,
        nb_steps_warmup=10,
        target_model_update=1e-2,
    )

In [None]:
# Instantiate the DQN agent around the model built above.
dqn = build_agent(model, actions)
# Compile with the Adam optimizer. `learning_rate` is the supported keyword;
# the older `lr` alias is deprecated in tf.keras and rejected by newer releases.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
# Train on the shower environment for 50000 steps without rendering.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
166 episodes - episode_reward: -32.265 [-60.000, 26.000] - loss: 4.438 - mae: 6.271 - mean_q: 6.528

Interval 2 (10000 steps performed)
167 episodes - episode_reward: -23.042 [-60.000, 32.000] - loss: 0.906 - mae: 6.220 - mean_q: -7.163

Interval 3 (20000 steps performed)
167 episodes - episode_reward: -23.964 [-60.000, 28.000] - loss: 1.020 - mae: 7.211 - mean_q: -8.501

Interval 4 (30000 steps performed)
  907/10000 [=>............................] - ETA: 1:24 - reward: -0.3958

In [35]:
# Evaluate the agent on the custom environment for 100 episodes and
# print the mean episode reward.
scores = dqn.test(env, nb_episodes=100, visualize=False)
episode_rewards = scores.history['episode_reward']
print(np.mean(episode_rewards))

Testing for 100 episodes ...
Episode 1: reward: -42.000, steps: 60
Episode 2: reward: -20.000, steps: 60
Episode 3: reward: -26.000, steps: 60
Episode 4: reward: -22.000, steps: 60
Episode 5: reward: -54.000, steps: 60
Episode 6: reward: -16.000, steps: 60
Episode 7: reward: -58.000, steps: 60
Episode 8: reward: -28.000, steps: 60
Episode 9: reward: -16.000, steps: 60
Episode 10: reward: -34.000, steps: 60
Episode 11: reward: -14.000, steps: 60
Episode 12: reward: 4.000, steps: 60
Episode 13: reward: -14.000, steps: 60
Episode 14: reward: -24.000, steps: 60
Episode 15: reward: -26.000, steps: 60
Episode 16: reward: -44.000, steps: 60
Episode 17: reward: 14.000, steps: 60
Episode 18: reward: 10.000, steps: 60
Episode 19: reward: -60.000, steps: 60
Episode 20: reward: -48.000, steps: 60
Episode 21: reward: -60.000, steps: 60
Episode 22: reward: -42.000, steps: 60
Episode 23: reward: -48.000, steps: 60
Episode 24: reward: -60.000, steps: 60
Episode 25: reward: -22.000, steps: 60
Episode 2