# Rat Environment

In [1]:
class rat:
    '''
    Six-state grid world laid out as 2 rows x 3 columns (state = 3*row + col).

    actions: left=0, right=1, up=2, down=3
    states = 0,1,2,3,4,5
    rewards are granted for entering a state:
    1 = 1 reward
    3 = 2 reward
    4 = -10 reward, death (terminal)
    5 = 10 reward (terminal)
    A move that leaves the state unchanged always yields 0 reward.
    '''

    def __init__(self,):
        # transition[action][state] -> next state. Moves that would leave the
        # grid, and every move out of states 4 and 5, map the state to itself.
        self.transition = [[0,0,1,3,4,5],[1,2,2,4,4,5],[0,1,2,0,4,5],[3,4,5,3,4,5]]
        self.initState = 0
        self.currState = 0
        # rewardTable[state] -> reward for moving into that state.
        self.rewardTable = [0,1,0,2,-10,10]
        self.stateSpace = 6
        self.actionSpace = 4

    def step(self,action):
        '''
        Apply `action` to the current state.

        Returns a tuple (newState, reward, isDone); isDone is True once the
        current state is 4 or 5.

        Raises IndexError when `action` is outside 0..actionSpace-1. Negative
        values are rejected explicitly because list indexing would otherwise
        wrap them around to a different move without any error.
        '''
        if not 0 <= action < self.actionSpace:
            raise IndexError('action must be in range(%d), got %r' % (self.actionSpace, action))
        newState = self.transition[action][self.currState]
        # Staying in place earns nothing, even on a rewarding state.
        if newState == self.currState:
            reward = 0
        else:
            reward = self.rewardTable[newState]
        self.currState = newState
        isDone = self.currState in (4, 5)
        return (newState,reward,isDone)

    def reset(self,):
        '''Put the agent back on state 0 and return that state.'''
        self.initState = 0
        self.currState = self.initState
        return self.currState

# Q learning in Rat env

In [2]:
import numpy as np
import time, pickle, os

# Environment instance shared by the helpers and the training loop below.
env = rat()

# Probability of taking a random action in the epsilon-greedy policy.
epsilon = 0.9

# Update step size and discount factor used by learn().
lr_rate, gamma = 0.81, 0.96

# Training budget: number of episodes and the step cap per episode.
total_episodes, max_steps = 10, 100

# Tabular action values: one row per state, one column per action.
Q = np.zeros(shape=(env.stateSpace, env.actionSpace))
    
    
def choose_action(state):
    """Pick an action for `state` with an epsilon-greedy rule.

    With probability `epsilon` (module-level) a uniformly random action is
    drawn; otherwise the index of the largest entry of `Q[state]` is returned.
    """
    if np.random.uniform(0, 1) < epsilon:
        # Size the draw from the environment instead of hard-coding 4, so the
        # helper stays correct if the action set changes.
        return np.random.randint(0, high=env.actionSpace)
    return np.argmax(Q[state, :])

def learn(state, state2, reward, action):
    """Apply one Q-learning update to `Q[state, action]` and print the arithmetic.

    The target is `reward + gamma * max(Q[state2])`; the table entry is moved
    toward it by the fraction `lr_rate`. The state is printed as
    (state // 3, state % 3).
    """
    row, col = state // 3, state % 3
    old_value = Q[state, action]
    td_target = reward + gamma * np.max(Q[state2, :])
    print('Old Q(', '(', row, ',', col, ')', ',', action, ') =', round(old_value, 2))
    Q[state, action] += lr_rate * (td_target - old_value)
    print('New Q(', '(', row, ',', col, ')', ',', action, ') =', round(old_value, 5), '+', round(lr_rate, 5), '*', '(', round(td_target, 5), '-', round(old_value, 5), ') =', round(Q[state, action], 2))

# Training loop: run the episodes, printing every transition and Q update.
actionList = ['LEFT', 'RIGHT', 'UP', 'DOWN']

for episode in range(total_episodes):
    state = env.reset()

    # One-hot 2x3 picture of where the agent currently is.
    visualize = np.zeros((2, 3))
    visualize[divmod(state, 3)] = 1
    print('############# Episode Begins ##################')

    # t ends up holding the number of steps taken in this episode.
    t = 0
    for t in range(1, max_steps + 1):
        print('Current State\n', visualize)

        action = choose_action(state)
        print('Action =', action, '=>', actionList[action])

        state2, reward, done = env.step(action)
        print('state2', state2, 'reward', reward, 'done', done)

        learn(state, state2, reward, action)
        state = state2

        visualize = np.zeros((2, 3))
        visualize[divmod(state, 3)] = 1
        print('New State\n', visualize, '\n')

        if done:
            break

        # Slow the printout down so it can be followed while it runs.
        time.sleep(0.1)

print(Q)

#with open("frozenLake_qTable.pkl", 'wb') as f:
#    pickle.dump(Q, f)

############# Episode Begins ##################
Current State
 [[ 1.  0.  0.]
 [ 0.  0.  0.]]
Action = 0 => LEFT
state2 0 reward 0 done False
Old Q( ( 0 , 0 ) , 0 ) = 0.0
New Q( ( 0 , 0 ) , 0 ) = 0.0 + 0.81 * ( 0.0 - 0.0 ) = 0.0
New State
 [[ 1.  0.  0.]
 [ 0.  0.  0.]] 

Current State
 [[ 1.  0.  0.]
 [ 0.  0.  0.]]
Action = 1 => RIGHT
state2 1 reward 1 done False
Old Q( ( 0 , 0 ) , 1 ) = 0.0
New Q( ( 0 , 0 ) , 1 ) = 0.0 + 0.81 * ( 1.0 - 0.0 ) = 0.81
New State
 [[ 0.  1.  0.]
 [ 0.  0.  0.]] 

Current State
 [[ 0.  1.  0.]
 [ 0.  0.  0.]]
Action = 3 => DOWN
state2 4 reward -10 done True
Old Q( ( 0 , 1 ) , 3 ) = 0.0
New Q( ( 0 , 1 ) , 3 ) = 0.0 + 0.81 * ( -10.0 - 0.0 ) = -8.1
New State
 [[ 0.  0.  0.]
 [ 0.  1.  0.]] 

############# Episode Begins ##################
Current State
 [[ 1.  0.  0.]
 [ 0.  0.  0.]]
Action = 0 => LEFT
state2 0 reward 0 done False
Old Q( ( 0 , 0 ) , 0 ) = 0.0
New Q( ( 0 , 0 ) , 0 ) = 0.0 + 0.81 * ( 0.7776 - 0.0 ) = 0.63
New State
 [[ 1.  0.  0.]
 [ 0.  0.  0.]] 

Current State
 [[ 0.  1.  0.]
 [ 0.  0.  0.]]
Action = 2 => UP
state2 1 reward 0 done False
Old Q( ( 0 , 1 ) , 2 ) = 0.58
New Q( ( 0 , 1 ) , 2 ) = 0.58283 + 0.81 * ( 0.71955 - 0.58283 ) = 0.69
New State
 [[ 0.  1.  0.]
 [ 0.  0.  0.]] 

Current State
 [[ 0.  1.  0.]
 [ 0.  0.  0.]]
Action = 0 => LEFT
state2 0 reward 0 done False
Old Q( ( 0 , 1 ) , 0 ) = 0.75
New Q( ( 0 , 1 ) , 0 ) = 0.74953 + 0.81 * ( 5.22542 - 0.74953 ) = 4.37
New State
 [[ 1.  0.  0.]
 [ 0.  0.  0.]] 

Current State
 [[ 1.  0.  0.]
 [ 0.  0.  0.]]
Action = 0 => LEFT
state2 0 reward 0 done False
Old Q( ( 0 , 0 ) , 0 ) = 4.05
New Q( ( 0 , 0 ) , 0 ) = 4.04962 + 0.81 * ( 5.22542 - 4.04962 ) = 5.0
New State
 [[ 1.  0.  0.]
 [ 0.  0.  0.]] 

Current State
 [[ 1.  0.  0.]
 [ 0.  0.  0.]]
Action = 3 => DOWN
state2 3 reward 2 done False
Old Q( ( 0 , 0 ) , 3 ) = 5.44
New Q( ( 0 , 0 ) , 3 ) = 5.44314 + 0.81 * ( 6.76637 - 5.44314 ) = 6.51
New State
 [[ 0.  0.  0.]
 [ 1.  0.  0.]] 

Current State
 [[ 0.  0.  0.]
 [ 1.  0.  0.]]
A

Current State
 [[ 0.  0.  0.]
 [ 1.  0.  0.]]
Action = 2 => UP
state2 0 reward 0 done False
Old Q( ( 1 , 0 ) , 2 ) = 7.93
New Q( ( 1 , 0 ) , 2 ) = 7.93312 + 0.81 * ( 9.19222 - 7.93312 ) = 8.95
New State
 [[ 1.  0.  0.]
 [ 0.  0.  0.]] 

Current State
 [[ 1.  0.  0.]
 [ 0.  0.  0.]]
Action = 3 => DOWN
state2 3 reward 2 done False
Old Q( ( 0 , 0 ) , 3 ) = 9.58
New Q( ( 0 , 0 ) , 3 ) = 9.57523 + 0.81 * ( 10.59487 - 9.57523 ) = 10.4
New State
 [[ 0.  0.  0.]
 [ 1.  0.  0.]] 

Current State
 [[ 0.  0.  0.]
 [ 1.  0.  0.]]
Action = 0 => LEFT
state2 3 reward 0 done False
Old Q( ( 1 , 0 ) , 0 ) = 4.43
New Q( ( 1 , 0 ) , 0 ) = 4.43026 + 0.81 * ( 8.59487 - 4.43026 ) = 7.8
New State
 [[ 0.  0.  0.]
 [ 1.  0.  0.]] 

Current State
 [[ 0.  0.  0.]
 [ 1.  0.  0.]]
Action = 3 => DOWN
state2 3 reward 0 done False
Old Q( ( 1 , 0 ) , 3 ) = 5.58
New Q( ( 1 , 0 ) , 3 ) = 5.57842 + 0.81 * ( 8.59487 - 5.57842 ) = 8.02
New State
 [[ 0.  0.  0.]
 [ 1.  0.  0.]] 

Current State
 [[ 0.  0.  0.]
 [ 1.  0.  0.]]


# SARSA in rat env

In [None]:
import numpy as np
import time, pickle, os

# Fresh environment instance for the SARSA run.
env = rat()

# Probability of taking a random action in the epsilon-greedy policy.
epsilon = 0.9
# Parameters for an optional epsilon decay schedule (currently disabled):
# min_epsilon = 0.1
# max_epsilon = 1.0
# decay_rate = 0.01

# Update step size and discount factor used by learn().
lr_rate, gamma = 0.81, 0.96

# Training budget: number of episodes and the step cap per episode.
total_episodes, max_steps = 10, 100

# Tabular action values: one row per state, one column per action.
Q = np.zeros(shape=(env.stateSpace, env.actionSpace))
    
def choose_action(state):
    """Pick an action for `state` with an epsilon-greedy rule.

    With probability `epsilon` (module-level) a uniformly random action is
    drawn; otherwise the index of the largest entry of `Q[state]` is returned.
    """
    if np.random.uniform(0, 1) < epsilon:
        # Size the draw from the environment instead of hard-coding 4, so the
        # helper stays correct if the action set changes.
        return np.random.randint(0, high=env.actionSpace)
    return np.argmax(Q[state, :])

def learn(state, state2, reward, action, action2):
    """Apply one SARSA update to `Q[state, action]` and print the arithmetic.

    The target is `reward + gamma * Q[state2, action2]`, i.e. it uses the
    action that was actually selected for the next state. The table entry is
    moved toward it by the fraction `lr_rate`. The state is printed as
    (state // 3, state % 3).
    """
    row, col = state // 3, state % 3
    old_value = Q[state, action]
    td_target = reward + gamma * Q[state2, action2]
    print('Old Q(', '(', row, ',', col, ')', ',', action, ') =', round(old_value, 2))
    Q[state, action] += lr_rate * (td_target - old_value)
    print('New Q(', '(', row, ',', col, ')', ',', action, ') =', round(old_value, 5), '+', round(lr_rate, 5), '*', '(', round(td_target, 5), '-', round(old_value, 5), ') =', round(Q[state, action], 2))


# Start
# Running sum of the rewards returned by env.step() across all episodes.
rewards=0

for episode in range(total_episodes):
    actionList = ['LEFT', 'RIGHT', 'UP', 'DOWN']
    t = 0

    state = env.reset()

    # One-hot 2x3 picture of where the agent currently is.
    visualize = np.zeros((2,3))
    visualize[state//3,state%3] = 1
    print('############# Episode Begins ##################')

    # SARSA picks the first action before the loop; every later action is the
    # one already chosen as action2 during the previous update.
    action = choose_action(state)

    while t < max_steps:

        print('Action = ',action, '=>', actionList[action])
        print('Current State\n',visualize)
        state2, reward, done = env.step(action)

        action2 = choose_action(state2)

        learn(state, state2, reward, action, action2)

        state = state2
        action = action2

        visualize = np.zeros((2,3))
        visualize[state//3,state%3] = 1
        print('Next Action = ',action2, '=>', actionList[action2])
        print('New State\n', visualize,'\n')

        t += 1
        # Accumulate the reward actually received. Adding 1 per step would
        # make the reported score an average episode length instead.
        rewards += reward

        if done:
            break

        # Slow the printout down so it can be followed while it runs.
        time.sleep(0.1)
    # epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    # os.system('clear')


# Average total reward collected per episode.
print ("Score over time: ", rewards/total_episodes)
print(Q)


# Flippers Env

In [None]:
import numpy as np

class Flipper:
    """Environment over nine 0/1 cells in which each action flips one cell.

    The state is exposed as an integer: the nine cells read as a binary
    number with cell 0 as the most significant bit. An episode ends when that
    integer is one of `endStatesInt`.
    """

    def __init__(self,):
        self.currState = self._random_bits()
        self.currStateInt = self._encode(self.currState)
        self.steps = 0
        # Integer codes of the terminal patterns:
        # [[1,0,0,1,0,0,1,0,0],[0,1,0,0,1,0,0,1,0],[0,0,1,0,0,1,0,0,1],
        #  [1,1,1,0,0,0,0,0,0],[0,0,0,1,1,1,0,0,0],[0,0,0,0,0,0,1,1,1]]
        # NOTE(review): these look like the full columns and rows of a 3x3
        # board stored row-major -- confirm.
        self.endStatesInt = [7,56,73,146,292,448]
        self.stateSpace = 512
        self.actionSpace = 9

    @staticmethod
    def _random_bits():
        """Draw nine random 0/1 values and return them as a plain list."""
        return np.random.randint(0, 2, size=(1, 9)).tolist()[0]

    @staticmethod
    def _encode(bits):
        """Read a list of 0/1 values as a binary number, first item first."""
        return int("".join(map(str, bits)), 2)

    def step(self,action):
        """Flip cell `action` and return (stateInt, reward, isDone, steps).

        The reward is 0 unless the new state is terminal, in which case it is
        0.9 ** steps.
        """
        bit = self.currState[action]
        self.currState[action] = 0 if bit else 1
        self.currStateInt = self._encode(self.currState)
        self.steps += 1

        if self.currStateInt not in self.endStatesInt:
            return (self.currStateInt, 0, False, self.steps)
        return (self.currStateInt, pow(0.9, self.steps), True, self.steps)

    def reset(self,):
        """Re-randomise the cells, zero the step counter, return the state int."""
        self.currState = self._random_bits()
        self.currStateInt = self._encode(self.currState)
        self.steps = 0
        return self.currStateInt

# Q in Flippers env

In [None]:
import numpy as np
import time, pickle, os
import matplotlib.pyplot as plt

# Environment instance shared by the helpers and the training loop below.
env = Flipper()

# Initial probability of taking a random action; the training loop lowers it.
epsilon = 0.9

# Update step size and discount factor used by learn().
lr_rate, gamma = 0.81, 0.96

# Training budget: number of episodes and the step cap per episode.
total_episodes, max_steps = 1000, 1000

# Tabular action values: one row per state, one column per action.
Q = np.zeros(shape=(env.stateSpace, env.actionSpace))
    
    
def choose_action(state):
    """Pick an action for `state` with an epsilon-greedy rule.

    With probability `epsilon` (module-level) a uniformly random action is
    drawn; otherwise the index of the largest entry of `Q[state]` is returned.
    """
    explore = np.random.uniform(0, 1) < epsilon
    if not explore:
        return np.argmax(Q[state, :])
    return np.random.randint(0, high=env.actionSpace)

def learn(state, state2, reward, action):
    """Apply one Q-learning update to `Q[state, action]`.

    The target is `reward + gamma * max(Q[state2])`; the table entry is moved
    toward it by the fraction `lr_rate`.
    """
    old_value = Q[state, action]
    best_next = np.max(Q[state2, :])
    td_target = reward + gamma * best_next
    Q[state, action] += lr_rate * (td_target - old_value)

# Training loop: run the episodes and record how many steps each solved one took.

# Step counts of the episodes that reached a terminal state; episodes that hit
# the max_steps cap are not recorded.
stepsList = []
for episode in range(total_episodes):
    state = env.reset()
    print('############# Episode Begins ##################')

    # Debug output: dump one Q row at the start of the second episode.
    if episode == 1:
        print(episode)
        print(Q[145])

    # Linear exploration decay, applied once per episode.
    epsilon = epsilon - 0.001

    # t ends up holding the number of steps taken in this episode.
    t = 0
    for t in range(1, max_steps + 1):
        # Debug output: trace the Q row of state 1 on every step.
        print(Q[1])

        action = choose_action(state)
        state2, reward, done, stepNum = env.step(action)
        learn(state, state2, reward, action)
        state = state2

        if done:
            stepsList.append(stepNum)
            break

plt.plot(stepsList)
#print(Q)