In [183]:
import math
import random
import csv
import json
import numpy as np

In [64]:
def load_environment():
    environment = np.load('Environments/grid_game.npy')
    wall_coordinates = np.load('Environments/wall_coordinates.npy')
    nega_coordinates = np.load('Environments/nega_coordinates.npy')
    goal_coordinates = np.load('Environments/goal_coordinates.npy')
    return (environment, wall_coordinates, nega_coordinates, goal_coordinates)

In [18]:
def initialRewardValues(rewardFile, R):

    reader = open(rewardFile, 'rU')
    i = 0
    for row in reader:
        if i > 0:
            cells = row.split(",")
            R[i-1][0] = (int)(cells[1])
            R[i-1][1] = (int)(cells[2])
            R[i-1][2] = (int)(cells[3])
            R[i-1][3] = (int)(cells[4])
        i = i + 1
    return R

In [39]:
def calculateNextState(startState):

    possNextState = [0 for i in range(0,4)]	

    for i in range(0,4):

        if (i == 0): 
            if(startState - 10 >= 0):
                possNextState[i] = startState - 10
            else:
                possNextState[i] = -1

        if (i == 1):
            if(startState + 10 < 100):
                possNextState[i] = startState + 10
            else:
                possNextState[i] = -1	

        if (i == 2):
            if(startState - 1 >= 0):
                possNextState[i] = startState - 1
            else:
                possNextState[i] = -1

        if (i == 3):
            if(startState + 1 < 100):
                possNextState[i] = startState + 1
            else:
                possNextState[i] = -1

    return possNextState

In [184]:
def initialize_q():
    Q = {}
    for i in range(10):
        for j in range(10):
            key1 = ((i+1, j+1), 'up')
            key2 = ((i+1, j+1), 'down')
            key3 = ((i+1, j+1), 'left')
            key4 = ((i+1, j+1), 'right')
            Q[key1] = 0
            Q[key2] = 0
            Q[key3] = 0
            Q[key4] = 0
    return Q

In [94]:
def get_possible_actions(environment, current_state):
    
    next_states = []
    up = (current_state[0] - 1, current_state[1])
    down = (current_state[0] + 1, current_state[1])
    left = (current_state[0], current_state[1] - 1)
    right = (current_state[0], current_state[1] + 1)
    
    if(not math.isnan(environment[up])):
        next_states.append('up')
    if(not math.isnan(environment[down])):
        next_states.append('down')
    if(not math.isnan(environment[left])):
        next_states.append('left')
    if(not math.isnan(environment[right])):
        next_states.append('right')
    
    return next_states

In [125]:
def greedyQLearning(startState, R):

    goalState = 55
    iterations = 0

    # discount factor
    beta = 0.9 
    # learning rate
    alpha = 0.01
    # for greedy algorithm
    epsilon = 0.2

    #storing the q values
    Q = [[0.0 for i in range(4)] for j in range(100)]
    check1 = 0
    check2 = 0
    
    while(startState != goalState):
        
        possibleActions = []
        possNextState = [0 for i in range(0,4)]
        nextState = 0
        r = 0.0
        qMax = 0
        qMaxExploit = 0

        possNextState = calculateNextState(startState)


        #Determining the next state
        for action in range(0, 4):
            if (R[startState][action] != 999.0):
                possibleActions.append(action)

        #getting random number between 0 and 1 and checking with epsilon
        if (len(possibleActions) > 0):
            r = random.uniform(0, 1)

            # exploit
            if r <= epsilon:
                #print ("exploit: ", possNextState)
                for action in range(0, 4):
                    if (R[startState][action] != 999 and qMaxExploit <= Q[startState][action]):
                        qMaxExploit = Q[startState][action]
                        actionIndex = action
                nextState = possNextState[actionIndex]

            # explore
            else:
                #print ("explore: ", possNextState)
                actionIndex = possibleActions[(random.randrange(len(possibleActions)))]
                nextState = possNextState[actionIndex]

            #print ("nextState", nextState)

            for action in range(0, 4):
                if (R[nextState][action] != 999 and qMax <= Q[nextState][action]):
                    qMax = Q[nextState][action]
                    

            # Q calculation
            Q[startState][actionIndex] = Q[startState][actionIndex] + alpha * (R[startState][actionIndex] + (beta * qMax) - Q[startState][actionIndex])
            startState = nextState

            iterations += 1

    print ("Iterations: ", iterations)
    print ("Check 1: ", check1)
    print ("Check 2: ", check2)
    #print ("Q",Q)

In [None]:
def getBoltzmannProb(state):

    possNextState = calculateNextState(state)
    actionsProb = [0 for i in range(0,len(possNextState))]
    denominator = 0.0
    numerator = 0.0

    temperature = temperature - 0.05 #need to calculate the t value using temperature

    for action in range(0,len(possNextState)):
        if (possNextState[action] != -1):
            denominator = denominator + math.exp(Q[state][action] / temperature) # temperature

    for action in range(0,len(possNextState)):
        if (possNextState[action] != -1):
            prob = 0.0
            #numerator = math.exp((Q[possNextState[action]][action])/temperature)
            numerator = math.exp((Q[state][action])/temperature)
            if(denominator != 0):
                prob = numerator / denominator
            
            actionsProb[action] = prob

    return actionsProb

In [168]:
def epsilon_greedy(environment, current_state, goal_state, Q):
    
    epsilon = 0.2
    action_lookup = {'up': 0, 'down': 1, 'left': 2, 'right': 3}
    steps = 0
    beta = 0.9 
    alpha = 0.01
    
    while(current_state != goal_state):
        
        max_q_exploit = 0
        max_q = 0
        next_states = {}

        possible_actions = get_possible_actions(environment, current_state)
        
        for action in possible_actions:
            if (action == 'up'):
                next_states['up'] = (current_state[0] - 1, current_state[1])
            if (action == 'down'):
                next_states['down'] = (current_state[0] + 1, current_state[1])
            if (action == 'left'):
                next_states['left'] = (current_state[0], current_state[1] - 1)
            if (action == 'right'):
                next_states['right'] = (current_state[0], current_state[1] + 1)

        r = random.uniform(0, 1)
        
        # Exploit
        if(r < epsilon):
            print ("Exploit: ", next_states)
            for action in possible_actions:
                if(max_q_exploit <= Q[(current_state, action)]):
                    max_q_exploit = Q[(current_state, action)]
                    next_action = action
            next_state = next_states[next_action]
        # Explore
        else:
            print ("Explore: ", next_states)
            next_action = possible_actions[(random.randrange(len(possible_actions)))]
            next_state = next_states[next_action]
        
        next_possible_actions = get_possible_actions(environment, next_state)
        
        for action in next_possible_actions:
            if (max_q <= Q[(current_state, action)]):
                max_q = Q[(current_state, action)]

        # Calculate Q
        Q[(current_state, next_action)] += alpha * (environment[current_state] + (beta * max_q) - Q[(current_state, next_action)])
        current_state = next_state
        steps += 1
    
    print("Steps: ", steps)
    

In [185]:
def boltzmann_learning(environment, current_state, goal_state, Q):
    
    epsilon = 0.2
    action_lookup = {'up': 0, 'down': 1, 'left': 2, 'right': 3}
    steps = 0
    beta = 0.9 
    alpha = 0.01
    
    while(current_state != goal_state):
        
        max_q_exploit = 0
        max_q = 0
        next_states = {}

        possible_actions = get_possible_actions(environment, current_state)
        
        for action in possible_actions:
            if (action == 'up'):
                next_states['up'] = (current_state[0] - 1, current_state[1])
            if (action == 'down'):
                next_states['down'] = (current_state[0] + 1, current_state[1])
            if (action == 'left'):
                next_states['left'] = (current_state[0], current_state[1] - 1)
            if (action == 'right'):
                next_states['right'] = (current_state[0], current_state[1] + 1)

        r = random.uniform(0, 1)
        
        # Exploit
        if(r < epsilon):
            print ("Exploit: ", next_states)
            for action in possible_actions:
                if(max_q_exploit <= Q[(current_state, action)]):
                    max_q_exploit = Q[(current_state, action)]
                    next_action = action
            next_state = next_states[next_action]
        # Explore
        else:
            print ("Explore: ", next_states)
            next_action = possible_actions[(random.randrange(len(possible_actions)))]
            next_state = next_states[next_action]
        
        next_possible_actions = get_possible_actions(environment, next_state)
        
        for action in next_possible_actions:
            if (max_q <= Q[(current_state, action)]):
                max_q = Q[(current_state, action)]

        # Calculate Q
        Q[(current_state, next_action)] += alpha * (environment[current_state] + (beta * max_q) - Q[(current_state, next_action)])
        current_state = next_state
        steps += 1
    
    print("Steps: ", steps)

In [150]:
def main(R):
    epsilon_greedy(0, R)

In [109]:
def later():
    # reward file
    reward = "../reward.csv"

    # discount factor
    beta = 0.9 
    # learning rate
    alpha = 0.01
    # for greedy algorithm
    epsilon = 0.2

    #storing the q values
    Q = [[0.0 for i in range(4)] for j in range(100)]
    R = [[0.0 for i in range(4)] for j in range(100)]

    R = initialRewardValues(reward, R)

    # goal state
    goalState = 55
    # starting sate
    startState = 0

    main(R)

In [65]:
(environment, wall_coordinates, nega_coordinates, goal_coordinates) = load_environment()

In [138]:
later()

Iterations:  604
Check 1:  309
Check 2:  378


  This is separate from the ipykernel package so we can avoid doing imports until


In [182]:
Q = initialize_q()
epsilonGreedy(environment, (1, 1), (6, 6), Q)

Explore:  {'down': (2, 1), 'right': (1, 2)}
Explore:  {'down': (2, 2), 'left': (1, 1), 'right': (1, 3)}
Explore:  {'down': (2, 1), 'right': (1, 2)}
Explore:  {'up': (1, 1), 'down': (3, 1), 'right': (2, 2)}
Explore:  {'up': (2, 1), 'down': (4, 1)}
Exploit:  {'up': (3, 1), 'down': (5, 1), 'right': (4, 2)}
Explore:  {'down': (5, 2), 'left': (4, 1), 'right': (4, 3)}
Explore:  {'up': (4, 2), 'down': (6, 2), 'left': (5, 1), 'right': (5, 3)}
Explore:  {'up': (4, 1), 'down': (6, 1), 'right': (5, 2)}
Explore:  {'up': (3, 1), 'down': (5, 1), 'right': (4, 2)}
Exploit:  {'up': (4, 1), 'down': (6, 1), 'right': (5, 2)}
Explore:  {'up': (4, 2), 'down': (6, 2), 'left': (5, 1), 'right': (5, 3)}
Explore:  {'down': (5, 2), 'left': (4, 1), 'right': (4, 3)}
Explore:  {'up': (3, 1), 'down': (5, 1), 'right': (4, 2)}
Explore:  {'up': (2, 1), 'down': (4, 1)}
Explore:  {'up': (1, 1), 'down': (3, 1), 'right': (2, 2)}
Explore:  {'up': (2, 1), 'down': (4, 1)}
Exploit:  {'up': (3, 1), 'down': (5, 1), 'right': (4, 2