In [28]:
from IPython.display import clear_output
import numpy as np
import sys
import time
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from collections import defaultdict
from copy import copy
from tabulate import tabulate
init_notebook_mode(connected=True)

In [29]:
def render_env(location):
    world = ["O"] * 7
    world[location] = "🐭"
    print("{}--{}--{}--{}--{}--{}--{}".format(*world))

In [37]:
def calc_need(energy):
    need = -2 * energy + 200
    return np.min([need, 100])

In [38]:
def get_preferences(action_values, preference=0, debug=False):
    action_values = copy(action_values)
    if debug: print("Action Values", action_values)
    low = np.min(action_values)
    if debug: print("Lowest value", low)
    if low < 0:
        if debug: print('NEGATIVE')
        action_values += low
    # Bound lower values to 1.0
    action_values += 1
    avg = np.mean(action_values)
    averages = np.array([avg] * len(action_values))
    deltas = action_values - avg
    if debug: 
        print("Shifted Action Values", action_values)
        print("Average", avg)
        print("Averages array", averages)
        print("Deltas", deltas)
    deltas *= preference
    tmp = averages + deltas
    total = np.sum(tmp)
    prefs = tmp / total
    if debug: 
        print("Weighted deltas", deltas)
        print("Temp values", tmp)
        print("Total", total)
        print("Prefs", prefs)
    
    return prefs

In [39]:
def choose_action(q_table, energy, location):
    # The strength of our need is a function of how much energy we have
    need = calc_need(energy)
    print(need)
    action_vals = q_table[location]
    actions = range(len(action_vals))
    
    # Our chance of adhereing to existing preferences is a function of need
    preference = act_chance = need / 100
    action = 0
    # preference = 1 # Debug - Make agent adhere to its preferences fully
    prefs = get_preferences(action_vals, preference)
    print(act_chance)
    if np.random.random() < act_chance:
        action = np.random.choice(actions, p=prefs)
    
    return action

In [40]:
def step_env(location, action):
    cheese = False
    if action == 1:
        location = np.max([location - 1, 0])
    if action == 2:
        location = np.min([location + 1, 6])
    if location == 6 and action == 3:
        cheese = True
        
    return location, cheese

In [41]:
def update_tables(state_table, q_table, location, action, new_location, value, alpha, debug=False):
    
    # How do we think about this location generically?
    state_val = state_table[new_location]
    state_val = state_val + alpha * (value - state_val) # This should probably be a squared error
    state_table[new_location] = state_val
    
    # How do we think about the previous location generically?
    prev_state_val = state_table[location]
    prev_state_val = prev_state_val + alpha * (state_val - prev_state_val)
    state_table[location] = prev_state_val
    
    # How do we think about the action which we took at the previous location?
    q_val = q_table[location][action]
    if debug: print("Previous location-action val", q_val)
    q_table[location][action] = q_val + alpha * (state_val - q_val)
    
    return state_table, q_table

In [42]:
def interpret(state_table, state):
    """
    This should be a generic, learned function but we'll hard code some knowledge here that
    would normally be acquired on an evolutionary timescale.
    """
    
    energy, location, cheese = state
    
    # Cheese is great until we're full
    cheese_val = 0
    if cheese:
        cheese_val = 100 - energy
    
    # Things are good until we're hungry
    # https://www.desmos.com/calculator/zc8dstbyzy
    energy_val = -1 * np.exp(-1 * ((energy - 40) / 8.5)) + 1
    
    return state_table[location] + energy_val + cheese_val

In [43]:
# [stay, left, right, eat]
actions = [0, 1, 2, 3]
bmr = 0.1

move_cost = 0.1
cheese_nrg = 5

energy = 100
location = 0

render_env(location)
learning_rate = 0.1

energy_over_time = []
location_over_time = []

Q = defaultdict(lambda: np.zeros(4))
S = defaultdict(lambda: 0)

state_value_history = []
q_value_history = []


render = False
max_iters = 100000
cheese = False
for i in range(max_iters):
    # Record our progress
    energy_over_time.append(energy)
    location_over_time.append(location)

    # Choose an action
    action = choose_action(Q, energy, location)
    
    # State
    new_location, cheese = step_env(location, action)
    
    # Interpretation
    value = interpret(S, [energy, new_location, cheese])
    print(tabulate([[location, action, new_location, energy, cheese, value]], headers=["Location","Action","New Location","Energy", "Cheese", "Value"]))
    
    # Update our memories
    S, Q = update_tables(S, Q, location, action, new_location, value, learning_rate)
    
    # Record the history of memory states
    state_value_history.append(copy(S))
    q_value_history.append(copy(Q))
    
    # time.sleep(0.01)
    # clear_output(wait=True)
    if render:
        render_env(location)


    # Moving costs energy
    if new_location != location:
        energy -= move_cost
    # We've just eaten some cheese! Modify internal state to reflect that.
    if cheese:
        energy += cheese_nrg
    # Living is costly, pay the base metabolic rate
    energy -= bmr
    
    if energy <=0 or energy >= 200:
        print("DEAD")
        print("Iterations: ", i)
        break

    location = new_location

    
print("Done.")


🐭--O--O--O--O--O--O
0
0.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         0               0       100         0  0.99914
0.19999999999998863
0.0019999999999998864
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         0               0      99.9         0  1.09904
0.39999999999997726
0.003999999999999773
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         0               0      99.8         0  1.19895
0.5999999999999659
0.005999999999999659
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         0               0      99.7         0  1.29885
0.7999999999999545
0.007999999999999546
  Location    Actio

         4         0               4      81.1         0  2.54933
37.99999999999784
0.3799999999999784
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         4         0               4        81         0  2.64844
38.19999999999783
0.3819999999999783
  Location    Action    New Location    Energy    Cheese     Value
----------  --------  --------------  --------  --------  --------
         4         2               5      80.9         0  0.991867
38.599999999997806
0.3859999999999781
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         5         0               5      80.7         0  1.09086
38.799999999997794
0.3879999999999779
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         5         1               4      80.6         

0.3739999999999617
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         4         0               4      81.3         0  4.30692
37.59999999999616
0.3759999999999616
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         4         0               4      81.2         0  4.40605
37.799999999996146
0.3779999999999615
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         4         0               4      81.1         0  4.50517
37.999999999996135
0.37999999999996137
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         4         2               5        81         0  4.08831
38.39999999999611
0.3839999999999611
  Location    Action    New Location  

0.29999999999994886
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         5         0               5        85         0  6.06176
30.199999999994873
0.30199999999994875
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         5         0               5      84.9         0   6.1612
30.39999999999486
0.3039999999999486
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         5         0               5      84.8         0  6.26063
30.59999999999485
0.3059999999999485
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         5         0               5      84.7         0  6.36005
30.79999999999484
0.30799999999994837
  Location    Action    New Location 

         2         0               2      71.5         0  6.69582
57.19999999999334
0.5719999999999333
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         2         0               2      71.4         0  6.79307
57.39999999999333
0.5739999999999332
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         2         0               2      71.3         0  6.89029
57.599999999993315
0.5759999999999331
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         2         0               2      71.2         0  6.98748
57.799999999993304
0.577999999999933
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         2         0               2      71.1         0  7

75.39999999999254
0.7539999999999254
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         3               0      62.3         0  12.2813
75.59999999999255
0.7559999999999255
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         0               0      62.2         0  12.3731
75.79999999999255
0.7579999999999255
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         1               0      62.1         0  12.4649
75.99999999999255
0.7599999999999255
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         0               0        62         0  12.5566
76.19999999999256
0.7619999999999255
  Location    Action   

1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         1               0      48.9         0  16.7786
100.0
1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         0               0      48.8         0  16.8393
100.0
1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         1               0      48.7         0  16.8996
100.0
1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         3               0      48.6         0  16.9594
100.0
1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         0            

         1         3               1        39         0  12.9926
100.0
1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         1         1               0      38.9         0  15.3932
100.0
1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         3               0      38.7         0  15.3523
100.0
1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         0               0      38.6         0   15.322
100.0
1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         2               1      38.5         0  13.1532
100.0
1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  ----

  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         6         0               6      55.6         0   55.863
88.99999999999359
0.889999999999936
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         6         0               6      55.5         0  55.9451
89.1999999999936
0.891999999999936
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         6         0               6      55.4         0  56.0271
89.3999999999936
0.893999999999936
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         6         0               6      55.3         0  56.1088
89.5999999999936
0.895999999999936
  Location    Action    New Location    Energy    Cheese    Value
-

         6         0               6      98.7         0  72.1795
2.799999999991769
0.02799999999991769
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         6         0               6      98.6         0  72.2794
2.9999999999917577
0.02999999999991758
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         6         0               6      98.5         0  72.3793
3.1999999999917463
0.03199999999991746
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         6         0               6      98.4         0  72.4792
3.399999999991735
0.033999999999917346
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         6         0               6      98.3       

         5         0               5      90.5         0  41.2232
19.199999999990837
0.19199999999990836
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         5         0               5      90.4         0  41.3229
19.399999999990825
0.19399999999990825
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         5         0               5      90.3         0  41.4226
19.599999999990814
0.19599999999990814
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         5         0               5      90.2         0  41.5223
19.799999999990803
0.19799999999990803
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         5         0               5      90.1      

0.08199999999989757
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         6         1               5      95.9         0  44.8789
8.599999999989734
0.08599999999989734
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         5         0               5      95.7         0  44.9787
8.799999999989723
0.08799999999989723
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         5         0               5      95.6         0  45.0786
8.999999999989711
0.0899999999998971
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         5         0               5      95.5         0  45.1784
9.1999999999897
0.091999999999897
  Location    Action    New Location    E

0.4379999999998773
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         1         3               1      78.1         0  14.2562
43.99999999998772
0.4399999999998772
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         1         0               1        78         0   14.355
44.19999999998771
0.4419999999998771
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         1         0               1      77.9         0  14.4537
44.3999999999877
0.443999999999877
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         1         0               1      77.8         0  14.5524
44.59999999998769
0.4459999999998769
  Location    Action    New Location    Ene

0.7019999999998623
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         1         0               1      64.9         0  16.7666
70.39999999998622
0.7039999999998622
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         1         3               1      64.8         0  16.8606
70.59999999998621
0.7059999999998621
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         1         3               1      64.7         0  16.9546
70.7999999999862
0.707999999999862
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         1         1               0      64.6         0  17.3321
71.19999999998618
0.7119999999998617
  Location    Action    New Location    Ene

  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         0               0      44.2         0  17.6371
100.0
1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         0               0      44.1         0  17.6689
100.0
1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         3               0        44         0  17.6999
100.0
1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         0               0      43.9         0    17.73
100.0
1.0
  Location    Action    New Location    Energy    Cheese    Value
----------  --------  --------------  --------  --------  -------
         0         0               0

100.0
1.0
  Location    Action    New Location    Energy    Cheese     Value
----------  --------  --------------  --------  --------  --------
         0         3               0      22.5         0  -1.01734
100.0
1.0
  Location    Action    New Location    Energy    Cheese     Value
----------  --------  --------------  --------  --------  --------
         0         1               0      22.4         0  -1.79376
100.0
1.0
  Location    Action    New Location    Energy    Cheese     Value
----------  --------  --------------  --------  --------  --------
         0         0               0      22.3         0  -2.58054
100.0
1.0
  Location    Action    New Location    Energy    Cheese     Value
----------  --------  --------------  --------  --------  --------
         0         2               1      22.2         0  -1.09809
100.0
1.0
  Location    Action    New Location    Energy    Cheese     Value
----------  --------  --------------  --------  --------  --------
         1  

ValueError: probabilities are not non-negative

In [44]:
data = [
    go.Scatter(
        y=energy_over_time
    )
]

iplot(data)

In [45]:
data = [
    go.Scatter(
        y=location_over_time
    )
]

iplot(data)

In [46]:
data = []
for i in range(7):
    vals = [s[i] for s in state_value_history]
    trace = go.Scatter(
        y=vals
    )
    data.append(trace)

iplot(data)


In [47]:
data = []
for i in range(7):
    for j in range(4):
        vals = [q[i][j] for q in q_value_history]
        trace = go.Scatter(
            y=vals,
            name="State {} action {}".format(i, j)
        )
        data.append(trace)

iplot(data)

Food - Too little. You Die. Too much. You die.
Need - Motivation to find food.

### State, Interpretation, Value, Action

A state is composed of the observable portions of the environment and the agent's internal state.

Interpretation of that state results in an evaluation which may be positive, neutral, negative, or some combination of all three.

The agent then either does nothing or selects an action based on the evaluation of its current state, its policy, and its current needs.

Imagine we have a mouse. We want to this mouse to learn to solve a maze. Now this maze is going going to be very simple. It's just going to be a long hallway.

The mouse will start at one side of the hallway and there will be cheese at the other end.

To make this simple to program we're going to make the hallway a series of seven sections. The mouse will start in the first section and the cheese will be in the last section, the end of the hallway.

Now imagine that when we put the mouse in the maze it's actually full, and warm and happy. Do you expect the mouse to move a lot, a little, or not much at all?

Depending on just how comfortable the mouse is it probably won't move much. But it takes energy just to stay alive and so after a while the mouse is going to start to get hungry.

Here we've identified two important concepts. The energy level of the mouse and how hungry the mouse is.

As the energy reserves of the mouse decrease the hunger of the mouse will increase.

When the energy reserves of the mouse are full it doesn't need to move much and so probably won't. (Real mice have other drives, like curiosity, but this is a very dull mouse).

Now real mice can move in any direction, control hundreds of muscles, and do all sorts of amazing things. Our virtual mouse is going to be much more limited. It can do 1 of 4 things. It can do nothing, move left, move right, or gnaw on something around it.

As mentioned earlier when energy is high it will probably do nothing, but as it gets hungry it will select from one of the other things to do more frequently. Real mice probably have innate preferences for which one they would do more, but here we'll start out choosing randomly between the other options. 

As the mouse's hunger increases and it starts to move more and more it may eventually find itself (by chance) in the last section of the hallway where it decides to gnaw on something. This results in a revelation! Cheese! 

The act of gnawing on things in the last section results in the delicious taste of cheese. 

Because the mouse is still hungry it takes another bite and another until it's very full. Now imagine what happens to the level of enjoyment of eating the cheese as the mouse eats more and more. We would expect it to decrease. Eventually if the mouse is full to bursting, if it took another bite of cheese that bite might even taste unpleasant.

Once the mouse has eaten its fill it will stop eating.

Over time the mouse will digest and (just by living) use energy and get hungry again. When it's not full but not yet very hungry it might explore the maze a bit more, moving more or less randomly. As soon as it gets actually hungry however the mouse should immediately find its way back to the cheese (don't worry the cheese can't run out in our little world) and eat as much as it wants.

Finally, to say that the mouse has learned how to solve this maze, if we put a hungry version of the mouse anywhere in the maze it should immediately run to the cheese and start eating without going the wrong direction.

If it can it has learned how to fulfil its need.


