In [1]:
import numpy as np
from collections import defaultdict
from tabulate import tabulate

In [2]:
# A world of infinite cheese
class World(object):
    
    def __init__(self):
        self.cheese = False
    
    def step(self, eat):
        if eat:
            self.cheese = True
        else:
            self.cheese = False
        
        return self.cheese

In [70]:
class Mouse(object):
    def __init__(self, value_learning_rate=0.1, action_learning_rate=0.01):
        self._value_learning_rate = value_learning_rate
        self._action_learning_rate = action_learning_rate
        
        self.reset()
    
    def reset(self):
        self.energy = self.initial_energy = 5
        self.cheese = self.initial_cheese = None
        self.action = self.initial_action = None
        self.state = None
        self.previous_state = None
        
        self._learned_value_table = {}
        self._action_table = {}
        for i in range(11):
            for v in [True, False]:
                k = (i, v)
                self._learned_value_table[k] = 0
                self._action_table[k] = 0.5
    
    @staticmethod
    def _update_energy(energy, cheese):
        if cheese:
            energy += 1
        else:
            energy -= 1
        
        return np.clip(energy, 0, 10)
    
    @staticmethod
    def _innate_evaluation(current_state, full_point=7):
        value = 0
        energy, cheese = current_state
        if cheese:
            value = -energy + full_point
        elif energy < 4:
            value = energy - 4
        return value
    
    def _update_learned_value_table(self, previous_state, value_difference, learning_rate, debug=True):
        previous_value = self._learned_value_table[previous_state]
        new_value = previous_value + learning_rate * value_difference
        self._learned_value_table[previous_state] = new_value
        if debug:
            print("---- Update Learned Value Table")
            print("Previous State:", previous_state)
            print("Action:", self.action)
            print("Previous Value:", previous_value)
            print("Value Difference:", value_difference)
            print("New Value:", new_value)

    def _update_action_table(self, previous_state, value_difference, learning_rate, debug=True):
        if self.action:
            previous_strength = self._action_table[previous_state]
            new_strength = previous_strength + learning_rate * value_difference
            new_strength = np.clip(new_strength, 0.01, 0.99)
            self._action_table[previous_state] = new_strength
            if debug:
                print("---- Update Action Table")
                print("Previous State:", previous_state)
                print("Previous Strength:", previous_strength)
                print("Value Difference:", value_difference)
                print("New Strength:", new_strength)
        else:
            if debug:
                print("Did not act last step. Nothing to change.")
        
    def _learn(self, previous_state, value, debug=False):
        value_difference = value - self._learned_value_table[previous_state]
        self._update_learned_value_table(
            previous_state, 
            value_difference, 
            self._value_learning_rate,
            debug=debug
        )
        self._update_action_table(
            previous_state, 
            value_difference, 
            self._action_learning_rate,
            debug=debug
        )
    
    def _get_action(self, cheese):
        eat = False
        act_chance = self._action_table[(self.energy, cheese)]
        if np.random.random() < act_chance:
            eat = True
        return eat
    
    def _dict_to_table(self, d):
        int_d = defaultdict(list)
        for k, v in d.items():
            p1, p2 = k
            int_d[p1].append(v)
        
        return [[k] + v for k, v in int_d.items()]
    
    def _display_table(self, d_table, headers):
        t = self._dict_to_table(d_table)
        print()
        print(tabulate(t, headers=headers))
        print()
    
    def display_knowledge(self):
        print("---- Learned Value Table")
        self._display_table(self._learned_value_table, headers=["Energy", "Cheese", "No Cheese"])
        print("---- Action Table")
        self._display_table(self._action_table, headers=["Energy", "Cheese", "No Cheese"])
    
    def step(self, obseravation, debug=False):
        cheese = observation # Boolean if the mouse tastes cheese or not
        if debug: print("Initial energy value", self.energy)
        next_energy = self._update_energy(self.energy, cheese)
        if debug: print("Updated energy value", next_energy)
        self.previous_state = self.state
        self.state = (next_energy, cheese)
        
        if debug: 
            print("Previous State: (Energy, Cheese)", self.previous_state)
            print("Previous Action:", self.action)
            print("Current State: (Energy, Cheese)", self.state)
        if self.previous_state is not None:
            if debug: print("Evaluating and learning ...")
            v_innate = self._innate_evaluation(self.state)
            if debug: print("Innate Value:", v_innate)
            v_learned = self._learned_value_table[self.state]
            if debug: print("Learned Value:", v_learned)
            value = v_innate + v_learned
            if debug: print("Total Value:", value)
            self._learn(self.previous_state, value, debug)
            if debug: self.display_knowledge()
        
        next_action = self._get_action(cheese)
        if debug: 
            print("Action Chosen:", next_action)
            print("=" * 50)
        self.action = next_action
        self.cheese = cheese
        self.energy = next_energy
        return self.action
        
        

In [71]:
world = World()
mouse = Mouse(value_learning_rate=0.1, action_learning_rate=0.05)

In [74]:
mouse.reset()
observation = False # No cheese to begin with
for i in range(100):
    action = mouse.step(observation, debug=False)
    observation = world.step(action)

In [75]:
mouse.display_knowledge()

---- Learned Value Table

  Energy     Cheese    No Cheese
--------  ---------  -----------
       0   0          2.16722
       1   0.570196   0.604757
       2   0.911228  -0.106198
       3   0.366582   0.0117421
       4   0.281248   0.0786253
       5   0.272385   0.0221485
       6   0          6.2857e-05
       7  -0.1        6.2857e-06
       8  -0.2       -0.3438
       9  -0.0562     0
      10   0          0

---- Action Table

  Energy    Cheese    No Cheese
--------  --------  -----------
       0  0.5          0.748361
       1  0.612418     0.605742
       2  0.593849     0.564713
       3  0.561733     0.54638
       4  0.532109     0.520927
       5  0.513524     0.5
       6  0.5          0.5
       7  0.495        0.5
       8  0.49         0.4809
       9  0.5          0.5
      10  0.5          0.5

