In [1]:
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from tensorforce.environments import Environment
from tensorforce.agents import Agent
from tensorforce.execution import Runner

Main game

In [2]:
def get_val(mean,var):
    """Draw one noisy measurement centred on ``mean``.

    The value is uniformly distributed on ``[mean - var, mean + var)``: despite
    its name, ``var`` acts as the half-width of the uniform noise, not as a
    statistical variance.

    Args:
        mean: Centre of the distribution.
        var: Half-width of the uniform noise added to ``mean``.

    Returns:
        A single float sample (consumes one draw from numpy's global RNG).
    """
    # np.random.random() lies in [0, 1); shift it so it is centred on zero.
    centred_draw = np.random.random() - 0.5
    return mean + var * 2.0 * centred_draw

class CustomEnvironment(Environment):
    """Environment for a "which sample should I re-measure?" game.

    There are ``N`` samples. Measuring sample ``i`` yields
    ``get_val(mean=clist[i], var=vlist[i])``; sample ``i`` counts as "bad" when
    ``bad_list[i]`` is true. Every game starts with two measurements of each
    sample, after which each action selects one sample to measure again.

    State (float vector of length ``2 * N``):
        * entries ``[0, N)``: variance of the measurements taken so far for
          each sample;
        * entries ``[N, 2N)``: number of times each sample was picked in the
          current game, divided by ``max_turns``.

    Reward:
        * non-terminal step: 1 if the picked sample is bad, otherwise 0;
        * terminal step (``turn >= max_turns``): the per-step point is
          discarded and the reward is 100 for every bad sample that has been
          picked at least ``min_bad_picks`` times.

    Args:
        N: Number of samples.
        clist: Mean of each sample's measurement distribution.
        vlist: Noise half-width of each sample (passed as ``var`` to ``get_val``).
        max_turns: Number of actions per game.
        bad_list: Per-sample flags marking the bad samples.
        min_bad_picks: Minimum number of picks a bad sample needs to earn the
            end-of-game bonus (defaults to 20).
    """

    def __init__(self, N=3, clist = [10., 10., 10.], vlist = [3.0,4.0,5.0], max_turns = 30, bad_list = [False,False,True], min_bad_picks=20):
        super().__init__()
        self.N = N
        # The list arguments are never mutated: reset() only rebinds these
        # attributes to freshly shuffled tuples.
        self.clist = clist
        self.vlist = vlist
        self.max_turns = max_turns
        self.min_bad_picks = min_bad_picks
        self.turn = 0
        self.picked_count = np.zeros(self.N)
        self.measured_list = []
        self.bad_list = bad_list

        self.reset()

    def current_mean(self):
        """Return a list with the mean of the measurements of each sample."""
        return [np.mean(sublist) for sublist in self.measured_list]

    def current_stddev(self):
        """Return a list with the (population) standard deviation per sample."""
        return [np.std(sublist) for sublist in self.measured_list]

    def current_norm_stddev(self):
        """Return the per-sample standard deviations scaled by their maximum.

        Returns:
            A numpy array whose largest entry is 1.0. If every standard
            deviation is zero, an all-zero array is returned instead of
            dividing by zero.
        """
        stddevs = np.asarray(self.current_stddev(), dtype=float)
        peak = stddevs.max()
        if peak == 0:
            return np.zeros_like(stddevs)
        return stddevs / peak

    def states(self):
        """Describe the state: a float vector of length ``2 * N``."""
        return dict(type='float', shape=(self.N*2,))

    def actions(self):
        """Describe the action: an integer sample index in ``[0, N)``."""
        return dict(type='int', num_values=self.N)

    def reset(self):
        """Start a new game and return its initial state.

        The samples are shuffled (means, noise widths and bad flags stay
        paired), all per-game bookkeeping is cleared, and two initial
        measurements are taken for every sample.
        """
        # Shuffle the clist/vlist/bad_list order together; the values themselves
        # are unchanged.
        z = list(zip(self.clist, self.vlist, self.bad_list))
        np.random.shuffle(z)
        self.clist, self.vlist, self.bad_list = zip(*z)
        self.turn = 0
        # Pick counts belong to a single game and to the pre-shuffle ordering,
        # so they must start from zero; otherwise counts from earlier games
        # leak into the state and into the end-of-game bonus check.
        self.picked_count = np.zeros(self.N)

        # Take the first two measurements of every sample.
        self.measured_list = [
            [get_val(mean=self.clist[i], var=self.vlist[i]) for _ in range(2)]
            for i in range(self.N)
        ]

        # First half of the state: current variance of each sample.
        # Second half: normalised pick counts.
        self.state = np.zeros(2*self.N)
        for i in range(self.N):
            self.state[i] = np.var(self.measured_list[i])
            self.state[self.N+i] = self.picked_count[i]/float(self.max_turns)

        # Hand out a copy: self.state is updated in place by execute().
        return self.state.copy()

    def execute(self, actions):
        """Measure the chosen sample once more and advance the game by a turn.

        Args:
            actions: Index of the sample to measure (anything ``int()`` accepts).

        Returns:
            Tuple ``(next_state, terminal, reward)``; see the class docstring
            for the layout of the state and the reward rules.
        """
        # Convert once so every list, tuple and array below is indexed with the
        # same plain integer.
        action = int(actions)

        # Take another measurement of the chosen sample.
        this_val = get_val(mean=self.clist[action], var=self.vlist[action])
        self.picked_count[action] += 1.0
        self.measured_list[action].append(this_val)

        # Only the two state entries belonging to this sample change.
        self.state[action] = np.var(self.measured_list[action])
        self.state[self.N+action] = self.picked_count[action] / self.max_turns

        self.turn += 1
        reward = 1 if self.bad_list[action] else 0  # 1 point for a bad sample

        terminal = bool(self.turn >= self.max_turns)
        if terminal:
            # On the last turn the per-step point is replaced by the bonus for
            # every bad sample that reached the minimum number of picks.
            reward = 0.0
            for i in range(self.N):
                if self.bad_list[i] and self.picked_count[i] >= self.min_bad_picks:
                    reward += 100

        # Return a copy so states kept by the caller are not changed by later
        # in-place updates.
        return self.state.copy(), terminal, reward

Testing Policies

In [3]:
def play_sequential_game(N,clist,vlist,bad_list,max_turns):
    """Play one game with a round-robin policy and return the total reward.

    The policy ignores the state and measures the samples in the fixed order
    0, 1, ..., N-1, 0, 1, ... until the environment reports the terminal step.

    Args:
        N: Number of samples.
        clist: Mean of each sample.
        vlist: Noise half-width of each sample.
        bad_list: Per-sample flags marking the bad samples.
        max_turns: Number of actions in the game.

    Returns:
        Sum of the rewards returned by the environment over the whole game.
    """
    env = CustomEnvironment(N=N,
            clist=clist,
            vlist=vlist,
            bad_list = bad_list,
            max_turns=max_turns)

    sum_points = 0
    game_terminated = False

    turn = 0
    while not game_terminated:
        # Use the counter before incrementing it so the sweep starts at sample 0.
        _, game_terminated, next_reward = env.execute(turn % env.N)
        sum_points += next_reward
        turn += 1
    return sum_points

def play_omniscent_game(N,clist,vlist,bad_list,max_turns,nominal_mean=10.):
    """Play one game always measuring the sample that looks most off-nominal.

    On every turn the policy picks the sample whose current measured mean is
    farthest from ``nominal_mean``. It only uses the measurements exposed by
    the environment; it does not read the environment's bad-sample flags.

    Args:
        N: Number of samples.
        clist: Mean of each sample.
        vlist: Noise half-width of each sample.
        bad_list: Per-sample flags marking the bad samples.
        max_turns: Number of actions in the game.
        nominal_mean: Reference value the measured means are compared with
            (defaults to 10.0, the value previously hard-coded here).

    Returns:
        Sum of the rewards returned by the environment over the whole game.
    """
    env = CustomEnvironment(N=N,
            clist=clist,
            vlist=vlist,
            bad_list = bad_list,
            max_turns=max_turns)

    sum_points = 0
    game_terminated = False

    while not game_terminated:
        deviation = np.abs(nominal_mean - np.asarray(env.current_mean()))
        omniscent_guess = np.argmax(deviation)
        _, game_terminated, next_reward = env.execute(omniscent_guess)
        sum_points += next_reward
    return sum_points

def play_exploring_game(N,clist,vlist,max_turns,bad_list,explore_frac=.1):
    """Play one game with a "largest spread, sometimes explore" policy.

    On each turn, with probability ``explore_frac`` the least-picked sample is
    measured; otherwise the sample with the largest normalised standard
    deviation is measured.

    NOTE: ``max_turns`` comes *before* ``bad_list`` here, the opposite of
    ``play_sequential_game`` and ``play_omniscent_game``. Passing both by
    keyword avoids mixing them up.

    Args:
        N: Number of samples.
        clist: Mean of each sample.
        vlist: Noise half-width of each sample.
        max_turns: Number of actions in the game (a scalar).
        bad_list: Per-sample flags marking the bad samples.
        explore_frac: Probability of taking an exploration step on a turn.

    Returns:
        Sum of the rewards returned by the environment over the whole game.

    Raises:
        TypeError: If ``max_turns`` is not a scalar, which is what happens when
            the arguments are passed in the sibling functions' order.
    """
    # Fail early with a clear message instead of an obscure error from inside
    # the environment when max_turns and bad_list are swapped.
    if np.ndim(max_turns) != 0:
        raise TypeError(
            "play_exploring_game expects max_turns (a number) before bad_list "
            "(a sequence of flags); pass them by keyword to avoid mixing them up"
        )

    env = CustomEnvironment(N=N,
            clist=clist,
            vlist=vlist,
            bad_list = bad_list,
            max_turns=max_turns)

    sum_points = 0
    game_terminated = False

    while not game_terminated:
        if np.random.random() < explore_frac:
            # Exploration step: measure the sample picked least often so far.
            best_guess = np.argmin(env.picked_count)
        else:
            # Exploitation step: measure the sample with the largest spread.
            best_guess = np.argmax(env.current_norm_stddev())

        _, game_terminated, next_reward = env.execute(best_guess)
        sum_points += next_reward

    return sum_points

    

In [4]:
# Game set-up: N samples with the same mean; the first n_bad of them are "bad".
N = 30
n_bad = 5
max_turns = 400

# Derive every per-sample list from N so the sizes cannot drift apart.
clist = N*[10.0]
vlist = np.ones(N)*0.2
vlist[:n_bad] *= 20.0 #20 times higher variance in bad ones
bad_list = N*[False]
bad_list[:n_bad] = n_bad*[True]

print (f'sequential score {play_sequential_game(N, clist, vlist, bad_list, max_turns)}')
print (f'omniscent score {play_omniscent_game(N, clist, vlist, bad_list, max_turns)}')
# play_exploring_game takes max_turns *before* bad_list, so pass both by keyword.
#print (f'exploring score {play_exploring_game(N, clist, vlist, max_turns=max_turns, bad_list=bad_list)}')


sequential score 68.0
omniscent score 678.0
