In [3]:
from gym.spaces import Discrete, Tuple
import random
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy.ndimage.measurements import label
import gym
from gym import error, spaces, utils
from gym.utils import seeding
import numpy as np
import copy
class PackEnv2(gym.Env):
    """Gym environment in which binary pieces are placed onto a 2-D board.

    The internal state is ``[board, piece]``: two arrays of shape
    ``board_shape``.  The observation handed to the agent
    (``return_state``) is the two arrays concatenated side by side, i.e. an
    array of shape ``(h, 2 * w)``.

    Actions are integers in ``[0, h * w]``.  An action ``a < h * w`` anchors
    the top-left corner of the current piece at row ``a // w`` and column
    ``a % w``; the action ``h * w`` means "do nothing".

    Parameters
    ----------
    board_shape : tuple
        ``(h, w)`` of the board.
    input_shapes : list of 2-D nested lists
        Pieces, written from their top-left corner.  Each is copied into the
        top-left of a board-sized zero array.  When empty, a single 1x1 piece
        is used.
    max_moves : int
        The episode is ended when the move counter reaches this value.
    replacement : bool
        When False every piece is handed out at most once per episode.
    """
    metadata = {'render.modes': ['human']}

    # Reward constants (same values the environment has always used).
    PASS_REWARD = -.05
    INVALID_MOVE_REWARD = -50000000
    BOARD_FULL_REWARD = 1000000
    BOARD_NOT_FULL_REWARD = -1000000

    def __init__(self, board_shape = (5, 5), input_shapes=[],max_moves=100, replacement=True):
        self.max_moves = max_moves
        self.board_shape = board_shape
        # NOTE(review): gym expects a `Space` here (e.g. spaces.Box); a plain
        # array is kept so existing users of `.shape` are unaffected.
        self.observation_space = np.zeros((board_shape[0], board_shape[1]*2))
        self.action_space = Discrete(board_shape[0]*board_shape[1]+1)
        self.replace = replacement

        self.num_possible_moves = board_shape[0]*board_shape[1]

        if len(input_shapes) == 0:
            unit_piece = np.zeros(board_shape)
            unit_piece[0][0] = 1
            self.shapes = [unit_piece]
        else:
            self.shapes = []
            for shape in input_shapes:
                # Embed the piece in the top-left corner of a board-sized array.
                base_mat = np.zeros(board_shape)
                for i in range(len(shape)):
                    for j in range(len(shape[0])):
                        base_mat[i][j] = shape[i][j]
                self.shapes.append(base_mat)

        # Deal the first piece and build `return_state` from it, so the
        # observation is consistent with `state` even before reset() is called.
        self._start_episode()

    def _start_episode(self):
        """Reset counters, clear the board and deal the first piece."""
        self.counter = 0
        self.done = False
        self.reward = 0
        self.remaining_shapes = copy.deepcopy(self.shapes)
        self.state = [np.zeros(self.board_shape), np.zeros(self.board_shape)]
        self._draw_piece()
        self._refresh_return_state()

    def _draw_piece(self):
        """Pick a random remaining piece as the current piece.

        Without replacement the chosen piece is removed from the pool.
        """
        val = random.randrange(len(self.remaining_shapes))
        self.state[1] = self.remaining_shapes[val]
        if not self.replace:
            self.remaining_shapes.pop(val)

    def _refresh_return_state(self):
        """Rebuild the (h, 2*w) observation from the current board and piece."""
        self.return_state = np.concatenate((self.state[0], self.state[1]), axis=1)

    def _result(self):
        """Bundle the current observation, reward and done flag as step() returns them."""
        return [self.return_state, self.reward, self.done, {}]

    def _decode(self, target):
        """Convert a flat action index into the (row, column) of the anchor cell.

        Row-major layout: the row is ``target // w``.  (Dividing by the
        height instead only gives the right answer on square boards.)
        """
        return divmod(target, self.board_shape[1])

    def _piece_cells(self, target):
        """Return row and column index arrays of the cells the piece would cover."""
        row0, col0 = self._decode(target)
        rows, cols = np.nonzero(self.state[1] == 1)
        return rows + row0, cols + col0

    def reset(self):
        """Start a new episode and return the initial observation."""
        self._start_episode()
        return self.return_state


    def valid_move(self, target):
        """Return True if the current piece can be anchored at `target`.

        The "do nothing" action (``h * w``) is always valid.  Any other move
        is valid only if every filled cell of the piece lands inside the
        board on a cell that is not already filled.
        """
        h = self.board_shape[0]
        w = self.board_shape[1]

        #do nothing
        if target == h * w:
            return True

        if target > h*w or target < 0:
            return False

        rows, cols = self._piece_cells(target)
        if rows.size == 0:
            return True
        if rows.max() >= h or cols.max() >= w:
            return False
        return not np.any(self.state[0][rows, cols] == 1)


    def calculate_reward(self, target):
        """Score the board around the anchor cell of `target`.

        Returns ``PASS_REWARD`` for the "do nothing" action.  Otherwise the
        board is split into 4-connected components and the component labelled
        at the anchor cell is scored as ``size**2 / bounding_box_area``.

        step() calls this after the piece has been merged so that the anchor
        cell belongs to the component containing the new piece.

        NOTE(review): if the piece's top-left cell is empty and the board is
        empty there too, the anchor is labelled 0 and the empty cells are
        scored instead - confirm whether such pieces are expected.
        """
        board = self.state[0]
        if target == self.num_possible_moves:
            return self.PASS_REWARD

        #connection structure
        structure = np.array([[0,1,0],[1,1,1],[0,1,0]])
        labeled, ncomponents = label(board, structure)
        row, col = self._decode(target)
        rows, cols = np.nonzero(labeled == labeled[row][col])

        size = rows.size
        block_size = (rows.max() - rows.min() + 1) * (cols.max() - cols.min() + 1)
        return size**2/block_size

    def merge(self, target):
        """Write the current piece onto the board at `target` (in place).

        Returns the board array.  The "do nothing" action leaves it unchanged.
        The move is assumed to have passed valid_move() already.
        """
        board = self.state[0]
        h = self.board_shape[0]
        w = self.board_shape[1]

        #do nothing
        if target == h * w:
            return board

        rows, cols = self._piece_cells(target)
        board[rows, cols] = 1
        return board

    def final_reward(self):
        """Return the end-of-episode reward: large bonus iff the board is completely filled."""
        h = self.board_shape[0]
        w = self.board_shape[1]
        if np.sum(self.state[0]) == h*w:
            return self.BOARD_FULL_REWARD
        return self.BOARD_NOT_FULL_REWARD


    def step(self, target):
        """Apply one action and return ``[observation, reward, done, info]``.

        Order of checks:
        1. Episode already over: report the final reward again.
        2. Action index larger than ``h * w``: rejected without counting a move.
        3. The move counter reaches ``max_moves``: the episode ends with the
           final reward; the action itself is not applied.
        4. Invalid placement: large penalty, state unchanged.
        5. Otherwise the piece is merged and scored; then either the episode
           ends (no pieces left) or the next piece is drawn.
        """
        if self.done:
            self.reward = self.final_reward()
            print("It's over")
            return self._result()
        if target > self.num_possible_moves:
            print("Impossible. Invalid position")
            return self._result()

        self.counter += 1
        if self.counter == self.max_moves:
            self.done = True
            self.reward = self.final_reward()
            return self._result()

        if not self.valid_move(target):
            self.reward = self.INVALID_MOVE_REWARD
            return self._result()

        # Merge first, then score: the reward must see the piece just placed.
        self.state[0] = self.merge(target)
        self.reward = self.calculate_reward(target)

        #do nothing so same state
        if target == self.num_possible_moves:
            return self._result()

        #no pieces left so we're done
        if len(self.remaining_shapes) == 0:
            self.state[1] = np.zeros(self.board_shape)
            self._refresh_return_state()
            self.done = True
            self.reward = self.final_reward()
            return self._result()

        self._draw_piece()
        self._refresh_return_state()
        return self._result()


    def _show_grid(self, grid):
        """Draw a 0/1 grid as a white/black image on a new matplotlib figure."""
        fig, ax = plt.subplots()
        # define the colors
        cmap = mpl.colors.ListedColormap(['w', 'k'])

        # create a normalize object that describes the limits of
        # each color
        bounds = [0., 0.5, 1.]
        norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

        # plot it
        ax.imshow(grid, interpolation='none', cmap=cmap, norm=norm)

    def render(self, mode='human'):
        """Plot the board (`mode` is accepted for gym compatibility and not used)."""
        self._show_grid(self.state[0])

    def render_piece(self, mode='human'):
        """Plot the current piece (`mode` is not used)."""
        self._show_grid(self.state[1])

In [4]:
%%capture
import gym
#import gym_pack
import numpy as np
import random
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
import keras

from collections import deque

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Reshape
from keras.optimizers import Adam
from keras import layers, models

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

In [None]:
class PrupeQN:
    """Epsilon-greedy Q-learning agent with an online and a target network.

    The agent keeps a replay memory of ``[state, action, reward, new_state,
    done]`` records, trains the online model on random mini-batches
    (`replay`) and slowly blends the online weights into the target model
    (`target_train`).

    NOTE(review): `create_model` builds a two-input network with a single
    output unit, while `act`, `trained_act` and `replay` feed one tensor of
    shape ``STATE_SHAPE`` and index the prediction by action.  These two
    designs need to be reconciled before training can run end to end.
    """

    # Shape every state is reshaped to before it is passed to a model.
    STATE_SHAPE = (1, 2, 4, 1)

    def __init__(self, env):
        self.env     = env
        self.memory  = deque(maxlen=10000)
        
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = 0.999
        self.learning_rate = 0.01
        self.tau = .08
        self.min_tau = .02
        self.tau_decay = .999

        self.model        = self.create_model()
        self.target_model = self.create_model()
    
    def grid_preprocess(self, board):
        """Placeholder for board preprocessing; currently returns None."""
        return

    #input should be a binary array describing which action we are taking
    #along with the current state space which is made smaller by solving 
    #using subproblems and then put through several convolutional layers
    #then this is put through several fully connected layers and finally
    #output through a single node
    def create_model(self):
        """Build and compile the two-input network.

        A (3, 6, 1) grid goes through two 5x5 convolutions, max-pooling and a
        Dense(10) layer; the result is concatenated with a length-10 vector
        and passed through Dense(24), Dense(48) and a single ReLU output.
        """
        input1 = layers.Input(shape=(3,6,1))
        input2 = layers.Input(shape=(10,))
        conved1 = layers.Conv2D(32, (5, 5),padding="same",input_shape=(3,6,1), activation="relu")(input1)
        conved2 = layers.Conv2D(32, (5, 5),padding="same",input_shape=(3,6,1), activation="relu")(conved1)
        compressed = layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same')(conved2)
        x = Flatten()(compressed)
        processed = Dense(10, activation="relu")(x)

        merged = keras.layers.Concatenate(axis=1)([processed, input2])
        a = Dense(24, activation="relu")(merged)
        b = Dense(48, activation="relu")(a)
        output = Dense(1, activation="relu")(b)
        # `outputs` is the functional-API keyword; `output` is only accepted
        # by Keras' legacy-argument shim.
        model = keras.models.Model(inputs=[input1, input2], outputs=output)
        model.compile(loss="mean_squared_error",
            optimizer=Adam(lr=.01))
        # summary() prints by itself and returns None, so it is not wrapped in print().
        model.summary()
        return model

    def _masked_predictions(self, state):
        """Predict per-action values and push invalid moves far below the rest."""
        predictions = self.model.predict(state.reshape(self.STATE_SHAPE))[0]
        for i in range(len(predictions)):
            if not self.env.valid_move(i):
                predictions[i] -= 100000000
        print(predictions)
        return predictions

    def act(self, state):
        """Choose an action epsilon-greedily.

        Epsilon is decayed (down to `epsilon_min`) on every call.  With
        probability epsilon a random action is sampled from the environment;
        otherwise the valid action with the highest predicted value is used.
        """
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self._masked_predictions(state))
    
    def trained_act(self, state):
        """Choose the best valid action greedily, never the last ("do nothing") one."""
        predictions = self._masked_predictions(state)
        predictions[-1] -=1000000000
        return np.argmax(predictions)

    def remember(self, state, action, reward, new_state, done):
        """Append one transition to the replay memory."""
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        """Train the online model on a random batch of 32 remembered transitions.

        Does nothing until the memory holds at least one full batch.  The
        target for the taken action is the reward, plus the discounted best
        target-model value of the next state when the episode continued.
        """
        batch_size = 32
        if len(self.memory) < batch_size: 
            return

        samples = random.sample(self.memory, batch_size)
        for sample in samples:
            state, action, reward, new_state, done = sample
            # Use the same reshaped tensor for predicting and for fitting.
            state_in = state.reshape(self.STATE_SHAPE)
            target = self.target_model.predict(state_in)
            if done:
                target[0][action] = reward
            else:
                Q_future = max(self.target_model.predict(new_state.reshape(self.STATE_SHAPE))[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.fit(state_in, target, epochs=1, verbose=0)

    def target_train(self):
        """Blend the online weights into the target model (soft update).

        ``target = tau * online + (1 - tau) * target``; tau is decayed on each
        call but never below `min_tau`.
        """
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        self.tau = max(self.min_tau, self.tau*self.tau_decay)
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model.set_weights(target_weights)

    def save_model(self, fn):
        """Save the online model to the path `fn`."""
        self.model.save(fn)

def main():
    """Train an agent on a 2x2 packing task and return ``(env, agent)``.

    The environment hands out two 1x1 pieces and one 1x2 piece without
    replacement.  Each trial resets the environment and runs up to
    `trial_len` steps; after every step the transition is stored, the online
    model is trained on a replay batch and the target model is soft-updated.
    """
    # The environment and agent classes defined in this notebook.
    env = PackEnv2(board_shape=(2,2), input_shapes=[[[1]]]*2+[[[1,1],[0,0]]], replacement=False, max_moves=4)

    trials  = 500
    trial_len = 50

    dqn_agent = PrupeQN(env=env)
    for trial in range(trials):
        cur_state = env.reset().reshape(1,2,4,1)
        for step in range(trial_len):
            action = dqn_agent.act(cur_state)
            new_state, reward, done, _ = env.step(action)

            # Clip the reward to its sign.  np.sign maps 0 to 0, whereas
            # reward/abs(reward) would divide by zero.
            # NOTE(review): the terminal reward is replaced by -.01 whether or
            # not the board was filled - confirm this is intended.
            reward = float(np.sign(reward)) if not done else -.01
            new_state = new_state.reshape(1,2,4,1)
            dqn_agent.remember(cur_state, action, reward, new_state, done)
            
            dqn_agent.replay()       # internally iterates default (prediction) model
            dqn_agent.target_train() # iterates target model

            cur_state = new_state
            if done:
                break
        print("trial #{}".format(trial))
    return env, dqn_agent

In [18]:
# Build the two-input network: a (3, 6, 1) grid branch (two 5x5 convolutions,
# max-pooling, Dense(10)) concatenated with a length-10 vector, followed by
# Dense(24), Dense(48) and a single ReLU output unit.
input1 = layers.Input(shape=(3,6,1))
input2 = layers.Input(shape=(10,))
conved1 = layers.Conv2D(32, (5, 5),padding="same",input_shape=(3,6,1), activation="relu")(input1)
conved2 = layers.Conv2D(32, (5, 5),padding="same",input_shape=(3,6,1), activation="relu")(conved1)
compressed = layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same')(conved2)
x = Flatten()(compressed)
processed = Dense(10, activation="relu")(x)

merged = keras.layers.Concatenate(axis=1)([processed, input2])
a = Dense(24, activation="relu")(merged)
b = Dense(48, activation="relu")(a)
output = Dense(1, activation="relu")(b)
# `outputs` is the functional-API keyword; `output` is only accepted by
# Keras' legacy-argument shim.
model = keras.models.Model(inputs=[input1, input2], outputs=output)
model.compile(loss="mean_squared_error",
    optimizer=Adam(lr=.01))
# summary() prints by itself and returns None, so it is not wrapped in print().
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_20 (InputLayer)           (None, 3, 6, 1)      0                                            
__________________________________________________________________________________________________
conv2d_19 (Conv2D)              (None, 3, 6, 32)     832         input_20[0][0]                   
__________________________________________________________________________________________________
conv2d_20 (Conv2D)              (None, 3, 6, 32)     25632       conv2d_19[0][0]                  
__________________________________________________________________________________________________
max_pooling2d_2 (MaxPooling2D)  (None, 2, 3, 32)     0           conv2d_20[0][0]                  
____________________________________________________________________________________________

  
