In [None]:
!pip install pymdptoolbox

In [1]:
import numpy as np
import pandas as pd
import pickle
import itertools
from scipy import sparse
import mdptoolbox


### Warehouse Settings

In [2]:
warehouse_dim_x = 2
warehouse_dim_y = 2
warehouse_actions = ['store', 'restore']
warehouse_blocktypes = ['empty', 'red', 'blue', 'white']
warehouse_size = warehouse_dim_x * warehouse_dim_y

print("Warehouse Size:", warehouse_size)
print("Warehouse Actions:", ", ".join(warehouse_actions))
print("Block Types: ", ", ".join(warehouse_blocktypes))

Warehouse Size: 4
Warehouse Actions: store, restore
Block Types:  empty, red, blue, white


### Loading Dataset

In [26]:
training_data = pd.read_csv('./res/warehousetraining.txt', sep="\t", names=['action', 'blocktype'])
print("Dataset Size:", training_data.size)
test_data = pd.read_csv('./res/warehouseorder.txt', sep="\t", names=['action', 'blocktype'])
print("Testset Size:", test_data.size)

print(training_data.head())

Dataset Size: 24216
Testset Size: 120
    action blocktype
0    store       red
1    store      blue
2    store     white
3  restore      blue
4  restore     white


### Single Probabilities

In [28]:
df = training_data.copy()
df = training_data.groupby(["action", "blocktype"]).size().reset_index(name='probability')
training_data['probability'] = df['probability'].div(df.size)

probability_map = dict()

store_white = training_data.loc[(training_data[0] == 'store') & (training_data[1] == 'white')]
store_blue = training_data.loc[(training_data[0] == 'store') & (training_data[1] == 'blue')]
store_red = training_data.loc[(training_data[0] == 'store') & (training_data[1] == 'red')]

restore_white = training_data.loc[(training_data[0] == 'restore') & (training_data[1] == 'white')]
restore_blue = training_data.loc[(training_data[0] == 'restore') & (training_data[1] == 'blue')]
restore_red = training_data.loc[(training_data[0] == 'restore') & (training_data[1] == 'red')]

probability_map[('store', 'white')] = store_white.size / training_data.size
probability_map[('store', 'blue')] = store_blue.size / training_data.size
probability_map[('store', 'red')] = store_red.size / training_data.size
probability_map[('restore', 'white')] = restore_white.size / training_data.size
probability_map[('restore', 'blue')] = restore_blue.size / training_data.size
probability_map[('restore', 'red')] = restore_red.size / training_data.size

print("Store White: ", probability_map[('store', 'white')])
print("Store Red: ", probability_map[('store', 'blue')])
print("Store Blue: ", probability_map[('store', 'red')])

print("\nRestore White: ", probability_map[('restore', 'white')])
print("Restore Red: ", probability_map[('restore', 'blue')])
print("Restore Blue: ", probability_map[('restore', 'red')])

KeyError: 0

### Warehouse States

In [25]:
warehouse_blocktypes_without_empty = list(warehouse_blocktypes)
if 'empty' in warehouse_blocktypes_without_empty:
    warehouse_blocktypes_without_empty.remove('empty')

warehouse_moves = list(itertools.product(warehouse_actions, warehouse_blocktypes_without_empty)) # We ignore the empty block type

warehouse_states = list(itertools.product(warehouse_blocktypes, repeat=warehouse_size))
warehouse_action_states = list(itertools.product(warehouse_states, warehouse_actions))

warehouse_dataframe_columns = []
for field in range(warehouse_size):
    warehouse_dataframe_columns.append("field "+str(field))
warehouse_dataframe_columns.append("action")
warehouse_dataframe_columns.append("object")
warehouse_dataframe_columns.append("probability")

warehouse_dataframe = pd.DataFrame(list(itertools.product(warehouse_blocktypes, warehouse_blocktypes, warehouse_blocktypes, warehouse_blocktypes, warehouse_actions, warehouse_blocktypes_without_empty)), columns=warehouse_dataframe_columns)
warehouse_dataframe = pd.merge(warehouse_dataframe, probability_map, on=["action", "blocktype"])

print(warehouse_dataframe.head())
print("Possible Moves:\n", warehouse_moves)
print("States:", len(warehouse_action_states))

ValueError: 7 columns passed, passed data had 6 columns

### Transition Probability Matrix

In [11]:
transition_probability_matrix = []
block_size = len(warehouse_blocktypes) ** warehouse_size

number_of_warehouse_fields = warehouse_size
for action in range(number_of_warehouse_fields):
    current_index = 0
    transition_probability_matrix.append(np.zeros((len(warehouse_action_states), len(warehouse_action_states)),dtype=np.float16))

    for order in range(len(warehouse_moves)):
        for warehouse_state in itertools.product(warehouse_blocktypes, repeat=warehouse_size):
           for move in warehouse_moves:

                if(warehouse_state[action] != 'empty'):
                    transition_probability_matrix[action][current_index][(current_index % block_size) + (block_size * warehouse_moves.index(move))] = probability_map[move]
                else:
                    transition_probability_matrix[action][current_index][((current_index % block_size) + (len(warehouse_blocktypes)**(number_of_warehouse_fields - action - 1) * warehouse_blocktypes.index(move[1]))) + (block_size * warehouse_moves.index(move))] = probability_map[move]
           current_index += 1

    transition_probability_matrix[action] = sparse.csr_matrix(transition_probability_matrix[action])
    print("finished P"+str(action))

finished P0
finished P1
finished P2
finished P3


In [152]:
pickle.dump( P, open( "./res/P.pickle", "wb" ) )
print("Pickled TPM")


Pickled TPM


### Reward Matrix

In [None]:
reward_matrix = []

for action in range(number_of_warehouse_fields):
    reward_matrix.append(np.zeros((len(warehouse_action_states), len(warehouse_action_states)),dtype=np.float16))

    for (state, action) in warehouse_action_states:
        try:
            (actiontype, blocktype) = action
            #Reward for correct move
            if((actiontype == 'store' and state[action] == 0) or
            (actiontype == 'restore' and (state[action] == (ws.NextMove - 2)))):

                if  (action == 0): reward = 80**2  #8**3.5;
                elif(action == 1): reward = 60**2  #6**3.5;
                elif(action == 2): reward = 60**2  #6**3.5;
                elif(action == 3): reward = 40**2  #4**3.5;
                elif(action == 4): reward = 40**2  #4**3.5;
                elif(action == 5): reward = 40**2  #2**3.5;

                #Extra reward if restore is possible
                if actiontype == 'restore' and (ws[action] == (ws.NextMove - 2)):
                    reward *= 100  #+=100

            #Reward for Failed moves
            else:
                #store not possible
                if actiontype == 'store':
                    reward = -20000  #5
                #restore not possible
                else:
                    reward = -1000000  #-10

            reward_matrix[-1][index] = reward



reward_matrix = np.asarray(reward_matrix)
reward_matrix = reward_matrix.transpose()

# Save the matrices into a pickle file.
pickle.dump(reward_matrix, open( "./res/reward_matrix.pickle", "wb" ) )

In [14]:
############################################
num_fields = 4 # Warehouse places 2x2 = 4 ..
############################################

num_color = 4 # empty, red, white,
# blue
num_moves = 6 # store red, store white, store blue, restore red, ...
num_actions = num_fields
block_size = num_color ** num_fields

num_state = num_color ** num_fields * num_moves
warehouse_description=[0,1,2,3] #possible colors

#Init TPM
P = []

move_probs = [
    0.1278493557978196,
    0.12528906508093823,
    0.24686157912124215,
    0.1278493557978196,
    0.12528906508093823,
    0.24686157912124215
]

for action in range(num_actions):
    current_index = 0
    P.append(np.zeros((num_state, num_state),dtype=np.float16))

    for instr in range(num_moves):
        for w_state in itertools.product(warehouse_description, repeat=num_fields):
            #Iter through all 6 column blocks (=possible instructions) (store red, store blue, ...., restore red, ...)
            for move in range(num_moves):

                ##For field one (Action)
                #FOR STORE
                if(instr in range(3)):

                    #1. Empty? if (field1 == 0)
                    if(w_state[action] != 0):
                        P[action][current_index][(current_index % block_size) + (block_size * move)] = move_probs[move]
                    else:
                        #FOR STORE
                        #if(red) index+64 (i**numFields)
                        # else if(white) index+128
                        # else if(blue) index+192

                        #red = 0
                        if(instr == 0):
                            P[action][current_index][((current_index % block_size) + (num_color**(num_actions - action - 1) * 1)) + (block_size * move)] = move_probs[move]
                        #white = 1
                        elif(instr == 1):
                            P[action][current_index][((current_index % block_size) + (num_color**(num_actions - action - 1) * 2)) + (block_size * move)] = move_probs[move]
                        #blue = 2
                        elif(instr == 2):
                            P[action][current_index][((current_index % block_size) + (num_color**(num_actions - action - 1) * 3)) + (block_size * move)] = move_probs[move]


                #FOR RESTORE
                else:
                    #possible? if (field 1 != 0)
                    if(w_state[action] == 0):
                        P[action][current_index][(current_index % block_size) + (block_size * move)] = move_probs[move]
                    else:
                        #FOR STORE
                        #if(red) index-64 (i**numFields)
                        # else if(white) index-128
                        # else if(blue) index-192

                        #red = 3
                        if(instr == 3):
                            P[action][current_index][((current_index % block_size) - (num_color**(num_actions - action - 1) * 1)) + (block_size * move)] = move_probs[move]
                        #white = 4
                        elif(instr == 4):
                            P[action][current_index][((current_index % block_size) - (num_color**(num_actions - action - 1) * 2)) + (block_size * move)] = move_probs[move]
                        #blue = 5
                        elif(instr == 5):
                            P[action][current_index][((current_index % block_size) - (num_color**(num_actions - action - 1) * 3)) + (block_size * move)] = move_probs[move]

            current_index += 1

    P[action] = sparse.csr_matrix(P[action])
    print("finished P"+str(action))

print("---")
print("successfull")

warehouse = []
for instr in range(num_moves):
    for w_state in itertools.product(warehouse_description, repeat=num_fields):
        tmp = []
        tmp_str = []
        for i in range(num_fields):
            tmp.append(w_state[i])
            tmp_str.append('state'+str(i))
        tmp.append(instr)
        tmp_str.append('NextMove')
        warehouse.append(tmp)

warehouse = pd.DataFrame(warehouse, columns=tmp_str)
print(warehouse.head())

R = []

for action in range(num_actions):
    R.append(np.zeros((num_state, )))

    for index, ws in warehouse.iterrows():
        try:
            #Reward for correct move
            if((ws.NextMove in range(3) and ws[action] == 0) or
            (ws.NextMove in range(3, 6) and (ws[action] == (ws.NextMove - 2)))):

                if  (action == 0): reward = 80**2  #8**3.5;
                elif(action == 1): reward = 60**2  #6**3.5;
                elif(action == 2): reward = 60**2  #6**3.5;
                elif(action == 3): reward = 40**2  #4**3.5;
                elif(action == 4): reward = 40**2  #4**3.5;
                elif(action == 5): reward = 40**2  #2**3.5;

                #Extra reward if restore is possible
                if ws.NextMove in range(3, 6) and (ws[action] == (ws.NextMove - 2)):
                    reward *= 100  #+=100

            #Reward for Failed moves
            else:
                #store not possible
                if ws.NextMove in range(3):
                    reward = -20000  #5
                #restore not possible
                else:
                    reward = -1000000  #-10

            R[-1][index] = reward

        except:
            print("An exception occurred")
            print(ws.NextMove)
            print(ws[action])

R = np.asarray(R)
R = R.transpose()

finished P0
finished P1
finished P2
finished P3
---
successfull
   state0  state1  state2  state3  NextMove
0       0       0       0       0         0
1       0       0       0       1         0
2       0       0       0       2         0
3       0       0       0       3         0
4       0       0       1       0         0


Verify Matrices

In [15]:
mdptoolbox.util.check(transition_probability_matrix,R)

StochasticError: 'PyMDPToolbox - The transition probability matrix is not stochastic.'

In [None]:
def store_next_states_df(state_now, place):
    df_pos = states
    if state_now[place] == '0':
        for i in storage_places:
            if (i == place):
                df_pos = df_pos.loc[df_pos.iloc[ : , i-1]== row [size_storage+2]]
            else:
                df_pos = df_pos.loc[df_pos.iloc[ : , i-1] == row [i]]
    else:
        for i in storage_places:
            df_pos = df_pos.loc[df_pos.iloc[ : , i-1] == row [i]]
    return df_pos

def restore_next_states_df(state_now, place):
    df_pos = states
    if state_now[place] == state_now[size_storage+2]:
        for i in storage_places:
            if (i == place):
                df_pos = df_pos.loc[df_pos.iloc[ : , i-1]== '0']
            else:
                df_pos = df_pos.loc[df_pos.iloc[ : , i-1] == row [i]]
    else:
        for i in storage_places:
            df_pos = df_pos.loc[df_pos.iloc[ : , i-1] == row [i]]
    return df_pos

def init_trans_prob_mat(df_next_states, state_now, trans_prob_mat):
    for i in df_next_states.index:
        trans_prob_mat.itemset((state_now[0],i),round(df_next_states.loc[[i]].probability,4))
     return trans_prob_mat

In [None]:
trans_prob_mat = []
trans_prob_mat_all = []

for place in storage_places:
    trans_prob_mat = np.zeros((len(states),len(states)), dtype=np.float16)
    for row in states.itertuples():
        #print(row)
        df_next_states = []

        #an dataframe that stores all the rows where the trans prob mat is not null
        if (row[size_storage+1] == 'store'):
            df_next_states= store_next_states_df(row,place)
            #print(df_next_states)

        elif(row[size_storage+1] == 'restore'):
            df_next_states= restore_next_states_df(row,place)
            #print(df_next_states)

        else:
            print("Wrong Instruction in array")


        trans_prob_mat = init_trans_prob_mat(df_next_states, row, trans_prob_mat)

    #print(place)

    np.save("trans_mat"+str(place)+".nyp", trans_prob_mat)
    print(datetime.datetime.now())

    trans_prob_mat_all.append(trans_prob_mat)