# Reinforcement Learning

Import the necessary dependencies.

In [0]:
! pip install pymdptoolbox



In [0]:
import mdptoolbox
import numpy as np
import datetime
import csv
import pandas as pd 

In [0]:
# Print the current wall-clock time.
print(datetime.datetime.now())

2019-07-09 21:07:58.612105


Load the drive folder

In [2]:
from google.colab import drive
# Mount Google Drive under /content/gdrive inside the Colab runtime.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import os
# Make the project folder on the mounted drive the working directory, so the
# relative paths used for reading data and saving arrays resolve inside it.
os.chdir("/content/gdrive/My Drive/SAKI_2019") 

Define custom print option

In [0]:
import sys
def fullprint(args):
  """Print ``args`` without NumPy's array summarisation ("...").

  The summarisation threshold is lifted only for the duration of the call;
  the previous print options are restored afterwards, even if printing
  raises.

  Args:
    args: The object to print (typically a NumPy array).
  """
  # The context manager restores the saved options on exit. The former
  # ``np.set_printoptions(opt)`` passed the options dict positionally, i.e.
  # as ``precision``, so nothing was actually restored.
  with np.printoptions(threshold=sys.maxsize):
    print(args)

#### Get probability of different actions
Estimate the probability of each command (instruction and color) from its relative frequency in the training dataset.

In [4]:

# Load the training commands: tab-separated, two columns (instruction, color).
data = pd.read_csv("data/warehousetraining_2x3.txt", delimiter='\t', names=["instr", "next_color"])
length_data = len(data)
print(length_data)

# Relative frequency of every (instruction, color) combination in the data.
df = data.copy()
df = df.groupby(["instr", "next_color"]).size().reset_index(name='probability')
df['probability'] = df['probability'].div(length_data)
df['p'] = df['probability'].round(4)

# Compare with a tolerance: an exact `!= 1` test on a float sum can fail
# because of binary floating-point representation alone. A genuine rounding
# problem at 4 decimals is at least 1e-4, far above the tolerance used here.
if not np.isclose(df.p.sum(), 1.0, rtol=0, atol=1e-9):
    print("rounding error sum of probabilities not 1")

print(df)


12108
     instr next_color  probability       p
0  restore       blue     0.125289  0.1253
1  restore        red     0.246862  0.2469
2  restore      white     0.127849  0.1278
3    store       blue     0.125289  0.1253
4    store        red     0.246862  0.2469
5    store      white     0.127849  0.1278


#### Initialize the states
Get all possible states of the problem for the Markov decision process.

We put them all in a pandas DataFrame to make them easier to work with.

In [6]:
# Problem definition: number of storage places, the possible content of a
# place ("0" means empty), the possible instructions and item colors.
size_storage = 6
storage_places = range(1,(size_storage+1))
storage_state= ["0", "red", "blue", "white"]
instr = ["store", "restore"]
next_color = ["red", "blue", "white"]


from itertools import product

if size_storage < 1:
    raise ValueError("Not supported storage size")

# One column per storage place (x1 .. xN) followed by the pending command.
# Building the product from `size_storage` removes the hard-coded 4/6
# branches, which left `states_tmp` undefined for every other size.
place_columns = ['x' + str(i) for i in storage_places]
states_tmp = pd.DataFrame(
    list(product(*([storage_state] * size_storage), instr, next_color)),
    columns=place_columns + ["instr", "next_color"])

# Attach the command probabilities to every state; a left merge keeps the
# row order of `states_tmp`.
states = pd.merge(states_tmp,df, how = 'left' ,on = ["instr", "next_color"])

print (len(states))
print (states)

24576
          x1     x2     x3     x4  ...    instr next_color probability       p
0          0      0      0      0  ...    store        red    0.246862  0.2469
1          0      0      0      0  ...    store       blue    0.125289  0.1253
2          0      0      0      0  ...    store      white    0.127849  0.1278
3          0      0      0      0  ...  restore        red    0.246862  0.2469
4          0      0      0      0  ...  restore       blue    0.125289  0.1253
5          0      0      0      0  ...  restore      white    0.127849  0.1278
6          0      0      0      0  ...    store        red    0.246862  0.2469
7          0      0      0      0  ...    store       blue    0.125289  0.1253
8          0      0      0      0  ...    store      white    0.127849  0.1278
9          0      0      0      0  ...  restore        red    0.246862  0.2469
10         0      0      0      0  ...  restore       blue    0.125289  0.1253
11         0      0      0      0  ...  restor

#### Make Reward matrix
Iterate through the states and build a matrix of the cost associated with every possible action in each state.

Matrix dimensions : number of states x number of actions

In [8]:
def init_reward_mat_store(row, place, price, not_possible):
    """Append the reward for storing an item at ``place`` to the global ``reward_mat``.

    Args:
        row: Indexable state; ``row[place]`` is the content of storage place
            ``place`` (``'0'`` means empty).
        place: 1-based number of the storage place.
        price: Sequence of rewards per place; ``price[place-1]`` is used.
        not_possible: Reward used when the place is already occupied.

    Returns:
        The module-level list ``reward_mat`` after the append.
    """
    place_is_empty = row[place] == '0'
    reward_mat.append(price[place-1] if place_is_empty else not_possible)
    return reward_mat

def init_reward_mat_restore(row, place, price, not_possible):
    """Append the reward for restoring an item from ``place`` to the global ``reward_mat``.

    Args:
        row: Indexable state; ``row[place]`` is the content of storage place
            ``place`` and ``row[size_storage+2]`` is the requested color.
        place: 1-based number of the storage place.
        price: Sequence of rewards per place; ``price[place-1]`` is used.
        not_possible: Reward used when the place does not hold the requested
            color.

    Returns:
        The module-level list ``reward_mat`` after the append.
    """
    requested_color = row[size_storage+2]
    if row[place] != requested_color:
        reward_mat.append(not_possible)
    else:
        reward_mat.append(price[place-1])
    return reward_mat



# Reward for using each storage place (price[0] belongs to place 1) and the
# penalties for an impossible store / restore action.
price = [-1,-2,-2,-3,-3,-4]
not_possible_in = -50
not_possible_out = -50
reward_mat_all=[]

for row in states.itertuples():
    # The helper functions append to the module-level `reward_mat`, so it is
    # rebound to a fresh list for every state.
    reward_mat = []
    instruction = row[size_storage+1]
    for place in storage_places:
        if instruction == 'store':
            reward_mat = init_reward_mat_store(row, place, price, not_possible_in)
        elif instruction == 'restore':
            reward_mat = init_reward_mat_restore(row, place, price, not_possible_out)
    reward_mat_all.append(reward_mat)

# One row per state, one column per action (storage place).
reward_mat_all = np.array(reward_mat_all)
np.save('reward_mat_all.npy', reward_mat_all)

print(reward_mat_all.shape)
fullprint(reward_mat_all)


(24576, 6)
[[ -1  -2  -2  -3  -3  -4]
 [ -1  -2  -2  -3  -3  -4]
 [ -1  -2  -2  -3  -3  -4]
 [-50 -50 -50 -50 -50 -50]
 [-50 -50 -50 -50 -50 -50]
 [-50 -50 -50 -50 -50 -50]
 [ -1  -2  -2  -3  -3 -50]
 [ -1  -2  -2  -3  -3 -50]
 [ -1  -2  -2  -3  -3 -50]
 [-50 -50 -50 -50 -50  -4]
 [-50 -50 -50 -50 -50 -50]
 [-50 -50 -50 -50 -50 -50]
 [ -1  -2  -2  -3  -3 -50]
 [ -1  -2  -2  -3  -3 -50]
 [ -1  -2  -2  -3  -3 -50]
 [-50 -50 -50 -50 -50 -50]
 [-50 -50 -50 -50 -50  -4]
 [-50 -50 -50 -50 -50 -50]
 [ -1  -2  -2  -3  -3 -50]
 [ -1  -2  -2  -3  -3 -50]
 [ -1  -2  -2  -3  -3 -50]
 [-50 -50 -50 -50 -50 -50]
 [-50 -50 -50 -50 -50 -50]
 [-50 -50 -50 -50 -50  -4]
 [ -1  -2  -2  -3 -50  -4]
 [ -1  -2  -2  -3 -50  -4]
 [ -1  -2  -2  -3 -50  -4]
 [-50 -50 -50 -50  -3 -50]
 [-50 -50 -50 -50 -50 -50]
 [-50 -50 -50 -50 -50 -50]
 [ -1  -2  -2  -3 -50 -50]
 [ -1  -2  -2  -3 -50 -50]
 [ -1  -2  -2  -3 -50 -50]
 [-50 -50 -50 -50  -3  -4]
 [-50 -50 -50 -50 -50 -50]
 [-50 -50 -50 -50 -50 -50]
 [ -1  -2  -2  -3

#### Make Transition Probability matrix
Iterate through the states for each action and build a matrix of the probability that one state follows another.

In [0]:
def store_next_states_df(state_now, place):
    """Return the rows of ``states`` reachable by storing the item at ``place``.

    If the place is empty (``'0'``) it takes the color of the item to be
    stored (``state_now[size_storage+2]``); otherwise the storage content is
    unchanged. Every other place keeps its current content. The command
    columns are not filtered, so all commands of the matching storage
    configuration are returned.

    Args:
        state_now: Indexable state (a row from ``states.itertuples()``);
            index ``i`` holds the content of storage place ``i``.
        place: 1-based number of the storage place the action targets.

    Returns:
        The filtered view of the module-level ``states`` DataFrame.
    """
    place_is_empty = state_now[place] == '0'
    df_pos = states
    for i in storage_places:
        # Use the `state_now` argument throughout; the previous version read
        # the global loop variable `row` and only worked when the caller's
        # variable happened to carry that name.
        if place_is_empty and i == place:
            expected = state_now[size_storage+2]
        else:
            expected = state_now[i]
        df_pos = df_pos.loc[df_pos.iloc[:, i-1] == expected]
    return df_pos

def restore_next_states_df(state_now, place):
    """Return the rows of ``states`` reachable by restoring the item from ``place``.

    If the place holds the requested color (``state_now[size_storage+2]``)
    it becomes empty (``'0'``); otherwise the storage content is unchanged.
    Every other place keeps its current content. The command columns are not
    filtered, so all commands of the matching storage configuration are
    returned.

    Args:
        state_now: Indexable state (a row from ``states.itertuples()``);
            index ``i`` holds the content of storage place ``i``.
        place: 1-based number of the storage place the action targets.

    Returns:
        The filtered view of the module-level ``states`` DataFrame.
    """
    holds_requested = state_now[place] == state_now[size_storage+2]
    df_pos = states
    for i in storage_places:
        # Use the `state_now` argument throughout; the previous version read
        # the global loop variable `row` for the unchanged places.
        if holds_requested and i == place:
            expected = '0'
        else:
            expected = state_now[i]
        df_pos = df_pos.loc[df_pos.iloc[:, i-1] == expected]
    return df_pos

def init_trans_prob_mat(df_next_states, state_now, trans_prob_mat):
    """Write the transition probabilities of one state into ``trans_prob_mat``.

    Args:
        df_next_states: DataFrame of reachable next states. Its index labels
            are used as column positions and its ``probability`` column
            supplies the values (rounded to 4 decimals).
        state_now: Indexable state; ``state_now[0]`` is the matrix row to fill.
        trans_prob_mat: 2-D NumPy array, modified in place.

    Returns:
        The same ``trans_prob_mat`` array.
    """
    # A single indexed assignment replaces the per-element
    # `ndarray.itemset` calls (removed in NumPy 2).
    next_idx = np.asarray(df_next_states.index, dtype=int)
    trans_prob_mat[state_now[0], next_idx] = df_next_states['probability'].round(4).values
    return trans_prob_mat

In [0]:
print(datetime.datetime.now())
# One (states x states) transition matrix per action, i.e. per storage place.
trans_prob_mat_all = []

for place in storage_places:
    trans_prob_mat = np.zeros((len(states),len(states)), dtype=np.float16)
    for row in states.itertuples():
        # DataFrame of the next states that have a non-zero transition
        # probability from `row` under the current action.
        if (row[size_storage+1] == 'store'):
            df_next_states= store_next_states_df(row,place)

        elif(row[size_storage+1] == 'restore'):
            df_next_states= restore_next_states_df(row,place)

        else:
            # Previously this only printed and then failed inside
            # init_trans_prob_mat with an unrelated error.
            raise ValueError("Wrong Instruction in array")

        trans_prob_mat = init_trans_prob_mat(df_next_states, row, trans_prob_mat)

    # np.save appends ".npy" when the name does not already end with it, so
    # the former ".nyp" typo produced files named "trans_matN.nyp.npy".
    np.save("trans_mat"+str(place)+".npy", trans_prob_mat)
    print(datetime.datetime.now())

    trans_prob_mat_all.append(trans_prob_mat)


2019-07-09 21:07:59.585635
2048.0
2019-07-09 21:14:26.134235


In [0]:
print(datetime.datetime.now())

# Both solvers are built from the same transition matrices, reward matrix,
# discount factor and iteration limit.
mdpresultPolicy = mdptoolbox.mdp.PolicyIteration(
    transitions=trans_prob_mat_all,
    reward=reward_mat_all,
    discount=0.99,
    max_iter=100,
)
mdpresultValue = mdptoolbox.mdp.ValueIteration(
    transitions=trans_prob_mat_all,
    reward=reward_mat_all,
    discount=0.99,
    max_iter=100,
)

print(datetime.datetime.now())

In [0]:
# Run value iteration; policy, V and iter are read from the solver object afterwards.
mdpresultValue.run()

In [0]:
# Report and persist the value-iteration result.
print('ValueIteration:')
policy2 = mdpresultValue.policy
print(policy2)
np.save('Value_policy_2x3_50_50__0_75.npy', policy2)

print(mdpresultValue.V)
# Saving the value function is currently disabled:
#np.save('Value_values_2x3.npy',mdpresultValue.V)
print(mdpresultValue.iter)

print(datetime.datetime.now())

ValueIteration:
(0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 4, 0, 5, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5, 4, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 4, 0, 0, 0, 5, 0, 4, 0, 0, 0, 0, 5, 4, 0, 0, 0, 0, 0, 4, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 3, 0, 5, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 3, 0, 5, 0, 0, 0, 3, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 3, 0, 4, 0, 0, 0, 3, 0, 4, 0, 0, 0, 3, 5, 4, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3, 0, 0, 0, 0, 5, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 5, 0, 0, 0, 4, 3, 0, 0, 0, 0, 4, 3, 0, 0, 0, 0, 4, 3, 0, 0, 0, 0, 4, 3, 5, 0, 0, 0, 0, 3, 0, 0, 0, 0, 5, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 5, 3, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 3, 0, 0, 0, 5, 0, 3, 0, 0, 0, 0, 5, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 0, 3, 0, 0, 0, 4, 0, 3, 0, 0, 0, 4,

In [0]:
# Run policy iteration; policy, V and iter are read from the solver object afterwards.
mdpresultPolicy.run()


In [0]:
# Report and persist the policy-iteration result: the policy and its value
# function are both printed and saved.
print('PolicyIteration:')
policy = mdpresultPolicy.policy
print(policy)
np.save('policy_2x3.npy', policy)

policy_values = mdpresultPolicy.V
print(policy_values)
np.save('policy_values_2x3.npy', policy_values)
print(mdpresultPolicy.iter)

print(datetime.datetime.now())