# Reinforcement Learning

Import the necessary dependencies.

In [13]:
# Install pymdptoolbox, the package that provides the `mdptoolbox` module imported by this notebook.
! pip install pymdptoolbox



In [0]:
import mdptoolbox
import numpy as np
import datetime
import csv
import pandas as pd 

In [15]:
# Print the current local date and time.
print(datetime.datetime.now())

2019-07-09 20:25:51.307970


Mount Google Drive so that the project folder can be accessed.

In [16]:
# Mount Google Drive at /content/gdrive (Colab-only API).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Make the project folder on the mounted Drive the working directory, so the
# relative paths used by later cells (e.g. "data/...") resolve inside it.
import os
os.chdir("/content/gdrive/My Drive/SAKI_2019") 

#### Get the probability of each command
Estimate the probability of each command (instruction and colour) from its relative frequency in the training dataset.

In [18]:

# Estimate how often each (instruction, colour) command occurs in the training log.
# The file is tab-separated and is read without a header row: the first column
# is named "instr" and the second "next_color".
data = pd.read_csv("data/warehousetraining_2x3.txt", delimiter='\t', names=["instr", "next_color"])
length_data = len(data)
print(length_data)

# Relative frequency of every distinct command: 'probability' is the raw
# fraction, 'p' the same value rounded to four decimals.
df = data.copy()
df = df.groupby(["instr", "next_color"]).size().reset_index(name='probability')
df['probability'] = df['probability'].div(length_data)
df['p'] = df['probability'].round(4)

# The rounded probabilities are floats, so their sum can differ from 1 by a few
# ULPs even when the rounding itself is consistent. Compare with a tolerance
# that is far below the 1e-4 rounding step instead of testing exact equality.
if not np.isclose(df.p.sum(), 1.0, rtol=0.0, atol=1e-9):
    print("rounding error sum of probabilities not 1")

print(df)


12108
     instr next_color  probability       p
0  restore       blue     0.125289  0.1253
1  restore        red     0.246862  0.2469
2  restore      white     0.127849  0.1278
3    store       blue     0.125289  0.1253
4    store        red     0.246862  0.2469
5    store      white     0.127849  0.1278


#### Initialize the states
Get all possible states of the problem for the Markov decision process.

We put them all in a pandas DataFrame to make them easier to work with.

In [19]:
# Enumerate every state of the decision process. A state consists of the content
# of each storage place plus the pending command (instruction and item colour).
size_storage = 6
storage_places = range(1,(size_storage+1))
storage_state= ["0", "red", "blue", "white"]  # "0" is the value used for an empty place
instr = ["store", "restore"]
next_color = ["red", "blue", "white"]


from itertools import product

# One column per storage place (x1 .. xN) followed by the command columns.
# Building the factor list from size_storage supports any storage size and
# yields the same row and column order as spelling the factors out by hand.
# The table has len(storage_state) ** size_storage * len(instr) * len(next_color)
# rows, so it grows quickly with size_storage.
storage_columns = ['x' + str(place) for place in storage_places]
state_factors = [storage_state] * size_storage + [instr, next_color]
states_tmp = pd.DataFrame(list(product(*state_factors)), columns=storage_columns + ["instr", "next_color"])

# Attach the command probabilities to every state. A left join keeps every
# enumerated state; commands missing from df get NaN probabilities.
states = pd.merge(states_tmp,df, how = 'left' ,on = ["instr", "next_color"])

print (len(states))
print (states)

24576
          x1     x2     x3     x4  ...    instr next_color probability       p
0          0      0      0      0  ...    store        red    0.246862  0.2469
1          0      0      0      0  ...    store       blue    0.125289  0.1253
2          0      0      0      0  ...    store      white    0.127849  0.1278
3          0      0      0      0  ...  restore        red    0.246862  0.2469
4          0      0      0      0  ...  restore       blue    0.125289  0.1253
5          0      0      0      0  ...  restore      white    0.127849  0.1278
6          0      0      0      0  ...    store        red    0.246862  0.2469
7          0      0      0      0  ...    store       blue    0.125289  0.1253
8          0      0      0      0  ...    store      white    0.127849  0.1278
9          0      0      0      0  ...  restore        red    0.246862  0.2469
10         0      0      0      0  ...  restore       blue    0.125289  0.1253
11         0      0      0      0  ...  restor

In [22]:
# Replay the recorded command sequence with the stored policy and accumulate
# the cost of the chosen storage places.
start = datetime.datetime.now()
curr_state = ['0','0','0','0','0','0']   # content of each storage place, '0' = empty
price = [-1,-2,-2,-3,-3,-4]              # cost added when the place with that index is used
total_cost = 0
# policy[row] is used below as the index of the storage place to use for the
# state stored in row `row` of `states`.
# NOTE(review): assumes a 1-D integer array aligned with the row labels of `states` - confirm.
policy= np.load('Value_policy_2x3_10_10.npy')
not_possible_in = 0    # cost charged instead of the price when a store is refused
not_possible_out = 0   # cost charged instead of the price when a restore is impossible
no_storage = 0         # number of refused store commands
no_restorage = 0       # number of impossible restore commands

command = pd.read_csv("data/warehousetraining_2x3.txt", delimiter='\t', names=["instr", "next_color"])
print(len(command))

# Build the (storage contents..., instr, next_color) -> row label lookup once,
# instead of filtering the whole `states` frame again for every command.
# The first matching row wins; keys that occur more than once are remembered so
# the ambiguity can still be reported when such a state is visited.
n_places = len(storage_places)
state_row = {}
duplicate_keys = set()
slot_values = states.iloc[:, :n_places].itertuples(index=False, name=None)
for row_label, slots, state_instr, state_color in zip(states.index, slot_values, states['instr'], states['next_color']):
    key = tuple(slots) + (state_instr, state_color)
    if key in state_row:
        duplicate_keys.add(key)
    else:
        state_row[key] = row_label

for move in command.itertuples():
    print("command:" + move.instr +' '+ move.next_color)

    # Look up the row of the current storage contents combined with this command.
    key = tuple(str(curr_state[place - 1]) for place in storage_places) + (move.instr, move.next_color)
    if key not in state_row:
        raise ValueError("no row in the state table for %r" % (key,))
    if key in duplicate_keys:
        print ("ambigious variable")
    pol = policy[state_row[key]]
    print(pol)

    # The price of the chosen place is charged unless the command cannot be
    # carried out, in which case the corresponding penalty is charged instead.
    step_cost = price[pol]
    if move.instr == 'store':
        # NOTE(review): a store is only refused when the place already holds the
        # same colour; a place holding a different colour is overwritten - confirm intended.
        if curr_state[pol] == move.next_color:
            print ('storage refused')
            no_storage = no_storage + 1
            step_cost = not_possible_in
        else:
            curr_state[pol] = move.next_color
    elif move.instr == 'restore':
        # NOTE(review): only emptiness is checked; the colour in the place is not
        # compared with the requested colour - confirm intended.
        if curr_state[pol] == '0':
            print ('restorage not possible')
            no_restorage = no_restorage + 1
            step_cost = not_possible_out
        else:
            curr_state[pol] = '0'
    total_cost = total_cost + step_cost

    print(total_cost)
    print(curr_state)

print('TOTAL COST:    ' + str(total_cost))
print(no_restorage)
print(no_storage)
print(datetime.datetime.now()-start)

12108
command:store red
0
-1
['red', '0', '0', '0', '0', '0']
command:store blue
1
-3
['red', 'blue', '0', '0', '0', '0']
command:store white
2
-5
['red', 'blue', 'white', '0', '0', '0']
command:restore blue
1
-7
['red', '0', 'white', '0', '0', '0']
command:restore white
2
-9
['red', '0', '0', '0', '0', '0']
command:store white
1
-11
['red', 'white', '0', '0', '0', '0']
command:store red
2
-13
['red', 'white', 'red', '0', '0', '0']
command:store white
3
-16
['red', 'white', 'red', 'white', '0', '0']
command:store red
4
-19
['red', 'white', 'red', 'white', 'red', '0']
command:restore red
0
-20
['0', 'white', 'red', 'white', 'red', '0']
command:restore red
2
-22
['0', 'white', '0', 'white', 'red', '0']
command:store white
0
-23
['white', 'white', '0', 'white', 'red', '0']
command:store blue
2
-25
['white', 'white', 'blue', 'white', 'red', '0']
command:store white
0
storage refused
-25
['white', 'white', 'blue', 'white', 'red', '0']
command:restore red
4
-28
['white', 'white', 'blue', 'wh