# Algorithmic - text manipulation
---
> 텍스트 알고리즘 학습. 복사, 뒤집기, 등 간단한 일을 하게 학습함. agent 가 문자열을 제대로 다 작성되거나 하나라도 틀리거나 시간이 너무 오래 지나면 종료한다. 처음엔 주어지는 input string 은 짧지만 episode 가 커질수록 문자열도 길어진다.

* state : read head 의 위치
* actoin : 2(좌우) x 2(write or not) x 5(what to write)
* reward : 성공 시 + 1, 오류 시 -0.5, 시간초과 시 -1

% openAi Gym 주의 - episode 끝나고 state 업데이트 해주기

---

In [1]:
import gym
import numpy as np
import random

In [78]:
'''
환경 생성
'''
env = gym.make('Copy-v0')

In [79]:
print(env.action_space)
print(env.observation_space.n)

Tuple(Discrete(2), Discrete(2), Discrete(5))
6


In [84]:
q_table = np.zeros([20, env.observation_space.n])
gamma = .95
epsilon = 1
alpha = .1
episode_total = 2000
episode = 0

In [85]:
def decode_action(action):
    return [int(action/10), int((action/5) % 2), action % 5]

def encode_action(action):
    return action[0]*10 + action[1]*5 + action[2]

In [86]:
state = env.reset()
step = 0
reward_total = 0

while(episode < episode_total):
    step += 1
    if(random.random() < epsilon):
        action = env.action_space.sample()
    else:
        action = np.argmax(q_table[ : , state])
        action = decode_action(action)
    
    state_next, reward, done, _ = env.step(action)
    reward -= .5
    reward_total += reward
    action = encode_action(action)
    
    q_table[action, state] += alpha * (reward + np.max(gamma * q_table[ : , state_next]) - q_table[action, state])
    state = state_next

    if(done):
        if(reward > 0):
            if(epsilon < .1):
                epsilon = .1
            else:
                epsilon = 1 / (1 + episode / 100)
        episode += 1
        state = env.reset() # 꼭 해주기. 중요
        step = 0
        #print(reward_total)
        reward_total = 0
env.close()

In [87]:
print(q_table)
s = env.reset()
reward_sum = 0
ep = 0

while(True):
    a = np.argmax(q_table[ : , s])
    _a = decode_action(a)
    s,r,d,_ = env.step(_a)
    if(ep == 100):
        env.render()
        
    reward_sum += r
    if(d):
        ep += 1
        s = env.reset() # 꼭 해주기. 중요
        print(reward_sum)
        reward_sum = 0

[[-0.72927387 -0.69354201 -0.6403836  -0.81874835 -0.62384507 -0.92517078]
 [-0.69046872 -0.70911541 -0.64159408 -0.62002581 -0.6079537  -0.92319247]
 [-0.56464352 -0.71880656 -0.7303072  -0.71692728 -0.60583307 -0.90811511]
 [-0.6602136  -0.7555307  -0.70695378 -0.77723476 -0.76529335 -0.93116353]
 [-0.69399232 -0.68190686 -0.80254291 -0.75031183 -0.78521498 -0.92801243]
 [-0.02618911 -1.05587228 -1.08854035 -0.83066268 -1.07595057 -0.95841814]
 [-1.26158742 -0.22099276 -1.21038315 -1.17263022 -1.14593312 -1.04497283]
 [-0.84837786 -1.20063457 -0.10051453 -0.95872366 -1.15234921 -0.98631333]
 [-0.87600878 -1.17391572 -1.18353763 -0.2655583  -0.92615661 -1.06949497]
 [-1.07122355 -0.76156784 -0.90505297 -1.11414593 -0.20629255 -1.3035178 ]
 [-0.40719027 -0.33817835 -0.25888325 -0.28114118 -0.28318144 -0.56611053]
 [-0.34163511 -0.31068079 -0.26335036 -0.32301459 -0.32432074 -0.43113093]
 [-0.31037675 -0.40363297 -0.30472564 -0.36219594 -0.27713066 -0.61026008]
 [-0.34916398 -0.41473557

KeyboardInterrupt: 

# 어느 짱깨가 만든 몬테카를로 알고리즘
---

In [540]:
import numpy as np
import gym
import random

env = gym.make('Copy-v0')

# observation_space: Discrete(6)
# action_space: (Discrete(2), Discrete(2), Discrete(5))

# states: 0, 1, 2, 3, 4, 5

# MARK: problem specific functions


def action_to_index(action):
    return action[0]*10+action[1]*5+action[2]


def index_to_action(index):
    action = [int(index/10), int((index/5) % 2), index % 5]
    return tuple(action)


# MARK: Monte Carlo ES method

STATE_COUNT = 6
ACTION_COUNT = 20


def init_mces(state_count, action_count):
    q = np.random.rand(state_count, action_count)
    rets = np.zeros((state_count, action_count), dtype=np.double)
    policy = [random.randint(0, action_count-1) for _ in range(state_count)]
    return q, rets, policy


def learning(env):
    q, rets, policy = init_mces(STATE_COUNT, ACTION_COUNT)
    gamma = 0.7
    epsilon = 1
    total_score = 0.0
    i_episode = 0

    for i_episode in range(5000):
        total_reward = 0
        observation = env.reset()
        g = np.zeros((STATE_COUNT, ACTION_COUNT), dtype=np.double)
        passed = np.zeros((STATE_COUNT, ACTION_COUNT), dtype=np.double)

        for t in range(100):
            raw_action = policy[observation]
            # 1 - epsilon greedy
            if random.random() < epsilon:
                raw_action = action_to_index((random.randint(0, 1),
                                              random.randint(0, 1),
                                              random.randint(0, 4)))
            if passed[observation, raw_action] == 0.0:
                passed[observation, raw_action] = 1.0

            action = index_to_action(raw_action)
            #env.render()
            observation, reward, done, info = env.step(action)
            reward -= 0.5  # IMPORTANT: punish useless (even dangerous) actions whose environment reward is 0.0
            total_reward += reward

            for i in range(STATE_COUNT):
                for j in range(ACTION_COUNT):
                    passed[i][j] *= gamma
                    g[i][j] += reward * passed[i][j]
            if done:
                break

        # reduce exploration chance
        if i_episode % 100 == 0:
            epsilon *= 0.9

        rets += g
        q = rets / (i_episode+1)
        policy = np.argmax(q, axis=1).tolist()

        total_score += total_reward

    return policy, i_episode


def test_policy(env, policy):
    total_reward = 0.0
    obs = env.reset()
    for t in range(1000):
        action = index_to_action(policy[obs])
        obs, reward, done, info = env.step(action)
        total_reward += reward
        env.render()
        if done:
            break
    print('total reward: %f'%total_reward)


#env.monitor.start('Copyv0-experiment-0')
policy, n_episode = learning(env)
#env.monitor.close()

print('final policy: '+str(policy))
print('episodes trained: '+str(n_episode))
test_policy(env, policy)

final policy: [15, 16, 17, 18, 19, 15]
episodes trained: 4999
total reward: 30.000000


In [221]:
'''
환경 생성
'''
env2 = gym.make('Reverse-v0')

In [234]:
print(env2.action_space)
print(env2.observation_space.n)

Tuple(Discrete(2), Discrete(2), Discrete(2))
3


In [251]:
q_table = np.zeros([8, env2.observation_space.n])
gamma = .95
epsilon = 1
alpha = .1
episode_total = 2000
episode = 0

In [252]:
def decode_action(action):
    return [int(action/4), int((action/2) % 2), action % 2]

def encode_action(action):
    return action[0]*4 + action[1]*2 + action[2]

In [253]:
state = env2.reset()
step = 0
reward_total = 0

while(episode < episode_total):
    step += 1
    if(random.random() < epsilon):
        action = env2.action_space.sample()
    else:
        action = np.argmax(q_table[ : , state])
        action = decode_action(action)
    
    state_next, reward, done, _ = env2.step(action)
    reward -= .5
    reward_total += reward
    action = encode_action(action)
    
    q_table[action, state] += alpha * (reward + np.max(gamma * q_table[ : , state_next]) - q_table[action, state])
    state = state_next

    if(done):
        if(reward > 0):
            if(epsilon < .1):
                epsilon = .1
            else:
                epsilon = 1 / (1 + episode / 100)
        episode += 1
        state = env2.reset() # 꼭 해주기. 중요
        step = 0
        #print(reward_total)
        reward_total = 0
env2.close()

In [254]:
print(q_table)
s = env2.reset()
reward_sum = 0
ep = 0

while(True):
    a = np.argmax(q_table[ : , s])
    _a = decode_action(a)
    s,r,d,_ = env2.step(_a)
    if(ep == 100):
        env2.render()
        
    reward_sum += r
    if(d):
        ep += 1
        s = env2.reset() # 꼭 해주기. 중요
        print(reward_sum)
        reward_sum = 0

[[5.99619023 4.430821   3.09849888]
 [6.0910561  5.07189231 3.07368591]
 [6.76251591 5.41512441 4.76685382]
 [5.53346153 8.65916137 2.61155112]
 [4.52975196 3.99274697 2.93220359]
 [5.93524077 4.0195155  3.02079725]
 [8.98095442 5.45858346 0.88971874]
 [6.53717538 6.31057169 8.73084898]]
1.0
1.0
1.0
1.0
-0.5
1.0
-0.5
1.0
2.0
3.0
1.0
1.0
1.0
2.0
-0.5
2.0
-0.5
1.0
-0.5
1.0
3.0
2.0
1.0
2.0
2.0
2.0
1.0
-0.5
-0.5
1.0
0.5
1.0
1.0
3.0
1.0
2.0
-0.5
1.0
-0.5
-0.5
2.0
3.0
1.0
-0.5
-0.5
2.0
3.0
3.0
-0.5
2.0
1.0
-0.5
0.5
-0.5
-0.5
-0.5
2.0
-0.5
-0.5
1.0
-0.5
1.0
2.0
3.0
1.0
3.0
-0.5
3.0
-0.5
3.0
1.0
-0.5
1.0
1.0
3.0
-0.5
-0.5
0.5
1.0
1.0
1.0
1.0
1.0
2.0
1.0
1.0
1.0
1.0
-0.5
1.0
1.0
1.0
-0.5
1.0
2.0
1.0
2.0
-0.5
-0.5
2.0
Total length of input instance: 1, step: 1
Observation Tape    :  [42m [0mB  
Output Tape         :   [42mB[0m
Targets             :   B  

Current reward      :   1.000
Cumulative reward   :   1.000
Action              :   Tuple(move over input: left,
                         

KeyboardInterrupt: 