# Algorithmic - text manipulation
---
> 텍스트 알고리즘 학습. 복사, 뒤집기, 등 간단한 일을 하게 학습함. agent 가 문자열을 제대로 다 작성되거나 하나라도 틀리거나 시간이 너무 오래 지나면 종료한다. 처음엔 주어지는 input string 은 짧지만 episode 가 커질수록 문자열도 길어진다.

* state : read head 의 위치
* actoin : 2(좌우) x 2(write or not) x 5(what to write)
* reward : 성공 시 + 1, 오류 시 -0.5, 시간초과 시 -1

% openAi Gym 주의 - episode 끝나고 state 업데이트 해주기

---

In [612]:
import gym
import numpy as np
import random

In [613]:
'''
환경 생성
'''
env = gym.make('Copy-v0')

In [614]:
print(env.action_space)
print(env.observation_space.n)

Tuple(Discrete(2), Discrete(2), Discrete(5))
6


In [623]:
q_table = np.zeros([20, env.observation_space.n])
gamma = .95
epsilon = 1
alpha = .1
episode_total = 2000
episode = 0

In [624]:
def decode_action(action):
    return [int(action/10), int((action/5) % 2), action % 5]

def encode_action(action):
    return action[0]*10 + action[1]*5 + action[2]

In [625]:
state = env.reset()
step = 0
reward_total = 0

while(episode < episode_total):
    step += 1
    if(random.random() < epsilon):
        action = env.action_space.sample()
    else:
        action = np.argmax(q_table[ : , state])
        action = decode_action(action)
    
    state_next, reward, done, _ = env.step(action)
    reward -= .5
    reward_total += reward
    action = encode_action(action)
    
    q_table[action, state] += alpha * (reward + np.max(gamma * q_table[ : , state_next]) - q_table[action, state])
    state = state_next

    if(done):
        if(reward > 0):
            if(epsilon < .1):
                epsilon = .1
            else:
                epsilon = 1 / (1 + episode / 100)
        episode += 1
        state = env.reset() # 꼭 해주기. 중요
        step = 0
        #print(reward_total)
        reward_total = 0
env.close()

In [626]:
print(q_table)
s = env.reset()
reward_sum = 0
ep = 0

while(True):
    a = np.argmax(q_table[ : , s])
    _a = decode_action(a)
    s,r,d,_ = env.step(_a)
    if(ep == 100):
        env.render()
        
    reward_sum += r
    if(d):
        ep += 1
        s = env.reset() # 꼭 해주기. 중요
        print(reward_sum)
        reward_sum = 0

[[-0.54661179 -0.46489457 -0.61591941 -0.52880045 -0.56840773 -0.74533029]
 [-0.56749593 -0.46386748 -0.42324346 -0.46989764 -0.38140557 -0.73779281]
 [-0.50468414 -0.5767562  -0.57632526 -0.43641531 -0.61283298 -0.74532157]
 [-0.54737332 -0.45170561 -0.53642309 -0.5008144  -0.50461826 -0.74525236]
 [-0.48921005 -0.52722067 -0.5038153  -0.39035242 -0.594993   -0.75848013]
 [ 0.1627189  -1.12285529 -0.97007823 -0.97079137 -0.76549421 -0.98002622]
 [-0.71379974  0.27116988 -0.52975732 -1.10448376 -0.98018781 -0.75626181]
 [-0.9956953  -0.76707688 -0.01683418 -0.94039167 -0.88133887 -0.96362635]
 [-0.68283179 -0.95512881 -1.06456391  0.00211023 -0.9756859  -0.97731365]
 [-0.94111413 -0.87157309 -0.84303162 -0.85647861 -0.01475626 -1.06392201]
 [-0.09840329 -0.12613058 -0.15748441 -0.18673757 -0.14779175 -0.2954341 ]
 [-0.15850104 -0.09464548 -0.13941369 -0.16251746 -0.16058574 -0.40190317]
 [-0.19754563 -0.12389672 -0.13357666 -0.16494898 -0.08356571 -0.43164803]
 [-0.14597337 -0.11919677

KeyboardInterrupt: 

# 어느 짱깨가 만든 몬테카를로 알고리즘
---

In [540]:
import numpy as np
import gym
import random

env = gym.make('Copy-v0')

# observation_space: Discrete(6)
# action_space: (Discrete(2), Discrete(2), Discrete(5))

# states: 0, 1, 2, 3, 4, 5

# MARK: problem specific functions


def action_to_index(action):
    return action[0]*10+action[1]*5+action[2]


def index_to_action(index):
    action = [int(index/10), int((index/5) % 2), index % 5]
    return tuple(action)


# MARK: Monte Carlo ES method

STATE_COUNT = 6
ACTION_COUNT = 20


def init_mces(state_count, action_count):
    q = np.random.rand(state_count, action_count)
    rets = np.zeros((state_count, action_count), dtype=np.double)
    policy = [random.randint(0, action_count-1) for _ in range(state_count)]
    return q, rets, policy


def learning(env):
    q, rets, policy = init_mces(STATE_COUNT, ACTION_COUNT)
    gamma = 0.7
    epsilon = 1
    total_score = 0.0
    i_episode = 0

    for i_episode in range(5000):
        total_reward = 0
        observation = env.reset()
        g = np.zeros((STATE_COUNT, ACTION_COUNT), dtype=np.double)
        passed = np.zeros((STATE_COUNT, ACTION_COUNT), dtype=np.double)

        for t in range(100):
            raw_action = policy[observation]
            # 1 - epsilon greedy
            if random.random() < epsilon:
                raw_action = action_to_index((random.randint(0, 1),
                                              random.randint(0, 1),
                                              random.randint(0, 4)))
            if passed[observation, raw_action] == 0.0:
                passed[observation, raw_action] = 1.0

            action = index_to_action(raw_action)
            #env.render()
            observation, reward, done, info = env.step(action)
            reward -= 0.5  # IMPORTANT: punish useless (even dangerous) actions whose environment reward is 0.0
            total_reward += reward

            for i in range(STATE_COUNT):
                for j in range(ACTION_COUNT):
                    passed[i][j] *= gamma
                    g[i][j] += reward * passed[i][j]
            if done:
                break

        # reduce exploration chance
        if i_episode % 100 == 0:
            epsilon *= 0.9

        rets += g
        q = rets / (i_episode+1)
        policy = np.argmax(q, axis=1).tolist()

        total_score += total_reward

    return policy, i_episode


def test_policy(env, policy):
    total_reward = 0.0
    obs = env.reset()
    for t in range(1000):
        action = index_to_action(policy[obs])
        obs, reward, done, info = env.step(action)
        total_reward += reward
        env.render()
        if done:
            break
    print('total reward: %f'%total_reward)


#env.monitor.start('Copyv0-experiment-0')
policy, n_episode = learning(env)
#env.monitor.close()

print('final policy: '+str(policy))
print('episodes trained: '+str(n_episode))
test_policy(env, policy)

final policy: [15, 16, 17, 18, 19, 15]
episodes trained: 4999
total reward: 30.000000
