In [1]:
import Env2048
import numpy as np
from collections import deque
import json
import os
from pprint import pprint

In [5]:
class MC_agent:
    def __init__(self):
        self.env = Env2048.Game2048()
        self.action = ['L','R','U','D'] #좌,우,위,아래
        self.filename = 'save_value.json'
        self.value_list = {}
        #self.value_list = self.load_data()
        self.gamma = 0.9
        self.e = 0.1
        self.memory = deque()
        self.learning_rate = 0.05
    
    def get_action(self,state): #수정필요
        if np.random.rand() < self.e :
            idx = np.random.choice(len(self.action),1)[0]
        else:
            next_values = np.array([])

            #각 이동시 얻을 수 있는 값들 저장
            #모든 경우를 다 봐야 함. 수정 요망 - 재현이형한테 부탁해야할듯. 문제발견했어요. 여기서.
            #그냥 막해도 할 수는 있는데...고민입니다.
            #up
            next_state = self.env.move_up(1)
            if next_state not in self.value_list:
                self.value_list[next_state] = 0 #value_list에 없으면 0 그 value를 0으로
            next_values = np.append(next_values,self.value_list[next_state])

            #down
            next_state = self.env.move_down(1)
            if next_state not in self.value_list:
                self.value_list[next_state] = 0 #value_list에 없으면 0 그 value를 0으로
            next_values = np.append(next_values,self.value_list[next_state])

            #left
            next_state = self.env.move_left(1)
            if next_state not in self.value_list:
                self.value_list[next_state] = 0 #value_list에 없으면 0 그 value를 0으로
            next_values = np.append(next_values,self.value_list[next_state])

            #right
            next_state = self.env.move_right(1)
            if next_state not in self.value_list:
                self.value_list[next_state] = 0 #value_list에 없으면 0 그 value를 0으로
            next_values = np.append(next_values,self.value_list[next_state])

            max_val = max(next_values)

            action_list = np.where(next_values == max_val)[0]

            if len(action_list) > 1:
                idx = np.random.choice(action_list,1)[0]
            else:
                idx = action_list[0]

        action = self.action[idx]
            
        return action

    def update(self):
        G_t = 0

        # Iterate through the memory list
        while(self.memory):
            # Pop the last sample from the memory list
            sample = self.memory.pop()
            state = sample[1]
            reward = sample[2]
            
            # Calculate the discounted return
            G_t = reward + self.gamma*G_t
            
            # Convert the state to a tuple
            state = self.env.grid_to_tuple(state)
            
            # Add the state to the value_list if it is not included
            if state not in self.value_list:
                self.value_list[state] = 0
            
            # Get the value of the state from the value_list
            V_t = self.value_list[state]
            
            # Update the value of the state using TD(0) update rule
            self.value_list[state] = V_t + self.learning_rate*(G_t-V_t)

    # 데이터 저장 함수
    def save_data(self):
        with open(self.filename, 'w') as f:
            json.dump(self.value_list, f)

    # 데이터 불러오기 함수
#     def load_data(self):
#         with open(self.filename, 'r') as f:
#             data = json.load(f)

#         self.value_list = data.get("value_list", {})

In [None]:
agent = MC_agent()
num_episodes = 100000
max_reward = 0
max_grid = 0
result_expectation = deque()

for episode in range(num_episodes):
    total_reward = 0
    state = agent.env.reset()
    action = agent.get_action(state)
    done = False
    walk = 0
    
    while True:
        done,next_state,reward = agent.env.step(action)
        data = (done,agent.env.grid_to_tuple(state),reward)
        agent.memory.append(data)
        walk += 1
        state = next_state
        action = agent.get_action(state)
        
        if not done:
            total_reward = reward
            max_reward = max(total_reward,max_reward)
            if(total_reward == max_reward):
                max_grid = state
                
            if(len(result_expectation)==100):
                result_expectation.popleft()
            
            result_expectation.append(total_reward)
                
            if episode % 5000 == 0:
                print('finished at', state)
                print('episode :{}, The number of step:{}\n The total reward is: {}\nThe Max reward is : {}\nThe Max grid is : {}\nexpectation of 100 is : {}\n'.format(episode, walk, total_reward,max_reward,max_grid,sum(result_expectation)/100))
            
            agent.update()
            break

print('The Max : ', max_reward)
pprint(agent.value_list)

finished at [[8, 2, 16, 2], [8, 32, 2, 4], [4, 8, 4, 2], [2, 16, 2, 4]]
episode :0, The number of step:54
 The total reward is: 392
The Max reward is : 392
The Max grid is : [[8, 2, 16, 2], [8, 32, 2, 4], [4, 8, 4, 2], [2, 16, 2, 4]]
expectation of 100 is : 3.92

finished at [[2, 4, 64, 2], [8, 16, 8, 4], [2, 4, 32, 2], [16, 2, 16, 4]]
episode :5000, The number of step:86
 The total reward is: 802
The Max reward is : 2376
The Max grid is : [[2, 4, 2, 16], [8, 256, 2, 4], [16, 2, 4, 2], [2, 4, 32, 8]]
expectation of 100 is : 692.86

finished at [[4, 2, 4, 2], [16, 4, 8, 2], [4, 8, 32, 4], [2, 32, 8, 4]]
episode :10000, The number of step:58
 The total reward is: 476
The Max reward is : 2592
The Max grid is : [[2, 8, 16, 2], [16, 32, 2, 8], [256, 16, 8, 2], [2, 8, 2, 32]]
expectation of 100 is : 680.94

finished at [[8, 16, 4, 2], [2, 2, 8, 4], [4, 16, 16, 8], [16, 4, 32, 2]]
episode :15000, The number of step:64
 The total reward is: 500
The Max reward is : 3052
The Max grid is : [[2, 4