In [4]:
import numpy as np
import random
import math
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import gym

def build_model(state_size, action_size):
    """Build and compile a small fully connected Q-network.

    Args:
        state_size: Number of input features (used as ``input_dim`` of the
            first dense layer).
        action_size: Number of output units; the output layer is linear, one
            value per action.

    Returns:
        A compiled Keras ``Sequential`` model using MSE loss and the Adam
        optimizer with a learning rate of 0.001.
    """
    model = Sequential()
    model.add(Dense(24, input_dim=state_size, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    # `lr` is a deprecated alias; `learning_rate` is the supported keyword.
    model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
    return model

def train_model():
    """Run one Q-learning update pass over a random minibatch.

    Reads the module-level ``memory``, ``batch_size``, ``model`` and
    ``target_model``. ``batch_size`` transitions are drawn from ``memory``
    without replacement and the online model is fitted on them one sample
    at a time. The bootstrapped target uses a discount factor of 0.95 and
    the target model's best next-state value; terminal transitions use the
    raw reward.
    """
    for state, action, reward, next_state, done in random.sample(memory, batch_size):
        state_row = state.reshape(1, -1)

        if done:
            target = reward
        else:
            next_q = target_model.predict(next_state.reshape(1, -1))[0]
            target = reward + 0.95 * np.amax(next_q)

        # Only the entry of the action actually taken is moved to the target;
        # the other outputs keep the model's own predictions.
        q_values = model.predict(state_row)
        q_values[0][action] = target
        model.fit(state_row, q_values, epochs=1, verbose=0)
# Environment plus the online and target Q-networks.
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
model = build_model(state_size, action_size)
target_model = build_model(state_size, action_size)
# The target network starts as an exact copy of the online network.
target_model.set_weights(model.get_weights())

done = False
batch_size = 32
EPISODES = 1000
memory = deque(maxlen=2000)

scores, steps = [], []
epsilon = 1.0
iteration = 0
max_memory = 1000
train_count = 0

for i in range(EPISODES):
    prev_obs = env.reset()
    score = 0
    step = 0
    # 1 = action still allowed for the greedy choice, 0 = masked out.
    not_move_list = np.ones(action_size)
    prev_max = np.max(prev_obs)

    while True:
        iteration += 1

        # Epsilon-greedy action selection.
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            x = np.expand_dims(prev_obs, axis=0)
            logits = model.predict(x)[0]
            # Subtract the maximum before exponentiating so large outputs
            # cannot overflow to inf (which would turn the ratio into NaN).
            exp_logits = np.exp(logits - np.max(logits))
            prob = exp_logits / np.sum(exp_logits)
            prob = prob * not_move_list
            action = np.argmax(prob)

        obs, reward, done, _ = env.step(action)
        score += reward
        step += 1

        # A zero reward with an unchanged observation is treated as a step
        # that did nothing: mask the action and skip the rest of this
        # iteration (including the `done` check below).
        if reward == 0 and np.array_equal(obs, prev_obs):
            not_move_list[action] = 0
            continue
        else:
            not_move_list = np.ones(action_size)

        # Custom reward: bonus when the largest observation entry grows.
        now_max = np.max(obs)
        if prev_max < now_max:
            prev_max = now_max
            # math.log raises ValueError for non-positive arguments and the
            # observation maximum can be negative here, so only take the
            # logarithm of a positive value.
            reward = math.log(now_max, 2) * 0.1 if now_max > 0 else 0
        else:
            reward = 0

        reward += np.count_nonzero(prev_obs) - np.count_nonzero(obs) + 1

        memory.append((prev_obs, action, reward, obs, done))

        # Train once the buffer holds max_memory transitions, then start over.
        if len(memory) >= max_memory:
            train_model()
            # Empty the buffer in place so `memory` stays the bounded deque
            # created above instead of being rebound to a plain list.
            memory.clear()
            train_count += 1
            # Refresh the target network on every fourth training round.
            if train_count % 4 == 0:
                target_model.set_weights(model.get_weights())

        prev_obs = obs

        # Decay epsilon every 2500 environment steps until it reaches 0.01.
        if epsilon > 0.01 and iteration % 2500 == 0:
            epsilon /= 1.005

        if done:
            break

    scores.append(score)
    steps.append(step)
    print(f"Episode {i}, Score: {score}, Steps: {step}, Max Tile: {np.max(obs)}, Memory Length: {len(memory)}")


Episode 0, Score: 12.0, Steps: 12, Max Tile: 1.5473836581486589, Memory Length: 12
Episode 1, Score: 24.0, Steps: 24, Max Tile: 1.0977964570272631, Memory Length: 36
Episode 2, Score: 12.0, Steps: 12, Max Tile: 0.7986387376905412, Memory Length: 48
Episode 3, Score: 19.0, Steps: 19, Max Tile: 1.6449153423780911, Memory Length: 67
Episode 4, Score: 13.0, Steps: 13, Max Tile: 0.9520763312532148, Memory Length: 80
Episode 5, Score: 26.0, Steps: 26, Max Tile: 0.36702100805069376, Memory Length: 106
Episode 6, Score: 17.0, Steps: 17, Max Tile: 0.7765547432174108, Memory Length: 123
Episode 7, Score: 55.0, Steps: 55, Max Tile: -0.11165308424733258, Memory Length: 178
Episode 8, Score: 17.0, Steps: 17, Max Tile: 1.7224061044007484, Memory Length: 195
Episode 9, Score: 17.0, Steps: 17, Max Tile: 1.432015931630174, Memory Length: 212
Episode 10, Score: 16.0, Steps: 16, Max Tile: 1.5507511709121578, Memory Length: 228
Episode 11, Score: 21.0, Steps: 21, Max Tile: 1.2112869230201573, Memory Lengt

Episode 147, Score: 22.0, Steps: 22, Max Tile: 0.40140751505457806, Memory Length: 14
Episode 148, Score: 26.0, Steps: 26, Max Tile: 0.1071278644817578, Memory Length: 40
Episode 149, Score: 17.0, Steps: 17, Max Tile: 1.7277142807241244, Memory Length: 57
Episode 150, Score: 22.0, Steps: 22, Max Tile: 1.4575393642594325, Memory Length: 79
Episode 151, Score: 25.0, Steps: 25, Max Tile: 0.8389630424057436, Memory Length: 104
Episode 152, Score: 15.0, Steps: 15, Max Tile: 0.6142327763520314, Memory Length: 119
Episode 153, Score: 31.0, Steps: 31, Max Tile: 0.4594793120911817, Memory Length: 150
Episode 154, Score: 19.0, Steps: 19, Max Tile: 1.27500085853756, Memory Length: 169
Episode 155, Score: 19.0, Steps: 19, Max Tile: 0.9700509506138373, Memory Length: 188
Episode 156, Score: 19.0, Steps: 19, Max Tile: 0.9836617998185444, Memory Length: 207
Episode 157, Score: 18.0, Steps: 18, Max Tile: 1.0122733445018524, Memory Length: 225
Episode 158, Score: 9.0, Steps: 9, Max Tile: 1.000729048819

Episode 284, Score: 15.0, Steps: 15, Max Tile: 0.14003743832101795, Memory Length: 24
Episode 285, Score: 10.0, Steps: 10, Max Tile: 2.62545185062659, Memory Length: 34
Episode 286, Score: 27.0, Steps: 27, Max Tile: 1.130973986733283, Memory Length: 61
Episode 287, Score: 27.0, Steps: 27, Max Tile: 0.5646817315454745, Memory Length: 88
Episode 288, Score: 26.0, Steps: 26, Max Tile: 0.25267947599358975, Memory Length: 114
Episode 289, Score: 16.0, Steps: 16, Max Tile: 1.1474414618918345, Memory Length: 130
Episode 290, Score: 12.0, Steps: 12, Max Tile: 1.555773579220557, Memory Length: 142
Episode 291, Score: 14.0, Steps: 14, Max Tile: 1.4849069773676007, Memory Length: 156
Episode 292, Score: 19.0, Steps: 19, Max Tile: 1.3712393382426118, Memory Length: 175
Episode 293, Score: 17.0, Steps: 17, Max Tile: 1.320504413978974, Memory Length: 192
Episode 294, Score: 17.0, Steps: 17, Max Tile: 0.9763057094335654, Memory Length: 209
Episode 295, Score: 22.0, Steps: 22, Max Tile: 0.784429782148

Episode 391, Score: 60.0, Steps: 60, Max Tile: 2.3558726203345794, Memory Length: 549
Episode 392, Score: 17.0, Steps: 17, Max Tile: 1.4499659738638737, Memory Length: 566
Episode 393, Score: 13.0, Steps: 13, Max Tile: 1.6068099669766545, Memory Length: 579
Episode 394, Score: 18.0, Steps: 18, Max Tile: 1.1582884276078673, Memory Length: 597
Episode 395, Score: 46.0, Steps: 46, Max Tile: 0.3906714275733594, Memory Length: 643
Episode 396, Score: 15.0, Steps: 15, Max Tile: 1.6030074778263554, Memory Length: 658
Episode 397, Score: 19.0, Steps: 19, Max Tile: 1.2927895348019984, Memory Length: 677
Episode 398, Score: 21.0, Steps: 21, Max Tile: 0.6172881475235801, Memory Length: 698
Episode 399, Score: 14.0, Steps: 14, Max Tile: 2.0177022500979866, Memory Length: 712
Episode 400, Score: 27.0, Steps: 27, Max Tile: 1.3143943939392846, Memory Length: 739
Episode 401, Score: 22.0, Steps: 22, Max Tile: 1.173924044244695, Memory Length: 761
Episode 402, Score: 46.0, Steps: 46, Max Tile: 2.424470

Episode 495, Score: 65.0, Steps: 65, Max Tile: -0.22433750186684417, Memory Length: 40
Episode 496, Score: 26.0, Steps: 26, Max Tile: 0.4675462212150936, Memory Length: 66
Episode 497, Score: 34.0, Steps: 34, Max Tile: 1.6987618969089389, Memory Length: 100
Episode 498, Score: 15.0, Steps: 15, Max Tile: 2.2054258904026245, Memory Length: 115
Episode 499, Score: 12.0, Steps: 12, Max Tile: 0.7924634078457526, Memory Length: 127
Episode 500, Score: 14.0, Steps: 14, Max Tile: 0.4563853782210958, Memory Length: 141
Episode 501, Score: 17.0, Steps: 17, Max Tile: 0.6855623925525702, Memory Length: 158
Episode 502, Score: 13.0, Steps: 13, Max Tile: 1.3874314672608479, Memory Length: 171
Episode 503, Score: 12.0, Steps: 12, Max Tile: 1.1471550996079694, Memory Length: 183
Episode 504, Score: 27.0, Steps: 27, Max Tile: -0.022676343321732635, Memory Length: 210
Episode 505, Score: 46.0, Steps: 46, Max Tile: 0.8374687156829972, Memory Length: 256
Episode 506, Score: 53.0, Steps: 53, Max Tile: 1.37

Episode 593, Score: 22.0, Steps: 22, Max Tile: 0.4220334534428306, Memory Length: 336
Episode 594, Score: 21.0, Steps: 21, Max Tile: 1.7256228526260258, Memory Length: 357
Episode 595, Score: 12.0, Steps: 12, Max Tile: 1.1794599200699967, Memory Length: 369
Episode 596, Score: 11.0, Steps: 11, Max Tile: 2.2729503181118162, Memory Length: 380
Episode 597, Score: 22.0, Steps: 22, Max Tile: 2.5307060223802864, Memory Length: 402
Episode 598, Score: 20.0, Steps: 20, Max Tile: 1.5315090182679603, Memory Length: 422
Episode 599, Score: 15.0, Steps: 15, Max Tile: 1.1946606224921048, Memory Length: 437
Episode 600, Score: 11.0, Steps: 11, Max Tile: 2.197302984991957, Memory Length: 448
Episode 601, Score: 47.0, Steps: 47, Max Tile: 0.273720702799894, Memory Length: 495
Episode 602, Score: 12.0, Steps: 12, Max Tile: 1.3477552286255288, Memory Length: 507
Episode 603, Score: 14.0, Steps: 14, Max Tile: 1.0604372892351537, Memory Length: 521
Episode 604, Score: 19.0, Steps: 19, Max Tile: 0.9784079

Episode 719, Score: 19.0, Steps: 19, Max Tile: 0.1136259122473486, Memory Length: 50
Episode 720, Score: 14.0, Steps: 14, Max Tile: 1.139112521998901, Memory Length: 64
Episode 721, Score: 32.0, Steps: 32, Max Tile: 0.42133962617789555, Memory Length: 96
Episode 722, Score: 59.0, Steps: 59, Max Tile: 0.660698576496346, Memory Length: 155
Episode 723, Score: 12.0, Steps: 12, Max Tile: 2.0174848079757908, Memory Length: 167
Episode 724, Score: 18.0, Steps: 18, Max Tile: 1.1692662580758015, Memory Length: 185
Episode 725, Score: 11.0, Steps: 11, Max Tile: 1.0002423051875995, Memory Length: 196
Episode 726, Score: 15.0, Steps: 15, Max Tile: 0.8916267428426712, Memory Length: 211
Episode 727, Score: 12.0, Steps: 12, Max Tile: 1.502473238514994, Memory Length: 223
Episode 728, Score: 15.0, Steps: 15, Max Tile: 1.7995881367849855, Memory Length: 238
Episode 729, Score: 15.0, Steps: 15, Max Tile: 0.6325949702564948, Memory Length: 253
Episode 730, Score: 12.0, Steps: 12, Max Tile: 0.8364634855

Episode 862, Score: 78.0, Steps: 78, Max Tile: 1.8206571281857862, Memory Length: 241
Episode 863, Score: 18.0, Steps: 18, Max Tile: 1.5309540654173017, Memory Length: 259
Episode 864, Score: 12.0, Steps: 12, Max Tile: 2.6441768077185195, Memory Length: 271
Episode 865, Score: 29.0, Steps: 29, Max Tile: 0.5637713017865298, Memory Length: 300
Episode 866, Score: 13.0, Steps: 13, Max Tile: 2.2940470960937507, Memory Length: 313
Episode 867, Score: 24.0, Steps: 24, Max Tile: 0.4611692710121188, Memory Length: 337
Episode 868, Score: 11.0, Steps: 11, Max Tile: 1.3662750578281262, Memory Length: 348
Episode 869, Score: 20.0, Steps: 20, Max Tile: 1.6173062122836361, Memory Length: 368
Episode 870, Score: 15.0, Steps: 15, Max Tile: 0.9677732562770349, Memory Length: 383
Episode 871, Score: 14.0, Steps: 14, Max Tile: 0.8850101581888568, Memory Length: 397
Episode 872, Score: 21.0, Steps: 21, Max Tile: 1.7663261257058633, Memory Length: 418
Episode 873, Score: 17.0, Steps: 17, Max Tile: 0.62139

Episode 989, Score: 19.0, Steps: 19, Max Tile: 0.641922478659884, Memory Length: 65
Episode 990, Score: 21.0, Steps: 21, Max Tile: 0.7621921373917605, Memory Length: 86
Episode 991, Score: 28.0, Steps: 28, Max Tile: 1.9060403906797474, Memory Length: 114
Episode 992, Score: 10.0, Steps: 10, Max Tile: 2.504598728146001, Memory Length: 124
Episode 993, Score: 15.0, Steps: 15, Max Tile: 1.25309309908471, Memory Length: 139
Episode 994, Score: 19.0, Steps: 19, Max Tile: 0.2552389093801233, Memory Length: 158
Episode 995, Score: 16.0, Steps: 16, Max Tile: 0.832321597523221, Memory Length: 174
Episode 996, Score: 12.0, Steps: 12, Max Tile: 0.7432759795534274, Memory Length: 186
Episode 997, Score: 10.0, Steps: 10, Max Tile: 2.495208987855915, Memory Length: 196
Episode 998, Score: 15.0, Steps: 15, Max Tile: 1.2093064352788028, Memory Length: 211
Episode 999, Score: 20.0, Steps: 20, Max Tile: 2.2267417891418315, Memory Length: 231


In [14]:
# Environment manipulation: remove gym-2048, scipy and scikit-learn, then
# replace the installed numpy with 1.20.0.
!pip uninstall gym-2048 -y
# NOTE(review): the recorded pip output lists keras, gym, gensim,
# scikit-image, seaborn and statsmodels as requiring scipy, which this
# line removes.
!pip uninstall scipy scikit-learn -y
# The recorded output shows this removing the installed numpy 1.20.1, not a
# 1.14.0 build.
!pip uninstall numpy==1.14.0 -y
# NOTE(review): the recorded pip output reports that tensorflow 2.4.1
# requires numpy~=1.19.2 and flags numpy 1.20.0 as incompatible.
!pip install numpy==1.20.0





Found existing installation: scipy 1.10.1
Uninstalling scipy-1.10.1:
  Successfully uninstalled scipy-1.10.1
Found existing installation: scikit-learn 1.3.2
Uninstalling scikit-learn-1.3.2:
  Successfully uninstalled scikit-learn-1.3.2
Found existing installation: numpy 1.20.1
Uninstalling numpy-1.20.1:


You can safely remove it manually.


  Successfully uninstalled numpy-1.20.1
Collecting numpy==1.20.0
  Downloading numpy-1.20.0-cp38-cp38-win_amd64.whl.metadata (2.0 kB)
Downloading numpy-1.20.0-cp38-cp38-win_amd64.whl (13.7 MB)
   ---------------------------------------- 13.7/13.7 MB 2.2 MB/s eta 0:00:00
Installing collected packages: numpy
Successfully installed numpy-1.20.0


DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
keras 2.4.3 requires scipy>=0.14, which is not installed.
gensim 3.8.3 requires scipy>=0.18.1, which is not installed.
gym 0.10.11 requires scipy, which is not installed.
scikit-image 0.17.2 requires scipy>=1.0.1, which is not installed.
seaborn 0.11.0 requires scipy>=1.0, which is not installed.
statsmodels 0.12.0 requires scipy>=1.1, which is not installed.
tensorflow 2.4.1 requires numpy~=1.19.2, but you have numpy 1.20.0 which is incompatible.


In [13]:
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass

import tensorflow as tf
print(tf.__version__)
# 현재 model.predict() 의 속도가 느린 문제가 있어서 eager_execution을 끔
tf.compat.v1.disable_eager_execution()

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge

# 데이터 생성
np.random.seed(0)
x = np.array(range(20))
y = x - 2 * (x ** 2) + 0.5 * (x ** 3) + np.random.normal(-40, 40, 20)
x = np.reshape(x, (-1, 1))
y = np.reshape(y, (-1, 1))

plt.figure(figsize=(15,5))

# 첫 번째 서브플롯
plt.subplot(1, 3, 1)
plt.scatter(x, y, s=10, color='gray')

select = np.random.choice(20, 5, replace=False)
select.sort()
clf = Ridge(alpha=100, normalize=True)
clf.fit(x[select], y[select])

plt.scatter(x[select], y[select], s=20, color='r')
plt.plot(x, clf.predict(x) - clf.intercept_, color='blue')

# 두 번째 서브플롯
plt.subplot(1, 3, 2)
plt.scatter(x, y, s=10, color='k')

select = np.array(range(5))
clf = Ridge(alpha=100, normalize=True)
clf.fit(x[select], y[select])

plt.scatter(x[select], y[select], s=20, color='r')
plt.plot(x, clf.predict(x) - clf.intercept_, color='blue')

# 세 번째 서브플롯
plt.subplot(1, 3, 3)
plt.scatter(x, y, s=10, color='k')

select = np.array(range(15, 20))
clf = Ridge(alpha=100, normalize=True)
clf.fit(x[select], y[select])

plt.scatter(x[select], y[select], s=20, color='r')
plt.plot(x, clf.predict(x) - clf.intercept_, color='blue')

plt.show()


2.4.1



  atlas_info:
  No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
  customize MSVCCompiler
    libraries f77blas,cblas,atlas not found in c:\users\mr_k\anaconda3\lib
  No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
  customize MSVCCompiler
    libraries lapack_atlas not found in c:\users\mr_k\anaconda3\lib
  No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
  customize MSVCCompiler
    libraries f77blas,cblas,atlas not found in C:\
  No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
  customize MSVCCompiler
    libraries lapack_atlas not found in C:\
  No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
  customize MSVCCompiler
    libraries f77blas,cblas,atlas not found in c:\users\mr_k\anaconda3\libs
  No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from dist

  C:\Program Files\Microsoft Visual Studio\2022\Preview\VC\Tools\MSVC\14.40.33807\bin\HostX86\x64\cl.exe /c /nologo /Ox /W3 /GL /DNDEBUG /MD -Inumpy\core\src\private -Inumpy\core\src -Inumpy\core -Inumpy\core\src\npymath -Inumpy\core\src\multiarray -Inumpy\core\src\umath -Inumpy\core\src\npysort -Ic:\users\mr_k\anaconda3\include -Ic:\users\mr_k\anaconda3\include -I"C:\Program Files\Microsoft Visual Studio\2022\Preview\VC\Tools\MSVC\14.40.33807\include" -I"C:\Program Files\Microsoft Visual Studio\2022\Preview\VC\Tools\MSVC\14.40.33807\ATLMFC\include" -I"C:\Program Files\Microsoft Visual Studio\2022\Preview\VC\Auxiliary\VS\include" -I"C:\Program Files (x86)\Windows Kits\10\include\10.0.22621.0\ucrt" -I"C:\Program Files (x86)\Windows Kits\10\\include\10.0.22621.0\\um" -I"C:\Program Files (x86)\Windows Kits\10\\include\10.0.22621.0\\shared" -I"C:\Program Files (x86)\Windows Kits\10\\include\10.0.22621.0\\winrt" -I"C:\Program Files (x86)\Windows Kits\10\\include\10.0.22621.0\\cppwinrt" -I"C

  #define HAVE_SINF 1
  #define HAVE_COSF 1
  #define HAVE_TANF 1
  #define HAVE_SINHF 1
  #define HAVE_COSHF 1
  #define HAVE_TANHF 1
  #define HAVE_FLOORF 1
  #define HAVE_CEILF 1
  #define HAVE_RINTF 1
  #define HAVE_TRUNCF 1
  #define HAVE_SQRTF 1
  #define HAVE_LOG10F 1
  #define HAVE_LOGF 1
  #define HAVE_LOG1PF 1
  #define HAVE_EXPF 1
  #define HAVE_EXPM1F 1
  #define HAVE_ASINF 1
  #define HAVE_ACOSF 1
  #define HAVE_ATANF 1
  #define HAVE_ASINHF 1
  #define HAVE_ACOSHF 1
  #define HAVE_ATANHF 1
  #define HAVE_ATAN2F 1
  #define HAVE_POWF 1
  #define HAVE_FMODF 1
  #define HAVE_MODFF 1
  #define HAVE_EXP2F 1
  #define HAVE_LOG2F 1
  #define HAVE_COPYSIGNF 1
  #define HAVE_NEXTAFTERF 1
  #define HAVE_CBRTF 1
  #define HAVE_RINTL 1
  #define HAVE_TRUNCL 1
  #define HAVE_LOG1PL 1
  #define HAVE_EXPM1L 1
  #define HAVE_ASINHL 1
  #define HAVE_ACOSHL 1
  #define HAVE_ATANHL 1
  #define HAVE_EXP2L 1
  #define HAVE_LOG2L 1
  #define HAVE_COPYSIGNL 1
  #define HAVE_NEXTAFTERL 1
  #defi

ImportError: DLL load failed while importing _iterative: 지정된 모듈을 찾을 수 없습니다.

In [None]:
# Install gym_2048 on its own; --no-dependencies tells pip not to install the
# package's requirements.
# NOTE(review): the version is not pinned — consider pinning it for
# reproducibility.
!pip install gym_2048 --no-dependencies

In [None]:
import gym_2048
import gym

# Create the 2048 environment and inspect the initial observation together
# with the observation and action spaces.
env = gym.make('2048-v0')
obs = env.reset()

for item in (obs, env.observation_space, env.action_space):
    print(item)

# Apply action 0 once and show the observation that comes back.
obs, _, _, _ = env.step(0)
print(obs)

In [None]:
# Play a single episode with uniformly sampled actions and report the
# accumulated reward, the number of steps and the final observation.
score, step = 0, 0
obs = env.reset()

done = False
while not done:
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    score += reward
    step += 1

print('score:', score, 'step:', step)
print(obs)

In [None]:
layer_count = 12
# Maps a tile value (a power of two) to the index of its one-hot channel.
table = {1 << exponent: exponent for exponent in range(layer_count)}
print(table)

def preprocess(obs):
    """One-hot encode a 4x4 board into a (4, 4, layer_count) array.

    Cells whose value is not positive set channel 0. Positive values are
    capped at ``2 ** (layer_count - 1)`` and set the channel given by
    ``table``; a positive value that is not a key of ``table`` raises
    ``KeyError``.

    Args:
        obs: Board indexable as ``obs[i, j]`` for ``i, j`` in ``0..3``.

    Returns:
        A float array of zeros with exactly one 1 per cell.
    """
    largest_tile = 2 ** (layer_count - 1)
    encoded = np.zeros((4, 4, layer_count))
    for row, col in np.ndindex(4, 4):
        tile = obs[row, col]
        channel = table[min(tile, largest_tile)] if tile > 0 else 0
        encoded[row, col, channel] = 1
    return encoded

In [None]:
def build_model():
    """Build and compile the convolutional Q-network for the encoded board.

    The input has shape ``(4, 4, layer_count)``. Two first-level convolutions
    (2x1 and 1x2 kernels) are each followed by a 2x1 and a 1x2 convolution.
    All six feature maps are flattened and concatenated, passed through a
    256-unit ReLU layer and mapped to 4 linear outputs.

    Returns:
        A ``tf.keras.Model`` compiled with MSE loss and RMSprop
        (learning rate 0.0005).
    """
    first_level_filters = 128
    second_level_filters = 128
    layers = tf.keras.layers

    board = tf.keras.Input(shape=(4, 4, layer_count))

    # First level: one vertical (2x1) and one horizontal (1x2) kernel.
    conv_a = layers.Conv2D(first_level_filters, kernel_size=(2,1), activation='relu')(board)
    conv_b = layers.Conv2D(first_level_filters, kernel_size=(1,2), activation='relu')(board)

    # Second level: both kernel shapes applied to each first-level map.
    conv_aa = layers.Conv2D(second_level_filters, kernel_size=(2,1), activation='relu')(conv_a)
    conv_ab = layers.Conv2D(second_level_filters, kernel_size=(1,2), activation='relu')(conv_a)
    conv_ba = layers.Conv2D(second_level_filters, kernel_size=(2,1), activation='relu')(conv_b)
    conv_bb = layers.Conv2D(second_level_filters, kernel_size=(1,2), activation='relu')(conv_b)

    flattened = []
    for feature_map in (conv_a, conv_b, conv_aa, conv_ab, conv_ba, conv_bb):
        flattened.append(layers.Flatten()(feature_map))

    merged = layers.Concatenate()(flattened)
    hidden = layers.Dense(256, activation='relu')(merged)
    q_values = layers.Dense(4, activation='linear')(hidden)

    network = tf.keras.Model(inputs=board, outputs=q_values)
    network.compile(optimizer=tf.optimizers.RMSprop(learning_rate=0.0005), loss='mse')
    return network

model = build_model()
target_model = build_model()
# Each build_model() call creates its own freshly initialised network, so
# copy the online weights into the target network before any training;
# otherwise the first bootstrapped targets come from an unrelated network.
target_model.set_weights(model.get_weights())

In [None]:
# Replay / training hyperparameters.
gamma = 0.9
batch_size = 512
max_memory = 8 * batch_size
memory = []

def append_sample(state, action, reward, next_state, done):
    """Append one transition to the module-level ``memory`` list.

    The transition is stored as the 5-element list
    ``[state, action, reward, next_state, done]``.
    """
    transition = [state, action, reward, next_state, done]
    memory.append(transition)

def train_model():
    """Fit the online network on the replay memory, one mini-batch at a time.

    Reads the module-level ``memory``, ``batch_size``, ``layer_count``,
    ``gamma``, ``model`` and ``target_model``. ``memory`` is shuffled in
    place and consumed in consecutive chunks of ``batch_size`` transitions;
    a trailing chunk smaller than ``batch_size`` is skipped. For every chunk
    the online model's predictions are used as the regression target, with
    the entry of the taken action replaced by ``reward`` (terminal) or
    ``reward + gamma * max(target_model(next_state))`` (non-terminal).
    """
    np.random.shuffle(memory)

    # Derive the batch count from what is actually stored so a memory shorter
    # than max_memory cannot cause an out-of-range access. The builtin `len`
    # is no longer shadowed by a local variable.
    num_batches = len(memory) // batch_size
    row_index = np.arange(batch_size)

    for k in range(num_batches):
        mini_batch = memory[k * batch_size:(k + 1) * batch_size]

        states = np.zeros((batch_size, 4, 4, layer_count))
        next_states = np.zeros((batch_size, 4, 4, layer_count))
        actions, rewards, dones = [], [], []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            states[i] = state
            actions.append(action)
            rewards.append(reward)
            next_states[i] = next_state
            dones.append(done)

        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=float)
        dones = np.asarray(dones, dtype=bool)

        target = model.predict(states)
        next_target = target_model.predict(next_states)

        # Terminal transitions use the raw reward; all others bootstrap from
        # the target network's best next-state value.
        bootstrapped = rewards + gamma * np.amax(next_target, axis=1)
        target[row_index, actions] = np.where(dones, rewards, bootstrapped)

        model.fit(states, target, batch_size=batch_size, epochs=1, verbose=0)

In [None]:
import math
import random

def softmax(logits):
    """Return the softmax of ``logits``.

    The maximum is subtracted before exponentiating, so the result is the
    same as the plain formula but large inputs do not overflow.
    """
    shifted = logits - np.max(logits)
    weights = np.exp(shifted)
    return weights / np.sum(weights)

max_episodes = 10001
epsilon = 0.9
# NOTE(review): epsilon_min is never read in this cell; the decay below stops
# at a literal 0.01 — confirm which floor is intended.
epsilon_min = 0.1

scores, steps = [], []
iteration = 0

train_count = 0

for i in range(max_episodes):
    # Every 100 episodes (but not before the first), report the means over
    # the last 100 episodes.
    if i != 0 and i % 100 == 0:
        print('score mean:', np.mean(scores[-100:]), 'step mean:', np.mean(steps[-100:]), 'iteration:', iteration, 'epsilon:', epsilon)

    prev_obs = env.reset()

    score, step = 0, 0
    # 1 = action still allowed for the greedy choice, 0 = masked out.
    not_move_list = np.array([1,1,1,1])
    prev_max = np.max(prev_obs)

    while True:
        iteration += 1

        # Epsilon-greedy: the greedy branch takes the best action that is
        # not masked; otherwise a random action is sampled.
        if random.random() >= epsilon:
            x = preprocess(prev_obs)
            logits = model.predict(np.expand_dims(x, axis=0))[0]
            prob = softmax(logits) * not_move_list
            action = np.argmax(prob)
        else:
            action = env.action_space.sample()

        obs, reward, done, info = env.step(action)

        score += reward
        step += 1

        # A zero reward with an unchanged board is treated as a move that did
        # nothing: mask the action and skip the rest of this iteration
        # (including the `done` check below).
        if reward == 0 and np.array_equal(obs, prev_obs):
            not_move_list[action] = 0
            continue
        not_move_list = np.array([1,1,1,1])

        # Custom reward: 0.1 * log2(max tile) when the maximum tile grows,
        # plus (non-zero cells before - non-zero cells after + 1).
        now_max = np.max(obs)
        reward = 0
        if prev_max < now_max:
            prev_max = now_max
            reward = math.log(now_max, 2) * 0.1
        reward += np.count_nonzero(prev_obs) - np.count_nonzero(obs) + 1

        append_sample(preprocess(prev_obs), action, reward, preprocess(obs), done)

        # Train once the buffer holds max_memory transitions, then start a
        # fresh buffer by rebinding the module-level name.
        if len(memory) >= max_memory:
            train_model()
            memory = []

            train_count += 1
            # Copy the online weights into the target network on every
            # fourth training round.
            if train_count % 4 == 0:
                target_model.set_weights(model.get_weights())

        prev_obs = obs

        # Decay epsilon every 2500 environment steps while it is above 0.01.
        if iteration % 2500 == 0 and epsilon > 0.01:
            epsilon /= 1.005

        if done:
            break

    scores.append(score)
    steps.append(step)

    print(i, 'score:', score, 'step:', step, 'max tile:', np.max(obs), 'memory len:', len(memory))

In [None]:
import matplotlib.pyplot as plt

N = 100
# Mean score of every full window of N consecutive episodes.
window_starts = range(len(scores) - N + 1)
rolling_mean = [np.mean(scores[start:start + N]) for start in window_starts]

plt.figure(figsize=(12,4))

# Left panel: raw score of each episode.
plt.subplot(1, 2, 1)
plt.scatter(range(len(scores)), scores, marker='.')

# Right panel: rolling mean over N episodes.
plt.subplot(1, 2, 2)
plt.plot(rolling_mean)

plt.show()

In [None]:
max_memory = 64 * 512

# action_swap_array[a][k] is the action stored for the k-th board variant
# when the original action was `a`.
# NOTE(review): the mapping presumably relies on the environment's action
# numbering — verify against gym_2048.
action_swap_array = [[0, 0, 2, 2, 1, 3, 1, 3],
                     [1, 3, 1, 3, 0, 0, 2, 2],
                     [2, 2, 0, 0, 3, 1, 3, 1],
                     [3, 1, 3, 1, 2, 2, 0, 0]]

def _board_variants(board):
    """Return the eight flipped / transposed views of an encoded board.

    Order: original, rows reversed, columns reversed, both reversed, then
    the same four for the board with its first two axes swapped.
    """
    cols_reversed = board[:,::-1,:]
    swapped = board.swapaxes(0,1)
    swapped_cols_reversed = swapped[:,::-1,:]
    return [
        board,
        board[::-1,:,:],
        cols_reversed,
        cols_reversed[::-1,:,:],
        swapped,
        swapped[::-1,:,:],
        swapped_cols_reversed,
        swapped_cols_reversed[::-1,:,:],
    ]

def append_sample(state, action, reward, next_state, done):
    """Append a transition and its seven transformed copies to ``memory``.

    ``state`` and ``next_state`` are transformed identically; the stored
    action for each variant comes from ``action_swap_array``. ``reward`` and
    ``done`` are stored unchanged. Each entry is the list
    ``[state, action, reward, next_state, done]``.
    """
    paired_variants = zip(_board_variants(state), _board_variants(next_state))
    for variant, (state_view, next_state_view) in enumerate(paired_variants):
        memory.append([
            state_view,
            action_swap_array[action][variant],
            reward,
            next_state_view,
            done,
        ])

In [None]:
max_episodes = 10001
epsilon = 0.9
# NOTE(review): epsilon_min is never read in this cell; the decay below stops
# at a literal 0.01 — confirm which floor is intended.
epsilon_min = 0.1

scores, steps = [], []
iteration = 0
train_count = 0

for i in range(max_episodes):
    # Every 100 episodes (but not before the first), report the means over
    # the last 100 episodes.
    if i != 0 and i % 100 == 0:
        print('score mean:', np.mean(scores[-100:]), 'step mean:', np.mean(steps[-100:]), 'iteration:', iteration, 'epsilon:', epsilon)

    prev_obs = env.reset()

    score, step = 0, 0
    # 1 = action still allowed for the greedy choice, 0 = masked out.
    not_move_list = np.array([1,1,1,1])
    prev_max = np.max(prev_obs)

    while True:
        iteration += 1

        # Epsilon-greedy: the greedy branch takes the best action that is
        # not masked; otherwise a random action is sampled.
        if random.random() >= epsilon:
            x = preprocess(prev_obs)
            logits = model.predict(np.expand_dims(x, axis=0))[0]
            prob = softmax(logits) * not_move_list
            action = np.argmax(prob)
        else:
            action = env.action_space.sample()

        obs, reward, done, info = env.step(action)

        score += reward
        step += 1

        # A zero reward with an unchanged board is treated as a move that did
        # nothing: mask the action and skip the rest of this iteration
        # (including the `done` check below).
        if reward == 0 and np.array_equal(obs, prev_obs):
            not_move_list[action] = 0
            continue
        not_move_list = np.array([1,1,1,1])

        # Custom reward: 0.1 * log2(max tile) when the maximum tile grows,
        # plus (non-zero cells before - non-zero cells after + 1).
        now_max = np.max(obs)
        reward = 0
        if prev_max < now_max:
            prev_max = now_max
            reward = math.log(now_max, 2) * 0.1
        reward += np.count_nonzero(prev_obs) - np.count_nonzero(obs) + 1

        append_sample(preprocess(prev_obs), action, reward, preprocess(obs), done)

        # Train once the buffer holds max_memory transitions, then start a
        # fresh buffer by rebinding the module-level name.
        if len(memory) >= max_memory:
            train_model()
            memory = []

            train_count += 1
            # Copy the online weights into the target network on every
            # fourth training round.
            if train_count % 4 == 0:
                target_model.set_weights(model.get_weights())

        prev_obs = obs

        # Decay epsilon every 2500 environment steps while it is above 0.01.
        if iteration % 2500 == 0 and epsilon > 0.01:
            epsilon /= 1.005

        if done:
            break

    scores.append(score)
    steps.append(step)

    print(i, 'score:', score, 'step:', step, 'max tile:', np.max(obs), 'memory len:', len(memory))