value_network.py
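"""Value network for a 5x5 capture game, trained by self-play Q-learning.

A residual CNN scores (board, piece, direction) actions; epsilon-greedy
policies drive self-play, and the recorded games train the network.
"""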
import numpy as np
import chess_rule as rule
from util import add_print_time_fun, print_use_time
from record import Record
import logging
import util
logger = logging.getLogger('train')
class NoActionException(Exception):
    pass
class ValueNetwork:
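    """Q-value network: a residual CNN over 5x5x5 board feature planes."""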
    def __init__(self, epsilon=1.0, epsilon_decay=1e-5, hidden_activation='relu',
                 output_activation='sigmoid', lr=1e-3, model=None, model_file=None,
                 weights_file=None):
        self.epsilon = epsilon
        self._epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.hidden_activation = hidden_activation
        self.output_activation = output_activation
        self.lr = lr
        self.model_file = model_file
        self.predicts = set()
        # Values from the previous step, kept for debugging
        self.q_value = None
        self.valid = None
        self.vq = None
        self.episode = 0  # current training episode
        if model:
            self.model = model
        elif model_file:
            logger.info('load model from: %s', model_file)
            self.model = self.load_model(model_file)
        else:
            self.model = self.create_model()
        if weights_file:
            logger.info('load model weights: %s', weights_file)
            self.model.load_weights(weights_file)
        util.show_model(self.model)
    @staticmethod
    def load_model(model_file):
        logger.info('load model in ValueNetwork')
        from keras.models import load_model
        model = load_model(model_file)
        '''
        # The output layer's regularization and the SGD learning rate
        # were changed here mid-training:
        l = 1e-6
        for layer in model.layers:
            layer.kernel_regularizer = l2(l)
            layer.bias_regularizer = l2(l)
        l = 0.01
        out = model.get_layer(index=-1)
        out.kernel_regularizer = l2(l)
        out.bias_regularizer = l2(l)
        '''
        # model.optimizer = SGD(lr=1e-5, decay=1e-6)
        return model
    @staticmethod
    def show_model(model):
        for l in model.layers:
            if hasattr(l, 'kernel_regularizer'):
                if l.kernel_regularizer:
                    print(l.kernel_regularizer.get_config())
                else:
                    print('NO regularizer')
        print('optimizer:', model.optimizer.get_config())
    def create_model(self):
        from keras.models import Model
        from keras.layers import Input, Dense, Convolution2D, Activation, Flatten
        from keras.optimizers import Adam, SGD
        from keras.regularizers import l2
        from keras.layers.merge import add
        l = 1e-3

        def identity_block(x, nb_filter, kernel_size=3):
            k1, k2, k3 = nb_filter
            y = Convolution2D(filters=k1, kernel_size=1, strides=1, activation=self.hidden_activation,
                              kernel_regularizer=l2(l), bias_regularizer=l2(l))(x)
            y = Convolution2D(filters=k2, kernel_size=kernel_size, strides=1, padding='same',
                              activation=self.hidden_activation,
                              kernel_regularizer=l2(l), bias_regularizer=l2(l))(y)
            y = Convolution2D(filters=k3, kernel_size=1, strides=1,
                              kernel_regularizer=l2(l), bias_regularizer=l2(l))(y)
            y = add([x, y])
            return Activation(self.hidden_activation)(y)

        def conv_block(x, nb_filter, kernel_size=3):
            k1, k2, k3 = nb_filter
            y = Convolution2D(filters=k1, kernel_size=1, strides=1, activation=self.hidden_activation,
                              kernel_regularizer=l2(l), bias_regularizer=l2(l))(x)
            y = Convolution2D(filters=k2, kernel_size=kernel_size, strides=1, padding='same',
                              activation=self.hidden_activation,
                              kernel_regularizer=l2(l), bias_regularizer=l2(l))(y)
            y = Convolution2D(filters=k3, kernel_size=1, strides=1,
                              kernel_regularizer=l2(l), bias_regularizer=l2(l))(y)
            # 1x1 convolution on the shortcut so channel counts match before the add
            x = Convolution2D(filters=k3, kernel_size=1, strides=1,
                              kernel_regularizer=l2(l), bias_regularizer=l2(l))(x)
            y = add([x, y])
            return Activation(self.hidden_activation)(y)

        # Input layer
        input_ = Input(shape=(5, 5, 5))
        # First convolutional layer
        out = Convolution2D(
            filters=100,                        # number of filters
            kernel_size=3,                      # convolution window size
            input_shape=(5, 5, 5),              # shape of the input planes
            strides=1,                          # stride
            padding='same',                     # 'same' keeps the map size; 'valid' shrinks it
            activation=self.hidden_activation,  # activation function
            kernel_regularizer=l2(l),
            bias_regularizer=l2(l)
        )(input_)
        out = identity_block(out, (100, 100, 100))
        out = identity_block(out, (100, 100, 100))
        out = conv_block(out, (50, 50, 50))
        out = identity_block(out, (50, 50, 50))
        out = identity_block(out, (50, 50, 50))
        out = identity_block(out, (50, 50, 50))
        out = conv_block(out, (50, 50, 50))
        out = identity_block(out, (50, 50, 50))
        out = identity_block(out, (50, 50, 50))
        out = identity_block(out, (50, 50, 50))
        out = Flatten()(out)
        # out = Dense(units=100, activation='relu')(out)
        l = 1e-3
        # Value output
        out = Dense(units=1,
                    activation=self.output_activation,
                    kernel_initializer='zeros',
                    kernel_regularizer=l2(l),
                    bias_initializer='zeros',
                    bias_regularizer=l2(l)
                    )(out)
        model = Model(inputs=input_, outputs=out)
        # Optimizer
        # opt = Adam(lr=1e-4)
        opt = SGD(lr=self.lr, decay=1e-6)
        # Loss function
        loss = 'mse'  # if self.output_activation == 'linear' else 'binary_crossentropy' if self.output_activation == 'sigmoid' else None
        model.compile(optimizer=opt, loss=loss)
        return model
    @staticmethod
    def feature(board, from_, action):
        """
        Board features from the moving player's perspective.
        :param board: the board
        :param from_: which piece to move
        :param action: the move direction
        :return: features of this action (5x5xN)
        """
        player = board[from_]
        to_ = tuple(np.add(from_, rule.actions_move[action]))
        # The board after the move
        board = board.copy()
        result, _ = rule.move(board, from_, to_)
        space = (board == 0).astype(np.int8).reshape((5, 5, 1))
        own = (board == player).astype(np.int8).reshape((5, 5, 1))
        opponent = (board == -player).astype(np.int8).reshape((5, 5, 1))
        # Whether the move wins the game
        is_win = np.ones((5, 5, 1)) if result == rule.WIN else np.zeros((5, 5, 1))
        # Bias plane
        bias = np.ones((5, 5, 1))
        return np.concatenate((space, own, opponent, is_win, bias), axis=2)
    def q(self, board, from_, action):
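        """Q value of moving the piece at from_ one step in direction action."""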
        x = self.feature(board, from_, action)
        x = np.array([x])
        q = self.model.predict(x)[0][0]
        return q
    def maxq(self, board, player):
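        """Maximum Q value over all of player's valid actions."""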
        q = [self.q(board, from_, action) for from_, action in rule.valid_actions(board, player)]
        return max(q)

    def value(self, board, player):
        return self.maxq(board, player)
    def epsilon_greedy(self, board, valid_action, q):
        """
        Choose the next action with an epsilon-greedy policy:
        with probability epsilon, pick a random action;
        with probability 1-epsilon, pick the action with the highest Q value.
        :return: the next action: (from_, action)
        """
        if np.random.random() > self.epsilon:
            # Pick the action with the highest Q value
            if q is None:
                q = [self.q(board, from_, action) for from_, action in valid_action]
            return self.pi_star(valid_action, q), q
        else:
            # Pick a random action
            return self.random_choice(valid_action), q
    def epsilon_greedy2(self, board, valid_action, qs):
        """
        Choose the next action with an epsilon-greedy policy:
        with probability 1-epsilon, pick the action with the highest Q value;
        with probability epsilon, recurse epsilon-greedily over the remaining actions.
        :return: the next action: (from_, action)
        """
        if len(valid_action) == 1:
            return valid_action[0], qs
        if qs is None:
            qs = [self.q(board, from_, action) for from_, action in valid_action]
        max_idx = np.argmax(qs)
        if np.random.random() > self.epsilon:
            # With probability 1-epsilon, take the highest-Q action
            return valid_action[max_idx], qs
        else:
            # With probability epsilon, choose among the other actions
            valid_action = valid_action[:max_idx] + valid_action[max_idx + 1:]
            qs = qs[:max_idx] + qs[max_idx + 1:]
            return self.epsilon_greedy2(board, valid_action, qs)
    def epsilon_greedy_probs(self, board, valid_action, qs):
        """
        Choose the next action with an epsilon-greedy policy:
        with probability 1-epsilon, pick the action with the highest Q value;
        with probability epsilon, sample an action in proportion to its value.
        :return: the next action: (from_, action)
        """
        if len(valid_action) == 1:
            return valid_action[0], qs
        if qs is None:
            qs = [self.q(board, from_, action) for from_, action in valid_action]
        max_idx = np.argmax(qs)
        if np.random.random() > self.epsilon:
            # With probability 1-epsilon, take the highest-Q action
            return valid_action[max_idx], qs
        else:
            # With probability epsilon, sample an action by probability
            probs = self.value_to_probs(qs)
            action = util.select_by_prob(valid_action, probs)
            if self.episode % 10 == 0:
                logger.info('probs:%s', probs)
            return action, qs
    def pi_star(self, valid_action, q):
        """
        Pick the action with the highest Q value, i.e. the greedy policy
        (ties broken at random).
        """
        maxq = np.max(q)
        idxes = np.argwhere(q == maxq)
        action = valid_action[self.random_choice(idxes)[0]]
        # logger.info('maxq:%s, idxes:%s, select:%s', maxq, idxes, action)
        return action
    def predict(self, board, player):
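        """Greedy action for player, plus the (valid, q) lists used to pick it."""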
        valid = rule.valid_actions(board, player)
        q = [self.q(board, from_, action) for from_, action in valid]
        return self.pi_star(valid, q), (valid, q)
    def policy(self, board, player):
        # return self.policy_by_probs(board, player)
        return self.policy_by_epsilon_greedy(board, player)
    def policy_by_epsilon_greedy(self, board, player):
        valid = rule.valid_actions(board, player)
        q = None
        self.set_pre(q, valid, q)
        if len(valid) == 0:
            raise NoActionException
        board_str = ''.join(map(str, board.flatten()))
        (from_, action), q = self.epsilon_greedy_probs(board, valid, q)
        self.predicts.add((board_str, from_, action))
        self.set_pre(q, valid, None)
        if self.episode % 10 == 0:
            logger.info('action:%s,%s', from_, action)
            # logger.info('valid:%s', valid)
            logger.info('q:%s', q)
        return from_, action
    def policy_by_epsilon_greedy_no_repeat(self, board, player):
        valid = rule.valid_actions(board, player)
        q = None
        self.set_pre(q, valid, q)
        if len(valid) == 0:
            raise NoActionException
        board_str = ''.join(map(str, board.flatten()))
        while True:
            (from_, action), q = self.epsilon_greedy(board, valid, q)
            if (board_str, from_, action) not in self.predicts or len(valid) == 1:
                self.predicts.add((board_str, from_, action))
                self.set_pre(q, valid, None)
                if self.episode % 10 == 0:
                    logger.info('action:%s,%s', from_, action)
                    # logger.info('valid:%s', valid)
                    logger.info('q:%s', q)
                return from_, action
            else:
                # Remove the already-tried action and pick another
                idx = valid.index((from_, action))
                valid.pop(idx)
                if q:
                    q.pop(idx)
    def policy_by_probs(self, board, player):
        valid = rule.valid_actions(board, player)
        q = None
        self.set_pre(q, valid, q)
        if len(valid) == 0:
            raise NoActionException
        qs = [self.q(board, from_, act) for from_, act in valid]
        probs = self.value_to_probs(qs)
        action = util.select_by_prob(valid, probs)
        if self.episode % 10 == 0:
            logger.info('action:%s', action)
            logger.info('q:%s', qs)
            logger.info('probs:%s', probs)
        return action
    def train(self, records, batch_size=1, epochs=1, verbose=0):
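        """Fit the model: features of each recorded move against its reward target."""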
        x_train = []
        y_train = []
        for bd, from_, action, reward, _ in records:
            x = self.feature(bd, from_, action)
            x_train.append(x)
            y_train.append(reward)
        x_train = np.array(x_train, copy=False)
        y_train = np.array(y_train, copy=False)
        self.model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=verbose)
    @staticmethod
    def value_to_probs(values):
        values = np.array(values)
        # Nudge values slightly away from 0 and 1 so the logs stay finite
        x = np.log(1e-15 + values) - np.log(1 + 1e-15 - values)
        y = np.e ** x
        return y / y.sum()
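
    # Example (hypothetical input): value_to_probs([0.9, 0.5, 0.1]) maps each
    # value v to its odds v/(1-v) via exp(log-odds), i.e. [9, 1, 1/9], then
    # normalizes to ~[0.890, 0.099, 0.011]: probability proportional to odds.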
    def probabilities(self, board, player):
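        """Q values laid out on a 5x5x4 (position x direction) grid."""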
        valid = rule.valid_actions(board, player)
        qs = [self.q(board, from_, action) for from_, action in valid]
        q2 = np.zeros((5, 5, 4))
        for (from_, action), q in zip(valid, qs):
            q2[from_][action] = q
        return q2
    def decay_epsilon(self):
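        """Decay exploration: epsilon = epsilon0 / (1 + decay * ln(1 + episode))."""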
        self.epsilon = self._epsilon / (1 + self.epsilon_decay * np.log(1 + self.episode))
    @staticmethod
    def random_choice(a):
        return a[np.random.randint(len(a))]

    @staticmethod
    def load(modelfile, epsilon=0.3):
        return ValueNetwork(epsilon=epsilon, model=util.load_model(modelfile))

    def set_pre(self, q, valid, vq):
        self.q_value = q
        self.valid = valid
        self.vq = vq

    def copy(self, other):
        self.model.set_weights(other.model.get_weights())

    def clear(self):
        self.predicts.clear()

    def save_model(self, filepath):
        self.model.save(filepath)

    @staticmethod
    def close():
        from keras import backend as K
        if K.backend() == 'tensorflow':
            import keras.backend.tensorflow_backend as tfb
            tfb.clear_session()
            # tfb.get_session().close()
            logger.info('tensorflow session cleared')
# @print_use_time()
def simulate(nw0, nw1, activation, init='fixed'):
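    """Play one self-play game between nw0 and nw1.

    Detects repeated (board, player) positions, trains both networks on the
    cycle as a draw, then drops those moves. Returns (records, winner),
    where winner is 1 or -1, or 0 for a draw or an aborted game.
    """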
    np.random.seed(util.rand_int32())
    player = 1 if np.random.random() > 0.5 else -1
    logger.info('init:%s, player:%s', init, player)
    board = rule.init_board(player) if init == 'fixed' else rule.random_init_board()
    records = Record()
    # full_records = Record()
    boards = set()  # {(board_str, player)}
    nws = [None, nw0, nw1]
    n_steps = 0
    while True:
        nw = nws[player]  # nw0 if player == 1 else nw1
        try:
            bd = board.copy()
            board_str = util.board_str(board)
            if (board_str, player) in boards:
                # Find the cycle, train its positions toward the draw value, then drop it
                found = False
                records2 = Record()
                # boards and records stay in lockstep: one entry per kept move
                for i in range(len(boards) - 1, -1, -1):
                    b, f, a, _, _ = records[i]
                    if (b == board).all() and b[f] == player:
                        found = True
                        break
                assert found, (board, player)
                records2.records = records.records[i:]
                records2.draw()
                nw0.train(records2)
                nw1.train(records2)
                # Remove the cycle's entries
                records.records = records.records[:i]
                for b, f, a, _, _ in records2:
                    boards.remove((util.board_str(b), b[f]))
                logger.info('cycle:%s, records:%s, epsilon:%s', len(records2), records.length(), nw.epsilon)
            boards.add((board_str, player))
            from_, action = nw.policy(board, player)
            assert board[from_] == player
            to_ = tuple(np.add(from_, rule.actions_move[action]))
            command, eat = rule.move(board, from_, to_)
            reward = len(eat)
            if activation == 'sigmoid':
                records.add3(bd, from_, action, reward, win=command == rule.WIN)
                # full_records.add3(bd, from_, action, reward, win=command == rule.WIN)
            elif activation == 'linear':
                records.add2(bd, from_, action, reward, win=command == rule.WIN)
                # full_records.add2(bd, from_, action, reward, win=command == rule.WIN)
            elif activation == 'selu':
                records.add4(bd, from_, action, reward, win=command == rule.WIN)
                # full_records.add4(bd, from_, action, reward, win=command == rule.WIN)
            else:
                raise ValueError
            if command == rule.WIN:
                logging.info('%s WIN, stone:%s, step use: %s, epsilon:%s', str(player), (board == player).sum(), records.length(), nw.epsilon)
                return records, player
            if n_steps - records.length() > 500:
                logging.info('too many moves spent in cycles: %s', records.length())
                # Too many moves: declare a draw
                records.clear()
                return records, 0
            player = -player
            if init == 'fixed':
                board = rule.flip_board(board)
            n_steps += 1
        except NoActionException:
            # After a random opening, one side may have no legal move
            return Record(), 0
        except Exception as e:
            logging.info('board is:\n%s', board)
            logging.info('player is: %s', player)
            valid = rule.valid_actions(board, player)
            logging.info('valid is:\n%s', valid)
            logging.info('predict is:\n%s', nw.q_value)
            logging.info('valid action is:\n%s', nw.valid)
            logging.info('from:%s, action:%s', from_, action)
            records.save('records/train/1st_')
            raise e
@print_use_time()
def train_once(n0, n1, i, activation, init='random', copy_period=1):
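    """Run one self-play game and train n0 on it; n1 copies n0 every copy_period episodes."""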
    logging.info('train: %d', i)
    n0.episode = i
    n1.episode = i
    n0.decay_epsilon()
    n1.decay_epsilon()
    records, winner = simulate(n0, n1, activation, init)
    if records.length() == 0:
        return
    if i % copy_period == 0:
        n1.copy(n0)
    n0.train(records, epochs=1)
    n0.clear()
    n1.clear()
    return records
def train():
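    """Self-play training loop: random openings first, then the fixed opening."""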
    logging.info('...begin...')
    add_print_time_fun(['simulate', 'train_once'])
    hidden_activation = 'relu'
    activation = 'sigmoid'  # linear, sigmoid
    begin = 2720000
    n_ = ValueNetwork(epsilon=0, output_activation=activation,
                      model_file='model/value_network/value_network_fixed_%05dw.model' % np.ceil(begin / 10000))
    n0 = ValueNetwork(epsilon=1, epsilon_decay=0.2, output_activation=activation, hidden_activation=hidden_activation)
    n1 = ValueNetwork(epsilon=1, epsilon_decay=0.2, output_activation=activation, hidden_activation=hidden_activation)
    n0.copy(n_)
    n1.copy(n_)
    episode = 100000
    for i in range(1, episode + 1):
        records = train_once(n0, n1, i, activation, init='random')
        # train_once returns None when the game produced no records
        if i % 1000 == 0 and records is not None:
            records.save('records/train/value_network/')
        if i % 1000 == 0:
            logger.info('model/value_network/value_network_random_%05dw.model' % np.ceil((begin + i) / 10000))
            n0.save_model('model/value_network/value_network_random_%05dw.model' % np.ceil((begin + i) / 10000))
    begin = begin + episode + 1
    n0.episode = 1
    n1.episode = 1
    for i in range(1, episode * 3 + 1):
        records = train_once(n0, n1, i, activation, init='fixed', copy_period=1)
        if i % 1000 == 0 and records is not None:
            records.save('records/train/value_network/1st_')
        if i % 1000 == 0:
            logger.info('model/value_network/value_network_fixed_%05dw.model' % np.ceil((begin + i) / 10000))
            n0.save_model('model/value_network/value_network_fixed_%05dw.model' % np.ceil((begin + i) / 10000))
if __name__ == '__main__':
    import logging.config
    logging.config.fileConfig('logging.conf')
    # _main()
    train()