Multi GPU: fix split and variable placement.
* Fix split in net_to_model.
* Add soft placement of variables.
* Fix Windows issues.

Pull request #1443.
godmoves authored and gcp committed May 17, 2018
Commit 8751123 (1 parent: c822e5e)
Showing 2 changed files with 11 additions and 9 deletions.
training/tf/net_to_model.py (2 changes: 1 addition, 1 deletion)

@@ -24,7 +24,7 @@
 print("Blocks", blocks)
 
 tfprocess = TFProcess()
-tfprocess.init(batch_size=1)
+tfprocess.init(batch_size=1, gpus_num=1)
 if tfprocess.RESIDUAL_BLOCKS != blocks:
     raise ValueError("Number of blocks in tensorflow model doesn't match "\
                      "number of blocks in input network")
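
The explicit gpus_num=1 matters because init_net() (changed below) now splits the input planes, policy targets, and game outcomes across GPUs, and tf.split needs the batch dimension to divide evenly by the number of pieces; net_to_model.py builds its graph with batch_size=1. A standalone sketch of that constraint follows (illustration only, not code from the repository; the placeholder shape is assumed):

import tensorflow as tf  # TensorFlow 1.x API assumed

# A batch holding a single position, as net_to_model.py uses (shape assumed).
planes = tf.placeholder(tf.float32, [1, 18, 19 * 19])
shards = tf.split(planes, 1)    # OK: one shard for the single GPU
# tf.split(planes, 2)           # would fail: dimension 1 is not evenly divisible by 2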
training/tf/tfprocess.py (18 changes: 10 additions, 8 deletions)

@@ -132,13 +132,13 @@ def __init__(self):
         self.swa_recalc_bn = True
 
         gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
-        config = tf.ConfigProto(gpu_options=gpu_options)
+        config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
         self.session = tf.Session(config=config)
 
         self.training = tf.placeholder(tf.bool)
         self.global_step = tf.Variable(0, name='global_step', trainable=False)
 
-    def init(self, batch_size, macrobatch=1, logbase='leelalogs'):
+    def init(self, batch_size, macrobatch=1, gpus_num=None, logbase='leelalogs'):
         self.batch_size = batch_size
         self.macrobatch = macrobatch
         self.logbase = logbase

@@ -159,13 +159,15 @@ def init(self, batch_size, macrobatch=1, logbase='leelalogs'):
         probs = tf.reshape(probs, (batch_size, 19*19 + 1))
         winner = tf.reshape(winner, (batch_size, 1))
 
-        self.init_net(planes, probs, winner)
+        if gpus_num is None:
+            gpus_num = self.gpus_num
+        self.init_net(planes, probs, winner, gpus_num)
 
-    def init_net(self, planes, probs, winner):
+    def init_net(self, planes, probs, winner, gpus_num):
         self.y_ = probs # (tf.float32, [None, 362])
-        self.sx = tf.split(planes, self.gpus_num)
-        self.sy_ = tf.split(probs, self.gpus_num)
-        self.sz_ = tf.split(winner, self.gpus_num)
+        self.sx = tf.split(planes, gpus_num)
+        self.sy_ = tf.split(probs, gpus_num)
+        self.sz_ = tf.split(winner, gpus_num)
         self.batch_norm_count = 0
         self.reuse_var = None

@@ -182,7 +184,7 @@ def init_net(self, planes, probs, winner):
         tower_reg_term = []
         tower_y_conv = []
         with tf.variable_scope(tf.get_variable_scope()):
-            for i in range(self.gpus_num):
+            for i in range(gpus_num):
                 with tf.device("/gpu:%d" % i):
                     with tf.name_scope("tower_%d" % i):
                         loss, policy_loss, mse_loss, reg_term, y_conv = self.tower_loss(
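
Taken together, the tfprocess.py changes thread gpus_num through graph construction: the batch is split into one shard per GPU, each shard feeds a loss tower pinned to its device, and allow_soft_placement lets TensorFlow fall back to another device when an op cannot run where it was requested. A condensed sketch of that pattern (an illustration under assumptions, not the repository's exact code; build_towers and tower_loss_fn are hypothetical stand-ins for TFProcess.init_net and TFProcess.tower_loss):

import tensorflow as tf  # TensorFlow 1.x API assumed

def build_towers(planes, probs, winner, gpus_num, tower_loss_fn):
    # One shard of the batch per GPU; the leading dimension must divide evenly.
    sx = tf.split(planes, gpus_num)
    sy_ = tf.split(probs, gpus_num)
    sz_ = tf.split(winner, gpus_num)

    tower_losses = []
    with tf.variable_scope(tf.get_variable_scope()):
        for i in range(gpus_num):
            with tf.device("/gpu:%d" % i):
                with tf.name_scope("tower_%d" % i):
                    # Build the forward pass and loss for shard i on GPU i.
                    tower_losses.append(tower_loss_fn(sx[i], sy_[i], sz_[i]))
    return tower_losses

# allow_soft_placement=True lets ops without a suitable kernel on the requested
# device (or with no such device present) be placed elsewhere instead of erroring.
config = tf.ConfigProto(allow_soft_placement=True)
session = tf.Session(config=config)

Soft placement is presumably also what keeps machines without the requested GPUs running the same graph-building code.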
