In [1]:
import glob
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import cv2
import json
import matplotlib.pyplot as plt
os.environ['CUDA_VISIBLE_DEVICES']='0,1'



In [None]:
# 如果设备未在 `tf.distribute.MirroredStrategy` 的指定列表中，它会被自动检测到。
strategy = tf.distribute.MirroredStrategy()
print ('Number of devices: {}'.format(strategy.num_replicas_in_sync)) # 输出设备数量

In [3]:
# BUFFER_SIZE = len(train_images)

BATCH_SIZE_PER_REPLICA = 2048
GLOBAL_BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

EPOCHS = 10

In [4]:
#黑白图片
def draw_cv2(raw_strokes, size=256, lw=6, time_color=True):
    img = np.zeros((BASE_SIZE, BASE_SIZE), np.uint8)
    for t, stroke in enumerate(raw_strokes):
        for i in range(len(stroke[0]) - 1):
            color = 255 - min(t, 10) * 13 if time_color else 255
            _ = cv2.line(img, (stroke[0][i], stroke[1][i]),
                         (stroke[0][i + 1], stroke[1][i + 1]), color, lw)
    if size != BASE_SIZE:
        return cv2.resize(img, (size, size))
    else:
        return img
    

In [5]:
class DataLoader(object):
    def __init__(self, resize_height=64, resize_width=64, batch_size=512, fileList=None, size=256, lw=6):
        self.resize_height = resize_height #图片高
        self.resize_height = resize_height #图片宽
        self.batch_size = batch_size #batch
        self.fileList = fileList #文件数据
        self.size = size #画图时图片大小
        self.lw = lw

    def __call__(self):
        def _generator(size,lw):
            for filename in self.fileList:
                df = pd.read_csv(filename)
                df['drawing'] = df['drawing'].apply(json.loads)
                x = np.zeros((len(df), size, size))
                for i, raw_strokes in enumerate(df.drawing.values):
                    x[i] = draw_cv2(raw_strokes, size=size, lw=lw)
                x = x / 255.
                x = x.reshape((len(df), size, size, 1)).astype(np.float32)
                y = tf.keras.utils.to_categorical(df.y, num_classes=n_labels)
                for x_i,y_i in zip(x,y):
                    yield (x_i,y_i)

        dataset = tf.data.Dataset.from_generator(generator=_generator,
                                                 output_types=(tf.dtypes.float32, tf.dtypes.int32),
                                                 output_shapes=((self.resize_height, self.resize_height, 1), (340, )),
                                                 args=(self.size, self.lw))
        dataset = dataset.prefetch(buffer_size=10240)
        dataset = dataset.shuffle(buffer_size=10240).batch(self.batch_size)
        return dataset    

In [6]:
DP_DIR = './shuffle_data_gzip/'


BASE_SIZE = 256
n_labels = 340
np.random.seed(seed=1987)
size = 64
batchsize = 1024    
fileList = glob.glob("./shuffle_data_gzip/*.csv.gz") 
train_fileList = fileList[:-1]
val_fileList = fileList[-1:]
train_ds = DataLoader(resize_height=64, resize_width=64, batch_size=GLOBAL_BATCH_SIZE, fileList=train_fileList, size=size, lw=6)()    
val_ds = DataLoader(resize_height=64, resize_width=64, batch_size=GLOBAL_BATCH_SIZE, fileList=val_fileList, size=size, lw=6)()    
        

train_dist_dataset = strategy.experimental_distribute_dataset(train_ds)
val_dist_dataset = strategy.experimental_distribute_dataset(val_ds)

In [7]:

class VGG11NetModel(tf.keras.models.Model):
    def __init__(self, size, n_labels, **kwargs):
        super(VGG11NetModel, self).__init__(**kwargs)
        
        self.conv1 = tf.keras.layers.Conv2D(64, kernel_size=3, strides=1, padding='same', activation='relu')
        self.pool1 = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2)
        
        self.conv2 = tf.keras.layers.Conv2D(128, kernel_size=3, strides=1, padding='same', activation='relu')
        self.pool2 = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2)
        
        self.conv3 = tf.keras.layers.Conv2D(256, kernel_size=3, strides=1, padding='same', activation='relu')
       
        self.conv4 = tf.keras.layers.Conv2D(256, kernel_size=3, strides=1, padding='same', activation='relu')
        self.pool4 = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2)
        self.conv5 = tf.keras.layers.Conv2D(512, kernel_size=3, strides=1, padding='same', activation='relu')

        self.conv6 = tf.keras.layers.Conv2D(512, kernel_size=3, strides=1, padding='same', activation='relu')
        self.pool6 = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2)
        self.conv7 = tf.keras.layers.Conv2D(512, kernel_size=3, strides=1, padding='same', activation='relu')
        self.conv8 = tf.keras.layers.Conv2D(512, kernel_size=3, strides=1, padding='same', activation='relu')
        self.pool8 = tf.keras.layers.MaxPooling2D(pool_size=2, strides=2)
        
        self.flatten = tf.keras.layers.Flatten()
        
        self.dense1 = tf.keras.layers.Dense(1024, activation='relu')
        self.dropout1 = tf.keras.layers.Dropout(rate=0.5)
        
        self.dense2 = tf.keras.layers.Dense(128, activation='relu')
        self.dropout2 = tf.keras.layers.Dropout(rate=0.5)
        
        self.outputs =  tf.keras.layers.Dense(n_labels, activation='softmax')

        
    def call(self, inputs,training=None):
        x = self.conv1(inputs)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.pool4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = self.pool6(x)
        x = self.conv7(x)
        x = self.conv8(x)
        x = self.pool8(x)
        
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dropout1(x,training=training)
        
        x = self.dense2(x)
        x = self.dropout2(x,training=training)
        output_ = self.outputs(x)
        return output_

In [8]:
# model = VGG11NetModel(size=128,n_labels=340)

In [9]:
with strategy.scope():
    # 将减少设置为“无”，以便我们可以在之后进行这个减少并除以全局批量大小。
    loss_object = tf.keras.losses.CategoricalCrossentropy(
      reduction=tf.keras.losses.Reduction.NONE)
    def compute_loss(labels, predictions):
        per_example_loss = loss_object(labels, predictions)
        return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

In [10]:
with strategy.scope():
#     train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')
    train_top3_accuracy = tf.keras.metrics.TopKCategoricalAccuracy(k=3,name='train_top_3_categorical_accuracy')

    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')
    test_top3_accuracy = tf.keras.metrics.TopKCategoricalAccuracy(k=3,name='test_top_3_categorical_accuracy')

In [11]:
# 必须在`strategy.scope`下创建模型和优化器。
with strategy.scope():
    model = VGG11NetModel(size=128,n_labels=340)

    learning_rate = 0.001
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)


In [12]:
with strategy.scope():
    def train_one_step(inputs):
        images, labels = inputs
        with tf.GradientTape() as tape:
            predictions = model(images,training=True)
            loss = compute_loss(labels, predictions)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        train_accuracy.update_state(labels, predictions)
        train_top3_accuracy.update_state(labels, predictions)
        return loss

    def val_one_step(inputs):
        images, labels = inputs
        predictions = model(images,training=False)
        t_loss = loss_object(labels, predictions)

        test_loss.update_state(t_loss)
        test_accuracy.update_state(labels, predictions)
        test_top3_accuracy.update_state(labels, predictions)

In [None]:
with strategy.scope():
    # `experimental_run_v2`将复制提供的计算并使用分布式输入运行它。

    def distributed_train_step(dataset_inputs):
        per_replica_losses = strategy.experimental_run_v2(train_one_step,
                                                          args=(dataset_inputs,))
        return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
                               axis=None)


    def distributed_test_step(dataset_inputs):
        return strategy.experimental_run_v2(val_one_step, args=(dataset_inputs,))


    for epoch in range(EPOCHS):
        # 训练循环
        total_loss = 0.0
        num_batches = 0
        for step,x in enumerate(train_dist_dataset):
            total_loss += distributed_train_step(x)
            num_batches += 1
            

            if step % 200 == 0:
                train_loss = total_loss / num_batches
                print("step:{0}; Samples:{1}; Train Loss:{2}; Train Accuracy:{3},Train Top3 Accuracy:{4}".format(step, (step + 1) * 1024, 
                                                                                                                 train_loss, 
                                                                                                                 train_accuracy.result()*100, 
                                                                                                                 train_top3_accuracy.result()*100))



        train_loss = total_loss / num_batches

        # 测试循环
        for x in val_dist_dataset:
            distributed_test_step(x)

            

        template = 'Epoch {}, Loss: {}, Accuracy: {}, Top3 Accuracy:{}, Test Loss: {}, Test Accuracy: {}, Test Top3 Accuracy: {}'
        print(template.format(epoch + 1,
                              train_loss,
                              train_accuracy.result() * 100,
                              train_top3_accuracy.result() *100,
                              test_loss.result(),
                              test_accuracy.result() * 100,
                              test_top3_accuracy.result()*100
                             ))

        
        train_accuracy.reset_states()
        train_top3_accuracy.reset_states()
        test_loss.reset_states()
        test_accuracy.reset_states()
        test_top3_accuracy.reset_states()



INFO:tensorflow:batch_all_reduce: 22 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
step:0; Samples:1024; Train Loss:5.829029083251953; Train Accuracy:0.29296875,Train Top3 Accuracy:0.6103515625
INFO:tensorflow:batch_all_reduce: 22 

In [10]:
model = VGG11NetModel(size=128,n_labels=340)

loss_object = tf.keras.losses.CategoricalCrossentropy()


learning_rate = 0.001
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)


train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')
train_top3_accuracy = tf.keras.metrics.TopKCategoricalAccuracy(k=3,name='train_top_3_categorical_accuracy')

test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')
test_top3_accuracy = tf.keras.metrics.TopKCategoricalAccuracy(k=3,name='test_top_3_categorical_accuracy')


# @tf.function
def train_one_step(images, labels):
    with tf.GradientTape() as tape:
        predictions = model(images,training=True)
        loss = loss_object(labels, predictions)
     
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(labels, predictions)
    train_top3_accuracy(labels, predictions)

def val_one_step(images, labels):
    predictions = model(images,training=False)
    t_loss = loss_object(labels, predictions)

    test_loss(t_loss)
    test_accuracy(labels, predictions)
    test_top3_accuracy(labels, predictions)

In [None]:
EPOCHS=10
for epoch in range(EPOCHS):
    # 在下一个epoch开始时，重置评估指标
    train_loss.reset_states()
    train_accuracy.reset_states()
    train_top3_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()
    test_top3_accuracy.reset_states()

    for step,(images, labels) in enumerate(train_ds):
        train_one_step(images, labels)
        
        if step % 200 == 0:
            print("step:{0}; Samples:{1}; Train Loss:{2}; Train Accuracy:{3},Train Top3 Accuracy:{4}".format(step, (step + 1) * 1024, 
                                                                                                             train_loss.result(), 
                                                                                                             train_accuracy.result()*100, 
                                                                                                             train_top3_accuracy.result()*100))

    for step,(val_images, val_labels) in enumerate(val_ds):
        val_one_step(val_images, val_labels)


    template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
    print(template.format(epoch + 1,
                          train_loss.result(),
                          train_accuracy.result() * 100,
                          train_top3_accuracy()*100,
                          test_loss.result(),
                          test_accuracy.result() * 100,
                          test_top3_accuracy()*100
                         ))

step:0; Samples:1024; Train Loss:5.828896522521973; Train Accuracy:0.0,Train Top3 Accuracy:0.29296875
step:200; Samples:205824; Train Loss:5.5626397132873535; Train Accuracy:1.5503536462783813,Train Top3 Accuracy:4.0252838134765625
step:400; Samples:410624; Train Loss:4.746968746185303; Train Accuracy:9.10638427734375,Train Top3 Accuracy:17.637304306030273
step:600; Samples:615424; Train Loss:4.037374019622803; Train Accuracy:19.210983276367188,Train Top3 Accuracy:31.75095558166504
step:800; Samples:820224; Train Loss:3.5587403774261475; Train Accuracy:26.81206512451172,Train Top3 Accuracy:41.37052917480469
step:1000; Samples:1025024; Train Loss:3.22253155708313; Train Accuracy:32.380706787109375,Train Top3 Accuracy:48.05263137817383
step:1200; Samples:1229824; Train Loss:2.9760067462921143; Train Accuracy:36.644996643066406,Train Top3 Accuracy:52.932125091552734
step:1400; Samples:1434624; Train Loss:2.784346342086792; Train Accuracy:40.01933670043945,Train Top3 Accuracy:56.6867713928