In [11]:
import urllib.request
import zipfile
import numpy as np
from IPython.display import Image

import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dropout, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint

In [12]:
tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


<tensorflow.python.distribute.mirrored_strategy.MirroredStrategy at 0x7f534c0fb450>

In [15]:
tf.debugging.set_log_device_placement(True)

In [16]:
# gpus = tf.config.experimental.list_physical_devices('GPU')
gpus = tf.distribute.MirroredStrategy()
if gpus:
    try:
    # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


TypeError: 'MirroredStrategy' object is not iterable

In [3]:
TRAINING_DIR = "/root/data/cut_data_limit10000/train/"
VALIDATION_DIR = "/root/data/cut_data_limit10000/validation/"
TEST_DIR = "/root/data/cut_data_limit10000/test/"

In [4]:
training_datagen = ImageDataGenerator(
    rescale=1. / 255,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.05,
    horizontal_flip=True,
    fill_mode='reflect', 
    validation_split=0.2
    )

In [5]:
training_generator = training_datagen.flow_from_directory(TRAINING_DIR, 
                                                          batch_size=32, 
                                                          target_size=(150, 150), 
                                                          class_mode='categorical', 
                                                          subset='training',
                                                         )

Found 51342 images belonging to 3 classes.


In [6]:
validation_generator = training_datagen.flow_from_directory(VALIDATION_DIR, 
                                                          batch_size=32, 
                                                          target_size=(150, 150), 
                                                          class_mode='categorical',
                                                          subset='validation', 
                                                         )

Found 1151 images belonging to 3 classes.


In [7]:
model = Sequential([
    # Conv2D, MaxPooling2D 조합으로 층을 쌓습니다. 첫번째 입력층의 input_shape은 (150, 150, 3)으로 지정합니다.
    Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    MaxPooling2D(2, 2), 
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2), 
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2), 
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(2, 2), 
    # 2D -> 1D로 변환을 위하여 Flatten 합니다.
    Flatten(), 
    # 과적합 방지를 위하여 Dropout을 적용합니다.
    Dropout(0.5),
    Dense(128, activation='relu'),
    # Classification을 위한 Softmax 
    # 출력층의 갯수는 클래스의 갯수와 동일하게 맞춰줍니다 (3개), activation도 잊지마세요!
    Dense(2, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

checkpoint_path = f"/root/data/model/cut_data_tmp_checkpoint.ckpt"
checkpoint = ModelCheckpoint(filepath=checkpoint_path, 
                            save_weights_only=True, 
                            save_best_only=True, 
                            monitor='val_loss', 
                            verbose=1)
epochs=30
history = model.fit(training_generator, 
                    validation_data=(validation_generator),
                    epochs=epochs,
                    callbacks=[checkpoint],
                    )

model.load_weights(checkpoint_path)

loss = model.evaluate(validation_generator)[0]
acc = model.evaluate(validation_generator)[1]

model.save(f"/root/data/model/cut_data_model.h5")

Epoch 1/30


2021-11-12 22:25:01.218490: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-11-12 22:25:11.347714: W tensorflow/core/common_runtime/bfc_allocator.cc:431] Allocator (GPU_0_bfc) ran out of memory trying to allocate 384B (rounded to 512)requested by op SameWorkerRecvDone
Current allocation summary follows.
2021-11-12 22:25:11.347781: I tensorflow/core/common_runtime/bfc_allocator.cc:970] BFCAllocator dump for GPU_0_bfc
2021-11-12 22:25:11.347801: I tensorflow/core/common_runtime/bfc_allocator.cc:977] Bin (256): 	Total Chunks: 43, Chunks in use: 43. 10.8KiB allocated for chunks. 10.8KiB in use in bin. 1.3KiB client-requested in use in bin.
2021-11-12 22:25:11.347813: I tensorflow/core/common_runtime/bfc_allocator.cc:977] Bin (512): 	Total Chunks: 9, Chunks in use: 9. 4.8KiB allocated for chunks. 4.8KiB in use in bin. 4.5KiB client-requested in use in bin.
2021-11-12 22:25:11.347824: I tensorflow/core/common_runtime/bf

InternalError:  Dst tensor is not initialized.
	 [[{{node IteratorGetNext/_4}}]] [Op:__inference_train_function_1039]

Function call stack:
train_function


Physical devices cannot be modified after being initialized
