In [None]:
# Directory holding the raw, unsplit images that the split cell reads from.
train_path = "/mnt/data_file/kaggle/train_data/train"
# Root under which the split cell writes training/{dog,cat} and testing/{dog,cat}.
processed_path = "/mnt/data_file/kaggle/train_data/processed_data"

In [9]:
import os
import random
import shutil
from random import choices

In [11]:
# Split the images under `train_path` into training/testing folders, with one
# sub-folder per class, using an (approximately) 80/20 random assignment.
SPLIT_SEED = 42  # fixed so that re-running the cell reproduces the same split
options = [True, False]
weights = [0.8, 0.2]
train_dog_path = os.path.join(processed_path, "training", "dog")
train_cat_path = os.path.join(processed_path, "training", "cat")
test_dog_path = os.path.join(processed_path, "testing", "dog")
test_cat_path = os.path.join(processed_path, "testing", "cat")

# shutil.copy does not create missing parent directories, so make sure every
# destination exists before copying anything.
for destination in (train_dog_path, train_cat_path, test_dog_path, test_cat_path):
    os.makedirs(destination, exist_ok=True)

# (is_dog, goes_to_training) -> destination directory.
destinations = {
    (True, True): train_dog_path,
    (True, False): test_dog_path,
    (False, True): train_cat_path,
    (False, False): test_cat_path,
}

# A dedicated, seeded generator plus a sorted listing makes the assignment
# deterministic. Without this, a second run can copy an image that is already
# in "training" into "testing" as well, leaking data between the two sets.
rng = random.Random(SPLIT_SEED)
for image in sorted(os.listdir(train_path)):
    train = rng.choices(options, weights)[0]
    # Any file whose name does not contain "dog" is treated as a cat.
    destination = destinations[("dog" in image, train)]
    shutil.copy(os.path.join(train_path, image), os.path.join(destination, image))
        

# Building model

In [36]:
def train():
    """Train the dog-vs-cat CNN under a multi-worker mirrored strategy.

    The function is self-contained (all imports are local) because it is
    handed to a distributed runner and executed on the workers.

    Each worker builds the same model, reads images from the ``training`` and
    ``testing`` sub-folders of ``processed_path`` and fits for 5 epochs. The
    worker with task index 0 (the chief) checkpoints the best model, by
    training accuracy, to ``checkpoint_path``; every other worker writes its
    checkpoints to a throw-away temporary directory.

    Returns:
        None. The trained model is only available through the checkpoint.
    """
    import json
    import os
    import tempfile
    from tensorflow.keras import layers
    from tensorflow.keras import models
    from tensorflow.keras import optimizers
    from tensorflow.keras.callbacks import ModelCheckpoint
    from tensorflow.keras.preprocessing.image import ImageDataGenerator
    import tensorflow as tf

    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    with strategy.scope():
        # NOTE(review): a later cell loads the model from
        # "/mnt/data_file/ass4.model/kaggle/" - confirm which path is intended.
        checkpoint_path = "/mnt/data_file/ass4.model/"
        # Kept in a local so the temporary directory lives until training ends.
        random_path = tempfile.TemporaryDirectory()

        chief_callback = ModelCheckpoint(checkpoint_path,
                                         monitor='acc',
                                         save_best_only=True,
                                         mode='max')
        dummy_callback = ModelCheckpoint(random_path.name, monitor='acc', save_best_only=True)

        # NOTE(review): the split cell writes to
        # "/mnt/data_file/kaggle/train_data/processed_data" - confirm that this
        # path (without "kaggle") is the one visible on the workers.
        processed_path = "/mnt/data_file/train_data/processed_data"

        # Four Conv2D + MaxPooling2D stages followed by a dense classifier head.
        model = models.Sequential()
        model.add(layers.Conv2D(32, (3, 3), activation='relu',
                                input_shape=(150, 150, 3)))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Conv2D(64, (3, 3), activation='relu'))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Conv2D(128, (3, 3), activation='relu'))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Conv2D(128, (3, 3), activation='relu'))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Flatten())
        model.add(layers.Dense(512, activation='relu'))
        # Single sigmoid unit: binary dog/cat output.
        model.add(layers.Dense(1, activation='sigmoid'))

        model.compile(loss='binary_crossentropy',
                      optimizer=optimizers.RMSprop(learning_rate=1e-4),
                      metrics=['acc'])

        train_dir = os.path.join(processed_path, "training")
        validation_dir = os.path.join(processed_path, "testing")

        # All images will be rescaled by 1./255
        train_datagen = ImageDataGenerator(rescale=1./255)
        test_datagen = ImageDataGenerator(rescale=1./255)
        train_generator = train_datagen.flow_from_directory(
                # This is the target directory
                train_dir,
                # All images will be resized to 150x150
                target_size=(150, 150),
                batch_size=64,
                # Since we use binary_crossentropy loss, we need binary labels
                class_mode='binary')
        validation_generator = test_datagen.flow_from_directory(
                validation_dir,
                target_size=(150, 150),
                batch_size=64,
                class_mode='binary')

        # Without TF_CONFIG there is only this one process, so it acts as the
        # chief. Defaulting here avoids a NameError on `is_chief` below.
        is_chief = True
        if 'TF_CONFIG' in os.environ:
            tf_config = json.loads(os.environ['TF_CONFIG'])
            node_index = tf_config['task']['index']
            is_chief = node_index == 0
            print(f"Node Index: {node_index}, Is Chief: {is_chief}")
        callback = [chief_callback if is_chief else dummy_callback]

        model.fit(
          train_generator,
          epochs=5,
          validation_data=validation_generator,
          callbacks=callback)

In [37]:
from pyspark.sql import SparkSession

# Attach to the standalone Spark master and request 8g of memory per executor.
spark = (
    SparkSession.builder
    .master("spark://spark:7077")
    .appName("Ass4-Q3")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

from spark_tensorflow_distributor import MirroredStrategyRunner

# NOTE(review): not referenced anywhere else in the visible notebook; train()
# hard-codes batch_size=64 itself.
BATCH_SIZE_PER_REPLICA = 64
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# CPU-only run on two Spark task slots. use_custom_strategy=True because
# train() constructs its own tf.distribute strategy.
runner = MirroredStrategyRunner(
    num_slots=2,
    local_mode=False,
    use_gpu=False,
    use_custom_strategy=True,
)
runner.run(train)

INFO:MirroredStrategyRunner:Doing CPU training...
INFO:MirroredStrategyRunner:Will run with 2 Spark tasks.
INFO:MirroredStrategyRunner:Distributed training in progress...
INFO:MirroredStrategyRunner:View Spark executor stderr logs to inspect training...
Exception in thread "serve RDD 9" java.net.SocketTimeoutException: Accept timed out
	at java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
	at java.net.ServerSocket.implAccept(ServerSocket.java:560)
	at java.net.ServerSocket.accept(ServerSocket.java:528)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:65)
INFO:MirroredStrategyRunner:Training with 2 slots is complete!                  
[Stage 3:>                                                          (0 + 2) / 2]

In [39]:
import tensorflow as tf
# Location of the saved model to evaluate.
# NOTE(review): train() checkpoints to "/mnt/data_file/ass4.model/" (no
# "kaggle" sub-folder) - confirm the model was moved or saved here.
checkpoint_path = "/mnt/data_file/ass4.model/kaggle/"
# `model` is read as a global by report() and by the summary cell below.
model = tf.keras.models.load_model(checkpoint_path)

[Stage 3:>                                                          (0 + 2) / 2]

In [72]:
# Image folders passed to report(); each is expected to contain one
# sub-folder per class.
training_dir = "/mnt/data_file/kaggle/train_data/processed_data/training"
testing_dir = "/mnt/data_file/kaggle/train_data/processed_data/testing"
# Different dataset root (robot/Q3_images); note that it points at that
# dataset's "training" sub-folder.
validation_dir = "/mnt/data_file/robot/Q3_images/processed_data/training"

In [76]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
def report(folder, batch_size=1):
    """Evaluate the global ``model`` on the images found under ``folder``.

    Images are rescaled by 1/255 and resized to 150x150, matching the
    preprocessing used in ``train``. They are read in a fixed order
    (``shuffle=False``) with binary labels taken from the sub-folder names.

    Args:
        folder: Directory containing one sub-folder of images per class.
        batch_size: Number of images per evaluation batch. Defaults to 1,
            the value this function originally hard-coded; a larger value
            makes evaluation faster.

    Returns:
        Whatever ``model.evaluate`` returns for the compiled model (the loss
        followed by the compiled metrics), so callers can display or store it.
    """
    image_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
                folder,
                target_size=(150, 150),
                shuffle=False,
                batch_size=batch_size,
                class_mode='binary')
    # Return the metrics instead of discarding them.
    return model.evaluate(image_generator)

In [77]:
# Evaluate the loaded model on the training split.
report(training_dir)

Found 20075 images belonging to 2 classes.


In [78]:
# Evaluate the loaded model on the testing split.
report(testing_dir)

Found 4925 images belonging to 2 classes.


In [79]:
# Evaluate the loaded model on the images under validation_dir (the
# robot/Q3_images dataset).
report(validation_dir)

Found 5887 images belonging to 2 classes.


In [81]:
# Print the layer-by-layer output shapes and parameter counts of `model`.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 148, 148, 32)      896       
                                                                 
 max_pooling2d (MaxPooling2  (None, 74, 74, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 72, 72, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 36, 36, 64)        0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 34, 34, 128)       73856     
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 17, 17, 128)       0