In [20]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt

In [21]:
from pyspark.sql import SparkSession

# Attach to the Spark master at spark://spark:7077, reusing a session if one
# already exists, and request 6 GB of memory per executor.
spark = (
    SparkSession.builder
    .master("spark://spark:7077")
    .appName("Ass4-Q3")
    .config("spark.executor.memory", "6g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/29 19:59:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [22]:
import cv2
import tempfile
import json
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, Activation, Conv2D, MaxPooling2D

# Module-level buffer size. train() defines its own BUFFER_SIZE for its
# shuffle step; no module-level code visible in this notebook reads this one.
BUFFER_SIZE = 10000

def make_test_datasets(path, batch_size=64):
    """Load every readable image under ``path`` into an unbatched dataset.

    Each file is read as grayscale, resized to 80x80 and scaled by 1/255.
    Files are visited in sorted name order so the element order is
    reproducible across runs. Files that OpenCV cannot decode
    (``cv2.imread`` returns ``None``) are reported and skipped instead of
    crashing inside ``cv2.resize``.

    Args:
        path: Directory containing the image files.
        batch_size: Unused; kept for backward compatibility. The dataset is
            returned unbatched, so callers apply ``.batch(...)`` themselves.

    Returns:
        A ``tf.data.Dataset`` whose elements are 1-tuples holding a single
        ``(80, 80)`` float32 tensor. The dataset is empty when the directory
        holds no readable image.
    """
    images = []
    for file_name in sorted(os.listdir(path)):
        img_array = cv2.imread(os.path.join(path, file_name), cv2.IMREAD_GRAYSCALE)
        if img_array is None:
            # Not an image (or a corrupt one): skip it rather than fail later.
            print(f"Skipping unreadable file: {file_name}")
            continue
        images.append(cv2.resize(img_array, dsize=(80, 80)) / 255)

    # The reshape keeps a well-defined (0, 80, 80) shape when nothing was loaded.
    pixels = np.array(images, dtype=np.float32).reshape(-1, 80, 80)

    # Single-element tuple: there are no labels for the test images.
    return tf.data.Dataset.from_tensor_slices((tf.cast(pixels, tf.float32),))
    

def build_and_compile_cnn_model():
    """Build and compile the binary-classification CNN.

    Architecture, in order: two stages of Conv2D(64 filters, 3x3, ReLU)
    followed by 2x2 max pooling over an 80x80x1 input, then Flatten, a
    64-unit ReLU Dense layer and a single sigmoid output unit. The model is
    compiled with the Adam optimizer, binary cross-entropy loss and an
    accuracy metric.

    Returns:
        The compiled Sequential model.
    """
    layer_stack = [
        # Convolution + pooling stage 1; this layer also declares the input shape.
        Conv2D(64, (3, 3), activation='relu', input_shape=(80, 80, 1)),
        MaxPooling2D(pool_size=(2, 2)),
        # Convolution + pooling stage 2.
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        # Classifier head: flatten, hidden layer, one sigmoid output unit.
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    cnn = Sequential(layer_stack)
    cnn.compile(loss='binary_crossentropy',
                optimizer="adam",
                metrics=['accuracy'])
    return cnn

In [24]:
def train(batch_size=64):
    """Train the cat/dog CNN and checkpoint the best model seen so far.

    This function is handed to ``MirroredStrategyRunner.run`` further down in
    the notebook. It loads the training images, builds the CNN and fits it
    for 3 epochs of 5 steps each, then prints the largest prediction over
    5 batches of the training data.

    Checkpointing depends on the worker's role, read from the ``TF_CONFIG``
    environment variable: the worker with task index 0 (the chief) saves to
    ``checkpoint_path``; every other worker saves to a temporary directory
    that is removed afterwards. When ``TF_CONFIG`` is not set, the process is
    treated as the chief instead of failing on an undefined ``is_chief``.

    Args:
        batch_size: Number of images per batch in the training dataset.

    Raises:
        ValueError: If no readable image is found in the training directory.
    """
    # Imported locally so the function carries its own dependencies
    # (presumably because it runs on the Spark executors -- TODO confirm).
    import json
    import os
    import random
    import tempfile

    import cv2
    import tensorflow as tf
    from tensorflow.keras.callbacks import ModelCheckpoint
    from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
    from tensorflow.keras.models import Sequential

    BUFFER_SIZE = 10000

    path = "/mnt/data_file/train_data/train"
    checkpoint_path = "/mnt/data_file/ass4.model/"

    def make_datasets():
        """Load the labelled training images into a shuffled, repeating, batched dataset.

        The label is 1 when the part of the file name before the first "."
        is "dog", and 0 otherwise.
        """
        file_names = os.listdir(path)
        # shuffle(BUFFER_SIZE) below only mixes a sliding window of elements,
        # so a listing grouped by file-name prefix could yield single-class
        # batches early on. Randomising the file order up front avoids that.
        random.shuffle(file_names)

        images = []
        labels = []
        for file_name in file_names:
            img_array = cv2.imread(os.path.join(path, file_name), cv2.IMREAD_GRAYSCALE)
            if img_array is None:
                # cv2.imread returns None for files it cannot decode.
                print(f"Skipping unreadable file: {file_name}")
                continue
            images.append(cv2.resize(img_array, dsize=(80, 80)) / 255)
            labels.append(int(file_name.split(".")[0] == 'dog'))

        if not images:
            # An empty dataset followed by repeat() would never yield a batch.
            raise ValueError(f"No readable images found in {path}")

        dataset = tf.data.Dataset.from_tensor_slices((
            tf.cast(images, tf.float32),
            tf.cast(labels, tf.int64))
        )
        dataset = dataset.repeat().shuffle(BUFFER_SIZE).batch(batch_size)
        print(f"Data size: {len(labels)}")
        print(f"Dog pictures #: {sum(labels)}")
        return dataset

    def build_and_compile_cnn_model():
        """Build and compile the CNN: two conv/pool stages and a sigmoid head."""
        model = Sequential()
        # Convolution + pooling stage 1; this layer also declares the input shape.
        model.add(Conv2D(64, (3, 3), activation='relu', input_shape=(80, 80, 1)))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        # Convolution + pooling stage 2.
        model.add(Conv2D(64, (3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))

        model.add(Flatten())
        model.add(Dense(64, activation='relu'))
        # Single sigmoid unit for the binary output.
        model.add(Dense(1, activation='sigmoid'))

        model.compile(optimizer="adam",
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        return model

    train_datasets = make_datasets()
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
    train_datasets = train_datasets.with_options(options)
    multi_worker_model = build_and_compile_cnn_model()

    # Without TF_CONFIG there is no task index to read, so treat this
    # process as the chief.
    is_chief = True
    if 'TF_CONFIG' in os.environ:
        tf_config = json.loads(os.environ['TF_CONFIG'])
        node_index = tf_config['task']['index']
        is_chief = node_index == 0
        print(f"Node Index: {node_index}, Is Chief: {is_chief}")

    # The context manager removes the scratch directory even if training fails.
    with tempfile.TemporaryDirectory() as scratch_dir:
        if is_chief:
            checkpoint_callback = ModelCheckpoint(checkpoint_path,
                                                  monitor='accuracy',
                                                  save_best_only=True,
                                                  mode='max')
        else:
            # Non-chief workers also checkpoint, but into the throwaway directory.
            checkpoint_callback = ModelCheckpoint(scratch_dir,
                                                  monitor='accuracy',
                                                  save_best_only=True)

        multi_worker_model.fit(x=train_datasets, epochs=3, steps_per_epoch=5,
                               callbacks=[checkpoint_callback])
        print(max(multi_worker_model.predict(train_datasets, steps=5)))

from spark_tensorflow_distributor import MirroredStrategyRunner
 
# Batch size forwarded to train() as its batch_size argument.
BATCH_SIZE_PER_REPLICA = 64
# NOTE(review): this sets CUDA_VISIBLE_DEVICES in the notebook process while
# the runner below is created with use_gpu=False -- confirm it is still needed.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Run train() through the distributor with 2 slots, not in local mode, on CPU.
runner = MirroredStrategyRunner(num_slots=2, local_mode=False, use_gpu=False)
runner.run(train, batch_size=BATCH_SIZE_PER_REPLICA)

Doing CPU training...
Will run with 2 Spark tasks.
Distributed training in progress...
View Spark executor stderr logs to inspect training...
Training with 2 slots is complete!                                              


In [25]:
# Same directory that train() passes to the chief worker's ModelCheckpoint.
checkpoint_path = "/mnt/data_file/ass4.model/"
# Directory of images used for inference below.
test_path="/mnt/data_file/train_data/test1"


In [26]:
# NOTE(review): this freshly built, untrained model is replaced by the
# load_model() call in the next cell, so this cell looks redundant.
model = build_and_compile_cnn_model()

In [27]:

# Load the model saved at checkpoint_path, replacing the previous `model`.
model = tf.keras.models.load_model(checkpoint_path)

In [28]:
# Show the layer-by-layer summary of the loaded model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 78, 78, 64)        640       
                                                                 
 max_pooling2d (MaxPooling2  (None, 39, 39, 64)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 37, 37, 64)        36928     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 18, 18, 64)        0         
 g2D)                                                            
                                                                 
 flatten (Flatten)           (None, 20736)             0         
                                                                 
 dense (Dense)               (None, 64)                1

In [29]:
# Build the unlabelled, unbatched inference dataset from the images in test_path.
test_data = make_test_datasets(test_path)

In [30]:
# Feed the dataset to the model in batches of 64 and collect its outputs.
predict = model.predict(test_data.batch(64))



In [31]:
# Threshold the model outputs: 1.0 where predict > 0.5, otherwise 0.0.
y_pred = (predict > 0.5).astype(np.float32)

In [32]:
# Count of outputs above the 0.5 threshold (train() encodes 'dog' as label 1).
y_pred.sum()

0.0

In [38]:
# Inspect the first dataset element to check its shape and value range.
print(next(iter(test_data)))


(<tf.Tensor: shape=(80, 80), dtype=float32, numpy=
array([[0.23921569, 0.21960784, 0.23529412, ..., 0.4       , 0.42745098,
        0.4       ],
       [0.21568628, 0.23529412, 0.24705882, ..., 0.38431373, 0.41960785,
        0.3764706 ],
       [0.19215687, 0.22745098, 0.22745098, ..., 0.3647059 , 0.39607844,
        0.36078432],
       ...,
       [0.7647059 , 0.7372549 , 0.74509805, ..., 0.42352942, 0.47843137,
        0.49411765],
       [0.7490196 , 0.7647059 , 0.7647059 , ..., 0.79607844, 0.60784316,
        0.45490196],
       [0.7019608 , 0.7372549 , 0.74509805, ..., 0.5058824 , 0.7058824 ,
        0.34117648]], dtype=float32)>,)


In [43]:
# Number of prediction rows returned by model.predict.
len(predict)

12500

In [45]:
X_test = []
id_line = []
def create_test1_data(test_path):
    """Append each readable image in ``test_path`` to X_test and its id to id_line.

    Files are visited in sorted name order so the result is reproducible.
    Each image is read as grayscale and resized to 80x80. The id is the part
    of the file name before the first ".". A file OpenCV cannot decode
    (``cv2.imread`` returns ``None``) is reported and skipped before anything
    is appended, so X_test and id_line always stay the same length.

    Args:
        test_path: Directory containing the image files.
    """
    for file_name in sorted(os.listdir(test_path)):
        img_array = cv2.imread(os.path.join(test_path, file_name), cv2.IMREAD_GRAYSCALE)
        if img_array is None:
            print(f"Skipping unreadable file: {file_name}")
            continue
        id_line.append(file_name.split(".")[0])
        X_test.append(cv2.resize(img_array, dsize=(80, 80)))
create_test1_data(test_path)
# Add the trailing channel axis declared by the model's input_shape, then
# scale pixel values by 1/255.
X_test = np.array(X_test).reshape(-1,80,80,1)
X_test = X_test/255

In [46]:
# Run the loaded model on the (N, 80, 80, 1) array built in the previous cell.
predictions = model.predict(X_test)



In [49]:
# Round the first value of each prediction row to the nearest integer.
predicted_val = [int(round(p[0])) for p in predictions]

In [51]:
# Largest rounded prediction; 0 means no row rounded up to 1.
max(predicted_val)

0