In [1]:
# --------------------------------------------------------- install libraries
# region
# @title install libraries { display-mode: "form" }

# %pip install --upgrade pillow pydrive2
# endregion

In [2]:
# --------------------------------------------------------- imports
# region
# @title imports { display-mode: "form" }

import os
import uuid
import random
import warnings
import shutil
import zipfile
from os import makedirs
from os import listdir
from shutil import copyfile
from random import seed
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.image import imread
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,ConfusionMatrixDisplay
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import 


from tensorflow.keras.layers import Dense,MaxPooling2D,Dropout,Flatten,BatchNormalization,Conv2D
from tensorflow.keras.callbacks import ReduceLROnPlateau,EarlyStopping

print("imports done")

# endregion




imports done


In [3]:
# --------------------------------------------------------- functions
# region
# @title functions {display-mode: "form"}

raw_data_path = None

def get_environment():
    if 'COLAB_GPU' in os.environ:
        return 'Google Colab'
    elif 'KAGGLE_URL_BASE' in os.environ:
        return 'Kaggle'
    elif 'VSCODE_PID' in os.environ:
        return 'VS Code'
    else:
        return 'Unknown environment'

def do_colab_staff():

    from google.colab import drive
    drive.mount('/content/gdrive')

    kaggle_creds_path = '/content/gdrive/MyDrive/Bachelor/kaggle/kaggle.json'
    kaggle_data_set_name = 'karakaggle/kaggle-cat-vs-dog-dataset' # modify per dataset name
    extra_path_after_extraction = 'PetImages' # modify per dataset structure

    # storing kaggle credentials
    ! mkdir ~/.kaggle
    ! cp {kaggle_creds_path} ~/.kaggle/
    ! chmod 600 ~/.kaggle/kaggle.json

    !kaggle datasets download -d {kaggle_data_set_name}
    
    ! mkdir kaggle_data

    downloaded_zip_name = f"{kaggle_data_set_name.split('/')[-1]}.zip" # the !kaggle datasets download command will download the zip file with the same name as the dataset name
    with zipfile.ZipFile(downloaded_zip_name, 'r') as zip_ref:
        zip_ref.extractall('kaggle_data')
    
    # get the name of the first directory in kaggle_data
    extracted_folder_name = next(os.walk('kaggle_data'))[1][0] 

    extracted_folder_path = os.path.join('kaggle_data', extracted_folder_name)
    
    global raw_data_path
    raw_data_path = os.path.join(extracted_folder_path, extra_path_after_extraction)

def do_kaggle_staff():
    global raw_data_path
    raw_data_path = '/kaggle/input/kaggle-cat-vs-dog-dataset/kagglecatsanddogs_3367a/PetImages' # modify per dataset structure

def do_vscode_staff():
    global raw_data_path
    raw_data_path = '../dogs-vs-cats/data set 2/'  # modify per dataset structure

def do_unknown_environment_staff():
    print("This is an unknown environment, please enter the path to the data set folder:")
    global raw_data_path
    raw_data_path = input()     

def image_is_ok(image_path):
    try:
        with warnings.catch_warnings():
            warnings.simplefilter('error')  # treat warnings as exceptions within this context
            img = Image.open(image_path)
            img.verify()
        return True
    except (IOError, SyntaxError, UserWarning):  # catch UserWarning along with other exceptions
        return False

def remove_corrupted_files(dirty_dataset_path, destination_folder_path):
    os.makedirs(destination_folder_path, exist_ok=True)
    for root, dirs, files in os.walk(dirty_dataset_path):
        for file in files:
            file_path = os.path.join(root, file)
            if image_is_ok(file_path):
                new_file_path = os.path.join(destination_folder_path, os.path.relpath(file_path, dirty_dataset_path))
                os.makedirs(os.path.dirname(new_file_path), exist_ok=True)
                shutil.copyfile(file_path, new_file_path)

def rename_files(all_entities_path):
    all_entities_names = os.listdir(all_entities_path)
    print("giving temporary unique names...")
    for entity_name in all_entities_names:
        entity_path = os.path.join(all_entities_path, entity_name)
        for filename in os.listdir(entity_path):
            temp_filename = str(uuid.uuid4()) + ".jpg"  # generate a unique filename
            source = os.path.join(entity_path, filename)
            destination = os.path.join(entity_path, temp_filename)
            os.rename(source, destination)
    # ----------------------------------------------------------------------------------------------------
    print("renaming...")
    # then rename every file in every folder in the given path

    for entity_name in all_entities_names:
        entity_path = os.path.join(all_entities_path, entity_name)
        i = 1
        for filename in os.listdir(entity_path):
            entity_name = entity_name.lower()
            new_filename = entity_name + '.' + str(i) + ".jpg"
            source = os.path.join(entity_path, filename)
            destination = os.path.join(entity_path, new_filename)
            os.rename(source, destination)
            i += 1
    print("done renaming !")

print("functions done")  

# endregion

    

functions done


In [4]:
# --------------------------------------------------------- doing specific-environment things
# region
# @title doing specific-environment things {display-mode: "form"}

# test if the GPU is available
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus if gpus else "No GPU available, using CPU instead.")

raw_data_path = None    
environment_type = get_environment() 
print(f'Environment: {environment_type}')       
if environment_type == 'Google Colab':
    do_colab_staff()
elif environment_type == 'Kaggle':
    do_kaggle_staff()
elif environment_type == 'VS Code':
    do_vscode_staff()
else:
    do_unknown_environment_staff()

print("data set path:", raw_data_path)
assert os.path.exists(raw_data_path), ' wrong path for data set !'   

print("doing specific-environment things done")

# endregion

        


No GPU available, using CPU instead.
Environment: VS Code
data set path: ../dogs-vs-cats/data set 2/
doing specific-environment things done


In [5]:
# --------------------------------------------------------- data preparation
# region
# @title data preparation {display-mode: "form"}

clean_data_folder_name = 'clean_data'
remove_corrupted_files(raw_data_path, clean_data_folder_name)
parent_dir = os.path.dirname(raw_data_path)
clean_data_path = os.path.join(parent_dir, clean_data_folder_name)
rename_files(clean_data_path)
ready_data_path = clean_data_path

data_percentage_to_use = 0.5 # percentage of the data to use

print("data preparation done")

# endregion
        



Verifying all files are non-corrupted images...
File ../dogs-vs-cats/data set 2/Cat\cat.6358.jpg is corrupted and will be skipped.
File ../dogs-vs-cats/data set 2/Cat\cat.7819.jpg is corrupted and will be skipped.
File ../dogs-vs-cats/data set 2/Dog\dog.10336.jpg is corrupted and will be skipped.
File ../dogs-vs-cats/data set 2/Dog\dog.1128.jpg is corrupted and will be skipped.
File ../dogs-vs-cats/data set 2/Dog\dog.4411.jpg is corrupted and will be skipped.
Done reporting error files.
giving temporary unique names...
renaming...
done renaming !
data preparation done


In [6]:
# 1 --------------------------------------------------------- Loading Images in a Dataframe
# region
# @title 1 - Loading Images in a Dataframe { display-mode: "form" }

all_entities_names = os.listdir(raw_data_path)
print("all entities names:", all_entities_names)
filenames = []
for entity_name in all_entities_names:
    entity_path = os.path.join(raw_data_path, entity_name)
    print("entity " , entity_name , "has ", len(os.listdir(entity_path)) , "files" , "skipped files from it :" , len([file for file in os.listdir(entity_path) if os.path.join(entity_path, file) in corrupted_file_paths]) )
    entity_filenames = [file_name for file_name in os.listdir(entity_path) if os.path.join(entity_path, file_name) not in corrupted_file_paths]
    random.shuffle(entity_filenames)
    entity_filenames = entity_filenames[:int(len(entity_filenames) * data_percentage_to_use)]
    filenames.extend([os.path.join(entity_name, file_name) for file_name in entity_filenames])

file_labels = [x.split(os.sep)[0] for x in filenames] 
data = pd.DataFrame({"filename": filenames, "label": file_labels})

print("Loading Images in a Dataframe done")
# endregion


all entities names: ['Cat', 'Dog']
entity  Cat has  12501 files skipped files from it : 2
entity  Dog has  12501 files skipped files from it : 3
Loading Images in a Dataframe done


In [7]:
# 2 --------------------------------------------------------- Train Test Split
# region
# @title 2 - Train Test Split { display-mode: "form" }

all_entities_names = data['label']
X_train, X_temp = train_test_split(data, test_size=0.2, stratify=all_entities_names, random_state = 42)
label_test_val = X_temp['label']
X_test, X_val = train_test_split(X_temp, test_size=0.5, stratify=label_test_val, random_state = 42)

print(" ")
print('The shape of train data',X_train.shape)
print('The shape of test data',X_test.shape)
print('The shape of validation data',X_val.shape)
print(" ")

# endregion


 
The shape of train data (9998, 2)
The shape of test data (1250, 2)
The shape of validation data (1250, 2)
 


In [8]:
# 3 --------------------------------------------------------- Creating Image Data Generator
# region
# @title 3 - Creating Image Data Generator  { display-mode: "form" }

image_size = 128
image_channel = 3
bat_size = 32

# Creating image data generator
train_datagen = ImageDataGenerator(rescale=1./255,
                                    rotation_range = 15,
                                    horizontal_flip = True,
                                    zoom_range = 0.2,
                                    shear_range = 0.1,
                                    fill_mode = 'reflect',
                                    width_shift_range = 0.1,
                                    height_shift_range = 0.1)
test_datagen = ImageDataGenerator(rescale=1./255)

# Applying image data gernerator to train and test data

# print("all entities path : ", all_entities_path)
# print ("X_train head: ", X_train.head())
# print ("X_val head: ", X_val.head())
# print ("X_test head: ", X_test.head())

train_generator = train_datagen.flow_from_dataframe(X_train,
                                                directory = raw_data_path ,
                                                x_col= 'filename',
                                                y_col= 'label',
                                                batch_size = bat_size,
                                                target_size = (image_size,image_size),
                                                class_mode='categorical')
# print("tain generator indicise : " ,train_generator.class_indices)
# print("corrupted files : ", corrupted_file_paths)
val_generator = test_datagen.flow_from_dataframe(X_val,
                                                directory = raw_data_path ,
                                                x_col= 'filename',
                                                y_col= 'label',
                                                batch_size = bat_size,
                                                target_size = (image_size,image_size),
                                                shuffle=False,
                                                class_mode='categorical')

test_generator = test_datagen.flow_from_dataframe(X_test,
                                                directory = raw_data_path ,
                                                x_col= 'filename',
                                                y_col= 'label',
                                                batch_size = bat_size,
                                                target_size = (image_size,image_size),
                                                shuffle=False,
                                                class_mode='categorical')

print("Creating Image Data Generator done")

# endregion



Found 9998 validated image filenames belonging to 2 classes.
Found 1250 validated image filenames belonging to 2 classes.
Found 1250 validated image filenames belonging to 2 classes.
Creating Image Data Generator done


In [9]:
# 4 --------------------------------------------------------- Deep Learning Model
# region
# @title 4 - Deep Learning Model { display-mode: "form" }

model = Sequential()

# Input Layer
model.add(Conv2D(32,(3,3),activation='relu',input_shape = (image_size,image_size,image_channel)))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.2))

# Bloack 1
model.add(Conv2D(64,(3,3),activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.2))
# Block 2
model.add(Conv2D(128,(3,3),activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.2))
# Block 3
model.add(Conv2D(256,(3,3),activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.2))

# Fully Connected layers
model.add(Flatten())
model.add(Dense(512,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))

# Output layer
model.add(Dense(2, activation='softmax'))
# model.summary()

print("Deep Learning Model done")

# endregion





Deep Learning Model done


In [10]:
# 5 --------------------------------------------------------- Callbacks
# region
# @title 5 - Callbacks { display-mode: "form" }
learning_rate_reduction = ReduceLROnPlateau(monitor = 'val_accuracy',
                                            patience=2,
                                            factor=0.5,
                                            min_lr = 0.00001,
                                            verbose = 1)

early_stoping = EarlyStopping(monitor='val_loss',patience= 3,restore_best_weights=True,verbose=0)

print("Callbacks done")

# endregion


Callbacks done


In [11]:
# 6 --------------------------------------------------------- Model Compilation
# region
# @title 6 - Model Compilation { display-mode: "form" }

model.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])

print("Model Compilation done")

# endregion

#


Model Compilation done


In [12]:
# 7 --------------------------------------------------------- Model Fitting
# region
# @title 7 - Model Fitting { display-mode: "form" }

print("x_train length: ",len(X_train))
print("x_test length: ",len(X_test))
print("batch size: ",bat_size)
print("steps_per_epoch: ",len(X_train) , " // " , bat_size , " = " , len(X_train) // bat_size)
print("validation_steps: ",len(X_test) , " // " , bat_size , " = " , len(X_test) // bat_size)
cat_dog = model.fit(train_generator,
                    validation_data = val_generator,
                    callbacks=[early_stoping,learning_rate_reduction],
                    epochs = 30,
                    # data generator must generate at least steps_per_epochs * epochs batches

                    steps_per_epoch = len(X_train) // bat_size,
                    validation_steps = len(X_test) // bat_size,
                   )

print("Model Fitting done")
# endregion

#

x_train length:  9998
x_test length:  1250
batch size:  32
steps_per_epoch:  9998  //  32  =  312
validation_steps:  1250  //  32  =  39
Epoch 1/30







UnknownError: Graph execution error:

Detected at node PyFunc defined at (most recent call last):
<stack traces unavailable>
UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x000002350A2DA7F0>
Traceback (most recent call last):

  File "c:\Users\Dell\Desktop\Bachelor\dogs-vs-cats\env\Lib\site-packages\tensorflow\python\ops\script_ops.py", line 270, in __call__
    ret = func(*args)
          ^^^^^^^^^^^

  File "c:\Users\Dell\Desktop\Bachelor\dogs-vs-cats\env\Lib\site-packages\tensorflow\python\autograph\impl\api.py", line 643, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^

  File "c:\Users\Dell\Desktop\Bachelor\dogs-vs-cats\env\Lib\site-packages\tensorflow\python\data\ops\from_generator_op.py", line 198, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "c:\Users\Dell\Desktop\Bachelor\dogs-vs-cats\env\Lib\site-packages\keras\src\engine\data_adapter.py", line 917, in wrapped_generator
    for data in generator_fn():

  File "c:\Users\Dell\Desktop\Bachelor\dogs-vs-cats\env\Lib\site-packages\keras\src\engine\data_adapter.py", line 1064, in generator_fn
    yield x[i]
          ~^^^

  File "c:\Users\Dell\Desktop\Bachelor\dogs-vs-cats\env\Lib\site-packages\keras\src\preprocessing\image.py", line 116, in __getitem__
    return self._get_batches_of_transformed_samples(index_array)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "c:\Users\Dell\Desktop\Bachelor\dogs-vs-cats\env\Lib\site-packages\keras\src\preprocessing\image.py", line 370, in _get_batches_of_transformed_samples
    img = image_utils.load_img(
          ^^^^^^^^^^^^^^^^^^^^^

  File "c:\Users\Dell\Desktop\Bachelor\dogs-vs-cats\env\Lib\site-packages\keras\src\utils\image_utils.py", line 423, in load_img
    img = pil_image.open(io.BytesIO(f.read()))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "c:\Users\Dell\Desktop\Bachelor\dogs-vs-cats\env\Lib\site-packages\PIL\Image.py", line 3309, in open
    raise UnidentifiedImageError(msg)

PIL.UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x000002350A2DA7F0>


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_3336]

In [None]:
# 8 --------------------------------------------------------- Plot the results
# region
# @title 8 - Plot the results { display-mode: "form" }
# plots for accuracy and Loss with epochs

error = pd.DataFrame(cat_dog.history)

plt.figure(figsize=(18,5),dpi=200)
sns.set_style('darkgrid')

plt.subplot(121)
plt.title('Cross Entropy Loss',fontsize=15)
plt.xlabel('Epochs',fontsize=12)
plt.ylabel('Loss',fontsize=12)
plt.plot(error['loss'])
plt.plot(error['val_loss'])

plt.subplot(122)
plt.title('Classification Accuracy',fontsize=15)
plt.xlabel('Epochs',fontsize=12)
plt.ylabel('Accuracy',fontsize=12)
plt.plot(error['accuracy'])
plt.plot(error['val_accuracy'])

plt.show(block=False)  # hosain : prevent the popup

print("Plot the results done")

# endregion

#

In [None]:
# 9 --------------------------------------------------------- Evaluation
# region
# @title 9 - Evaluation { display-mode: "form" }
# Evaluvate for train generator
loss,acc = model.evaluate(train_generator,batch_size = bat_size, verbose = 0)

print('The accuracy of the model for training data is:',acc*100)
print('The Loss of the model for training data is:',loss)

# Evaluvate for validation generator
loss,acc = model.evaluate(val_generator,batch_size = bat_size, verbose = 0)

print('The accuracy of the model for validation data is:',acc*100)
print('The Loss of the model for validation data is:',loss)

#  endregion

#

In [None]:
# 10 --------------------------------------------------------- save the model
# region
# @title 10 - save the model { display-mode: "form" }
model.save("model.keras")

print("Model saved")
# endregion

#

In [None]:
# 11 --------------------------------------------------------- Prediction
# region
# @title 11 - Prediction { display-mode: "form" }
result = model.predict(test_generator,batch_size = bat_size,verbose = 0)

y_pred = np.argmax(result, axis = 1)

y_true = test_generator.labels

# Evaluvate
loss,acc = model.evaluate(test_generator, batch_size = bat_size, verbose = 0)

print('The accuracy of the model for testing data is:',acc*100)
print('The Loss of the model for testing data is:',loss)

# endregion

#

In [None]:
# 12 --------------------------------------------------------- Classification Report
# region
# @title 12 - Classification Report { display-mode: "form" }
all_entities_names =['Cat','Dog']
print(classification_report(y_true, y_pred,target_names=all_entities_names))

# endregion

#

In [None]:
# 13 --------------------------------------------------------- Confusion Matrix
# region
# @title 13 - Confusion Matrix { display-mode: "form" }

# Normalize the confusion matrix
confusion_mtx = confusion_matrix(y_true, y_pred)
confusion_mtx = confusion_mtx.astype('float') / confusion_mtx.sum(axis=1)[:, np.newaxis]

print("Normalized Confusion Matrix: \n", confusion_mtx)

f, ax = plt.subplots(figsize=(8, 4), dpi=200)
sns.heatmap(confusion_mtx, annot=True, linewidths=0.1, cmap="gist_yarg_r", linecolor="black", fmt='.2%', ax=ax, cbar=False, xticklabels=all_entities_names, yticklabels=all_entities_names)

plt.xlabel("Predicted Label", fontsize=10)
plt.ylabel("True Label", fontsize=10)
plt.title("Confusion Matrix", fontsize=13)

plt.show()
# endregion



In [None]:
# --------------------------------------------------------- reference codes
# region
#  @title  reference codes { display-mode: "form" }

# from google.colab import drive
# drive.mount('/content/gdrive')

# import zipfile
# import os
# extract_source = "/content/gdrive/My Drive/GUC Bachelor : Arabic Image-to-Letters Script Recognition/data set/data set 2.zip"
# extract_destination = "/content/extracted_from_drive"
# zip_ref = zipfile.ZipFile(extract_source, 'r')
# print("extracting...")
# zip_ref.extractall(extract_destination)
# extracted_folder_name = os.listdir(extract_destination)[0]
# zip_ref.close()
# print("extracted files from drive to colab successfully !")
# all_entities_path = os.path.join(extract_destination, extracted_folder_name)


# def get_corrupted_files_paths(all_entities_path):
#     all_entities_names = os.listdir(all_entities_path)
#     corrupted_file_paths = set()
#     print("\nVerifying all files are non-corrupted images...")
#     for entity_name in all_entities_names:
#         entity_path = os.path.join(all_entities_path, entity_name)
#         for filename in os.listdir(entity_path):
#             file_path = os.path.join(entity_path, filename)
#             if not image_is_ok(file_path):
#                 print(f"File {file_path} is corrupted and will be skipped.")
#                 corrupted_file_paths.add(file_path)
#     print("Your data is ok." if not corrupted_file_paths else "Done reporting error files.")
#     return corrupted_file_paths

