# Initialization

## Data Preparation

In [7]:
import os
import shutil
import pathlib

In [4]:
"""
!cd data
!unzip dogs-vs-cats.zip
!unzip train.zip
"""

'\n!cd data\n!unzip dogs-vs-cats.zip\n!unzip train.zip\n'

In [11]:
# Absolute location of the extracted training images, plus one sub-folder per
# class. The trailing slashes matter: the class paths are built by plain string
# concatenation.
data_path = os.path.join(os.getcwd(), 'data/train/')
data_path_cat, data_path_dog = (data_path + sub for sub in ('cat/', 'dog/'))

# exist_ok=True makes this cell safe to re-run once the folders exist.
for class_dir in (data_path_cat, data_path_dog):
    os.makedirs(class_dir, exist_ok=True)

In [6]:
# Sort the flat image dump into the per-class folders based on the file-name
# prefix ("cat." / "dog.").
# - The listing uses data_path (the same absolute folder the files are moved
#   from) rather than a second, relative spelling of the directory.
# - startswith() only accepts the tag at the beginning of the name; a substring
#   search would also match names that merely contain "cat." or "dog.".
# - Anything that is not a regular file (e.g. the class folders themselves) is
#   skipped, so the cell can be re-run after the folders have been created.
for filename in os.listdir(data_path):
    source = os.path.join(data_path, filename)
    if not os.path.isfile(source):
        continue
    if filename.startswith('cat.'):
        shutil.move(source, data_path_cat)
    elif filename.startswith('dog.'):
        shutil.move(source, data_path_dog)

Only 4,000 cat images and 4,000 dog images (8,000 images in total) are used, due to memory constraints.

In [1]:
# Delete some data from the dataset
# Disabled cell: the commands live inside a string literal, so running the cell
# is a no-op. Remove the surrounding quotes to actually delete files.
# Each pipeline shuffles the file list (sort -R), skips the first 4000 entries
# (tail -n +4001) and removes everything after them, i.e. it keeps a random
# 4000 files per class. The -z / -0 / -print0 flags keep file names
# NUL-separated; these are GNU tool options.
# Every `!` line runs in its own throw-away subshell, so `!cd ...` would not
# carry over to the next line and `find .` would act on the notebook's working
# directory. The target folder is therefore passed to `find` explicitly.
# `xargs -r` skips the `rm` call when there is nothing left to delete.
"""
!find data/train/cat -type f -print0 | sort -zR | tail -zn +4001 | xargs -0 -r rm
!find data/train/dog -type f -print0 | sort -zR | tail -zn +4001 | xargs -0 -r rm
"""

'\n!cd data/train/cat\n!find . -type f -print0 | sort -zR | tail -zn +4001 | xargs -0 rm\n!cd ../dog\n!find . -type f -print0 | sort -zR | tail -zn +4001 | xargs -0 r\n'

## Data Loading and Pre-processing

In [8]:
import numpy as np
import PIL
from PIL import Image
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import matplotlib.pyplot as plt

2023-07-12 17:57:05.202905: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# Record the library versions this notebook was executed with
# (TensorFlow first, then Keras, one per line).
print(tf.__version__, keras.__version__, sep='\n')

2.12.0
2.12.0


Count the number of samples in the dataset.

In [13]:
# From here on data_path is a pathlib.Path instead of a string.
# with_suffix('') drops a file extension from the last path component if one
# is present.
data_path = pathlib.Path(data_path).with_suffix('')

# Count the .jpg files exactly one folder below the training root
# (pattern: <class folder>/<image>.jpg).
jpeg_files = data_path.glob('*/*.jpg')
images_count = sum(1 for _ in jpeg_files)
print(images_count)

8000


## Dataset Initialization

In [14]:
# Number of images per batch produced by the dataset loader.
batch_size = 20
# Target size (in pixels) every image is resized to when it is loaded.
img_height, img_width = 200, 200

Split the dataset into 3 parts:
- Test dataset
- Train dataset
- Validation dataset

In [16]:
# Build batched image datasets from the class sub-folders under data_path.
# subset='both' returns two datasets in one call: the first holds 80% of the
# files, and the held-out 20% (validation_split) is used as the test set here.
# The fixed seed keeps the split reproducible between runs.
train_dataset, test_dataset = keras.utils.image_dataset_from_directory(
    directory=data_path,
    batch_size=batch_size,
    image_size=(img_height, img_width),
    seed=84,
    validation_split=0.2,
    subset='both',
)

Found 8000 files belonging to 2 classes.
Using 6400 files for training.
Using 1600 files for validation.


In [21]:
# Carve the first 20% of the training batches off as a validation set and keep
# the remaining batches for training.
# NOTE: this cell rebinds train_dataset, so it is not idempotent. Running it a
# second time without re-creating train_dataset shrinks the training set again.
# NOTE(review): if train_dataset is reshuffled on every iteration, take()/skip()
# may not select the same batches each epoch. Confirm the loader's shuffle
# behaviour.
valid_batches = round(train_dataset.cardinality().numpy() * 0.2)
valid_dataset = train_dataset.take(valid_batches)
train_dataset = train_dataset.skip(valid_batches)

In [22]:
# Report how many batches each of the three splits contains.
batch_reports = (
    ("Test dataset batches:", test_dataset),
    ("Train dataset batches:", train_dataset),
    ("Valid dataset batches:", valid_dataset),
)
for heading, dataset in batch_reports:
    print(heading, dataset.cardinality())

Test dataset batches: tf.Tensor(80, shape=(), dtype=int64)
Train dataset batches: tf.Tensor(256, shape=(), dtype=int64)
Valid dataset batches: tf.Tensor(64, shape=(), dtype=int64)
