In [1]:
import shutil
import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import logging
# Raise the threshold of TensorFlow's logger to ERROR so that
# lower-severity messages are not emitted.
logger = tf.get_logger()
logger.setLevel(logging.ERROR)

KeyboardInterrupt: 

# Unpack & load data
1. unpack data
1. split the dog and cat images into train and validation sets
1. move the dog and cat files into per-class subdirectories (required by ImageDataGenerator)

In [None]:
import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)



In [None]:
# Root directory that receives the unpacked archives.
CONTENT_DIR = '/kaggle/content'

# Training images, with one sub-directory per class.
TRAIN_DIR = '{}/train'.format(CONTENT_DIR)
TRAIN_DIR_DOG = '{}/dog'.format(TRAIN_DIR)
TRAIN_DIR_CAT = '{}/cat'.format(TRAIN_DIR)

# Validation images, same per-class layout.
VALID_DIR = '{}/valid'.format(CONTENT_DIR)
VALID_DIR_DOG = '{}/dog'.format(VALID_DIR)
VALID_DIR_CAT = '{}/cat'.format(VALID_DIR)

In [None]:
# Unpack the training archive into CONTENT_DIR.
import zipfile

with zipfile.ZipFile(file='/kaggle/input/dogs-vs-cats/train.zip', mode='r') as train_archive:
    train_archive.extractall(path=CONTENT_DIR)
    

In [None]:
# Split cat and dog images into train and validation sets.
#
# Only regular files are considered: once the class sub-directories 'dog' and
# 'cat' exist inside TRAIN_DIR, their names would otherwise match the
# startswith() filters below and be treated as images.
img_filenames = [
    fn for fn in os.listdir(TRAIN_DIR)
    if os.path.isfile(os.path.join(TRAIN_DIR, fn))
]
print('Num of images:', len(img_filenames))

dog_filenames = [fn for fn in img_filenames if fn.startswith('dog')]
cat_filenames = [fn for fn in img_filenames if fn.startswith('cat')]

# Each class is split on its own so the two lists do not need to have the
# same length (passing both lists to a single train_test_split call requires
# equal lengths). The same options, including random_state, are used for both.
split_options = dict(test_size=0.1, shuffle=True, random_state=42)
train_dogs, valid_dogs = train_test_split(dog_filenames, **split_options)
train_cats, valid_cats = train_test_split(cat_filenames, **split_options)

# Order matters: this list is later zipped with
# [TRAIN_DIR_DOG, VALID_DIR_DOG, TRAIN_DIR_CAT, VALID_DIR_CAT].
dataset_filenames = [train_dogs, valid_dogs, train_cats, valid_cats]

train_dog_total, valid_dog_total, train_cat_total, valid_cat_total = [len(fns) for fns in dataset_filenames]
train_total = train_dog_total + train_cat_total
valid_total = valid_dog_total + valid_cat_total
# The second number is the validation set size (it was previously labelled "test").
print('Train: {}, valid: {}'.format(train_total, valid_total))

In [None]:
# Move images into per-class directories.
# dataset_filenames is ordered as (train dogs, valid dogs, train cats, valid cats),
# so make_dirs lists the destinations in exactly that order.
make_dirs = [TRAIN_DIR_DOG, VALID_DIR_DOG, TRAIN_DIR_CAT, VALID_DIR_CAT]
# The loop variable is deliberately not called `dir`: at notebook top level that
# would shadow the builtin dir() for every cell executed afterwards.
for target_dir, fns in zip(make_dirs, dataset_filenames):
    os.makedirs(target_dir, exist_ok=True)
    for fn in tqdm.tqdm(fns):
        shutil.move(os.path.join(TRAIN_DIR, fn), target_dir)
    print('elements in {}: {}'.format(target_dir, len(os.listdir(target_dir))))

# Preprocessing
1. decode images
1. rescale pixel values from [0, 255] to [0, 1]

In [None]:
# Number of images per batch produced by the directory iterators.
BATCH_SIZE = 128
# Side length in pixels; images are resized to IMAGE_SHAPE x IMAGE_SHAPE.
IMAGE_SHAPE = 128

In [None]:
# Both generators apply the same single transformation: multiply pixel values by 1/255.
rescale_only = dict(rescale=1./255)
train_generator = ImageDataGenerator(**rescale_only)
valid_generator = ImageDataGenerator(**rescale_only)

In [None]:
# Options shared by the training and validation iterators.
flow_options = dict(
    target_size=(IMAGE_SHAPE, IMAGE_SHAPE),
    batch_size=BATCH_SIZE,
    class_mode='binary',
)

# Training batches keep the iterator's default shuffling.
train_data = train_generator.flow_from_directory(directory=TRAIN_DIR, **flow_options)

# Validation batches are read in a fixed order.
valid_data = valid_generator.flow_from_directory(
    directory=VALID_DIR, shuffle=False, **flow_options
)

# Image visualization

In [None]:
# Preview up to 25 images from one training batch on a 5x5 grid.
first_batch = next(train_data)
preview_images = first_batch[0][:25]  # element 0 of the batch holds the images

fig, axes = plt.subplots(nrows=5, ncols=5, figsize=(20, 20))
for ax, image in zip(axes.flat, preview_images):
    ax.imshow(image)
plt.tight_layout()
plt.show()

# Model
1. create model
1. compile
1. summary
1. training

In [None]:
# Four Conv2D + MaxPooling2D stages followed by a dense classification head.
layers = tf.keras.layers

model = tf.keras.models.Sequential([
    # Stage 1 also declares the input: IMAGE_SHAPE x IMAGE_SHAPE images with 3 channels.
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(IMAGE_SHAPE, IMAGE_SHAPE, 3)),
    layers.MaxPooling2D(pool_size=2, strides=2),

    # Stage 2
    layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=2, strides=2),

    # Stage 3
    layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=2, strides=2),

    # Stage 4
    layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=2, strides=2),

    # Classification head: flatten, one hidden dense layer, 2-way softmax output.
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dense(2, activation='softmax'),
])

In [None]:
# Training configuration: Adam optimizer, sparse categorical cross-entropy
# loss, and accuracy reported as the metric.
compile_options = {
    'optimizer': 'adam',
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

In [None]:
# Print a summary of the model defined above.
model.summary()

In [None]:
# Number of training epochs passed to the training call.
EPOCHS = 60

In [None]:
# Train the model.
# Model.fit accepts generator / Sequence inputs directly; Model.fit_generator
# is deprecated in TF 2.x, so fit is used instead.
history = model.fit(
    x=train_data,
    # Ceiling division: one step per batch, including a final partial batch.
    steps_per_epoch=(train_total + BATCH_SIZE - 1) // BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=valid_data,
    validation_steps=(valid_total + BATCH_SIZE - 1) // BATCH_SIZE,
)

In [None]:
# save model - optional
# Nothing earlier in the notebook creates ./checkpoints, so create it here
# instead of relying on save() to do it.
os.makedirs('./checkpoints', exist_ok=True)
model.save('./checkpoints/model4_60epoch.h5')

In [None]:
# download model - optional
# import os
# os.chdir(r'/kaggle/working')
# from IPython.display import FileLink
# FileLink(r'./checkpoints/model4_60epoch.h5')

In [None]:
# set finish path
# os.chdir(r'..')

# Train visualization
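Look at the validation loss: once it starts rising while the training loss keeps falling, the model is overfitting (here after epoch ?). It may be better to stop training at that point.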

In [None]:
# Plot training vs. validation accuracy and loss per epoch.
# The x-axis comes from the recorded history rather than from EPOCHS, so the
# plot still works when EPOCHS no longer matches the number of epochs that
# were actually recorded (e.g. the constant was edited after training).
epochs_run = range(len(history.history['loss']))

plt.figure(figsize=(12, 8))

plt.subplot(1, 2, 1)
plt.plot(epochs_run, history.history['accuracy'], label='train')
plt.plot(epochs_run, history.history['val_accuracy'], label='valid')
plt.xlabel('epoch')
plt.legend(loc='lower right')
plt.title('Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_run, history.history['loss'], label='train')
plt.plot(epochs_run, history.history['val_loss'], label='valid')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.title('Loss (sparse_categorical_crossentropy)')

plt.show()

# Predictions
1. unzip test data
1. scale and predict
1. save results

In [None]:
# Directory that receives the unpacked test archive.
TEST_DIR = '{}/test'.format(CONTENT_DIR)

# Unpack the test archive into TEST_DIR.
import zipfile

with zipfile.ZipFile(file='/kaggle/input/dogs-vs-cats/test1.zip', mode='r') as test_archive:
    test_archive.extractall(path=TEST_DIR)

In [None]:
# Same pixel rescaling as used for the training and validation generators.
test_generator = ImageDataGenerator(rescale=1./255)

# One image per batch, in a fixed order, so that row i of `predict`
# corresponds to test_data.filenames[i].
test_data = test_generator.flow_from_directory(
    directory=TEST_DIR,
    target_size=(IMAGE_SHAPE, IMAGE_SHAPE),
    batch_size=1,
    class_mode='binary',
    shuffle=False
)
test_data.reset()

test_total = len(test_data.filenames)
# Model.predict accepts generator / Sequence inputs directly;
# Model.predict_generator is deprecated in TF 2.x.
predict = model.predict(test_data, steps=test_total, verbose=1)

In [None]:
# labels = dict((v,k) for k,v in train_data.class_indices.items())
# predictions = [labels[k] for k in np.argmax(predict,axis=1)]
# predictions

In [None]:
# Build the submission file, matching predictions to rows by image id.
#
# `predict` is ordered like test_data.filenames, which the directory iterator
# lists in string-sorted order ('1.jpg', '10.jpg', '100.jpg', ...), while the
# sample submission is indexed by the numeric id. Assigning the predictions
# positionally would therefore attach labels to the wrong ids.
submission = pd.read_csv('/kaggle/input/dogs-vs-cats/sampleSubmission.csv', index_col='id')

# Assumes every test image is named '<id>.<ext>' with an integer id;
# int() raises ValueError otherwise.
test_ids = [
    int(os.path.splitext(os.path.basename(fn))[0])
    for fn in test_data.filenames
]
predicted_labels = pd.Series(np.argmax(predict, axis=1), index=test_ids)

# Fail loudly instead of writing NaN labels if some ids have no prediction.
missing_ids = submission.index.difference(predicted_labels.index)
if len(missing_ids) > 0:
    raise ValueError(
        'No prediction for {} submission ids, e.g. {}'.format(
            len(missing_ids), list(missing_ids[:5])
        )
    )

submission['label'] = predicted_labels.reindex(submission.index).astype(int)
submission.to_csv('./my_submission.csv')