In [None]:
"""
Trained on Google Colab

Before running this notebook, remember to download the dataset and to unzip the folders
"""

import os
import pickle
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint
# Keras application modules are lowercase (`vgg19`); only the class is spelled `VGG19`
from keras.applications.vgg19 import VGG19
from keras.applications.resnet50 import ResNet50
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.models import Sequential
from keras.layers import GlobalAveragePooling2D
from keras.layers.core import Dropout
from keras.layers.core import Flatten
from keras.layers.core import Dense
from keras.optimizers import SGD
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from utils import plot_confusion_matrix

# !pip install kaggle

In [None]:
# Uncomment to download the dataset

# os.environ['KAGGLE_USERNAME'] = ''
# os.environ['KAGGLE_KEY'] = ''
# !kaggle competitions download -c dog-breed-identification

In [None]:
# Uncomment to unzip the folders

# !unzip test.zip
# !unzip train.zip
# !unzip labels.csv

# Preprocessing

In [None]:
# Load the annotations file (one row per image, with its 'id' and 'breed')
df = pd.read_csv('labels.csv')

# Count the rows of every breed; the frame is indexed by breed and the
# count ends up in the 'id' column, which is used to rank breeds from the
# most to the least represented one
breed_counts = df.pivot_table(index='breed', aggfunc=len)
occurrencies_df = breed_counts.sort_values('id', ascending=False)

In [None]:
# --- Experiment configuration ---

# Output files: training history and checkpointed model
HISTORY_FILENAME = 'history_inceptionresnetv2_adam.pkl'
MODEL_FILENAME = 'model_inceptionresnetv2_adam.pkl'

# Input images: folder they are read from and side (in pixels) of the
# square they are resized to
TRAIN_PATH = 'train/'
INPUT_SIZE = 199

# Number of most represented breeds kept in the dataset
N = 35

In [None]:
# Names of the N breeds with the most images (occurrencies_df is already
# sorted from the most to the least frequent breed)
labels = np.array(occurrencies_df.index[:N])

# Drop every annotation row whose breed is not one of those N
is_top_breed = df['breed'].isin(labels)
df = df[is_top_breed]

In [None]:
# Upper bound on the number of images loaded for a single breed
MAX_IMAGES_PER_BREED = 90

data = list()
labels = list()
labels_occ = dict()  # breed -> number of matching files seen so far (loaded or not)

# Exact id -> breed lookup, built once. Testing ids with `str.contains` ran a
# regex substring scan over the whole column for every file and, for entries
# whose stem is empty (e.g. '.ipynb_checkpoints'), matched every id and then
# failed with an IndexError on the exact lookup.
# `drop_duplicates` keeps the first row of an id, like `.iloc[0]` did.
breed_by_id = df.drop_duplicates('id').set_index('id')['breed'].to_dict()

# Sorted so that the images picked under the per-breed cap do not depend on
# the arbitrary order in which os.listdir returns the entries
for file in sorted(os.listdir(TRAIN_PATH)):
    filename = os.path.splitext(file)[0]

    label = breed_by_id.get(filename)
    if label is None:
        # Not an image of one of the selected breeds
        continue

    labels_occ[label] = labels_occ.get(label, 0) + 1
    if labels_occ[label] > MAX_IMAGES_PER_BREED:
        continue

    image = cv2.imread(os.path.join(TRAIN_PATH, file))
    if image is None:
        # cv2.imread signals an unreadable file by returning None, not by raising
        print('Skipping unreadable image: ' + file)
        continue

    # Label and image are appended together so both lists stay aligned
    data.append(cv2.resize(image, (INPUT_SIZE, INPUT_SIZE)))
    labels.append(label)

# Rescale intensities from [0, 255] to [0.0, 1.0]
data = np.array(data, dtype='float') / 255.0

# Transform class labels into one-hot encoded vectors
labels = np.array(labels)
lb = LabelBinarizer()
labels = lb.fit_transform(labels)

In [None]:
# Hold out 20% of the samples for validation. The split is stratified on the
# labels and uses a fixed seed, so it is the same on every run.
trainX, testX, trainY, testY = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=1,
    stratify=labels,
)

In [None]:
# Random transformations applied to the training images while batches are drawn
augmentation_options = dict(
    rotation_range=25,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
data_augmentation = ImageDataGenerator(**augmentation_options)

# Modeling and Training

### Modeling

In [None]:
# Backbone: network pre-trained on ImageNet, loaded without its top classifier.
# Backbones tried as alternatives:
# base_model = VGG19(weights='imagenet', include_top=False, input_shape=(208, 208, 3))
# base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(208, 208, 3))
# base_model = InceptionV3(weights='imagenet', include_top=False, input_shape=(208, 208, 3))
image_shape = (INPUT_SIZE, INPUT_SIZE, 3)
base_model = InceptionResNetV2(include_top=False, weights='imagenet',
                               input_shape=image_shape)

# Head stacked on the backbone: pooled features -> 256 ReLU units ->
# dropout (rate 0.8) -> softmax with one output per class known to `lb`.
# Heads tried as alternatives (in place of the pooling/Dense(256) part):
# Flatten() -> Dense(2048, activation='relu') -> Dropout(0.8)
# Flatten() -> Dense(1024, activation='relu') -> Dropout(0.8)
model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dense(256, activation='relu'),
    Dropout(0.8),
    Dense(len(lb.classes_), activation='softmax'),
])

In [None]:
# Print the layer-by-layer overview of the assembled network
model.summary()

### Optimizers

In [None]:
# Optimizers tried as alternatives:
# opt = SGD(lr=0.01, clipnorm=1, decay=1e-6, momentum=0.9, nesterov=True)
# opt = RMSprop(lr=0.0001)
opt = Adam(lr=5e-5)

# Labels are one-hot vectors, hence the categorical cross-entropy loss;
# accuracy is tracked alongside it
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

### Training

In [None]:
BATCH_SIZE = 36
EPOCHS = 15

# Write the model to MODEL_FILENAME only when the validation loss improves
checkpoint = ModelCheckpoint(
    filepath=MODEL_FILENAME,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

# Augmented training batches; the validation data is passed as-is
train_batches = data_augmentation.flow(trainX, trainY, batch_size=BATCH_SIZE)

history = model.fit_generator(
    train_batches,
    steps_per_epoch=len(trainX) // BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(testX, testY),
    callbacks=[checkpoint],
    verbose=1,
)

In [None]:
# Persist the metrics recorded during training so the results can be plotted later
with open(HISTORY_FILENAME, mode='wb') as history_file:
    pickle.dump(history.history, history_file)

# Evaluating

### Evaluation of all images

In [None]:
# Score the whole validation set with a single `predict` call instead of one
# call per image; the decision for each image is still the argmax of its row
predictions = model.predict(testX)

# Strongest class index of every row, mapped back to the breed name.
# Both results are plain lists because the confusion-matrix cell consumes them.
predicted_labels = list(lb.classes_[np.argmax(predictions, axis=1)])
true_labels = list(lb.classes_[np.argmax(testY, axis=1)])

# Report every sample: the breed alone when correct, both breeds when wrong
for true_label, label in zip(true_labels, predicted_labels):
    if true_label != label:
        print('OH NO! It was a ' + true_label + ', but it predicted ' + label)
    else:
        print(true_label)

### Evaluation of one single image

In [None]:
# Position, inside the validation set, of the image to inspect
IMG_ID = 11

image = testX[IMG_ID]
true_label_enc = testY[IMG_ID]

# The model works on batches, so the image is wrapped in a batch of one
single_image_batch = image[np.newaxis, ...]
prediction = model.predict(single_image_batch)[0]
id_predicted = np.argmax(prediction)
predicted_label = lb.classes_[id_predicted]
print('Predicted:', predicted_label)

# Decode the one-hot vector back to the breed name
id_true_label = np.argmax(true_label_enc)
true_label = lb.classes_[id_true_label]
print('True:', true_label)

# Reverse the channel axis before plotting (images were read with OpenCV,
# which stores channels as BGR, while matplotlib expects RGB)
plt.axis('off')
plt.imshow(image[..., ::-1])
plt.show()

### Confusion matrix

In [None]:
# Class names as a plain list, in the order stored by the label binarizer
classes = lb.classes_.tolist()

# `predicted_labels` is the list filled by the evaluation of all images;
# the name `labels_predicted` is never defined and raised a NameError
plot_confusion_matrix(true_labels, predicted_labels, classes)

plt.show()