In [1]:
from keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import os
from keras.models import Model
from keras.layers import Dense, Flatten, GlobalAveragePooling2D, Dropout, Input, AveragePooling2D, Lambda
from urllib import request
from keras.callbacks import TensorBoard
import tempfile
from keras import optimizers
from keras.utils import Sequence
import numpy as np
from tqdm import tqdm
from datetime import datetime
from keras.applications import vgg16, resnet50
from glob import glob

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Use env variable to detect if run in kaggle environment or not

#!export NOT_KAGGLE_KERNEL=true

In [2]:
NOT_KAGGLE_KERNEL = os.environ.get('NOT_KAGGLE_KERNEL', 'false') == 'true'
print('Run on kaggle kernel:', NOT_KAGGLE_KERNEL)

Run on kaggle kernel: False


# Define variables

In [3]:
ROOT_DIR = '../input/dog-breed' if NOT_KAGGLE_KERNEL else '../input'
TRAIN_DIR = os.path.join(ROOT_DIR, 'train')
TEST_DIR = os.path.join(ROOT_DIR, 'test')

EPOCHS = 50
BATCH_SIZE = 512
IMAGE_SIZE = (224,224)
INPUT_SHAPE = IMAGE_SIZE + (3,)

# Fetch data (if not exist)

In [4]:
def fetch_data():
    if not os.path.exists(ROOT_DIR):
        os.makedirs(ROOT_DIR)
        zip_path = os.path.join(tempfile.gettempdir(), 'dog-breed.zip')
        print('Start download!')
        request.urlretrieve('https://kienle.blob.core.windows.net/public/kaggle/dog-breed.zip', zip_path)
        print('Start unzip')
        import zipfile
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(ROOT_DIR)
        with zipfile.ZipFile(os.path.join(ROOT_DIR,'test.zip'), 'r') as zip_ref:
            zip_ref.extractall(ROOT_DIR)
        with zipfile.ZipFile(os.path.join(ROOT_DIR,'train.zip'), 'r') as zip_ref:
            zip_ref.extractall(ROOT_DIR)
        os.remove(zip_path)
        os.remove(os.path.join(ROOT_DIR,'train.zip'))
        os.remove(os.path.join(ROOT_DIR,'test.zip'))
        print('Done')
fetch_data()

In [5]:
# Confirm all data are ready 
# Expected output: labels.csv  sample_submission.csv  test  train
!ls $ROOT_DIR

labels.csv  my_submission.csv  sample_submission.csv  test  train


# Prepare training data set

In [6]:
pd_images = pd.read_csv(os.path.join(ROOT_DIR, 'labels.csv'))

In [7]:
pd_images.head()

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [8]:
image_gen = ImageDataGenerator()

In [9]:
train_valid_generator = image_gen.flow_from_dataframe(pd_images, TRAIN_DIR, batch_size=BATCH_SIZE, x_col='id', y_col='breed', has_ext=False, target_size=IMAGE_SIZE)

Found 10222 images belonging to 120 classes.


In [10]:
num_classes = len(train_valid_generator.class_indices)

# Feature extractions using pretrained models

In [11]:
class FeaturesExtractor():
    def __init__(self, input_shape, pretrained_model, preprocess_input):
        self.input_shape = input_shape
        model = pretrained_model(include_top=False, input_shape=self.input_shape, weights='imagenet')
        inputs = Input(self.input_shape)
        x = inputs
        x = Lambda(preprocess_input, name='preprocessing')(x)
        x = model(x)
        x = GlobalAveragePooling2D()(x)
        model = Model(inputs, x)
        self.model = model
    def predict(self, X):
        return self.model.predict(X)
    def predict_generator(self, X):
        return self.model.predict_generator(X)

In [12]:
vgg_features_extractor = FeaturesExtractor(INPUT_SHAPE, vgg16.VGG16, vgg16.preprocess_input)
resnet50_features_extractor = FeaturesExtractor(INPUT_SHAPE, resnet50.ResNet50, resnet50.preprocess_input)



In [13]:
X_train = np.zeros((train_valid_generator.n,) + INPUT_SHAPE)
y_train = np.zeros((train_valid_generator.n, 120))
for i in tqdm(range(len(train_valid_generator)), ncols=100):
    temp_x, temp_y = train_valid_generator[i]
    start_index = i*train_valid_generator.batch_size
    X_train[start_index:start_index + temp_x.shape[0]] = temp_x
    y_train[start_index:start_index + temp_x.shape[0]] = temp_y


100%|███████████████████████████████████████████████████████████████| 20/20 [00:33<00:00,  1.68s/it]


In [14]:
def create_simple_model(num_classes, input_shape):
    inputs = Input(input_shape)
    x = inputs
    x = Dropout(0.5)(x)
    x = Dense(num_classes, activation='softmax')(x)
    return Model(inputs, x)


# Use vgg16

In [15]:
# Calculate vgg features
vgg_features = vgg_features_extractor.predict(X_train)

In [16]:
vgg_based_model = create_simple_model(num_classes, vgg_features_extractor.model.output.get_shape().as_list()[1:])
vgg_based_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
if NOT_KAGGLE_KERNEL:
    callbacks = [TensorBoard('./data/logs/vgg-{0}'.format(datetime.now().isoformat().replace(':','-').split('.')[0]))]
else:
    callbacks = []
    
vgg_based_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 120)               61560     
Total params: 61,560
Trainable params: 61,560
Non-trainable params: 0
_________________________________________________________________


In [17]:
vgg_based_model.fit(vgg_features, y_train, batch_size=256, epochs=EPOCHS, callbacks=callbacks, validation_split=0.1)

Train on 9199 samples, validate on 1023 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f5ef8262630>

# Use resnet50

In [18]:
resnet50_features = resnet50_features_extractor.predict(X_train)

In [19]:
resnet50_based_model = create_simple_model(num_classes, resnet50_features_extractor.model.output.get_shape().as_list()[1:])
resnet50_based_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
if NOT_KAGGLE_KERNEL:
    callbacks = [TensorBoard('./data/logs/resnet50-{0}'.format(datetime.now().isoformat().replace(':','-').split('.')[0]))]
else:
    callbacks = []

In [20]:
resnet50_based_model.fit(resnet50_features, y_train, batch_size=256, epochs=EPOCHS,  callbacks=callbacks, validation_split=0.1)

Train on 9199 samples, validate on 1023 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f5ef809f828>

# Confusion matrix (VGG)

In [21]:
index2labels = {}
for k, v in train_valid_generator.class_indices.items():
    index2labels[v] = k

In [None]:
preds = []
targets = []
for i in tqdm(range(len(train_valid_generator)), ncols=100):
    x, y = train_valid_generator[i]
    p = vgg_based_model.predict(vgg_features_extractor.predict(x))
    p = np.argmax(p, axis=1)    
    y = np.argmax(y, axis=1)
    preds = np.concatenate((preds, p))
    targets = np.concatenate((targets, y))
from sklearn.metrics import confusion_matrix
print(np.sum(targets == preds))
print(preds[0:10])
print(targets[0:10])
cm = confusion_matrix(targets, preds)
print(cm[0:10, 0:10])

 70%|████████████████████████████████████████████                   | 14/20 [00:44<00:18,  3.14s/it]

In [None]:
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline
df_cm = pd.DataFrame(cm)
plt.figure(figsize = (10,10))
sn.heatmap(df_cm, annot=False)

# Prepare submission data

In [None]:
pd_test = pd.DataFrame(glob(TEST_DIR +'/*.jp*g'), columns=['filename'])
test_generator = image_gen.flow_from_dataframe(pd_test, TEST_DIR,  class_mode=None, target_size=IMAGE_SIZE, shuffle=False)

In [None]:
test_resnet_features = resnet50_features_extractor.predict_generator(test_generator)

In [None]:
type(test_resnet_features)

In [None]:
test_probs = resnet50_based_model.predict(test_resnet_features)

In [None]:
pd_sub = pd.read_csv(os.path.join(ROOT_DIR, 'sample_submission.csv'))

In [None]:
sub_mission = pd.DataFrame(test_probs, index=pd_sub.index, columns=pd_sub.columns[1:])
sub_mission['id'] = pd_sub['id']
sub_mission_cols = sub_mission.columns.tolist()
sub_mission = sub_mission[sub_mission_cols[-1:] + sub_mission_cols[:-1]]

In [None]:
sub_mission.to_csv(os.path.join(ROOT_DIR, 'my_submission.csv'), index=False)