In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
from keras.applications.vgg19 import VGG19
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten

import os
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import cv2
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

In [10]:
# Load the training labels table from labels.csv in the working directory.
df_train = pd.read_csv('labels.csv')

In [11]:
# Preview the first rows; the table has an `id` column and a `breed` column.
df_train.head()

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [12]:
# Pull out the breed column and expand it into one indicator column per
# distinct breed (stored sparsely).
targets_series = df_train['breed'].copy(deep=False)
one_hot = pd.get_dummies(data=targets_series, sparse=True)

In [13]:
# Convert the one-hot frame to a NumPy array so label rows can be looked up
# by position.
one_hot_labels = np.asarray(one_hot)

In [15]:
# Side length in pixels: every image is resized to im_size x im_size and the
# network input shape is (im_size, im_size, 3).
im_size = 256

In [16]:
# Accumulators for the resized training images, their one-hot labels, and
# the resized test images (three independent empty lists).
x_train, y_train, x_test = [], [], []

In [17]:
# Read every training image listed in df_train, resize it to
# im_size x im_size, and store it next to the one-hot label of the same row.
# The lists are (re)created here so that re-running this cell does not append
# a second copy of the whole data set.
x_train = []
y_train = []
for i, (f, _breed) in enumerate(tqdm(df_train.values)):
    path = 'train/{}.jpg'.format(f)
    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None instead of raising when the file is missing
        # or cannot be decoded; fail here with the offending path rather than
        # with an opaque error inside cv2.resize.
        raise FileNotFoundError('Could not read training image: {}'.format(path))
    x_train.append(cv2.resize(img, (im_size, im_size)))
    # Row i of one_hot_labels corresponds to row i of df_train.
    y_train.append(one_hot_labels[i])

100%|██████████| 10222/10222 [04:14<00:00, 40.23it/s]


In [22]:
# Load the sample submission; its `id` column is used below to locate the
# test images and to label the prediction rows.
df_test = pd.read_csv('sample_submission.csv')

In [23]:
# Read every test image listed in df_test['id'] and resize it to
# im_size x im_size.
# x_test is (re)created here: a later cell rebinds the name to an ndarray, so
# without this reset a re-run of this cell would fail on `.append` (or append
# duplicates if the list were still around).
x_test = []
for f in tqdm(df_test['id'].values):
    path = 'test/{}.jpg'.format(f)
    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None instead of raising when the file is missing
        # or cannot be decoded.
        raise FileNotFoundError('Could not read test image: {}'.format(path))
    x_test.append(cv2.resize(img, (im_size, im_size)))

100%|██████████| 10357/10357 [04:01<00:00, 42.83it/s]


In [24]:
# Stack the Python lists into single NumPy arrays.
y_train_raw = np.array(y_train, np.uint8)

# Convert the images to float32 and divide by 255. The division is done in
# place: `np.array(...) / 255.` would allocate a second full-size float32
# array for each data set, doubling peak memory, while producing the same
# values.
x_train_raw = np.array(x_train, np.float32)
x_train_raw /= 255.

# NOTE: x_test is rebound from a list of images to one ndarray here, so this
# cell must not be re-run without first re-running the test-loading cell
# (otherwise the pixels would be divided by 255 a second time).
x_test = np.array(x_test, np.float32)
x_test /= 255.

In [25]:
# Sanity check: print the shape of the training images, the training labels
# and the test images, one per line.
for arr in (x_train_raw, y_train_raw, x_test):
    print(arr.shape)

(10222, 256, 256, 3)
(10222, 120)
(10357, 256, 256, 3)


In [26]:
# The label array printed above has 120 columns, i.e. one per breed. Keep that
# count in num_class so it can size the output layer of the CNN model.
num_class = y_train_raw.shape[1]

In [27]:
# Hold out 20% of the labelled images for validation; random_state is fixed
# so the split is the same on every run.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_train_raw,
    y_train_raw,
    test_size=0.2,
    random_state=1,
)

In [28]:
# Create the VGG19 convolutional base without its classification head.
# NOTE(review): weights=None means the base is randomly initialised, NOT
# pre-trained (the weights could not be downloaded in the kernel).
base_model = VGG19(weights = None, include_top=False, input_shape=(im_size, im_size, 3))

# Add a new top layer: flatten the base output and classify it into
# num_class categories with a softmax layer
x = base_model.output
x = Flatten()(x)
predictions = Dense(num_class, activation='softmax')(x)

# This is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

# First: train only the new top layer by freezing every base-model layer.
# NOTE(review): since the base holds random (not pre-trained) weights, this
# freezes random filters; confirm that this is intended.
for layer in base_model.layers:
    layer.trainable = False

model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

# NOTE(review): callbacks_list is defined here, but none of the model.fit
# calls visible in this notebook pass it, so early stopping is never active.
callbacks_list = [keras.callbacks.EarlyStopping(monitor='val_acc', patience=3, verbose=1)]
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 256, 256, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 256, 256, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 256, 256, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 128, 128, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 128, 128, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 128, 128, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 64, 64, 128)       0         
__________

In [29]:
# One epoch with the base layers frozen (compiled with 'adam'), validating on
# the held-out split. callbacks_list is not passed to this call.
model.fit(
    X_train,
    Y_train,
    epochs=1,
    validation_data=(X_valid, Y_valid),
    verbose=1,
)

Train on 8177 samples, validate on 2045 samples
Epoch 1/1


<keras.callbacks.History at 0x7fa39a134c50>

In [30]:
# Recompile the same model, this time with the 'sgd' optimizer instead of
# 'adam'; loss and metric are unchanged.
model.compile(
    loss='categorical_crossentropy',
    optimizer='sgd',
    metrics=['accuracy'],
)

In [31]:
# Two further epochs after the switch to 'sgd', validating on the held-out
# split.
model.fit(
    X_train,
    Y_train,
    epochs=2,
    validation_data=(X_valid, Y_valid),
    verbose=1,
)

Train on 8177 samples, validate on 2045 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa41dd7df98>

In [36]:
# Compile again with the same loss, optimizer ('sgd') and metric as the
# previous compile cell; no model settings were changed in between.
model.compile(
    loss='categorical_crossentropy',
    optimizer='sgd',
    metrics=['accuracy'],
)

In [37]:
# Two more epochs with 'sgd', validating on the held-out split.
model.fit(
    X_train,
    Y_train,
    epochs=2,
    validation_data=(X_valid, Y_valid),
    verbose=1,
)

Train on 8177 samples, validate on 2045 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f9df0048dd8>

In [40]:
# Next: unfreeze every base-model layer so that the whole network, not just
# the new top layer, is trained from here on.
for layer in base_model.layers:
    layer.trainable = True

In [41]:
# Compile again after the `trainable` flags were flipped (Keras expects a
# recompile for such a change to be picked up), still using 'sgd'.
model.compile(
    loss='categorical_crossentropy',
    optimizer='sgd',
    metrics=['accuracy'],
)

In [42]:
# Two epochs with all layers marked trainable, validating on the held-out
# split.
model.fit(
    X_train,
    Y_train,
    epochs=2,
    validation_data=(X_valid, Y_valid),
    verbose=1,
)

Train on 8177 samples, validate on 2045 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f9df00941d0>

In [43]:
# Run the trained model over the test images; the output layer is a softmax
# over num_class units, so each row of preds holds one score per class.
preds = model.predict(x_test, verbose=1)



In [44]:
# Name the prediction columns after the one-hot encoded breeds generated
# earlier, so each column lines up with its breed.
col_names = one_hot.columns.values
sub = pd.DataFrame(preds, columns=col_names)
# Put the image ids from the sample submission in front as the first column
sub.insert(loc=0, column='id', value=df_test['id'])
sub.head(5)

Unnamed: 0,id,affenpinscher,afghan_hound,african_hunting_dog,airedale,american_staffordshire_terrier,appenzeller,australian_terrier,basenji,basset,...,toy_poodle,toy_terrier,vizsla,walker_hound,weimaraner,welsh_springer_spaniel,west_highland_white_terrier,whippet,wire-haired_fox_terrier,yorkshire_terrier
0,000621fb3cbb32d8935728e48679680e,0.002458,0.010136,0.004575,0.01136,0.005095,0.007148,0.005755,0.016079,0.010807,...,0.005372,0.007739,0.00374,0.006615,0.008147,0.010009,0.010024,0.006231,0.009024,0.005947
1,00102ee9d8eb90812350685311fe5890,0.001527,0.015924,0.004218,0.014921,0.003911,0.002902,0.00829,0.018138,0.008165,...,0.005231,0.004197,0.002814,0.004977,0.010221,0.009363,0.00934,0.007482,0.006379,0.007503
2,0012a730dfa437f5f3613fb75efcd4ce,0.003542,0.009328,0.004401,0.00924,0.008027,0.014207,0.006076,0.014193,0.015523,...,0.005115,0.009732,0.003692,0.007942,0.006243,0.010808,0.009465,0.005381,0.00686,0.006301
3,001510bc8570bbeee98c8d80c8a95ec1,0.003647,0.010793,0.004921,0.009946,0.007916,0.011441,0.008058,0.014158,0.01162,...,0.00684,0.006275,0.00397,0.007249,0.007564,0.0093,0.009879,0.005142,0.005741,0.00847
4,001a5f3114548acdefa3d4da05474c2e,0.003319,0.011128,0.003202,0.008742,0.008666,0.017656,0.005718,0.014777,0.015007,...,0.005284,0.007183,0.002609,0.007053,0.006314,0.010175,0.007441,0.004005,0.004973,0.00691


In [46]:
# Write the submission file without the DataFrame index column.
sub.to_csv('dog-breed.csv',index = False)

In [47]:
!kg submit 'dog-breed.csv' -u lightsalsa -p 11QQqq!! -c dog-breed-identification -m "VGG19"

4.65901
