# Classifying dog breeds with Convolutional Neural Networks
DAT300, group CA2-10. Jon Nordby and Espen Sønneland.


## Kaggle setup
Downloads data from https://www.kaggle.com/c/dat300-2018-dogs

In [0]:
# Install the Kaggle command-line client (used by the cells below to download data and submit predictions)
!pip install -q kaggle && echo Kaggle installed

Kaggle installed


In [0]:
from google.colab import drive
import shutil, os, os.path

# Mount Google Drive; the Kaggle API token (kaggle.json) is read from there.
drive.mount('/content/drive')

# Copy the token into /root/.kaggle/ for the kaggle command-line client.
kaggle_dir = '/root/.kaggle'
kaggle_key = os.path.join(kaggle_dir, 'kaggle.json')
os.makedirs(kaggle_dir, exist_ok=True)  # no-op when the directory already exists
shutil.copyfile('/content/drive/My Drive/kaggle.json', kaggle_key)
# copyfile() copies contents only, not permissions: restrict the secret to its
# owner (the kaggle client warns when the key is readable by other users).
os.chmod(kaggle_key, 0o600)
'Kaggle API key installed'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'Kaggle API key installed'

In [0]:
# Download the competition archives into data/ and extract them there
# (unzip -n: never overwrite files that already exist, -q: quiet).
!kaggle competitions download -c dat300-2018-dogs -p data/
!unzip -n -q 'data/*.zip' -d data/
# Sanity check: count the extracted train and test images
!ls data/train/*.jpg | wc -l
!ls data/test/*.jpg | wc -l

sampleSubmission.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
labels.csv: Skipping, found more recently modified local copy (use --force to force download)
test.zip: Skipping, found more recently modified local copy (use --force to force download)
train.zip: Skipping, found more recently modified local copy (use --force to force download)

3 archives were successfully processed.
10290
10290


In [0]:
import time
import os
import re

import pandas
import numpy
import matplotlib.pyplot as plt

import sklearn

import sklearn.preprocessing
import sklearn.model_selection
import sklearn.pipeline

import sklearn.linear_model
import sklearn.ensemble
import sklearn.svm

import keras
import keras.wrappers.scikit_learn

In [0]:
# Breed label of every training image, indexed by the image id column
labels = pandas.read_csv('data/labels.csv', index_col='id')

# (rows, columns) followed by a preview of the first rows
print(labels.shape)
labels.head(3)

(10290, 1)


Unnamed: 0_level_0,breed
id,Unnamed: 1_level_1
0,soft-coated_wheaten_terrier
1,Tibetan_terrier
2,Lhasa


In [0]:
# Number of distinct classes, and how many labelled images each class has on average
print('number of breeds', len(labels.breed.unique()))
print('images per breed (average)', len(labels)/len(labels.breed.unique()))
 

number of breeds 120
images per breed (average) 85.75


Under 100 images per class is considered **very few** for training deep learning models.

In [0]:
# Create a pandas.DataFrame with the samples (id, filename)
def files_dataframe(directory):
  """List the dog<N>.jpg images in `directory` as a DataFrame indexed by N.

  Parameters
  ----------
  directory : str
      Directory containing image files named dog<N>.jpg.

  Returns
  -------
  pandas.DataFrame
      Columns 'filename' (base name) and 'filepath' (directory joined with
      the base name), indexed by the integer id N and sorted by id.

  Raises
  ------
  AssertionError
      If no sample is found or the first sample (lowest id) is not dog0.jpg.
  """
  import glob
  import re

  # Raw string with an escaped dot, matched against the whole base name so
  # that neither directory names nor unrelated .jpg files are picked up.
  pattern = re.compile(r'dog(\d+)\.jpg')

  files = {}
  for filename in glob.iglob(directory+'/*.jpg'):
    basename = os.path.basename(filename)
    m = pattern.fullmatch(basename)
    if m is None:
      continue  # not a dog<N>.jpg sample: skip instead of crashing
    files[int(m.group(1))] = basename

  ids = sorted(files)  # make sure ordered by id
  filenames = [files[i] for i in ids]
  df = pandas.DataFrame({
      'filename': filenames,
      'filepath': [ os.path.join(directory, f) for f in filenames ],
  }, index=ids)
  assert len(df) > 0 and df.filename.values[0] == 'dog0.jpg', \
      'expected samples starting at dog0.jpg in {}'.format(directory)
  return df

# Index the training images by id
train_files = files_dataframe('data/train')
_ = files_dataframe('data/test') # just ensure it runs without error
# Attach the breed label to each training image (join on the shared id index)
train_data = train_files.join(labels)
train_data.head(3)

Unnamed: 0,filename,filepath,breed
0,dog0.jpg,data/train/dog0.jpg,soft-coated_wheaten_terrier
1,dog1.jpg,data/train/dog1.jpg,Tibetan_terrier
2,dog2.jpg,data/train/dog2.jpg,Lhasa


## Transfer learning from pre-trained model

MobileNetV2 is a strong but small Convolutional Neural Network that has been pre-trained on the general ImageNet classification task. The ImageNet classes include cats and dogs (among them a number of individual dog breeds), so hopefully it has learned feature representations that can also be used to distinguish between dog breeds.

We compute its activation maps and use them as features for training a very simple classifier.

In [0]:

def build_model():
    """Create the pre-trained MobileNetV2 feature extractor.

    The network is built with ImageNet weights, without its classification
    top and with global average pooling, for 224x224 RGB input.

    Returns
    -------
    tuple
        (model, preprocess_input): the Keras model and the matching
        mobilenet_v2 input preprocessing function.
    """
    from keras.applications import mobilenet_v2

    # Both names must match the ones used in input_shape below
    # (previously `image_lh` was assigned but `image_h` was read: NameError).
    image_w, image_h = (224, 224)
    model = mobilenet_v2.MobileNetV2(weights='imagenet',
                           input_shape=(image_w, image_h, 3),
                           include_top=False, pooling='avg')
    preprocess_input = mobilenet_v2.preprocess_input
    return model, preprocess_input
  
# (model, preprocess_input) pair returned by build_model(); passed as the `model` argument of get_features
MobileNet = build_model()

In [0]:
def calculate_features(model, img_path, img_shape=(224, 224)):
    """Compute the feature vector of a single image.

    Parameters
    ----------
    model : tuple
        (network, preprocess_input) pair; the network must offer predict().
    img_path : str
        Path of the image file to load.
    img_shape : tuple
        Target size the image is loaded at.

    Returns
    -------
    The network prediction for the image, flattened to one dimension.
    """
    network, preprocess = model
    image_api = keras.preprocessing.image

    pixels = image_api.img_to_array(
        image_api.load_img(img_path, target_size=img_shape))
    # The network is fed a batch holding just this one image
    batch = preprocess(numpy.expand_dims(pixels, axis=0))
    return network.predict(batch).reshape(-1)

# Return existing features if found, else compute them and then return
def get_features(model, files, name,
                 outdir='/content/drive/My Drive/dat300-dogs',
                 version=3,
                 force_recompute=False):
  """Load cached image features, computing and caching them when needed.

  Parameters
  ----------
  model : passed through unchanged to calculate_features().
  files : iterable of image paths, one feature row is produced per path.
  name : str, identifies the feature set in the cache file name.
  outdir : str, directory holding the cache files.
  version : part of the cache file name (features_<name>_<version>.npy).
  force_recompute : bool, delete an existing cache file first.

  Returns
  -------
  numpy.ndarray with one row per entry of `files`.
  """
  def filepath(name):
    return os.path.join(outdir, "features_{}_{}.npy".format(name, version))

  def load_cached(path, n_expected):
    # Returns the cached array, or None when it is missing or unusable
    if not os.path.exists(path):
      print('could not find {}: computing'.format(path))
      return None
    with open(path, 'rb') as f:  # close the handle instead of leaking it
      cached = numpy.load(f)
    if cached.shape[0] != n_expected:
      # A cache written for a different list of files would silently
      # misalign features and samples, so it is discarded.
      print('cached {} has {} rows, expected {}: recomputing'.format(
          path, cached.shape[0], n_expected))
      return None
    return cached

  def compute_and_store(inputfiles, path):
    dirpath = os.path.dirname(path)
    if dirpath:  # empty when the cache lives in the working directory
      os.makedirs(dirpath, exist_ok=True)
    computed = numpy.array([calculate_features(model, f) for f in inputfiles])
    with open(path, 'wb') as f:  # closed (and flushed) on leaving the block
      numpy.save(f, computed)
    return computed

  def delete_if_exists(path):
      if os.path.exists(path):
        os.unlink(path)

  featuresfile = filepath(name)
  inputfiles = list(files)

  if force_recompute:
    print('forcing recompute of', featuresfile)
    delete_if_exists(featuresfile)

  features = load_cached(featuresfile, len(inputfiles))
  if features is None:
    features = compute_and_store(inputfiles, featuresfile)
  return features

# test on a small (fast) subset of data
# version=0 gives this check its own cache file (the default is version=3), and
# force_recompute=True deletes that file first so the features are really computed.
_features = get_features(MobileNet, train_data.filepath[0:10], 'mobilenet_train', version=0, force_recompute=True)
# one row per input image ...
assert _features.shape[0] == 10, _features.shape
# ... and 1280 feature values per image
assert _features.shape[1] == 1280, _features.shape
'ok'
  

forcing recompute of /content/drive/My Drive/dat300-dogs/features_mobilenet_train_0.npy
could not find /content/drive/My Drive/dat300-dogs/features_mobilenet_train_0.npy: computing


'ok'

In [0]:
# Features for the full test and training sets; get_features loads them from
# its cache file when one exists and computes (then stores) them otherwise.
test_data = files_dataframe('data/test')
mobilenet_features_test = get_features(MobileNet, test_data.filepath, 'mobilenet_test')
mobilenet_features_train = get_features(MobileNet, train_data.filepath, 'mobilenet_train')

In [0]:
def train_logreg(X, y):
  """Fit a multinomial logistic regression and report train/validation scores.

  A fixed 67/33 split (random_state=42) is held out for validation; log loss
  and accuracy are printed for both parts.

  Parameters
  ----------
  X : feature matrix, one row per sample.
  y : one class label per sample.

  Returns
  -------
  The fitted LogisticRegression. Its classes are the positions of the labels
  in the sorted set of unique values of `y` (identical to the labels
  themselves when `y` already holds the integers 0..n_classes-1).
  """
  from sklearn.model_selection import train_test_split
  from sklearn.linear_model import LogisticRegression
  # Imported explicitly: the notebook never imports sklearn.metrics itself and
  # only reached it through a transitive import.
  from sklearn.metrics import accuracy_score, log_loss

  # Integer class codes, replacing the former one-hot encode/decode round trip
  _, y_codes = numpy.unique(numpy.asarray(y).ravel(), return_inverse=True)

  X_train, X_val, y_train, y_val = train_test_split(X, y_codes,
                                                    test_size=0.33, random_state=42)
  estimator = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=1)
  estimator.fit(X_train, y_train)

  def scores(X_part, y_part):
    # labels= keeps the probability columns aligned with the classes even when
    # a class happens to be absent from this part of the split
    loss = log_loss(y_part, estimator.predict_proba(X_part), labels=estimator.classes_)
    acc = accuracy_score(y_part, estimator.predict(X_part))
    return loss, acc

  train_loss, train_acc = scores(X_train, y_train)
  print('train loss={:.2f} accuracy={:.1f}%'.format(train_loss, train_acc*100))

  val_loss, val_acc = scores(X_val, y_val)
  print('val   loss={:.2f} accuracy={:.1f}%'.format(val_loss, val_acc*100))

  return estimator

# Breed names -> integer class ids. The encoder is kept because its classes_
# attribute supplies the column names of the submission file.
labelencoder = sklearn.preprocessing.LabelEncoder()
y_ = labelencoder.fit_transform(train_data.breed)

# Simple classifier on top of the MobileNet features
mobilenet_logreg = train_logreg(mobilenet_features_train, y_)


train loss=0.01 accuracy=99.9%
val   loss=0.64 accuracy=80.9%


## Kaggle submission

In [0]:
import subprocess


def make_submission(competition, submit=True, model=None, X=None, classes=None,
                    name='MobileNetsV2 LogisticRegresion'):
  """Write class probabilities to a CSV file and optionally submit it to Kaggle.

  Parameters
  ----------
  competition : str, Kaggle competition to submit to.
  submit : bool, when False only the CSV file is written.
  model : object with predict_proba(); defaults to the global mobilenet_logreg.
  X : features to predict on; defaults to the global mobilenet_features_test.
  classes : column name of each probability column, in the order returned by
      predict_proba(); defaults to labelencoder.classes_.
  name : str, used as submission message and (spaces replaced by '_') in the
      file name.

  Returns
  -------
  str, name of the CSV file that was written.
  """
  # Fall back to the notebook globals (formerly the only supported inputs)
  if model is None:
    model = mobilenet_logreg
  if X is None:
    X = mobilenet_features_test
  if classes is None:
    classes = labelencoder.classes_

  y = model.predict_proba(X)
  # Explicit column list: 'id' first, then the classes in their given order
  # (the former set() arithmetic produced an arbitrary column order).
  predictions = pandas.DataFrame(y, columns=list(classes))
  # NOTE(review): ids are the row positions, i.e. rows of X are assumed to be
  # ordered by test image id starting at 0 — confirm against the caller.
  predictions.insert(0, 'id', list(range(len(predictions))))

  filename = 'pred_{}.csv'.format(name.replace(' ', '_'))
  predictions.to_csv(filename, index=False)

  if submit:
    args = [
        'kaggle', 'competitions', 'submit',
        '-c', competition,
        '-f', filename,
        '-m', name,
    ]
    try:
      out = subprocess.check_output(args, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
      print('error', e)
      # Show what the tool reported; `out` used to be unbound on this path,
      # turning a failed submission into a NameError.
      out = e.output or b''
    print(out.decode('utf-8', errors='replace'))

  return filename
  
# Write the prediction CSV and, because submit=True, upload it with the kaggle
# command-line client; the cell displays the returned CSV file name.
make_submission('dat300-2018-dogs', submit=True)



'pred_MobileNetsV2_LogisticRegresion.csv'