<a href="https://colab.research.google.com/github/imazerty/1project/blob/master/Kaggle_Fruits360.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=None):
  """
  Print and plot a confusion matrix.

  Args:
    cm: 2-D numpy array; `cm[i, j]` is drawn at row i, column j and the
      rows are labelled 'True label', the columns 'Predicted label'.
    classes: sequence of tick labels, one per row/column of `cm`.
    normalize: when True, every row is divided by its sum before it is
      printed and plotted. Rows that sum to zero are left as zeros.
    title: title of the figure.
    cmap: matplotlib colormap handed to `plt.imshow`; defaults to
      `plt.cm.Blues` when None.

  Returns:
    None. The matrix is printed to stdout and shown with `plt.show()`.
  """
  # Imported here because the import cell of this file does not provide it.
  import itertools

  # Resolved at call time so the function can be defined before
  # matplotlib.pyplot has been imported as `plt`.
  if cmap is None:
    cmap = plt.cm.Blues

  if normalize:
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    # A class without any true sample has an all-zero row; keep it at 0
    # instead of producing NaN (and a runtime warning) from 0 / 0.
    cm = np.divide(cm.astype('float'), row_totals,
                   out=np.zeros(cm.shape, dtype='float'),
                   where=row_totals != 0)
    print("Normalized confusion matrix")
  else:
    print('Confusion matrix, without normalization')

  print(cm)

  plt.imshow(cm, interpolation='nearest', cmap=cmap)
  plt.title(title)
  plt.colorbar()
  tick_marks = np.arange(len(classes))
  plt.xticks(tick_marks, classes, rotation=45)
  plt.yticks(tick_marks, classes)

  # The 'd' format only accepts integers, so fall back to two decimals for
  # normalized matrices and for any matrix that is not integer-typed.
  use_int_format = (not normalize) and np.issubdtype(cm.dtype, np.integer)
  fmt = 'd' if use_int_format else '.2f'

  # Cells darker than half of the maximum get white text to stay readable.
  thresh = cm.max() / 2.
  for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], fmt),
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")

  plt.ylabel('True label')
  plt.xlabel('Predicted label')
  # Called after the axis labels exist so that they are part of the layout.
  plt.tight_layout()
  plt.show()


In [0]:
from __future__ import division, print_function
from builtins import range, input

import os
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.layers import Dense, Flatten
from keras.models import Model
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import confusion_matrix

# Transfer learning on the Fruits-360 images: a frozen VGG16 convolutional
# base with a single softmax layer on top, trained from directory generators.

# re-size all the images to this
IMAGE_SIZE = [100, 100]  # feel free to change depending on dataset

# training config:
epochs = 5
batch_size = 32

# https://www.kaggle.com/moltean/fruits
# train_path = '../large_files/fruits-360/Training'
# valid_path = '../large_files/fruits-360/Validation'
train_path = '../large_files/fruits-360-small/Training'
valid_path = '../large_files/fruits-360-small/Validation'

# useful for getting the number of files (matches both .jpg and .jpeg)
image_files = glob(train_path + '/*/*.jp*g')
valid_image_files = glob(valid_path + '/*/*.jp*g')

# useful for getting the number of classes: one sub-folder per class
folders = glob(train_path + '/*')

# look at an image for fun
plt.imshow(image.load_img(np.random.choice(image_files)))
plt.show()

# load VGG16 without its classifier head, sized for our images
vgg = VGG16(input_shape=IMAGE_SIZE + [3], weights='imagenet', include_top=False)

# don't train existing weights
for layer in vgg.layers:
  layer.trainable = False

# our layers - you can add more if you want
# A Dense (fully connected) layer needs a flat input, so flatten the
# convolutional output first.
x = Flatten()(vgg.output)
predictions = Dense(len(folders), activation='softmax')(x)

# create a model object
model = Model(inputs=vgg.input, outputs=predictions)

# view the structure of the model (summary is a method: it must be called)
model.summary()

# tell the model what cost and optimization method to use
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# create an instance of ImageDataGenerator
gen = ImageDataGenerator(
  rotation_range=20,
  width_shift_range=0.1,
  height_shift_range=0.1,
  shear_range=0.1,
  zoom_range=0.2,
  horizontal_flip=True,
  vertical_flip=True,
  # VGG-specific input preprocessing (the original note here: rearranges the
  # channels because VGG was trained using caffe)
  preprocessing_function=preprocess_input,
)

# test generator to see how it works and some other useful things

# get label mapping for confusion matrix plot later
test_gen = gen.flow_from_directory(valid_path, target_size=IMAGE_SIZE)
# class_indices is a dict: key = class name, value = class index
print(test_gen.class_indices)
# invert it into a list so that labels[class_index] == class_name
labels = [None] * len(test_gen.class_indices)
for k, v in test_gen.class_indices.items():
  labels[v] = k

# should be a strangely colored image (due to VGG weights being BGR) because
# we switched the channels
# The directory iterator yields (images, targets) tuples: a batch of images
# and the matching one-hot encoded labels.
for sample_images, sample_targets in test_gen:
  print('min = {} and max = {}'.format(sample_images[0].min(), sample_images[0].max()))  # not 0-255
  plt.title(labels[np.argmax(sample_targets[0])])
  plt.imshow(sample_images[0])
  plt.show()
  break  # one batch is enough, the iterator would go on forever

# create generators
train_generator = gen.flow_from_directory(
  train_path,
  target_size=IMAGE_SIZE,
  shuffle=True,
  batch_size=batch_size,
)

# NOTE(review): validation reuses the augmenting `gen`, so validation metrics
# are measured on randomly transformed images - consider a generator that only
# applies preprocess_input.
valid_generator = gen.flow_from_directory(
  valid_path,
  target_size=IMAGE_SIZE,
  shuffle=True,
  batch_size=batch_size,
)

# fit the model
r = model.fit_generator(
  train_generator,
  validation_data=valid_generator,
  epochs=epochs,
  steps_per_epoch=len(image_files) // batch_size,
  validation_steps=len(valid_image_files) // batch_size,
)
# scikit-learn already has a confusion matrix, but it needs two arrays
# (targets and predictions) which we don't have.
# We only have data generators, which we can use to build the arrays we need.
# Useful note: flow_from_directory never ends, so the number of steps is what
# stops the otherwise infinite loop.

def get_confusion_matrix(data_path, N):
  """
  Build the confusion matrix of `model` over the images under `data_path`.

  Batches are read with `gen.flow_from_directory(..., shuffle=False)` so that
  predictions and targets are collected in the same order. Per the note in
  this file the directory iterator never stops on its own, so reading ends
  once at least `N` samples have been collected.

  Args:
    data_path: directory passed to `flow_from_directory`.
    N: number of samples to collect before stopping.

  Returns:
    The result of `sklearn.metrics.confusion_matrix(targets, predictions)`
    computed over ALL collected batches.
  """
  print("Generating confusion matrix", N)
  predictions = []
  targets = []
  seen = 0
  batches = gen.flow_from_directory(
    data_path, target_size=IMAGE_SIZE, shuffle=False, batch_size=batch_size * 2)
  for x, y in batches:
    # model.predict gives one row of class probabilities per image and y is
    # one-hot encoded; argmax turns both into 1-D arrays of class indices.
    predictions.append(np.argmax(model.predict(x), axis=1))
    targets.append(np.argmax(y, axis=1))
    seen += len(y)
    if seen >= N:
      break

  # Must run after the loop: building the matrix inside the loop returned
  # after the first batch (or returned None when the break fired first).
  return confusion_matrix(np.concatenate(targets), np.concatenate(predictions))
  
# Confusion matrices over the training and validation images.
cm = get_confusion_matrix(train_path, len(image_files))
print(cm)
valid_cm = get_confusion_matrix(valid_path, len(valid_image_files))
print(valid_cm)

# plot some data

# loss
# `r` is what the fit call returned; its `history` dict holds one list of
# per-epoch values for each tracked quantity.
plt.plot(r.history['loss'], label='train loss')
plt.plot(r.history['val_loss'], label='val loss')
plt.legend()
plt.show()

# accuracy
# The key is 'acc' in older Keras releases and 'accuracy' in newer ones, so
# use whichever one this history actually contains.
acc_key = 'acc' if 'acc' in r.history else 'accuracy'
plt.plot(r.history[acc_key], label='train acc')
plt.plot(r.history['val_' + acc_key], label='val acc')
plt.legend()
# show must be called: a bare `plt.show` displays nothing and lets the next
# plot draw on top of this figure.
plt.show()

# confusion_matrix

plot_confusion_matrix(cm, labels, title='Train confusion matrix')
plot_confusion_matrix(valid_cm, labels, title='Validation confusion matrix')