In [None]:
import os
import hashlib
import numpy as np
from glob import glob
from random import shuffle
from keras import optimizers
from skimage.color import gray2rgb
from scipy.misc import imread, imsave
from keras.applications import InceptionV3
from keras.models import Model, Sequential
from keras.layers.core import Dense, Flatten
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Halt training once validation loss has gone 10 epochs without improving.
# NOTE: earlystop, checkpoint and reducelr are all re-created in the
# "Define callbacks" cell further down (there with patience = 2 for reducelr).
earlystop = EarlyStopping(
    monitor="val_loss",
    patience=10,
    verbose=1,
    mode="auto",
)

# Checkpoint to "inceptionv3.hdf5" at the end of an epoch, keeping only the
# best model seen so far.
checkpoint = ModelCheckpoint(
    filepath="inceptionv3.hdf5",
    verbose=1,
    save_best_only=True,
)

# Scale the learning rate by 0.2 once validation loss has stalled for 5 epochs.
reducelr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.2,
    patience=5,
)

TARGET_SIZE = (299, 299)  # Input shape for Inception v3
BATCH_SIZE = 32  # Batch size for training


    





def define_model():
    '''Build and compile the transfer-learning classifier on Inception V3.

    The ImageNet-pretrained Inception V3 base is loaded without its top
    layers and every one of its layers is frozen. A new head
    (Flatten -> Dense(256, relu) -> Dense(n, softmax)) is attached, where n
    is the number of entries found in "dataset//train". The model is compiled
    with SGD (lr 1e-4, momentum 0.9) and categorical cross-entropy, its
    summary is printed, and the model is returned.

    NOTE: a function with this same name is defined again later in the
    notebook; whichever definition was executed last is the one in effect.
    '''
    base_model = InceptionV3(weights="imagenet",
                             include_top=False,
                             input_shape=(299, 299, 3))

    # Freeze the pre-trained layers so only the new head is trained.
    for layer in base_model.layers:
        layer.trainable = False

    add_model = Sequential()
    add_model.add(Flatten(input_shape=base_model.output_shape[1:]))
    add_model.add(Dense(256, activation="relu"))
    # One softmax output per entry in the training folder.
    add_model.add(Dense(len(os.listdir("dataset//train")), activation="softmax"))

    model = Model(inputs=base_model.input, outputs=add_model(base_model.output))
    sgd = optimizers.SGD(lr=1e-4, momentum=0.9)
    model.compile(loss="categorical_crossentropy",
                  optimizer=sgd,
                  metrics=["accuracy"])
    model.summary()
    return model

In [None]:
def clean_train(train_folder):
    '''Delete training images whose exact content appears under several classes.

    Every file matching ``<train_folder>/<class>/*.jpg`` is hashed with MD5.
    When the same hash is found in more than one class folder, the label of
    that image is ambiguous, so every copy of it is removed from disk.
    Duplicates that all live inside a single class folder are left alone.

    Parameters
    ----------
    train_folder : str
        Directory containing one sub-directory per class.

    Returns
    -------
    None. Files are deleted in place and a short report is printed.
    '''
    hashes = {}  # md5 hex digest -> list of file paths with that content

    print("computing md5 of training data")

    for fname in glob(train_folder+"/*/*.jpg"):
        # Use a context manager so the file handle is closed straight away.
        with open(fname, "rb") as image_file:
            digest = hashlib.md5(image_file.read()).hexdigest()
        hashes.setdefault(digest, []).append(fname)

    # Groups of files that share identical content
    duplicated = [paths for paths in hashes.values() if len(paths) > 1]
    print("Files appearing more than once in train: ", len(duplicated))

    del_files = []

    # Keep only the groups whose copies are spread over different classes.
    for paths in duplicated:
        # The class label is the name of the directory holding the file;
        # derive it with os.path so it works whatever separator glob returns.
        classes = {os.path.basename(os.path.dirname(path)) for path in paths}
        if len(classes) > 1:
            del_files.extend(paths)

    for path in del_files:
        os.remove(path)

    print(len(del_files), "images deleted from training set")

In [None]:
def process_test_images(test_folder):
    '''Rewrite, in place, every test image that is not 3-channel.

    Each entry of ``test_folder`` is read with ``imread``. Images with fewer
    than 3 dimensions (grayscale) are expanded with ``gray2rgb``; images with
    a 4th channel have that last channel dropped. Converted images are saved
    back over the original file; images that already have 3 channels are
    left untouched.

    Parameters
    ----------
    test_folder : str
        Directory whose entries are all image files.
    '''
    for img in os.listdir(test_folder):
        img_path = os.path.join(test_folder, img)
        img_file = imread(img_path)
        if img_file.ndim < 3:
            # Grayscale: replicate the single channel three times.
            img_file = gray2rgb(img_file)
        elif img_file.ndim == 3 and img_file.shape[2] == 4:
            # A 4-channel image has shape (H, W, 4), i.e. still 3 dimensions,
            # so the channel count must be read from the last axis.
            img_file = img_file[:, :, :3]
        else:
            # Nothing to convert; do not rewrite the file.
            continue
        # NOTE(review): saving a float32 array may make imsave rescale the
        # intensities; kept as in the original - confirm this is intended.
        img_file = img_file.astype(np.float32, copy = False)
        imsave(img_path, img_file)

In [None]:
def find_leak(train_folder, test_folder):
    '''Find test images whose exact content also exists in the training set.

    Training files matching ``<train_folder>/<class>/*.jpg`` are hashed with
    MD5, then every ``<test_folder>/*.jpg`` is hashed and looked up.

    Parameters
    ----------
    train_folder : str
        Directory containing one sub-directory per class.
    test_folder : str
        Directory that directly contains the test ``.jpg`` files.

    Returns
    -------
    list of (str, str)
        One ``(test file name, class folder name)`` pair per leaked test
        image, where the class is that of the first training file found
        with the same content.
    '''
    hashes = {}  # md5 hex digest -> list of training file paths

    print("computing md5 of training data")

    for fname in glob(train_folder+"/*/*.jpg"):
        # Use a context manager so the file handle is closed straight away.
        with open(fname, "rb") as image_file:
            digest = hashlib.md5(image_file.read()).hexdigest()
        hashes.setdefault(digest, []).append(fname)

    print("comparing training and test set")

    leaks = []
    for fname in glob(test_folder+"/*.jpg"):
        with open(fname, "rb") as image_file:
            digest = hashlib.md5(image_file.read()).hexdigest()
        if digest in hashes:
            # File name and class folder are taken with os.path so they are
            # correct whatever path separator glob returns.
            train_class = os.path.basename(os.path.dirname(hashes[digest][0]))
            leaks.append((os.path.basename(fname), train_class))

    print("Number of test images present in train:{}".format(len(leaks)))
    return leaks

In [None]:
def process_train_images(train_folder):
    '''Rewrite, in place, every training image that is not 3-channel.

    Every class sub-directory of ``train_folder`` is walked and each image is
    read with ``imread``. Images with fewer than 3 dimensions (grayscale) are
    expanded with ``gray2rgb``; images with a 4th channel have that last
    channel dropped. Converted images are saved back over the original file;
    images that already have 3 channels are left untouched.

    Parameters
    ----------
    train_folder : str
        Directory containing one sub-directory per class.
    '''
    for cla in os.listdir(train_folder):
        # Build paths from the argument rather than a hard-coded location.
        cla_path = os.path.join(train_folder, cla)
        if not os.path.isdir(cla_path):
            # Stray files next to the class folders are not classes.
            continue
        for img in os.listdir(cla_path):
            img_path = os.path.join(cla_path, img)
            img_file = imread(img_path)
            if img_file.ndim < 3:
                # Grayscale: replicate the single channel three times.
                img_file = gray2rgb(img_file)
            elif img_file.ndim == 3 and img_file.shape[2] == 4:
                # A 4-channel image has shape (H, W, 4), i.e. still 3
                # dimensions, so the channel count is read from the last axis.
                img_file = img_file[:, :, :3]
            else:
                # Nothing to convert; do not rewrite the file.
                continue
            # NOTE(review): saving a float32 array may make imsave rescale the
            # intensities; kept as in the original - confirm this is intended.
            img_file = img_file.astype(np.float32, copy = False)
            imsave(img_path, img_file)
                

In [None]:
# Pre-processing function for Inception v3 model
def preprocess_input(x):
    '''Map pixel values linearly so that 0 -> -1 and 255 -> 1.

    Computes ``(x / 255 - 0.5) * 2``.

    Parameters
    ----------
    x : numpy.ndarray
        Image data. Float arrays are modified in place and the same object is
        returned. Integer and boolean arrays cannot hold the result, so they
        are first copied to float32 and the input is left untouched.

    Returns
    -------
    numpy.ndarray
        The rescaled array.
    '''
    if isinstance(x, np.ndarray) and x.dtype.kind in "biu":
        # In-place true division raises on integer arrays; work on a float copy.
        x = x.astype(np.float32)
    x /= 255.
    x -= 0.5
    x *= 2.
    return x

def remove_percentage(list_a, percentage):
    '''Randomly split off a fraction of a list.

    ``list_a`` is shuffled in place, then its last
    ``int(len(list_a) * percentage)`` items are removed from it and returned
    as a new list. When that count is zero, an empty list is returned and
    ``list_a`` is only shuffled.
    '''
    shuffle(list_a)
    n_picked = int(len(list_a) * percentage)
    if n_picked == 0:
        return []
    # Copy the tail out first, then cut it off the original list.
    picked = list_a[-n_picked:]
    del list_a[-n_picked:]
    return picked

In [None]:
           


def create_val_set(val_size):
    '''Move a random share of each class's training images into a validation set.

    For every entry of "dataset//train" a matching folder is ensured under
    "dataset//valid". Files already present in that validation folder are
    first moved back to the training folder, so calling this again produces
    a fresh split. Then the fraction ``val_size`` of the class's training
    files, chosen by ``remove_percentage``, is moved to the validation folder.
    '''
    if not os.path.exists("dataset//valid"):
        os.makedirs("dataset//valid")

    for cla in os.listdir("dataset//train"):
        train_dir = os.path.join("dataset", "train", cla)
        valid_dir = os.path.join("dataset", "valid", cla)

        if not os.path.exists(valid_dir):
            os.makedirs(valid_dir)
        else:
            # Undo any earlier split (a no-op when the folder is empty).
            for nf in os.listdir(valid_dir):
                os.rename(os.path.join(valid_dir, nf),
                          os.path.join(train_dir, nf))

        # Pick the held-out files and move them across.
        held_out = remove_percentage(os.listdir(train_dir), val_size)
        for nf in held_out:
            os.rename(os.path.join(train_dir, nf),
                      os.path.join(valid_dir, nf))

In [None]:
def define_model():
    '''Return a compiled Inception V3 model with a new classification head.

    Loads Inception V3 with ImageNet weights and no top layers, marks all of
    its layers as non-trainable, and feeds its output through a small
    Sequential head: Flatten, Dense(256, relu), and a softmax Dense layer
    with one unit per entry in "dataset//train". The model is compiled with
    categorical cross-entropy and SGD (lr 1e-4, momentum 0.9), tracks
    accuracy, and has its summary printed before being returned.
    '''
    base_model = InceptionV3(
        weights="imagenet",
        include_top=False,
        input_shape=(299, 299, 3),
    )
    # Freeze the pre-trained layers; only the head below is trained.
    for frozen_layer in base_model.layers:
        frozen_layer.trainable = False

    add_model = Sequential([
        Flatten(input_shape=base_model.output_shape[1:]),
        Dense(256, activation="relu"),
        Dense(len(os.listdir("dataset//train")), activation="softmax"),
    ])

    model = Model(inputs=base_model.input, outputs=add_model(base_model.output))
    model.compile(
        loss="categorical_crossentropy",
        optimizer=optimizers.SGD(lr=1e-4, momentum=0.9),
        metrics=["accuracy"],
    )
    model.summary()
    return model

# End of helper function definitions

In [None]:
# Install psutil (imported a few cells below).
! pip install psutil

In [None]:
# Pin Keras to version 2.1.4.
# NOTE(review): keras is already imported by the first cell of this notebook,
# so a version change made here presumably only takes effect after the
# runtime is restarted - confirm the install order.
! pip install keras==2.1.4

In [None]:
# Install/upgrade PyDrive quietly; it is used below to talk to Google Drive.
!pip install -U -q PyDrive

In [None]:
import keras
import psutil
import pandas as pd
import tensorflow as tf
from google.colab import auth
from google.colab import files
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import GoogleCredentials

In [None]:
# Display the name of the GPU device TensorFlow reports, to check that a GPU
# runtime is attached.
tf.test.gpu_device_name()

In [None]:
# Authenticate the Colab user, then build a PyDrive client that uses the
# resulting application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Fetch the Drive file with this hard-coded id and store it locally as
# dataset.zip.
file1 = drive.CreateFile({'id':'1Pqjs7c5MAgPQLiMA3YR8dFg_YHLnPdlO'})
file1.GetContentFile('dataset.zip')

In [None]:
!unzip dataset.zip -dq ./

In [None]:
#file2 = drive.CreateFile({'id':'18H1VBB5It4tneouiOhTqSQjFscOomAum'})
#file2.GetContentFile('ernest.py')
#from ernest import *

Delete files appearing in more than one class in training set

In [None]:
# Destructive step: clean_train removes files from the training folder on disk.
clean_train("dataset//train")

In [None]:

Check whether any file appears in both training and test set

In [None]:
# find_leak only looks at *.jpg files directly inside the test folder it is
# given, and the test images live in the "data" sub-folder of dataset//test
# (the folder handed to process_test_images below), so point it there.
_ = find_leak("dataset//train", "dataset//test//data")

In [None]:
# Convert all training images to 3 channels (files are rewritten in place)
process_train_images("dataset//train")

In [None]:

# Convert all test images to 3 channels (files are rewritten in place)
process_test_images("dataset//test//data")

In [None]:
# Move 20% of each class's training images into dataset/valid.
create_val_set(0.2)

In [None]:

# Load the pre-trained Inception V3 model
model = define_model()

# Define data generators (very important)

In [None]:
# Training images are randomly augmented (rotation, shifts, shear, zoom and
# horizontal flips) and passed through preprocess_input.
train_datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    preprocessing_function=preprocess_input,
)

# Validation and test images get the same preprocessing but no augmentation.
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

In [None]:
# Augmented training batches, read from the per-class folders of dataset/train
# with one-hot ("categorical") labels.
train_generator = train_datagen.flow_from_directory(
    "dataset//train",
    target_size=TARGET_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="categorical",
)

In [None]:
# Validation batches, read from the per-class folders of dataset/valid with
# one-hot ("categorical") labels and no augmentation.
validation_generator = test_datagen.flow_from_directory(
    "dataset//valid",
    target_size=TARGET_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="categorical",
)

In [None]:
# Test batches: no labels (class_mode=None) and no shuffling, so predictions
# stay aligned with test_generator.filenames.
test_generator = test_datagen.flow_from_directory(
    "dataset//test",
    target_size=TARGET_SIZE,
    batch_size=BATCH_SIZE,
    class_mode=None,
    shuffle=False,
)


Define callbacks

In [None]:
# Halt training once validation loss has gone 10 epochs without improving.
earlystop = EarlyStopping(
    monitor="val_loss",
    patience=10,
    verbose=1,
    mode="auto",
)

# Checkpoint to "inceptionv3.hdf5" at the end of an epoch, keeping only the
# best model seen so far.
checkpoint = ModelCheckpoint(
    filepath="inceptionv3.hdf5",
    verbose=1,
    save_best_only=True,
)

# Scale the learning rate by 0.2 once validation loss has stalled for 2 epochs.
reducelr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.2,
    patience=2,
)


Fit the model

In [None]:

# Train for 10 epochs, evaluating on the validation generator.
# NOTE(review): with epochs = 10 the EarlyStopping patience of 10 looks
# unreachable - confirm the intended values.
model.fit_generator(
    generator=train_generator,
    epochs=10,
    callbacks=[checkpoint, earlystop, reducelr],
    validation_data=validation_generator,
    verbose=1,
)

Predict on test set

In [None]:

# Run the model over the whole (unshuffled) test generator.
predictions = model.predict_generator(test_generator, verbose=1)

In [None]:
# Turn each row of class scores into the index of its highest-scoring class.
predictions = np.argmax(predictions, axis = 1)
# class_indices maps class name -> index; invert it once instead of rebuilding
# and searching the key/value lists for every single prediction.
labels = train_generator.class_indices
index_to_label = {index: name for name, index in labels.items()}
predictions = [index_to_label[i] for i in predictions]

In [None]:
# Save the model: architecture as JSON, weights as a separate HDF5 file
json_model = model.to_json()
with open("inceptionV3.json", "w") as json_file:
    json_file.write(json_model)
    
model.save_weights("inceptionV3.h5")

In [None]:
# Build the prediction (submission) table, one row per test image.
# NOTE: this cell only builds the DataFrame; nothing here writes it to disk.
new_subm = pd.DataFrame({"filename": test_generator.filenames, "Superhero": predictions})
# Reduce each path to the bare file name without its extension. os.path is
# used so that nested folders, other path separators and dots inside the
# file name are handled correctly.
new_subm["filename"] = new_subm["filename"].apply(
    lambda x: os.path.splitext(os.path.basename(x))[0])

In [None]:
# Create a Drive file titled "inceptionV3" (no content is attached to it
# here), upload it, and print its title and id.
file3 = drive.CreateFile({"title": "inceptionV3"})
file3.Upload()
print('title: %s, id: %s' % (file3['title'], file3['id']))

In [None]:
# Refer to the Drive file with this hard-coded id and attach the locally saved
# weights file as its content.
# NOTE(review): the id is fixed rather than taken from the file created in the
# previous cell - confirm it points at the intended file.
file3 = drive.CreateFile({"id": "193czxV0V5nJa9qJtv7c-_UAUkUfcdTEf"})
file3.SetContentFile('./inceptionV3.h5')

In [None]:
# Upload file3 with the weights content attached in the previous cell.
file3.Upload()