In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the book listing. The file is read with header=None, so the column
# names are assigned explicitly right after loading.
book_columns = [
    'img_idx',
    'img_file',
    'img_link',
    'title',
    'author',
    'cat_id',
    'category',
]
df = pd.read_csv('books.csv', encoding="latin", header=None)
df.columns = book_columns

# Preview the first rows.
df.head()

Unnamed: 0,img_idx,img_file,img_link,title,author,cat_id,category
0,761183272,0761183272.jpg,http://ecx.images-amazon.com/images/I/61Y5cOdH...,Mom's Family Wall Calendar 2016,Sandra Boynton,3,Calendars
1,1623439671,1623439671.jpg,http://ecx.images-amazon.com/images/I/61t-hrSw...,Doug the Pug 2016 Wall Calendar,Doug the Pug,3,Calendars
2,B00O80WC6I,B00O80WC6I.jpg,http://ecx.images-amazon.com/images/I/41X-KQqs...,"Moleskine 2016 Weekly Notebook, 12M, Large, Bl...",Moleskine,3,Calendars
3,761182187,0761182187.jpg,http://ecx.images-amazon.com/images/I/61j-4gxJ...,365 Cats Color Page-A-Day Calendar 2016,Workman Publishing,3,Calendars
4,1578052084,1578052084.jpg,http://ecx.images-amazon.com/images/I/51Ry4Tsq...,Sierra Club Engagement Calendar 2016,Sierra Club,3,Calendars


In [3]:
# Show the distinct category labels present in the data.
df['category'].unique()

array(['Calendars', 'Comics & Graphic Novels', 'Test Preparation',
       'Mystery, Thriller & Suspense', 'Science Fiction & Fantasy',
       'Romance', 'Humor & Entertainment', 'Literature & Fiction',
       'Gay & Lesbian', 'Engineering & Transportation',
       'Cookbooks, Food & Wine', 'Crafts, Hobbies & Home',
       'Arts & Photography', 'Education & Teaching',
       'Parenting & Relationships', 'Self-Help', 'Computers & Technology',
       'Medical Books', 'Science & Math', 'Health, Fitness & Dieting',
       'Business & Money', 'Law', 'Biographies & Memoirs', 'History',
       'Politics & Social Sciences', 'Reference',
       'Christian Books & Bibles', 'Religion & Spirituality',
       'Sports & Outdoors', 'Teen & Young Adult', "Children's Books",
       'Travel'], dtype=object)

In [4]:
# Balance the classes so that every category ends up with exactly
# `sample_size` rows: categories with fewer rows are upsampled (drawn with
# replacement), all others are downsampled (drawn without replacement).
from sklearn.utils import resample

sample_size = 1000

df_resampled = pd.concat([
    resample(
        category_rows,
        replace=len(category_rows) < sample_size,
        n_samples=sample_size,
        random_state=123,
    )
    for _, category_rows in df.groupby('category')
])

# From here on, `df` refers to the balanced frame.
df = df_resampled

In [5]:
# Sanity checks on the balanced frame.

# How many distinct image files survived the resampling?
n_image_files = df_resampled['img_file'].nunique()
print('There are ', n_image_files, ' unique image files in our resampled data')

# How many categories are there?
n_categories = df_resampled['category'].nunique()
print('There are ', n_categories, ' categories')

# True only if every single category holds exactly `sample_size` rows.
correct_size = all(
    len(category_rows) == sample_size
    for _, category_rows in df_resampled.groupby('category')
)

print('Categories have the correct sample size: ', correct_size)

There are  32000  unique image files in our resampled data
There are  32  categories
Categories have the correct sample size:  True


In [6]:
# Split the balanced frame into training and test portions.
from sklearn.model_selection import train_test_split

# The target column is intentionally NOT dropped from x_train / x_test:
# flow_from_dataframe is handed a single frame and reads both the file-name
# column and the label column from it.
target = df['category']

# random_state pins the split so that re-running the notebook reproduces it;
# stratify keeps the per-category proportions identical in both portions.
# NOTE(review): the split happens after resampling, so a row duplicated by
# upsampling with replacement could end up in both portions -- confirm that no
# category was upsampled before trusting the validation metrics.
x_train, x_test, y_train, y_test = train_test_split(
    df, target, test_size=0.2, random_state=123, stratify=target
)

# Give every piece a clean 0..n-1 index. drop=True discards the old index
# rather than inserting it as an extra 'index' column (which would also turn
# the label Series into DataFrames).
x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [7]:
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.inception_v3 import preprocess_input as inception_preprocess_input
from PIL import ImageFile

# Allow PIL to load image files that are truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True

directory = 'pictures/'
batch_size = 512

# One fixed, sorted label list shared by both generators, so a given category
# always maps to the same output index regardless of which frame is passed in.
class_names = sorted(df['category'].unique())


def make_cover_generator(frame, shuffle):
    """Build a batch generator of cover images and one-hot category labels.

    Args:
        frame: DataFrame with an 'img_file' column (file names relative to
            `directory`) and a 'category' column (labels).
        shuffle: whether the generator shuffles the sample order.

    Returns:
        The iterator returned by ImageDataGenerator.flow_from_dataframe,
        yielding batches of `batch_size` images resized to 256x256 RGB.
    """
    # Use InceptionV3's own preprocessing instead of a plain 1/255 rescale:
    # the pre-trained base is frozen, so its inputs should be scaled the way
    # the ImageNet weights expect.
    image_data_generator = ImageDataGenerator(
        preprocessing_function=inception_preprocess_input
    )
    return image_data_generator.flow_from_dataframe(
        frame,
        directory,
        x_col='img_file',
        y_col='category',
        has_ext=True,
        target_size=(256, 256),
        color_mode='rgb',
        classes=class_names,
        class_mode='categorical',
        batch_size=batch_size,
        shuffle=shuffle,
        interpolation='nearest',
    )


train_generator = make_cover_generator(x_train, shuffle=True)

# Keep the evaluation data in a fixed order so that predictions can later be
# lined up with the rows of x_test.
test_generator = make_cover_generator(x_test, shuffle=False)


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Found 23890 images belonging to 32 classes.
Found 5964 images belonging to 32 classes.


In [8]:
# Number of distinct categories; used as the width of the output layer.
num_classes = df['category'].nunique()

In [9]:
from keras.layers import Dense,GlobalAveragePooling2D
from keras.applications import MobileNet
from keras.preprocessing import image
from keras.applications.mobilenet import preprocess_input
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Model
from keras.optimizers import Adam

In [10]:
from keras.applications import inception_v3

# InceptionV3 with ImageNet weights and without its original classification
# head (include_top=False); this is the transfer-learning base.
base_model = inception_v3.InceptionV3(weights='imagenet', include_top=False)

# Our own classification head, stacked on the base model's output:
x = base_model.output
# collapse each feature map to a single value,
x = GlobalAveragePooling2D()(x)
# a fully-connected layer with 1024 units,
x = Dense(1024, activation='relu')(x)
# and a softmax layer with one output per category.
predictions = Dense(num_classes, activation='softmax')(x)

# The model we will train. Keras 2 names these keyword arguments
# `inputs` / `outputs`; the singular forms are the deprecated Keras 1 spelling.
model = Model(inputs=base_model.input, outputs=predictions)




In [11]:
# Freeze the transferred part of the network: the first
# `num_transfer_layers` layers of `model` are marked non-trainable, so only
# the layers added on top get updated during training.
num_transfer_layers = len(base_model.layers)
transfer_layers = model.layers[:num_transfer_layers]

for transfer_layer in transfer_layers:
    transfer_layer.trainable = False

In [12]:
# One epoch is one full pass over the training data.
EPOCHS = 1

# The head is a softmax over mutually exclusive categories and the generators
# yield one-hot labels (class_mode='categorical'), so the matching loss is
# categorical cross-entropy. With 'binary_crossentropy', Keras also resolves
# the 'accuracy' metric to binary accuracy, which reports misleadingly high
# values for a multi-class problem.
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Take the step counts from the generators themselves rather than from the
# DataFrame lengths: the generators report fewer images found than there are
# rows in the frames, and len(generator) is the number of batches per pass.
history = model.fit_generator(
    train_generator,
    validation_data=test_generator,
    steps_per_epoch=len(train_generator),
    validation_steps=len(test_generator),
    use_multiprocessing=True,
    max_queue_size=12,
    epochs=EPOCHS,
)


Epoch 1/1


Process ForkPoolWorker-52:
Process ForkPoolWorker-56:
Process ForkPoolWorker-41:
Process ForkPoolWorker-46:
Process ForkPoolWorker-6:
Process ForkPoolWorker-39:
Process ForkPoolWorker-9:
Process ForkPoolWorker-31:
Process ForkPoolWorker-22:
Process ForkPoolWorker-63:
Process ForkPoolWorker-28:
Process ForkPoolWorker-38:
Process ForkPoolWorker-62:
Process ForkPoolWorker-48:
Process ForkPoolWorker-5:
Process ForkPoolWorker-54:
Process ForkPoolWorker-64:
Process ForkPoolWorker-42:
Process ForkPoolWorker-27:
Process ForkPoolWorker-12:
Process ForkPoolWorker-60:
Process ForkPoolWorker-15:
Process ForkPoolWorker-35:
Process ForkPoolWorker-18:
Traceback (most recent call last):
Process ForkPoolWorker-32:
Process ForkPoolWorker-34:
Process ForkPoolWorker-2:
Process ForkPoolWorker-58:
Process ForkPoolWorker-25:
Process ForkPoolWorker-53:
Process ForkPoolWorker-13:
Process ForkPoolWorker-44:
Process ForkPoolWorker-30:
Process ForkPoolWorker-17:
Process ForkPoolWorker-10:
Process ForkPoolWorker-5

  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/p

  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 2203, in next
    return self._get_batches_of_transformed_samples(index_array)
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 2203, in next
    return self._get_batches_of_transformed_samples(index_array)
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 1526, in __next__
    return self.next(*args, **kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 1526, in __next__
    return self.next(*args, **kwargs)
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 2163, in _get_batches_of_transformed_samples
    batch_x[i] = x
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent cal

  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/lib/python3.

  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/lib/python3.6/multipr

  File "/anaconda3/lib/python3.6/site-packages/keras/utils/data_utils.py", line 626, in next_sample
    return six.next(_SHARED_SEQUENCES[uid])
  File "/anaconda3/lib/python3.6/site-packages/keras/utils/data_utils.py", line 626, in next_sample
    return six.next(_SHARED_SEQUENCES[uid])
  File "/anaconda3/lib/python3.6/site-packages/keras/utils/data_utils.py", line 626, in next_sample
    return six.next(_SHARED_SEQUENCES[uid])
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 1526, in __next__
    return self.next(*args, **kwargs)
  File "/anaconda3/lib/python3.6/site-packages/keras/utils/data_utils.py", line 626, in next_sample
    return six.next(_SHARED_SEQUENCES[uid])
  File "/anaconda3/lib/python3.6/site-packages/keras/utils/data_utils.py", line 626, in next_sample
    return six.next(_SHARED_SEQUENCES[uid])
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 1526, in __next__
    return self.next(*args, **kwargs)
  

  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 1526, in __next__
    return self.next(*args, **kwargs)
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 2203, in next
    return self._get_batches_of_transformed_samples(index_array)
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 1526, in __next__
    return self.next(*args, **kwargs)
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 1526, in __next__
    return self.next(*args, **kwargs)
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 1526, in __next__
    return self.next(*args, **kwargs)
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 1526, in __next__
    return self.next(*args, **kwargs)
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 2203, in next
    return self._get_batches_of_transformed_sa

  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 2163, in _get_batches_of_transformed_samples
    batch_x[i] = x
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 2163, in _get_batches_of_transformed_samples
    batch_x[i] = x
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 2163, in _get_batches_of_transformed_samples
    batch_x[i] = x
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 2163, in _get_batches_of_transformed_samples
    batch_x[i] = x
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 2203, in next
    return self._get_batches_of_transformed_samples(index_array)
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 2163, in _get_batches_of_transformed_samples
    batch_x[i] = x
  File "/anaconda3/lib/python3.6/site-packages/keras_preprocessing/image.py", line 2163, in _get_batche

KeyboardInterrupt: 

In [None]:
#TODO: get live training vs validation loss graph so we can visualise overfitting... and speed this up somehow! 
#TODO: tune hyperparameters to optimise for ... f1 score? probably.
#TODO: figure out how to actually use the model for prediction