In [1]:
import numpy as np
%matplotlib inline

### Imports utils data exploration

In [2]:
from utils_data_exploration import *

In [3]:
# Imported dataframes
df_train_labels
df_test_photo_to_biz_ids
photos_in_test_biz; # dictionary that maps a test buisness with photos ids

In [4]:
df_train_labels.head()

Unnamed: 0_level_0,labels,photos,n_photo
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000,"(1, 2, 3, 4, 5, 6, 7)","[438623, 325966, 227692, 407856, 368729, 16319...",54
1001,"(0, 1, 6, 8)","[298536, 20346, 8457, 308694, 349310, 407838, ...",9
100,"(1, 2, 4, 5, 6, 7)","[338465, 328433, 243861, 361777, 127198, 46652...",84
1006,"(1, 2, 4, 5, 6)","[46472, 341947, 396253, 75316, 42330, 244095, ...",22
1010,"(0, 6, 8)","[118251, 219940, 27517, 8578, 148347, 433559, ...",11


In [5]:
df_test_photo_to_biz_ids.head()

Unnamed: 0,photo_id,business_id
0,317818,003sg
1,30679,003sg
2,455084,003sg
3,371381,003sg
4,86224,003sg


In [27]:
len(df_test_photo_to_biz_ids['photo_id'].unique())

237152

In [33]:
df_test_photo_to_biz_ids[df_test_photo_to_biz_ids['photo_id'] == 3]

Unnamed: 0,photo_id,business_id
266268,3,7sa4h
943234,3,sip1t
1153375,3,z0xdo
1180870,3,zo14i


### Assign Directory Paths to Constant Variable Names

In [6]:
import os
import sys

In [19]:
%pwd # verify you are in the correct folder

'/home/javier/Documents/YelpRestaurantPhotoClassification/nbs'

In [7]:
LESSON_HOME_DIR = os.getcwd()
DATA_DIR = LESSON_HOME_DIR + '/../data/'
TRAIN_PATH = DATA_DIR + '/train_photos/'
TEST_PATH = DATA_DIR + '/test_photos/'
VALID_PATH = DATA_DIR + '/valid_photos/'
RESULTS_PATH = DATA_DIR + '/results/'

# 1) Map Train Photos to FC1 Representation Using VGG16-Imagenet

### Image Data Generator

In [58]:
# For every training image subtract the per channel mean of the imagenet dataset
vgg_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32).reshape((1, 1, 3))

def vgg_preprocess(x):
    """
        Subtracts the mean RGB value, and transposes RGB to BGR.
        The mean RGB was computed on the image set used to train the VGG model.
        
        Args:
            x: Image array (height x width x channels)
        
        Returns
               Image array (height x width x transposed_channels)
    """
    x = x - vgg_mean
    return x[:, ::-1] # reverse axis rgb->bgr

In [59]:
from keras.preprocessing.image import ImageDataGenerator

gen = ImageDataGenerator(preprocessing_function=vgg_preprocess) # No data augmentation is being applied

Using TensorFlow backend.


In [60]:
batch_size = 64

In [62]:
batches = gen.flow_from_directory(TEST_PATH, target_size=(224, 224), batch_size=batch_size, shuffle=False)

Found 237152 images belonging to 1 classes.


### Load the VGG16 model with it's pre trained weights

In [63]:
from keras import applications

In [64]:
model = applications.VGG16()

In [12]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

### Leave just the first fully connected layer to calculate the new representation

In [65]:
from keras.models import Model

In [66]:
model_extract_features = Model(inputs=model.layers[0].input, outputs=model.layers[-3].output)

In [22]:
model_extract_features.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

### Extract bottleneck features for the train set

In [36]:
bottleneck_features_train = model_extract_features.predict_generator(batches, batches.n // batches.batch_size + 1, verbose=1)



In [37]:
np.save(RESULTS_PATH + 'bottleneck_features_train.npy', bottleneck_features_train)

In [32]:
bottleneck_features_train = np.load(RESULTS_PATH + 'bottleneck_features_train.npy')

In [18]:
bottleneck_features_train.shape

(234842, 4096)

### Extract bottleneck features for the test set

In [79]:
bottleneck_features_test = model_extract_features.predict_generator(batches, batches.n // batches.batch_size + 1, verbose=1)



In [80]:
np.save(RESULTS_PATH + '/imagenet/features/' + 'bottleneck_features_test.npy', bottleneck_features_test)

In [81]:
bottleneck_features_test.shape

(237152, 4096)

### Retrieve image filenames associated with the training features

In [90]:
# get the filenames for the entire training set
filenames = batches.filenames
filenames = [f.split('/')[1] for f in filenames]
filenames = [f.split('.')[0] for f in filenames]

In [91]:
filenames = np.array(filenames, dtype=np.int32)

In [92]:
len(filenames)

237152

In [93]:
np.save(RESULTS_PATH + '/imagenet/' + 'test_filenames.npy', filenames)

In [99]:
filenames = np.load(RESULTS_PATH + '/imagenet/' + 'test### Retrieve image filenames associated with the training features

# get the filenames for the entire training set
filenames = batches.filenames
filenames = [f.split('/')[1] for f in filenames]
filenames = [f.split('.')[0] for f in filenames]

filenames = np.array(filenames, dtype=np.int32)

np.save(RESULTS_PATH + 'train_filenames.npy', train_filenames)

filenames = np.load(RESULTS_PATH + 'train_filenames.npy')

train_filenames = np.load(RESULTS_PATH + 'train_filenames.npy')

train_filenames.shape_filenames.npy')

In [100]:
filenames.shape

(237152,)

In [17]:
train_filenames = np.load(RESULTS_PATH + 'train_filenames.npy')

In [None]:
test_filenames.shape

### Retrieve image filenames associated with the training features

In [17]:
# get the filenames for the entire training set
filenames = batches.filenames
filenames = [f.split('/')[1] for f in filenames]
filenames = [f.split('.')[0] for f in filenames]

In [19]:
test_filenames = np.array(filenames, dtype=np.int32)

In [47]:
np.save(RESULTS_PATH + 'test_filenames.npy', train_filenames)

In [29]:
filenames = np.load(RESULTS_PATH + 'train_filenames.npy')

In [17]:
train_filenames = np.load(RESULTS_PATH + 'train_filenames.npy')

In [19]:
train_filenames.shape

(234842,)

# 2) Obtain each restaurant fc1 blueprint

### Shuffle the restaurants training data; Important so that we don't have to shuffle later on!

In [33]:
df_train_labels.head()

Unnamed: 0_level_0,labels,photos,n_photo
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000,"(1, 2, 3, 4, 5, 6, 7)","[438623, 325966, 227692, 407856, 368729, 16319...",54
1001,"(0, 1, 6, 8)","[298536, 20346, 8457, 308694, 349310, 407838, ...",9
100,"(1, 2, 4, 5, 6, 7)","[338465, 328433, 243861, 361777, 127198, 46652...",84
1006,"(1, 2, 4, 5, 6)","[46472, 341947, 396253, 75316, 42330, 244095, ...",22
1010,"(0, 6, 8)","[118251, 219940, 27517, 8578, 148347, 433559, ...",11


In [34]:
unique_business = df_train_labels.index.get_values()
unique_business = np.sort(unique_business) # returns a copy of the sorted array

In [35]:
unique_business[:100]

array([  3,   4,   5,   6,   7,   8,   9,  12,  13,  14,  16,  18,  19,
        21,  23,  24,  26,  28,  29,  32,  35,  36,  37,  38,  39,  41,
        48,  50,  51,  54,  58,  60,  63,  65,  67,  68,  69,  71,  74,
        75,  77,  78,  79,  81,  84,  85,  87,  89,  91,  93,  96,  99,
       100, 101, 103, 104, 105, 108, 109, 110, 111, 112, 115, 118, 119,
       120, 123, 125, 129, 131, 132, 135, 140, 142, 143, 145, 147, 148,
       150, 153, 154, 157, 158, 161, 162, 163, 164, 165, 169, 171, 172,
       175, 177, 179, 180, 183, 184, 186, 187, 188])

In [36]:
np.random.seed(3)
np.random.shuffle(unique_business)

In [37]:
unique_business[:100]

array([2469, 2480,  722,  955,  319,  876, 2265, 1386, 2369, 2305, 1260,
       3077,  131, 1229,  966, 2213, 2108,  298, 3521, 2801, 3013, 3301,
        163, 1419, 1856,  908, 2166, 2391, 2935, 1903, 2020, 2640, 1065,
          6, 2796, 3074, 3905, 2234, 1783, 2401, 3168, 3877,  157,  494,
       2500, 2285, 1656, 1413, 2817,  501,   60, 3218, 1026, 1055, 2357,
        916, 3211, 3762, 3798, 3149, 1101, 1661, 3874,  495, 2434,  906,
       2023, 1537, 3693,  112, 2955, 1760, 3849,  161, 2748, 2134,  846,
       2540, 2671, 1993, 3430, 3170,  109, 3827,  806, 1490, 1626,  626,
        276, 2018, 1503, 1647, 3226, 1533, 2296, 1142, 3497, 3570, 2810,
       2494])

In [43]:
# save the shuffled businesses
np.save(RESULTS_PATH+'/businesses_shuffled.npy', unique_business)

### Extract the first fc layer representation for every restaurant

In [40]:
restaurant_fc1_features = []
features_shape = (1, bottleneck_features_train.shape[1])

for i, business in enumerate(unique_business):
    business_photos = df_train_labels.loc[business].photos
    restaurant_fc1_features.append(np.zeros(features_shape))
    photo_count = 0
    for business_photo in business_photos:
        restaurant_fc1_features[i] += bottleneck_features_train[np.where(train_filenames == business_photo)[0]]
        photo_count += 1
    restaurant_fc1_features[i] = restaurant_fc1_features[i] / photo_count

restaurant_fc1_features = np.array(restaurant_fc1_features)

In [41]:
restaurant_fc1_features = restaurant_fc1_features.reshape(1996, 4096)

In [42]:
restaurant_fc1_features.shape

(1996, 4096)

In [44]:
### Retrieve image filenames associated with the training features

# get the filenames for the entire training set
filenames = batches.filenames
filenames = [f.split('/')[1] for f in filenames]
filenames = [f.split('.')[0] for f in filenames]

filenames = np.array(filenames, dtype=np.int32)

np.save(RESULTS_PATH + 'train_filenames.npy', train_filenames)

filenames = np.load(RESULTS_PATH + 'train_filenames.npy')

train_filenames = np.load(RESULTS_PATH + 'train_filenames.npy')

train_filenames.shape### Retrieve image filenames associated with the training features

# get the filenames for the entire training set
filenames = batches.filenames
filenames = [f.split('/')[1] for f in filenames]
filenames = [f.split('.')[0] for f in filenames]

filenames = np.array(filenames, dtype=np.int32)

np.save(RESULTS_PATH + 'train_filenames.npy', train_filenames)

filenames = np.load(RESULTS_PATH + 'train_filenames.npy')

train_filenames = np.load(RESULTS_PATH + 'train_filenames.npy')

train_filenames.shape### Retrieve image filenames associated with the training features

# get the filenames for the entire training set
filenames = batches.filenames
filenames = [f.split('/')[1] for f in filenames]
filenames = [f.split('.')[0] for f in filenames]

filenames = np.array(filenames, dtype=np.int32)

np.save(RESULTS_PATH + 'train_filenames.npy', train_filenames)

filenames = np.load(RESULTS_PATH + 'train_filenames.npy')

train_filenames = np.load(RESULTS_PATH + 'train_filenames.npy')

train_filenames.shape# save the fc1 blueprint corresponding to each of the unique businesses
np.save(RESULTS_PATH+'/businesses_fc1_blueprint.npy', restaurant_fc1_features)

# The purpose of this notebook ends here, Gratz!!

# 2) Obtain each test restaurant fc1 blueprint

### Shuffle the restaurants training data; Important so that we don't have to shuffle later on!

In [140]:
unique_business = df_test_photo_to_biz_ids['business_id'].unique()
unique_business = np.sort(unique_business) # returns a copy of the sorted array

In [141]:
unique_business[:100]

array(['003sg', '00er5', '00kad', '00mc6', '00q7x', '00v0t', '00y7p',
       '019fg', '019r1', '01i5j', '01is9', '01mrb', '01pyb', '01s0p',
       '01xsq', '021oz', '026nc', '02bwy', '02d9t', '02eos', '02fio',
       '02pxt', '02qrp', '02rfd', '0357u', '035x6', '038l4', '03bbu',
       '03ked', '03m8y', '03vx8', '03yz9', '040nh', '042hy', '044sl',
       '045qe', '04944', '04cy7', '04ilw', '04imx', '04kgm', '04ud9',
       '04wn2', '04zgs', '050l6', '0573e', '057qc', '05fb2', '05h9r',
       '05ihx', '05jhx', '05rwc', '06cko', '06fzh', '06gbm', '06ml0',
       '06p0g', '06ums', '06vhp', '0707d', '070ll', '077lb', '07bbs',
       '07gmf', '07nri', '07o52', '084gg', '084v2', '08dzw', '08h8f',
       '08ni5', '08oeq', '08wfz', '0916a', '092lt', '098ef', '098un',
       '09ckq', '09ejq', '09iat', '09kll', '09pmi', '09ptq', '09w5k',
       '0a1ko', '0a45c', '0a938', '0adc9', '0adst', '0ahhr', '0aj6u',
       '0akmq', '0au0t', '0axc7', '0axvc', '0b8ky', '0bcfg', '0beyy',
       '0bnm5', '0bz

In [142]:
np.random.seed(3)
np.random.shuffle(unique_business)

In [143]:
unique_business[:100]

array(['l3hce', 'nim76', '57z69', 'bvw6i', '0rzi7', '4dnpo', '6v6r4',
       'yqld5', 'ub57s', 'r26ek', '9bfqp', '3k8b1', 'gsqc0', 'dikt8',
       'ebyno', 'exqyg', 'x75wv', 'wcln6', 'gn80r', 'vd75i', 'scc07',
       'd74of', '9kj6g', 'pjxce', '8oui7', 'k8s6m', '321ey', '39gcr',
       'mnc6a', '8uhqa', 'a58kk', 'ouoo1', 'qbm0k', 'sjjdd', 'jd9ky',
       '0nq6o', 'uotly', 'rwgnf', 'oab77', 'i95mo', 'lesy6', 'y3zpu',
       'pfhqw', 'nzy2m', '6ssbi', '1zjuy', 'mm6dj', '7y7gr', 'e9n68',
       '3o0k1', 'pqdoa', 'chici', 'fgbk6', 'bvb54', 'eok7m', 'n3eg2',
       'a0llc', 'x9iey', 'fup0v', 'hwxzm', '7l3qq', 's1ysp', '4w5dg',
       'j0sy8', '0tc9p', 'u30yi', 'aj2c4', '9gy9k', '3j75n', 'jza3m',
       'poe4x', '9xo1p', 'nrm98', 'vbthe', 'kkl9u', '0s3z5', '3yz1o',
       'zq5am', 'g07ov', 'p1l60', 'jubx3', 'gv3y7', '3i7w0', 'sf6dy',
       'cgvtr', 'qzqhw', 'hw3ht', 'j2z1w', '19c32', '7rjw4', 'zcd1t',
       'qhcbe', 'khriv', 'bfmq5', '4y0gz', 'q102l', '7f24o', 'datj5',
       'fz0aw', '5uk

In [145]:
# save the shuffled businesses
np.save(DATA_DIR + '/shared/' +'/test_businesses_shuffled.npy', unique_business)

### Extract the first fc layer representation for every restaurant

In [151]:
restaurant_fc1_features = []
features_shape = (1, bottleneck_features_test.shape[1])

for i, business in enumerate(unique_business):
    business_photos = df_test_photo_to_biz_ids[df_test_photo_to_biz_ids['business_id'] == business]['photo_id'].as_matrix()
    restaurant_fc1_features.append(np.zeros(features_shape))
    photo_count = 0
    for business_photo in business_photos:
        restaurant_fc1_features[i] += bottleneck_features_test[np.where(filenames == business_photo)[0]]
        photo_count += 1
    restaurant_fc1_features[i] = restaurant_fc1_features[i] / photo_count

restaurant_fc1_features = np.array(restaurant_fc1_features)

In [152]:
restaurant_fc1_features = restaurant_fc1_features.reshape(10000, 4096)

In [153]:
restaurant_fc1_features.shape

(10000, 4096)

In [155]:
restaurant_fc1_features

array([[ 2.17446654,  1.15032917,  1.09108251, ...,  2.38044859,
         1.53568873,  0.20470579],
       [ 1.80752024,  2.02153297,  1.3848336 , ...,  1.95438128,
         1.26758549,  0.13444891],
       [ 1.57852717,  1.75416363,  2.30216543, ...,  3.03419325,
         1.01502079,  0.44100595],
       ..., 
       [ 1.70478558,  2.36583228,  1.22405197, ...,  1.05381226,
         0.85718813,  0.16915629],
       [ 2.3860694 ,  0.77662372,  1.49644853, ...,  2.43986367,
         1.18885089,  0.25047417],
       [ 1.89444393,  2.10856588,  1.87857879, ...,  1.63456469,
         1.20462462,  0.56510732]])

In [154]:
### Retrieve image filenames associated with the training features
np.save(RESULTS_PATH+ '/imagenet/' +'/test_businesses_fc1_blueprint.npy', restaurant_fc1_features)