# Extract FC1 Features Food 101

### Imports

In [4]:
%matplotlib inline
import numpy as np
import gc
from utils_data_exploration import * # import all datastructures and functions from utils data exploration

import os, sys

In [3]:
# Imported dataframes: names expected to come from the star import of utils_data_exploration.
# Referencing them here makes the cell fail with a NameError right away if one is missing.
df_train_labels
df_test_photo_to_biz_ids
photos_in_test_biz; # dictionary that maps a test business to its photo ids

### Assign Directory Paths to Constant Variable Names

In [6]:
dataset = 'food101'

In [7]:
# Everything is anchored to the directory the notebook is launched from
LESSON_HOME_DIR = os.getcwd()
DATA_DIR = LESSON_HOME_DIR + '/../data/'

# One photo folder per split, all located directly under DATA_DIR
TRAIN_PATH, TEST_PATH, VALID_PATH = (
    DATA_DIR + split_dir
    for split_dir in ('/train_photos/', '/test_photos/', '/valid_photos/')
)

# representation specific paths: results and weights each get a sub-folder named after `dataset`
RESULTS_PATH, WEIGHTS_PATH = (
    DATA_DIR + kind_dir + '/' + dataset + '/'
    for kind_dir in ('/results/', '/weights/')
)

# Map Train and Test Photos to FC1 Representation Using VGG16 - Food 101

### Load the VGG16 model with its pre-trained Food-101 weights

In [8]:
from keras.models import Sequential, model_from_json, load_model, Model
from keras import backend as K

Using TensorFlow backend.


In [12]:
# Load model weight and structure
model = load_model(WEIGHTS_PATH+'vgg16_food101_model.h5')

In [13]:
# Keep the feature extraction section: a model that reuses the loaded network from its
# first layer's input up to the output of the second-to-last layer, so that predicting
# with it returns that layer's activations instead of the final layer's output.
# Keras 2 names these arguments `inputs` / `outputs`; the singular Keras 1 spellings are
# only accepted through a compatibility shim that emits a UserWarning.
model_extract_features = Model(inputs=model.layers[0].input, outputs=model.layers[-2].output)

  """Entry point for launching an IPython kernel.


### Instantiate Image Data Generator

In [14]:
# For every image subtract the per channel mean of the imagenet dataset.
# Values are in R, G, B order; the (1, 1, 3) shape broadcasts over a channels-last
# (height, width, channels) image.
vgg_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32).reshape((1, 1, 3))

def vgg_preprocess(x):
    """
        Subtracts the mean RGB value, and transposes RGB to BGR.
        The mean RGB was computed on the image set used to train the VGG model.

        Args:
            x: Image array (height x width x channels), channels in RGB order

        Returns
               Image array (height x width x transposed_channels), channels in BGR order
    """
    # The mean is stored in RGB order, so it has to be removed before the channels are swapped
    x = x - vgg_mean
    # Reverse the LAST axis (channels) rgb->bgr. Indexing with x[:, ::-1] would reverse
    # axis 1 instead, i.e. mirror the image along its width and leave the channels in RGB order.
    return x[..., ::-1]

In [15]:
from keras.preprocessing.image import ImageDataGenerator

gen = ImageDataGenerator(preprocessing_function=vgg_preprocess) # No data augmentation is being applied

In [16]:
batch_size = 64

## Extract bottleneck features for the train and test sets

### Train Set

In [17]:
batches = gen.flow_from_directory(TRAIN_PATH, target_size=(224, 224), batch_size=batch_size, shuffle=False)

Found 237152 images belonging to 1 classes.


In [22]:
# Number of batches needed to visit every image exactly once (integer ceiling division).
# `n // batch_size + 1` asks for one batch too many whenever n is an exact multiple of
# batch_size; the generator keeps yielding past the end of the data, so the extra rows
# would break the one-to-one alignment between feature rows and batches.filenames.
train_steps = (batches.n + batches.batch_size - 1) // batches.batch_size
bottleneck_features_train = model_extract_features.predict_generator(batches, train_steps, verbose=1)



In [25]:
bottleneck_features_train.shape

(234842, 4096)

In [27]:
np.save(RESULTS_PATH + 'bottleneck_features_train.npy', bottleneck_features_train)

In [59]:
bottleneck_features_train = np.load(RESULTS_PATH + 'bottleneck_features_train.npy')

### Test Set

In [None]:
batches = gen.flow_from_directory(TEST_PATH, target_size=(224, 224), batch_size=batch_size, shuffle=False)

In [22]:
# Number of batches needed to visit every image exactly once (integer ceiling division).
# `n // batch_size + 1` asks for one batch too many whenever n is an exact multiple of
# batch_size; the generator keeps yielding past the end of the data, so the extra rows
# would break the one-to-one alignment between feature rows and batches.filenames.
test_steps = (batches.n + batches.batch_size - 1) // batches.batch_size
bottleneck_features_test = model_extract_features.predict_generator(batches, test_steps, verbose=1)



In [29]:
bottleneck_features_test.shape

(237152, 4096)

In [30]:
np.save(RESULTS_PATH + 'bottleneck_features_test.npy', bottleneck_features_test)

In [32]:
bottleneck_features_test.shape

(237152, 4096)

___

### Retrieve image filenames associated with the features

In [33]:
# Get the photo id of every image served by `batches` (the generator was created with
# shuffle=False, so the order is the directory scan order).
# batches.filenames holds paths relative to the scanned directory ('<subdir>/<photo_id>.<ext>');
# keep only the file name and drop its extension. os.path copes with the platform's path
# separator, whereas a literal split on '/' raises IndexError when another separator is used.
filenames = [os.path.splitext(os.path.basename(f))[0] for f in batches.filenames]

In [36]:
filenames = np.array(filenames, dtype=np.int32)

In [37]:
filenames.shape

(237152,)

In [38]:
RESULTS_PATH

'/home/javier/Documents/YelpRestaurantPhotoClassification/nbs/../data//results//food101/'

In [40]:
type_str = 'test' # select if using train or test set

In [41]:
np.save(RESULTS_PATH + type_str +'_filenames.npy', filenames)

# Obtain each restaurant's fc1 blueprint

## Train Set

### Shuffle the restaurants training data; Important so that we don't have to shuffle later on!

In [34]:
df_train_labels.head()

Unnamed: 0_level_0,labels,photos,n_photo
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000,"(1, 2, 3, 4, 5, 6, 7)","[438623, 325966, 227692, 407856, 368729, 16319...",54
1001,"(0, 1, 6, 8)","[298536, 20346, 8457, 308694, 349310, 407838, ...",9
100,"(1, 2, 4, 5, 6, 7)","[338465, 328433, 243861, 361777, 127198, 46652...",84
1006,"(1, 2, 4, 5, 6)","[46472, 341947, 396253, 75316, 42330, 244095, ...",22
1010,"(0, 6, 8)","[118251, 219940, 27517, 8578, 148347, 433559, ...",11


In [35]:
# The business ids are the index of df_train_labels.
# `.values` is used because `Index.get_values()` was deprecated in pandas 0.25 and removed in 1.0.
unique_business = df_train_labels.index.values
unique_business = np.sort(unique_business) # returns a copy of the sorted array

In [36]:
unique_business[:100]

array([  3,   4,   5,   6,   7,   8,   9,  12,  13,  14,  16,  18,  19,
        21,  23,  24,  26,  28,  29,  32,  35,  36,  37,  38,  39,  41,
        48,  50,  51,  54,  58,  60,  63,  65,  67,  68,  69,  71,  74,
        75,  77,  78,  79,  81,  84,  85,  87,  89,  91,  93,  96,  99,
       100, 101, 103, 104, 105, 108, 109, 110, 111, 112, 115, 118, 119,
       120, 123, 125, 129, 131, 132, 135, 140, 142, 143, 145, 147, 148,
       150, 153, 154, 157, 158, 161, 162, 163, 164, 165, 169, 171, 172,
       175, 177, 179, 180, 183, 184, 186, 187, 188])

In [37]:
# Fixed seed so that the business order is the same on every fresh run
np.random.seed(3)
# Sort in place first: np.random.shuffle works in place, so re-running this cell on the
# already-shuffled array would otherwise produce a different order on every re-run.
unique_business.sort()
np.random.shuffle(unique_business)

In [38]:
unique_business[:100]

array([2469, 2480,  722,  955,  319,  876, 2265, 1386, 2369, 2305, 1260,
       3077,  131, 1229,  966, 2213, 2108,  298, 3521, 2801, 3013, 3301,
        163, 1419, 1856,  908, 2166, 2391, 2935, 1903, 2020, 2640, 1065,
          6, 2796, 3074, 3905, 2234, 1783, 2401, 3168, 3877,  157,  494,
       2500, 2285, 1656, 1413, 2817,  501,   60, 3218, 1026, 1055, 2357,
        916, 3211, 3762, 3798, 3149, 1101, 1661, 3874,  495, 2434,  906,
       2023, 1537, 3693,  112, 2955, 1760, 3849,  161, 2748, 2134,  846,
       2540, 2671, 1993, 3430, 3170,  109, 3827,  806, 1490, 1626,  626,
        276, 2018, 1503, 1647, 3226, 1533, 2296, 1142, 3497, 3570, 2810,
       2494])

In [43]:
# save the shuffled businesses
np.save(RESULTS_PATH+'/businesses_shuffled.npy', unique_business)

### Extract the first fc layer representation for every restaurant

In [40]:
# `train_filenames` (the photo id belonging to each row of bottleneck_features_train) is not
# created by any cell of this notebook. Reuse it when it already lives in the kernel,
# otherwise load the array written by the filename-saving cell when run with type_str = 'train'.
# NOTE(review): assumes that file exists and is row-aligned with the train features - confirm.
try:
    train_filenames
except NameError:
    train_filenames = np.load(RESULTS_PATH + 'train_filenames.npy')

# photo id -> row index, built once so that every photo costs a dictionary lookup
# instead of a full scan of the filename array
train_row_of_photo = {
    photo_id: row for row, photo_id in enumerate(np.asarray(train_filenames).tolist())
}

restaurant_fc1_features = []
features_shape = (1, bottleneck_features_train.shape[1])

for business in unique_business:
    business_photos = df_train_labels.loc[business].photos
    # the blueprint of a restaurant is the element-wise mean of the fc1 vectors of its photos
    fc1_sum = np.zeros(features_shape)
    for business_photo in business_photos:
        # a photo id without an extracted feature row raises KeyError here
        fc1_sum += bottleneck_features_train[train_row_of_photo[business_photo]]
    restaurant_fc1_features.append(fc1_sum / len(business_photos))

# one (1, n_features) accumulator per business -> array of shape (n_businesses, 1, n_features)
restaurant_fc1_features = np.array(restaurant_fc1_features)

In [41]:
# Flatten to one row per business: (n_businesses, 1, n_features) -> (n_businesses, n_features).
# Both sizes are taken from the array itself so the cell keeps working when the number of
# businesses or the width of the feature layer changes.
restaurant_fc1_features = restaurant_fc1_features.reshape(restaurant_fc1_features.shape[0], -1)

In [42]:
restaurant_fc1_features.shape

(1996, 4096)

In [43]:
# save the fc1 blueprint corresponding to each of the unique businesses
np.save(RESULTS_PATH + '/features/' + 'businesses_fc1_blueprint', restaurant_fc1_features)

## Test Set

### Shuffle the restaurants test data; Important so that we don't have to shuffle later on!

In [50]:
unique_business = df_test_photo_to_biz_ids['business_id'].unique()
unique_business = np.sort(unique_business) # returns a copy of the sorted array

In [51]:
# Fixed seed so that the business order is the same on every fresh run
np.random.seed(3)
# Sort in place first: np.random.shuffle works in place, so re-running this cell on the
# already-shuffled array would otherwise produce a different order on every re-run.
unique_business.sort()
np.random.shuffle(unique_business)

In [52]:
unique_business[:100]

array(['l3hce', 'nim76', '57z69', 'bvw6i', '0rzi7', '4dnpo', '6v6r4',
       'yqld5', 'ub57s', 'r26ek', '9bfqp', '3k8b1', 'gsqc0', 'dikt8',
       'ebyno', 'exqyg', 'x75wv', 'wcln6', 'gn80r', 'vd75i', 'scc07',
       'd74of', '9kj6g', 'pjxce', '8oui7', 'k8s6m', '321ey', '39gcr',
       'mnc6a', '8uhqa', 'a58kk', 'ouoo1', 'qbm0k', 'sjjdd', 'jd9ky',
       '0nq6o', 'uotly', 'rwgnf', 'oab77', 'i95mo', 'lesy6', 'y3zpu',
       'pfhqw', 'nzy2m', '6ssbi', '1zjuy', 'mm6dj', '7y7gr', 'e9n68',
       '3o0k1', 'pqdoa', 'chici', 'fgbk6', 'bvb54', 'eok7m', 'n3eg2',
       'a0llc', 'x9iey', 'fup0v', 'hwxzm', '7l3qq', 's1ysp', '4w5dg',
       'j0sy8', '0tc9p', 'u30yi', 'aj2c4', '9gy9k', '3j75n', 'jza3m',
       'poe4x', '9xo1p', 'nrm98', 'vbthe', 'kkl9u', '0s3z5', '3yz1o',
       'zq5am', 'g07ov', 'p1l60', 'jubx3', 'gv3y7', '3i7w0', 'sf6dy',
       'cgvtr', 'qzqhw', 'hw3ht', 'j2z1w', '19c32', '7rjw4', 'zcd1t',
       'qhcbe', 'khriv', 'bfmq5', '4y0gz', 'q102l', '7f24o', 'datj5',
       'fz0aw', '5uk

In [145]:
# save the shuffled businesses
np.save(DATA_DIR + '/shared/' +'/test_businesses_shuffled.npy', unique_business)

### Extract the first fc layer representation for every restaurant

In [56]:
df_test_photo_to_biz_ids.head()

Unnamed: 0,photo_id,business_id
0,317818,003sg
1,30679,003sg
2,455084,003sg
3,371381,003sg
4,86224,003sg


In [57]:
# photo id -> row index in bottleneck_features_test, built once so that every photo costs a
# dictionary lookup instead of a full scan of `filenames`
# (assumes `filenames` currently holds the test photo ids, one per feature row - TODO confirm)
test_row_of_photo = {
    photo_id: row for row, photo_id in enumerate(np.asarray(filenames).tolist())
}

# business id -> photo ids, grouped in a single pass over the mapping table rather than
# filtering the whole frame once per business. `.values` replaces `.as_matrix()`, which was
# removed in pandas 1.0. Rows keep their original order inside each group.
photos_of_business = {
    business: photo_ids.values
    for business, photo_ids in df_test_photo_to_biz_ids.groupby('business_id')['photo_id']
}

restaurant_fc1_features = []
features_shape = (1, bottleneck_features_test.shape[1])

for business in unique_business:
    business_photos = photos_of_business[business]
    # the blueprint of a restaurant is the element-wise mean of the fc1 vectors of its photos
    fc1_sum = np.zeros(features_shape)
    for business_photo in business_photos:
        # a photo id without an extracted feature row raises KeyError here
        fc1_sum += bottleneck_features_test[test_row_of_photo[business_photo]]
    restaurant_fc1_features.append(fc1_sum / len(business_photos))

# one (1, n_features) accumulator per business -> array of shape (n_businesses, 1, n_features)
restaurant_fc1_features = np.array(restaurant_fc1_features)

In [60]:
# Flatten to one row per business: (n_businesses, 1, n_features) -> (n_businesses, n_features).
# Both sizes are taken from the array itself so the cell keeps working when the number of
# businesses or the width of the feature layer changes.
restaurant_fc1_features = restaurant_fc1_features.reshape(restaurant_fc1_features.shape[0], -1)

In [61]:
restaurant_fc1_features.shape

(10000, 4096)

In [75]:
# save the fc1 blueprint corresponding to each of the unique test businesses
np.save(RESULTS_PATH +'/test_businesses_fc1_blueprint.npy', restaurant_fc1_features)

# The purpose of this notebook ends here, Gratz!!