In [77]:
# Display the `sc` object. It is never created in this notebook (presumably the
# SparkContext injected by the PySpark launcher — confirm); later cells call
# sc.parallelize on it.
sc

In [78]:
import numpy as np
import pandas as pd
from PIL import Image
from os import listdir
from os.path import join, basename
import struct
import pickle
import json
import os
from scipy import misc
import datetime as dt
# import matplotlib.pyplot as plt
# %matplotlib inline

In [79]:
# %pylab inline
from bigdl.nn.layer import *
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.util.common import *
from bigdl.dataset.transformer import *
from bigdl.dataset import mnist
from transformer import *

In [80]:

def scala_T(input_T):
    """
    Convert a Python list into a 1-based lookup table for the Inception builders.

    A list ``[a, b, ...]`` becomes ``{0: 0, 1: a, 2: b, ...}``: key 0 holds a
    placeholder zero so that the real entries are addressed starting at 1.
    Anything that is not exactly a list (e.g. an already-built dict) is handed
    back unchanged.

    :param input_T: either list or dict
    :return: dict with ascending integer keys for list input, otherwise ``input_T`` itself
    """
    if type(input_T) is not list:
        # already a table (or anything else): pass it straight through
        return input_T
    # key 0 is the placeholder; real data is numbered from 1
    table = {0: 0}
    for position, value in enumerate(input_T, 1):
        table[position] = value
    return table

In [81]:
def Inception_Layer_v1(input_size, config, name_prefix=""):
    """
    Build one inception-v1 module: four parallel branches that all receive the
    same input and whose outputs are joined by a ``Concat(2)`` container.

    :param input_size: first argument of every convolution that reads the module
                       input (the number of input planes)
    :param config: 1-indexed nested table (the shape produced by ``scala_T``) holding the
                   second convolution argument (number of output planes) per branch:
                   ``config[1][1]``                  -> 1x1 branch
                   ``config[2][1]``, ``config[2][2]`` -> 3x3 reduce, 3x3
                   ``config[3][1]``, ``config[3][2]`` -> 5x5 reduce, 5x5
                   ``config[4][1]``                  -> projection after the pooling branch
    :param name_prefix: string prepended to the name of every layer in this module
    :return: the Concat container holding the four Sequential branches
    """
    def named(layer, suffix):
        # every layer of this module carries the shared prefix in its name
        return layer.set_name(name_prefix + suffix)

    # Concat is a container that feeds the same input to all of its submodules
    # and concatenates their outputs along the given dimension.
    module = Concat(2)

    # Each branch is a Sequential container; layers run in insertion order.

    # branch 1: 1x1 convolution
    branch_1x1 = Sequential()
    branch_1x1.add(named(SpatialConvolution(input_size, config[1][1], 1, 1, 1, 1), "1x1"))
    branch_1x1.add(named(ReLU(True), "relu_1x1"))
    module.add(branch_1x1)

    # branch 2: 1x1 reduction followed by a 3x3 convolution (padding 1)
    branch_3x3 = Sequential()
    branch_3x3.add(named(SpatialConvolution(input_size, config[2][1], 1, 1, 1, 1), "3x3_reduce"))
    branch_3x3.add(named(ReLU(True), "relu_3x3_reduce"))
    branch_3x3.add(named(SpatialConvolution(config[2][1], config[2][2], 3, 3, 1, 1, 1, 1), "3x3"))
    branch_3x3.add(named(ReLU(True), "relu_3x3"))
    module.add(branch_3x3)

    # branch 3: 1x1 reduction followed by a 5x5 convolution (padding 2)
    branch_5x5 = Sequential()
    branch_5x5.add(named(SpatialConvolution(input_size, config[3][1], 1, 1, 1, 1), "5x5_reduce"))
    branch_5x5.add(named(ReLU(True), "relu_5x5_reduce"))
    branch_5x5.add(named(SpatialConvolution(config[3][1], config[3][2], 5, 5, 1, 1, 2, 2), "5x5"))
    branch_5x5.add(named(ReLU(True), "relu_5x5"))
    module.add(branch_5x5)

    # branch 4: 3x3 max pooling followed by a 1x1 projection
    branch_pool = Sequential()
    branch_pool.add(named(SpatialMaxPooling(3, 3, 1, 1, 1, 1, to_ceil=True), "pool"))
    branch_pool.add(named(SpatialConvolution(input_size, config[4][1], 1, 1, 1, 1), "pool_proj"))
    branch_pool.add(named(ReLU(True), "relu_pool_proj"))

    # the "output" name is set on whatever add() returns, as in a chained add(...).set_name(...)
    named(module.add(branch_pool), "output")
    return module

In [82]:
def Inception_v1_Bottleneck():
    """
    Assemble the Inception-v1 network without a classification layer.

    The stack is: convolution/pooling/LRN stem, inception modules 3a-3b, max pooling,
    modules 4a-4e, max pooling, modules 5a-5b, then 7x7 average pooling, dropout and a
    ``View([1024])`` that flattens each sample into a 1024-element "bottleneck" vector.
    Layer names follow the "conv1/7x7_s2" / "inception_3a/..." scheme (presumably so a
    by-name weight import from the Caffe GoogLeNet model can match them — confirm).

    :return: the Sequential model, after ``reset()`` has been called on it
    """
    def inception(input_size, out_1x1, out_3x3, out_5x5, out_pool, prefix):
        # wrap the four per-branch lists into the 1-indexed nested table that
        # Inception_Layer_v1 reads, then build the module
        table = scala_T([scala_T(out_1x1), scala_T(out_3x3), scala_T(out_5x5), scala_T(out_pool)])
        return Inception_Layer_v1(input_size, table, prefix)

    net = Sequential()

    # --- stem ---
    net.add(SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3, 1, False).set_name("conv1/7x7_s2"))
    net.add(ReLU(True).set_name("conv1/relu_7x7"))
    net.add(SpatialMaxPooling(3, 3, 2, 2, to_ceil=True).set_name("pool1/3x3_s2"))
    net.add(SpatialCrossMapLRN(5, 0.0001, 0.75).set_name("pool1/norm1"))
    net.add(SpatialConvolution(64, 64, 1, 1, 1, 1).set_name("conv2/3x3_reduce"))
    net.add(ReLU(True).set_name("conv2/relu_3x3_reduce"))
    net.add(SpatialConvolution(64, 192, 3, 3, 1, 1, 1, 1).set_name("conv2/3x3"))
    net.add(ReLU(True).set_name("conv2/relu_3x3"))
    net.add(SpatialCrossMapLRN(5, 0.0001, 0.75).set_name("conv2/norm2"))
    net.add(SpatialMaxPooling(3, 3, 2, 2, to_ceil=True).set_name("pool2/3x3_s2"))

    # --- inception stage 3 ---
    net.add(inception(192, [64], [96, 128], [16, 32], [32], "inception_3a/"))
    net.add(inception(256, [128], [128, 192], [32, 96], [64], "inception_3b/"))
    net.add(SpatialMaxPooling(3, 3, 2, 2, to_ceil=True))

    # --- inception stage 4 ---
    net.add(inception(480, [192], [96, 208], [16, 48], [64], "inception_4a/"))
    net.add(inception(512, [160], [112, 224], [24, 64], [64], "inception_4b/"))
    net.add(inception(512, [128], [128, 256], [24, 64], [64], "inception_4c/"))
    net.add(inception(512, [112], [144, 288], [32, 64], [64], "inception_4d/"))
    net.add(inception(528, [256], [160, 320], [32, 128], [128], "inception_4e/"))
    net.add(SpatialMaxPooling(3, 3, 2, 2, to_ceil=True))

    # --- inception stage 5 ---
    net.add(inception(832, [256], [160, 320], [32, 128], [128], "inception_5a/"))
    net.add(inception(832, [384], [192, 384], [48, 128], [128], "inception_5b/"))

    # --- head: pool, dropout, flatten to 1024 values per sample ---
    net.add(SpatialAveragePooling(7, 7, 1, 1).set_name("pool5/7x7_s1"))
    net.add(Dropout(0.4).set_name("pool5/drop_7x7_s1"))
    net.add(View([1024], num_input_dims=3))
    net.reset()
    return net

In [83]:
def Inception_v1_NoAuxClassifier(num_classes=None):
    """
    Build the Inception-v1 bottleneck network plus a final classifier
    (Linear + LogSoftMax), without auxiliary classifiers.

    :param num_classes: number of output classes of the final Linear layer. When omitted,
                        the module-level global ``class_num`` is used, as the function did
                        before this parameter existed. No cell shown in this notebook
                        defines that global, so calling without an argument raises
                        NameError unless it has been set elsewhere.
    :return: the Sequential model, after ``reset()`` has been called on it
    """
    if num_classes is None:
        # backward-compatible fallback to the global the function used to read
        num_classes = class_num
    model = Inception_v1_Bottleneck()
    # 1024 matches the flattened output size of the bottleneck network
    model.add(Linear(1024, num_classes).set_name("loss3/classifier_flowers"))
    model.add(LogSoftMax().set_name("loss3/loss3"))
    model.reset()
    return model

## Creating the Bottleneck Model

In [84]:
# initializing BigDL engine
# (init_engine arrives through one of the wildcard bigdl imports above; it is called
# once here, before any model is built in the cells below)
init_engine()

In [85]:

# paths for datasets, saving checkpoints 
from os import path

# root folder of the labels CSV and of the pickles written by later cells
DATA_ROOT = "./sample_images"
# NOTE(review): checkpoint_path is not referenced by any other cell shown in this notebook
checkpoint_path = path.join(DATA_ROOT, "checkpoints")

# matches the (224, 224, 3) image shape printed further down; the constant itself is
# not referenced by any other cell shown here
IMAGE_SIZE = 224

In [86]:
# Build the Inception-v1 network without a classifier; its weights are imported
# from the Caffe model a few cells below.
inception_model = Inception_v1_Bottleneck()

creating: createSequential
creating: createSpatialConvolution
creating: createReLU
creating: createSpatialMaxPooling
creating: createSpatialCrossMapLRN
creating: createSpatialConvolution
creating: createReLU
creating: createSpatialConvolution
creating: createReLU
creating: createSpatialCrossMapLRN
creating: createSpatialMaxPooling
creating: createConcat
creating: createSequential
creating: createSpatialConvolution
creating: createReLU
creating: createSequential
creating: createSpatialConvolution
creating: createReLU
creating: createSpatialConvolution
creating: createReLU
creating: createSequential
creating: createSpatialConvolution
creating: createReLU
creating: createSpatialConvolution
creating: createReLU
creating: createSequential
creating: createSpatialMaxPooling
creating: createSpatialConvolution
creating: createReLU
creating: createConcat
creating: createSequential
creating: createSpatialConvolution
creating: createReLU
creating: createSequential
creating: createSpatialConvolutio

## Download the Pre-trained Model

In [87]:

import urllib

# path, names of the downloaded pre-trained caffe models
# NOTE(review): only the .caffemodel is fetched below; the .prototxt must already be
# present in the working directory (it is passed to Model.load_caffe later).
caffe_prototxt = 'bvlc_googlenet.prototxt'
caffe_model = 'bvlc_googlenet.caffemodel'

# download once; skipped whenever a file with that name already exists
# NOTE(review): urllib.URLopener exists only in Python 2's urllib — confirm the kernel version
if not path.exists(caffe_model):
    model_loader = urllib.URLopener()
    model_loader.retrieve("http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel", caffe_model)

## Import Weights from Caffe Model

In [88]:

# loading the weights to the BigDL inception model, EXCEPT the weights for the last fc layer (classification layer)
# (inception_model has no classification layer of its own; match_all=False is passed,
# presumably so that Caffe layers without a BigDL counterpart are tolerated — confirm)
# NOTE(review): the name `model` is rebound to the small fc classifier in a later cell,
# so the bottleneck-prediction cell has to run before that rebinding.
model = Model.load_caffe(inception_model, caffe_prototxt, caffe_model, match_all=False, bigdl_type="float")


## Load images with their Labels

In [89]:
def imgs_to_load(labels_csv, pik, min_samples):
    """
    Load img-label pairs, keeping only labels that are shared by at least
    ``min_samples`` samples.

    :param labels_csv: path to a CSV with (at least) the columns ``obs_uid`` and ``item_name``
    :param pik: path to a pickle holding a sequence of images. Entry ``i`` is paired with the
                ``i``-th ``obs_uid`` in ascending order (assumed to be how the pickle was
                written — TODO confirm). Each entry must offer ``.convert('RGB')``.
    :param min_samples: minimum number of rows a label needs in the CSV for its samples to be
                        kept; ``None`` disables the filter
    :return: list of ``(image as numpy array, item_name)`` tuples in ascending ``obs_uid`` order
    """
    labels = pd.read_csv(labels_csv)
    file_names = labels['obs_uid'].sort_values().tolist()
    # NOTE: unpickling can execute arbitrary code; only point this at trusted files.
    # The context manager closes the handle as soon as the images are loaded.
    with open(pik, "rb") as pik_file:
        imgs = pickle.load(pik_file)

    # number of samples per label (Series indexed by item_name)
    counts = labels.groupby('item_name')['obs_uid'].count()
    # label of every sample (Series indexed by obs_uid)
    label_of = labels.set_index('obs_uid')['item_name']

    result = []
    for idx, uid in enumerate(file_names):
        label = label_of.loc[uid]
        # skip samples whose label is too rare
        if min_samples is not None and counts.loc[label] < min_samples:
            continue
        img_np = np.array(imgs[idx].convert('RGB'))
        result.append((img_np, label))

    return result


### Download the processed images from Amazon s3

This could take 10-20 minutes; the images require about 1 GB of disk space.

In [90]:
import urllib
from os import path
processed_imgs = 'processed-samples.pkl'
processed_imgs_url = 'https://s3-us-west-2.amazonaws.com/vegnonveg/processed-samples.pkl'

# download once; skipped whenever the pickle is already present
if not path.exists(processed_imgs):
    # -O makes wget write to the configured file name rather than one derived from the URL
    wget_status = os.system('wget -O "%s" "%s"' % (processed_imgs, processed_imgs_url))
    if wget_status != 0:
        # do not leave a truncated file behind: the exists() guard above would
        # otherwise skip the download on the next run
        if path.exists(processed_imgs):
            os.remove(processed_imgs)
        raise RuntimeError("wget failed (status %s) while downloading %s"
                           % (wget_status, processed_imgs_url))

In [91]:
# load the (image array, label) pairs; 160 is the min_samples threshold passed to imgs_to_load
img_labels = imgs_to_load(DATA_ROOT + '/vegnonveg-samples_labels.csv',processed_imgs, 160)
print "# of images: ", len(img_labels)

# of images:  1927


In [92]:
# look at one img-label pair
# (each entry is (image array, label): print the array shape and the label of the first one)
print "one img-label pair: ", img_labels[0][0].shape, ", ", img_labels[0][1]

one img-label pair:  (224, 224, 3) ,  Fresh cucumber


In [93]:
# Preprocessing applied to every image before it is wrapped in a Sample for prediction.
# NOTE(review): Transformer / TransposeToTensor come in through a wildcard import
# (bigdl.dataset.transformer or the local `transformer` module); presumably this
# reorders the (H, W, C) arrays to (C, H, W) — confirm.
transform_input = Transformer([TransposeToTensor(False)])

## Calculate the bottleneck values

In [94]:
# Run the images through the Caffe-initialised Inception model in chunks of
# `batch_size` and gather the outputs in pred_list. A later cell pairs pred_list
# with the labels by position, so the outputs are assumed to come back in input order.
pred_list = []
batch_size = 256
for start in range(0, len(img_labels), batch_size):
    img_batch = img_labels[start : start + batch_size]
    # only the image arrays are handed to Spark; the labels stay on the driver
    rdd_img = sc.parallelize(x[0] for x in img_batch)
    # each image is wrapped in a Sample with a constant dummy label of 0
    rdd_sample = rdd_img.map(lambda img: Sample.from_ndarray(transform_input(img), np.array(0)))    
    preds = model.predict(rdd_sample)
    # bring this chunk's predictions back to the driver
    p = preds.collect()
    pred_list.extend(p)

In [95]:
# Bundle every bottleneck vector with its label; the two lists are aligned by position.
data = {
    'bottleneck_values': pred_list,
    # the first tuple element is the image array (unused here, despite the name), the second the label
    'labels': [lbl for fname, lbl in img_labels]
}

In [96]:
# Persist the bottleneck values and labels. The context manager closes (and thereby
# flushes) the file deterministically instead of leaving that to garbage collection.
with open(DATA_ROOT + "/bottlenecks_with_labels.pkl", 'wb') as bottleneck_file:
    pickle.dump(data, bottleneck_file)

## Train and Test Classifiers

In [97]:
import pickle
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import SGDClassifier, LogisticRegression

In [98]:
# Reload the cached bottleneck values and labels written by the earlier cell.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote itself.
with open(DATA_ROOT + "/bottlenecks_with_labels.pkl", 'rb') as bottleneck_file:
    data = pickle.load(bottleneck_file)

## Use Stratified Train/Test Split
To make sure we have the same distribution of samples across labels in both train and test sets.

In [99]:

# 80/20 split of the bottleneck vectors and labels; stratify on the labels so both
# parts keep the same label mix, with a fixed random_state so the split is repeatable
x_train, x_test, train_labels, test_labels = \
    train_test_split(data['bottleneck_values'], 
                     data['labels'], 
                     test_size=0.2, 
                     random_state=101,
                     stratify=data['labels'])
# sanity check: sizes of the four pieces
len(x_train), len(train_labels), len(x_test), len(test_labels)

(1541, 1541, 386, 386)

In [100]:
# Share of each label in the training set (np.unique returns the sorted unique
# labels together with how often each occurs).
_, train_counts = np.unique(np.array(train_labels), return_counts=True)
# builtin float replaces the deprecated np.float alias (which was just an alias for it)
train_counts = train_counts.astype(float) / len(train_labels)

In [101]:
# Share of each label in the test set, computed the same way as for the training set.
_, test_counts = np.unique(np.array(test_labels), return_counts=True)
# builtin float replaces the deprecated np.float alias (which was just an alias for it)
test_counts = test_counts.astype(float) / len(test_labels)
# Difference in labels counts, %
(train_counts - test_counts) / train_counts * 100

array([ 2.28812638,  0.19430052, -1.21141356,  1.36848522,  0.19430052,
       -0.32824241, -0.27871228, -0.77468686, -1.03786861])

## Classifier #1: BigDL Logistic Regression

In [102]:
# Create Labels for BigDl
categories = set(lbls for img, lbls in img_labels)
# 0-based id per label name.
# NOTE: ids follow the set's iteration order, which is why the mapping is pickled
# alongside the model outputs instead of being recomputed elsewhere.
label_nums = dict(zip(categories, range(0,len(categories))))
# context manager so the file is closed (and flushed) right after the dump
with open(DATA_ROOT + "/labels_bigdl_classifier.pkl", 'wb') as labels_file:
    pickle.dump(label_nums, labels_file)
# call form prints the same text and matches the print(...) style used for the accuracy cell
print(label_nums)

{'Chicken eggs, caged hen, large size': 0, 'Fresh bananas, standard': 1, 'Fresh onions': 2, 'Fresh cucumber': 3, 'Fresh apple, red delicious': 4, 'Fresh potatoes, brown': 5, 'Fresh carrots': 6, 'Fresh oranges': 7, 'Fresh apples, typical local variety': 8}


In [103]:
# get rdd
def get_rdd_sample(images, labels):
    """
    Pair bottleneck vectors with numeric labels and wrap them as an RDD of Samples.

    :param images: sequence of bottleneck vectors
    :param labels: sequence of label names, aligned with ``images`` by position;
                   every name must be a key of the global ``label_nums``
    :return: RDD of ``Sample`` objects built with ``Sample.from_ndarray(vector, np.array(id))``
    """
    # ids are shifted by one relative to label_nums (map_groundtruth_label undoes this);
    # presumably because the BigDL criterion expects 1-based class ids — confirm
    label_ids = [label_nums[word] + 1 for word in labels]
    imgs = sc.parallelize(images)
    label_rdd = sc.parallelize(label_ids)
    # plain single-argument lambda: tuple-unpacking parameters are Python-2-only syntax
    sample_rdd = imgs.zip(label_rdd).map(
        lambda pair: Sample.from_ndarray(pair[0], np.array(pair[1])))
    return sample_rdd



In [104]:
train_rdd = get_rdd_sample(x_train, train_labels)
test_rdd = get_rdd_sample(x_test, test_labels)

## Define Model

In [105]:
# Parameters
learning_rate = 0.2
training_epochs = 40
# NOTE(review): rebinds the batch_size that the bottleneck-prediction cell set to 256
batch_size = 60

# Network Parameters
n_input = 1024 # length of one bottleneck vector (the Inception model ends in View([1024]))
n_classes = len(set(lbls for img, lbls in img_labels)) # item_name categories


def fc_layer(n_input, n_classes):
    """
    Build the classifier trained on the bottleneck vectors: one Linear layer
    followed by LogSoftMax.

    :param n_input: first argument of the Linear layer (size of one input vector)
    :param n_classes: second argument of the Linear layer (number of classes)
    :return: Sequential container holding the two layers
    """
    classifier = Sequential()
    classifier.add(Linear(n_input, n_classes))
    classifier.add(LogSoftMax())
    return classifier

# NOTE(review): rebinds `model` — until this cell it referred to the Caffe-initialised
# Inception network used for the bottleneck predictions.
model = fc_layer(n_input, n_classes)

creating: createSequential
creating: createLinear
creating: createLogSoftMax


In [106]:
# Wire up training of the fc classifier: SGD with ClassNLLCriterion on the training
# RDD, stopping after `training_epochs` epochs.
optimizer = Optimizer(
    model=model,
    training_rdd=train_rdd,
    criterion=ClassNLLCriterion(),
    optim_method=SGD(learningrate=learning_rate),
    end_trigger=MaxEpoch(training_epochs),
    batch_size=batch_size)
# Set the validation logic
# (top-1 accuracy on the held-out test RDD, triggered every epoch)
optimizer.set_validation(
    batch_size=batch_size,
    val_rdd=test_rdd,
    trigger=EveryEpoch(),
    val_method=[Top1Accuracy()]
)

# Train and validation summaries share one log_dir and one app_name.
# The timestamp suffix is commented out, so every run logs under the same app name.
app_name= 'vegnonveg' # + dt.datetime.now().strftime("%Y%m%d-%H%M%S")
train_summary = TrainSummary(log_dir='/tmp/bigdl_summaries',
                                     app_name=app_name)
# "Parameters" summaries are triggered every 50 iterations
train_summary.set_summary_trigger("Parameters", SeveralIteration(50))
val_summary = ValidationSummary(log_dir='/tmp/bigdl_summaries',
                                        app_name=app_name)
optimizer.set_train_summary(train_summary)
optimizer.set_val_summary(val_summary)
print "saving logs to ",app_name

creating: createClassNLLCriterion
creating: createDefault
creating: createSGD
creating: createMaxEpoch
creating: createOptimizer
creating: createEveryEpoch
creating: createTop1Accuracy
creating: createTrainSummary
creating: createSeveralIteration
creating: createValidationSummary
saving logs to  vegnonveg


In [107]:
# Start to train
# (the value returned by optimize() is kept as trained_model and used for
# prediction and evaluation in the cells below)
trained_model = optimizer.optimize()
print "Optimization Done."

Optimization Done.


In [108]:
def map_predict_label(l):
    """Return the position of the largest entry of a prediction vector (a 0-based class id)."""
    scores = np.array(l)
    return scores.argmax()
def map_groundtruth_label(l):
    """Turn a stored label (a sequence whose first entry is the 1-based id) into a 0-based id."""
    one_based = l[0]
    return one_based - 1
def map_to_label(l):
    """
    Reverse lookup in the global ``label_nums``: return the label name whose numeric id equals ``l``.

    Ids are compared with ``==``, so ``3`` and ``3.0`` find the same label. Iterating over
    ``items()`` avoids indexing ``keys()`` / ``values()``, which only works where those
    return lists (Python 2).

    :param l: 0-based numeric class id
    :return: the matching label name (the first one in dict order if several share the id)
    :raises ValueError: if no label has that id
    """
    for name, num in label_nums.items():
        if num == l:
            return name
    raise ValueError("%r is not a known label id" % (l,))

In [109]:
'''
Look at some predictions and their accuracy
'''
# model outputs for every test sample
predictions = trained_model.predict(test_rdd)

num_preds = 8
# the first num_preds entries of each RDD are compared by position
truth = test_rdd.take(num_preds)
preds = predictions.take(num_preds)
for idx in range(num_preds):
    # ground truth: stored 1-based id -> 0-based id -> label name
    true_label = str(map_to_label(map_groundtruth_label(truth[idx].label.to_ndarray())))
    # prediction: position of the largest model output -> label name
    pred_label = str(map_to_label(map_predict_label(preds[idx])))
    print idx + 1, ')', 'Ground Truth label: ', true_label
    print idx + 1, ')', 'Predicted label: ', pred_label
    print "correct" if true_label == pred_label else "wrong"

1 ) Ground Truth label:  Fresh potatoes, brown
1 ) Predicted label:  Fresh onions
wrong
2 ) Ground Truth label:  Fresh apples, typical local variety
2 ) Predicted label:  Fresh apple, red delicious
wrong
3 ) Ground Truth label:  Fresh cucumber
3 ) Predicted label:  Fresh oranges
wrong
4 ) Ground Truth label:  Fresh cucumber
4 ) Predicted label:  Fresh cucumber
correct
5 ) Ground Truth label:  Fresh apples, typical local variety
5 ) Predicted label:  Fresh apple, red delicious
wrong
6 ) Ground Truth label:  Fresh cucumber
6 ) Predicted label:  Fresh cucumber
correct
7 ) Ground Truth label:  Fresh carrots
7 ) Predicted label:  Fresh carrots
correct
8 ) Ground Truth label:  Fresh potatoes, brown
8 ) Predicted label:  Fresh potatoes, brown
correct


In [110]:
'''
Measure Test Accuracy w/Test Set
'''

# Top1Accuracy over the whole test RDD. The second positional argument (32) is
# presumably the evaluation batch size — confirm against the BigDL API.
results = trained_model.evaluate(test_rdd, 32, [Top1Accuracy()])
# one result per validation method; only Top1Accuracy was requested
print(results[0])

creating: createTop1Accuracy


## Classifier #2: Neural Net

In [111]:
# One hidden layer of 512 units; every other setting is left at its default.
# NOTE(review): no random_state is passed, so scores may differ between runs.
clf = MLPClassifier(hidden_layer_sizes=(512,))

In [112]:
%%time 
# 3-fold stratified cross-validation of the MLP on the training bottlenecks;
# the cell displays one accuracy per fold
cross_val_score(clf, x_train, train_labels, cv=StratifiedKFold(n_splits=3), scoring='accuracy')

CPU times: user 1min 24s, sys: 1min 34s, total: 2min 59s
Wall time: 46.8 s


array([ 0.69245648,  0.6692607 ,  0.69607843])

## Classifier #3: Logistic Regression

In [113]:
%%time
# same 3-fold stratified cross-validation, with a default LogisticRegression for comparison;
# the cell displays one accuracy per fold
cross_val_score(LogisticRegression(), x_train, train_labels, cv=StratifiedKFold(n_splits=3), scoring='accuracy')

CPU times: user 6.77 s, sys: 192 ms, total: 6.96 s
Wall time: 6.86 s


array([ 0.67117988,  0.6614786 ,  0.66666667])