# Import required libraries

In [3]:
import numpy as np
import tensorflow as tf
import matplotlib.cm as cm
from matplotlib import pyplot as plt 
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense
import sys
import deepdish as dd

# packages for learning from crowds
from crowd_layer.crowd_layers import CrowdsClassification, MaskedMultiCrossEntropy
from crowd_layer.crowd_aggregators import CrowdsCategoricalAggregator

# prevent tensorflow from allocating the entire GPU memory at once
# NOTE(review): tf.ConfigProto / tf.Session look like TF 1.x-style APIs --
# confirm the installed TensorFlow version before re-running this cell.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
# NOTE(review): `sess` is not referenced again in the visible cells; presumably
# creating the session is enough for the option to take effect -- TODO confirm.
sess = tf.Session(config=config)

# Configuration parameters

In [4]:
import os

# Number of repeated runs; not referenced by any cell visible in this notebook.
NUM_RUNS = 30
# Directory holding the prepared LabelMe arrays. Set the LABELME_DATA_PATH
# environment variable to point somewhere else instead of editing this cell;
# the default keeps the original location. os.path.join(..., "") guarantees a
# trailing separator because paths are built as DATA_PATH + filename.
DATA_PATH = os.path.join(
    os.environ.get("LABELME_DATA_PATH", "/Users/diment/Downloads/LabelMe/prepared/"),
    "")
# Number of target classes used for every one-hot encoding and output layer.
N_CLASSES = 8
# Mini-batch size and number of epochs shared by all training cells.
BATCH_SIZE = 64
N_EPOCHS = 50

# Load data

In [5]:
import sys
def load_data(filename):
    """Load one prepared dataset array from disk.

    Under Python 2 the ``.npy`` file at ``filename`` is read with ``np.load``.
    Under Python 3 the sibling ``.h5`` file (the same path with ``.npy``
    replaced by ``.h5``) is read with ``deepdish`` instead.

    Args:
        filename: Path of the ``.npy`` file; the ``.h5`` path is derived from it.

    Returns:
        Whatever ``np.load`` / ``dd.io.load`` returns for that file.
    """
    if sys.version_info[0] < 3:
        # Binary mode keeps the bytes intact on every platform, and the
        # context manager closes the handle even if np.load raises.
        with open(filename, 'rb') as f:
            data = np.load(f)
    else:
        data = dd.io.load(filename.replace('.npy', '.h5'))
    return data

In [6]:
def _load_and_report(basename):
    """Load ``basename`` from DATA_PATH through load_data and print its shape."""
    loaded = load_data(DATA_PATH + basename)
    print(loaded.shape)
    return loaded


# ---- training split ----
print("\nLoading train data...")
data_train_vgg16 = _load_and_report("data_train_vgg16.npy")  # images processed by VGG16
labels_train = _load_and_report("labels_train.npy")          # ground truth labels
labels_train_mv = _load_and_report("labels_train_mv.npy")    # labels from majority voting
labels_train_ds = _load_and_report("labels_train_DS.npy")    # labels from Dawid and Skene

# ---- crowd answers from Amazon Mechanical Turk ----
print("\nLoading AMT data...")
answers = _load_and_report("answers.npy")
# one column of `answers` per annotator
N_ANNOT = answers.shape[1]
print("\nN_CLASSES:", N_CLASSES)
print("N_ANNOT:", N_ANNOT)

# ---- test split ----
print("\nLoading test data...")
data_test_vgg16 = _load_and_report("data_test_vgg16.npy")    # images processed by VGG16
labels_test = _load_and_report("labels_test.npy")            # test labels


Loading train data...
(10000, 4, 4, 512)
(10000,)
(10000,)
(10000,)

Loading AMT data...
(10000, 59)

N_CLASSES: 8
N_ANNOT: 59

Loading test data...
(1188, 4, 4, 512)
(1188,)


In [7]:
# One-off conversion, disabled by default: switch the guard to True and run
# under python2 to write an .h5 copy of every .npy array, which is the file
# load_data reads under python3.
if False:
    import deepdish as dd
    npy_files = (
        'data_train_vgg16.npy',
        'labels_train.npy',
        'labels_train_mv.npy',
        'labels_train_DS.npy',
        'answers.npy',
        'data_test_vgg16.npy',
        'labels_test.npy',
    )
    for filename in npy_files:
        data = load_data(DATA_PATH + filename)
        dd.io.save(DATA_PATH + filename.replace('.npy', '.h5'), data)


# Convert data to one-hot encoding

In [8]:
def one_hot(target, n_classes):
    """One-hot encode class indices.

    Args:
        target: A single class index or an array-like of indices; it is
            flattened to one dimension before encoding.
        n_classes: Width of each one-hot row.

    Returns:
        Float array with one row per index and ``n_classes`` columns.
    """
    flat_indices = np.asarray([target]).ravel()
    # Row k of the identity matrix is the one-hot vector for class k.
    return np.eye(n_classes)[flat_indices]

In [9]:
print("\nConverting to one-hot encoding...")
labels_train_bin = one_hot(labels_train, N_CLASSES)
print(labels_train_bin.shape)
labels_train_mv_bin = one_hot(labels_train_mv, N_CLASSES)
print(labels_train_mv_bin.shape)
labels_train_ds_bin = one_hot(labels_train_ds, N_CLASSES)
print(labels_train_ds_bin.shape)
labels_test_bin = one_hot(labels_test, N_CLASSES)
print(labels_test_bin.shape)

# One-hot encode every annotator answer in one vectorised step instead of a
# Python loop over every (item, annotator) pair. Missing answers are stored as
# -1 in `answers` and become an all -1 vector.
answers_missing = (answers == -1)
# Missing entries are indexed as class 0 here only as a placeholder; the next
# line overwrites their whole class vector with -1.
answers_bin_missings = np.eye(N_CLASSES)[np.where(answers_missing, 0, answers)]
answers_bin_missings[answers_missing] = -1
# (items, annotators, classes) -> (items, classes, annotators)
answers_bin_missings = answers_bin_missings.swapaxes(1,2)
answers_bin_missings.shape


Converting to one-hot encoding...
(10000, 8)
(10000, 8)
(10000, 8)
(1188, 8)


(10000, 8, 59)

# Define the base deep learning model

Here we use the feature representation produced by the VGG16 network as the input. Our base model is then simply composed of one densely-connected layer with 128 hidden units and an output dense layer. We use 50% dropout between the two dense layers.

In [19]:
def build_base_model():
    """Build and compile the base classifier on top of the VGG16 features.

    The input shape is taken from ``data_train_vgg16`` (all axes but the
    first) and the output width from ``N_CLASSES``.

    Returns:
        A compiled ``Sequential`` model (adam optimizer, categorical
        cross-entropy loss).
    """
    # Softmax stays a separate Activation layer so the stack keeps the same
    # five layers, in the same order, as when they are added one by one.
    layer_stack = [
        Flatten(input_shape=data_train_vgg16.shape[1:]),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(N_CLASSES),
        Activation("softmax"),
    ]
    base_model = Sequential(layer_stack)
    base_model.compile(optimizer='adam', loss='categorical_crossentropy')
    return base_model

# Auxiliary function for evaluating the models

In [20]:
def eval_model(model, test_data, test_labels):
    """Return the accuracy of ``model`` on a labelled test set.

    Args:
        model: Object exposing ``predict(test_data)`` that returns one row of
            class scores per sample.
        test_data: Inputs passed unchanged to ``model.predict``.
        test_labels: Integer class label per sample.

    Returns:
        Fraction of samples whose arg-max prediction equals the label.
    """
    predicted_classes = np.argmax(model.predict(test_data), axis=1)
    n_correct = np.sum(predicted_classes == test_labels)
    return 1.0 * n_correct / len(test_labels)

# Train the model on the true labels (ground truth) and evaluate on testset

In [21]:
# Fresh base model fitted on the one-hot ground-truth labels.
model = build_base_model()
model.fit(
    data_train_vgg16,
    labels_train_bin,
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
    shuffle=True,
    verbose=2,
)

Epoch 1/50
 - 2s - loss: 0.5070
Epoch 2/50
 - 2s - loss: 0.1990
Epoch 3/50
 - 2s - loss: 0.1242
Epoch 4/50
 - 3s - loss: 0.0821
Epoch 5/50
 - 3s - loss: 0.0639
Epoch 6/50
 - 2s - loss: 0.0504
Epoch 7/50
 - 2s - loss: 0.0431
Epoch 8/50
 - 2s - loss: 0.0407
Epoch 9/50
 - 2s - loss: 0.0337
Epoch 10/50
 - 2s - loss: 0.0301
Epoch 11/50
 - 2s - loss: 0.0373
Epoch 12/50
 - 2s - loss: 0.0413
Epoch 13/50
 - 2s - loss: 0.0280
Epoch 14/50
 - 2s - loss: 0.0275
Epoch 15/50
 - 2s - loss: 0.0287
Epoch 16/50
 - 2s - loss: 0.0464
Epoch 17/50
 - 2s - loss: 0.0277
Epoch 18/50
 - 3s - loss: 0.0295
Epoch 19/50
 - 2s - loss: 0.0210
Epoch 20/50
 - 2s - loss: 0.0188
Epoch 21/50
 - 3s - loss: 0.0240
Epoch 22/50
 - 3s - loss: 0.0295
Epoch 23/50
 - 2s - loss: 0.0360
Epoch 24/50
 - 2s - loss: 0.0351
Epoch 25/50
 - 2s - loss: 0.0304
Epoch 26/50
 - 2s - loss: 0.0230
Epoch 27/50
 - 2s - loss: 0.0272
Epoch 28/50
 - 3s - loss: 0.0232
Epoch 29/50
 - 2s - loss: 0.0201
Epoch 30/50
 - 2s - loss: 0.0197
Epoch 31/50
 - 3s -

<keras.callbacks.History at 0xb20d2feb8>

In [13]:
# Test-set accuracy of whatever `model` currently holds.
accuracy_test = eval_model(model, data_test_vgg16, labels_test)
print("Accuracy: Test: %.3f" % accuracy_test)

Accuracy: Test: 0.908


# Train the model on the output of majority voting and evaluate on testset

In [14]:
# Fresh base model fitted on the one-hot majority-voting labels.
model = build_base_model()
model.fit(
    data_train_vgg16,
    labels_train_mv_bin,
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
    shuffle=True,
    verbose=2,
)

Epoch 1/50
 - 2s - loss: 0.8617
Epoch 2/50
 - 2s - loss: 0.5076
Epoch 3/50
 - 2s - loss: 0.3659
Epoch 4/50
 - 2s - loss: 0.2777
Epoch 5/50
 - 2s - loss: 0.2117
Epoch 6/50
 - 2s - loss: 0.1678
Epoch 7/50
 - 2s - loss: 0.1378
Epoch 8/50
 - 2s - loss: 0.1323
Epoch 9/50
 - 2s - loss: 0.1161
Epoch 10/50
 - 2s - loss: 0.0911
Epoch 11/50
 - 2s - loss: 0.0837
Epoch 12/50
 - 3s - loss: 0.0865
Epoch 13/50
 - 2s - loss: 0.0957
Epoch 14/50
 - 2s - loss: 0.0752
Epoch 15/50
 - 2s - loss: 0.0701
Epoch 16/50
 - 2s - loss: 0.0694
Epoch 17/50
 - 2s - loss: 0.0840
Epoch 18/50
 - 2s - loss: 0.0702
Epoch 19/50
 - 2s - loss: 0.0723
Epoch 20/50
 - 2s - loss: 0.0650
Epoch 21/50
 - 2s - loss: 0.0554
Epoch 22/50
 - 2s - loss: 0.0560
Epoch 23/50
 - 2s - loss: 0.0656
Epoch 24/50
 - 2s - loss: 0.0652
Epoch 25/50
 - 2s - loss: 0.0539
Epoch 26/50
 - 2s - loss: 0.0618
Epoch 27/50
 - 2s - loss: 0.0630
Epoch 28/50
 - 2s - loss: 0.0467
Epoch 29/50
 - 2s - loss: 0.0617
Epoch 30/50
 - 2s - loss: 0.0516
Epoch 31/50
 - 2s -

<keras.callbacks.History at 0xb37600b38>

In [15]:
# Test-set accuracy of whatever `model` currently holds.
accuracy_test = eval_model(model, data_test_vgg16, labels_test)
print("Accuracy: Test: %.3f" % accuracy_test)

Accuracy: Test: 0.774


# Train the model on the output of Dawid & Skene [1] and evaluate on testset

[1] Dawid, A.P. and Skene, A.M., 1979. Maximum likelihood estimation of observer error-rates using the EM algorithm. Applied statistics, pp.20-28.

In [16]:
# Fresh base model fitted on the one-hot Dawid & Skene labels.
model = build_base_model()
model.fit(
    data_train_vgg16,
    labels_train_ds_bin,
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
    shuffle=True,
    verbose=2,
)

Epoch 1/50
 - 2s - loss: 0.8893
Epoch 2/50
 - 2s - loss: 0.5147
Epoch 3/50
 - 2s - loss: 0.3767
Epoch 4/50
 - 2s - loss: 0.2873
Epoch 5/50
 - 2s - loss: 0.2247
Epoch 6/50
 - 2s - loss: 0.1860
Epoch 7/50
 - 2s - loss: 0.1540
Epoch 8/50
 - 2s - loss: 0.1347
Epoch 9/50
 - 2s - loss: 0.1304
Epoch 10/50
 - 2s - loss: 0.1096
Epoch 11/50
 - 2s - loss: 0.0868
Epoch 12/50
 - 2s - loss: 0.0852
Epoch 13/50
 - 2s - loss: 0.0790
Epoch 14/50
 - 2s - loss: 0.0768
Epoch 15/50
 - 2s - loss: 0.0694
Epoch 16/50
 - 2s - loss: 0.0624
Epoch 17/50
 - 2s - loss: 0.0837
Epoch 18/50
 - 2s - loss: 0.0765
Epoch 19/50
 - 2s - loss: 0.0801
Epoch 20/50
 - 2s - loss: 0.0743
Epoch 21/50
 - 2s - loss: 0.0603
Epoch 22/50
 - 2s - loss: 0.0545
Epoch 23/50
 - 2s - loss: 0.0656
Epoch 24/50
 - 2s - loss: 0.0541
Epoch 25/50
 - 2s - loss: 0.0689
Epoch 26/50
 - 2s - loss: 0.0605
Epoch 27/50
 - 2s - loss: 0.0607
Epoch 28/50
 - 2s - loss: 0.0646
Epoch 29/50
 - 2s - loss: 0.0556
Epoch 30/50
 - 2s - loss: 0.0564
Epoch 31/50
 - 2s -

<keras.callbacks.History at 0xb375ec9b0>

In [17]:
# Test-set accuracy of whatever `model` currently holds.
accuracy_test = eval_model(model, data_test_vgg16, labels_test)
print("Accuracy: Test: %.3f" % accuracy_test)

Accuracy: Test: 0.806


# Train the model using EM approach and evaluate on testset

The CrowdsCategoricalAggregator class acts as a wrapper for the base model that computes the EM steps.

In [18]:
model = build_base_model()
crowds_agg = CrowdsCategoricalAggregator(model, data_train_vgg16, answers, batch_size=BATCH_SIZE)
for epoch_number in range(1, N_EPOCHS + 1):
    print("Epoch:", epoch_number)

    # E-step: obtain the aggregator's current ground-truth estimate and report
    # how often its arg-max agrees with the real training labels
    ground_truth_est = crowds_agg.e_step()
    estimated_labels = np.argmax(ground_truth_est, axis=1)
    n_matching = np.sum(estimated_labels == labels_train)
    print("Adjusted ground truth accuracy:", 1.0 * n_matching / len(labels_train))

    # M-step: returns the updated model together with `pi`
    model, pi = crowds_agg.m_step()

Epoch: 1
E-step
Adjusted ground truth accuracy: 0.769
M-step
loss: 0.8985816038131714
Epoch: 2
E-step
Adjusted ground truth accuracy: 0.8478
M-step
loss: 0.09465183200836182
Epoch: 3
E-step
Adjusted ground truth accuracy: 0.8467
M-step
loss: 0.08496349008083344
Epoch: 4
E-step
Adjusted ground truth accuracy: 0.8568
M-step
loss: 0.07329484004974365
Epoch: 5
E-step
Adjusted ground truth accuracy: 0.8599
M-step


KeyboardInterrupt: 

In [None]:
# Test-set accuracy of `model` as left by the EM loop.
accuracy_test = eval_model(model, data_test_vgg16, labels_test)
print("Accuracy: Test: %.3f" % accuracy_test)

# Train the model using proposed DL-MW approach and evaluate on testset

We start by adding a new layer (CrowdsClassification) on top of our neural network. We then require a special loss (MaskedMultiCrossEntropy) to handle the missing labels from some of the annotators (encoded as "-1").

Notice how the training is faster than the EM approach.

In [24]:
# Base network with one crowds layer stacked on top of its softmax output
model = build_base_model()
model.add(CrowdsClassification(N_CLASSES, N_ANNOT, conn_type="MW"))

# Specialized masked loss to handle missing answers
loss = MaskedMultiCrossEntropy().loss

# Re-compile with the masked loss, then train against the per-annotator
# one-hot answers
model.compile(loss=loss, optimizer='adam')
model.fit(
    data_train_vgg16,
    answers_bin_missings,
    batch_size=BATCH_SIZE,
    epochs=N_EPOCHS,
    shuffle=True,
    verbose=2,
)

Epoch 1/50
 - 3s - loss: 0.0712
Epoch 2/50
 - 2s - loss: 0.0621
Epoch 3/50
 - 2s - loss: 0.0572
Epoch 4/50
 - 2s - loss: 0.0527
Epoch 5/50
 - 3s - loss: 0.0494
Epoch 6/50
 - 2s - loss: 0.0459
Epoch 7/50
 - 2s - loss: 0.0431
Epoch 8/50
 - 2s - loss: 0.0402
Epoch 9/50
 - 2s - loss: 0.0379
Epoch 10/50
 - 2s - loss: 0.0363
Epoch 11/50
 - 3s - loss: 0.0345
Epoch 12/50
 - 3s - loss: 0.0328
Epoch 13/50
 - 2s - loss: 0.0310
Epoch 14/50
 - 2s - loss: 0.0297
Epoch 15/50
 - 2s - loss: 0.0287
Epoch 16/50
 - 2s - loss: 0.0281
Epoch 17/50
 - 2s - loss: 0.0270
Epoch 18/50
 - 2s - loss: 0.0259
Epoch 19/50
 - 2s - loss: 0.0251
Epoch 20/50
 - 2s - loss: 0.0244
Epoch 21/50
 - 2s - loss: 0.0235
Epoch 22/50
 - 2s - loss: 0.0230
Epoch 23/50
 - 2s - loss: 0.0225
Epoch 24/50
 - 2s - loss: 0.0222
Epoch 25/50
 - 3s - loss: 0.0217
Epoch 26/50
 - 3s - loss: 0.0207
Epoch 27/50
 - 3s - loss: 0.0207
Epoch 28/50
 - 2s - loss: 0.0204
Epoch 29/50
 - 3s - loss: 0.0198
Epoch 30/50
 - 3s - loss: 0.0194
Epoch 31/50
 - 3s -

<keras.callbacks.History at 0xb430249e8>

Before evaluating our model, we need to remove the crowds layer used during training in order to expose the aggregation (bottleneck) layer

In [25]:
# The crowds layer is the last layer added during training. Looking it up by
# position from the end and checking its type avoids a hard-coded layer index
# and makes this cell safe to re-run: once the layer is gone, the previously
# saved `weights` are kept and nothing else is popped.
crowds_layer = model.layers[-1]
if isinstance(crowds_layer, CrowdsClassification):
    # save weights from crowds layer for later
    weights = crowds_layer.get_weights()

    # remove crowds layer before making predictions
    model.pop()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

accuracy_test = eval_model(model, data_test_vgg16, labels_test)
print("Accuracy: Test: %.3f" % (accuracy_test,))

Accuracy: Test: 0.824


# Compare the weights learned by the crowds layer for each annotator with their true confusion matrices

First, compute true confusion matrices:

In [None]:
# conf_mats[c, a, r]: share of annotator r's answers that were `a` among the
# items whose ground-truth label is c. num_answers[c, r]: number of answers
# annotator r gave on items of class c. Both start from a tiny constant so the
# division below is defined for annotators who never saw a class.
conf_mats = np.zeros((N_CLASSES,N_CLASSES,N_ANNOT)) + 0.00000001
num_answers = np.zeros((N_CLASSES,N_ANNOT)) + (0.00000001 * N_CLASSES)

# Every (item, annotator) pair with an actual answer (-1 marks a missing one).
item_idx, annot_idx = np.nonzero(answers != -1)
true_classes = labels_train[item_idx]
# np.add.at accumulates repeated index tuples, unlike `arr[idx] += 1`.
np.add.at(num_answers, (true_classes, annot_idx), 1)
np.add.at(conf_mats, (true_classes, answers[item_idx, annot_idx], annot_idx), 1)

# Normalise over the answered class, per (true class, annotator).
conf_mats = conf_mats / num_answers[:, np.newaxis, :]

Auxiliary function that makes a visual comparison:

In [None]:
def compare_conf_mats(true_conf_mat, weights):
    """Show a true confusion matrix next to a learned weight matrix.

    Args:
        true_conf_mat: 2-D array drawn unchanged in the left panel ("True").
        weights: 2-D array; it is transposed and min-max rescaled to [0, 1]
            before being drawn in the right panel ("Estimated").

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # normalize weights matrix between 0 and 1: shift by the actual minimum
    # (adding abs(min) is only a correct shift when the minimum is <= 0)
    w_mat = np.transpose(weights) - weights.min()
    w_range = w_mat.max()
    if w_range > 0:
        w_mat = w_mat / w_range
    # a constant matrix has zero range; it is left as all zeros rather than
    # dividing by zero

    plt.subplot(1,2,1)
    plt.imshow(true_conf_mat, interpolation='nearest', cmap=cm.YlOrRd)
    plt.title("True")

    plt.subplot(1,2,2)
    plt.imshow(w_mat, interpolation='nearest', cmap=cm.YlOrRd)
    plt.title("Estimated")

    plt.show()

Make comparison for various annotators:

In [None]:
# Annotator index 1: true confusion matrix vs. slice of the first saved crowds-layer weight array.
compare_conf_mats(conf_mats[:,:,1], weights[0][:,:,1])

In [None]:
# Annotator index 2: true confusion matrix vs. slice of the first saved crowds-layer weight array.
compare_conf_mats(conf_mats[:,:,2], weights[0][:,:,2])

In [None]:
# Annotator index 9: true confusion matrix vs. slice of the first saved crowds-layer weight array.
compare_conf_mats(conf_mats[:,:,9], weights[0][:,:,9])

In [None]:
# Annotator index 20: true confusion matrix vs. slice of the first saved crowds-layer weight array.
compare_conf_mats(conf_mats[:,:,20], weights[0][:,:,20])

In [None]:
# Annotator index 23: true confusion matrix vs. slice of the first saved crowds-layer weight array.
compare_conf_mats(conf_mats[:,:,23], weights[0][:,:,23])

In [None]:
# Annotator index 30: true confusion matrix vs. slice of the first saved crowds-layer weight array.
compare_conf_mats(conf_mats[:,:,30], weights[0][:,:,30])

In [None]:
# Annotator index 36: true confusion matrix vs. slice of the first saved crowds-layer weight array.
compare_conf_mats(conf_mats[:,:,36], weights[0][:,:,36])

In [None]:
# Annotator index 39: true confusion matrix vs. slice of the first saved crowds-layer weight array.
compare_conf_mats(conf_mats[:,:,39], weights[0][:,:,39])

In [None]:
# Annotator index 45: true confusion matrix vs. slice of the first saved crowds-layer weight array.
compare_conf_mats(conf_mats[:,:,45], weights[0][:,:,45])

In [None]:
# Annotator index 56: true confusion matrix vs. slice of the first saved crowds-layer weight array.
compare_conf_mats(conf_mats[:,:,56], weights[0][:,:,56])

In [None]:
# Annotator index 58: true confusion matrix vs. slice of the first saved crowds-layer weight array.
compare_conf_mats(conf_mats[:,:,58], weights[0][:,:,58])