# Problem 4 &mdash; Model Evaluation

We'll evaluate the model against the test data.

In [1]:
from cse547.data import CocoMultiLabelFeaturesDataset, FlattenTensorTransform, OneShotDataLoader
from cse547.models import LinearClassifier
from cse547.s3 import deserialize_object

# S3 bucket and object key of the serialized model state that this notebook evaluates.
S3_BUCKET = 'cse-547'
S3_MODEL_KEY = 'hw2/train/model_1525498537.pkl' # Key produced by training; see train_s3_model_key.
# Dataset size variant; it should match the `size` used when building the dataset and when training the model.
DATA_SIZE = 'tiny' # Ensure that the model was trained on data of the same size.

Let's load the data as usual. We want to evaluate against the `test` set: the `training` set was used to compute gradients and the `validation` set was used to select a model, so only the `test` set has played no part in fitting or choosing the model.

In [2]:
# Load the held-out test split. The size comes from DATA_SIZE (set in the configuration
# cell) rather than a second hard-coded literal, so the evaluation data cannot silently
# drift from the configured size.
dataset = CocoMultiLabelFeaturesDataset('/data', mode='test', size=DATA_SIZE, transform=FlattenTensorTransform())
# Wrap the dataset in the project's one-shot loader, which is iterated once below to get the evaluation batch.
data_loader = OneShotDataLoader(dataset)

loading annotations into memory...
Done (t=14.32s)
creating index...
index created!


In [3]:
# Fetch the serialized model state from S3 using the bucket and key configured above.
# NOTE(review): the key ends in .pkl, so this presumably unpickles the object; only
# point it at a trusted bucket. Confirm against deserialize_object.
state_dict = deserialize_object(S3_BUCKET, S3_MODEL_KEY)
# Display the loaded state to inspect which parameters it holds.
state_dict

OrderedDict([('fc1.weights', 
               1.0804e-03  2.4114e-03  2.8235e-04  ...  -7.2165e-04 -1.1089e-03 -1.1209e-03
               1.8562e-03  6.3103e-04  1.7021e-03  ...   2.9014e-05 -5.1436e-04 -2.4083e-03
              -2.1231e-03  1.1507e-03  1.9464e-03  ...   8.3683e-04 -1.9026e-05 -1.4915e-03
                              ...                   ⋱                   ...                
               1.6766e-05 -5.1669e-03 -2.2196e-03  ...   1.1093e-03  3.1030e-03  1.8658e-03
              -1.8034e-03 -2.1368e-03 -3.5687e-03  ...  -5.5807e-05  1.9003e-03  6.8568e-04
              -1.3327e-03  1.4789e-03  7.3115e-04  ...  -1.5257e-05  2.7027e-05  1.8777e-03
              [torch.FloatTensor of size 5408x18])])

In [4]:
# Size the classifier from the first example: one input per feature entry,
# one output per label entry.
n_features = len(dataset[0]['features'])
n_labels = len(dataset[0]['label'])
model = LinearClassifier(n_features, n_labels)

# This notebook only evaluates, so no parameter needs a gradient.
for weight in model.parameters():
    weight.requires_grad = False

# Print the parameters before and after loading the trained state so that the
# replacement of the initial values is visible in the output.
print(list(model.parameters()))
model.load_state_dict(state_dict)
print(list(model.parameters()))

[Variable containing:
 3.3007e-03 -5.0398e-03 -8.4765e-04  ...  -1.3972e-03 -5.6444e-03  2.5557e-03
-2.7419e-03  5.0667e-03  2.1821e-03  ...  -1.9491e-03 -1.6532e-03  5.3134e-03
 3.1108e-03  4.1991e-03 -7.8125e-03  ...  -9.5346e-04  3.1182e-03  1.1410e-03
                ...                   ⋱                   ...                
-3.7840e-03  3.5785e-03 -7.8125e-03  ...   1.5059e-03 -3.9846e-03 -4.2174e-03
-2.2392e-04  7.2679e-03  2.0153e-03  ...  -3.9580e-03  1.4277e-03  7.8125e-03
 3.1081e-03 -1.6838e-03 -3.5008e-03  ...   1.7304e-03 -3.2286e-03  2.1057e-03
[torch.FloatTensor of size 5408x18]
]
[Variable containing:
 1.0804e-03  2.4114e-03  2.8235e-04  ...  -7.2165e-04 -1.1089e-03 -1.1209e-03
 1.8562e-03  6.3103e-04  1.7021e-03  ...   2.9014e-05 -5.1436e-04 -2.4083e-03
-2.1231e-03  1.1507e-03  1.9464e-03  ...   8.3683e-04 -1.9026e-05 -1.4915e-03
                ...                   ⋱                   ...                
 1.6766e-05 -5.1669e-03 -2.2196e-03  ...   1.1093e-03  3.103

In [5]:
# Print the model's parameters again to confirm the loaded weights are still in place.
loaded_parameters = list(model.parameters())
print(loaded_parameters)

[Variable containing:
 1.0804e-03  2.4114e-03  2.8235e-04  ...  -7.2165e-04 -1.1089e-03 -1.1209e-03
 1.8562e-03  6.3103e-04  1.7021e-03  ...   2.9014e-05 -5.1436e-04 -2.4083e-03
-2.1231e-03  1.1507e-03  1.9464e-03  ...   8.3683e-04 -1.9026e-05 -1.4915e-03
                ...                   ⋱                   ...                
 1.6766e-05 -5.1669e-03 -2.2196e-03  ...   1.1093e-03  3.1030e-03  1.8658e-03
-1.8034e-03 -2.1368e-03 -3.5687e-03  ...  -5.5807e-05  1.9003e-03  6.8568e-04
-1.3327e-03  1.4789e-03  7.3115e-04  ...  -1.5257e-05  2.7027e-05  1.8777e-03
[torch.FloatTensor of size 5408x18]
]


In [6]:
from torch.autograd import Variable
from torch.nn import functional

# Pull the first batch from the loader. The builtin next() is used instead of the
# Python 2-style .next() method, which Python 3 iterators are not required to provide;
# next() works under both protocols.
data_dict = next(iter(data_loader))
# volatile=True is the legacy (pre-0.4) PyTorch flag for inference-only Variables.
features = Variable(data_dict['features'], volatile=True)
labels = Variable(data_dict['label'], volatile=True)
# Squash the model's raw outputs through a sigmoid to get an independent score in (0, 1) for each label.
predictions = functional.sigmoid(model(features))

In [7]:
from sklearn.metrics import average_precision_score

# Overall average precision on the evaluation batch, using scikit-learn's default
# averaging across labels. Both Variables are unwrapped to NumPy arrays first.
y_true = labels.data.numpy()
y_score = predictions.data.numpy()
average_precision_score(y_true, y_score)

0.4819263272665726

In [8]:
# Label names exposed by the dataset, shown for reference when reading the per-label results below.
dataset.label_names

['bicycle',
 'car',
 'motorcycle',
 'airplane',
 'bus',
 'train',
 'truck',
 'boat',
 'bird',
 'cat',
 'dog',
 'horse',
 'sheep',
 'cow',
 'elephant',
 'bear',
 'zebra',
 'giraffe']

In [9]:
from collections import OrderedDict
import numpy as np
import pandas as pd

In [10]:
# Per-label breakdown of the evaluation.
# NOTE: `labels` comes from the dataset loaded with mode='test', so the counts below are
# positives in the evaluated split even though the column is named 'training_observations'
# (the name is kept so the recorded output and any consumers still match).
label_matrix = labels.data.numpy()
evaluation_by_label = pd.DataFrame(OrderedDict([
    ('label_name', dataset.label_names),
    # Column-wise sum gives the number of positive examples per label. The builtin int
    # replaces the np.int alias, which NumPy deprecated in 1.20 and removed in 1.24.
    ('training_observations', np.sum(label_matrix, axis=0, dtype=int)),
    # average=None returns one score per label instead of a single aggregate.
    ('average_precision_score', average_precision_score(label_matrix, predictions.data.numpy(), average=None)),
]))

In [11]:
# Display the per-label evaluation table built in the previous cell.
evaluation_by_label

Unnamed: 0,label_name,training_observations,average_precision_score
0,bicycle,201,0.370129
1,car,752,0.747794
2,motorcycle,192,0.514566
3,airplane,95,0.637784
4,bus,257,0.562634
5,train,74,0.337445
6,truck,458,0.515644
7,boat,137,0.350461
8,bird,217,0.316766
9,cat,96,0.533058
