# Train YOLO object detector with Turi Create

In [1]:
import os, sys, math
import pandas as pd
import turicreate as tc

Helper code for loading the CSV file and combining it with an SFrame. We only keep the images that we have annotations for.

In [2]:
def load_images_with_annotations(images_dir, annotations_file):
    """Load a folder of images and attach bounding-box annotations from a CSV.

    Images that have no rows in the CSV are dropped from the result.

    Args:
        images_dir: Directory handed to ``tc.image_analysis.load_images``.
        annotations_file: Path of the annotations CSV (anything accepted by
            ``pd.read_csv``). It must have an ``image_id`` column holding the
            image file name without its extension. The other values are read
            by position: the 2nd to 5th CSV columns are x-min, x-max, y-min
            and y-max as fractions (0..1) of the image size, and the 6th
            column is the class label.

    Returns:
        The SFrame produced by ``load_images`` with an extra ``annotations``
        column: one list per image holding a
        ``{"coordinates": {...}, "label": ...}`` dictionary per bounding box.
    """
    # Load the images into a Turi SFrame.
    data = tc.image_analysis.load_images(images_dir, with_path=True)

    # Load the annotations CSV file into a Pandas dataframe. Force image_id
    # to be read as text so that ids made up only of digits keep their
    # leading zeros and still match the file names.
    annotations_df = pd.read_csv(annotations_file, dtype={"image_id": str})

    # Group the rows by image once, so that finding the boxes for an image is
    # a dictionary lookup instead of a scan over the whole dataframe.
    rows_by_image = {image_id: rows
                     for image_id, rows in annotations_df.groupby("image_id")}

    # Loop through all the images and match these to the annotations from the
    # CSV file, if annotations are available for the image.
    all_annotations = []
    for item in data:
        # Grab image info from the SFrame.
        img_width = item["image"].width
        img_height = item["image"].height

        # The image id is the file name without its extension. splitext
        # copes with extensions of any length (".jpg", ".jpeg", ...).
        image_id = os.path.splitext(os.path.basename(item["path"]))[0]
        rows = rows_by_image.get(image_id)

        # Turi expects a list for every image that contains a dictionary for
        # every bounding box that we have an annotation for.
        img_annotations = []
        if rows is not None:
            for row in rows.itertuples():
                # The CSV file stores the coordinates as numbers between 0
                # and 1, but Turi wants pixel coordinates in the image.
                # row[0] is the dataframe index, so the CSV values start at 1.
                xmin = int(round(row[2] * img_width))
                xmax = int(round(row[3] * img_width))
                ymin = int(round(row[4] * img_height))
                ymax = int(round(row[5] * img_height))

                # A bounding box in Turi is given by a center coordinate and
                # the width and height; we have the four edges of the box.
                width = xmax - xmin
                height = ymax - ymin

                img_annotations.append({
                    "coordinates": {"height": height,
                                    "width": width,
                                    "x": xmin + width // 2,
                                    "y": ymin + height // 2},
                    "label": row[6],
                })

        # If there were no annotations for this image, then append a None
        # so that dropna() below filters the image out of the SFrame.
        all_annotations.append(img_annotations or None)

    data["annotations"] = tc.SArray(data=all_annotations, dtype=list)
    return data.dropna()

In [3]:
# Root folder of the dataset; every split lives in its own subfolder.
data_dir = "snacks"
train_dir, val_dir, test_dir = (
    os.path.join(data_dir, split) for split in ("train", "val", "test")
)

In [4]:
# Load the training images together with their bounding-box annotations.
# The CSV path is built with os.path.join, like the split folders above.
train_data = load_images_with_annotations(
    train_dir, os.path.join(data_dir, "annotations-train.csv"))

In [5]:
# Number of training images that have at least one annotation (the loader
# drops images without annotations).
len(train_data)

4265

In [6]:
# Preview the first rows: the image path, the image and its annotations.
train_data.head()

path,image,annotations
snacks/train/apple/007a0b ec00a90a66.jpg ...,Height: 341 Width: 256,"[{'coordinates': {'y': 113, 'x': 73, 'width': ..."
snacks/train/apple/007ec5 6b6529e036.jpg ...,Height: 256 Width: 341,"[{'coordinates': {'y': 149, 'x': 185, 'width': ..."
snacks/train/apple/008816 27629888f6.jpg ...,Height: 256 Width: 384,"[{'coordinates': {'y': 210, 'x': 184, 'width': ..."
snacks/train/apple/00bb57 20a7ba062e.jpg ...,Height: 256 Width: 341,"[{'coordinates': {'y': 126, 'x': 169, 'width': ..."
snacks/train/apple/00cc1c 601b23f73d.jpg ...,Height: 256 Width: 341,"[{'coordinates': {'y': 170, 'x': 249, 'width': ..."
snacks/train/apple/01477e a37494a8ac.jpg ...,Height: 256 Width: 382,"[{'coordinates': {'y': 89, 'x': 107, 'width': ..."
snacks/train/apple/018257 a792aa90e6.jpg ...,Height: 256 Width: 256,"[{'coordinates': {'y': 58, 'x': 163, 'width': ..."
snacks/train/apple/01968e a73417ec3b.jpg ...,Height: 256 Width: 256,"[{'coordinates': {'y': 166, 'x': 143, 'width': ..."
snacks/train/apple/01ecc0 3a12e21e39.jpg ...,Height: 256 Width: 446,"[{'coordinates': {'y': 117, 'x': 232, 'width': ..."
snacks/train/apple/021d25 69ce62aa93.jpg ...,Height: 256 Width: 341,"[{'coordinates': {'y': 39, 'x': 29, 'width': ..."


In [7]:
# Show one complete row to check the annotation format: a list with one
# {'coordinates': ..., 'label': ...} dictionary per bounding box.
train_data[0]

{'annotations': [{'coordinates': {'height': 95,
    'width': 112,
    'x': 73,
    'y': 113},
   'label': 'apple'},
  {'coordinates': {'height': 91, 'width': 116, 'x': 76, 'y': 118},
   'label': 'apple'},
  {'coordinates': {'height': 103, 'width': 114, 'x': 75, 'y': 111},
   'label': 'apple'}],
 'image': Height: 341px
 Width: 256px
 Channels: 3,
 'path': 'snacks/train/apple/007a0bec00a90a66.jpg'}

Visualize the bounding boxes on top of the training data:

In [None]:
# Draw the annotated boxes on the training images and keep the result in a
# new column, then open Turi Create's interactive viewer on the SFrame.
# NOTE(review): the new column stays in train_data, which is the SFrame that
# the training cell below receives; that cell names its feature and
# annotations columns explicitly.
train_data['image_with_ground_truth'] = tc.object_detector.util.draw_bounding_boxes(
                                            train_data['image'], train_data['annotations'])
train_data.explore()

Train the model. This first downloads the Darknet feature extractor.

In [5]:
# Train the detector on the 'image' column, using the boxes stored in the
# 'annotations' column. batch_size and max_iterations are not passed here, so
# Turi Create chooses them (32 and 13000 in the log of the run shown below).
model = tc.object_detector.create(train_data, feature='image', annotations='annotations')

Downloading https://docs-assets.developer.apple.com/turicreate/models/darknet.params
Download completed: /var/tmp/model_cache/darknet.params
Setting 'batch_size' to 32
Using GPU to create model (GeForce GTX 1080 Ti)
Setting 'max_iterations' to 13000
+--------------+--------------+--------------+
| Iteration    | Loss         | Elapsed Time |
+--------------+--------------+--------------+
| 1            | 11.276       | 12.7         |
| 36           | 10.892       | 22.8         |
| 71           | 10.506       | 32.8         |
| 107          | 10.517       | 43.1         |
| 142          | 9.942        | 53.2         |
| 177          | 9.459        | 63.3         |
| 212          | 8.914        | 73.3         |
| 248          | 8.299        | 83.6         |
| 283          | 7.992        | 93.7         |
| 318          | 7.582        | 103.7        |
| 354          | 7.431        | 114.0        |
| 389          | 7.024        | 124.1        |
| 424          | 6.907        | 134.1        

| 5837         | 2.618        | 1692.4       |
| 5872         | 2.484        | 1702.6       |
| 5907         | 2.642        | 1712.7       |
| 5942         | 2.496        | 1722.8       |
| 5977         | 2.678        | 1732.9       |
| 6012         | 2.679        | 1743.0       |
| 6048         | 2.596        | 1753.2       |
| 6083         | 2.546        | 1763.2       |
| 6119         | 2.566        | 1773.5       |
| 6154         | 2.530        | 1783.7       |
| 6189         | 2.515        | 1793.9       |
| 6224         | 2.662        | 1803.9       |
| 6259         | 2.464        | 1814.0       |
| 6294         | 2.518        | 1824.1       |
| 6330         | 2.633        | 1834.3       |
| 6365         | 2.678        | 1844.4       |
| 6400         | 2.556        | 1854.5       |
| 6435         | 2.504        | 1864.6       |
| 6470         | 2.517        | 1874.7       |
| 6505         | 2.801        | 1884.8       |
| 6540         | 2.683        | 1894.9       |
| 6575       

| 11980        | 2.101        | 3461.8       |
| 12015        | 2.118        | 3472.0       |
| 12050        | 2.081        | 3482.1       |
| 12085        | 2.097        | 3492.2       |
| 12120        | 2.125        | 3502.2       |
| 12155        | 2.100        | 3512.4       |
| 12191        | 2.183        | 3522.6       |
| 12227        | 2.065        | 3532.8       |
| 12262        | 2.029        | 3542.9       |
| 12297        | 2.170        | 3552.9       |
| 12332        | 2.041        | 3563.0       |
| 12367        | 2.037        | 3573.0       |
| 12402        | 2.074        | 3583.2       |
| 12437        | 1.930        | 3593.5       |
| 12472        | 2.162        | 3603.5       |
| 12507        | 2.078        | 3613.6       |
| 12542        | 1.989        | 3623.8       |
| 12577        | 2.061        | 3633.9       |
| 12612        | 2.218        | 3644.0       |
| 12647        | 1.995        | 3654.1       |
| 12682        | 1.990        | 3664.3       |
| 12717      

Save the model. Also export to Core ML.

In [6]:
# Save the trained model so it can be loaded again later with tc.load_model().
model.save("SnackDetector.model")

In [7]:
# Export the trained model as a Core ML .mlmodel file.
model.export_coreml("SnackDetector.mlmodel")

  % (keras.__version__, KERAS_MAX_VERSION))
  % (tensorflow.__version__, TF_MAX_VERSION))


Load the model and evaluate it on the test set.

In [None]:
# Load the model that was saved under this name earlier in the notebook.
# NOTE(review): presumably here so evaluation can run without retraining.
model = tc.load_model("SnackDetector.model")

In [7]:
# Load the validation and test images together with their annotations.
# The CSV paths are built with os.path.join, like the split folders.
# NOTE(review): val_data is not used by any later cell visible in this notebook.
val_data = load_images_with_annotations(
    val_dir, os.path.join(data_dir, "annotations-val.csv"))
test_data = load_images_with_annotations(
    test_dir, os.path.join(data_dir, "annotations-test.csv"))

In [9]:
# Run the model on the annotated test images and collect the evaluation
# metrics in a dictionary.
scores = model.evaluate(test_data)

Predicting   1/826
Predicting  77/826
Predicting 154/826
Predicting 228/826
Predicting 305/826
Predicting 380/826
Predicting 456/826
Predicting 532/826
Predicting 609/826
Predicting 686/826
Predicting 762/826
Predicting 825/826
Predicting 826/826


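`model.evaluate()` computes the "average precision" (AP) for each class, as well as the overall mean average precision (mAP), which is the mean of the per-class values. The `_50` suffix on the keys means that a prediction only counts as correct when its bounding box overlaps the ground-truth box with an intersection-over-union of at least 50%. Both metrics range from 0 to 1; higher is better.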

In [10]:
# Display the per-class and mean scores.
scores

{'average_precision_50': {'apple': 0.52788541232511876,
  'banana': 0.41939129680862453,
  'cake': 0.38973319479991153,
  'candy': 0.36857447872282678,
  'carrot': 0.18244418788241806,
  'cookie': 0.28427061410479926,
  'doughnut': 0.40468256486874954,
  'grape': 0.28140128515800616,
  'hot dog': 0.48567416340497233,
  'ice cream': 0.33970801999070233,
  'juice': 0.49802894565611988,
  'muffin': 0.34942480208977489,
  'orange': 0.24009947536249898,
  'pineapple': 0.34740496704295087,
  'popcorn': 0.47768671607990487,
  'pretzel': 0.47881610200222496,
  'salad': 0.59927030914176194,
  'strawberry': 0.26752362871589347,
  'waffle': 0.44345717220029013,
  'watermelon': 0.37970409310715819},
 'mean_average_precision_50': 0.38825907147323535}

Make predictions on the test data. For each image, this outputs a list of detections that looks something like this:

```
[{'confidence': 0.7225357099539148,
  'coordinates': {'height': 73.92794444010806,
                  'width': 90.45315889211807,
                  'x': 262.2198759929745,
                  'y': 155.496952970812},
  'label': 'dog',
  'type': 'rectangle'},
 ...]
```

This is similar to the annotations, but each detection now has a `confidence` field as well (plus a `type` field, which is `'rectangle'` here).

In [11]:
# Run the detector on every test image and store the detections in a new
# 'predictions' column next to the ground-truth annotations.
test_data["predictions"] = model.predict(test_data)

Predicting   1/826
Predicting  78/826
Predicting 155/826
Predicting 231/826
Predicting 307/826
Predicting 383/826
Predicting 458/826
Predicting 534/826
Predicting 610/826
Predicting 687/826
Predicting 764/826
Predicting 826/826


In [12]:
# Preview the first rows; an empty list in 'predictions' means that nothing
# was detected in that image.
test_data.head()

path,image,annotations,predictions
snacks/test/apple/00341c3 c5825fc7e.jpg ...,Height: 256 Width: 256,"[{'coordinates': {'height': 29, 'y': 20, ...",[]
snacks/test/apple/004be96 d7985d83e.jpg ...,Height: 256 Width: 384,"[{'coordinates': {'height': 66, 'y': 167, ...",[{'coordinates': {'height': ...
snacks/test/apple/01ac2a4 2f2a22ee7.jpg ...,Height: 256 Width: 341,"[{'coordinates': {'height': 123, 'y': 74, ...",[{'coordinates': {'height': ...
snacks/test/apple/03bfc0b 1cc6bde63.jpg ...,Height: 256 Width: 384,"[{'coordinates': {'height': 112, 'y': ...",[{'coordinates': {'height': ...
snacks/test/apple/09ed54b 36eaa5316.jpg ...,Height: 256 Width: 455,"[{'coordinates': {'height': 75, 'y': 143, ...",[{'coordinates': {'height': ...
snacks/test/apple/0f8670e 41c97c8cb.jpg ...,Height: 256 Width: 361,"[{'coordinates': {'height': 142, 'y': ...",[{'coordinates': {'height': ...
snacks/test/apple/1382c47 d4df56b77.jpg ...,Height: 256 Width: 332,"[{'coordinates': {'height': 92, 'y': 86, ...",[{'coordinates': {'height': ...
snacks/test/apple/1acfd56 0a4424e04.jpg ...,Height: 341 Width: 256,"[{'coordinates': {'height': 235, 'y': ...",[{'coordinates': {'height': ...
snacks/test/apple/1db0cb7 5f37d6cba.jpg ...,Height: 256 Width: 341,"[{'coordinates': {'height': 135, 'y': ...",[{'coordinates': {'height': ...
snacks/test/apple/1e5a5b8 fbd4ca698.jpg ...,Height: 256 Width: 256,"[{'coordinates': {'height': 40, 'y': 80, ...",[{'coordinates': {'height': ...


Visualize the predicted bounding boxes on top of the test set:

In [13]:
# Draw the predicted boxes on the test images and keep the result in a new
# 'image_with_predictions' column.
test_data['image_with_predictions'] = tc.object_detector.util.draw_bounding_boxes(
                                           test_data['image'], test_data['predictions'])

In [14]:
# Preview the first rows again, now including the 'image_with_predictions' column.
test_data.head()

path,image,annotations,predictions
snacks/test/apple/00341c3 c5825fc7e.jpg ...,Height: 256 Width: 256,"[{'coordinates': {'height': 29, 'y': 20, ...",[]
snacks/test/apple/004be96 d7985d83e.jpg ...,Height: 256 Width: 384,"[{'coordinates': {'height': 66, 'y': 167, ...",[{'coordinates': {'height': ...
snacks/test/apple/01ac2a4 2f2a22ee7.jpg ...,Height: 256 Width: 341,"[{'coordinates': {'height': 123, 'y': 74, ...",[{'coordinates': {'height': ...
snacks/test/apple/03bfc0b 1cc6bde63.jpg ...,Height: 256 Width: 384,"[{'coordinates': {'height': 112, 'y': ...",[{'coordinates': {'height': ...
snacks/test/apple/09ed54b 36eaa5316.jpg ...,Height: 256 Width: 455,"[{'coordinates': {'height': 75, 'y': 143, ...",[{'coordinates': {'height': ...
snacks/test/apple/0f8670e 41c97c8cb.jpg ...,Height: 256 Width: 361,"[{'coordinates': {'height': 142, 'y': ...",[{'coordinates': {'height': ...
snacks/test/apple/1382c47 d4df56b77.jpg ...,Height: 256 Width: 332,"[{'coordinates': {'height': 92, 'y': 86, ...",[{'coordinates': {'height': ...
snacks/test/apple/1acfd56 0a4424e04.jpg ...,Height: 341 Width: 256,"[{'coordinates': {'height': 235, 'y': ...",[{'coordinates': {'height': ...
snacks/test/apple/1db0cb7 5f37d6cba.jpg ...,Height: 256 Width: 341,"[{'coordinates': {'height': 135, 'y': ...",[{'coordinates': {'height': ...
snacks/test/apple/1e5a5b8 fbd4ca698.jpg ...,Height: 256 Width: 256,"[{'coordinates': {'height': 40, 'y': 80, ...",[{'coordinates': {'height': ...

image_with_predictions
Height: 256 Width: 256
Height: 256 Width: 384
Height: 256 Width: 341
Height: 256 Width: 384
Height: 256 Width: 455
Height: 256 Width: 361
Height: 256 Width: 332
Height: 341 Width: 256
Height: 256 Width: 341
Height: 256 Width: 256


In [None]:
# Open Turi Create's interactive viewer to browse the test images with the
# predicted boxes drawn on top.
test_data.explore()