In [32]:
import numpy as np
import pandas as pd
import json
import os
from parse import parse
import pprint
pp = pprint.PrettyPrinter(indent=2)

In [44]:
# Find out all pets that have an image

# Train dataset
train_df = pd.read_csv('data/train/train.csv')
train_pet_ids = train_df['PetID'].tolist()
train_pet_ids = set(train_pet_ids)

train_imgs_path = 'data/train_images'
train_imgs_filenames = os.listdir(train_imgs_path)
img_filename_format = '{}-{}.jpg'
train_img_ids = [parse(img_filename_format, img_f)[0] for img_f in train_imgs_filenames]
# Pets can have several images
train_img_ids = set(train_img_ids)
train_ids_no_img = train_pet_ids - train_img_ids

len(train_pet_ids), len(train_img_ids), len(train_ids_no_img)

(14993, 14652, 341)

In [25]:
# Test dataset
test_df = pd.read_csv('data/test/test.csv')
test_pet_ids = test_df['PetID'].tolist()
test_pet_ids = set(test_pet_ids)

test_imgs_path = 'data/test_images'
test_imgs_filenames = os.listdir(test_imgs_path)
test_img_ids = [parse(img_filename_format, img_f)[0] for img_f in test_imgs_filenames]

test_img_ids = set(test_img_ids)
test_ids_no_img = test_pet_ids - test_img_ids

len(test_pet_ids), len(test_img_ids), len(test_ids_no_img)

(3948, 3821, 127)

In [14]:
# Overview of image metadata
train_metadata_path = 'data/train_metadata'
train_metadata_filenames = os.listdir(train_metadata_path)
train_metadata_filepaths = [os.path.join(train_metadata_path, fname) for fname in train_metadata_filenames]

metadata_keys = set()
error_files = []
metadata_list = []
for fpath in train_metadata_filepaths:
    with open(fpath) as file:
        try:
            metadata = json.load(file)
            metadata_list.append(metadata)
            metadata_keys.update(metadata.keys())
        except UnicodeDecodeError:
            error_files.append(fpath)
metadata_keys

{'cropHintsAnnotation',
 'faceAnnotations',
 'imagePropertiesAnnotation',
 'labelAnnotations',
 'textAnnotations'}

In [17]:
len(error_files)

39

In [41]:
# Example of metadata values
for key in metadata_keys:
    for metadata in metadata_list:
        if key in metadata:
            break
    print('****** %s ******' % key)
    pp.pprint(metadata[key])

****** cropHintsAnnotation ******
{ 'cropHints': [ { 'boundingPoly': { 'vertices': [ {},
                                                   {'x': 348},
                                                   {'x': 348, 'y': 478},
                                                   {'y': 478}]},
                   'confidence': 0.79999995,
                   'importanceFraction': 1}]}
****** labelAnnotations ******
[ { 'description': 'cat',
    'mid': '/m/01yrx',
    'score': 0.9943703,
    'topicality': 0.9943703},
  { 'description': 'small to medium sized cats',
    'mid': '/m/07k6w8',
    'score': 0.9213904,
    'topicality': 0.9213904},
  { 'description': 'whiskers',
    'mid': '/m/01l7qd',
    'score': 0.91749674,
    'topicality': 0.91749674},
  { 'description': 'cat like mammal',
    'mid': '/m/0307l',
    'score': 0.89707345,
    'topicality': 0.89707345},
  { 'description': 'eye',
    'mid': '/m/014sv8',
    'score': 0.80012083,
    'topicality': 0.80012083},
  { 'description': 'khao

In [55]:
# Count how many images have different metadata types
metadata_keys_lens = {}
for key in metadata_keys:
    metadata_keys_lens[key] = 0
for metadata in metadata_list:
    for key in metadata_keys:
        if key in metadata:
            metadata_keys_lens[key] += 1


In [56]:
metadata_keys_lens

{'cropHintsAnnotation': 58272,
 'faceAnnotations': 655,
 'imagePropertiesAnnotation': 58272,
 'labelAnnotations': 58260,
 'textAnnotations': 3317}

In [58]:
# Check for "topicality" field, if its equal everywhere to score, then no AssertionError will raise
for metadata in metadata_list:
    if 'labelAnnotations' in metadata:
        assert np.product([la['score'] == la['topicality'] for la in metadata['labelAnnotations']]) == 1