In [None]:
# Reload imported modules automatically before each execution, so edits to
# library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

Initialize the `Intermediate Jersey Dataset` from `s3`:

In [None]:
from nucleus.dataset.vq import VqDataset


# Build the dataset handle from the S3 bucket/key below. Later cells use `ds`
# for viewing rows, indexing (row, image) pairs and saving locally.
ds = VqDataset.from_s3(
    name='IntermediateJerseyDataset',
    bucket='hudlrd-experiments',
    key='jersey-tagging/development',
    n_jobs=None,  # NOTE(review): presumably "use the library default" — confirm
    show_progress=True
)

Visualize some images from this dataset:

In [None]:
# View rows 0 and 1; `%time` reports how long each call takes.
%time ds.view_row(0)

In [None]:
%time ds.view_row(1)

Images are being accessed from s3, which is quite slow:

In [None]:
# Indexing the dataset yields a (row, image) pair; this timing is the baseline
# to compare against after the dataset has been saved locally.
%time row, image = ds[0]

To gain faster access to the images, let us save them locally:

In [None]:
# Save the dataset locally so that subsequent image access is faster.
ds.save()

In [None]:
# Same access as before saving, timed again for comparison.
%time row, image = ds[0]

In [None]:
ds.view_row(0)

In [None]:
ds.view_row(1)

Create `Real Jersey Dataset`:

In [None]:
from typing import Optional, List

import pathlib
import pandas as pd

from hudl_aws.s3 import write_to_s3, ContentType

from nucleus.dataset import quilt_tools
from nucleus.dataset.keys import DatasetKeys
from nucleus.utils import progress_bar

In [None]:
def create_dataset_from_boxes(
    dataset,
    name: str,
    cache: str = './dataset_cache',
    skip_labels: Optional[List[str]] = None,
    compress: bool = True,
    image_format: str = 'png'
):
    r"""
    Build a new ``VqDataset`` whose rows are the box crops of ``dataset``.

    Every ``(row, image)`` pair of ``dataset`` is expanded into one crop per
    item returned by ``image.images_from_box_collection``. Each crop is saved
    under ``<cache>/<name>/`` as ``<source stem>_<crop index>.<image_format>``
    and the result of ``crop.serialize`` becomes one record of the new
    dataset's DataFrame.

    Parameters
    ----------
    dataset
        Iterable of ``(row, image)`` pairs. ``row`` is indexed with
        ``DatasetKeys.PATH.value`` to obtain the source image path.
    name
        Name of the new dataset; also the sub-directory of ``cache`` that
        receives the crops.
    cache
        Root cache directory. ``<cache>/<name>`` is created if missing.
    skip_labels
        Forwarded unchanged to ``image.images_from_box_collection``.
    compress
        Forwarded unchanged to ``crop.save``.
    image_format
        File extension of the saved crops; also forwarded to ``crop.save``.

    Returns
    -------
    VqDataset
        Constructed from ``name``, the DataFrame of serialized crops and the
        root ``cache`` path.
    """
    cache = pathlib.Path(cache)
    full_cache = cache / name
    full_cache.mkdir(parents=True, exist_ok=True)

    records = []
    # Iterate the dataset that was passed in. The previous version looped over
    # the notebook-global `ds`, silently ignoring the `dataset` argument.
    for row, image in progress_bar(dataset):
        path = pathlib.Path(row[DatasetKeys.PATH.value])
        crops = image.images_from_box_collection(skip_labels=skip_labels)
        for i, crop in enumerate(crops):
            # NOTE(review): crop names are derived from the source file stem
            # only, so two source rows sharing a stem would overwrite each
            # other's crops — confirm stems are unique in the input dataset.
            path_crop = full_cache / f'{path.stem}_{i}.{image_format}'

            crop.save(
                path=path_crop,
                compress=compress,
                image_format=image_format
            )

            records.append(crop.serialize(path=path_crop))

    df = pd.DataFrame.from_records(records)

    return VqDataset(name=name, df=df, cache=cache)

In [None]:
# Build the crop dataset under ./dataset_cache/RealJerseyDataset.
# `skip_labels` is forwarded to `image.images_from_box_collection`.
new_ds = create_dataset_from_boxes(
    dataset=ds, 
    name='RealJerseyDataset', 
    cache='./dataset_cache',
    skip_labels=['partial'],
)

# Display the DataFrame of serialized crops.
new_ds.df

In [None]:
# Time a single (row, image) access before the upload.
%time row, image = new_ds[0]

In [None]:
# Call `upload_images_to_s3` with the target bucket/key, then show the
# DataFrame again.
# NOTE(review): presumably the upload rewrites the paths stored in `df`, hence
# the second display — confirm against the VqDataset implementation.
new_ds.upload_images_to_s3(
    bucket='hudlrd-experiments',
    key='jersey-tagging/images',
)

new_ds.df

In [None]:
# Time the same access again after the upload.
%time row, image = new_ds[0]

In [None]:
# NOTE(review): the row index is hard-coded; this assumes the dataset has at
# least 100 rows.
new_ds.view_row(99)

In [None]:
# Call `update_quilt_df` for user 'hudlrd' and package 'basketball_jerseys'.
new_ds.update_quilt_df(
    user='hudlrd',
    pkg='basketball_jerseys'
)

# END

# Scrap

In [None]:
def create_quilt_dataset(
    ds,
    user: str = 'hudlrd',
    # NOTE(review): 'baskeball_jerseys' looks like a typo of the
    # 'basketball_jerseys' package used elsewhere in this notebook. The
    # default is kept unchanged for compatibility — confirm and correct.
    pkg: str = 'baskeball_jerseys',
    readme: Optional[str] = None,
    hash_key: Optional[str] = None,
    bucket: str = 'hudlrd-experiments',
    key: str = 'jersey-tagging/images',
    image_format: str = 'png'
):
    r"""
    Upload every image of ``ds`` to S3, then push ``ds.df`` to a quilt package.

    Parameters
    ----------
    ds
        Dataset yielding ``(row, image)`` pairs when iterated and exposing a
        ``df`` attribute. When ``readme`` is omitted it must also provide
        ``create_default_readme``.
    user, pkg
        Forwarded to ``quilt_tools.update_df``.
    readme
        README text forwarded to ``quilt_tools.update_df``. When ``None`` it
        is generated with ``ds.create_default_readme(ds.df)``.
    hash_key
        Forwarded to ``quilt_tools.update_df``.
    bucket, key
        S3 bucket and key prefix. Each image is written to
        ``<key>/<source stem>.<image_format>``.
    image_format
        Format passed to ``image.bytes``; also used as the file extension and
        to look up the content type in ``ContentType``.
    """
    for row, image in progress_bar(ds):
        stem = pathlib.Path(row[DatasetKeys.PATH.value]).stem
        write_to_s3(
            data=image.bytes(image_format=image_format),
            bucket=bucket,
            key=f'{key}/{stem}.{image_format}',
            content_type=ContentType[image_format]
        )

    if readme is None:
        # This is a free function, so there is no `self`; the previous code
        # raised NameError here after all uploads had already run.
        # NOTE(review): assumes the dataset object exposes
        # `create_default_readme` — confirm against VqDataset.
        readme = ds.create_default_readme(ds.df)
    quilt_tools.update_df(
        ds.df,
        user=user,
        pkg=pkg,
        readme=readme,
        hash_key=hash_key
    )

In [None]:
# NOTE(review): this cell re-defines `create_quilt_dataset`, which an earlier
# cell also defines. The later definition wins at run time — consider deleting
# one of the two cells.
def create_quilt_dataset(
    ds,
    user: str = 'hudlrd',
    # NOTE(review): 'baskeball_jerseys' looks like a typo of the
    # 'basketball_jerseys' package used elsewhere in this notebook. The
    # default is kept unchanged for compatibility — confirm and correct.
    pkg: str = 'baskeball_jerseys',
    readme: Optional[str] = None,
    hash_key: Optional[str] = None,
    bucket: str = 'hudlrd-experiments',
    key: str = 'jersey-tagging/images',
    image_format: str = 'png'
):
    r"""
    Upload every image of ``ds`` to S3, then push ``ds.df`` to a quilt package.

    Parameters
    ----------
    ds
        Dataset yielding ``(row, image)`` pairs when iterated and exposing a
        ``df`` attribute. When ``readme`` is omitted it must also provide
        ``create_default_readme``.
    user, pkg
        Forwarded to ``quilt_tools.update_df``.
    readme
        README text forwarded to ``quilt_tools.update_df``. When ``None`` it
        is generated with ``ds.create_default_readme(ds.df)``.
    hash_key
        Forwarded to ``quilt_tools.update_df``.
    bucket, key
        S3 bucket and key prefix. Each image is written to
        ``<key>/<source stem>.<image_format>``.
    image_format
        Format passed to ``image.bytes``; also used as the file extension and
        to look up the content type in ``ContentType``.
    """
    for row, image in progress_bar(ds):
        source_stem = pathlib.Path(row[DatasetKeys.PATH.value]).stem
        write_to_s3(
            data=image.bytes(image_format=image_format),
            bucket=bucket,
            key=f'{key}/{source_stem}.{image_format}',
            content_type=ContentType[image_format]
        )

    if readme is None:
        # This is a free function, so there is no `self`; the previous code
        # raised NameError here after all uploads had already run.
        # NOTE(review): assumes the dataset object exposes
        # `create_default_readme` — confirm against VqDataset.
        readme = ds.create_default_readme(ds.df)
    quilt_tools.update_df(
        ds.df,
        user=user,
        pkg=pkg,
        readme=readme,
        hash_key=hash_key
    )

In [None]:
# NOTE(review): leftover post-mortem debugging magic; it only does something
# useful right after a cell has raised. Remove before sharing the notebook.
%debug

In [None]:
ds.view_row(6)

In [None]:
# Runs the upload + quilt update with all default arguments.
create_quilt_dataset(ds)

In [None]:
# NOTE(review): `df` is never assigned at top level in the visible cells, so
# this and the next cell rely on leftover kernel state.
df.labels

In [None]:
df

In [None]:
# Extract the box crops of the current `image`, skipping 'unknown' and
# 'partial' boxes, and print each crop's serialized record.
# The second skip label previously read ' partial' (leading space), which
# cannot match the 'partial' label used everywhere else in this notebook.
images = image.images_from_box_collection(skip_labels=['unknown', 'partial'])

for i in images:
    # '0' is only a placeholder path for inspecting the serialized output.
    print(i.serialize(path='0'))

In [None]:
# Take the first entry of `new_ds.images_lazy` for inspection.
image = new_ds.images_lazy[0]

In [None]:
# For every row whose first `labels` entry is 'visible', collect the second
# entry converted to int.
x = [
    int(labels[1])
    for labels in new_ds.df['labels']
    if labels[0] == 'visible'
]

In [None]:
# Number of 'visible' entries collected into `x`.
len(x)

In [None]:
import numpy as np
import seaborn as sns

# Histogram of the values in `x` (60 bins, KDE curve disabled).
# NOTE(review): `distplot` is deprecated in newer seaborn releases in favour of
# `histplot` — confirm the installed version before re-running.
sns.distplot(x, bins=60, kde=False)

In [None]:
# Inspect the type returned for a single row of `ds.df`.
type(ds.df.iloc[0])

In [None]:
# Show `labels_list` of the box collection on the first entry of `images_lazy`.
ds.images_lazy[0].box_collection.labels_list

In [None]:
from nucleus.visualize.matplotlib import BasketballJerseyLabelColorMap

# Keyword arguments handed to `image.view(box_args=...)` further down.
box_args = dict(
    label_color_map=BasketballJerseyLabelColorMap,
    alpha=0.5,
    caption_box_alpha=0.5,
    skip_labels=['occluded', 'partial', 'unknown'],
)

In [None]:
# Pick the third entry of `ds.images_lazy` for the visualisation below.
image = ds.images_lazy[2]

In [None]:
len(ds.images_lazy)

In [None]:
# Render the selected image using the box styling defined in `box_args`.
image.view(box_args=box_args)