# Register AFT dataset

This notebook shows how to register the latest version of the American Football Tactical (AFT) dataset with `detectron2`.

## Get the AFT dataset from `quilt3`

Configure `quilt3` to look into our server and registry:

In [None]:
import quilt3
import pandas as pd


# One-time setup: the two commented lines below are only necessary the
# first time quilt3 is used on a machine (server configuration and login).
# quilt3.config('https://quilt3.hudltools.com/')
# quilt3.login() 
# Use the s3://hudlrd-datasets bucket as the default remote registry.
quilt3.config(default_remote_registry='s3://hudlrd-datasets') 

Get the AFT dataset from `quilt3`:

In [None]:
# Coordinates of the dataset: `<user>/<package>` names the quilt3 package,
# `parquet` is the entry inside it that holds the detections table.
user, package = 'hudlrd', 'american_football_tactical'
parquet = 'detections_df.parquet'

# No explicit registry or top hash is given (both are passed as None).
pkg = quilt3.Package.install(f'{user}/{package}', registry=None, top_hash=None)

# Fetch the parquet entry, then load it from the relative path of the same name.
pkg[parquet].fetch()
df = pd.read_parquet(parquet)

In [None]:
# NOTE(review): this cell repeats the install / fetch / load steps of the
# previous cell (reusing its `user`, `package` and `parquet` variables);
# it looks redundant - confirm before removing.
pkg = quilt3.Package.install(f'{user}/{package}', registry=None, top_hash=None)

# Fetch the parquet entry, then load it from the relative path of the same name.
pkg[parquet].fetch()
df = pd.read_parquet(parquet)

In [None]:
# Report how many examples were loaded
print("# examples: ", len(df))

# Bare expression: the notebook renders the data frame as the cell output
df

Clean up the dataframe:

In [None]:
# Each filter below prints how many rows it dropped; only the row count
# before the filter is needed for that, not a second reference to the frame.

# Remove examples tagged as not match frames.
# `== False` is an element-wise comparison kept on purpose: rows whose flag
# is missing do not compare equal to False and are dropped as well.
n_before = len(df)
df = df[df.notMatchFrame == False]  # noqa: E712
print('# examples removed by notMatchFrame filtering: ', n_before - len(df))

# Remove examples with null boxes or labels
n_before = len(df)
df = df[df.bbxs.notnull() & df.labels.notnull()]
print('# examples removed by isnull filtering: ', n_before - len(df))

# Remove examples with no boxes or labels
n_before = len(df)
df = df[df.bbxs.map(len) > 0]
df = df[df.labels.map(len) > 0]
print('# examples removed by empty box or label list filtering: ', n_before - len(df))

# Remove examples that contain an invalid bounding box, i.e. a box whose
# width or height (b[2] / b[3]) is smaller than or equal to zero
n_before = len(df)
has_invalid_box = df.bbxs.map(
    lambda bbxs: any(b[2] <= 0 or b[3] <= 0 for b in bbxs)
).astype(bool)  # keeps a boolean mask even when no rows are left
df = df[~has_invalid_box]
print('# examples removed by invalid box filtering: ', n_before - len(df))

# Reset dataframe indices so that they run 0..len(df)-1 again
df = df.reset_index(drop=True)

In [None]:
# Report how many examples are left after the clean-up
print('# examples: ', len(df))

# Bare expression: the notebook renders the data frame as the cell output
df

At this point it is probably a good idea to save the dataset's images to the local machine and update the data frame accordingly:

In [None]:
import shutil
import requests
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor


# Local directory that holds the downloaded copies of the dataset's images
cache = Path('/root/cvdev/aft_images/')
# Create it together with any missing parents; no error if it already exists
cache.mkdir(parents=True, exist_ok=True)


def save_df_images(df):
    """Download every image referenced by ``df`` into ``cache``.

    For each row, the file at ``df['path']`` is downloaded to
    ``cache / '<index label, zero-padded to 4 digits>.png'`` unless that file
    already exists. Once all downloads have succeeded, ``df['path']`` is
    overwritten in place with the absolute local paths.

    Args:
        df: Data frame with a ``path`` column of image URLs. Its index labels
            are used to name the local files, so they are expected to be
            unique (e.g. right after ``reset_index(drop=True)``).

    Raises:
        requests.HTTPError: if the server answers a download with an error
            status.
        requests.RequestException: on connection problems or timeouts.
    """
    def save_df_image(i):
        local_path = cache / f"{str(i).zfill(4)}.png"
        if not local_path.exists():
            # Write to a temporary name first and rename only when the
            # download is complete: a failed or interrupted download must not
            # leave a truncated file that later runs would treat as cached.
            # A leftover '.part' file is simply overwritten on the next try.
            tmp_path = local_path.with_name(local_path.name + '.part')
            with requests.get(df.at[i, 'path'], stream=True, timeout=60) as response:
                # Do not store an HTTP error page as if it were an image
                response.raise_for_status()
                # Have the raw stream undo any transfer compression
                response.raw.decode_content = True
                with open(tmp_path, 'wb') as f:
                    shutil.copyfileobj(response.raw, f)
            tmp_path.replace(local_path)
        return str(local_path.absolute())

    # Workers only read from the data frame; the column is written once,
    # from the calling thread, after every download has finished.
    with ThreadPoolExecutor() as executor:
        local_paths = list(executor.map(save_df_image, df.index))

    df['path'] = local_paths

In [None]:
# Download the images that are not cached yet and point df['path'] at the local files
save_df_images(df)

In [None]:
# Bare expression: display the data frame, whose 'path' column was updated by save_df_images
df

## Register the AFT dataset to `detectron2`

Get the dataset's unique labels:

In [None]:
# Collect every distinct label that appears in the dataset
unique_labels = set()

for labels in df["labels"]:
    if labels is not None:
        unique_labels.update(labels)

# The position of a label in this list is used as its category id, so the
# order must be reproducible: sort instead of relying on set iteration order,
# which can differ from one Python session to the next.
# NOTE(review): assumes all labels are mutually comparable (e.g. all strings).
unique_labels = sorted(unique_labels)

print("Unique labels:", unique_labels)

Define the `get_dicts` function that will return the items in our dataset in the format expected by `detectron2`:

In [None]:
import numpy as np
from detectron2.structures import BoxMode


def get_dicts(df, partition, height=2000, width=2666):
    """Return the examples of one partition as a list of `detectron2` dicts.

    Args:
        df: Data frame with the columns ``set_split_random``, ``path``,
            ``bbxs`` and ``labels``.
        partition: Value of ``set_split_random`` to select
            (the notebook uses "train", "dev" and "test").
        height: Image height in pixels written to every record and used to
            scale the boxes. Defaults to the previously hard-coded 2000.
        width: Image width in pixels written to every record and used to
            scale the boxes. Defaults to the previously hard-coded 2666.

    Returns:
        One dict per selected row with the keys ``file_name``, ``image_id``
        (the row's index label), ``height``, ``width`` and ``annotations``.
        Each annotation holds the box multiplied element-wise by
        ``(width, height, width, height)``, the box mode
        ``BoxMode.XYWH_ABS`` and the label's position in ``unique_labels``
        as ``category_id``.

    Raises:
        ValueError: if a label is not present in ``unique_labels``.
    """
    df = df[df.set_split_random == partition]

    # Scale factors are the same for every box, so build the array only once.
    # NOTE(review): presumably the stored boxes are relative to the image
    # size (values in [0, 1]) - confirm against the dataset.
    scale = np.asanyarray([width, height] * 2)

    records = []
    for i, row in df.iterrows():
        annotations = [
            {
                "bbox": (bbox * scale).tolist(),
                "bbox_mode": BoxMode.XYWH_ABS,
                "category_id": unique_labels.index(label),
            }
            for bbox, label in zip(row["bbxs"], row["labels"])
        ]

        records.append({
            "file_name": row['path'],
            "image_id": i,
            "height": height,
            "width": width,
            "annotations": annotations,
        })

    return records

In [None]:
# One line per partition with the number of examples it contains
print(
    *(
        f"# examples in {split}: {len(get_dicts(df, split))}"
        for split in ("train", "dev", "test")
    ),
    sep="\n",
)

Tell `detectron2` about the previous function:

In [None]:
from detectron2.data import DatasetCatalog, MetadataCatalog


# `DatasetCatalog.register` refuses a name that is already registered, which
# made this cell fail when it was run a second time in the same session.
already_registered = set(DatasetCatalog.list())

for partition in ["train", "dev", "test"]:
    dataset_name = f"AmericanFootballTactical/{partition}"

    # Skipping an existing registration is safe: the registered lambda looks
    # up the global `df` when it is called, so it serves the current data.
    if dataset_name not in already_registered:
        DatasetCatalog.register(
            dataset_name,
            # Bind `partition` as a default argument so that every lambda
            # keeps its own value instead of the loop's last one
            lambda partition=partition: get_dicts(df, partition)
        )

    MetadataCatalog.get(dataset_name).set(thing_classes=unique_labels)

Make sure the dataset has been correctly registered:

In [None]:
# Fetch the train partition through the catalog to check that the registration works
DatasetCatalog.get('AmericanFootballTactical/train')

Finally, let us plot some of the dataset's data:

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

import random
import requests
from PIL import Image, ImageFile
from detectron2.utils.visualizer import Visualizer


figsize = (16, 12)
# Let PIL load image files that are truncated instead of raising an error
ImageFile.LOAD_TRUNCATED_IMAGES = True

metadata = MetadataCatalog.get("AmericanFootballTactical/train")
records = get_dicts(df, "train")

# Plot up to five random training examples; `random.sample` raises
# ValueError when asked for more items than the list contains.
for i, r in enumerate(random.sample(records, min(5, len(records)))):
    # Read the pixels inside a `with` block so the file handle is closed, and
    # convert to RGB because the visualizer expects an RGB image array
    with Image.open(r["file_name"]) as image_file:
        img = np.asanyarray(image_file.convert("RGB"))

    visualizer = Visualizer(img, metadata=metadata, scale=0.5)
    vis = visualizer.draw_dataset_dict(r)

    plt.figure(i, figsize=figsize)
    plt.imshow(vis.get_image())