In [None]:
import os
import pathlib
import cv2
import boto3
import matplotlib.pyplot as plt

## Read Data Connection Variables

In [None]:
# S3 connection settings are supplied through environment variables.
# Any variable that is not set resolves to None.
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_S3_BUCKET = os.getenv("AWS_S3_BUCKET")
AWS_S3_ENDPOINT = os.getenv("AWS_S3_ENDPOINT")

## Define S3 Connection

In [None]:
# Create a boto3 session from the configured key pair, then an S3 client
# bound to the configured endpoint URL.
session = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
client = session.client(service_name="s3", endpoint_url=AWS_S3_ENDPOINT)

## Download and Extract the Dataset

In [None]:
if not AWS_ACCESS_KEY_ID:
    # If S3 is not used in the demo, we can fallback
    # to the _scenario_resources directory.
    !cp _scenario_resources/dataset.tar.gz dataset.tar.gz
else:
    client.download_file(AWS_S3_BUCKET, "dataset.tar.gz", "dataset.tar.gz")
    print("Files downloaded from S3")

!tar -xzf dataset.tar.gz

## Inspect the Data

In [None]:
# List the contents of the dataset directory in long format.
!ls -l dataset/

In [None]:
# List the contents of the images directory in long format.
!ls -l dataset/images

### Number of Training Images

In [None]:
# Count the entries in the train directory: one name per line, piped to wc -l.
# Every non-hidden entry is counted, whether or not it is an image.
!ls -1 dataset/images/train | wc -l

### Number of Validation Images

In [None]:
# Count the entries in the val directory: one name per line, piped to wc -l.
# Every non-hidden entry is counted, whether or not it is an image.
!ls -1 dataset/images/val | wc -l

### Number of Test Images

In [None]:
# Count the entries in the test directory: one name per line, piped to wc -l.
# Every non-hidden entry is counted, whether or not it is an image.
!ls -1 dataset/images/test | wc -l

## Visualize the Data

In [None]:
# Preview up to nine training images in a 3x3 grid.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the preview (and anything that reuses `files`) reproducible across runs.
train_image_dir = pathlib.Path("dataset/images/train")
files = sorted(os.listdir(train_image_dir))[:9]
print(files)

rows, columns = 3, 3
fig = plt.figure(figsize=(10, 7))
for i, file in enumerate(files):
    # cv2.imread returns None instead of raising when a file cannot be read
    # or decoded; skip such entries rather than failing inside cvtColor.
    image = cv2.imread(str(train_image_dir / file))
    if image is None:
        print(f"Skipping unreadable file: {file}")
        continue
    # OpenCV loads pixels as BGR; matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    ax = fig.add_subplot(rows, columns, i + 1)
    ax.imshow(image)
    ax.set_title(file, fontsize=8)

In [None]:
# Print the label file that belongs to each previewed image. Label files are
# read from dataset/labels/train and use the image's file name with the
# extension swapped for .txt.
# NOTE(review): the [2:] slice skips the first two previewed images; it is
# kept as in the original cell -- confirm this is intended.
labels = [pathlib.Path(name).with_suffix(".txt").name for name in files][2:]
label_dir = pathlib.Path("dataset/labels/train")
for label_name in labels:
    print(label_name)
    print((label_dir / label_name).read_text())