# Step 0 - Configuration and setup of dependencies

In [7]:
# Dependencies installation
!pip install matplotlib
!pip install pandas
!pip install torch torchvision torchaudio




In [8]:
import os

import cv2
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd

# Step 1 - Review the dataset

In [21]:
import pandas as pd

# Dataset location: one Parquet file holds the encoded images, a second one
# holds the per-object annotations. Both live in the same directory.
image_dir = "../datasets/easy-500"
parquet_file_path = f"{image_dir}/images.parquet"
labels_parquet_file_path = f"{image_dir}/labels.parquet"

# Load the images table (columns `id` and `image`, the latter holding raw JPEG bytes).
df = pd.read_parquet(parquet_file_path, engine='pyarrow')  # engine='fastparquet' is the alternative

# Load the annotations table (image_id, x, y, orientation, radius, class),
# this time with pandas' default engine selection.
df_labels = pd.read_parquet(labels_parquet_file_path)

# Show a truncated preview of the annotations.
print(df_labels)

def dump_images(df, out_dir=None):
  """Decode every encoded image in ``df`` and write it to disk as a JPEG.

  Args:
    df: DataFrame with an ``image`` column holding encoded image bytes.
      Each file is named after the row's index label: ``{index}.jpg``.
    out_dir: Destination directory. Defaults to the notebook-level
      ``image_dir``, the directory ``CircleObjectsDataset`` is later pointed at.

  Raises:
    ValueError: if a row's bytes cannot be decoded as an image.
    IOError: if OpenCV reports that a file could not be written.
  """
  if out_dir is None:
    out_dir = image_dir
  # cv2.imwrite does not create missing directories; it just returns False.
  os.makedirs(out_dir, exist_ok=True)

  for i, row in df.iterrows():
    image = cv2.imdecode(np.frombuffer(row['image'], np.uint8), cv2.IMREAD_COLOR)
    if image is None:
      raise ValueError(f"Row {i}: image bytes could not be decoded")

    # image_dir already starts with '../'; prepending another '../' (as the
    # previous version did) wrote the files one level above the dataset folder.
    out_path = f'{out_dir}/{i}.jpg'
    if not cv2.imwrite(out_path, image):
      raise IOError(f"Failed to write image to {out_path}")

# Preview the first rows of the images table (`id` plus the raw encoded bytes).
print(df.head())

# Decode every stored image and write it out as a .jpg file, so that later
# cells can load images by file path instead of from the DataFrame.
dump_images(df)

       image_id    x    y  orientation  radius  class
0             0  269  450     0.000000      17      0
1             0  533  299     0.663225      45      1
2             0  539  427     0.610865      46      1
3             0  365  148     0.488692      45      1
4             0  472  136     2.426008      40      1
...         ...  ...  ...          ...     ...    ...
11495       499  356  363     0.575959      37      2
11496       499  477  371     2.268928      41      2
11497       499  934   74     2.076942      45      2
11498       499  525  283     0.506145      46      2
11499       499  284  397     5.916666      38      2

[11500 rows x 6 columns]
   id                                              image
0   0  b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
1   1  b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
2   2  b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
3   3  b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...
4   4  b'\xff\xd8\xff\xe0\x00\x10JFIF\x00

In [30]:
class CircleObjectsDataset(Dataset):
    """Dataset that yields one ``(image, label)`` pair per annotation row.

    Every row of the annotations table describes a single object and refers to
    its image through ``image_id``; the image itself is read from
    ``{img_dir}/{image_id}.jpg``. Indexing is per annotation row, so an image
    with several annotated objects is returned once for each of them.

    NOTE(review): the image is resized to ``target_size`` while ``x``, ``y``
    and ``radius`` are returned exactly as stored. If those values are pixel
    coordinates in the original image they no longer match the resized
    tensor - confirm whether they should be rescaled.
    """

    # Order of the values in the returned label tensor.
    _LABEL_COLUMNS = ['x', 'y', 'radius', 'orientation', 'class']

    def __init__(self, annotations_file, img_dir, target_size=(256, 256)):
        """Load the annotations table and remember where the images live.

        Args:
            annotations_file: Path to the Parquet file with the annotations.
            img_dir: Directory containing the ``{image_id}.jpg`` files.
            target_size: Size passed to ``cv2.resize`` as ``dsize``, which
                OpenCV interprets as ``(width, height)``.
        """
        self.img_labels = pd.read_parquet(annotations_file)
        self.img_dir = img_dir
        self.target_size = target_size

    def __len__(self):
        """Return the number of annotation rows (not the number of images)."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Return the image tensor and label tensor for annotation row ``idx``.

        Returns:
            A tuple ``(image, label)`` where ``image`` is a float tensor in
            channel-first (C, H, W) layout with values scaled to [0, 1], and
            ``label`` is a float tensor ordered as
            ``[x, y, radius, orientation, class]``.

        Raises:
            FileNotFoundError: if the referenced image file does not exist.
            ValueError: if the file exists but OpenCV cannot decode it.
        """
        row = self.img_labels.iloc[idx]
        img_path = f"{self.img_dir}/{int(row['image_id'])}.jpg"

        # cv2.imread returns None instead of raising, which would otherwise
        # surface later as an opaque cvtColor error; fail early and clearly.
        if not os.path.isfile(img_path):
            raise FileNotFoundError(f"Image file not found: {img_path}")
        image = cv2.imread(img_path)
        if image is None:
            raise ValueError(f"Could not decode image file: {img_path}")

        # OpenCV loads images as BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, self.target_size, interpolation=cv2.INTER_LINEAR)
        # Scale 8-bit pixel values to [0, 1] and move channels first (HWC -> CHW).
        image = image / 255.0
        image = np.transpose(image, (2, 0, 1))
        image = torch.tensor(image, dtype=torch.float)

        # Force a numeric dtype so the conversion also works when the row
        # Series ends up with object dtype.
        label = row[self._LABEL_COLUMNS].to_numpy(dtype=np.float32)
        label = torch.tensor(label, dtype=torch.float)

        return image, label

In [29]:

# Smoke test: build the dataset from the annotations file and inspect one sample.
players_data_set = CircleObjectsDataset(labels_parquet_file_path, image_dir)
# Use normal indexing rather than calling the __getitem__ dunder by hand.
image, label = players_data_set[0]
print(image.shape)
print(label)

../datasets/easy-500/0.jpg
torch.Size([3, 256, 256])
tensor([269., 450.,  17.,   0.,   0.])
