In [196]:
import fastparquet
from glob import glob
import os
import sys
import pandas as pd
import time
import numpy as np
import torch
import albumentations as A
import io
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from PIL import Image

In [2]:
train_image_path = "/home/bdai/spark_work/spark-warehouse/covid_train_binary"
test_image_path = "/home/bdai/spark_work/spark-warehouse/covid_test_binary"

image_shape = (3, 224, 224)

In [33]:
parquet_list = sorted(glob(os.path.join(train_image_path, '*.parquet')))
fdf = fastparquet.ParquetFile(parquet_list)

In [9]:
raw_cols = ['content', 'label']
list_df = []
for df in fdf.iter_row_groups(columns=raw_cols):
    list_df.append(df)

In [25]:
df_data = pd.concat(list_df)
now = time.time()
seed = int((now - int(now))*100000)
rng = np.random.RandomState(seed=seed)
np_indices = rng.permutation(np.arange(len(df_data)))
list_indices = np_indices.tolist()

In [202]:
# album_transforms = A.Compose([  
#         A.Resize(250, 250),
#         A.ShiftScaleRotate(
#                         shift_limit=0.05,
#                         scale_limit=0.05,
#                         rotate_limit=15,
#                         p=0.5),
#         A.RandomCrop(250, 250),
#         A.HorizontalFlip(p=0.5),
#         #A.RandomBrightnessContrast(p=0.2),
#         #A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
#         ToTensorV2(),
#     ])
album_transforms = A.Compose([A.pytorch.transforms.ToTensorV2()])

class ParquetCategoryDataset(Dataset):  
    def __init__(self,
                 parquet_path,
                 raw_cols,
                 transform,
                 num_cached_parquet=5,
                 num_workers=0,
                 shuffle=True
                 ):

        self.raw_cols = raw_cols  # parquet file에서 사용할 column

        self.parquet_list = sorted(glob(os.path.join(parquet_path, '*.parquet')))
        self.num_cached_parquet = num_cached_parquet  # 캐시할 파일 개수

        self.steps_cache = int(np.ceil(
            len(self.parquet_list) / self.num_cached_parquet))  # cache step
        self.current_parquet_idx = 0
        self.current_pd_parquets = None  # cached parquets
        self.current_indices_in_cache = []  # data index in cached parquet
        self.steps_per_epoch = 0
        self.total_len = self.get_total_length()
        self.transform = transform
        self.shuffle = shuffle
        self.num_workers = num_workers
        self._cache_setting()

    def _cache_setting(self):
        cur_pd, cur_indices = self._cache_parquet(self.current_parquet_idx)
        self.current_pd_parquets = cur_pd
        self.current_indices_in_cache = cur_indices

    def get_total_length(self):
        fdf = fastparquet.ParquetFile(self.parquet_list)
        total_len = 0
        for df in fdf.iter_row_groups(columns=raw_cols):
            total_len += len(df)
        return total_len

    def _cache_parquet(self, idx):
        next_idx = (idx+1)*self.num_cached_parquet
        next_idx = None if next_idx > len(self.parquet_list) else next_idx

        list_part_parquet = self.parquet_list[
            idx*self.num_cached_parquet:next_idx
            ]

        fparquet = fastparquet.ParquetFile(list_part_parquet)

        list_df = []
        for df in fparquet.iter_row_groups(columns=self.raw_cols):
            list_df.append(df)

        df_data = pd.concat(list_df)
        now = time.time()
        seed = int((now - int(now))*100000)
        rng = np.random.RandomState(seed=seed)
        np_indices = rng.permutation(len(df_data)) \
            if self.shuffle else np.arange(len(df_data))
        list_indices = np_indices.tolist()

        return df_data, list_indices

    def __len__(self):
        return self.total_len

    def __getitem__(self, idx):
        # idx는 사용하지 않고 parquet data queue에서 pop

        # parquet file을 교체하는 refresh_idx를 전체 주기보다 짧게 디자인한다.
        # 이때 교체 주기는 하이퍼파라미터로 중복 비율에 영향을 미친다.
        parquet_refresh_freq = 4
        refresh_idx = 1 \
            if self.num_workers == 0 \
            else len(self.current_pd_parquets) \
            - len(self.current_pd_parquets) // (parquet_refresh_freq*self.num_workers)

        if len(self.current_indices_in_cache) < refresh_idx:
            self.current_parquet_idx += 1
            # 캐시 파일 교체 주기가 빨라지므로 교체할 때 parquet list도 무작위로 섞는다.
            if self.current_parquet_idx >= self.steps_cache:
                self.current_parquet_idx = 0
                if self.num_workers > 0:
                    now = time.time()
                    seed = int((now - int(now))*100000)
                    rng = np.random.RandomState(seed=seed)
                    rng.shuffle(self.parquet_list)

            self._cache_setting()

        # 단일 프로세스에서 같은 리스트를 pop으로 꺼내는 경우는 병렬 프로세스에서 중복을 일으키므로
        # indices 행렬에서 random sampling으로 수행 후 해당 index를 삭제하는 형태로 변형한다.
        if self.num_workers != 0:
            now = time.time()
            seed = int((now - int(now))*100000)
            rng = np.random.RandomState(seed=seed)
            rand_idx = rng.randint(
                len(self.current_indices_in_cache)
                )
            pd_idx = self.current_indices_in_cache[rand_idx]
            del self.current_indices_in_cache[rand_idx]
        else:
            pd_idx = self.current_indices_in_cache.pop()
        print(len(self.current_pd_parquets))
        pd_idx = self.current_indices_in_cache.pop()
        pd_raw = self.current_pd_parquets.iloc[pd_idx]
        image = Image.open(io.BytesIO(pd_raw['content'])).resize([image_shape[1],image_shape[2]]).convert('RGB')
        image = np.array(image)
        trans_image = album_transforms(image=image)['image']
        pd_raw['image']= trans_image


        return pd_raw[['image', 'label']]

In [203]:
train_data = ParquetCategoryDataset(train_image_path, raw_cols, 10)

In [204]:
train_dataloader = DataLoader(train_data, batch_size = 64)

In [205]:
for idx, batch in enumerate(train_dataloader):
    image, label = batch['image'], batch['label']
    break

54
54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image


54
54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image


54
54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image


54
54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image


54
54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image


54
54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image


54
54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image


54
54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image


54
54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image


54
54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image


54
54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image


54
54
54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice fro

54
54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image


55
55
55
55
55
55
55
55
55


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice fro

55
55
55
55
55
55
55
55
55


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice fro

55
55
55
55
55
55
55
55
55


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_raw['image']= trans_image
A value is trying to be set on a copy of a slice fro

55


IndexError: pop from empty list