Skip to content

torch.utils.data.Dataset combined with pycuda issue #349

@wuzuowuyou

Description

@wuzuowuyou

Describe the bug
When num_workers=0, everything works, but when num_workers>0, an error is raised.
I asked about this in the PyTorch repository. They said it is a problem with PyCUDA's multithreading initialization, but I don't know how to solve it.

pytorch/pytorch#74014

To Reproduce

import torch
print(torch.version.cuda) # 11.0
print(torch.__version__) # 1.7.0

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
class Images_Dataset(torch.utils.data.Dataset):
    """Dataset that decodes Caffe ``Datum`` records and runs two PyCUDA kernels per sample.

    The constructor loads a pickled key list, opens an LMDB environment and
    looks up the ``Data_get`` and ``SpatialAugmentation`` kernel handles.
    ``__getitem__`` then launches both kernels on the decoded array.

    NOTE(review): this is a trimmed reproduction. ``pickle``, ``lmdb``,
    ``caffe``, ``caffe_pb2``, ``np``, ``generate_x_y``, ``mod`` and ``mod10``
    are not defined or imported in the visible snippet; ``mod``/``mod10`` are
    presumably ``SourceModule`` instances built at module level -- confirm.

    NOTE(review): the kernel handles are fetched in ``__init__``, i.e. in the
    process that constructs the dataset, while ``pycuda.autoinit`` is imported
    at module level. With ``num_workers > 0`` ``__getitem__`` runs in a
    DataLoader worker process; the reported "initialization error" is
    presumably because the CUDA context/handles created in the parent are not
    usable in that worker -- confirm against the PyCUDA/CUDA documentation.
    """

    def __init__(self, path_lmdb, path_lmdb_key_pkl, transformI=None, transformM=None):
        """Load the key list, open the LMDB environment and fetch kernel handles.

        Args:
            path_lmdb: Path passed to ``lmdb.open``.
            path_lmdb_key_pkl: Path of the pickle file holding the key list.
            transformI: Not used anywhere in the visible code.
            transformM: Not used anywhere in the visible code.
        """
        self.path_lmdb_key_pkl = path_lmdb_key_pkl
        self.path_lmdb = path_lmdb

        # The unpickled object defines the dataset length (see __len__).
        self.list_key_read = []
        with open(self.path_lmdb_key_pkl, 'rb') as f:
            self.list_key_read = pickle.load(f)

        # Opened once here, in the constructing process, and kept on the instance.
        self.lmdb_env = lmdb.open(self.path_lmdb)
        # A single Datum message that __getitem__ re-parses into on every call.
        self.datum = caffe_pb2.Datum()

        # Stored but not read by any method shown in this snippet.
        self.list_pos = generate_x_y()

        # Kernel handles are looked up once, in the constructing process.
        self.Data_get = mod.get_function("Data_get")
        self.SpatialAugmentation = mod10.get_function("SpatialAugmentation")

    def __len__(self):
        """Return the number of entries in the unpickled key list."""
        return len(self.list_key_read)

    def __getitem__(self, idx):
        """Decode one record and run both kernels on it.

        Returns:
            Tuple ``(data, img_out, loc_gt, conf_gt)``: the decoded array and
            the three float32 output buffers filled by the kernels.

        NOTE(review): ``value`` is not defined in the visible snippet and
        ``idx`` is never used; the LMDB lookup that should produce ``value``
        appears to have been elided from the reproduction -- confirm.
        """
        self.datum.ParseFromString(value)
        data = caffe.io.datum_to_array(self.datum)
        # First three channels go to SpatialAugmentation, the next two to Data_get.
        data_img = data[:3, ...]  # [3,540,1760]
        data_label = data[3:5, ...]  # [2,540,1760]


        # Host-side float32 output buffers, wrapped in cuda.Out for the launches below.
        loc_gt = np.zeros([16,32,96]).astype(np.float32)
        conf_gt = np.zeros([8,32,96]).astype(np.float32)

        img_out = np.zeros([3, 128, 384]).astype(np.float32)

        # This launch is the call that fails in the reported traceback
        # (cuFuncSetBlockShape: initialization error) when run in a worker.
        self.Data_get(cuda.In(data_label.astype(np.float32)), cuda.Out(loc_gt),
                        cuda.Out(conf_gt), block=(12, 8, 1), grid=(8, 4, 1))

        self.SpatialAugmentation(cuda.In(data_img.astype(np.float32)),
                              cuda.Out(img_out), block=(384, 1, 1), grid=(3, 1, 128))
        return data, img_out, loc_gt, conf_gt





# Reproduction driver: build the dataset in the main process, then iterate it
# through a DataLoader that uses one worker process.
if __name__ == "__main__":
    path_lmdb_key_pkl = "/media/lmdb_key_big.pkl"
    path_lmdb = "/media/train/data_lmdb/"
    batch_size = 2
    # The dataset (including its PyCUDA kernel handles) is constructed here,
    # before any DataLoader worker exists.
    dataset_lmdb = Images_Dataset(path_lmdb, path_lmdb_key_pkl)

    # num_workers=1 moves __getitem__ into a worker process, which is where the
    # traceback in this report is raised; per the report, num_workers=0 works.
    loader = torch.utils.data.DataLoader(dataset_lmdb, batch_size=batch_size, num_workers=1)##num_workers=0
    for i, data_ in enumerate(loader):
        print("***"*3)
        # Presumably the first returned field (`data`) of the first sample in
        # the batch, assuming the default collate function -- confirm.
        data = data_[0][0].numpy()
/data_1/anconda_install/envs/myconda_1/bin/python /media/algo/data_1/project/goe_pytorch/Data_Loader.py
11.0
1.7.0
<__main__.Images_Dataset_Lmdb object at 0x7f128014b9d0>
Traceback (most recent call last):
  File "/media/algo/data_1/project/goe_pytorch/Data_Loader.py", line 927, in <module>
    for i, data_ in enumerate(loader):
  File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 435, in __next__
    data = self._next_data()
  File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1085, in _next_data
    return self._process_data(data)
  File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1111, in _process_data
    data.reraise()
  File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/_utils.py", line 428, in reraise
    raise self.exc_type(msg)
pycuda._driver.LogicError: Caught LogicError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
    data = fetcher.fetch(index)
  File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/media/algo/data_1/project/goe_pytorch/Data_Loader.py", line 823, in __getitem__
    cuda.Out(conf_gt), block=(12, 8, 1), grid=(8, 4, 1))
  File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/pycuda/driver.py", line 480, in function_call
    func._set_block_shape(*block)
pycuda._driver.LogicError: cuFuncSetBlockShape failed: initialization error

Expected behavior
Iterating the DataLoader with num_workers>0 should work the same way as with num_workers=0, without raising an error.

Environment (please complete the following information):
appdirs 1.4.4
certifi 2021.10.8
distro 1.6.0
imageio 2.16.0
lmdb 1.3.0
Mako 1.1.6
MarkupSafe 2.1.0
mkl-fft 1.3.1
mkl-random 1.2.2
mkl-service 2.4.0
networkx 2.6.3
numpy 1.21.2
olefile 0.46
opencv-python 3.4.6.27
packaging 21.3
Pillow 8.4.0
pip 21.2.2
platformdirs 2.5.1
protobuf 3.19.4
pycuda 2021.1
pyparsing 3.0.7
pytools 2022.1
PyWavelets 1.2.0
scikit-build 0.13.1
scikit-image 0.19.2
scipy 1.7.3
setuptools 58.0.4
six 1.16.0
tifffile 2021.11.2
torch 1.7.0
torchaudio 0.7.0a0+ac17b64
torchvision 0.8.0
typing-extensions 3.10.0.2
wheel 0.37.1

Additional context
Looking forward to your reply, thank you very much

Metadata

Metadata

Assignees

No one assigned

    Labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions