Describe the bug
With num_workers=0 everything works, but with num_workers>0 the DataLoader raises an error.
I first asked in the PyTorch repository; they said it is a problem with pycuda being initialized in one process and used from the DataLoader worker processes, but I don't know how to solve it.
pytorch/pytorch#74014
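From the linked discussion, my understanding (I am not sure this is right) is that pycuda.autoinit creates the CUDA context in the parent process at import time, and the fork()ed DataLoader workers cannot use it. One suggestion, as far as I understand it, is to start the workers with the 'spawn' method so nothing CUDA-related is inherited. A minimal, untested sketch of how that would look with my loader (this would also require the dataset to be picklable, so the kernel handles could no longer be stored in __init__):

import torch

if __name__ == "__main__":
    dataset_lmdb = Images_Dataset(path_lmdb, path_lmdb_key_pkl)
    # 'spawn' starts every worker as a fresh process, so the parent's CUDA
    # context (created by pycuda.autoinit) is not inherited; the dataset is
    # then pickled to the workers, so the pycuda function handles would have
    # to be created inside the workers instead of in __init__.
    loader = torch.utils.data.DataLoader(
        dataset_lmdb, batch_size=2, num_workers=1,
        multiprocessing_context="spawn")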
To Reproduce
import pickle

import lmdb
import numpy as np
import torch
import caffe
from caffe.proto import caffe_pb2

print(torch.version.cuda)   # 11.0
print(torch.__version__)    # 1.7.0

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

# mod and mod10 are SourceModule objects compiled at module level, and
# generate_x_y() is a small helper; their definitions are omitted here.

class Images_Dataset(torch.utils.data.Dataset):
    def __init__(self, path_lmdb, path_lmdb_key_pkl, transformI=None, transformM=None):
        self.path_lmdb_key_pkl = path_lmdb_key_pkl
        self.path_lmdb = path_lmdb
        self.list_key_read = []
        with open(self.path_lmdb_key_pkl, 'rb') as f:
            self.list_key_read = pickle.load(f)
        self.lmdb_env = lmdb.open(self.path_lmdb)
        self.datum = caffe_pb2.Datum()
        self.list_pos = generate_x_y()
        self.Data_get = mod.get_function("Data_get")
        self.SpatialAugmentation = mod10.get_function("SpatialAugmentation")

    def __len__(self):
        return len(self.list_key_read)

    def __getitem__(self, idx):
        # value is the raw datum read from the LMDB environment for
        # self.list_key_read[idx]; the read itself is omitted here.
        self.datum.ParseFromString(value)
        data = caffe.io.datum_to_array(self.datum)
        data_img = data[:3, ...]     # [3, 540, 1760]
        data_label = data[3:5, ...]  # [2, 540, 1760]
        loc_gt = np.zeros([16, 32, 96]).astype(np.float32)
        conf_gt = np.zeros([8, 32, 96]).astype(np.float32)
        img_out = np.zeros([3, 128, 384]).astype(np.float32)
        self.Data_get(cuda.In(data_label.astype(np.float32)), cuda.Out(loc_gt),
                      cuda.Out(conf_gt), block=(12, 8, 1), grid=(8, 4, 1))
        self.SpatialAugmentation(cuda.In(data_img.astype(np.float32)),
                                 cuda.Out(img_out), block=(384, 1, 1), grid=(3, 1, 128))
        return data, img_out, loc_gt, conf_gt


if __name__ == "__main__":
    path_lmdb_key_pkl = "/media/lmdb_key_big.pkl"
    path_lmdb = "/media/train/data_lmdb/"
    batch_size = 2
    dataset_lmdb = Images_Dataset(path_lmdb, path_lmdb_key_pkl)
    loader = torch.utils.data.DataLoader(dataset_lmdb, batch_size=batch_size,
                                         num_workers=1)  # works with num_workers=0
    for i, data_ in enumerate(loader):
        print("***" * 3)
        data = data_[0][0].numpy()
/data_1/anconda_install/envs/myconda_1/bin/python /media/algo/data_1/project/goe_pytorch/Data_Loader.py
11.0
1.7.0
<__main__.Images_Dataset_Lmdb object at 0x7f128014b9d0>
Traceback (most recent call last):
File "/media/algo/data_1/project/goe_pytorch/Data_Loader.py", line 927, in <module>
for i, data_ in enumerate(loader):
File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 435, in __next__
data = self._next_data()
File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1085, in _next_data
return self._process_data(data)
File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1111, in _process_data
data.reraise()
File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/_utils.py", line 428, in reraise
raise self.exc_type(msg)
pycuda._driver.LogicError: Caught LogicError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
data = fetcher.fetch(index)
File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/media/algo/data_1/project/goe_pytorch/Data_Loader.py", line 823, in __getitem__
cuda.Out(conf_gt), block=(12, 8, 1), grid=(8, 4, 1))
File "/data_1/anconda_install/envs/myconda_1/lib/python3.7/site-packages/pycuda/driver.py", line 480, in function_call
func._set_block_shape(*block)
pycuda._driver.LogicError: cuFuncSetBlockShape failed: initialization error
Expected behavior
The DataLoader should iterate over the dataset with num_workers > 0 exactly as it does with num_workers = 0, without the pycuda initialization error.
Environment (please complete the following information):
appdirs 1.4.4
certifi 2021.10.8
distro 1.6.0
imageio 2.16.0
lmdb 1.3.0
Mako 1.1.6
MarkupSafe 2.1.0
mkl-fft 1.3.1
mkl-random 1.2.2
mkl-service 2.4.0
networkx 2.6.3
numpy 1.21.2
olefile 0.46
opencv-python 3.4.6.27
packaging 21.3
Pillow 8.4.0
pip 21.2.2
platformdirs 2.5.1
protobuf 3.19.4
pycuda 2021.1
pyparsing 3.0.7
pytools 2022.1
PyWavelets 1.2.0
scikit-build 0.13.1
scikit-image 0.19.2
scipy 1.7.3
setuptools 58.0.4
six 1.16.0
tifffile 2021.11.2
torch 1.7.0
torchaudio 0.7.0a0+ac17b64
torchvision 0.8.0
typing-extensions 3.10.0.2
wheel 0.37.1
Additional context
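For completeness, this is the kind of per-worker initialization I am thinking about trying (just a sketch, not verified to work, and it probably only helps together with the 'spawn' start method sketched above, since CUDA apparently cannot be re-initialized in a fork()ed child). KERNEL_SRC and KERNEL_SRC_AUG stand in for the CUDA source strings that mod and mod10 are built from in my real code:

import torch
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

def worker_init_fn(worker_id):
    # Give each DataLoader worker its own CUDA context instead of relying on
    # the one that pycuda.autoinit created in the parent process.
    cuda.init()
    cuda.Device(0).make_context()  # never popped in this sketch
    # get_worker_info().dataset is this worker's copy of the dataset;
    # recompile the kernels here so the function handles belong to the
    # worker's own context.
    ds = torch.utils.data.get_worker_info().dataset
    mod = SourceModule(KERNEL_SRC)
    mod10 = SourceModule(KERNEL_SRC_AUG)
    ds.Data_get = mod.get_function("Data_get")
    ds.SpatialAugmentation = mod10.get_function("SpatialAugmentation")

loader = torch.utils.data.DataLoader(dataset_lmdb, batch_size=batch_size,
                                     num_workers=1, worker_init_fn=worker_init_fn)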
Looking forward to your reply. Thank you very much.