-
Notifications
You must be signed in to change notification settings - Fork 1
/
sdo_ml_v1_dataset.py
271 lines (239 loc) · 10.9 KB
/
sdo_ml_v1_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
import torch
import numpy as np
from pathlib import Path
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision.transforms import Compose, Resize, Normalize, Lambda
import math
import pytorch_lightning as pl
class SDONumpyDataset(Dataset):
def __init__(
self,
base_dir,
mode="train",
n_items=None,
file_pattern="*.npz",
data_key="x",
transforms=None):
"""Dataset which loads Numpy npz files
Args:
base_dir ([str]): [Directory in which the npz files are.]
mode (str, optional): [train or val, TODO implement val split]. Defaults to "train".
n_items ([type], optional): [Number of items in on iteration, by default number of files in the loaded set
but can be smaller (uses subset) or larger (uses file multiple times)]. Defaults to None.
file_pattern (str, optional): [File pattern of files to load from the base_dir]. Defaults to "*.npz".
data_key (str, optional): [Data key used to load data from the npz array]. Defaults to 'x'.
transforms ([type], optional): [Transformations to do after loading the data -> pytorch data transforms]. Defaults to None
"""
# TODO only load required channels
self.base_dir = base_dir
self.file_pattern = file_pattern
self.files = list(Path(base_dir).glob(f'**/{file_pattern}'))
self.transforms = transforms
self.data_key = data_key
self.data_len = len(self.files)
print(f"found {len(self.files)} files")
if n_items is None:
self.n_items = self.data_len
else:
self.n_items = int(n_items)
def __len__(self):
return self.n_items
def __getitem__(self, item):
if item >= self.n_items:
raise StopIteration()
idx = item % self.data_len
file_path = str(self.files[idx])
np_arr = np.load(file_path)[self.data_key] # .astype(np.float64)
torch_arr = torch.from_numpy(np_arr)
# convert to 1 x H x W, to be in compatible torchvision format
torch_arr = torch_arr.unsqueeze(dim=0)
if self.transforms is not None:
torch_arr = self.transforms(torch_arr)
return torch_arr, file_path
# Same preprocess as github.com/i4Ds/SDOBenchmark
CHANNEL_PREPROCESS = {
"94": {"min": 0.1, "max": 800, "scaling": "log10"},
"131": {"min": 0.7, "max": 1900, "scaling": "log10"},
"171": {"min": 5, "max": 3500, "scaling": "log10"},
"193": {"min": 20, "max": 5500, "scaling": "log10"},
"211": {"min": 7, "max": 3500, "scaling": "log10"},
"304": {"min": 0.1, "max": 3500, "scaling": "log10"},
"335": {"min": 0.4, "max": 1000, "scaling": "log10"},
"1600": {"min": 10, "max": 800, "scaling": "log10"},
"1700": {"min": 220, "max": 5000, "scaling": "log10"},
"4500": {"min": 4000, "max": 20000, "scaling": "log10"},
"continuum": {"min": 0, "max": 65535, "scaling": None},
"magnetogram": {"min": -250, "max": 250, "scaling": None},
"bx": {"min": -250, "max": 250, "scaling": None},
"by": {"min": -250, "max": 250, "scaling": None},
"bz": {"min": -250, "max": 250, "scaling": None},
}
def get_default_transforms(target_size=128, channel="171"):
"""Returns a Transform which resizes 2D samples (1xHxW) to a target_size (1 x target_size x target_size)
and then converts them to a pytorch tensor.
Args:
target_size (int, optional): [New spatial dimension of the input data]. Defaults to 128.
channel (str, optional): [The SDO channel]. Defaults to 171.
Returns:
[Transform]
"""
"""
Apply the normalization necessary for the sdo-dataset. Depending on the channel, it:
- flips the image vertically
- clips the "pixels" data in the predefined range (see above)
- applies a log10() on the data
- normalizes the data to the [0, 1] range
- normalizes the data around 0 (standard scaling)
:param channel: The kind of data to preprocess
:param resize: Optional size of image (integer) to resize the image
:return: a transforms object to preprocess tensors
"""
#also refer to https://pytorch.org/vision/stable/transforms.html#
# https://gitlab.com/jdonzallaz/solarnet-thesis/-/blob/master/solarnet/data/transforms.py
preprocess_config = CHANNEL_PREPROCESS[channel.lower()]
if preprocess_config["scaling"] == "log10":
# TODO why was vflip(x) used here in SolarNet?
def lambda_transform(x): return torch.log10(torch.clamp(
x,
min=preprocess_config["min"],
max=preprocess_config["max"],
))
mean = math.log10(preprocess_config["min"])
std = math.log10(preprocess_config["max"]) - \
math.log10(preprocess_config["min"])
else:
def lambda_transform(x): return torch.clamp(
x,
min=preprocess_config["min"],
max=preprocess_config["max"],
)
mean = preprocess_config["min"]
std = preprocess_config["max"] - preprocess_config["min"]
transforms = Compose(
[Resize((target_size, target_size)),
Lambda(lambda_transform),
Normalize(mean=[mean], std=[std]),
# required to remove strange distribution of pixels (everything too bright)
Normalize(mean=(0.5), std=(0.5))
]
)
return transforms
def get_sdo_ml_v1_dataset(
base_dir,
mode="train",
batch_size=16,
n_items=None,
pin_memory=False,
num_workers=0,
drop_last=False,
target_size=512,
file_pattern="*.npz",
shuffle=False,
):
"""Returns a Pytorch DataLoader which loads an SDONumpyDataset
Args:
base_dir ([str]): [Directory in which the npz files are.]
mode (str, optional): [train or val, TODO implement train val split]. Defaults to "train".
batch_size (int, optional): [See pytorch DataLoader]. Defaults to 16.
n_items ([int], optional): [Number of items in the dataset, by default number of files in the loaded set
but can be smaller (uses subset) or larger (uses file multiple times)]. Defaults to None.
pin_memory (bool, optional): [See pytorch DataLoader]. Defaults to False.
num_workers (int, optional): [See pytorch DataLoader]. Defaults to 0.
drop_last (bool, optional): [See pytorch DataLoader]. Defaults to False.
target_size (int, optional): [New spatial dimension of to which the input data will be transformed]. Defaults to 512.
file_pattern (str, optional): [File pattern of files to load from the base_dir]. Defaults to "*.npz".
shuffle (bool, optional): [See pytorch DataLoader]. Defaults to False.
Returns:
[DataLoader]: Pytorch data loader which loads an SDONumpyDataset
"""
transforms = get_default_transforms(
target_size=target_size, channel=str(171))
dataset = SDONumpyDataset(
base_dir=base_dir,
mode=mode,
n_items=n_items,
file_pattern=file_pattern,
transforms=transforms,
)
data_loader = DataLoader(
dataset,
batch_size=batch_size,
shuffle=shuffle,
num_workers=num_workers,
pin_memory=pin_memory,
drop_last=drop_last,
)
return data_loader
class SDOMLv1DataModule(pl.LightningDataModule):
def __init__(self, base_dir: Path,
batch_size: int = 16,
n_items: int = None,
pin_memory: bool = False,
num_workers: int = 0,
drop_last: bool = False,
target_size: int = 512,
file_pattern: str = "*.npz",
channel: str = "171",
shuffle: bool = False):
"""
Creates a LightningDataModule for the SDO Ml v1 dataset
Args:
base_dir ([str]): [Directory in which the npz files are.]
batch_size (int, optional): [See pytorch DataLoader]. Defaults to 16.
n_items ([int], optional): [Number of items in the dataset, by default number of files in the loaded set
but can be smaller (uses subset) or larger (uses file multiple times)]. Defaults to None.
pin_memory (bool, optional): [See pytorch DataLoader]. Defaults to False.
num_workers (int, optional): [See pytorch DataLoader]. Defaults to 0.
drop_last (bool, optional): [See pytorch DataLoader]. Defaults to False.
target_size (int, optional): [New spatial dimension of to which the input data will be transformed]. Defaults to 512.
file_pattern (str, optional): [File pattern of files to load from the base_dir]. Defaults to "*.npz".
shuffle (bool, optional): [See pytorch DataLoader]. Defaults to False.
"""
super().__init__()
transforms = get_default_transforms(
target_size=target_size, channel=channel)
dataset = SDONumpyDataset(
base_dir=base_dir / Path("train"),
n_items=n_items,
file_pattern=file_pattern,
transforms=transforms,
)
self.dataset_test = SDONumpyDataset(
base_dir=base_dir / Path("test"),
n_items=n_items,
file_pattern=file_pattern,
transforms=transforms,
)
# TODO implement temporal split
num_samples = len(dataset)
self.dataset_train, self.dataset_val = random_split(
dataset, [int(num_samples*0.8), int(num_samples * 0.2)])
self.batch_size = batch_size
self.pin_memory = pin_memory
self.num_workers = num_workers
self.drop_last = drop_last
self.shuffle = shuffle
def train_dataloader(self):
return DataLoader(self.dataset_train, batch_size=self.batch_size,
shuffle=self.shuffle,
num_workers=self.num_workers,
pin_memory=self.pin_memory,
drop_last=self.drop_last,)
def val_dataloader(self):
return DataLoader(self.dataset_val, batch_size=self.batch_size,
shuffle=self.shuffle,
num_workers=self.num_workers,
pin_memory=self.pin_memory,
drop_last=self.drop_last,)
def test_dataloader(self):
return DataLoader(self.dataset_test, batch_size=self.batch_size,
shuffle=self.shuffle,
num_workers=self.num_workers,
pin_memory=self.pin_memory,
drop_last=self.drop_last,)
def predict_dataloader(self):
return DataLoader(self.dataset_test, batch_size=self.batch_size,
shuffle=self.shuffle,
num_workers=self.num_workers,
pin_memory=self.pin_memory,
drop_last=self.drop_last,)