# 2. Making dataset

- A `Dataset` is central to any machine learning model
- `aml` offers `BaseDataset`
- Below are subclasses of `BaseDataset` 

## 2.1. InMemoryDataset

- Holds all data in memory, and is therefore the fastest option
- Not appropriate if the dataset is very large
- Only requires an iterable of data (e.g., a list of data, a datapipe, ...)
- Can be saved to disk and loaded

In [2]:
from aml.data.datapipes import ASEFileReader, AtomsGraphParser, NeighborListBuilder
from torchdata.datapipes.iter import IterableWrapper

# Import InMemoryDataset
from aml.data.dataset import InMemoryDataset

def make_dp(src, neighbor_cutoff):
    """Assemble the datapipe used to feed the datasets in this tutorial.

    The pipeline chains, in order: ``IterableWrapper`` -> ``ASEFileReader``
    -> ``AtomsGraphParser`` -> ``NeighborListBuilder``.

    Args:
        src: A single source given as a string, or an iterable of sources
            (the tutorial passes a list of ``.xyz`` file paths).
        neighbor_cutoff: Cutoff value forwarded to ``NeighborListBuilder``.

    Returns:
        The final ``NeighborListBuilder`` datapipe.
    """
    # A lone string is wrapped in a list so it is treated as one source
    # rather than being iterated character by character.
    sources = [src] if isinstance(src, str) else src
    graphs = AtomsGraphParser(ASEFileReader(IterableWrapper(sources)))
    return NeighborListBuilder(graphs, neighbor_cutoff)

# Build a datapipe over both example files (neighbor cutoff 5.0) and
# construct an InMemoryDataset from it.
dp = make_dp(["data/molecules_1.xyz", "data/molecules_2.xyz"], 5.0)
in_memory_dataset = InMemoryDataset(dp)
# The bare expression makes the notebook display the dataset's repr
in_memory_dataset

InMemoryDataset(7)

In [3]:
# Save the dataset to disk, then load it back as a second InMemoryDataset
in_memory_dataset.save("dataset.pt")
in_memory_dataset_2 = InMemoryDataset.load("dataset.pt")
# Display the loaded dataset; it reports the same size as the original
in_memory_dataset_2

InMemoryDataset(7)

## 2.2. ASEDataset

- A subclass of `InMemoryDataset`
- Reads the files using `ASE` and creates an `InMemoryDataset`
- Builds the neighbor list at dataset creation time

In [4]:
from aml.data.dataset import ASEDataset

# Create the dataset directly from the two example files, without building
# a datapipe by hand.
# NOTE(review): `index` presumably holds one ASE-style selector per file,
# with ":" meaning every frame — confirm against the ASEDataset docs.
ase_dataset = ASEDataset(
    data_source=["data/molecules_1.xyz", "data/molecules_2.xyz"],
    index=[":", ":"],
    neighborlist_cutoff=5.0,
    neighborlist_backend="ase",
    progress_bar=True,
)
# Display the dataset's repr in the notebook
ase_dataset

7it [00:00, 237.50it/s]


ASEDataset(7)

## 2.3. LMDBDataset

- Reads `lmdb` database from file
- Fast and memory-efficient
- Can be created using `dataset.write_lmdb` or `aml.data.utils.write_lmdb_dataset`

In [5]:
import os
from aml.data.utils import write_lmdb_dataset

# LMDB files (and their lock files) that this cell writes further down
files_for_cleanup = [
    "lmdb_dataset.lmdb",
    "lmdb_dataset.lmdb-lock",
    "lmdb_dataset_2.lmdb",
    "lmdb_dataset_2.lmdb-lock",
]
# Cleanup: delete whichever of the files currently exist as regular files
for stale_file in filter(os.path.isfile, files_for_cleanup):
    os.remove(stale_file)

# using write_lmdb_dataset
# (`dp` is the datapipe built with make_dp in the InMemoryDataset example)
write_lmdb_dataset(
    dp,
    "lmdb_dataset.lmdb",
    compress=True,
    metadata={"description": "created using write_lmdb_dataset"}, # optional
)

# using dataset.write_lmdb
# (writes the existing ASEDataset to a second LMDB file)
ase_dataset.write_lmdb("lmdb_dataset_2.lmdb", compress=True)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1444.46it/s]


In [6]:
from aml.data.dataset import LMDBDataset

# Open the LMDB file written with write_lmdb_dataset in the previous cell
lmdb_dataset = LMDBDataset("lmdb_dataset.lmdb")
# Display the dataset's repr in the notebook
lmdb_dataset

LMDBDataset(7)

## 2.4. How to use datasets

In [7]:
import torch
# Accessing by single index returns one data item (printed as an AtomsGraph)
data = lmdb_dataset[0]
print(data)

AtomsGraph(n_atoms=[1], elems=[3], pos=[3, 3], cell=[1, 3, 3], batch=[3], edge_index=[2, 6], edge_shift=[6, 3])


In [8]:
# Accessing by tensor index to create subset:
# the result is again a dataset, containing only the selected entries
idx = torch.tensor([0, 3, 4])
subset = lmdb_dataset[idx]
print(subset)

LMDBDataset(3)


In [9]:
# Methods for creating subsets
# Keep a fraction of the dataset (here 0.5 of 7 entries yields 3)
subset = lmdb_dataset.subset(0.5) # 0.5 is fraction
print(subset)

# Split into two datasets (here 0.7 of 7 entries yields sizes 4 and 3)
dataset_1, dataset_2 = lmdb_dataset.split(0.7)
print(dataset_1, dataset_2)

# Split into three datasets (here sizes 3, 2 and 2).
# NOTE(review): the arguments are presumably the train and validation
# fractions, with the remainder going to test — confirm against the API docs.
train_dataset, val_dataset, test_dataset = lmdb_dataset.train_val_test_split(0.5, 0.3)
print(train_dataset, val_dataset, test_dataset)

LMDBDataset(3)
LMDBDataset(4) LMDBDataset(3)
LMDBDataset(3) LMDBDataset(2) LMDBDataset(2)


In [10]:
# Dataloader
from torch_geometric.loader import DataLoader

# Iterate over the dataset in shuffled mini-batches of up to 3 graphs each;
# every batch is printed as a single AtomsGraphBatch
dataloader = DataLoader(lmdb_dataset, batch_size=3, shuffle=True)
for batch in dataloader:
    print(batch)

AtomsGraphBatch(n_atoms=[3], elems=[22], pos=[22, 3], cell=[3, 3, 3], batch=[22], edge_index=[2, 174], edge_shift=[174, 3], ptr=[4])
AtomsGraphBatch(n_atoms=[3], elems=[20], pos=[20, 3], cell=[3, 3, 3], batch=[20], edge_index=[2, 134], edge_shift=[134, 3], ptr=[4])
AtomsGraphBatch(n_atoms=[1], elems=[4], pos=[4, 3], cell=[1, 3, 3], batch=[4], edge_index=[2, 12], edge_shift=[12, 3], ptr=[2])


## 2.5. Using a configuration dictionary to create datasets & save their configuration

- All datasets (except `InMemoryDataset`) can be constructed from a config dictionary
- Two methods exist: `get_config` and `from_config`
- The special key `"@name"` is required to specify which class to use

In [11]:
# Example (data_source may be different since it uses absolute path)
from pprint import pprint
# Export the dataset's configuration as a dictionary and pretty-print it
config = ase_dataset.get_config()
pprint(config)

{'@name': 'ase_dataset',
 'atomref_energies': None,
 'data_source': ['/Users/mjhong/Documents/Github/aml/tutorials/data/molecules_1.xyz',
                 '/Users/mjhong/Documents/Github/aml/tutorials/data/molecules_2.xyz'],
 'index': [':', ':'],
 'neighborlist_backend': 'ase',
 'neighborlist_cutoff': 5.0,
 'progress_bar': True}


In [12]:
from aml.data.dataset import BaseDataset

# Re-create the dataset from the config through the concrete class
ase_dataset_from_config = ASEDataset.from_config(config)
# Since "@name" is specified, the dataset can be created from base class
ase_dataset_from_basedataset = BaseDataset.from_config(config)

7it [00:00, 287.90it/s]
7it [00:00, 498.72it/s]
