In [1]:
!pwd
!pip install -U -r requirements.txt

/examples/qm9
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Collecting tqdm
  Downloading tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rdkit-pypi
  Downloading rdkit_pypi-2022.3.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.9/22.9 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: tqdm, rdkit-pypi
Successfully installed rdkit-pypi-2022.3.4 tqdm-4.64.0


In [2]:
from dask.distributed import Client

client = Client()

In [3]:
from pathlib import Path
from main import download_qm9_data, make_dataset, data_root

file_path: Path | str = data_root.joinpath("data")
manifest_file: str = "manifest.csv"
if not file_path.exists():
    make_dataset(file_path, manifest_file)
manifest_path = file_path.joinpath(manifest_file)

## Pandas DataFrame

In [4]:
from timeit import default_timer as timer
import pandas as pd
from main import make_fingerprint_feature2

df = pd.read_csv(manifest_path)
    
def pandas_worker(df):
    results = df['smiles'].apply(make_fingerprint_feature2)
    print(results)
    
t = timer()
results = pandas_worker(df)
et = timer() - t
print(f"elapsed time: {et:.3f} secs")


0         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                                ...                        
130181    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
130182    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
130183    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
130184    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
130185    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: smiles, Length: 130186, dtype: object
elapsed time: 24.248 secs


## Dask DataFrame

In [5]:
import dask.dataframe as dd

ddf = dd.read_csv(manifest_path)

def dask_worker(ddf):
    ddf = ddf.repartition(npartitions=8)

    def mff_wrapper(dfd):
        df = dfd.compute()
        return df['smiles'].apply(make_fingerprint_feature2)

    futures = client.map(mff_wrapper, ddf.to_delayed())
    results = pd.concat(client.gather(futures))
    return results

t = timer()
results = dask_worker(ddf)
et = timer() - t
print(f"elapsed time: {et:.3f} secs")
results

elapsed time: 14.802 secs


0         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                                ...                        
130181    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
130182    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
130183    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
130184    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
130185    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: smiles, Length: 130186, dtype: object