In [1]:
!pwd
!pip install -U -r requirements.txt

/examples/qm9
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113


In [2]:
from dask.distributed import Client

# Create the Dask distributed client. The `client` name is notebook-global and
# is used later by dask_worker (client.map / client.gather).
# NOTE(review): called with no arguments, so cluster discovery/creation relies
# on dask.distributed defaults — confirm that is intended outside local runs.
client = Client()

In [3]:
from pathlib import Path
from main import download_qm9_data, make_dataset, data_root

# Name of the manifest CSV and the directory that holds the dataset.
manifest_file: str = "manifest.csv"
file_path: Path | str = data_root / "data"

# Build the dataset only when the data directory does not exist yet.
if not file_path.exists():
    make_dataset(file_path, manifest_file)

# Full path of the manifest, used by the loading cells further down.
manifest_path = file_path / manifest_file

PermissionError: [Errno 13] Permission denied: '/examples/.data/qm9'

## Pandas DataFrame

In [None]:
from timeit import default_timer as timer
import pandas as pd
from main import make_fingerprint_feature2

# Load the manifest CSV into an in-memory pandas DataFrame; its 'smiles'
# column is what pandas_worker processes below.
df = pd.read_csv(manifest_path)
    
def pandas_worker(df):
    """Apply ``make_fingerprint_feature2`` to each entry of ``df['smiles']``.

    Args:
        df: pandas DataFrame that has a ``'smiles'`` column.

    Returns:
        The result of ``df['smiles'].apply(make_fingerprint_feature2)``.
    """
    return df['smiles'].apply(make_fingerprint_feature2)
    
# Time a single pass of the pandas implementation over the whole manifest.
t = timer()
results = pandas_worker(df)
et = timer() - t  # seconds elapsed between the two timer() readings
print(f"elapsed time: {et:.3f} secs")
# Bare last expression so the notebook displays the computed features.
results

## Dask DataFrame

In [None]:
import dask.dataframe as dd

# Read the manifest as a Dask DataFrame and split it into 8 partitions so the
# fingerprint work below can be spread over the cluster.
ddf = dd.read_csv(manifest_path).repartition(npartitions=8)

def dask_worker(ddf):
    """Compute fingerprint features for ``ddf['smiles']`` on the Dask cluster.

    Each partition is processed as a plain pandas DataFrame by
    ``map_partitions``, so no task has to call ``.compute()`` on a ``Delayed``
    object from inside a worker (the previous approach launched nested tasks
    from within tasks).

    Args:
        ddf: Dask DataFrame that has a ``'smiles'`` column.

    Returns:
        A pandas Series holding the per-partition results concatenated in
        partition order.
    """

    def fingerprint_partition(partition):
        # On the worker `partition` is already a concrete pandas DataFrame.
        return partition['smiles'].apply(make_fingerprint_feature2)

    # meta only describes the lazy result (a Series named 'smiles').
    # NOTE(review): object dtype assumes make_fingerprint_feature2 returns a
    # non-scalar (e.g. an array) per molecule — confirm against main.py.
    lazy_features = ddf.map_partitions(
        fingerprint_partition,
        meta=('smiles', 'object'),
    )

    # Submit the whole graph through the notebook's client and block until the
    # concatenated pandas result is available.
    return client.compute(lazy_features).result()

# Time a single pass of the Dask implementation over the partitioned manifest.
# NOTE: this rebinds the notebook-global `results`, `t` and `et` set by the
# pandas timing cell.
t = timer()
results = dask_worker(ddf)
et = timer() - t  # seconds elapsed between the two timer() readings
print(f"elapsed time: {et:.3f} secs")
# Bare last expression so the notebook displays the computed features.
results

## Test Dask in training

In [None]:
from dask_ml.model_selection import train_test_split
import numpy as np
import dask.array as da

# Features are the fingerprints computed above; the regression target is the
# 'homo' column of the manifest, materialised as a pandas Series.
X = results
y = ddf.homo.compute()

# Hold out 10% of the samples. The seed is fixed so that Restart & Run All
# produces the same split (and therefore comparable results) every time.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
print(X_train)
print(y_train)

# Stack the per-sample entries into Dask arrays for the model-fitting cell.
# NOTE(review): da.stack over one entry per sample presumably creates one
# chunk per sample, which can be slow for large datasets — TODO confirm and
# consider building a NumPy array first.
X_train = da.stack(X_train).astype(float)
y_train = da.stack(y_train)
print(X_train)
print(y_train)

In [None]:
import joblib
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR


# Log-spaced candidate values (powers of ten) for the SVR hyper-parameters.
param_space = {
    'C': np.logspace(-6, 6, 13),
    'gamma': np.logspace(-8, 8, 17),
    'tol': np.logspace(-4, -1, 4),
}

model = SVR(kernel='rbf')
# Sample 5 parameter combinations and score each with 3-fold CV. The seed is
# fixed so the same candidates are drawn on every run.
search = RandomizedSearchCV(
    model,
    param_space,
    cv=3,
    n_iter=5,
    verbose=10,
    random_state=42,
)

# Run the search's fits through joblib's 'dask' backend.
with joblib.parallel_backend('dask'):
    search.fit(X_train, y_train)