In [None]:
!pwd
!pip install -U -r requirements.txt

In [None]:
from pathlib import Path
from main import download_qm9_data, make_dataset, data_root

# Name of the manifest file and the directory (under data_root) that holds it.
manifest_file: str = "manifest.csv"
file_path: Path | str = data_root.joinpath("data")
manifest_path = file_path.joinpath(manifest_file)

# Generate the dataset only when the data directory does not exist yet.
if not file_path.exists():
    make_dataset(file_path, manifest_file)

## Pandas in training

In [None]:
from timeit import default_timer as timer
import pandas as pd
from main import make_fingerprint_feature2

# Load the dataset manifest; later cells read its 'smiles' and 'homo' columns.
df = pd.read_csv(manifest_path)
    
def pandas_worker(df):
    """Apply ``make_fingerprint_feature2`` to each entry of ``df['smiles']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``smiles`` column.

    Returns
    -------
    pandas.Series
        One result per row, in the same order and with the same index as ``df``.
    """
    return df['smiles'].apply(make_fingerprint_feature2)
    
# Wall-clock timing of the single-process pandas feature computation.
start = timer()
results = pandas_worker(df)
et = timer() - start

print(f"elapsed time: {et:.3f} secs")
# Leave the computed Series as the cell's displayed output.
results

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Features are the per-row outputs of make_fingerprint_feature2; the target is
# the manifest's 'homo' column.
X = results
y = df.homo

# Hold out 10% of the rows. The fixed random_state makes the split identical on
# every run, so downstream scores can be reproduced and compared.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
print(X_train)
print(y_train)

# Stack the per-row features into one float array for scikit-learn
# (assumes every row's feature has the same shape -- TODO confirm).
X_train = np.stack(X_train).astype(float)
y_train = np.stack(y_train)
print(X_train)
print(y_train)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR

# Log-spaced candidate values for the SVR hyper-parameters.
param_space = {
    'C': np.logspace(-6, 6, 13),
    'gamma': np.logspace(-8, 8, 17),
    'tol': np.logspace(-4, -1, 4),
}

model = SVR(kernel='rbf')
# Randomly sample 10 parameter combinations, each scored with 5-fold CV.
# random_state pins which combinations are drawn so reruns are reproducible.
search = RandomizedSearchCV(
    model, param_space, cv=5, n_iter=10, verbose=10, random_state=42
)
search.fit(X_train, y_train)


In [None]:
# Temporarily lift pandas' row/column display limits to show the full CV table.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    # A bare expression inside a ``with`` block is not echoed by the notebook,
    # so the frame has to be rendered explicitly while the options are active.
    display(pd.DataFrame(search.cv_results_))

## Dask in training

In [None]:
from dask.distributed import Client

# Create a dask.distributed client with default arguments (per the Dask docs this
# starts a local cluster); the following cells submit work through it.
client = Client()

In [None]:
from timeit import default_timer as timer
import pandas as pd
import dask.dataframe as dd
from main import make_fingerprint_feature2

# Read the manifest as a Dask dataframe and split it into eight partitions so
# the per-row work can be spread across workers.
ddf = dd.read_csv(manifest_path).repartition(npartitions=8)

def dask_worker(ddf):
    """Apply ``make_fingerprint_feature2`` to ``ddf['smiles']`` partition by partition.

    Parameters
    ----------
    ddf : dask.dataframe.DataFrame
        Frame that contains a ``smiles`` column.

    Returns
    -------
    pandas.Series
        The per-row results of all partitions, concatenated in partition order.
    """

    def mff_wrapper(smiles):
        # ``smiles`` is one partition's 'smiles' column as a pandas Series.
        return smiles.apply(make_fingerprint_feature2)

    # map_partitions keeps the whole computation in a single task graph.
    # Passing Delayed partitions through client.map and calling .compute() inside
    # each task launches tasks from within tasks, which can block worker threads.
    # meta declares an object-dtype Series named 'smiles'
    # (assumes the feature function returns non-scalar objects -- TODO confirm).
    features = ddf['smiles'].map_partitions(mff_wrapper, meta=('smiles', 'object'))

    # compute() runs on the default scheduler, which is the distributed Client
    # when one has been created in this session.
    return features.compute()

# Wall-clock timing of the Dask-based feature computation.
start = timer()
results = dask_worker(ddf)
et = timer() - start

print(f"elapsed time: {et:.3f} secs")
# Leave the computed Series as the cell's displayed output.
results

In [None]:
from dask_ml.model_selection import train_test_split
import dask.array as da

# Features are the per-row outputs of make_fingerprint_feature2; the target is
# the 'homo' column, materialised from the Dask dataframe as a pandas Series.
X = results
y = ddf.homo.compute()

# Hold out 10% of the rows. The fixed random_state makes the split identical on
# every run, so downstream scores can be reproduced and compared.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
print(X_train)
print(y_train)

# Stack the per-row values into Dask arrays
# (assumes every row's feature has the same shape -- TODO confirm).
X_train = da.stack(X_train)
y_train = da.stack(y_train)
print(X_train)
print(y_train)

In [None]:
import joblib
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
import numpy as np

# Log-spaced candidate values for the SVR hyper-parameters.
param_space = {
    'C': np.logspace(-6, 6, 13),
    'gamma': np.logspace(-8, 8, 17),
    'tol': np.logspace(-4, -1, 4),
}

model = SVR(kernel='rbf')
# Randomly sample 10 parameter combinations, each scored with 5-fold CV.
# random_state pins which combinations are drawn so reruns are reproducible.
search = RandomizedSearchCV(
    model, param_space, cv=5, n_iter=10, verbose=10, random_state=42
)

# Route scikit-learn's joblib-parallel work through the Dask backend.
with joblib.parallel_backend('dask'):
    search.fit(X_train, y_train)

In [None]:
# Temporarily lift pandas' row/column display limits to show the full CV table.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    # A bare expression inside a ``with`` block is not echoed by the notebook,
    # so the frame has to be rendered explicitly while the options are active.
    display(pd.DataFrame(search.cv_results_))