# LitMatter DeepChem
Попытка повторения эксперимента с нашими данными для модели из репозитория
https://github.com/ncfrey/litmatter/blob/main/prototyping.ipynb

* This notebook shows how to speed up [DeepChem](https://github.com/deepchem/deepchem) model training on [MoleculeNet](https://arxiv.org/abs/1703.00564) datasets using the LitMatter template.  
* In this example, we train a simple DeepChem `TorchModel` on the Tox21 dataset.
* The training workflow shown here can be scaled to hundreds of GPUs by changing a single keyword argument!

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch

In [4]:
!pip install deepchem

Collecting deepchem
  Downloading deepchem-2.8.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdkit (from deepchem)
  Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading deepchem-2.8.0-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit, deepchem
Successfully installed deepchem-2.8.0 rdkit-2024.3.6


In [6]:
!pip install pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.6.0-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.11.8-py3-none-any.whl.metadata (5.2 kB)
Downloading pytorch_lightning-2.4.0-py3-none-any.whl (815 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.8-py3-none-any.whl (26 kB)
Downloading torchmetrics-1.6.0-py3-none-any.whl (926 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.11.8 pytorch_lightning-2.4.0 torchmetrics-1.6.0


In [7]:
import deepchem as dc

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import (LightningDataModule, LightningModule, Trainer,
                               seed_everything)

### Load a `LitMolNet` dataset
Any MolNet dataset from `deepchem.molnet` can be used with LitMatter. The specific MolNet dataset and any pre-processing steps can be defined in `data.LitMolNet`

In [9]:
import pandas as pd

In [10]:
hilic_oxana = pd.DataFrame({"smiles": ["CC1=NC(=NC(=N1)N)N", "C1=CC=NN=C1 ", 'C1=CN=CN=C1'], 'rt': [7.1, 4.7, 4.3]})

In [13]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [14]:
from typing import Optional

from pytorch_lightning import LightningDataModule

from torch_geometric.data import DataLoader as PyGDataLoader

from torch.utils.data import DataLoader

import deepchem as dc


# class LitMolNet(LightningDataModule):
#     def __init__(self, dataset, batch_size=16, num_workers=4):
#         super().__init__()
#         self.loader = loader
#         self.batch_size = batch_size
#         self.num_workers = num_workers

#     def prepare_data(self):
#         """Download data if needed."""
#         pass

#     def setup(self, stage: Optional[str] = None):
#         """Apply transformations and split datasets."""
#         task, df, trans = self.loader()
#         train, valid, test = df
#         train, valid, test = (
#             train.make_pytorch_dataset(),
#             valid.make_pytorch_dataset(),
#             test.make_pytorch_dataset(),
#         )

#         self.train_dataset, self.val_dataset, self.test_dataset = train, valid, test

#     def train_dataloader(self):
#         return DataLoader(
#             self.train_dataset,
#             batch_size=self.batch_size,
#             num_workers=self.num_workers,
#             pin_memory=True,
#         )

#     def val_dataloader(self):
#         return DataLoader(
#             self.val_dataset,
#             batch_size=self.batch_size,
#             num_workers=self.num_workers,
#             pin_memory=True,
#         )

#     def test_dataloader(self):
#         return DataLoader(
#             self.test_dataset,
#             batch_size=self.batch_size,
#             num_workers=self.num_workers,
#             pin_memory=True,
#         )

In [36]:
# class LitMolNet(LightningDataModule):
#     def __init__(self, dataset, batch_size=16, num_workers=4):
#         super().__init__()
#         self.dataset = dataset
#         self.batch_size = batch_size
#         self.num_workers = num_workers

#     def prepare_data(self):
#         """Download data if needed."""
#         pass

#     def setup(self, stage: Optional[str] = None):
#         """Apply transformations and split datasets."""
#         # Получаем данные из загруженного датасета
#         self.train_dataset = self.dataset
#         # Если у вас есть валидация и тестовые данные, разбейте их как нужно
#         # Например:
#         self.val_dataset = self.dataset
#         self.test_dataset = self.dataset

#     def train_dataloader(self):
#         return DataLoader(
#             self.train_dataset,
#             batch_size=self.batch_size,
#             num_workers=self.num_workers,
#             pin_memory=True,
#         )

#     def val_dataloader(self):
#         # Проверьте, есть ли у вас валидационные данные
#         return DataLoader(
#             self.val_dataset,
#             batch_size=self.batch_size,
#             num_workers=self.num_workers,
#             pin_memory=True,
#         )

#     def test_dataloader(self):
#         # Проверьте, есть ли у вас тестовые данные
#         return DataLoader(
#             self.test_dataset,
#             batch_size=self.batch_size,
#             num_workers=self.num_workers,
#             pin_memory=True,
#         )

In [113]:
from typing import Optional
import tempfile
import pandas as pd
import deepchem as dc
from pytorch_lightning import LightningDataModule
from torch.utils.data import DataLoader

class LitMolNet(LightningDataModule):
    def __init__(self, dataset, batch_size=16, num_workers=4):
        super().__init__()
        self.dataset = dataset
        self.batch_size = batch_size
        self.num_workers = num_workers

    def prepare_data(self):
        """Download data if needed."""
        pass

    def setup(self, stage: Optional[str] = None):
        """Apply transformations and split datasets."""
        self.train_dataset = self.dataset
        self.val_dataset = self.dataset
        self.test_dataset = self.dataset

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
        )

    def val_dataloader(self):
        if self.val_dataset is not None:
            return DataLoader(
                self.val_dataset,
                batch_size=self.batch_size,
                num_workers=self.num_workers,
                pin_memory=True,
            )
        return None

    def test_dataloader(self):
        if self.test_dataset is not None:
            return DataLoader(
                self.test_dataset,
                batch_size=self.batch_size,
                num_workers=self.num_workers,
                pin_memory=True,
            )
        return None

In [158]:
!pip install --upgrade deepchem



In [161]:
from deepchem.feat import MorganFingerprint


ImportError: cannot import name 'MorganFingerprint' from 'deepchem.feat' (/usr/local/lib/python3.10/dist-packages/deepchem/feat/__init__.py)

In [166]:
hilic_oxana = pd.DataFrame({"smiles": ["CC1=NC(=NC(=N1)N)N", "C1=CC=NN=C1 ", 'C1=CN=CN=C1'], 'rt': [7.1, 4.7, 4.3]})

featurizer = dc.feat.RDKitDescriptors()
features = featurizer.featurize(hilic_oxana["smiles"])

In [169]:
features.shape

(3, 210)

In [178]:
dataset = dc.data.NumpyDataset(features, hilic_oxana["rt"])

In [157]:
dir(dc.feat)

['AtomicConformation',
 'AtomicConformationFeaturizer',
 'AtomicConvFeaturizer',
 'AtomicCoordinates',
 'BAMFeaturizer',
 'BPSymmetryFunctionInput',
 'BasicSmilesTokenizer',
 'BertFeaturizer',
 'BindingPocketFeaturizer',
 'CGCNNFeaturizer',
 'CRAMFeaturizer',
 'CationPiVoxelizer',
 'ChargeVoxelizer',
 'CircularFingerprint',
 'ComplexFeaturizer',
 'ComplexNeighborListFragmentAtomicCoordinates',
 'ContactCircularFingerprint',
 'ContactCircularVoxelizer',
 'ConvMolFeaturizer',
 'CoulombMatrix',
 'CoulombMatrixEig',
 'DMPNNFeaturizer',
 'DummyFeaturizer',
 'ElemNetFeaturizer',
 'ElementPropertyFingerprint',
 'Featurizer',
 'GraphData',
 'GraphMatrix',
 'GroverFeaturizer',
 'HuggingFaceFeaturizer',
 'HuggingFaceVocabularyBuilder',
 'HydrogenBondCounter',
 'HydrogenBondVoxelizer',
 'LCNNFeaturizer',
 'MACCSKeysFingerprint',
 'MATFeaturizer',
 'MXMNetFeaturizer',
 'MaterialCompositionFeaturizer',
 'MaterialStructureFeaturizer',
 'Mol2VecFingerprint',
 'MolGanFeaturizer',
 'MolGraphConvFeaturi

In [141]:
import torch
from torch.utils.data import Dataset, DataLoader

In [142]:
# class DiskDatasetAdapter(Dataset):
#     def __init__(self, disk_dataset):
#         self.disk_dataset = disk_dataset

#     def __len__(self):
#         return len(self.disk_dataset)

#     def __getitem__(self, idx):
#         # Извлечение примера по индексу.
#         # Предполагается, что disk_dataset возвращает x и y. Измените в соответствии с вашим набором данных.
#         x, y, _ = self.disk_dataset[idx]  # Если у вас есть метаданные, добавьте их при необходимости
#         return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.long)  # Приведение к тензорам

# # Инициализация вашего DiskDataset
# # Например, disk_dataset = dc.data.DiskDataset("ваш_путь_к_данным")

# # Создание адаптера Dataset
# disk_dataset = dataset  # ваш DiskDataset
# adapted_dataset = DiskDatasetAdapter(disk_dataset)

# # Создание DataLoader
# data_loader = DataLoader(adapted_dataset, batch_size=32, shuffle=True)

# # Использование DataLoader в вашей модели

In [143]:
data_loader

<torch.utils.data.dataloader.DataLoader at 0x7b4b3c297160>

In [180]:
dm = LitMolNet(dataset=dataset, batch_size=16)
dm.prepare_data()
dm.setup()

In [145]:
for x, y in data_loader:
    print('INPUTS: ', x)
    print('LABELS: ', y)

TypeError: 'DiskDataset' object is not subscriptable

In [17]:
import tempfile

### Instantiate a `LitDeepChem` model
Any `deepchem.models.torch_models.TorchModel` can be used with LitMatter. Here, we'll write our own custom base model in PyTorch and make a `TorchModel`.

In [181]:
import os.path as osp

from typing import Optional, List, NamedTuple

import torch
from torch import Tensor
import torch.nn.functional as F
from torch.nn import Sequential, Linear, ReLU, GRU, ModuleList, BatchNorm1d, MSELoss

import deepchem as dc

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import (
    LightningDataModule,
    LightningModule,
    Trainer,
    seed_everything,
)


# class LitDeepChem(LightningModule):
#     def __init__(self, torch_model, lr=1e-2):
#         """Define DeepChem TorchModel."""
#         super().__init__()

#         self.model = torch_model.model  # torch.nn.Module
#         self.save_hyperparameters()
#         self.lr = lr
#         self.loss_fn = torch_model.loss

#     def training_step(self, batch, batch_idx: int):
#         # Modify for MolNet dataset as needed
#         inputs = batch[0].float()
#         y = batch[1].float()
#         outputs = self.model(inputs)
#         loss = self.loss_fn(outputs, y)
#         self.log("train_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
#         return loss

#     def validation_step(self, batch, batch_idx: int):
#         inputs = batch[0].float()
#         y = batch[2].float()
#         outputs = self.model(inputs)
#         loss = self.loss_fn(outputs, y)
#         self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True)

#     def test_step(self, batch, batch_idx: int):
#         inputs = batch[0].float()
#         y = batch[2].float()
#         outputs = self.model(inputs)
#         loss = self.loss_fn(outputs, y)
#         self.log("test_loss", loss, prog_bar=True, on_step=False, on_epoch=True)

#     def configure_optimizers(self):
#         optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
#         return optimizer

In [78]:
# from deepchem.models import TorchModel

# #class LModel(pl.LightningModule):
# class LitDeepChem(pl.LightningModule):
#     def __init__(self, torch_model, lr=0.001):
#         super(LitDeepChem, self).__init__()
#         self.torch_model = torch_model
#         self.save_hyperparameters(logger=False)
#         self.lr = lr

#         self.criterion = nn.CrossEntropyLoss()

#         # metrics
#         self.metrics = MetricCollection([torchmetrics.MeanAbsoluteError()])
#         self.train_metrics = self.metrics.clone(postfix="/train")
#         self.val_metrics = self.metrics.clone(postfix="/val")

#     def configure_optimizers(self):
#         optimizer = torch.optim.Adam(self.torch_model.model.parameters(), lr=self.lr)
#         scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5)
#         return {
#             "optimizer": optimizer,
#             "lr_scheduler": {
#                 "scheduler": scheduler,
#                 "interval": "epoch",  # or 'step'
#                 "monitor": "loss/val",  # only for self.log
#             },
#         }

#     def forward(self, x):
#         return self.torch_model.predict(x)  # или self.torch_model(x), если метод forward есть в TorchModel

#     def training_step(self, batch, batch_idx):
#         x, y = batch
#         out = self.torch_model.predict(x)
#         loss = self.criterion(out, y)
#         self.train_metrics.update(out.softmax(-1), y)
#         self.log("loss/trian", loss, prog_bar=True)
#         return loss

#     def validation_step(self, batch, batch_idx):
#         x, y = batch
#         out = self.torch_model.predict(x)
#         loss = self.criterion(out, y)
#         self.log("loss/val", loss, prog_bar=True)
#         self.val_metrics.update(out.softmax(-1), y)

#     def on_train_epoch_end(self):
#         self.log_dict(self.train_metrics.compute())
#         self.train_metrics.reset()

#         self.log_dict(self.val_metrics.compute())
#         self.val_metrics.reset()

#     def test_step(self, batch, batch_idx):
#         x, y = batch
#         out = self.torch_model.predict(x)
#         self.metrics.update(out.softmax(-1), y)

#     def on_test_epoch_end(self):
#         self.log_dict(self.metrics.compute())
#         self.metrics.reset()

In [182]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torchmetrics
from deepchem.models import TorchModel

class LitDeepChem(pl.LightningModule):
    def __init__(self, torch_model, lr=0.001):
        super(LitDeepChem, self).__init__()
        self.torch_model = torch_model
        self.save_hyperparameters(logger=False)
        self.lr = lr

        self.criterion = nn.CrossEntropyLoss()

        # metrics
        self.metrics = torchmetrics.MetricCollection([torchmetrics.MeanAbsoluteError()])
        self.train_metrics = self.metrics.clone(prefix="train_")
        self.val_metrics = self.metrics.clone(prefix="val_")

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.torch_model.model.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "epoch",  # or 'step'
                "monitor": "val_loss",  # monitor validation loss
            },
        }

    def forward(self, x):
        return self.torch_model.predict(x)  # или self.torch_model(x), если метод forward есть в TorchModel

    def training_step(self, batch, batch_idx):
        x, y = batch
        out = self.torch_model.predict(x)
        loss = self.criterion(out, y)
        self.train_metrics.update(out.softmax(-1), y)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        out = self.torch_model.predict(x)
        loss = self.criterion(out, y)
        self.val_metrics.update(out.softmax(-1), y)
        self.log("val_loss", loss, prog_bar=True)

    def on_train_epoch_end(self):
        self.log_dict(self.train_metrics.compute())
        self.train_metrics.reset()

        self.log_dict(self.val_metrics.compute())
        self.val_metrics.reset()

    def test_step(self, batch, batch_idx):
        x, y = batch
        out = self.torch_model.predict(x)
        self.metrics.update(out.softmax(-1), y)

    def on_test_epoch_end(self):
        self.log_dict(self.metrics.compute())
        self.metrics.reset()


In [86]:
!pip install lightning

Collecting lightning
  Downloading lightning-2.4.0-py3-none-any.whl.metadata (38 kB)
Downloading lightning-2.4.0-py3-none-any.whl (810 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m811.0/811.0 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning
Successfully installed lightning-2.4.0


In [183]:
import torch
import random
import numpy as np


def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)


set_seed()

In [184]:
import timm
import torch
import torchmetrics
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
import lightning as L
import torchmetrics

from tqdm import tqdm
from torchvision import transforms
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

from torchmetrics import MetricCollection

set_seed()
L.seed_everything(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

INFO: Seed set to 42
INFO:lightning.fabric.utilities.seed:Seed set to 42


In [185]:
import torch.nn as nn
from torchmetrics import MetricCollection

In [186]:
#from lit_models.deepchem_models import LitDeepChem

base_model = torch.nn.Sequential(
torch.nn.Linear(1024, 256),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 12),
)

torch_model = dc.models.TorchModel(base_model, loss=torch.nn.MSELoss())

model = LitDeepChem(torch_model, lr=1e-2)

### Train the model
Simply change the `Trainer` flags as desired for multi-gpu and multi-node training.

In [187]:
trainer = Trainer(#gpus=-1,  # use all available GPUs on each node
#                   num_nodes=1,  # change to number of available nodes
#                  accelerator='ddp',
                 max_epochs=5,
                 )

Exception ignored in: <function _ConnectionBase.__del__ at 0x7b4bdf53c040>
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 132, in __del__
    self._close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor
INFO: GPU available: False, used: False
INFO:lightning.pytorch.utilities.rank_zero:GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [188]:
trainer.fit(model, datamodule=dm)

INFO:pytorch_lightning.callbacks.model_summary:
  | Name          | Type             | Params | Mode 
-----------------------------------------------------------
0 | criterion     | CrossEntropyLoss | 0      | train
1 | metrics       | MetricCollection | 0      | train
2 | train_metrics | MetricCollection | 0      | train
3 | val_metrics   | MetricCollection | 0      | train
-----------------------------------------------------------
0         Trainable params
0         Non-trainable params
0         Total params
0.000     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]



TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
TypeError: 'NumpyDataset' object is not subscriptable


That's it! By changing the `num_nodes` argument, training can be distributed across all available GPUs. For longer training jobs on an HPC cluster, see the provided example batch scripts.

# ИЗ ДОКУМЕНТАЦИИ

In [190]:
import os
import tempfile
tempdir = tempfile.mkdtemp()

In [191]:
import pandas as pd
import deepchem as dc
smiles = ["CCN(CCSC)C(=O)N[C@@](C)(CC)C(F)(F)F","CC1(C)CN(C(=O)Nc2cc3ccccc3nn2)C[C@@]2(CCOC2)O1"]
labels = [3.112,2.432]
df = pd.DataFrame(list(zip(smiles, labels)), columns=["smiles", "task1"])
with dc.utils.UniversalNamedTemporaryFile(mode='w') as tmpfile:
     df.to_csv(tmpfile.name)
     loader = dc.data.CSVLoader(["task1"], feature_field="smiles", featurizer=dc.feat.DummyFeaturizer())
     dataset = loader.create_dataset(tmpfile.name)

In [196]:
!pip install matplotlib-venn



In [195]:
!apt-get -qq install -y libfluidsynth1

E: Package 'libfluidsynth1' has no installation candidate


In [199]:
!apt-get -qq install -y libarchive-dev && pip install -U libarchive

Collecting libarchive
  Using cached libarchive-0.4.7.tar.gz (23 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose (from libarchive)
  Using cached nose-1.3.7-py3-none-any.whl.metadata (1.7 kB)
Using cached nose-1.3.7-py3-none-any.whl (154 kB)
Building wheels for collected packages: libarchive
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for libarchive (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for libarchive[0m[31m
[0m[?25h  Running setup.py clean for libarchive
Failed to build libarchive
[31mERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (libarchive)[0m[31m
[0m

In [200]:
# https://pypi.python.org/pypi/libarchive
import libarchive

ModuleNotFoundError: No module named 'libarchive'

In [201]:
# https://pypi.python.org/pypi/pydot
!apt-get -qq install -y graphviz && pip install pydot



In [202]:
import pydot

In [204]:
!pip install cartopy

Collecting cartopy
  Downloading Cartopy-0.24.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Downloading Cartopy-0.24.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cartopy
Successfully installed cartopy-0.24.1


In [207]:
!pip install --pre deepchem[torch]

Collecting dgl (from deepchem[torch])
  Downloading dgl-2.1.0-cp310-cp310-manylinux1_x86_64.whl.metadata (553 bytes)
Collecting dgllife (from deepchem[torch])
  Downloading dgllife-0.3.2-py3-none-any.whl.metadata (667 bytes)
Collecting torchdata>=0.5.0 (from dgl->deepchem[torch])
  Downloading torchdata-0.9.0-cp310-cp310-manylinux1_x86_64.whl.metadata (5.5 kB)
Downloading dgl-2.1.0-cp310-cp310-manylinux1_x86_64.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dgllife-0.3.2-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.1/226.1 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchdata-0.9.0-cp310-cp310-manylinux1_x86_64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchdata, dgllife, dgl
Successfully installe

In [205]:
import cartopy

In [212]:
import deepchem

In [215]:
dir(deepchem.models.torch_models)

['AtomConvModel',
 'AtomicConv',
 'AtomicConvolution',
 'AttentiveFP',
 'AttentiveFPModel',
 'BasicMolGANModel',
 'CGCNN',
 'CGCNNModel',
 'CNN',
 'CNNModule',
 'Chemberta',
 'CombineMeanStd',
 'DTNN',
 'DTNNModel',
 'DecoderRNN',
 'EdgeNetwork',
 'EncoderRNN',
 'GAN',
 'GANModel',
 'GAT',
 'GATModel',
 'GCN',
 'GCNModel',
 'GINEncoder',
 'GradientPenaltyLayer',
 'GroverFinetune',
 'GroverModel',
 'GroverPretrain',
 'GroverReadout',
 'HuggingFaceModel',
 'InfoGraph',
 'InfoGraphEncoder',
 'InfoGraphModel',
 'InfoGraphStar',
 'InfoGraphStarModel',
 'LCNN',
 'LCNNModel',
 'MAT',
 'MATModel',
 'MEGNetModel',
 'MPNN',
 'MPNNModel',
 'ModularTorchModel',
 'MolGANAggregationLayer',
 'MolGANConvolutionLayer',
 'MolGANEncoderLayer',
 'MolGANMultiConvolutionLayer',
 'MultilayerPerceptron',
 'NeighborList',
 'NormalizingFlow',
 'Pagtn',
 'PagtnModel',
 'ProgressiveMultitask',
 'ProgressiveMultitaskModel',
 'ScaledDotProductAttention',
 'SelfAttention',
 'SeqToSeq',
 'SeqToSeqModel',
 'SetGather'

In [217]:
# pretraining
from deepchem.models.torch_models import ModularTorchModel
pretrain_model_dir = os.path.join(tempdir, 'pretrain-molformer-model')
tokenizer_path = "ibm/MoLFormer-XL-both-10pct"
pretrain_model = ModularTorchModel(task='mlm', model_dir=pretrain_model_dir, tokenizer_path=tokenizer_path)  # mlm pretraining
pretraining_loss = pretrain_model.fit(dataset, nb_epoch=1)

TypeError: ModularTorchModel.__init__() missing 2 required positional arguments: 'model' and 'components'

In [193]:
finetune_model_dir = os.path.join(tempdir, 'finetune-model')
finetune_model = MoLFormer(task='regression', model_dir=finetune_model_dir, tokenizer_path=tokenizer_path)
finetune_model.load_from_pretrained(pretrain_model_dir)
finetuning_loss = finetune_model.fit(dataset, nb_epoch=1)

NameError: name 'MoLFormer' is not defined

In [194]:
result = finetune_model.predict(dataset)
eval_results = finetune_model.evaluate(dataset, metrics=dc.metrics.Metric(dc.metrics.mae_score))



NameError: name 'finetune_model' is not defined