In [None]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm.auto import trange

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
model = "ibm-research/GP-MoLFormer-Uniq"
deterministic= "store_true"
temperature = 1
batch_size = 1000
num_batches = 1
filename = "uncond.csv"
device = "cuda" if torch.cuda.is_available() else "cpu"


In [8]:
tokenizer = AutoTokenizer.from_pretrained("ibm/MoLFormer-XL-both-10pct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model, deterministic_eval=deterministic, trust_remote_code=True)
model = model.to(device)
model.eval()


MolformerForCausalLM(
  (molformer): MolformerModel(
    (embeddings): MolformerEmbeddings(
      (word_embeddings): Embedding(2362, 768, padding_idx=2)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): MolformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x MolformerLayer(
          (attention): MolformerAttention(
            (self): MolformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (rotary_embeddings): MolformerRotaryEmbedding()
              (feature_map): MolformerFeatureMap(
                (kernel): ReLU()
              )
            )
            (output): MolformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
         

In [11]:
all_generated_batches = []

for _ in trange(num_batches, desc="Generating"):
    # モデルからサンプルを生成
    outputs = model.generate(
        do_sample=True,                      # サンプリングモードを有効にする
        temperature=temperature,              # 温度パラメータ
        top_k=None,                           # top-k フィルタリングを無効にする
        max_length=model.config.max_position_embeddings,  # 最大長さ
        num_return_sequences=batch_size       # バッチサイズ
    )
    # GPU上のテンソルをCPUに移動
    outputs = outputs.cpu()
    all_generated_batches.append(outputs)

# ---------------------------
# デコード（トークン列 → 文字列）
# ---------------------------
smiles_list = []
for batch in all_generated_batches:
    smiles_list.extend(tokenizer.batch_decode(batch, skip_special_tokens=True))

# ---------------------------
# CSVに保存
# ---------------------------
pd.Series(smiles_list).to_csv(filename, header=False, index=False)

Generating: 100%|██████████| 1/1 [00:04<00:00,  4.60s/it]


In [14]:
from rdkit import Chem, RDLogger
RDLogger.DisableLog('rdApp.*')

prompt = "c1ccccc1"

input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"][:, :-1].to(device)

gen = model.generate(
    input_ids,
    do_sample=True,
    top_k=None,
    max_length=model.config.max_position_embeddings,
    num_return_sequences=batch_size
)
smi = tokenizer.batch_decode(gen, skip_special_tokens=True)
mols = [Chem.CanonSmiles(s, useChiral=0) for s in smi if Chem.MolFromSmiles(s) is not None]

print(f"Total number of successful molecules generated is {len(mols)}")
print(f"Total number of unique molecules generated is {len(set(mols))}")
print(smi)

Total number of successful molecules generated is 829
Total number of unique molecules generated is 752
['c1ccccc1-c1cccc(-c2ccc3c(c2)C2(c4cc(-c5ccccc5-c5ccccc5-c5ccccc5)ccc4-c4ccccc42)c2ccccc2-3)c1', 'c1ccccc1.c1ccc([As](c2ccccc2)c2ccc(-n3c4ccccc4c4cc(-n5c6ccccc6c6ccccc65)ccc43)cc2)cc1', 'c1ccccc1.c1c(-c2nnnn2C2CCCCC2)n[nH]c1C1(c2nnnn2-c2ccc(C3CCCCC3NCC3CCCCC3)nn2)CC1', 'c1ccccc1.c1ccncc1', 'c1ccccc1CN1CCN(Cc2cn(CC3CCOC3)nn2)CC1', 'c1ccccc1.c1ccc2c(c1)-c1nc-2nc2[n-]c(nc3nc(nc4[n-]c(n1)c1ccccc41)-c1ccccc1-3)c1ccccc21.c1ccc2[nH+]ccnc2c1', 'c1ccccc1', 'c1ccccc1C(NCc1nncs1)c1cccs1', 'c1ccccc1C1CCC(NC2CCCC(C3CC3)C2)CC1', 'c1ccccc1.c1ccc2ccc3ccccc3c2c1', 'c1ccccc1C1CC(c2ccccc2)=NC(c2cc(-c3ccc4c5ccccc5n(-c5ccccn5)c4c3)cc(-c3ccc4c5ccccc5n(-c5ccc6oc7ccccc7c6c5)c4c3)c2)=N1', 'c1ccccc1CSc1nnc(C2CC2)n1C1CCCCC1', 'c1ccccc1C[S+](C[n+]1ccccc1)c1ccccc1', 'c1ccccc1.c1ccc2cc3c(cc2c1)oc1ccc(-c2c4ccccc4c(-c4ccc5ccccc5c4)c4ccccc24)cc13', 'c1ccccc1-c1cc(-c2cccc3c2oc2c(-c4cccc5c4oc4c(-c6cccc7ccccc67)cccc45)

In [13]:
mols

['c1ccc(OCc2ccc3ccccc3n2)cc1',
 'Nc1ccccc1.[Mo].c1ccc2c(c1)Cc1[n-]ccc1C2c1ccc2[nH]c(Cc3nc4ccccc4[nH]3)cc2c1.c1ccccc1',
 'c1ccc(C2CCN(Cc3ccnc(-c4nn[nH]n4)c3)C2)cc1',
 'c1ccc(OCCCc2nc(C3CCCOC3)no2)cc1',
 'c1ccc2c(c1)Nc1ccccc1S2.c1ccccc1',
 'c1ccc(CN(c2ccccc2NCc2cccs2)C2CC2)cc1',
 'c1ccc(CC[n+]2ccc(-c3cc[nH+]cc3)cc2)cc1',
 'c1ccc(PC2CCCCC2)cc1',
 'Oc1ccccc1COc1ccccc1COCCOCCOCCOCCOc1ccccc1COCCOCCOCCOCc1ccccc1OCCOCCOCc1ccccc1COCCOCCOCCOCCOc1ccccc1',
 'c1ccc2c(CN3CCNC4(CC4)C3)cccc2c1.c1ccccc1',
 'c1ccc2c(c1)C[n+]1ccccc1-2.c1ccccc1',
 'NC(=O)C(N)CO.c1ccccc1',
 'c1ccc(-c2cccc(-c3ccc4sc5cc6c(cc5c4c3)sc3ccccc36)c2)cc1.c1ccccc1',
 'c1ccc(OCCN(Cc2cn(CC3CCCO3)nn2)CC2CCCO2)cc1',
 'c1ccc2scnc2c1.c1ccccc1',
 'c1cc[n+](CC[n+]2cc[se]c2)nc1.c1ccccc1',
 'c1ccc(-c2nnc(SCCCN3CCOCC3)[nH]2)cc1',
 'c1ccc(Oc2ccc(NCc3ccncc3)cc2)cc1',
 'c1ccc([Si]2(c3ccccc3)SCCC2CCCn2cccn2)cc1.c1ccccc1',
 'c1ccc(N(Cc2ccccc2-c2ccc(-n3ccnc3)cc2)c2ccccc2)cc1',
 'c1ccc(OCCOc2ccc(-c3nc(-c4ccc(OCC5CC5)nc4)no3)nc2)cc1',
 'c1ccc2c(c1)CCN