In [26]:
!pip install torch transformers rdkit tqdm pandas
!pip install datasets
!pip install 'accelerate=0.26.0
!pip install transformers[torch]
!pip install transformers[torch] accelerate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
zsh:1: unmatched '
zsh:1: no matches found: transformers[torch]
zsh:1: no matches found: transformers[torch]


In [34]:
pip show accelerate

[0mNote: you may need to restart the kernel to use updated packages.


In [27]:
import pandas as pd
import torch
from datasets import Dataset
from rdkit import Chem
from tqdm import tqdm
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

# Reaching this line means every import above resolved without raising.
print("All libraries imported successfully!")

All libraries imported successfully!


In [28]:
def load_dataset(file_path, smiles_column="SMILES"):
    """
    Load SMILES strings from a CSV file into a ``Dataset``.

    Args:
        file_path: Location of the CSV, passed straight to ``pandas.read_csv``.
        smiles_column: Name of the column that holds the SMILES strings.

    Returns:
        The result of ``Dataset.from_dict`` for a single ``"text"`` column
        containing one cleaned SMILES string per usable row.

    Raises:
        KeyError: If ``smiles_column`` is not a column of the CSV.
    """
    df = pd.read_csv(file_path)
    if smiles_column not in df.columns:
        raise KeyError(
            f"Column {smiles_column!r} not found in {file_path!r}; "
            f"available columns: {list(df.columns)}"
        )

    # read_csv turns blank cells into NaN (a float). Dropping them and forcing
    # str keeps non-string values out of the tokenizer input.
    smiles = df[smiles_column].dropna().astype(str).str.strip()
    smiles_list = smiles[smiles != ""].tolist()
    return Dataset.from_dict({"text": smiles_list})

In [29]:
def tokenize_dataset(dataset, tokenizer, max_length=100):
    """
    Tokenize the ``"text"`` column of a dataset to fixed-length sequences.

    Args:
        dataset: Object exposing ``map(fn, batched=True)`` whose batches carry
            a ``"text"`` entry (the SMILES strings).
        tokenizer: Callable tokenizer. If it has no pad token, its EOS token is
            assigned as the pad token (this mutates the tokenizer in place).
        max_length: Every sequence is padded and truncated to this many
            tokens. Defaults to 100.

    Returns:
        Whatever ``dataset.map`` returns after applying the tokenizer to each
        batch.
    """
    # Padding needs a pad token; fall back to EOS when the tokenizer ships
    # without one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",   # pad every sequence up to max_length
            truncation=True,        # cut anything longer than max_length
            max_length=max_length,
        )

    return dataset.map(tokenize_function, batched=True)

In [30]:
def train_molgpt(tokenized_dataset, output_dir="./molgpt_model"):
    """
    Fine-tune the pretrained ``gpt2`` checkpoint on a tokenized dataset.

    Args:
        tokenized_dataset: Training dataset handed to ``Trainer`` as
            ``train_dataset``.
        output_dir: Directory used for checkpoints and for the final saved
            model and tokenizer.

    Side effects:
        Runs training, writes the model and tokenizer to ``output_dir``, and
        prints a confirmation line.
    """
    # Load the tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # The collator below pads with the tokenizer's pad token, so one must be
    # set. It is saved with the tokenizer at the end of this function.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    # Causal-LM collator (mlm=False): builds `labels` from `input_ids` and
    # masks padding positions. Without labels the model returns no loss and
    # Trainer cannot train.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_device_train_batch_size=32,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir="./logs",
        logging_steps=500,
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

In [31]:
def generate_molecules(model, tokenizer, num_molecules=10, max_length=100):
    """
    Sample new SMILES strings from a trained model.

    Args:
        model: Model exposing ``generate(...)`` and a ``device`` attribute.
        tokenizer: Tokenizer providing ``bos_token_id``, ``eos_token_id``,
            ``pad_token_id`` and ``decode``.
        num_molecules: Number of sequences to sample, one ``generate`` call
            each.
        max_length: Forwarded to ``generate`` as ``max_length``.

    Returns:
        A list of ``num_molecules`` decoded strings, with special tokens
        removed and surrounding whitespace stripped. The strings are not
        validated here.
    """
    # The prompt is the same for every sample, so build it once, on the
    # model's device, to avoid a device mismatch when the model is not on CPU.
    start_ids = torch.tensor([[tokenizer.bos_token_id]], device=model.device)
    attention_mask = torch.ones_like(start_ids)

    # Pass an explicit pad id: prefer the tokenizer's own, else reuse EOS.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    generated_smiles = []
    for _ in tqdm(range(num_molecules)):
        # Generate one sampled sequence
        output = model.generate(
            input_ids=start_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=pad_token_id,
        )
        # Decode the generated SMILES
        smiles = tokenizer.decode(output[0], skip_special_tokens=True).strip()
        generated_smiles.append(smiles)
    return generated_smiles


In [32]:
def validate_molecules(generated_smiles):
    """
    Keep only the generated strings that RDKit parses into a real molecule.

    Args:
        generated_smiles: Iterable of candidate SMILES strings.

    Returns:
        A list of the input strings (unchanged, in order) that are non-blank
        and parse to a molecule with at least one atom.
    """
    valid_molecules = []
    for smiles in generated_smiles:
        # Skip blanks and non-strings before parsing: an empty SMILES parses
        # to an empty molecule rather than None, so it would pass a plain
        # truthiness check.
        candidate = smiles.strip() if isinstance(smiles, str) else ""
        if not candidate:
            continue
        mol = Chem.MolFromSmiles(candidate)
        if mol is not None and mol.GetNumAtoms() > 0:
            valid_molecules.append(smiles)
    return valid_molecules

In [33]:
if __name__ == "__main__":
    # One place for the paths: training writes to model_dir and the reload
    # below reads from it, so the two cannot drift apart.
    dataset_path = "antibiotic_dataset.csv"
    model_dir = "./molgpt_model"

    # Load the dataset
    dataset = load_dataset(dataset_path)

    # Tokenize the dataset with the base GPT-2 tokenizer
    base_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenized_dataset = tokenize_dataset(dataset, base_tokenizer)

    # Train the MolGPT model
    train_molgpt(tokenized_dataset, output_dir=model_dir)

    # Load the trained model and the tokenizer saved alongside it
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)

    # Generate new molecules
    generated_smiles = generate_molecules(model, tokenizer, num_molecules=10)

    # Validate the generated molecules
    valid_molecules = validate_molecules(generated_smiles)

    # Print the results
    print("Generated Molecules:")
    for smiles in generated_smiles:
        print(smiles)

    print("\nValid Molecules:")
    for smiles in valid_molecules:
        print(smiles)

Map: 100%|██████████| 10/10 [00:00<00:00, 588.10 examples/s]


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`