# Finetuning model

In [130]:
import os

import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
import torch
import selfies as sf
from sklearn.metrics import average_precision_score, accuracy_score

In [69]:
def convert_canonical_smiles_to_selfies(smiles_str):
    # Convert SMILES to SELFIES
    selfies_str = sf.encoder(smiles_str)
    return selfies_str

In [70]:
def compute_metrics(p):
    pred, labels = p
    auc_pr = average_precision_score(y_true=labels, y_pred=pred)
    
    return {"auc_pr": auc_pr}

# Prepare data

## Load data

In [71]:
task_filepaths

['/rds/general/user/ssh22/home/FS-Tox/outputs/2023-08-10/16-27-54/data/processed/task/ddd5a240d7bec48_support-32_permutation-3.parquet',
 '/rds/general/user/ssh22/home/FS-Tox/outputs/2023-08-10/16-27-54/data/processed/task/b12eb94097d71ed_support-32_permutation-2.parquet',
 '/rds/general/user/ssh22/home/FS-Tox/outputs/2023-08-10/16-27-54/data/processed/task/ddd5a240d7bec48_support-32_permutation-2.parquet',
 '/rds/general/user/ssh22/home/FS-Tox/outputs/2023-08-10/16-27-54/data/processed/task/9d8d134d88b4b2f_support-32_permutation-1.parquet',
 '/rds/general/user/ssh22/home/FS-Tox/outputs/2023-08-10/16-27-54/data/processed/task/ddd5a240d7bec48_support-32_permutation-1.parquet',
 '/rds/general/user/ssh22/home/FS-Tox/outputs/2023-08-10/16-27-54/data/processed/task/b12eb94097d71ed_support-32_permutation-3.parquet',
 '/rds/general/user/ssh22/home/FS-Tox/outputs/2023-08-10/16-27-54/data/processed/task/b12eb94097d71ed_support-32_permutation-1.parquet',
 '/rds/general/user/ssh22/home/FS-Tox/out

In [153]:
# Set the tokenizer
checkpoint = "ncfrey/ChemGPT-1.2B"

# Set the path to the task folder
path = "/rds/general/user/ssh22/home/FS-Tox/outputs/2023-08-10/16-27-54/data/processed/task"

# Get a list of task filepaths
task_filepaths = [f"{path}/{task}" for task in os.listdir(path)]

# Tokenize the SELFIES
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Adding a padding token
tokenizer.pad_token = "[PAD]"

# Load a task
task = pd.read_parquet(task_filepaths[2])

# Convert canonical SMILES to SELFIES
selfies = task["canonical_smiles"].apply(convert_canonical_smiles_to_selfies)

# Split task into support and query sets
support_selfies = selfies[task["support_query"] == 0].tolist()
query_selfies = selfies[task["support_query"] == 1].tolist()

# Get support and query labels
support_labels = torch.tensor(task["ground_truth"][task["support_query"] == 0].reset_index(drop=True))
query_labels = torch.tensor(task["ground_truth"][task["support_query"] == 1].reset_index(drop=True))

Downloading (…)okenizer_config.json: 100%|██████████| 260/260 [00:00<00:00, 2.42MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 10.8k/10.8k [00:00<00:00, 93.3MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 2.00/2.00 [00:00<00:00, 16.4kB/s]


## Tokenize data

In [157]:
support_encodings = tokenizer(
        support_selfies, padding=True, truncation=True, return_tensors="pt", max_length=256
    )

query_encoding = support_encodings = tokenizer(
        support_selfies, padding=True, truncation=True, return_tensors="pt", max_length=256
)

## Create dataset

In [158]:
# Create a PyTorch dataset for each task
class ChemDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])
    
# Create a PyTorch dataset for each task
support_dataset = ChemDataset(support_encodings, support_labels)
query_dataset = ChemDataset(query_encodings, query_labels)

# Finetune model

In [156]:
# Finetune model with Trainer
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id

# Define Trainer
args = TrainingArguments(
    output_dir="results",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=0,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=support_dataset,
    eval_dataset=query_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

trainer.save_model("./results")

Downloading (…)lve/main/config.json: 100%|██████████| 1.28k/1.28k [00:00<00:00, 11.7MB/s]
Downloading pytorch_model.bin: 100%|██████████| 4.91G/4.91G [02:41<00:00, 30.3MB/s]
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at ncfrey/ChemGPT-1.2B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 38.00 MiB (GPU 0; 23.64 GiB total capacity; 20.95 GiB already allocated; 28.50 MiB free; 22.86 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# Make predictions

## Finetuned model

In [159]:
# Load model
model = AutoModelForSequenceClassification.from_pretrained("./results", num_labels=2)
    
model.config.pad_token_id = tokenizer.pad_token_id

# Define test trainer
trainer = Trainer(model)

# Make prediction
raw_pred, _, _ = trainer.predict(query_dataset)

# Assuming true_labels is a numpy array containing the ground truth labels for query_dataset
true_labels = np.array(query_dataset.labels)

# Compute the probabilities for the positive class
y_prob = raw_pred[:, 1]

# Compute AUC-PR
print(average_precision_score(true_labels, y_prob))

0.9051066217732884


## Baseline model

In [160]:
# Reload baseline model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id

baseline_trainer = Trainer(model)
model.config.pad_token_id = tokenizer.pad_token_id

# Make prediction
raw_pred, _, _ = baseline_trainer.predict(query_dataset)

# Assuming true_labels is a numpy array containing the ground truth labels for query_dataset
true_labels = np.array(query_dataset.labels)

# Compute the probabilities for the positive class
y_prob = raw_pred[:, 1]

# Compute AUC-PR
print(average_precision_score(true_labels, y_prob))

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at ncfrey/ChemGPT-1.2B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 23.64 GiB total capacity; 21.17 GiB already allocated; 10.50 MiB free; 22.88 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [149]:
auc_pr_baseline

0.4540109581776248

# Run script

In [161]:
baseline_auc_pr = []
finetuned_auc_pr = []

baseline_accuracy = []
finetuned_accuracy = []

for task_filepath in task_filepaths:

    # Load data
    support_selfies, support_labels, query_selfies, query_labels = load_data(task_filepath)
    
    # Tokenize data
    support_encodings = tokenize_data(support_selfies)
    query_encodings = tokenize_data(query_selfies)

    # Create support and query data 
    support_dataset = ChemDataset(support_encodings, support_labels)
    query_dataset = ChemDataset(query_encodings, query_labels)
    
    
    finetuned_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    finetuned_model.config.pad_token_id = tokenizer.pad_token_id
    
    # Finetune model
    finetune_model(support_dataset, query_dataset, finetuned_model)
    
    # Load baseline and finetuned models 
    finetuned_model = AutoModelForSequenceClassification.from_pretrained("./results", num_labels=2)
    finetuned_model.config.pad_token_id = tokenizer.pad_token_id
    
    baseline_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    baseline_model.config.pad_token_id = tokenizer.pad_token_id
    
    # Generate predictions
    base_acc, base_auc = generate_predictions(baseline_model, query_dataset)
    finetune_acc, finetune_auc = generate_predictions(finetuned_model, query_dataset)
    baseline_accuracy.append(base_acc)
    finetuned_accuracy.append(finetune_acc)
    baseline_auc_pr.append(base_auc)
    finetuned_auc_pr.append(finetune_auc)

Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at ncfrey/ChemGPT-1.2B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 23.64 GiB total capacity; 21.28 GiB already allocated; 38.50 MiB free; 22.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [151]:
np.mean(baseline_auc_pr)

0.7727593711937145

In [152]:
np.mean(finetuned_auc_pr)

0.8892953159198951

# Functions

In [124]:
def finetune_model(support_dataset, query_dataset, model):

    # Define Trainer
    args = TrainingArguments(
        output_dir="results",
        evaluation_strategy="steps",
        eval_steps=500,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        seed=0,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=support_dataset,
        eval_dataset=query_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    
    trainer.save_model("./results")

In [123]:
def load_data(task_filepath):
    
    # Load a task
    task = pd.read_parquet(task_filepath)

    # Convert canonical SMILES to SELFIES
    selfies = task["canonical_smiles"].apply(convert_canonical_smiles_to_selfies)

    # Split task into support and query sets
    support_selfies = selfies[task["support_query"] == 0].tolist()
    query_selfies = selfies[task["support_query"] == 1].tolist()

    # Get support and query labels
    support_labels = torch.tensor(task["ground_truth"][task["support_query"] == 0].reset_index(drop=True))
    query_labels = torch.tensor(task["ground_truth"][task["support_query"] == 1].reset_index(drop=True))
    
    return support_selfies, support_labels, query_selfies, query_labels

In [122]:
def tokenize_data(data):

    # Tokenize the SELFIES
    return tokenizer(
        data, padding=True, truncation=True, return_tensors="pt", max_length=256
    )

In [137]:
def generate_predictions(model, query_dataset):

    # Define test trainer
    trainer = Trainer(model)

    # Make prediction
    raw_pred, _, _ = trainer.predict(query_dataset)
    
    # Assuming true_labels is a numpy array containing the ground truth labels for query_dataset
    true_labels = np.array(query_dataset.labels)

    # Compute the probabilities for the positive class
    y_prob = raw_pred[:, 1]
    
    y_pred = np.argmax(raw_pred, axis=1)

    # Compute AUC-PR
    return accuracy_score(true_labels, y_pred), average_precision_score(true_labels, y_prob)