In [1]:
import os

os.chdir("/content/drive/MyDrive/DREAM")
!ls

 competition  'Dream Olfactory Mixtures Plan.gdoc'   paper_chart.ipynb	 Production
 Data	       examine.ipynb			     PaperData		 train_production.ipynb
 dream	       examine_paper.ipynb		     paper_gated.ipynb	 Trials
'dream (1)'    metamers_perceptual.ipynb	     paper.ipynb
'dream (2)'    PAPER_ARCHIVE			     PaperOutput


In [2]:
!rm -rf dream
!git clone https://github.com/laurahsisson/dream.git
import sys

sys.path.append("dream")

Cloning into 'dream'...
remote: Enumerating objects: 424, done.[K
remote: Counting objects: 100% (177/177), done.[K
remote: Compressing objects: 100% (141/141), done.[K
remote: Total 424 (delta 101), reused 100 (delta 36), pack-reused 247 (from 1)[K
Receiving objects: 100% (424/424), 3.25 MiB | 11.44 MiB/s, done.
Resolving deltas: 100% (256/256), done.


In [3]:
# # If something breaks in the notebook it is probably related to a mismatch between the Python version, CUDA or torch
import torch

pytorch_version = f"torch-{torch.__version__}.html"
!pip install --no-index torch-scatter -f https://pytorch-geometric.com/whl/$pytorch_version
!pip install --no-index torch-sparse -f https://pytorch-geometric.com/whl/$pytorch_version
!pip install --no-index torch-cluster -f https://pytorch-geometric.com/whl/$pytorch_version
!pip install --no-index torch-spline-conv -f https://pytorch-geometric.com/whl/$pytorch_version
!pip install torch-geometric
!pip install rdkit
!pip install ogb

Looking in links: https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html
Looking in links: https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html
Looking in links: https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html
Looking in links: https://pytorch-geometric.com/whl/torch-2.5.1+cu121.html


In [4]:
import random

random_seed = 42

torch.manual_seed(random_seed)
random.seed(random_seed)

In [5]:
import json
import dream.gcn

production_path = "Production/gcn"
with open(os.path.join(production_path, "config.json")) as f:
    config = json.load(f)

graph_model = dream.gcn.GCN(**config)
model_weights = torch.load(os.path.join(production_path, "model.pt"), weights_only=True)
graph_model.load_state_dict(model_weights)
graph_model.cuda()
graph_model.eval()
graph_model

GCN(
  (feature_norm): BatchNorm1d(9, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (project_node_feats): Sequential(
    (0): Linear(in_features=9, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.04808747892762695, inplace=False)
  )
  (convs): ModuleList(
    (0-2): 3 x GINConv(nn=Sequential(
      (0): Linear(in_features=128, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=128, bias=True)
    ))
  )
  (norms): ModuleList(
    (0-2): 3 x BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (dropout): Dropout(p=0.04808747892762695, inplace=False)
  (readout): BlendAggregator(
    (readout): SetTransformerAggregation(128, num_seed_points=1, heads=2, layer_norm=False, dropout=0.04808747892762695)
  )
  (notes_predictor): Linear(in_features=128, out_features=101, bias=True)
)

In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
import math
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy import stats
from sklearn.metrics import mean_squared_error
from scipy import stats
from scipy.optimize import curve_fit


data_path = "PaperData"
output_path = "PaperOutput"

# Load the data
mixture_df = pd.read_csv(os.path.join(data_path, "Mixure_Definitions_Training_set.csv"))


def standardize(mix_df):
    mix_df = mix_df.copy()
    # Standardize 'Dataset' names and ensure 'Mixture Label' and CIDs are numeric
    mix_df["Dataset"] = mix_df["Dataset"].str.strip().str.lower()
    mix_df["Mixture Label"] = mix_df["Mixture Label"].astype(int)
    mix_df_cids = mix_df.filter(like="CID").apply(pd.to_numeric, errors="coerce")
    mix_df[mix_df.filter(like="CID").columns] = mix_df_cids
    return mix_df


mixture_df = standardize(mixture_df)
mixture_df

Unnamed: 0,Dataset,Mixture Label,CID,CID.1,CID.2,CID.3,CID.4,CID.5,CID.6,CID.7,...,CID.48,CID.49,CID.50,CID.51,CID.52,CID.53,CID.54,CID.55,CID.56,CID.57
0,snitz 1,1,6501,264.0,2879.0,7685.0,7731.0,326.0,7888.0,61138.0,...,,,,,,,,,,
1,snitz 1,2,240,93009.0,323.0,8148.0,7762.0,3314.0,460.0,6184.0,...,,,,,,,,,,
2,snitz 1,3,7710,,,,,,,,...,,,,,,,,,,
3,snitz 1,4,31276,93009.0,11002.0,323.0,7966.0,8148.0,7632.0,22201.0,...,,,,,,,,,,
4,snitz 1,5,10890,93009.0,11002.0,6982.0,323.0,8797.0,7966.0,8148.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018,metamer,225,6544,7600.0,7749.0,8130.0,8797.0,18458.0,637511.0,637563.0,...,,,,,,,,,,
1019,metamer,233,1213,7888.0,8130.0,8174.0,26331.0,61832.0,82227.0,93375.0,...,,,,,,,,,,
1020,metamer,235,999,6501.0,7406.0,7710.0,7749.0,8174.0,8797.0,22201.0,...,,,,,,,,,,
1021,metamer,243,999,7406.0,7600.0,8129.0,8174.0,9017.0,31252.0,62378.0,...,,,,,,,,,,


In [7]:
distance_df = pd.read_csv(os.path.join(data_path, "TrainingData_mixturedist.csv"))

distance_df["Dataset"] = distance_df["Dataset"].str.strip().str.lower()
distance_df["Mixture 1"] = distance_df["Mixture 1"].astype(int)
distance_df["Mixture 2"] = distance_df["Mixture 2"].astype(int)
distance_df

Unnamed: 0,Dataset,Mixture 1,Mixture 2,Experimental Values
0,snitz 1,1,2,0.604167
1,snitz 1,1,3,0.651042
2,snitz 1,1,4,0.526042
3,snitz 1,1,5,0.505208
4,snitz 1,1,6,0.411458
...,...,...,...,...
827,angle,9,25,0.453571
828,angle,10,26,0.539286
829,metamer,223,233,0.058193
830,metamer,225,235,0.182002


In [8]:
datasets_to_exclude = ["metamer", "random", "angle", "manual"]
train_df = mixture_df[~mixture_df["Dataset"].isin(datasets_to_exclude)]
test_df = mixture_df[mixture_df["Dataset"].isin(datasets_to_exclude)]
print("Not using metamer for test data")

Not using metamer for test data


In [9]:
def load_cid_to_smiles():
    with open(os.path.join(data_path, "cid_to_smiles.json")) as f:
        return json.load(f)


cid_to_smiles = load_cid_to_smiles()
smiles_to_cid = {v: k for k, v in cid_to_smiles.items()}
all_smiles = list(cid_to_smiles.values())
len(cid_to_smiles), next(iter(cid_to_smiles.items()))

(235, ('6501', 'CCOC(=O)C1C(O1)(C)C2=CC=CC=C2'))

In [10]:
def map_cid_to_smiles(df, cid_to_smiles):
    # Create a copy of the DataFrame to avoid modifying the original
    df = df.copy()

    # Get all columns that start with "CID."
    cid_columns = [col for col in df.columns if col.startswith("CID")]

    # Convert CIDs to strings, handle NaN, and apply the mapping for each CID column
    for col in cid_columns:
        df[col] = df[col].apply(
            lambda x: cid_to_smiles.get(str(int(x)), None) if pd.notna(x) else None
        )

    # Combine the `smiles` into a single column, filtering out None values
    df["smiles"] = df[cid_columns].apply(
        lambda row: list(set([val for val in row if pd.notna(val)])), axis=1
    )

    # Return the desired columns
    return df[["Dataset", "Mixture Label", "smiles"]]


# Convert CIDs to SMILES and save the result
smiles_train_df = map_cid_to_smiles(train_df, cid_to_smiles)
smiles_test_df = map_cid_to_smiles(test_df, cid_to_smiles)
smiles_train_df

Unnamed: 0,Dataset,Mixture Label,smiles
0,snitz 1,1,"[CCOC(=O)C1C(O1)(C)C2=CC=CC=C2, C1=CSC=C1, C=C..."
1,snitz 1,2,"[C1=CC=C(C=C1)CCO, CCCCNCCCC, COC1=CC=CC=C1O, ..."
2,snitz 1,3,[CCCCCC1CCC(=O)O1]
3,snitz 1,4,"[C1CCC(CC1)O, CC1(C2CCC(O1)(CC2)C)C, CC(C)CCOC..."
4,snitz 1,5,"[CCCCCC(=O)O, CCCCCCO, C1CCC(CC1)O, CC1COCC2=C..."
...,...,...,...
920,ravia 6,294,"[CC1=CC=C(C=C1)OC(=O)C, CCCCC(=O)OCC, CC1(C2CC..."
921,ravia 6,295,"[CC1=CC(=O)CC(C1)(C)C, COC1=CC=CC=C1, CC(C)CC(..."
922,ravia 6,296,"[CC1=CC(=O)CC(C1)(C)C, C1=CC=C2C(=C1)C(=CN2)CC..."
923,ravia 6,297,"[CC1=CC(=O)CC(C1)(C)C, CC1=CCC(CC1=O)C(=C)C, C..."


In [11]:
ex_target = smiles_train_df["smiles"][0]
ex_target

['CCOC(=O)C1C(O1)(C)C2=CC=CC=C2',
 'C1=CSC=C1',
 'C=CCCC(=O)O',
 'CC1=CC=C(C=C1)O',
 'CC1=CC=C(C=C1)OC(=O)C(C)C',
 'CC(C)C1=CC=C(C=C1)C=O',
 'COC1=C(C=CC(=C1)C=O)O',
 'CCCC(=O)O',
 'CC(CCCC(C)(C)O)CC=O',
 'CC1=CC=C(C=C1)OC']

In [12]:
palette = set(smiles_test_df["smiles"].explode().tolist())
len(palette)

121

In [13]:
from tqdm.notebook import tqdm
from ogb.utils import smiles2graph
import dream.pairdata

graph_data = dict()
errored = 0

# Compute individual graphs for all unique SMILES
for smiles in tqdm(all_smiles, desc="Processing SMILES to graphs"):
    try:
        graph_data[smiles] = dream.pairdata.to_torch(smiles2graph(smiles))
    except AttributeError as e:
        print(f"Error processing SMILES: {smiles}. Error: {e}")
        errored += 1
    except TypeError as e:
        print(f"Error processing SMILES: {smiles}. Error: {e}")
        errored += 1

f"Errored smiles: {errored}"

Processing SMILES to graphs:   0%|          | 0/235 [00:00<?, ?it/s]

'Errored smiles: 0'

In [14]:
smiles_train_df

Unnamed: 0,Dataset,Mixture Label,smiles
0,snitz 1,1,"[CCOC(=O)C1C(O1)(C)C2=CC=CC=C2, C1=CSC=C1, C=C..."
1,snitz 1,2,"[C1=CC=C(C=C1)CCO, CCCCNCCCC, COC1=CC=CC=C1O, ..."
2,snitz 1,3,[CCCCCC1CCC(=O)O1]
3,snitz 1,4,"[C1CCC(CC1)O, CC1(C2CCC(O1)(CC2)C)C, CC(C)CCOC..."
4,snitz 1,5,"[CCCCCC(=O)O, CCCCCCO, C1CCC(CC1)O, CC1COCC2=C..."
...,...,...,...
920,ravia 6,294,"[CC1=CC=C(C=C1)OC(=O)C, CCCCC(=O)OCC, CC1(C2CC..."
921,ravia 6,295,"[CC1=CC(=O)CC(C1)(C)C, COC1=CC=CC=C1, CC(C)CC(..."
922,ravia 6,296,"[CC1=CC(=O)CC(C1)(C)C, C1=CC=C2C(=C1)C(=CN2)CC..."
923,ravia 6,297,"[CC1=CC(=O)CC(C1)(C)C, CC1=CCC(CC1=O)C(=C)C, C..."


In [15]:
import dream.data
import time
from torch_geometric.data import Batch
from scipy.spatial import distance

cache = dict()


def make_embedding(blend_smiles):
    assert len(blend_smiles) == len(set(blend_smiles))
    # For consistency, sort the smiles in the blend.
    blend_smiles = sorted(blend_smiles)
    if tuple(blend_smiles) in cache:
        return cache[tuple(blend_smiles)]
    graphs = [graph_data[smiles] for smiles in blend_smiles]
    blend_data = dream.data.combine_graphs(graphs)

    with torch.no_grad():
        embedding = graph_model(blend_data.cuda())["embed"].cpu().squeeze()

    cache[tuple(blend_smiles)] = embedding
    return embedding


def make_embedding_batch(batch_blend_smiles):
    assert all(
        [
            len(blend_smiles) == len(set(blend_smiles))
            for blend_smiles in batch_blend_smiles
        ]
    )

    # For consistency, sort the smiles in the blend.
    batch_blend_smiles = [sorted(blend_smiles) for blend_smiles in batch_blend_smiles]
    batch_graphs = [
        [graph_data[smiles] for smiles in blend_smiles]
        for blend_smiles in batch_blend_smiles
    ]
    batch_blend_data = [dream.data.combine_graphs(graphs) for graphs in batch_graphs]

    batched_graph = Batch.from_data_list(batch_blend_data)

    with torch.no_grad():
        embedding = graph_model(batched_graph.cuda())["embed"].cpu()

    return embedding


def euclidean_distance(embed1, embed2):
    return (embed1 - embed2).square().sum(dim=-1).sqrt()


ex_batch_smiles = smiles_train_df["smiles"][:42].tolist()
make_embedding(ex_target).shape, make_embedding_batch(ex_batch_smiles).shape

(torch.Size([128]), torch.Size([42, 128]))

In [16]:
# Merge smiles_train_df with metamer_blends to add Smiles 1 and Smiles 2
metamer_blends_smiles = (
    distance_df.merge(
        smiles_test_df,
        left_on=["Dataset", "Mixture 1"],
        right_on=["Dataset", "Mixture Label"],
        how="left",
    )
    .rename(columns={"smiles": "Smiles 1"})
    .merge(
        smiles_test_df,
        left_on=["Dataset", "Mixture 2"],
        right_on=["Dataset", "Mixture Label"],
        how="left",
    )
    .rename(columns={"smiles": "Smiles 2"})
)

# Drop redundant 'Mixture Label' columns after merging
metamer_blends_smiles = metamer_blends_smiles.drop(
    columns=["Mixture Label_x", "Mixture Label_y"]
)
metamer_blends_smiles = metamer_blends_smiles[
    metamer_blends_smiles["Dataset"] == "metamer"
]

metamer_blends_smiles

Unnamed: 0,Dataset,Mixture 1,Mixture 2,Experimental Values,Smiles 1,Smiles 2
829,metamer,223,233,0.058193,"[CC1=CC(=O)CC(C1)(C)C, CC1=CCC(CC1=O)C(=C)C, C...","[CCCCCC=CC=CC=C, CC1=CCC2CC1C2(C)C, CCC1=NC=CN..."
830,metamer,225,235,0.182002,"[CC1=CC(=O)CC(C1)(C)C, CC1=CC=C(C=C1)OC(=O)C, ...","[CCOC(=O)C1C(O1)(C)C2=CC=CC=C2, CC1=CC=C(C=C1)..."
831,metamer,243,253,0.180894,"[CC(C)CCOC(=O)CC1=CC=CC=C1, CC(CCC=C(C)C)CCOC(...","[CC1=CC=C(C=C1)OC(=O)C, CC1=CC2=C(C=C1)N=CC=C2..."


In [17]:
distances = dict()
for _, row in metamer_blends_smiles.iterrows():
    embed1 = make_embedding(row["Smiles 1"])
    embed2 = make_embedding(row["Smiles 2"])
    key = f"{row['Dataset']} {row['Mixture 1']}x{row['Mixture 2']}"
    distance = euclidean_distance(embed1, embed2)
    distances[key] = distance
    print(
        f"Dataset {row['Dataset']}. Mixture 1 {row['Mixture 1']}. Mixture 2 {row['Mixture 2']}. Euclidean Distance {distance:.2f}"
    )
distances

Dataset metamer. Mixture 1 223. Mixture 2 233. Euclidean Distance 33.79
Dataset metamer. Mixture 1 225. Mixture 2 235. Euclidean Distance 25.36
Dataset metamer. Mixture 1 243. Mixture 2 253. Euclidean Distance 37.13


{'metamer 223x233': tensor(33.7868),
 'metamer 225x235': tensor(25.3600),
 'metamer 243x253': tensor(37.1259)}

In [18]:
import random
from tqdm.notebook import tqdm
import time
import numpy as np

# Constants
ALL_BLENDS = smiles_train_df["smiles"].tolist()
MAX_SIZE = 30

In [19]:
def greedy_metamer(target_smiles, display=False):
    valid_smiles = [smiles for smiles in palette if smiles not in target_smiles]
    target_embed = make_embedding(target_smiles)

    current_blend = []
    current_distance = float("inf")
    current_embed = None

    while len(current_blend) < MAX_SIZE:
        if display:
            print(f"Step {len(current_blend)} w/ distance {current_distance:.2f}")

        step_blends = [
            current_blend + [smiles]
            for smiles in valid_smiles
            if smiles not in current_blend
        ]
        step_embeds = make_embedding_batch(step_blends)

        step_distances = euclidean_distance(target_embed, step_embeds)
        best_distance, best_idx = torch.min(step_distances, 0)
        best_distance, best_idx = best_distance.item(), best_idx.item()

        best_embed = step_embeds[best_idx]
        best_blend = step_blends[best_idx]

        if best_distance < current_distance:
            current_blend = best_blend
            current_distance = best_distance
            current_embed = best_embed
        else:
            break

    return current_blend, current_distance


greedy_metamer(ex_target, display=True)

Step 0 w/ distance inf
Step 1 w/ distance 47.60
Step 2 w/ distance 35.85
Step 3 w/ distance 27.53
Step 4 w/ distance 19.87
Step 5 w/ distance 14.06
Step 6 w/ distance 11.48
Step 7 w/ distance 10.59
Step 8 w/ distance 10.24
Step 9 w/ distance 9.43
Step 10 w/ distance 8.77


(['CCCCCCCCCCO',
  'C1=CC=C(C=C1)C=O',
  'CC1=CC=C(C=C1)OC(=O)C',
  'CCCCCOC(=O)CCCC',
  'CCC=C(C)C(=O)O',
  'CC1=CC=CC=C1',
  'CCCCCC=CC=CC=C',
  'CC(C)CC(=O)O',
  'COC1=CC=CC=C1',
  'CC1=CCCC(=C)C2CC(C2CC1)(C)C'],
 8.77117919921875)

In [20]:
import random


def random_metamer(target_smiles, display=False):
    valid_smiles = [smiles for smiles in palette if not smiles in target_smiles]
    target_embed = make_embedding(target_smiles)

    blend = random.sample(valid_smiles, MAX_SIZE)
    embed = make_embedding(blend)
    distance = euclidean_distance(target_embed, embed)

    return blend, distance.item()


random_metamer(ex_target)

(['CCCO',
  'CCCCCC1=C(CCC1=O)C',
  'CCCCCCCCCCO',
  'CCCCCC=O',
  'CC(CCC=C(C)C)C=O',
  'COC1=CC=C(C=C1)C=O',
  'CC1=CCC2CC1C2(C)C',
  'COC(=O)C1=CC=CC=C1',
  'CC1=NC=CN=C1C',
  'CC(C)C=O',
  'CCCSCCC',
  'CCCC(=O)OC(C)(C)CC1=CC=CC=C1',
  'COC1=C(C=CC(=C1)CC=C)O',
  'CC1=CCC(CC1=O)C(=C)C',
  'CCCC(CC)O',
  'CC(C)CCOC(=O)C',
  'CC1=CC(=O)CC(C1)(C)C',
  'C1=CC=C(C=C1)CC(=O)O',
  'CSSC',
  'CCCCCCCC(=O)OC',
  'CC(C1=CC=CC=C1)C(OC)OC',
  'CC(CCC=C(C)C)CCO',
  'CCCCCCC=O',
  'CCOC(=O)C(C)O',
  'CCCCOC(=O)C',
  'CCCC(=O)CC',
  'COC1=CC=CC=C1',
  'CC(C)CC(=O)O',
  'CCCC(=O)SC',
  'CCCCCCCCO'],
 33.46923065185547)

In [21]:
import random
import copy

POPULATION_SIZE = 1000
GENERATIONS = 5
MUTATION_RATE = 0.25  # Mutation rate
CROSSOVER_RATE = 0.5  # Crossover rate


def evolutionary_metamer(target_smiles, display=False, initial=None):
    # Prepare the target embedding
    valid_smiles = [smiles for smiles in palette if smiles not in target_smiles]
    target_embed = make_embedding(target_smiles)

    # Helper functions for mutation and crossover
    def mutate(blend):
        # Perform mutation with the given mutation rate
        blend = copy.deepcopy(blend)  # Ensure a new object is modified
        if random.random() < MUTATION_RATE:
            action = random.choice(["replace", "drop", "add"])
            available = [s for s in valid_smiles if s not in blend]

            if action == "replace" and len(blend) > 0:
                to_replace = random.randint(0, len(blend) - 1)
                new_smile = random.choice(available)
                blend[to_replace] = new_smile

            elif action == "drop" and len(blend) > 1:
                to_remove = random.randint(0, len(blend) - 1)
                blend.pop(to_remove)

            elif action == "add" and len(blend) < MAX_SIZE:
                new_smile = random.choice(available)
                blend.append(new_smile)

        return list(set(blend))[:MAX_SIZE]  # Ensure uniqueness and size constraint

    def crossover(parent1, parent2):
        # Perform crossover with the given crossover rate
        if random.random() < CROSSOVER_RATE:
            try:
                split_point = random.randint(1, min(len(parent1), len(parent2)) - 1)
                child = parent1[:split_point] + parent2[split_point:]
                return list(set(child))[
                    :MAX_SIZE
                ]  # Ensure uniqueness and size constraint
            except ValueError:
                return copy.deepcopy(parent1)
        return copy.deepcopy(parent1)  # Return a copy to avoid modifying parent1

    if initial is None:
        initial, ga = greedy_metamer(target_smiles)

    # Initialize population with deep copies of the initial blend
    population = [copy.deepcopy(initial) for _ in range(POPULATION_SIZE)]

    # Track the best blend overall
    global_best_blend = initial
    global_best_embed = make_embedding(initial)
    global_best_distance = euclidean_distance(target_embed, global_best_embed).item()

    for generation in range(GENERATIONS):
        if display:
            print(f"Generation {generation + 1}")

        # Calculate fitness (similarity to target embedding)
        embeds = make_embedding_batch(population)
        distances = euclidean_distance(target_embed, embeds)  # Lower is better
        fitness = 1 / (distances + 1e-8)  # Avoid division by zero

        # Update global best
        best_idx = torch.argmin(distances)  # Minimize similarity
        best_distance = distances[best_idx].item()
        if best_distance < global_best_distance:
            global_best_distance = best_distance
            global_best_blend = population[best_idx]
            global_best_embed = embeds[best_idx]

        # Select parents based on fitness (roulette-wheel selection)
        total_fitness = fitness.sum()
        probabilities = fitness / total_fitness
        parents_idx = torch.multinomial(
            probabilities, POPULATION_SIZE, replacement=True
        )
        parents = [population[idx] for idx in parents_idx]

        # Create next generation
        next_population = []
        for i in range(0, len(parents), 2):
            parent1 = parents[i]
            parent2 = parents[i + 1] if i + 1 < len(parents) else parents[0]
            child1 = mutate(crossover(parent1, parent2))
            child2 = mutate(crossover(parent2, parent1))
            next_population.extend([child1, child2])

        # Ensure population size is consistent
        population = next_population[:POPULATION_SIZE]

        if display:
            print(
                f"Best similarity this generation: {best_distance:.4f}, Global Best: {global_best_distance:.4f}"
            )

    return global_best_blend, global_best_distance


# Call the function
evolutionary_metamer(ex_target, display=True)

Generation 1
Best similarity this generation: 8.7712, Global Best: 8.7712
Generation 2
Best similarity this generation: 8.7712, Global Best: 8.7712
Generation 3
Best similarity this generation: 8.7712, Global Best: 8.7712
Generation 4
Best similarity this generation: 8.7712, Global Best: 8.7712
Generation 5
Best similarity this generation: 8.7712, Global Best: 8.7712


(['CCCCCCCCCCO',
  'C1=CC=C(C=C1)C=O',
  'CC1=CC=C(C=C1)OC(=O)C',
  'CCCCCOC(=O)CCCC',
  'CCC=C(C)C(=O)O',
  'CC1=CC=CC=C1',
  'CCCCCC=CC=CC=C',
  'CC(C)CC(=O)O',
  'COC1=CC=CC=C1',
  'CC1=CCCC(=C)C2CC(C2CC1)(C)C'],
 8.771177291870117)

In [None]:
from tqdm.notebook import tqdm
import numpy as np
import gc

experiment_name = f"Palette_{len(palette)}_MaxSize_{MAX_SIZE}"

greedy_distances = []
random_distances = []
evolutionary_distances = []

greedy_blends = []
random_blends = []
evolutionary_blends = []

LIM = None
print(experiment_name)

# Create a progress bar
with tqdm(total=len(ALL_BLENDS[:LIM]), smoothing=0, desc="Generating Metamers") as pbar:
    for target_smiles in ALL_BLENDS[:LIM]:

        rb, rs = random_metamer(target_smiles)
        random_blends.append(rb)
        random_distances.append(rs)

        gb, gs = greedy_metamer(target_smiles)
        greedy_blends.append(gb)
        greedy_distances.append(gs)

        eb, es = evolutionary_metamer(target_smiles, initial=gb)
        evolutionary_distances.append(es)
        evolutionary_blends.append(eb)

        # Update the progress bar with the current averages
        pbar.set_postfix(
            {
                "Greedy": f"{np.mean(greedy_distances):.3f}",
                "Evolutionary": f"{np.mean(evolutionary_distances):.3f}",
                "Random": f"{np.mean(random_distances):.3f}",
            }
        )
        pbar.update(1)

        gc.collect()
        torch.cuda.empty_cache()

# Print final averages
print(f"\nAverage greedy (Similarity) distance: {np.mean(greedy_distances):.3f}")
print(f"Average evolutionary distance: {np.mean(evolutionary_distances):.3f}")
print(f"Average random distance: {np.mean(random_distances):.3f}")

Palette_121_MaxSize_30


Generating Metamers:   0%|          | 0/925 [00:00<?, ?it/s]

In [None]:
import numpy as np

experiment_path = os.path.join(output_path, experiment_name)
os.makedirs(experiment_path, exist_ok=True)

plt.figure(figsize=(10, 6))

NUM_BINS = 20


blend_sizes = [len(gb) for gb in greedy_blends]
all_distances = greedy_distances + evolutionary_distances + random_distances
max_val = np.max(all_distances)


# Combine the datasets and calculate shared bins
bins = np.linspace(0, max_val, NUM_BINS)


# Plot the histograms using the shared bins
plt.hist(
    greedy_distances,
    bins=bins,
    alpha=0.7,
    label=f"Greedy Metamer (Mean = {np.mean(greedy_distances):.2f})",
)
plt.hist(
    evolutionary_distances,
    bins=bins,
    alpha=0.7,
    label=f"Evolutionary Metamer (Mean = {np.mean(evolutionary_distances):.2f})",
)
plt.hist(
    random_distances,
    bins=bins,
    alpha=0.7,
    label=f"Random Metamer (Mean = {np.mean(random_distances):.2f})",
)

# Add vertical lines for the benchmarks
for name, value in distances.items():
    plt.axvline(value, color="black", linestyle="--", label=f"{name} = {value:.2f}")

plt.xlabel("Euclidean Distance")
plt.ylabel("Counts")
plt.title(
    f"Generated Blend Euclidean Distance (Palette of {len(palette)}) by Technique\nMax Blend Size {MAX_SIZE}. Average Blend Size = {np.mean(blend_sizes):.1f}"
)
plt.xlim((0, max_val))
plt.legend()
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.savefig(os.path.join(experiment_path, "histogram.png"))

plt.show()

In [None]:
gen_metamer_df = smiles_train_df.copy().reset_index(drop=True)[: len(greedy_distances)]
gen_metamer_df["greedy_metamer_smiles"] = greedy_blends
gen_metamer_df["greedy_metamer_distances"] = greedy_distances
gen_metamer_df["greedy_metamer_valid_threshold"] = [d <= 25 for d in greedy_distances]
gen_metamer_df["greedy_metamer_cids"] = [
    [smiles_to_cid[s] for s in blend] for blend in greedy_blends
]

gen_metamer_df["evolutionary_metamer_smiles"] = evolutionary_blends
gen_metamer_df["evolutionary_metamer_distances"] = evolutionary_distances
gen_metamer_df["evolutionary_metamer_valid_threshold"] = [
    d <= 25 for d in evolutionary_distances
]
gen_metamer_df["evolutionary_metamer_cids"] = [
    [smiles_to_cid[s] for s in blend] for blend in evolutionary_blends
]
gen_metamer_df.to_csv(os.path.join(experiment_path, "generated.csv"))

gen_metamer_df

In [None]:
with open(os.path.join(experiment_path, "palette.json"), "w") as f:
    json.dump(cid_to_smiles, f)
cid_to_smiles

In [None]:
valid_greedy = len(gen_metamer_df[gen_metamer_df["greedy_metamer_valid_threshold"]])
valid_evolutionary = len(
    gen_metamer_df[gen_metamer_df["evolutionary_metamer_valid_threshold"]]
)
print(
    f"Generated {valid_greedy} greedy metamers.\nGenerated {valid_evolutionary} evolutionary metamers."
)