In [None]:
from rdkit import Chem
from rdkit.Chem import Draw # Added for molecule visualization
from rdkit import RDLogger
from rdkit.Chem import Crippen, Descriptors, MolToSmiles, Draw, QED, rdMolDescriptors
try:
    from rdkit.Chem.SA_Score import sascorer
except ImportError:
    from rdkit.Contrib.SA_Score import sascorer
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split #ML training
import seaborn as sns # Added import

# Switch off every RDKit log channel matching 'rdApp.*'. The notes further down
# say the GA's string operators often produce invalid SMILES, so this is
# presumably here to keep RDKit's parse-failure messages out of the cell output.
RDLogger.DisableLog('rdApp.*') 
# Seed Python's `random` module so code in this notebook that draws from it
# starts from a fixed state on a fresh kernel.
random.seed(42)

In [None]:
from simple_ga import simple_genetic_algorithm, generate_random_smiles, plot_ga_results

In [None]:
# --- Notes for the user ---
# 1. Fitness Function: The `example_fitness_function` is very basic.
#    For real molecular design, use RDKit or other cheminformatics toolkits to:
#    a. Validate SMILES: `mol = Chem.MolFromSmiles(s)`. If `mol is None`, it's invalid.
#    b. Calculate properties: LogP, QED, molecular weight, presence of specific
#       functional groups, similarity to a target, docking scores, etc.
#    c. The fitness function MUST handle invalid SMILES gracefully (e.g., return
#       a very low score or None, as handled by the GA).
#
# 2. SMILES_CHARS: The `SMILES_CHARS` list is a simplified alphabet.
#    Tailor it to the chemical space you are exploring (e.g., add 'Si', 'B', aromatic lowercase atoms).
#
# 3. Crossover and Mutation: These are simple string manipulations and will
#    often produce invalid SMILES. The fitness function's ability to
#    penalize/filter invalid SMILES is crucial. More advanced, chemically-aware
#    operators exist (e.g., using SELFIES, graph-based methods) but are more complex.
#

In [None]:
# --- User-defined components ---
def example_fitness_function(smiles_string):
    """
    Toy fitness function: RDKit validity check plus simple string heuristics.

    The score is derived from the raw SMILES text (not from the parsed
    molecule):
      * length term: +len if 5 <= len <= 30, otherwise -|len - 15|
      * +10 for every 'O' character and +5 for every 'N' character
      * -2 for every 'Cl' substring

    Args:
        smiles_string (str): Candidate SMILES string.

    Returns:
        float: The heuristic score, or -1000.0 when the string is empty,
        shorter than 3 or longer than 50 characters, or rejected by
        ``Chem.MolFromSmiles``.
    """
    if not smiles_string:  # Handle empty string (or None)
        return -1000.0

    length = len(smiles_string)
    # Length gate first: it is a pure string test, so out-of-range candidates
    # are rejected without paying for an RDKit parse or any scoring.
    if length < 3 or length > 50:
        return -1000.0

    # Using RDKit for validation
    if Chem.MolFromSmiles(smiles_string) is None:
        return -1000.0  # Invalid SMILES

    score = 0.0
    # Reward length within the preferred window; otherwise penalize the
    # distance from the "ideal" length of 15.
    if 5 <= length <= 30:
        score += length
    else:
        score -= abs(length - 15)

    score += smiles_string.count('O') * 10  # Reward Oxygen
    score += smiles_string.count('N') * 5   # Reward Nitrogen
    score -= smiles_string.count('Cl') * 2  # Penalize Chlorine slightly

    return score

In [None]:
# --- GA Parameters ---
num_generations, pop_size = 100, 100
crossover_rate, mutation_rate = 0.8, 0.2  # mutation_rate: probability an individual is mutated
tournament_size, elitism_count = 5, 2

# Build the starting population one member at a time; every call to
# generate_random_smiles receives an integer drawn uniformly from 10..25.
initial_pop = []
for _member in range(pop_size):
    requested_len = random.randint(10, 25)
    initial_pop.append(generate_random_smiles(requested_len))

In [None]:
print("Starting Simple Genetic Algorithm for SMILES generation...")

# Gather the whole run configuration in one mapping, then launch the GA with it.
ga_settings = {
    "initial_population": initial_pop,
    "fitness_function": example_fitness_function,
    "generations": num_generations,
    "population_size": pop_size,
    "crossover_rate": crossover_rate,
    "mutation_rate": mutation_rate,
    "tournament_size": tournament_size,
    "elitism_count": elitism_count,
}
results = simple_genetic_algorithm(**ga_settings)

# Report the best individual found and plot the run.
print("\nGA Run Finished.")
print(f"Best SMILES found: {results['best_smiles']}")
print(f"Best Fitness score: {results['best_fitness']:.4f}")
plot_ga_results(results)

In [None]:
# Let's do a more interesting example using RDKit's simple LogP prediction
def rdkit_logp_fitness(smiles):
    """
    Score a SMILES string by its Crippen LogP as computed by RDKit.

    Returns -1000.0 instead of a LogP when the string is empty, cannot be
    parsed, is shorter than 3 or longer than 50 characters, or when the
    LogP calculation raises.
    """
    penalty = -1000.0

    if not smiles:
        return penalty  # empty input

    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return penalty  # RDKit could not parse the string

    try:
        crippen_logp = Crippen.MolLogP(molecule)
    except Exception:  # any RDKit failure during the calculation
        return penalty

    # The length window is only applied once the LogP has been obtained.
    within_length = 3 <= len(smiles) <= 50
    return crippen_logp if within_length else penalty

In [None]:
# --- GA Parameters ---
num_generations, pop_size = 50, 100
crossover_rate, mutation_rate = 0.8, 0.2  # mutation_rate: probability an individual is mutated
tournament_size, elitism_count = 5, 2

# Starting population: one generate_random_smiles call per member, each given
# an integer drawn uniformly from 10..25.
initial_pop = []
for _member in range(pop_size):
    requested_len = random.randint(10, 25)
    initial_pop.append(generate_random_smiles(requested_len))

print("Starting Simple Genetic Algorithm for SMILES generation...")

# Run the GA with the LogP-only fitness function.
ga_settings = {
    "initial_population": initial_pop,
    "fitness_function": rdkit_logp_fitness,
    "generations": num_generations,
    "population_size": pop_size,
    "crossover_rate": crossover_rate,
    "mutation_rate": mutation_rate,
    "tournament_size": tournament_size,
    "elitism_count": elitism_count,
}
results = simple_genetic_algorithm(**ga_settings)

print("\nGA Run Finished.")
print(f"Best SMILES found: {results['best_smiles']}")
print(f"Best Fitness score: {results['best_fitness']:.4f}")
plot_ga_results(results)


In [None]:
def rdkit_logp_fitness_2(smiles):
    """
    Score a SMILES string from LogP, molecular weight and ring count.

    The score is the Crippen LogP, plus 10 when the molecular weight is
    below 500 (otherwise minus one tenth of the excess over 500), plus 10
    per ring (or minus 10 when there are no rings).

    Returns -1000.0 when the string is empty, unparsable, made of more than
    one fragment, shorter than 3 or longer than 100 characters, or when any
    RDKit call raises.
    """
    rejected = -1000.0

    if not smiles:
        return rejected

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return rejected

    try:
        # LogP is the base of the score.
        logp_term = Crippen.MolLogP(mol)

        # Disconnected structures are rejected outright.
        if len(Chem.GetMolFrags(mol, asMols=False)) > 1:
            return rejected

        # Molecular weight: flat bonus under 500, proportional penalty above.
        weight = Descriptors.MolWt(mol)
        weight_term = 10 if weight < 500 else -(weight - 500) / 10

        # Rings: bonus scales with the ring count; acyclic molecules are penalised.
        ring_count = mol.GetRingInfo().NumRings()
        ring_term = 10 * ring_count if ring_count > 0 else -10

        # Length window is checked last, as in the other fitness functions.
        if not 3 <= len(smiles) <= 100:
            return rejected

        return logp_term + weight_term + ring_term
    except Exception:  # any RDKit error during the calculations
        return rejected

In [None]:
# --- GA Parameters ---
num_generations, pop_size = 50, 100
crossover_rate, mutation_rate = 0.8, 0.2  # mutation_rate: probability an individual is mutated
tournament_size, elitism_count = 5, 2

# Starting population: one generate_random_smiles call per member, each given
# an integer drawn uniformly from 10..25.
initial_pop = []
for _member in range(pop_size):
    requested_len = random.randint(10, 25)
    initial_pop.append(generate_random_smiles(requested_len))

print("Starting Simple Genetic Algorithm for SMILES generation...")

# Run the GA with the LogP + weight + ring fitness function.
ga_settings = {
    "initial_population": initial_pop,
    "fitness_function": rdkit_logp_fitness_2,
    "generations": num_generations,
    "population_size": pop_size,
    "crossover_rate": crossover_rate,
    "mutation_rate": mutation_rate,
    "tournament_size": tournament_size,
    "elitism_count": elitism_count,
}
results = simple_genetic_algorithm(**ga_settings)

print("\nGA Run Finished.")
print(f"Best SMILES found: {results['best_smiles']}")
print(f"Best Fitness score: {results['best_fitness']:.4f}")
plot_ga_results(results)

In [None]:
def rdkit_qed_logp_fitness(smiles):
    """
    Score a SMILES string from LogP, QED, molecular weight, rings and connectivity.

    Terms, added in this order:
      * Crippen LogP
      * QED multiplied by 10
      * -100 when the molecule has more than one fragment
      * +10 when the molecular weight is below 500, otherwise minus one
        tenth of the excess over 500
      * +10 per ring, or -10 when there are no rings

    Returns -1000.0 when the string is empty, unparsable, shorter than 3 or
    longer than 100 characters, or when any RDKit call raises.
    """
    rejected = -1000.0

    if not smiles:
        return rejected

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return rejected

    try:
        # LogP plus the up-weighted drug-likeness estimate.
        total = Crippen.MolLogP(mol) + QED.qed(mol) * 10

        # Disconnected molecules stay in the race but take a heavy penalty.
        if len(Chem.GetMolFrags(mol, asMols=False)) > 1:
            total -= 100

        # Molecular weight: flat bonus under 500, proportional penalty above.
        mol_weight = Descriptors.MolWt(mol)
        total += 10 if mol_weight < 500 else -(mol_weight - 500) / 10

        # Rings: bonus scales with the ring count; acyclic molecules are penalised.
        n_rings = mol.GetRingInfo().NumRings()
        total += 10 * n_rings if n_rings > 0 else -10

        # Length window is applied after everything else has been computed.
        if not 3 <= len(smiles) <= 100:
            return rejected

        return total
    except Exception:
        return rejected

In [None]:
# --- GA Parameters ---
num_generations, pop_size = 50, 100
crossover_rate, mutation_rate = 0.8, 0.2  # mutation_rate: probability an individual is mutated
tournament_size, elitism_count = 5, 2

# Starting population: one generate_random_smiles call per member, each given
# an integer drawn uniformly from 10..25.
initial_pop = []
for _member in range(pop_size):
    requested_len = random.randint(10, 25)
    initial_pop.append(generate_random_smiles(requested_len))

print("Starting Simple Genetic Algorithm for SMILES generation...")

# Run the GA with the QED + LogP fitness function.
ga_settings = {
    "initial_population": initial_pop,
    "fitness_function": rdkit_qed_logp_fitness,
    "generations": num_generations,
    "population_size": pop_size,
    "crossover_rate": crossover_rate,
    "mutation_rate": mutation_rate,
    "tournament_size": tournament_size,
    "elitism_count": elitism_count,
}
results = simple_genetic_algorithm(**ga_settings)

print("\nGA Run Finished.")
print(f"Best SMILES found: {results['best_smiles']}")
print(f"Best Fitness score: {results['best_fitness']:.4f}")
plot_ga_results(results)

In [None]:
def rdkit_sa_qed_logp_fitness(smiles):
    """
    Score a SMILES string from LogP, QED, SAscore, molecular weight, rings
    and connectivity.

    Base score: LogP + 10 * QED + 2 * (5 - SAscore), so a lower SAscore
    contributes more. On top of that:
      * -100 when the molecule has more than one fragment
      * +10 when the molecular weight is below 500, otherwise minus one
        tenth of the excess over 500
      * +10 per ring, or -10 when there are no rings

    Returns -1000.0 when the string is empty, unparsable, shorter than 3 or
    longer than 100 characters, or when any RDKit / sascorer call raises.
    """
    rejected = -1000.0

    if not smiles:
        return rejected

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return rejected

    try:
        lipophilicity = Crippen.MolLogP(mol)
        drug_likeness = QED.qed(mol)
        synth_difficulty = sascorer.calculateScore(mol)  # SAscore: 1 (easy) to 10 (hard)

        # SA term therefore ranges from +8 (SAscore 1) down to -10 (SAscore 10).
        total = lipophilicity + (drug_likeness * 10) + ((5 - synth_difficulty) * 2)

        # Heavy penalty for disconnected molecules.
        if len(Chem.GetMolFrags(mol, asMols=False)) > 1:
            total -= 100

        # Molecular weight: flat bonus under 500, proportional penalty above.
        mol_weight = Descriptors.MolWt(mol)
        total += 10 if mol_weight < 500 else -(mol_weight - 500) / 10

        # Rings: bonus scales with the ring count; acyclic molecules are penalised.
        n_rings = mol.GetRingInfo().NumRings()
        total += 10 * n_rings if n_rings > 0 else -10

        # Length window is applied after everything else has been computed.
        if not 3 <= len(smiles) <= 100:
            return rejected

        return total
    except Exception:
        return rejected

In [None]:
# --- GA Parameters ---
num_generations, pop_size = 50, 100
crossover_rate, mutation_rate = 0.8, 0.2  # mutation_rate: probability an individual is mutated
tournament_size, elitism_count = 5, 2

# Starting population: one generate_random_smiles call per member, each given
# an integer drawn uniformly from 10..25.
initial_pop = []
for _member in range(pop_size):
    requested_len = random.randint(10, 25)
    initial_pop.append(generate_random_smiles(requested_len))

print("Starting Simple Genetic Algorithm for SMILES generation...")

# Run the GA with the SAscore + QED + LogP fitness function.
ga_settings = {
    "initial_population": initial_pop,
    "fitness_function": rdkit_sa_qed_logp_fitness,
    "generations": num_generations,
    "population_size": pop_size,
    "crossover_rate": crossover_rate,
    "mutation_rate": mutation_rate,
    "tournament_size": tournament_size,
    "elitism_count": elitism_count,
}
results = simple_genetic_algorithm(**ga_settings)

print("\nGA Run Finished.")
print(f"Best SMILES found: {results['best_smiles']}")
print(f"Best Fitness score: {results['best_fitness']:.4f}")
plot_ga_results(results)

In [None]:
# Read the tab-separated solubility table and rename its "Drug" / "Y" columns
# to "SMILES" / "LogS" in one chained expression.
# (Thijs used ["Name","SMILES","LogS"])
sol_df = (
    pd.read_csv('solubility_aqsoldb.tab', sep="\t")
    .rename(columns={"Drug": "SMILES", "Y": "LogS"})
)

# Prepare RDKit property calculator: one getter covering every property name
# that rdMolDescriptors.Properties reports as available.
property_names = list(rdMolDescriptors.Properties.GetAvailableProperties())
property_getter = rdMolDescriptors.Properties(property_names)

def smi2props(smi):
    """
    Convert a SMILES string into a NumPy vector of RDKit molecular properties.

    Args:
        smi: SMILES string. Anything that is not a non-empty ``str`` (for
            example a NaN/None cell read from the data table) is treated as
            "no molecule".

    Returns:
        numpy.ndarray or None: The values produced by
        ``property_getter.ComputeProperties`` for the parsed molecule, or
        ``None`` when ``smi`` is missing, empty, or cannot be parsed by RDKit.
    """
    # Guard before touching RDKit: MolFromSmiles raises on non-string input,
    # and an empty string would otherwise be featurised as an atom-less
    # molecule instead of being reported as invalid.
    if not isinstance(smi, str) or not smi:
        return None

    mol = Chem.MolFromSmiles(smi)
    if mol is None:  # explicit None test rather than relying on truthiness
        return None

    # Optional: Chem.DeleteSubstructs(mol, Chem.MolFromSmarts("[#1X0]"))
    return np.array(property_getter.ComputeProperties(mol))

# Featurise every row; smi2props yields None for SMILES it cannot handle.
sol_df['props'] = sol_df['SMILES'].apply(smi2props)

# Keep only the rows for which a property vector was produced.
sol_df = sol_df.dropna(subset=['props'])

# Stop early if nothing survived the filtering step.
if sol_df.empty:
    raise ValueError("DataFrame is empty after attempting to calculate properties and remove NaNs. Check SMILES validity and data.")

# Histogram of the target variable, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(sol_df['LogS'], kde=False, ax=ax)
ax.set_title('Distribution of LogS values')
ax.set_xlabel('LogS')
ax.set_ylabel('Frequency')
plt.show()

# 80/20 split with a fixed random_state.
train, test = train_test_split(sol_df, test_size=0.2, random_state=42)

# Both partitions must contain data before a model can be fitted and evaluated.
if train.empty or test.empty:
    raise ValueError("Training or test set is empty after filtering and splitting. Check data.")

# scikit-learn needs a 2D feature matrix, so stack the per-row property
# vectors; targets are taken straight from the LogS column.
train_X = np.vstack(train['props'].tolist())
test_X = np.vstack(test['props'].tolist())
train_y = train['LogS'].values
test_y = test['LogS'].values

rf2 = RandomForestRegressor(max_depth=10, n_estimators=100, max_features=0.2, random_state=42)

# Fit on the training partition, then predict the held-out partition.
rf2.fit(train_X, train_y)
pred = rf2.predict(test_X)

print("Model training and prediction complete.")
# Optional: from sklearn.metrics import mean_squared_error
# mse = mean_squared_error(test_y, pred)
# print(f"Test MSE: {mse:.4f}")

def ml_fitness(smiles_string):
    """
    Fitness function that uses the trained RandomForestRegressor model (rf2)
    to predict a property (e.g., LogS) for a given SMILES string.

    Args:
        smiles_string (str): The SMILES string of the molecule.

    Returns:
        float: The predicted property value from the rf2 model.
               Returns a very low score (-1000.0) if the input is empty or
               not a string, the SMILES is invalid, properties cannot be
               generated, or prediction fails.
    """
    # Same empty-input guard as the other fitness functions in this notebook.
    # Without it an empty string can be featurised as an atom-less molecule
    # and receive a genuine model prediction, and None would raise inside
    # RDKit instead of being scored as invalid.
    if not isinstance(smiles_string, str) or not smiles_string:
        return -1000.0

    props = smi2props(smiles_string)  # Uses the smi2props function defined above

    if props is None:
        return -1000.0  # SMILES invalid or properties could not be generated

    try:
        # The model expects a 2D array, so wrap the single props vector in a batch of one.
        predicted_value = rf2.predict(np.array([props]))
        return float(predicted_value[0])  # Return the first (and only) prediction
    except Exception:
        return -1000.0  # Fallback for any error during prediction


In [None]:
# --- GA Parameters ---
num_generations = 100
pop_size = 50
crossover_rate = 0.9
mutation_rate = 0.9 # Probability an individual is mutated
tournament_size = 5
elitism_count = 2

# Seed the population with real molecules drawn (with replacement) from the
# whole table. Sampling from the actual list, rather than from a hard-coded
# index range, means row 0 and the tail of the table are eligible and the
# cell cannot index past the end when rows were dropped during featurisation.
seed_smiles = sol_df["SMILES"].tolist()
initial_pop = random.choices(seed_smiles, k=pop_size)
print("Starting Simple Genetic Algorithm for SMILES generation...")

results = simple_genetic_algorithm(
    initial_population=initial_pop,
    fitness_function=ml_fitness,
    generations=num_generations,
    population_size=pop_size,
    crossover_rate=crossover_rate,
    mutation_rate=mutation_rate,
    tournament_size=tournament_size,
    elitism_count=elitism_count
)

print("\nGA Run Finished.")
print(f"Best SMILES found: {results['best_smiles']}")
print(f"Best Fitness score: {results['best_fitness']:.4f}")
plot_ga_results(results)

In [None]:
def GaussianModifier(score, target, sigma=2) -> float:
    """
    Map ``score`` onto a Gaussian bump centred on ``target``.

    Returns exp(-0.5 * ((score - target) / sigma) ** 2) as a Python float:
    1.0 when ``score`` equals ``target``, falling towards 0 as the two
    move apart, with ``sigma`` setting the width.
    """
    deviation = (score - target) / sigma
    exponent = -0.5 * np.power(deviation, 2.0)
    return float(np.exp(exponent))

def targeted_ml_fitness(smiles_string, target=-4, sigma=2, max_mw=500):
    """
    Fitness function that rewards molecules whose rf2-predicted property
    (e.g., LogS) is close to a target value.

    The model prediction is passed through ``GaussianModifier``, so a valid
    candidate scores in (0, 1], with 1.0 meaning the prediction equals
    ``target``.

    Args:
        smiles_string (str): The SMILES string of the molecule.
        target (float): Desired predicted value (default -4).
        sigma (float): Width of the Gaussian around ``target`` (default 2).
        max_mw (float): Molecular-weight ceiling; molecules at or above it
            are rejected (default 500).

    Returns:
        float: The Gaussian-transformed prediction, or -1000.0 if the input
               is empty or not a string, is shorter than 3 or longer than
               100 characters, is not valid SMILES, has more than one
               fragment, has a molecular weight of ``max_mw`` or more,
               properties cannot be generated, or prediction fails.
    """
    # Cheap string-level checks first so obviously bad candidates never reach RDKit.
    if not isinstance(smiles_string, str) or not smiles_string:
        return -1000.0
    if len(smiles_string) < 3 or len(smiles_string) > 100:
        return -1000.0

    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        return -1000.0

    # Only single-fragment (connected) molecules are accepted.
    frags = Chem.GetMolFrags(mol, asMols=False)
    if len(frags) > 1:
        return -1000.0

    # Molecular-weight constraint: keep molecules *below* the ceiling, matching
    # the "< 500 is good" convention of the other fitness functions here.
    # (The previous `mw < 500` test rejected exactly the molecules it was
    # meant to keep.)
    mw = Descriptors.MolWt(mol)
    if mw >= max_mw:
        return -1000.0

    props = smi2props(smiles_string)  # Uses the smi2props function defined above
    if props is None:
        return -1000.0  # properties could not be generated

    try:
        # The model expects a 2D array, so wrap the single props vector in a batch of one.
        predicted_value = rf2.predict(np.array([props]))
        # Score by closeness of the (single) prediction to the target.
        return GaussianModifier(float(predicted_value[0]), target, sigma)
    except Exception as e:
        print(e)
        return -1000.0  # Fallback for any error during prediction

In [None]:
# --- GA Parameters ---
num_generations = 100
pop_size = 50
crossover_rate = 0.7
mutation_rate = 0.3 # Probability an individual is mutated
tournament_size = 5
elitism_count = 2

# Seed the population with real molecules drawn (with replacement) from the
# whole table. Sampling from the actual list, rather than from a hard-coded
# index range, means row 0 and the tail of the table are eligible and the
# cell cannot index past the end when rows were dropped during featurisation.
seed_smiles = sol_df["SMILES"].tolist()
initial_pop = random.choices(seed_smiles, k=pop_size)
print("Starting Simple Genetic Algorithm for SMILES generation...")

results = simple_genetic_algorithm(
    initial_population=initial_pop,
    fitness_function=targeted_ml_fitness,
    generations=num_generations,
    population_size=pop_size,
    crossover_rate=crossover_rate,
    mutation_rate=mutation_rate,
    tournament_size=tournament_size,
    elitism_count=elitism_count
)

print("\nGA Run Finished.")
print(f"Best SMILES found: {results['best_smiles']}")
print(f"Best Fitness score: {results['best_fitness']:.4f}")
plot_ga_results(results)