# Digital chemical reactions - Seminar Notebook

In [None]:
# Imports
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import pandas as pd
from rdkit.Chem import AllChem
from rdkit.Chem import SDWriter, SDMolSupplier
from rdkit.Chem import rdChemReactions

### 0. SMILES

#### 0.0. Introduction to SMILES

**SMILES (Simplified Molecular Input Line Entry System)** is a specification that encodes molecular structures into a line of text using short ASCII strings. These strings provide a compact and human-readable way to represent molecules, and are widely used in cheminformatics for storing, sharing, and processing chemical data.

**Why are SMILES important?**
- They provide a standard format to represent molecules digitally.
- They can be easily parsed by computers and are supported by many cheminformatics libraries (e.g., RDKit).
- They enable descriptor generation and molecular modeling, which you'll explore in depth in the next session.

**What is RDKit?**

RDKit is an open-source cheminformatics toolkit that allows the manipulation of chemical information, including SMILES parsing, molecular descriptor calculation, substructure searching, and chemical reaction modeling.

**What is a Mol object?**

In RDKit, a Mol object is a Python representation of a molecule. It is typically created from a SMILES string or a structure file (like `.sdf`), and can be used for visualization, property calculations, and transformations.

**Basic SMILES Rules**:
- Atoms are represented by atomic symbols (e.g., C for carbon, O for oxygen).
- Single bonds are implied and usually omitted; double (=) and triple (#) bonds are written explicitly. Aromatic bonds (:) are normally implied by lowercase aromatic atoms (e.g., `c1ccccc1`).
- Branches are represented using parentheses.
- Rings are encoded by numbers that indicate where the ring opens and closes.

**Useful references**:
- [Daylight SMILES Tutorial](http://www.daylight.com/dayhtml/doc/theory/theory.smiles.html)
- [RDKit SMILES Documentation](https://www.rdkit.org/docs/source/rdkit.Chem.rdmolfiles.html#rdkit.Chem.rdmolfiles.MolFromSmiles)
- [Wikipedia - SMILES](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system)

#### 0.1. Some basic SMILES examples

In [None]:
# Build the list of SMILES strings (smiles_list) that the following cells convert to Mol objects.
# Each SMILES is stored next to its common name so the table is easy to read.
named_smiles = (
    ("ethanol", "CCO"),
    ("benzene", "c1ccccc1"),
    ("acetic acid", "CC(=O)O"),
    ("phenol", "C1=CC=CC=C1O"),
)
smiles_list = [smiles for _name, smiles in named_smiles]

print(smiles_list)

In [None]:
# Parse every SMILES string into an RDKit Mol object; the result is a plain Python list
mols = list(map(Chem.MolFromSmiles, smiles_list))

print(mols)
print(type(mols))

In [None]:
# Draw all molecules on one grid, labelling each with its SMILES string
grid_options = {"molsPerRow": 4, "subImgSize": (200, 200), "legends": smiles_list}
Draw.MolsToGridImage(mols, **grid_options)

#### 0.2. SMILES from CSV file

In [None]:
# Loading SMILES from a CSV file
"""
In many real-world scenarios, chemical data is stored in external files like CSVs. Here, we'll load a file with molecule names and their
corresponding SMILES strings, and visualize them using RDKit.

What is a CSV file?
A CSV (Comma-Separated Values) file is a simple, plain-text file used to store tabular data, where each line represents a row and columns
are separated by commas. It's commonly used for exchanging structured data between programs like Excel, Python (via pandas), and databases.
"""

# Build a small example table and write it to disk so the next cell has a file to read
# (in practice this file would already exist).
data = dict(
    code_name=["ethanol", "benzene", "acetic_acid"],
    SMILES=["CCO", "c1ccccc1", "CC(=O)O"],
)
df = pd.DataFrame(data)
df.to_csv("molecules.csv", index=False)  # index=False: do not write the row numbers

print(df)
print(type(df))

In [None]:
# Read the table back from disk
df_loaded = pd.read_csv("molecules.csv")
print(df_loaded)

# Parse each SMILES into a Mol object and draw the molecules labelled with their names
mols_csv = list(map(Chem.MolFromSmiles, df_loaded["SMILES"]))
name_labels = df_loaded["code_name"].tolist()
Draw.MolsToGridImage(mols_csv, molsPerRow=3, subImgSize=(200, 200), legends=name_labels)

#### 0.3. SMILES from SDF file

In [None]:
"""
Reading molecules from an SDF file

SDF (Structure Data File) is a widely used file format for storing multiple molecular structures along with metadata. 
RDKit can parse these files and convert them into Mol objects.

Below is a demonstration of how to load molecules from an SDF file. You can replace 'nitriles.sdf' with any real dataset.
"""

# Load from SDF (have to be in the same folder than this .ipynb file)
sdf_file = SDMolSupplier("nitriles.sdf")

mols = []
for mol in sdf_file:
    if mol is not None: # Filter out None values (invalid mols)
        mols.append(mol)

Draw.MolsToGridImage(mols[:5], molsPerRow=6, subImgSize=(200, 200))

In [None]:
# Build the list of mols and drop the None entries (invalid mols) in a single expression
valid_mols = list(filter(lambda entry: entry is not None, sdf_file))

# Display all valid molecules on one grid
Draw.MolsToGridImage(valid_mols, molsPerRow=7, subImgSize=(200, 200))

#### 0.4. It's your turn!

In [None]:
"""
Try the following tasks based on what you've learned so far:

1. Convert the following SMILES string into a Mol and visualize it: `CCN(CC)CC` (triethylamine)
2. Load the CSV file again and visualize the molecules.
3. Load your own SDF file and display the first few molecules.

Use the cells below to practice:
"""

In [None]:
# Task 1: Convert this SMILES (triethylamine) to a Mol object and visualize it
practice_smiles = "CCN(CC)CC"
# Your code here


In [None]:
# Task 2: From the CSV file 'oxygenated_compounds.csv', convert the SMILES to Mol objects and visualize them
csv_file = 'oxygenated_compounds.csv'
# Your code here


In [None]:
# Task 3: From the SDF file 'alkynes.sdf', load the molecules as Mol objects and visualize them
sdf_file = 'alkynes.sdf'
# Your code here


### 1. SMARTS

#### 1.0. Introduction to SMARTS

**SMARTS (SMILES Arbitrary Target Specification)** is a powerful language used to define substructures in molecules. While SMILES describes entire molecules, SMARTS allows you to describe **patterns or fragments** to match within molecules, making it ideal for filtering, searching, and performing **digital reactions**.

**Why use SMARTS?**
- Substructure matching in large datasets
- Defining reactants or transformation rules in digital reactions
- Functional group identification

**10 Useful Basic SMARTS Rules**:
1. `[#6]` — Carbon atom
2. `[O]` — Oxygen atom
3. `[#7]` — Nitrogen atom
4. `[C]=[O]` — Carbon double bonded to Oxygen (e.g., carbonyl group)
5. `[OH]` — Hydroxyl group
6. `[CH3]` — Methyl group
7. `[nH]` — Aromatic nitrogen with hydrogen (e.g., in indole)
8. `[R]` — Ring atom
9. `[!#6]` — Any atom except carbon
10. `*` — Any atom (wildcard)

**SMARTS Resources**:
- [Daylight SMARTS Tutorial](http://www.daylight.com/dayhtml/doc/theory/theory.smarts.html)
- [RDKit SMARTS Examples](https://www.rdkit.org/docs/Cookbook.html#substructure-matching)
- [SMARTS Tutorial](https://www.daylight.com/dayhtml_tutorials/languages/smarts/)
- [SMARTS Examples](https://www.daylight.com/dayhtml_tutorials/languages/smarts/smarts_examples.html)

#### 1.1. Basic examples of SMARTS

In [None]:
# Define a molecule as SMILES and convert it into a molecule object (Mol)
molecule = "CCN(CC)CC"
mol = Chem.MolFromSmiles(molecule)

# Question: does the molecule contain N?
# The query is written as a SMARTS string
pattern = "N"

# Convert the SMARTS string into a query Mol
mol_pattern = Chem.MolFromSmarts(pattern)
print(mol_pattern)
print(type(mol_pattern))

# HasSubstructMatch answers the question with a boolean
has_nitrogen = mol.HasSubstructMatch(mol_pattern)
print(has_nitrogen)
print(type(has_nitrogen))

In [None]:
# Two parallel lists: the SMILES strings and the matching compound names
smiles_list = ["CC(=O)N", "CC#N", "CCN"]
name_list = ["acetamide", "acetonitrile", "ethylamine"]

# Gather both lists in a dictionary: each key becomes a column name
dict_smiles = dict(SMILES=smiles_list, code_name=name_list)
print(dict_smiles)

# Turn the dictionary into a DataFrame (one column per key)
new_df = pd.DataFrame.from_dict(dict_smiles)
print(new_df)

In [None]:
# Which molecules contain nitrogen?

# SMARTS query for a nitrogen atom
pattern1 = "N"
mol_pattern1 = Chem.MolFromSmarts(pattern1)

# Keep the SMILES of every molecule of new_df that matches pattern1
smi_match1 = [
    smi
    for smi in new_df['SMILES']
    if Chem.MolFromSmiles(smi).HasSubstructMatch(mol_pattern1)
]

print(smi_match1)

# How many molecules pass the filter?
n_hits = len(smi_match1)
print(f'{n_hits} molecules pass the filter')


In [None]:
# Which molecules contain amino groups? (first attempt)
# [NX3]: nitrogen atom with 3 total bonds (implicit H's are counted)
pattern2 = "[NX3]"
mol_pattern2 = Chem.MolFromSmarts(pattern2)

# Work with a list of Mol objects instead of raw SMILES strings
mols_molecules = list(map(Chem.MolFromSmiles, new_df['SMILES']))

# Collect the SMILES of the molecules of new_df that match pattern2
smi_match2 = [
    Chem.MolToSmiles(candidate)
    for candidate in mols_molecules
    if candidate.HasSubstructMatch(mol_pattern2)
]

print(smi_match2)

# How many molecules pass the filter?
n_hits = len(smi_match2)
print(f'{n_hits} molecules pass the filter???')

In [None]:
# Solving the problem
# Which molecules contain amino groups?
# [NX3;!$(NC=O)]: nitrogen with 3 total bonds that is NOT attached to a C=O carbon
pattern2 = "[NX3;!$(NC=O)]"
mol_pattern2 = Chem.MolFromSmarts(pattern2)

# Work with a list of Mol objects instead of raw SMILES strings
mols_molecules = list(map(Chem.MolFromSmiles, new_df['SMILES']))

# Collect the SMILES of the molecules of new_df that match pattern2
smi_match2 = [
    Chem.MolToSmiles(candidate)
    for candidate in mols_molecules
    if candidate.HasSubstructMatch(mol_pattern2)
]

print(smi_match2)

# How many molecules pass the filter?
n_hits = len(smi_match2)
print(f'{n_hits} molecules pass the filter')

In [None]:
# Which molecules contain cyano groups?

# N#C: nitrogen triple-bonded to carbon
pattern3 = "N#C"
mol_pattern3 = Chem.MolFromSmarts(pattern3)

# Collect the SMILES of the molecules (mols_molecules list) that match pattern3
smi_match3 = [
    Chem.MolToSmiles(candidate)
    for candidate in mols_molecules
    if candidate.HasSubstructMatch(mol_pattern3)
]

print(smi_match3)

# How many molecules pass the filter?
n_hits = len(smi_match3)
print(f'{n_hits} molecules pass the filter')


In [None]:
# A few SMARTS queries, keyed by a readable name, tested against three example molecules
smarts_by_name = {
    "alcohol": "[CX4][OH]",
    "aromatic ring": "c1ccccc1",
    "carbonyl": "[CX3]=[OX1]",
}
patterns = {label: Chem.MolFromSmarts(smarts) for label, smarts in smarts_by_name.items()}

example_smiles = ["CCO", "c1ccccc1", "CC(=O)O"]
example_mols = [Chem.MolFromSmiles(smi) for smi in example_smiles]

# Report, pattern by pattern, which example molecules contain the substructure
for name, pattern in patterns.items():
    print(f"\nSMARTS pattern: {name}")
    for mol in example_mols:
        is_match = mol.HasSubstructMatch(pattern)
        print(f"Matches: {is_match} for {Chem.MolToSmiles(mol)}")

Draw.MolsToGridImage(example_mols, molsPerRow=3, subImgSize=(200,200))

#### 1.2. Applying SMARTS - Filtering an SDF file

In [None]:
"""
### SMARTS Filtering in a Molecule Set
We'll simulate loading an SDF file with 20 small organic molecules, and apply a SMARTS pattern to filter those containing a carbonyl group.
"""

# Read the SDF file sample_mols.sdf
sdf_supplier = SDMolSupplier("sample_mols.sdf")
# "[NX3;!$(NC=O)]" 
carbonyl_pattern = Chem.MolFromSmarts('[#6][CX3](=O)[#6]')

matched, unmatched = [], []
for mol in sdf_supplier:
    if mol is not None:
        (matched if mol.HasSubstructMatch(carbonyl_pattern) else unmatched).append(mol)

print(f"Matched: {len(matched)} | Unmatched: {len(unmatched)}")
Draw.MolsToGridImage(matched, molsPerRow=6, subImgSize=(200,200))


#### 1.3. It's your turn!

In [None]:
# Task 4: Does this molecule contain any oxygen atoms? Solve it by applying SMARTS, obtaining a boolean
molecule = "CCN(CC)CC"
# Your code here


In [None]:
# Task 5: Filter molecules containing carboxylic acid groups using SMARTS and visualize them
"""
SMARTS Filtering from CSV

We've prepared a CSV with some oxygenated compounds. Your task is to:

- Load the CSV into a DataFrame
- Convert the SMILES to Mol objects
- Define a SMARTS pattern for carboxylic acid groups
- Filter and visualize only the molecules containing carboxylic acid groups

Hint: Beware of aldehydes and esters.
"""
csv_file = 'oxygenated_compounds.csv'
# Your code here


### 2. DIGITAL CHEMICAL REACTIONS

#### 2.0. Introduction to Digital Chemical Reactions

Digital chemical reactions simulate real chemical transformations using SMARTS-based reaction templates. These templates define the transformation logic (reactant patterns → product structure) and can be applied programmatically using RDKit's `rdChemReactions` module.

**Common uses**:
- Automating reaction design
- Virtual synthesis of new molecules
- Filtering or modifying compound libraries

**Basic Imports for Digital Reactions**:
```python
from rdkit.Chem import rdChemReactions
```

**Structure**:
```python
rxn = rdChemReactions.ReactionFromSmarts("[C:1]=[O:2].[N:3]>>[C:1]([N:3])[O:2]")
products = rxn.RunReactants((mol1, mol2))
```

**Common pitfalls**:
- Reactants must match the SMARTS pattern exactly.
- Atom mapping (`:n` labels inside the atom brackets, e.g. `[C:1]`) is essential to track atoms during transformations.
- Input molecule order matters for multi-reactant reactions.

**Resources**:
- [RDKit Reaction Guide](https://www.rdkit.org/docs/Cookbook.html#using-chemical-reactions)
- [Reaction SMARTS - Daylight](http://www.daylight.com/dayhtml/doc/theory/theory.reactions.html)
- [SMARTS display](https://smarts.plus/)

#### 2.1. Basic examples of digital reactions

In [None]:
# Define a simple esterification reaction: carboxylic acid + alcohol -> ester
# The second product template ([O:2]) is the oxygen displaced from the acid.
rxn = rdChemReactions.ReactionFromSmarts("[C:1](=O)[O:2].[O:3][C:4]>>[C:1](=O)[O:3][C:4].[O:2]")

acid = Chem.MolFromSmiles("CC(=O)O")
alcohol = Chem.MolFromSmiles("CO")

# RunReactants expects a tuple with the reactants in the same order as in the SMARTS.
# It returns one tuple of product Mols per way the templates match the reactants.
products = rxn.RunReactants((acid, alcohol))

# Keep the first product (the ester) of every result.
# Molecules produced by a reaction are not sanitized, so sanitize them
# (valences, aromaticity, ...) before drawing.
esters = [product_set[0] for product_set in products]
for ester in esters:
    Chem.SanitizeMol(ester)

Draw.MolsToGridImage(esters)

#### 2.2. Digital Reactions from a CSV (Automated Reaction Workflow)

In [None]:
# Load CSV with molecules
df = pd.read_csv("oxygenated_compounds.csv")
# Add a column holding one Mol object per row, parsed from the SMILES column
df["mol_object"] = df["SMILES"].map(Chem.MolFromSmiles)
Draw.MolsToGridImage(df["mol_object"], molsPerRow=5, subImgSize=(200, 200))

In [None]:
# Define a simple ketone → enol conversion.
# Atom-map numbers start at 1: map number 0 conventionally means "not mapped",
# so it must not be used to follow an atom from the reactant to the product.
rxn = rdChemReactions.ReactionFromSmarts("[C:1][C:2](=O)[C:3]>>[C:1][C:2](O)(=[C:3])")

product_names, product_smiles = [], []

for i, mol in enumerate(df["mol_object"]):  # i is just the enumeration (0, 1, 2, 3...)
    if mol is None:  # skip rows whose SMILES could not be parsed
        continue
    ps = rxn.RunReactants((mol,))    # the double parentheses are because it expects a tuple
    if ps:   # check whether the reaction gives any product
        product = ps[0][0]    # ps is a sequence of results, each being a tuple of products
        product_names.append(f"product_{i+1}")    # name the product after the row it comes from
        product_smiles.append(Chem.MolToSmiles(product))    # store the product as SMILES

# Collect the results in a DataFrame
product_df = pd.DataFrame({"product_name": product_names, "product_smiles": product_smiles})
# Save it as a CSV file titled "digital_products.csv"
product_df.to_csv("digital_products.csv", index=False)
print(product_df.head())

# Re-parse the product SMILES to get Mol objects for drawing
product_df['mol_object'] = product_df["product_smiles"].apply(Chem.MolFromSmiles)

Draw.MolsToGridImage(product_df['mol_object'], molsPerRow=5, subImgSize=(200, 200))

In [None]:
# Define a simple ketone → enol conversion.
# Here the first carbon is written as [#6] (any carbon, aliphatic or aromatic)
# instead of [C] (aliphatic carbon only), so aryl ketones match as well.
# Atom-map numbers start at 1: map number 0 conventionally means "not mapped",
# so it must not be used to follow an atom from the reactant to the product.
rxn = rdChemReactions.ReactionFromSmarts("[#6:1][C:2](=O)[C:3]>>[#6:1][C:2](O)(=[C:3])")

product_names, product_smiles = [], []

for i, mol in enumerate(df["mol_object"]):  # i is just the enumeration (0, 1, 2, 3...)
    if mol is None:  # skip rows whose SMILES could not be parsed
        continue
    ps = rxn.RunReactants((mol,))    # the double parentheses are because it expects a tuple
    if ps:   # check whether the reaction gives any product
        product = ps[0][0]    # ps is a sequence of results, each being a tuple of products
        product_names.append(f"product_{i+1}")    # name the product after the row it comes from
        product_smiles.append(Chem.MolToSmiles(product))    # store the product as SMILES

# Collect the results in a DataFrame
product_df = pd.DataFrame({"product_name": product_names, "product_smiles": product_smiles})
# Save it as a CSV file titled "digital_products.csv"
product_df.to_csv("digital_products.csv", index=False)
print(product_df.head())

# Re-parse the product SMILES to get Mol objects for drawing
product_df['mol_object'] = product_df["product_smiles"].apply(Chem.MolFromSmiles)

Draw.MolsToGridImage(product_df['mol_object'], molsPerRow=5, subImgSize=(200, 200))

#### 2.3. Digital Reactions from an SDF File

In [None]:
sdf_supplier = SDMolSupplier("sample_mols.sdf")

# Reaction: Replace ketone (C=O) with alkene (C=C)
# The carbonyl oxygen is dropped and a double bond is drawn from the carbonyl
# carbon (map 2) to one of its carbon neighbours (map 3).
rxn = rdChemReactions.ReactionFromSmarts("[#6:1][C:2](=O)[#6:3]>>[#6:1][C:2]=[C:3]")

alkenes = []
n_rejected = 0
for mol in sdf_supplier:   # Iterate through each molecule in the .sdf file.
    if mol is None:        # Ignore any that are incorrectly formatted or empty.
        continue
    ps = rxn.RunReactants((mol,))   # Apply the digital reaction to each molecule.
    if not ps:                      # No match: nothing to keep for this molecule.
        continue
    product = ps[0][0]              # Keep the first generated product.
    try:
        # Reaction products are not sanitized; validate them before drawing so that
        # a chemically invalid product does not break the whole grid.
        Chem.SanitizeMol(product)
    except ValueError:              # RDKit sanitization errors derive from ValueError
        n_rejected += 1
        continue
    alkenes.append(product)

if n_rejected:
    print(f"{n_rejected} product(s) discarded because they failed sanitization")

Draw.MolsToGridImage(alkenes, molsPerRow=4)

#### 2.4. Multicomponent Digital Reactions

In [None]:
"""
Using the `alkynes.sdf` and `nitriles.sdf` files, your task is:

- Load the alkynes and nitriles.
- Use a reaction rule to enerate pyridine-like structures from 2 alkynes (the same) and 1 nitrile.
- Depending on the orientation in which the cycloadditions occur, up to four structural isomers of pyridines can be generated. 
- Explore all combinations of the 6 alkynes with the 6 nitriles, including all their conformers.

Notes:
- The alkynes.sdf file contains only terminal alkynes.
- You can see the molecules present in each sdf file by opening it with ChemDraw.

Start by exploring the structures in the SDF files and plan your SMARTS accordingly.
"""

In [None]:
# 1. Load SDF files
alkyne_supplier = SDMolSupplier("alkynes_reducido.sdf")
nitrile_supplier = SDMolSupplier("nitriles_reducido.sdf")

# 2. Extract valid molecules and names
# Each list holds tuples: (molecule with explicit hydrogens, value of its "_Name" property)
alkynes = []
for entry in alkyne_supplier:
    if entry is not None:
        alkynes.append((Chem.AddHs(entry), entry.GetProp("_Name")))

nitriles = []
for entry in nitrile_supplier:
    if entry is not None:
        nitriles.append((Chem.AddHs(entry), entry.GetProp("_Name")))

# Inspect the structure of the data: a list of tuples (Mol, name)
print(type(alkynes))
first_alkyne = alkynes[0]
print(type(first_alkyne))

print(first_alkyne[0])
print(first_alkyne[1])
# Your code here
# Tip: Use rdChemReactions.ReactionFromSmarts(...) and RunReactants as shown above

In [None]:
# 3. Define the pyridine-forming reaction
# Reaction: 2 alkynes + 1 nitrile → pyridine (multiple regioisomers possible)
# All four reactions share the same reactant templates and differ only in the order
# of the alkyne carbons around the product ring, i.e. they give the four structural
# isomers. They are labelled conf_1..conf_4 (the label ends up in the "conformer" column).
# The templates match explicit [H] atoms, so the reactants need explicit hydrogens.
reactant_template = "[H:6][C:1]#[C:4].[H:7][C:2]#[C:5].[#6:8][C]#[N:3]"
product_templates = {
    "conf_1": "c1([#6:8])[c:1]([H:6])[c:4][c:2]([H:7])[c:5][n:3]1",
    "conf_2": "c1([#6:8])[c:4][c:1]([H:6])[c:2]([H:7])[c:5][n:3]1",
    "conf_3": "c1([#6:8])[c:1]([H:6])[c:4][c:5][c:2]([H:7])[n:3]1",
    "conf_4": "c1([#6:8])[c:4][c:1]([H:6])[c:5][c:2]([H:7])[n:3]1",
}
reaction_smarts_list = [
    (label, rdChemReactions.ReactionFromSmarts(f"{reactant_template}>>{product_template}"))
    for label, product_template in product_templates.items()
]

# 4. Apply the reaction to all combinations
# Storage (parallel lists, one entry per generated product)
product_names = []
product_smiles = []
used_alkyne_names = []
used_nitrile_names = []
conformer_ids = []

product_count = 1
failed_count = 0   # reactions/products skipped because RDKit rejected them

for alkyne_mol, alkyne_name in alkynes:
    for nitrile_mol, nitrile_name in nitriles:
        for conf_label, rxn in reaction_smarts_list:    # triple for-loop
            try:
                # The same alkyne is used for both alkyne templates
                products = rxn.RunReactants((alkyne_mol, alkyne_mol, nitrile_mol))
            except (ValueError, RuntimeError):
                failed_count += 1
                continue  # Skip any failed reaction
            for prod in products:
                product_mol = prod[0]   # takes the only element, which is the first of the tuple
                if product_mol is None:
                    continue
                try:
                    # SanitizeMol cleans and validates the generated molecule (valences,
                    # aromaticity, ...) and raises if it is not chemically valid.
                    Chem.SanitizeMol(product_mol)
                except ValueError:
                    # Only this product is discarded; the remaining products of the
                    # same reaction are still processed.
                    failed_count += 1
                    continue
                product_names.append(f"py_{product_count}")
                product_smiles.append(Chem.MolToSmiles(product_mol))
                used_alkyne_names.append(alkyne_name)
                used_nitrile_names.append(nitrile_name)
                conformer_ids.append(conf_label)
                product_count += 1

if failed_count:
    print(f"{failed_count} reaction(s)/product(s) skipped because of RDKit errors")

# 5. Save results to DataFrame and CSV
df_pyridines = pd.DataFrame({
    "product_name": product_names,
    "product_smiles": product_smiles,
    "alkyne_name": used_alkyne_names,
    "nitrile_name": used_nitrile_names,
    "conformer": conformer_ids
})

df_pyridines.to_csv("pyridine_products.csv", index=False)
df_pyridines.head()

In [None]:
# Draw the first eight products: re-parse their SMILES into Mol objects first
preview_smiles = df_pyridines["product_smiles"][:8]
preview_mols = preview_smiles.apply(Chem.MolFromSmiles)
Draw.MolsToGridImage(preview_mols, molsPerRow=4, subImgSize=(200, 200))

In [None]:
# Task 6: Now we are going to generate only 1 structural isomer from the reaction, the pyridine isomer that has all the substituents in meta.
# Your code here

In [None]:
#[Chem.AddHs(Chem.MolFromSmiles(smiles))for smiles in df_enamine['SMILES']

### Solutions to the tasks

In [None]:
# Task 1: Convert this SMILES to a Mol and visualize
practice_smiles = "CCN(CC)CC"
# Your code here
mol = Chem.MolFromSmiles(practice_smiles)
# MolToImage draws a single molecule: its keywords are `size` and `legend`.
# (molsPerRow, subImgSize and legends belong to MolsToGridImage.)
Draw.MolToImage(mol, size=(200, 200), legend='triethylamine')

In [None]:
# Task 2: From the CSV file 'oxygenated_compounds.csv', convert the SMILES to Mol objects and visualize them
csv_file = 'oxygenated_compounds.csv'
# Your code here
df_loaded = pd.read_csv(csv_file)
print(df_loaded)

# One Mol object per SMILES string
mols_csv = list(map(Chem.MolFromSmiles, df_loaded["SMILES"]))
# NOTE(review): the example CSV earlier in this notebook uses a "code_name" column;
# confirm that this file really has a "name" column.
compound_names = df_loaded["name"].tolist()
Draw.MolsToGridImage(mols_csv, molsPerRow=5, subImgSize=(200, 200), legends=compound_names)

In [None]:
# Task 3: From the SDF file 'alkynes.sdf', load the molecules as Mol objects and visualize them
# Your code here
# The file must be in the same folder as this .ipynb file
sdf_path = 'alkynes.sdf'
sdf_file = SDMolSupplier(sdf_path)
# Drop the entries that could not be read (None)
valid_mols = list(filter(lambda entry: entry is not None, sdf_file))
Draw.MolsToGridImage(valid_mols, molsPerRow=7, subImgSize=(200, 200))

In [None]:
# Task 4: Does this molecule contain any oxygen atoms? Solve it by applying SMARTS, obtaining a boolean
molecule = "CCN(CC)CC"
# Your code here

# Build the Mol from the SMILES of THIS task instead of relying on a `mol`
# variable left over from whichever cell was run before.
mol = Chem.MolFromSmiles(molecule)

# [#8] matches any oxygen atom, aliphatic or aromatic ("O" alone matches only aliphatic oxygen)
pattern = "[#8]"
mol_pattern = Chem.MolFromSmarts(pattern)

# HasSubstructMatch returns the boolean the task asks for
contains_oxygen = mol.HasSubstructMatch(mol_pattern)
print(contains_oxygen)

word = 'contains' if contains_oxygen else "doesn't contain"
print(f'The molecule {molecule} {word} oxygen atoms')

In [None]:
# Task 5: Filter molecules containing carboxylic acid groups using SMARTS and visualize them
"""
SMARTS Filtering from CSV

We've prepared a CSV with some oxygenated compounds. Your task is to:

- Load the CSV into a DataFrame
- Convert the SMILES to Mol objects
- Define a SMARTS pattern for carboxylic acid groups
- Filter and visualize only the molecules containing carboxylic acid groups

Hint: Beware of aldehydes and esters.
"""
csv_file = 'oxygenated_compounds.csv'
# Your code here

df_task = pd.read_csv(csv_file)
mols = list(map(Chem.MolFromSmiles, df_task['SMILES']))

# C(=O) carbon bearing an O-H: the [OH] part rules out aldehydes (no O on the carbon)
# and esters (their O carries no hydrogen)
pattern_acid = '[CX3](=O)[OH]'
mol_pattern = Chem.MolFromSmarts(pattern_acid)

# Sort every valid molecule into one of two lists
matched = []
unmatched = []
for candidate in mols:
    if candidate is None:  # SMILES that could not be parsed
        continue
    if candidate.HasSubstructMatch(mol_pattern):
        matched.append(candidate)
    else:
        unmatched.append(candidate)

print(f"Matched: {len(matched)} | Unmatched: {len(unmatched)}")
Draw.MolsToGridImage(matched, molsPerRow=7, subImgSize=(200,200))

In [None]:
# Task 6: Now we are going to generate only 1 structural isomer from the reaction, the pyridine isomer that has all the substituents in meta.
# Your code here
# 1. Load SDF files
alkyne_supplier = SDMolSupplier("alkynes_reducido.sdf")
nitrile_supplier = SDMolSupplier("nitriles_reducido.sdf")

# 2. Extract valid molecules and names: lists of tuples (Mol with explicit H's, name)
alkynes = [(Chem.AddHs(mol), mol.GetProp("_Name")) for mol in alkyne_supplier if mol is not None]
nitriles = [(Chem.AddHs(mol), mol.GetProp("_Name")) for mol in nitrile_supplier if mol is not None]

# 3. Define the pyridine-forming reaction
# Reaction: 2 alkynes + 1 nitrile → pyridine
# Only the template labelled conf_1 is kept. Going round its product ring from the
# nitrogen, the substituted carbons alternate with the H-bearing ones
# (N, C-R, C-H, C-R, C-H, C-R), so all three substituents are meta to each other.
# The templates match explicit [H] atoms, hence the Chem.AddHs calls above.
reaction_smarts_list = [
    ("conf_1", rdChemReactions.ReactionFromSmarts("[H:6][C:1]#[C:4].[H:7][C:2]#[C:5].[#6:8][C]#[N:3]>>c1([#6:8])[c:1]([H:6])[c:4][c:2]([H:7])[c:5][n:3]1")),
]

# 4. Apply the reaction to all combinations
# Storage (parallel lists, one entry per generated product)
product_names = []
product_smiles = []
used_alkyne_names = []
used_nitrile_names = []
conformer_ids = []

product_count = 1
failed_count = 0   # reactions/products skipped because RDKit rejected them

for alkyne_mol, alkyne_name in alkynes:
    for nitrile_mol, nitrile_name in nitriles:
        for conf_label, rxn in reaction_smarts_list:    # triple for-loop
            try:
                # The same alkyne is used for both alkyne templates
                products = rxn.RunReactants((alkyne_mol, alkyne_mol, nitrile_mol))
            except (ValueError, RuntimeError):
                failed_count += 1
                continue  # Skip any failed reaction
            for prod in products:
                product_mol = prod[0]   # takes the only element, which is the first of the tuple
                if product_mol is None:
                    continue
                try:
                    # SanitizeMol cleans and validates the generated molecule (valences,
                    # aromaticity, ...) and raises if it is not chemically valid.
                    Chem.SanitizeMol(product_mol)
                except ValueError:
                    # Only this product is discarded; the remaining products of the
                    # same reaction are still processed.
                    failed_count += 1
                    continue
                product_names.append(f"py_{product_count}")
                product_smiles.append(Chem.MolToSmiles(product_mol))
                used_alkyne_names.append(alkyne_name)
                used_nitrile_names.append(nitrile_name)
                conformer_ids.append(conf_label)
                product_count += 1

if failed_count:
    print(f"{failed_count} reaction(s)/product(s) skipped because of RDKit errors")

# 5. Save results to DataFrame and CSV
df_pyridines = pd.DataFrame({
    "product_name": product_names,
    "product_smiles": product_smiles,
    "alkyne_name": used_alkyne_names,
    "nitrile_name": used_nitrile_names,
    "conformer": conformer_ids
})

df_pyridines.to_csv("pyridine_products_conf.csv", index=False)
# Only the last expression of a cell is displayed, so print the preview explicitly
print(df_pyridines.head())

Draw.MolsToGridImage(df_pyridines["product_smiles"][:8].apply(Chem.MolFromSmiles), molsPerRow=4, subImgSize=(200, 200))