In [2]:
# read the data and turn it into a dataframe
# Imports
from pathlib import Path
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
import pandas as pd
import os

# Locate the reaction file in the current working directory
current_directory = Path.cwd()
print("Current Directory:", current_directory.resolve())

# Path of the tab-separated reaction file loaded by this notebook
example_file_path = current_directory / "1976_Sep2016_USPTOgrants_smiles.rsmi"

# Print the first line of the file as a quick check that the data is accessible
if example_file_path.exists():
    with example_file_path.open("r") as file:
        first_line = file.readline()
        print(first_line)
else:
    print("The file does not exist.")
    # Stop here with an explicit message instead of failing inside the parser below
    raise FileNotFoundError(f"Missing data file: {example_file_path}")

dataFrame = pd.read_csv(example_file_path, delimiter='\t', low_memory=False)
# Delete columns that are not the reactions or the yield
columns_to_delete = ["PatentNumber", "ParagraphNum", "Year", "TextMinedYield"]
dataFrame = dataFrame.drop(columns=columns_to_delete)
# Separate the Reactants from the Products: the reaction string is split on '>',
# field 0 becomes 'Reactant 1', field 2 becomes 'Product 1', field 1 is discarded
new_columns = dataFrame['ReactionSmiles'].str.split('>', expand=True)
new_columns = new_columns.rename(columns={0: 'Reactant 1', 2: "Product 1"})
# iloc[:, 1:] removes the original 'ReactionSmiles' column (first column of dataFrame)
dataFrame = pd.concat([new_columns, dataFrame.iloc[:, 1:]], axis=1)
dataFrame = dataFrame.drop(columns=1)
# Separate the reactants from each other (raw string: '.' must be escaped in the regex)
reactants_split = dataFrame['Reactant 1'].str.split(r"\.", expand=True)
reactants_split.columns = [f'Reactant {i+1}' for i in range(reactants_split.shape[1])]
# Replace the unsplit 'Reactant 1' column by the split columns
dataFrame = pd.concat([reactants_split, dataFrame.drop(columns='Reactant 1')], axis=1)
# Separate the Products from each other
products_split = dataFrame['Product 1'].str.split(r"\.", expand=True)
products_split.columns = [f'Product {i+1}' for i in range(products_split.shape[1])]
# Drop the unsplit 'Product 1' column by name. A positional slice (iloc[:, 1:]) would
# remove the first column instead, which at this point is the split 'Reactant 1'.
dataFrame = pd.concat([products_split, dataFrame.drop(columns='Product 1')], axis=1)
# Safety net: keep only the first occurrence of any repeated column name
dataFrame = dataFrame.loc[:, ~dataFrame.columns.duplicated(keep='first')]
dataFrame.head()


Current Directory: /Users/Travail/git/Amazing-Project
ReactionSmiles	PatentNumber	ParagraphNum	Year	TextMinedYield	CalculatedYield



Unnamed: 0,Product 1,Product 2,Product 3,Product 4,Product 5,Product 6,Product 7,Product 8,Product 9,Product 10,...,Reactant 53,Reactant 54,Reactant 55,Reactant 56,Reactant 57,Reactant 58,Reactant 59,Reactant 60,Reactant 61,CalculatedYield
0,[CH2:5]([S:7]([O:4][CH2:3][CH2:2][Br:1])(=[O:9...,,,,,,,,,,...,,,,,,,,,,
1,[CH3:6][S:7]([O:5][CH2:4][CH2:3][CH2:2][Br:1])...,,,,,,,,,,...,,,,,,,,,,
2,[CH2:10]([S:14]([O:3][CH2:2][CH2:1][Cl:4])(=[O...,,,,,,,,,,...,,,,,,,,,,
3,[CH2:5]([S:7]([O:4][CH2:3][CH2:2][Br:1])(=[O:9...,,,,,,,,,,...,,,,,,,,,,
4,[CH3:6][S:7]([O:5][CH2:4][CH2:3][CH2:2][Br:1])...,,,,,,,,,,...,,,,,,,,,,


In [3]:
"""
This Cell groups the functions used to clean the dataframe to be able to use it later
"""
# Remove atom-mapping numbers from SMILES
import re

def remove_atom_mapping(smiles):
    """
    Strip every atom-map label (a colon followed by digits) from a SMILES-like string.
    Args:
    - smiles (str): Text that may carry atom-map labels, e.g. "[CH3:1]".
    Returns:
    - str: The same text with each ":<digits>" occurrence deleted.
    """
    # A colon directly followed by one or more digits is treated as a map label
    mapping_label = re.compile(r':\d+')
    return mapping_label.sub('', smiles)

# Turn every cell into a string, then strip the atom-map labels from all columns
# except the last one.
dataFrame = dataFrame.astype(str)
columns_to_process = dataFrame.columns[:-1]
for column in columns_to_process:
    cleaned_column = dataFrame[column].map(remove_atom_mapping)
    dataFrame[column] = cleaned_column
    
#Remove the percentage symbol
def remove_percent_symbol(value):
    """
    Delete every '%' character from a string.
    Args:
    - value (str): Text that may contain '%' characters, e.g. "95.0%".
    Returns:
    - str: The text with all '%' characters removed.
    """
    # Splitting on '%' and gluing the pieces back together drops each '%'
    pieces = value.split('%')
    return ''.join(pieces)

# Strip the '%' character from every entry of the "CalculatedYield" column
dataFrame['CalculatedYield'] = dataFrame['CalculatedYield'].map(remove_percent_symbol)

In [15]:
pd.set_option('display.max_colwidth', None)


def _product_pool():
    """Return a Series with the distinct values of every "Product N" column of dataFrame."""
    # Select the product columns by name rather than by a hard-coded column count
    product_columns = [name for name in dataFrame.columns if str(name).startswith("Product ")]
    if not product_columns:
        return pd.Series(dtype=object)
    pool = pd.concat(
        [pd.Series(dataFrame[name].dropna().unique()) for name in product_columns],
        axis=0,
    )
    # Once the frame has been cast with astype(str), missing cells are the strings
    # "None"/"nan", which dropna() cannot remove, so they are filtered out explicitly.
    return pool[~pool.isin(["None", "nan", ""])]


def main():
    """
    Interactive loop that prints one randomly chosen product per round.

    The user presses Enter to draw a product or types 'exit' to quit; after each
    draw the user is asked whether to continue ('no' stops the loop).
    Reads the module-level `dataFrame`; returns nothing.
    """
    # Built on first use and reused afterwards: scanning all product columns is
    # expensive and does not change between rounds.
    product_pool = None

    while True:
        choice = input("Press Enter to get random products or type 'exit' to quit: ")
        if choice.strip().lower() == "exit":
            print("Exiting.")
            break

        if product_pool is None:
            product_pool = _product_pool()
        if product_pool.empty:
            print("No products are available in the dataframe.")
            break
        random_products = product_pool.sample(n=1)
        print(random_products)

        choice = input("Do you want to continue? (yes/no): ").strip().lower()
        if choice == "no":
            print("Exiting.")
            break


if __name__ == "__main__":
    main()

Press Enter to get random products or type 'exit' to quit:  


632019    [Cl][C]1[C]([NH][C]2[NH][N]=[C]([CH3])[CH]=2)=[N][C]([NH][C]2[CH]=[CH][C]3[CH2][CH2][N]([CH2][CH2][O][CH3])[CH2][CH2][C]=3[CH]=2)=[N][CH]=1
dtype: object


Do you want to continue? (yes/no):  no


Exiting.


In [16]:
# Explain the accepted input formats, then read the target molecule from the user
input_hint = "you can enter the name of the molecule or its SMILES, the racemic configuration is available. Only halogen molecules are taken in account in the data base"
print(input_hint)
string_input_mol = input("Enter the molecule that you want to form: ")

you can enter the name of the molecule or its SMILES, the racemic configuration is available. Only halogen molecules are taken in account in the data base


Enter the molecule that you want to form:  [Cl][C]1[C]([NH][C]2[NH][N]=[C]([CH3])[CH]=2)=[N][C]([NH][C]2[CH]=[CH][C]3[CH2][CH2][N]([CH2][CH2][O][CH3])[CH2][CH2][C]=3[CH]=2)=[N][CH]=1


In [17]:
# function to determine if the user enters the SMILES notation or the usual name of the molecule
from rdkit import Chem

def is_smiles(smiles):
    """
    Check if a string represents a valid SMILES notation.
    Args:
    - smiles (str): The string to check.
    Returns:
    - is_valid (bool): True if RDKit can parse the string as SMILES, False otherwise
      (including for empty, whitespace-only or non-string input).
    """
    # RDKit parses an empty string into an empty molecule instead of returning None,
    # so blank input has to be rejected explicitly; non-strings cannot be SMILES.
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    return Chem.MolFromSmiles(smiles) is not None

# Tell the user whether the entered text parses as SMILES
validity_message = (
    f"'{string_input_mol}' is a valid SMILES notation."
    if is_smiles(string_input_mol)
    else f"'{string_input_mol}' is not a valid SMILES notation."
)
print(validity_message)

'[Cl][C]1[C]([NH][C]2[NH][N]=[C]([CH3])[CH]=2)=[N][C]([NH][C]2[CH]=[CH][C]3[CH2][CH2][N]([CH2][CH2][O][CH3])[CH2][CH2][C]=3[CH]=2)=[N][CH]=1' is a valid SMILES notation.


In [18]:
!pip install pubchempy



In [19]:
# function to convert name into SMILES

import pubchempy as pcp

def name_to_smiles(molecule_name):
    """
    Convert a molecule name to a SMILES notation using PubChemPy's PubChem database.
    Args:
    - molecule_name (str): The name of the molecule.
    Returns:
    - smiles (str): The SMILES notation of the first matching compound, or None if
      no compound is found or the lookup fails (a message is printed in both cases).
    """
    try:
        compound = pcp.get_compounds(molecule_name, 'name')
        if compound:
            return compound[0].canonical_smiles
        else:
            print("Error: Unable to retrieve molecule information. Please try with the SMILE of the molecule")
            return None
    except Exception:
        # Only ordinary errors are handled here; a bare `except:` would also swallow
        # KeyboardInterrupt and SystemExit and make the cell impossible to interrupt.
        print("Error: Unable to retrieve molecule information. Please try again")
        return None

# Convert the user's input to SMILES when it is not already a valid SMILES string
if not is_smiles(string_input_mol):
    # Keep the typed name: string_input_mol is overwritten with the lookup result,
    # and the message below has to show the name, not the SMILES twice.
    molecule_name = string_input_mol
    string_input_mol = name_to_smiles(molecule_name)
    if string_input_mol:
        print(f"SMILES notation for {molecule_name}: {string_input_mol}")
else:
    print (string_input_mol)

[Cl][C]1[C]([NH][C]2[NH][N]=[C]([CH3])[CH]=2)=[N][C]([NH][C]2[CH]=[CH][C]3[CH2][CH2][N]([CH2][CH2][O][CH3])[CH2][CH2][C]=3[CH]=2)=[N][CH]=1


In [20]:
# Strip atom-map labels from the query. remove_atom_mapping returns a new string,
# so the result has to be assigned back to take effect.
if string_input_mol:
    string_input_mol = remove_atom_mapping(string_input_mol)


def _strip_whitespace(text):
    """Return `text` with every whitespace character removed."""
    return ''.join(text.split())


def compare_molecule_with_data(element):
    """
    Tell whether a dataframe cell holds the molecule stored in `string_input_mol`.
    Args:
    - element (str): The cell content to compare.
    Returns:
    - bool: True when both strings are equal once whitespace is ignored.
    """
    # The comparison is case-sensitive on purpose: in SMILES, lowercase letters mark
    # aromatic atoms ('c') and uppercase letters aliphatic ones ('C').
    return _strip_whitespace(element) == _strip_whitespace(string_input_mol)


# Index labels of every row in which a product column holds the molecule
found_row_number = None
rows = []
if string_input_mol:
    # Normalise the query once instead of once per compared cell
    target_smiles = _strip_whitespace(string_input_mol)
    # Search the product columns, selected by name rather than by a fixed column count
    product_columns = [name for name in dataFrame.columns if str(name).startswith("Product ")]
    for column_name in product_columns:
        normalized = dataFrame[column_name].astype(str).str.replace(r"\s+", "", regex=True)
        matches = (normalized == target_smiles).to_numpy()
        rows.extend(dataFrame.index[matches].tolist())
    if rows:
        # Last matching row, kept under the name the original loop assigned
        found_row_number = rows[-1]

# Report the matching rows, or that nothing was found
if found_row_number is not None:
    print(rows)
else:
    print("The product is not in the database")


[1173160, 1376724]


In [22]:
"""
This code prints the yield and shows the image of the reaction corresponding
"""
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem

# Convert the yield column to numbers; entries that cannot be parsed become NaN
dataFrame['CalculatedYield'] = pd.to_numeric(dataFrame['CalculatedYield'], errors='coerce')

if not rows:
    # No matching row was collected, so there is no reaction to rank or draw
    print("The product is not in the database")
else:
    # `rows` holds index labels, so the subset is selected by label (.loc)
    subset_df = dataFrame.loc[rows]
    if subset_df['CalculatedYield'].isnull().all():
        # If all values are NaN, randomly select a row as the maximum yield row
        max_yield_row_index = np.random.choice(subset_df.index)
    else:
        # Find the index of the row with the highest yield (NaN entries are skipped)
        max_yield_row_index = subset_df['CalculatedYield'].idxmax()
    # Retrieve the row with the highest yield
    max_yield_row = dataFrame.loc[max_yield_row_index]
    print("Row with the highest yield:")
    print(max_yield_row)

    # Create a Dataframe where the reaction is in one column
    dataFrameImage = pd.read_csv("1976_Sep2016_USPTOgrants_smiles.rsmi", delimiter='\t', low_memory=False)
    columns_to_delete = ["PatentNumber", "ParagraphNum", "Year", "TextMinedYield"]
    dataFrameImage = dataFrameImage.drop(columns=columns_to_delete)
    dataFrameImage["ReactionSmiles"] = dataFrameImage["ReactionSmiles"].apply(remove_atom_mapping)
    # Pick the reaction string of the selected row, addressing the column by name
    reaction_smiles = dataFrameImage.loc[max_yield_row_index, "ReactionSmiles"]
    # Parse the reaction string
    reaction = AllChem.ReactionFromSmarts(reaction_smiles)
    # Draw the reaction
    reaction_image = Draw.ReactionToImage(reaction)

    # Display the image
    reaction_image.show()

Row with the highest yield:
Product 1          [Cl][C]1[C]([NH][C]2[NH][N]=[C]([CH3])[CH]=2)=[N][C]([NH][C]2[CH]=[CH][C]3[CH2][CH2][N]([CH2][CH2][O][CH3])[CH2][CH2][C]=3[CH]=2)=[N][CH]=1
Product 2                                                                                                                                                 None
Product 3                                                                                                                                                 None
Product 4                                                                                                                                                 None
Product 5                                                                                                                                                 None
                                                                                      ...                                                                     
Reactant 58       