In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
from mordred import Calculator, descriptors
import os

In [3]:
# Read the raw input table from disk.
# Later cells in this notebook read its SMILES_1, Solvent, Charge_gn, UHF_gn,
# Charge_rd and UHF_rd columns.
input_file = 'data/OROP.csv'
input_data = pd.read_csv(filepath_or_buffer=input_file)


In [4]:
# Path of the solvent lookup table (one row per solvent, keyed by the 'Solvent' column)
solvent_properties_file = 'data/solvent_properties.csv'

# First run only: write a small built-in table so the notebook can proceed
# even when no solvent file has been supplied yet.
if not os.path.exists(solvent_properties_file):
    default_solvent_properties = [
        dict(zip(('Solvent', 'DielectricConstant', 'PolarityIndex', 'Density'), values))
        for values in (
            ('acetonitrile', 36.6, 5.8, 0.786),
            ('water', 78.5, 9.0, 1.0),
            ('methanol', 32.7, 5.1, 0.792),
            ('ethanol', 24.3, 4.3, 0.789),
            ('dimethylformamide', 36.7, 6.8, 0.944),
            ('dimethyl sulfoxide', 47.2, 7.2, 1.1),
        )
    ]
    pd.DataFrame(default_solvent_properties).to_csv(solvent_properties_file, index=False)
    print(f"Default solvent properties file '{solvent_properties_file}' created.")

# Always read the table back from disk so user edits to the CSV take effect
solvent_properties_df = pd.read_csv(solvent_properties_file)

# Nested lookup: {solvent name: {property column: value}}
SOLVENT_PROPERTIES = solvent_properties_df.set_index('Solvent').transpose().to_dict()

In [5]:
def generate_0D_descriptors(mol):
    """Compute the notebook's "0D" descriptors for one molecule.

    Args:
        mol: Molecule object accepted by the RDKit ``Descriptors`` functions
            and exposing ``GetNumAtoms()``.

    Returns:
        dict with the keys ``'MolWt'``, ``'NumAtoms'`` and ``'NumHeteroatoms'``.
    """
    mol_weight = Descriptors.MolWt(mol)
    atom_count = mol.GetNumAtoms()
    heteroatom_count = Descriptors.NumHeteroatoms(mol)
    return {
        'MolWt': mol_weight,
        'NumAtoms': atom_count,
        'NumHeteroatoms': heteroatom_count,
    }

def generate_1D_descriptors(mol):
    """Compute the notebook's "1D" descriptors for one molecule.

    Args:
        mol: Molecule object accepted by the RDKit ``Descriptors`` functions.

    Returns:
        dict with the keys ``'NumRings'`` and ``'NumRotatableBonds'``.
    """
    ring_count = Descriptors.RingCount(mol)
    rotatable_bond_count = Descriptors.NumRotatableBonds(mol)
    return {
        'NumRings': ring_count,
        'NumRotatableBonds': rotatable_bond_count,
    }

def generate_2D_descriptors(mol):
    """Compute the notebook's "2D" descriptors for one molecule.

    Args:
        mol: Molecule object accepted by the RDKit ``Descriptors`` and
            ``rdMolDescriptors`` functions.

    Returns:
        dict with the keys ``'TPSA'``, ``'LogP'``, ``'NumHBD'``, ``'NumHBA'``
        and ``'FractionCSP3'``, in that order.
    """
    # Values are computed in the same order as the keys appear in the result.
    tpsa = Descriptors.TPSA(mol)
    log_p = Descriptors.MolLogP(mol)
    hbd_count = rdMolDescriptors.CalcNumHBD(mol)
    hba_count = rdMolDescriptors.CalcNumHBA(mol)
    fraction_csp3 = rdMolDescriptors.CalcFractionCSP3(mol)
    return {
        'TPSA': tpsa,
        'LogP': log_p,
        'NumHBD': hbd_count,
        'NumHBA': hba_count,
        'FractionCSP3': fraction_csp3,
    }

In [6]:
calc = Calculator(descriptors, ignore_3D=True)  # ignore_3D=True if you do not want to compute 3D descriptors


In [7]:
def generate_solvent_descriptors(solvent):
    """Look up the property record for ``solvent``.

    Args:
        solvent: Key into the module-level ``SOLVENT_PROPERTIES`` mapping.

    Returns:
        The stored property dict for a known solvent; otherwise a fresh dict
        whose ``'DielectricConstant'``, ``'PolarityIndex'`` and ``'Density'``
        entries are all ``None``.
    """
    if solvent in SOLVENT_PROPERTIES:
        return SOLVENT_PROPERTIES[solvent]
    # Unknown solvent: keep the columns present but leave the values empty
    return dict.fromkeys(('DielectricConstant', 'PolarityIndex', 'Density'))

def generate_charge_descriptors(row):
    """Copy the charge / unpaired-electron fields out of an input row.

    Args:
        row: Mapping-like object (for example a pandas Series or a dict)
            indexable by ``'Charge_gn'``, ``'UHF_gn'``, ``'Charge_rd'`` and
            ``'UHF_rd'``.

    Returns:
        dict holding exactly those four entries, in that order.

    Raises:
        KeyError: if ``row`` lacks one of the four fields.
    """
    wanted_fields = ('Charge_gn', 'UHF_gn', 'Charge_rd', 'UHF_rd')
    return {field: row[field] for field in wanted_fields}

# Imported in this cell as well, so that `descriptors` refers to the Mordred
# descriptor module when the calculator below is built, even if that name has
# been reassigned elsewhere during the kernel session.
from mordred import Calculator, descriptors

# Initialize the Mordred calculator; ignore_3D=True skips the 3D descriptors
calc = Calculator(descriptors, ignore_3D=True)

def generate_descriptors(smiles, solvent, row):
    """Build the complete descriptor record for one input row.

    The record merges, in this order: all Mordred descriptors from ``calc``,
    the RDKit 0D/1D/2D descriptors, the solvent properties and the
    charge / unpaired-electron fields. On a key collision the later source
    overwrites the earlier value.

    Args:
        smiles: SMILES string of the molecule.
        solvent: Solvent name, forwarded to ``generate_solvent_descriptors``.
        row: Input-table row, forwarded to ``generate_charge_descriptors``.

    Returns:
        dict mapping descriptor name to value, or ``None`` when RDKit cannot
        parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Mordred descriptors form the base record.
    # NOTE(review): Mordred may store non-numeric placeholder objects for
    # descriptors it cannot compute -- confirm how downstream code handles
    # them (Result.fill_missing() may be appropriate).
    record = calc(mol).asdict()

    # Reuse the shared RDKit helpers instead of repeating their bodies here,
    # so the descriptor definitions live in exactly one place.
    for rdkit_source in (
        generate_0D_descriptors,
        generate_1D_descriptors,
        generate_2D_descriptors,
    ):
        record.update(rdkit_source(mol))

    # Add solvent descriptors
    record.update(generate_solvent_descriptors(solvent))

    # Add charge and unpaired electrons descriptors
    record.update(generate_charge_descriptors(row))

    return record


In [8]:
# Generate descriptors for each SMILES
descriptor_list = []
for idx, row in input_data.iterrows():
    smiles = row['SMILES_1']
    solvent = row['Solvent']
    if pd.isna(smiles):
        # No structure recorded for this row; nothing to compute
        continue
    # Deliberately not named `descriptors`: at module level that name is the
    # imported Mordred descriptor module, and rebinding it to a dict would
    # break any later (re-)execution of `Calculator(descriptors, ...)`.
    row_descriptors = generate_descriptors(smiles, solvent, row)
    if row_descriptors is not None:
        descriptor_list.append(row_descriptors)

# Create a DataFrame from the descriptor list.
# Rows with a missing or unparsable SMILES are skipped above, so this frame
# can have fewer rows than `input_data` and its index is a fresh 0..n-1 range.
output_data = pd.DataFrame(descriptor_list)
output_data

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,NumHBD,NumHBA,FractionCSP3,DielectricConstant,PolarityIndex,Density # !!! have not checked if the values are correct,Charge_gn,UHF_gn,Charge_rd,UHF_rd
0,6.432911,6.255590,0,0,11.203558,2.221583,4.443167,11.203558,1.24484,3.090233,...,1,2,0.142857,36.6,5.8,0.786,1,1,0,0
1,22.976814,18.721178,1,0,37.051229,2.579777,5.159555,37.051229,1.277629,4.315072,...,2,4,0.000000,36.6,5.8,0.786,0,0,-1,1
2,5.875634,5.525875,0,0,9.924777,2.170086,4.340173,9.924777,1.240597,2.97973,...,1,1,0.000000,36.6,5.8,0.786,1,1,0,0
3,7.180458,6.674040,0,0,12.505533,2.202654,4.405308,12.505533,1.250553,3.188488,...,0,0,0.111111,36.6,5.8,0.786,1,1,0,0
4,10.606602,8.731144,0,0,18.877841,2.210509,4.421017,18.877841,1.348417,3.551969,...,0,1,0.000000,36.6,5.8,0.786,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,7.778175,7.339588,0,0,13.80695,2.229072,4.458143,13.80695,1.255177,3.278705,...,0,1,0.200000,36.6,5.8,0.786,1,1,0,0
186,4.242641,4.000000,0,0,8.0,2.0,4.0,8.0,1.333333,2.687624,...,0,2,1.000000,36.6,5.8,0.786,1,1,0,0
187,6.473351,6.127583,0,0,11.189957,2.193993,4.387987,11.189957,1.243329,3.089765,...,1,2,0.000000,36.6,5.8,0.786,1,1,0,0
188,4.242641,4.000000,0,0,8.0,2.0,4.0,8.0,1.333333,2.687624,...,0,1,0.600000,36.6,5.8,0.786,1,1,0,0


In [9]:
# Persist the descriptor table; the DataFrame index is not written to the file
output_data.to_csv(path_or_buf='chemical_descriptors.csv', index=False)
print("Descriptors generated and saved to 'chemical_descriptors.csv'.")

Descriptors generated and saved to 'chemical_descriptors.csv'.
