In [12]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
from mordred import Calculator, descriptors
import os

In [13]:
# Load input data from CSV file
input_file = 'data/OROP_updated.csv'
input_data = pd.read_csv(input_file)


In [14]:
# Function to generate 0D descriptors
def generate_0D_descriptors(mol):
    return {
        'MolWt': Descriptors.MolWt(mol),
        'NumAtoms': mol.GetNumAtoms(),
        'NumHeteroatoms': Descriptors.NumHeteroatoms(mol)
    }

# Function to generate 1D descriptors
def generate_1D_descriptors(mol):
    return {
        'NumRings': Descriptors.RingCount(mol),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(mol)
    }

# Function to generate 2D descriptors
def generate_2D_descriptors(mol):
    descriptors = {
        'TPSA': Descriptors.TPSA(mol),
        'LogP': Descriptors.MolLogP(mol),
        'NumHBD': rdMolDescriptors.CalcNumHBD(mol),
        'NumHBA': rdMolDescriptors.CalcNumHBA(mol),
        'FractionCSP3': rdMolDescriptors.CalcFractionCSP3(mol)
    }
    return descriptors

In [16]:
calc = Calculator(descriptors, ignore_3D=True)  # ignore_3D=True, jeśli nie chcesz obliczać deskryptorów 3D


In [17]:

# Function to add charge and unpaired electrons descriptors
def generate_charge_descriptors(row):
    return {
        'Charge_gn': row['Charge_gn'],
        'UHF_gn': row['UHF_gn'],
        'Charge_rd': row['Charge_rd'],
        'UHF_rd': row['UHF_rd']
    }

from mordred import Calculator, descriptors

# Initialize the Mordred calculator
calc = Calculator(descriptors, ignore_3D=True)

def generate_descriptors(smiles, row):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Calculate descriptors using Mordred
    mordred_result = calc(mol)
    mordred_dict = mordred_result.asdict()

    # Calculate 0D, 1D, and 2D descriptors using RDKit
    rdkit_descriptors = {
        'MolWt': Descriptors.MolWt(mol),
        'NumAtoms': mol.GetNumAtoms(),
        'NumHeteroatoms': Descriptors.NumHeteroatoms(mol),
        'NumRings': Descriptors.RingCount(mol),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
        'TPSA': Descriptors.TPSA(mol),
        'LogP': Descriptors.MolLogP(mol),
        'NumHBD': rdMolDescriptors.CalcNumHBD(mol),
        'NumHBA': rdMolDescriptors.CalcNumHBA(mol),
        'FractionCSP3': rdMolDescriptors.CalcFractionCSP3(mol)
    }

    # Combine Mordred and RDKit descriptors
    descriptors = {**mordred_dict, **rdkit_descriptors}

    # Add charge and unpaired electrons descriptors
    descriptors.update(generate_charge_descriptors(row))

    return descriptors


In [21]:
# Generate descriptors for each SMILES
descriptor_list = []
for idx, row in input_data.iterrows():
    smiles = row['SMILES_1']
    solvent = row['Solvent']
    if pd.isna(smiles):
        continue
    descriptors = generate_descriptors(smiles, row)
    if descriptors is not None:
        descriptor_list.append(descriptors)

# Create a DataFrame from the descriptor list
output_data = pd.DataFrame(descriptor_list)
output_data

TypeError: No registered converter was able to produce a C++ rvalue of type std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > from this Python object of type int

The `TypeError` indicates that there is an attempt to convert an `int` to a `std::string` (or similar type) in C++, which is not possible. This error might be occurring in the `generate_charge_descriptors` function or in the `calc` function from Mordred.

To debug this, you can add print statements to check the types of the variables being passed around. Here is an example of how you can modify the 

generate_descriptors

 function to include these checks:



In [19]:


def generate_descriptors(smiles, row):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Calculate descriptors using Mordred
    mordred_result = calc(mol)
    mordred_dict = mordred_result.asdict()

    # Calculate 0D, 1D, and 2D descriptors using RDKit
    rdkit_descriptors = {
        'MolWt': Descriptors.MolWt(mol),
        'NumAtoms': mol.GetNumAtoms(),
        'NumHeteroatoms': Descriptors.NumHeteroatoms(mol),
        'NumRings': Descriptors.RingCount(mol),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
        'TPSA': Descriptors.TPSA(mol),
        'LogP': Descriptors.MolLogP(mol),
        'NumHBD': rdMolDescriptors.CalcNumHBD(mol),
        'NumHBA': rdMolDescriptors.CalcNumHBA(mol),
        'FractionCSP3': rdMolDescriptors.CalcFractionCSP3(mol)
    }

    # Combine Mordred and RDKit descriptors
    descriptors = {**mordred_dict, **rdkit_descriptors}

    # Debugging: Print the type of row
    print(f"Type of row: {type(row)}")

    # Add charge and unpaired electrons descriptors
    charge_descriptors = generate_charge_descriptors(row)
    print(f"Type of charge_descriptors: {type(charge_descriptors)}")
    descriptors.update(charge_descriptors)

    return descriptors



Additionally, ensure that the `generate_charge_descriptors` function is returning a dictionary and that the 

row

 parameter is of the expected type. Here is an example of how you might modify `generate_charge_descriptors` to include type checks:



In [20]:


def generate_charge_descriptors(row):
    # Debugging: Print the type of row
    print(f"Type of row in generate_charge_descriptors: {type(row)}")

    # Ensure row is of the expected type (e.g., a dictionary or a specific class instance)
    if not isinstance(row, dict):
        raise TypeError("row must be a dictionary")

    # Example implementation (modify as needed)
    charge_descriptors = {
        'TotalCharge': row.get('TotalCharge', 0),
        'UnpairedElectrons': row.get('UnpairedElectrons', 0)
    }

    return charge_descriptors



By adding these checks, you can identify where the type mismatch is occurring and ensure that the correct types are being used throughout your code.

In [9]:
# Save to CSV
output_data.to_csv('chemical_descriptors.csv', index=False)
print("Descriptors generated and saved to 'chemical_descriptors.csv'.")

Descriptors generated and saved to 'chemical_descriptors.csv'.
