# For dataset **qm9_new.csv** get 1000+ descriptors from 2+ sources: RDKit, Pubchem, Mordred, etc.

In [1]:
pip install pandas rdkit pubchempy mordred



In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from mordred import Calculator, descriptors
import pubchempy as pcp
from sklearn.feature_selection import VarianceThreshold

In [3]:
# Функция для загрузки дескрипторов из PubChem
def get_pubchem_descriptors(smiles):
    try:
        compound = pcp.get_compounds(smiles, 'smiles')
        cid = compound[0].cid
        properties = pcp.get_properties('property', cid)
        descriptors = {prop['Name']: prop['Value'] for prop in properties[0]}
        return descriptors
    except Exception as e:
        print(f"Error fetching descriptors for SMILES {smiles}: {e}")
        return None

In [11]:
# Функция для вычисления дескрипторов с помощью Mordred
def calculate_mordred_descriptors(smiles):
    try:
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is None:
            return None
        calc = Calculator(descriptors)
        descriptors = calc(molecule)
        return descriptors.asdict()
    except Exception as e:
        print(f"Error calculating Mordred descriptors for SMILES {smiles}: {e}")
        return None

In [12]:
df = pd.read_csv("qm9_new.csv")

In [13]:
# Выборка подмножества данных для тестирования (можно удалить в финальной версии)
df = df.head(100)


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mol_id  100 non-null    object 
 1   smiles  100 non-null    object 
 2   A       100 non-null    float64
 3   B       100 non-null    float64
 4   C       100 non-null    float64
 5   mu      100 non-null    float64
 6   alpha   100 non-null    float64
 7   homo    100 non-null    float64
 8   lumo    100 non-null    float64
 9   gap     100 non-null    float64
 10  r2      100 non-null    float64
 11  zpve    100 non-null    float64
 12  u0      100 non-null    float64
 13  u298    100 non-null    float64
 14  h298    100 non-null    float64
 15  g298    100 non-null    float64
 16  cv      100 non-null    float64
dtypes: float64(15), object(2)
memory usage: 13.4+ KB


In [15]:
gap_counts = df['gap'].value_counts(normalize=False)
print(gap_counts)

gap
0.2097    3
0.2046    2
0.2161    2
0.2608    2
0.2155    2
         ..
0.1847    1
0.2638    1
0.3104    1
0.2522    1
0.2479    1
Name: count, Length: 94, dtype: int64


In [16]:
# Выделение таргет параметра "gap"
target_gap = df['gap']
df.drop(columns=['gap'], inplace=True)

In [17]:
# Вычисление дескрипторов с помощью Mordred
df['mordred_descriptors'] = df['smiles'].apply(calculate_mordred_descriptors)


Error calculating Mordred descriptors for SMILES OC1C2OC(=N)C12C#C: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES C1C2OC3C4C=CC13C24: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES COC(=N)N1C=NN=C1: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES O=C1CC(CN1)N1CC1: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES CC1OC2COCC2O1: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES COCC(=O)C(=O)OC: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES C1C=CCC(=O)CC1=O: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES NC1=CC(=O)C(CO)C1: local variable 'descriptors' referenced before assignment


In [18]:
# Вычисление дескрипторов с помощью PubChem
df['pubchem_descriptors'] = df['smiles'].apply(get_pubchem_descriptors)

Error fetching descriptors for SMILES OC1C2OC(=N)C12C#C: identifier/cid cannot be None
Error fetching descriptors for SMILES C1C2OC3C4C=CC13C24: identifier/cid cannot be None
Error fetching descriptors for SMILES COC(=N)N1C=NN=C1: identifier/cid cannot be None
Error fetching descriptors for SMILES O=C1CC(CN1)N1CC1: identifier/cid cannot be None
Error fetching descriptors for SMILES CC1OC2COCC2O1: 'PUGREST.BadRequest'
Error fetching descriptors for SMILES COCC(=O)C(=O)OC: 'PUGREST.BadRequest'
Error fetching descriptors for SMILES C1C=CCC(=O)CC1=O: 'PUGREST.BadRequest'
Error fetching descriptors for SMILES NC1=CC(=O)C(CO)C1: identifier/cid cannot be None
Error fetching descriptors for SMILES CC1N=COCCC1O: identifier/cid cannot be None
Error fetching descriptors for SMILES CCC1OC1(C)C1CC1: identifier/cid cannot be None
Error fetching descriptors for SMILES CN1C2C3CC1C(=O)C23: identifier/cid cannot be None
Error fetching descriptors for SMILES CC12CC3C(CC1O)C23: identifier/cid cannot be No

In [19]:
# Объединение всех дескрипторов в один DataFrame
df_descriptors = pd.concat([df.drop(['mordred_descriptors', 'pubchem_descriptors'], axis=1),
                            df['mordred_descriptors'].apply(pd.Series),
                            df['pubchem_descriptors'].apply(pd.Series)], axis=1)

In [20]:
# Добавление таргет параметра "gap" в DataFrame с дескрипторами
df_descriptors['gap'] = target_gap

In [21]:
# Удаление ненужных столбцов
df_descriptors.drop(columns=['smiles'], inplace=True)

In [23]:
# Применение метода фильтрации по дисперсии
selector = VarianceThreshold(threshold=0.1)

In [26]:
# Удаление нечисловых столбцов перед применением селектора
numeric_df_descriptors = df_descriptors.select_dtypes(include=['number'])

In [27]:
# Применение селектора к числовому DataFrame
selected_features = selector.fit_transform(numeric_df_descriptors.drop(columns=['gap']))

In [28]:
# Получение имен выбранных признаков
selected_feature_names = numeric_df_descriptors.drop(columns=['gap']).columns[selector.get_support()]


In [29]:
# Создание DataFrame с выбранными признаками и таргетом gap
selected_features_df = pd.DataFrame(selected_features, columns=selected_feature_names)
selected_features_df['gap'] = target_gap


In [30]:
# Сохранение датасета с выбранными признаками
selected_features_df.to_csv('filtered_features.csv', index=False)