<a href="https://colab.research.google.com/github/illhammm/Data-in-chem/blob/main/task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# For dataset **qm9_new.csv** get 1000+ descriptors from 2+ sources: RDKit, Pubchem, Mordred, etc.

In [1]:
!pip install pandas rdkit pubchempy mordred



In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from mordred import Calculator, descriptors
import pubchempy as pcp
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, mutual_info_regression

In [3]:
# Функция для загрузки дескрипторов из PubChem
def get_pubchem_descriptors(smiles):
    try:
        compound = pcp.get_compounds(smiles, 'smiles')
        cid = compound[0].cid
        properties = pcp.get_properties('property', cid)
        descriptors = {prop['Name']: prop['Value'] for prop in properties[0]}
        return descriptors
    except Exception as e:
        print(f"Error fetching descriptors for SMILES {smiles}: {e}")
        return None

In [4]:
# Функция для получения дополнительных дескрипторов из PubChem
def get_additional_pubchem_descriptors(smiles):
    try:
        # Ваши операции для получения дополнительных дескрипторов из PubChem
        pass
    except Exception as e:
        print(f"Error fetching additional descriptors for SMILES {smiles}: {e}")
        return None

In [None]:
# # Функция для загрузки дескрипторов из PubChem
# def get_pubchem_descriptors(smiles):
#     try:
#         compound = pcp.get_compounds(smiles, 'smiles')
#         cid = compound[0].cid
#         properties = pcp.get_properties('property', cid)
#         descriptors = {prop['Name']: prop['Value'] for prop in properties[0]}
#         return descriptors
#     except Exception as e:
#         print(f"Error fetching descriptors for SMILES {smiles}: {e}")
#         return None

In [5]:
# Функция для вычисления дескрипторов с помощью Mordred
def calculate_mordred_descriptors(smiles):
    try:
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is None:
            return None
        calc = Calculator(descriptors)
        descriptors = calc(molecule)
        return descriptors.asdict()
    except Exception as e:
        print(f"Error calculating Mordred descriptors for SMILES {smiles}: {e}")
        return None

In [6]:
df = pd.read_csv("qm9_1.csv")

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mol_id  100 non-null    object 
 1   smiles  100 non-null    object 
 2   A       100 non-null    float64
 3   B       100 non-null    float64
 4   C       100 non-null    float64
 5   mu      100 non-null    float64
 6   alpha   100 non-null    float64
 7   homo    100 non-null    float64
 8   lumo    100 non-null    float64
 9   gap     100 non-null    float64
 10  r2      100 non-null    float64
 11  zpve    100 non-null    float64
 12  u0      100 non-null    float64
 13  u298    100 non-null    float64
 14  h298    100 non-null    float64
 15  g298    100 non-null    float64
 16  cv      100 non-null    float64
dtypes: float64(15), object(2)
memory usage: 13.4+ KB


In [7]:
# Выборка подмножества данных для тестирования (можно удалить в финальной версии)
df = df.head(100)

In [9]:
# Вычисление дескрипторов с помощью Mordred
df['mordred_descriptors'] = df['smiles'].apply(calculate_mordred_descriptors)

Error calculating Mordred descriptors for SMILES CCC1=CC=CCCC1: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES CCn1ccc(n1)OC: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES CC(NCC#N)C1CN1: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES CC(=O)C(=O)N1CC1: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES OC1CCC11CC1C#C: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES CC1=CCC2OC=NC12: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES COCC1C2CC1O2: local variable 'descriptors' referenced before assignment
Error calculating Mordred descriptors for SMILES CC1(C)C2N1CC2(C)O: local variable 'descriptors' referenced before assignment
Error calculat

In [10]:
from mordred import descriptors

# Создание калькулятора с использованием всех дескрипторов из Mordred
calc = Calculator(descriptors)

# Генератор списка для вычисления дескрипторов для каждой молекулы
molecules = [Chem.MolFromSmiles(smiles) for smiles in df['smiles']]

# Объединение исходного DataFrame с дескрипторами
df_with_descriptors = calc.pandas(molecules)

100%|██████████| 100/100 [00:10<00:00,  9.92it/s]


In [18]:
num_rows, num_columns = df.shape
print(df.head())

       mol_id            smiles        A        B        C      mu  alpha  \
0  gdb_109095     CCC1=CC=CCCC1  3.03486  1.07556  0.86799  0.6913  97.97   
1  gdb_126739     CCn1ccc(n1)OC  3.12802  1.11330  0.87300  1.9682  79.80   
2   gdb_61921    CC(NCC#N)C1CN1  3.23759  0.78411  0.66376  4.9958  80.82   
3   gdb_17599  CC(=O)C(=O)N1CC1  3.92086  1.63625  1.19988  1.3069  64.00   
4   gdb_94340    OC1CCC11CC1C#C  2.61019  1.28615  1.07855  2.0761  82.73   

     homo    lumo     gap         r2      zpve          u0        u298  \
0 -0.2028 -0.0147  0.1880  1437.9297  0.207597 -351.182105 -351.172934   
1 -0.2077  0.0319  0.2396  1364.4040  0.159883 -419.227054 -419.217941   
2 -0.2486  0.0209  0.2695  1669.6409  0.170769 -399.297998 -399.288094   
3 -0.2403 -0.0671  0.1732   978.7413  0.116682 -399.810511 -399.802101   
4 -0.2360  0.0475  0.2835  1195.5554  0.157575 -385.829753 -385.820728   

         h298        g298      cv mordred_descriptors  
0 -351.171990 -351.216386  35.192   

In [19]:
print(num_rows, num_columns)

100 18


In [20]:
#соеденим с нашим датасетом по smiles
merged_df = pd.concat([df, df_with_descriptors], axis=1)

In [21]:
df = merged_df

In [25]:
num_rows, num_columns = merged_df.shape
print(num_rows, num_columns)

100 1844


In [26]:
print(merged_df.head())

       mol_id            smiles        A        B        C      mu  alpha  \
0  gdb_109095     CCC1=CC=CCCC1  3.03486  1.07556  0.86799  0.6913  97.97   
1  gdb_126739     CCn1ccc(n1)OC  3.12802  1.11330  0.87300  1.9682  79.80   
2   gdb_61921    CC(NCC#N)C1CN1  3.23759  0.78411  0.66376  4.9958  80.82   
3   gdb_17599  CC(=O)C(=O)N1CC1  3.92086  1.63625  1.19988  1.3069  64.00   
4   gdb_94340    OC1CCC11CC1C#C  2.61019  1.28615  1.07855  2.0761  82.73   

     homo    lumo     gap  ...     SRW10     TSRW10          MW       AMW  \
0 -0.2028 -0.0147  0.1880  ...  8.168770  44.011196  122.109550  5.309111   
1 -0.2077  0.0319  0.2396  ...  8.502891  50.670736  126.079313  6.635753   
2 -0.2486  0.0209  0.2695  ...  8.633909  55.709489  125.095297  6.254765   
3 -0.2403 -0.0671  0.1732  ...  8.760453  54.832926  113.047678  7.536512   
4 -0.2360  0.0475  0.2835  ...  9.936003  61.495279  122.073165  6.424903   

   WPath  WPol  Zagreb1 Zagreb2  mZagreb1  mZagreb2  
0     88    11     3

In [27]:
# Вычисление дескрипторов с помощью PubChem
df['pubchem_descriptors'] = df['smiles'].apply(get_pubchem_descriptors)

Error fetching descriptors for SMILES CCC1=CC=CCCC1: 'PUGREST.BadRequest'
Error fetching descriptors for SMILES CCn1ccc(n1)OC: 'PUGREST.BadRequest'
Error fetching descriptors for SMILES CC(NCC#N)C1CN1: identifier/cid cannot be None
Error fetching descriptors for SMILES CC(=O)C(=O)N1CC1: identifier/cid cannot be None
Error fetching descriptors for SMILES OC1CCC11CC1C#C: identifier/cid cannot be None
Error fetching descriptors for SMILES CC1=CCC2OC=NC12: identifier/cid cannot be None
Error fetching descriptors for SMILES COCC1C2CC1O2: identifier/cid cannot be None
Error fetching descriptors for SMILES CC1(C)C2N1CC2(C)O: identifier/cid cannot be None
Error fetching descriptors for SMILES O=C1C2CCC1OC2: 'PUGREST.BadRequest'
Error fetching descriptors for SMILES CCC1=C(N)OC(=N)O1: identifier/cid cannot be None
Error fetching descriptors for SMILES O=C(C#C)C12CCC1O2: identifier/cid cannot be None
Error fetching descriptors for SMILES N=C1OCC2C1OC2=O: identifier/cid cannot be None
Error fetch

In [None]:
# # Функция для вычисления дескрипторов с помощью Mordred
# def calculate_mordred_descriptors(smiles):
#     try:
#         molecule = Chem.MolFromSmiles(smiles)
#         if molecule is None:
#             return None
#         calc = Calculator(descriptors)
#         descriptors = calc(molecule)
#         return descriptors.asdict()
#     except Exception as e:
#         print(f"Error calculating Mordred descriptors for SMILES {smiles}: {e}")
#         return None

In [18]:
# # Функция для получения дескрипторов из PubChem
# def get_pubchem_descriptors(smiles):
#     try:
#         compound = pcp.get_compounds(smiles, 'smiles')
#         cid = compound[0].cid
#         properties = pcp.get_properties('property', cid)
#         descriptors = {prop['Name']: prop['Value'] for prop in properties[0]}
#         return descriptors
#     except Exception as e:
#         print(f"Error fetching descriptors for SMILES {smiles}: {e}")
#         return None

In [19]:
# # Создание списка молекул из SMILES строк
# molecules = [Chem.MolFromSmiles(smiles) for smiles in df['smiles']]

In [None]:
# # Вычисление дескрипторов с помощью Mordred
# df['mordred_descriptors'] = df['smiles'].apply(calculate_mordred_descriptors)

In [28]:
# Получение дескрипторов из PubChem для каждой молекулы
pubchem_descriptors = [get_pubchem_descriptors(smiles) for smiles in df['smiles']]

Error fetching descriptors for SMILES CCC1=CC=CCCC1: 'PUGREST.BadRequest'
Error fetching descriptors for SMILES CCn1ccc(n1)OC: 'PUGREST.BadRequest'
Error fetching descriptors for SMILES CC(NCC#N)C1CN1: identifier/cid cannot be None
Error fetching descriptors for SMILES CC(=O)C(=O)N1CC1: identifier/cid cannot be None
Error fetching descriptors for SMILES OC1CCC11CC1C#C: identifier/cid cannot be None
Error fetching descriptors for SMILES CC1=CCC2OC=NC12: identifier/cid cannot be None
Error fetching descriptors for SMILES COCC1C2CC1O2: identifier/cid cannot be None
Error fetching descriptors for SMILES CC1(C)C2N1CC2(C)O: identifier/cid cannot be None
Error fetching descriptors for SMILES O=C1C2CCC1OC2: 'PUGREST.BadRequest'
Error fetching descriptors for SMILES CCC1=C(N)OC(=N)O1: identifier/cid cannot be None
Error fetching descriptors for SMILES O=C(C#C)C12CCC1O2: identifier/cid cannot be None
Error fetching descriptors for SMILES N=C1OCC2C1OC2=O: identifier/cid cannot be None
Error fetch

In [29]:
# Объединение merged_df DataFrame с дескрипторами из PubChem
df_with_pubchem_descriptors = pd.concat([merged_df, pd.DataFrame(pubchem_descriptors)], axis=1)

In [30]:
# Вывод первых нескольких строк с дескрипторами из PubChem
print(df_with_pubchem_descriptors.head())

       mol_id            smiles        A        B        C      mu  alpha  \
0  gdb_109095     CCC1=CC=CCCC1  3.03486  1.07556  0.86799  0.6913  97.97   
1  gdb_126739     CCn1ccc(n1)OC  3.12802  1.11330  0.87300  1.9682  79.80   
2   gdb_61921    CC(NCC#N)C1CN1  3.23759  0.78411  0.66376  4.9958  80.82   
3   gdb_17599  CC(=O)C(=O)N1CC1  3.92086  1.63625  1.19988  1.3069  64.00   
4   gdb_94340    OC1CCC11CC1C#C  2.61019  1.28615  1.07855  2.0761  82.73   

     homo    lumo     gap  ...          MW       AMW  WPath  WPol  Zagreb1  \
0 -0.2028 -0.0147  0.1880  ...  122.109550  5.309111     88    11     38.0   
1 -0.2077  0.0319  0.2396  ...  126.079313  6.635753     91     8     40.0   
2 -0.2486  0.0209  0.2695  ...  125.095297  6.254765    101     8     40.0   
3 -0.2403 -0.0671  0.1732  ...  113.047678  7.536512     64     8     38.0   
4 -0.2360  0.0475  0.2835  ...  122.073165  6.424903     85     9     52.0   

   Zagreb2  mZagreb1  mZagreb2 pubchem_descriptors     0  
0     40.

# Применение методов отбора признаков

In [33]:
# # Применение метода отбора признаков на основе корреляции Пирсона
# selector_pearson = SelectKBest(score_func=f_regression, k=500)
# selected_features_pearson = selector_pearson.fit_transform(df_with_pubchem_descriptors.drop(columns=['gap']), df_with_pubchem_descriptors['gap'])
# selected_feature_names_pearson = df_with_pubchem_descriptors.drop(columns=['gap']).columns[selector_pearson.get_support()]

TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

In [37]:
# Метод отбора на основе корреляции Пирсона
selector_pearson = SelectKBest(score_func=f_regression, k=500)

In [36]:
# Преобразование имен столбцов в строковый тип данных
df_with_pubchem_descriptors.columns = df_with_pubchem_descriptors.columns.astype(str)

In [40]:
print(df_with_pubchem_descriptors.dtypes.unique())

[dtype('O') dtype('float64') dtype('int64') dtype('bool')]


In [43]:
# Удаление столбцов с типом данных 'O' (строковых столбцов)
df_with_pubchem_descriptors_numeric = df_with_pubchem_descriptors.select_dtypes(exclude=['object'])

In [44]:
# Удаление столбца 'gap'
df_with_pubchem_descriptors_numeric.drop(columns=['gap'], inplace=True)


In [46]:
# Применение метода отбора признаков на основе корреляции Пирсона
selected_features_pearson = selector_pearson.fit_transform(df_with_pubchem_descriptors_numeric, df_with_pubchem_descriptors['gap'])
selected_feature_names_pearson = df_with_pubchem_descriptors_numeric.columns[selector_pearson.get_support()]

In [47]:
gap_counts = df['gap'].value_counts(normalize=False)
print(gap_counts)

gap
0.3063    2
0.1953    2
0.2680    2
0.1880    1
0.2593    1
         ..
0.2244    1
0.2831    1
0.1930    1
0.1979    1
0.2048    1
Name: count, Length: 97, dtype: int64


In [50]:
# Применение метода отбора признаков на основе взаимной информации
selector_mutual_info = SelectKBest(score_func=mutual_info_regression, k=500)

In [52]:
# Удаление столбцов с типом данных 'O' (строковых столбцов)
df_with_pubchem_descriptors_numeric = df_with_pubchem_descriptors.select_dtypes(exclude=['object'])


In [53]:
# Удаление столбцов 'gap' и столбца с идентификаторами молекул
df_with_pubchem_descriptors_numeric.drop(columns=['gap'], inplace=True)

In [55]:
# Применение метода отбора признаков на основе взаимной информации
selected_features_mutual_info = selector_mutual_info.fit_transform(df_with_pubchem_descriptors_numeric, df_with_pubchem_descriptors['gap'])
selected_feature_names_mutual_info = df_with_pubchem_descriptors_numeric.columns[selector_mutual_info.get_support()]

In [57]:
# Создание DataFrame с выбранными признаками и таргетом gap для корреляции Пирсона
selected_features_df_pearson = pd.DataFrame(selected_features_pearson, columns=selected_feature_names_pearson)
selected_features_df_pearson['gap'] = df_with_pubchem_descriptors['gap']

In [58]:
# Создание DataFrame с выбранными признаками и таргетом gap для взаимной информации
selected_features_df_mutual_info = pd.DataFrame(selected_features_mutual_info, columns=selected_feature_names_mutual_info)
selected_features_df_mutual_info['gap'] = df_with_pubchem_descriptors['gap']

In [59]:
# Сохранение датасетов с выбранными признаками
selected_features_df_pearson.to_csv('selected_features_pearson.csv', index=False)
selected_features_df_mutual_info.to_csv('selected_features_mutual_info.csv', index=False)

In [None]:
# # Добавление таргет параметра "gap" в DataFrame с дескрипторами
# df_descriptors['gap'] = target_gap

In [None]:
# # Удаление ненужных столбцов
# df_descriptors.drop(columns=['smiles'], inplace=True)

In [60]:
# Объединение выбранных признаков от метода отбора признаков на основе корреляции Пирсона и взаимной информации
selected_features_combined = pd.concat([selected_features_df_pearson, selected_features_df_mutual_info], axis=1)

In [62]:
# Сохранение объединенного датасета с выбранными признаками
selected_features_combined.to_csv('fs_combine.csv', index=False)