# **Hometask 2**
1. For your dataset get 1000+ descriptors from 2+ sources: RDKit, Pubchem, Mordred, etc.
2. Select features from downloaded based on at least 2 selection techniques.

loading libraries

In [1]:
!pip install pandas>None
!pip install rdkit>None
!pip install pubchempy>None
!pip install mordred

Collecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting networkx==2.* (from mordred)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: mordred
  Building wheel for mordred (setup.py) ... [?25l[?25hdone
  Created wheel for mordred: filename=mordred-1.2.0-py3-none-any.whl size=176720 sha256=f3ad2ddd33a095c23560afcb09769102e19f71df06bf0de268b0b0b53677512e
  Stored in directory: /root/.cache/pip/wheels/a7/4f/b8/d4c6591f6ac944aaced7865b349477695f662388ad958743c7
Successfully built mordred
Installing collected packages: networkx, mordred
  Attempting uninstall: networkx
    Found existing installation: networkx 3.2.1
    Uninst

In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from mordred import Calculator, descriptors

In [3]:
from sklearn.feature_selection import VarianceThreshold

In [4]:
from mordred import descriptors

In [5]:
df = pd.read_csv('/content/my_data.csv')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mol_id  20000 non-null  object 
 1   smiles  20000 non-null  object 
 2   A       20000 non-null  float64
 3   B       20000 non-null  float64
 4   C       20000 non-null  float64
 5   mu      20000 non-null  float64
 6   alpha   20000 non-null  float64
 7   homo    20000 non-null  float64
 8   lumo    20000 non-null  float64
 9   gap     20000 non-null  float64
 10  r2      20000 non-null  float64
 11  zpve    20000 non-null  float64
 12  u0      20000 non-null  float64
 13  u298    20000 non-null  float64
 14  h298    20000 non-null  float64
 15  g298    20000 non-null  float64
 16  cv      20000 non-null  float64
dtypes: float64(15), object(2)
memory usage: 2.6+ MB


# 1.1 Loading Rdkit and mordred descriptors

In [7]:
# Getting a list of available handles
descriptor_names = list(rdMolDescriptors.Properties.GetAvailableProperties())
get_descriptors = rdMolDescriptors.Properties(descriptor_names)
num_descriptors = len(descriptor_names)

# Initializing an empty matrix for descriptors
descriptors_set = np.empty((0, num_descriptors), float)

# Calculation of descriptors
for _, row in df.iterrows():
    smiles = row['smiles']
    molecule = Chem.MolFromSmiles(smiles)

    if molecule is not None:
        descriptors = np.array(get_descriptors.ComputeProperties(molecule)).reshape((-1, num_descriptors))
        descriptors_set = np.append(descriptors_set, descriptors, axis=0)

# Creating a DataFrame with Handles
df_descriptors = pd.DataFrame(descriptors_set, columns=descriptor_names)

# Merging an existing dataset with new descriptors
df_rdkit = pd.concat([df, df_descriptors], axis=1)

In [8]:
# New descriptors 
df = df_rdkit
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 60 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   mol_id                           20000 non-null  object 
 1   smiles                           20000 non-null  object 
 2   A                                20000 non-null  float64
 3   B                                20000 non-null  float64
 4   C                                20000 non-null  float64
 5   mu                               20000 non-null  float64
 6   alpha                            20000 non-null  float64
 7   homo                             20000 non-null  float64
 8   lumo                             20000 non-null  float64
 9   gap                              20000 non-null  float64
 10  r2                               20000 non-null  float64
 11  zpve                             20000 non-null  float64
 12  u0                

There are still less than 1000 descriptors needed in the task, so we will get more descriptors using Mordred

# 1.2 Descriptors from Mordred

In [None]:
from mordred import descriptors

In [9]:
# Creating a calculator using all the descriptors from Mordred
calc = Calculator(descriptors)

# List generator to calculate descriptors for each molecule
molecules = [Chem.MolFromSmiles(smiles) for smiles in df['smiles']]

# Merging the original DataFrame with handles
df_with_descriptors = calc.pandas(molecules)

100%|██████████| 20000/20000 [31:09<00:00, 10.70it/s]


In [10]:
# Connect with our smiles dataset
merged_df = pd.concat([df, df_with_descriptors], axis=1)

In [11]:
df = merged_df # All data

# Checking the result
print(merged_df.head())

       mol_id            smiles        A        B        C      mu  alpha  \
0   gdb_50579    O=CC1CCC2OC2C1  3.61423  1.09149  0.93260  2.0905  75.54   
1  gdb_108157  COC12CC=C3CC1C23  3.33497  1.50547  1.26311  1.5224  82.20   
2    gdb_1880      OC1CCC(=O)C1  5.59112  1.96604  1.53851  2.2377  56.85   
3   gdb_34264  N#CC12NC3CNC1C23  4.19762  1.34307  1.21397  5.2940  74.13   
4   gdb_22038  CC(=NO)CC(C)(C)C  2.79330  0.93080  0.84919  0.5406  88.74   

     homo    lumo     gap  ...      SRW10     TSRW10          MW       AMW  \
0 -0.2491 -0.0233  0.2258  ...   9.206433  58.472343  126.068080  6.635162   
1 -0.2118 -0.0292  0.1826  ...  10.344738  64.308792  122.073165  6.424903   
2 -0.2381 -0.0172  0.2209  ...   8.206584  47.382536  100.052429  6.670162   
3 -0.2286  0.0082  0.2368  ...  10.394610  64.379700  121.063997  7.566500   
4 -0.2380  0.0127  0.2507  ...   8.498418  36.964640  129.115364  5.379807   

   WPath  WPol  Zagreb1  Zagreb2  mZagreb1  mZagreb2  
0     85     

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Columns: 1886 entries, mol_id to mZagreb2
dtypes: bool(2), float64(852), int64(323), object(709)
memory usage: 287.5+ MB


As you can see in our dataset there are 20K rows and 18K columns

You can delete mol id or Smiles, so this data is just labels of our connections, I want to leave Smiles

In [13]:
df.drop(columns=['mol_id'], inplace=True)

In [14]:
df.to_csv('data_morded', encoding='utf-8')

In [15]:
# The file is too big and colab doesn’t load it as one file
# So I'll split it into parts and download it in parts
num_parts = 5
total_rows = len(df)
chunk_size = total_rows // num_parts

for i in range(num_parts):
    start_idx = i * chunk_size
    end_idx = start_idx + chunk_size
    if i == num_parts - 1:  # For the last chunk, include remaining rows
        end_idx = total_rows
    df_part = df.iloc[start_idx:end_idx]
    df_part.to_csv(f'data_mordred_{i+1}.csv', encoding='utf-8', index=False)

# 2. Descriptors were selected using Pearson collation and feature variance filtering method

If a descriptor has a variance equal to zero, then it does not introduce new information about the object and is redundant, so next we will check whether there are such signs and whether they can be removed

In [16]:
df

Unnamed: 0,smiles,A,B,C,mu,alpha,homo,lumo,gap,r2,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,O=CC1CCC2OC2C1,3.61423,1.09149,0.93260,2.0905,75.54,-0.2491,-0.0233,0.2258,1253.8737,...,9.206433,58.472343,126.068080,6.635162,85,9,48.0,57.0,2.583333,2.027778
1,COC12CC=C3CC1C23,3.33497,1.50547,1.26311,1.5224,82.20,-0.2118,-0.0292,0.1826,1047.6956,...,10.344738,64.308792,122.073165,6.424903,75,8,60.0,82.0,2.395833,1.888889
2,OC1CCC(=O)C1,5.59112,1.96604,1.53851,2.2377,56.85,-0.2381,-0.0172,0.2209,771.3820,...,8.206584,47.382536,100.052429,6.670162,41,4,32.0,34.0,2.972222,1.583333
3,N#CC12NC3CNC1C23,4.19762,1.34307,1.21397,5.2940,74.13,-0.2286,0.0082,0.2368,1017.8124,...,10.394610,64.379700,121.063997,7.566500,77,7,60.0,82.0,2.395833,1.888889
4,CC(=NO)CC(C)(C)C,2.79330,0.93080,0.84919,0.5406,88.74,-0.2380,0.0127,0.2507,1518.5343,...,8.498418,36.964640,129.115364,5.379807,94,7,38.0,37.0,5.673611,2.041667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,C1OC23CCN=C2NC13,3.34284,1.73396,1.32936,3.1256,73.93,-0.2273,0.0109,0.2382,958.9895,...,10.307719,56.408375,124.063663,7.297863,74,9,58.0,76.0,1.784722,1.833333
19996,CC(C#C)C1=CON=C1,3.29901,1.16566,0.91521,2.9980,75.07,-0.2584,-0.0125,0.2459,1236.5085,...,8.520986,50.281704,121.052764,7.565798,89,8,40.0,44.0,3.472222,2.194444
19997,CC(C)(C)C(C#N)C#N,1.79871,1.63214,1.07089,4.5091,79.46,-0.3313,-0.0089,0.3225,1197.0545,...,8.751000,37.448312,122.084398,6.425495,88,10,38.0,40.0,5.673611,2.166667
19998,COC(=O)C1CC(=O)N1,4.65384,0.89755,0.83874,4.4515,66.65,-0.2559,-0.0047,0.2512,1364.6319,...,9.144094,38.770899,129.042593,8.065162,94,8,42.0,47.0,4.083333,2.111111


In [17]:
df.dtypes # As you can see, we have a lot of object type descriptors

smiles       object
A           float64
B           float64
C           float64
mu          float64
             ...   
WPol          int64
Zagreb1     float64
Zagreb2     float64
mZagreb1    float64
mZagreb2    float64
Length: 1885, dtype: object

Since the columns with errors are only of the object type, we will delete them, but there are two identification columns of this type (smiles and model id), I want to leave them for now

In [None]:
df['GATS4p']

In [18]:
# Removing columns that generated the error module 'numpy' has no attribute 'float'.\n`np...
del_list=[]
for col in df.columns:
  if df[col].dtypes == 'object' and col not in ['mol_id', 'smiles']:
    del_list.append(col)
print(del_list)

['ABC', 'ABCGG', 'AATS3dv', 'AATS4dv', 'AATS5dv', 'AATS6dv', 'AATS7dv', 'AATS8dv', 'AATS3d', 'AATS4d', 'AATS5d', 'AATS6d', 'AATS7d', 'AATS8d', 'AATS3s', 'AATS4s', 'AATS5s', 'AATS6s', 'AATS7s', 'AATS8s', 'AATS3Z', 'AATS4Z', 'AATS5Z', 'AATS6Z', 'AATS7Z', 'AATS8Z', 'AATS3m', 'AATS4m', 'AATS5m', 'AATS6m', 'AATS7m', 'AATS8m', 'AATS3v', 'AATS4v', 'AATS5v', 'AATS6v', 'AATS7v', 'AATS8v', 'AATS3se', 'AATS4se', 'AATS5se', 'AATS6se', 'AATS7se', 'AATS8se', 'AATS3pe', 'AATS4pe', 'AATS5pe', 'AATS6pe', 'AATS7pe', 'AATS8pe', 'AATS3are', 'AATS4are', 'AATS5are', 'AATS6are', 'AATS7are', 'AATS8are', 'AATS3p', 'AATS4p', 'AATS5p', 'AATS6p', 'AATS7p', 'AATS8p', 'AATS3i', 'AATS4i', 'AATS5i', 'AATS6i', 'AATS7i', 'AATS8i', 'AATSC3c', 'AATSC4c', 'AATSC5c', 'AATSC6c', 'AATSC7c', 'AATSC8c', 'AATSC3dv', 'AATSC4dv', 'AATSC5dv', 'AATSC6dv', 'AATSC7dv', 'AATSC8dv', 'AATSC3d', 'AATSC4d', 'AATSC5d', 'AATSC6d', 'AATSC7d', 'AATSC8d', 'AATSC3s', 'AATSC4s', 'AATSC5s', 'AATSC6s', 'AATSC7s', 'AATSC8s', 'AATSC3Z', 'AATSC4Z', '

In [19]:
df = df.drop(columns = del_list)
df.info() # We have two bool columns, we need to check them

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Columns: 1178 entries, smiles to mZagreb2
dtypes: bool(2), float64(852), int64(323), object(1)
memory usage: 179.5+ MB


In [20]:
for col in df.columns:
  if df[col].dtypes == 'bool':
    df[col] = df[col].astype(int) # convert to 0 and 1
print(df['Lipinski']) # check

0        1
1        1
2        1
3        1
4        1
        ..
19995    1
19996    1
19997    1
19998    1
19999    1
Name: Lipinski, Length: 20000, dtype: int64


In [21]:
df.to_csv('data_before_Pirson.csv', encoding='utf-8') # to Pearson
#files.download('data_descr1.csv')

1. Pearson correlation

(removing features that have an inverse or direct linear relationship)

In [22]:
df_ex = df.select_dtypes(exclude='object') # Let's take all columns except identification ones

In [23]:
correlation_matrix = df.corr(numeric_only=True)

gap and homo and lumo correlate: since gap = lumo-homo

In [24]:
gap = df['gap'] # To save this column from being truncated since it is the target variable

In [25]:
# homo and Lumo are strongly correlated initially, so it’s worth removing them
columns_to_drop = ['homo', 'lumo']
df.drop(columns=columns_to_drop, inplace=True)

In [26]:
FILTER_THRESHOLD = 0.7 #Пороговое значение для корреляции
highly_correlated_features = set() #Пустое множество для хранения сильно коррелирующих признаков

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > FILTER_THRESHOLD:
            colname = correlation_matrix.columns[i]
            if colname != 'gap': #gap целевая переменная её нельзя удалить
              highly_correlated_features.add(colname)

In [27]:
df_filtered = df.drop(columns=highly_correlated_features) # удалим коррелирующие колонки

In [28]:
df_filtered['gap'] = gap

In [29]:
df_filtered.info() #у нас осталось 330 колонки - и smiles типа object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Columns: 330 entries, smiles to TopoShapeIndex
dtypes: float64(110), int64(219), object(1)
memory usage: 50.4+ MB


In [30]:
df = df_filtered

2. Метод фильтрации признаков по дисперсии

(Данные не имеют ценнности для построения модели, так как разборос значения низкий или равен нулю)

In [31]:
# Вычисление дисперсии для каждого столбца
variances = df.var()

# Определение индексов столбцов с нулевой дисперсией
zero_variance_columns = variances[variances == 0].index

# Удаление столбцов с нулевой дисперсией
df_filt = df.drop(columns=zero_variance_columns)

  variances = df.var()


А теперь у нас 144 признаков

In [32]:
df_filt

Unnamed: 0,smiles,A,B,mu,alpha,gap,r2,u0,lipinskiHBA,lipinskiHBD,...,n7aRing,n9aRing,n5FRing,n6FRing,n7FRing,n8FRing,n9FaRing,GGI4,GGI5,TopoShapeIndex
0,O=CC1CCC2OC2C1,3.61423,1.09149,2.0905,75.54,0.2258,1253.8737,-423.035656,2.0,0.0,...,0,0,0,0,1,0,0,0.276111,0.142500,0.666667
1,COC12CC=C3CC1C23,3.33497,1.50547,1.5224,82.20,0.1826,1047.6956,-385.774234,1.0,0.0,...,0,0,0,0,1,0,0,0.250000,0.000000,1.000000
2,OC1CCC(=O)C1,5.59112,1.96604,2.2377,56.85,0.2209,771.3820,-345.686487,2.0,1.0,...,0,0,0,0,0,0,0,0.000000,0.000000,1.000000
3,N#CC12NC3CNC1C23,4.19762,1.34307,5.2940,74.13,0.2368,1017.8124,-396.880645,3.0,2.0,...,0,0,0,0,1,0,0,0.262222,0.062500,0.666667
4,CC(=NO)CC(C)(C)C,2.79330,0.93080,0.5406,88.74,0.2507,1518.5343,-405.527293,2.0,1.0,...,0,0,0,0,0,0,0,0.240000,0.000000,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,C1OC23CCN=C2NC13,3.34284,1.73396,3.1256,73.93,0.2382,958.9895,-417.956479,3.0,1.0,...,0,0,0,0,0,0,0,0.097222,0.000000,1.000000
19996,CC(C#C)C1=CON=C1,3.29901,1.16566,2.9980,75.07,0.2459,1236.5085,-400.708122,2.0,0.0,...,0,0,0,0,0,0,0,0.250000,0.080000,0.666667
19997,CC(C)(C)C(C#N)C#N,1.79871,1.63214,4.5091,79.46,0.3225,1197.0545,-382.105873,2.0,0.0,...,0,0,0,0,0,0,0,0.000000,0.000000,1.000000
19998,COC(=O)C1CC(=O)N1,4.65384,0.89755,4.4515,66.65,0.2512,1364.6319,-475.072597,4.0,1.0,...,0,0,0,0,0,0,0,0.422222,0.118056,1.000000


In [38]:
df_filt.to_csv('data_filt_all.csv', encoding='utf-8')

In [33]:
#файл слишком большой и колаб его не грузит
#поэтому я разделю его на части и заргужу
num_parts = 5
total_rows = len(df_filt)
chunk_size = total_rows // num_parts

for i in range(num_parts):
    start_idx = i * chunk_size
    end_idx = start_idx + chunk_size
    if i == num_parts - 1:  # for the last chunk, include remaining rows
        end_idx = total_rows
    df_part = df_filt.iloc[start_idx:end_idx]
    df_part.to_csv(f'data_filt2_{i+1}.csv', encoding='utf-8', index=False)