In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
df = pd.read_csv('df_rdkit_desc.csv')
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Molecular Weight,#RO5 Violations,AlogP,Standard Value,pChEMBL Value,Ligand Efficiency SEI,Value,MaxAbsEStateIndex,...,fr_quatN,fr_sulfide,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0,0,364.46,0.0,3.49,40.0,7.40,10.62,0.04,12.372552,...,0,0,1,0,0,0,0,0,0,0
1,1,1,372.88,0.0,4.43,1700.0,5.77,9.63,1.70,11.641862,...,0,0,1,0,0,0,0,0,0,0
2,2,2,413.90,1.0,5.19,50.0,7.30,12.69,0.05,13.229344,...,0,0,0,0,0,0,0,0,1,0
3,3,5,316.49,1.0,5.66,4500.0,5.35,20.33,4.50,12.675413,...,0,0,0,0,0,0,0,0,0,0
4,4,6,574.08,2.0,5.28,190.0,6.72,7.48,0.19,13.438530,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3962,3962,7903,244.29,0.0,3.31,81100.0,4.09,10.37,81.10,11.350518,...,0,0,0,0,0,0,0,0,0,0
3963,3963,7939,415.83,0.0,3.47,10.0,8.00,8.44,10.00,13.155765,...,0,0,0,0,0,0,0,0,0,0
3964,3964,7950,394.51,1.0,6.00,14700.0,4.83,6.91,14.70,10.501451,...,0,0,0,0,0,0,0,0,0,0
3965,3965,7968,153.14,0.0,0.67,7530.0,5.12,6.13,7.53,10.358426,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df = df.drop(['#RO5 Violations', 'AlogP', 'pChEMBL Value', 'Ligand Efficiency SEI', 'Value'], axis=1)

In [4]:
y = df['Standard Value']
X = df.drop(['Standard Value', 'Unnamed: 0.1', 'Unnamed: 0'], axis=1)

In [5]:
y_log = -np.log10(y)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

In [10]:
scal = StandardScaler()

In [11]:
X_train_scl = scal.fit_transform(X_train)
X_test_scl = scal.transform(X_test)

In [12]:
pca = PCA(n_components=0.90, random_state=42)

In [13]:
pca.fit(X_train_scl)
X_train_pca = pca.transform(X_train_scl)
X_test_pca = pca.transform(X_test_scl)

In [14]:
print("Число компонент:", pca.n_components_)

Число компонент: 59


In [15]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

rf = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
    error_score='raise'
)

grid_search.fit(X_train_pca, y_train)

best_rf = grid_search.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [16]:
best_rf.fit(X_train_pca, y_train)
y_pred = best_rf.predict(X_test_pca)

In [17]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print("R^2", r2)
print("RMSE =", rmse)

R^2 0.43748004200977775
RMSE = 0.8252511638100323


In [18]:
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500]
}

In [19]:
xgb = XGBRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_pca, y_train)

best_rf = grid_search.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [20]:
best_rf.fit(X_train_pca, y_train)
y_pred = best_rf.predict(X_test_pca)

In [21]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print("R^2", r2)
print("RMSE =", rmse)

R^2 0.41163671337759977
RMSE = 0.8439951680987999


In [22]:
lgb = LGBMRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=lgb,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_pca, y_train)

best_lgb = grid_search.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001480 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15045
[LightGBM] [Info] Number of data points in the train set: 3173, number of used features: 59
[LightGBM] [Info] Start training from score -2.802129


In [23]:
best_lgb.fit(X_train_pca, y_train)
y_pred = best_rf.predict(X_test_pca)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15045
[LightGBM] [Info] Number of data points in the train set: 3173, number of used features: 59
[LightGBM] [Info] Start training from score -2.802129


In [24]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print("R^2", r2)
print("RMSE =", rmse)

R^2 0.41163671337759977
RMSE = 0.8439951680987999


In [25]:
model = keras.Sequential([
    layers.Input(shape=(X_train_pca.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

history = model.fit(
    X_train_pca, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    verbose=2
)

y_pred = model.predict(X_test_pca).flatten()

Epoch 1/50
90/90 - 1s - 13ms/step - loss: 2.2744 - mae: 1.1497 - val_loss: 1.2487 - val_mae: 0.9068
Epoch 2/50
90/90 - 0s - 2ms/step - loss: 1.0759 - mae: 0.8202 - val_loss: 1.0176 - val_mae: 0.8113
Epoch 3/50
90/90 - 0s - 2ms/step - loss: 0.8957 - mae: 0.7423 - val_loss: 0.9866 - val_mae: 0.7879
Epoch 4/50
90/90 - 0s - 2ms/step - loss: 0.7947 - mae: 0.6943 - val_loss: 0.9224 - val_mae: 0.7561
Epoch 5/50
90/90 - 0s - 2ms/step - loss: 0.7055 - mae: 0.6487 - val_loss: 0.8767 - val_mae: 0.7325
Epoch 6/50
90/90 - 0s - 2ms/step - loss: 0.6554 - mae: 0.6220 - val_loss: 0.8649 - val_mae: 0.7265
Epoch 7/50
90/90 - 0s - 2ms/step - loss: 0.6460 - mae: 0.6032 - val_loss: 0.9037 - val_mae: 0.7476
Epoch 8/50
90/90 - 0s - 2ms/step - loss: 0.6210 - mae: 0.5869 - val_loss: 0.8302 - val_mae: 0.7170
Epoch 9/50
90/90 - 0s - 2ms/step - loss: 0.5770 - mae: 0.5608 - val_loss: 0.8575 - val_mae: 0.7190
Epoch 10/50
90/90 - 0s - 2ms/step - loss: 0.5219 - mae: 0.5439 - val_loss: 0.8346 - val_mae: 0.7036
Epoch 11

In [26]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print("R^2", r2)
print("RMSE =", rmse)

R^2 0.31823725145357307
RMSE = 0.9085185759548051


```
Результаты показали следующие - лучшей предсказательной моделью ялвяется случайный лес, хуже всего себя показал MLP
MLP показал себя наихудшим образом в связи с малым размером датасета
После дополнительной оптимизации гиперпараметров XGB и LGB показали результаты лучше чем было изначально (~0.35 r2)
На небольших выборках разница между Random Forest и бустингом часто минимальна, а иногда лес показывает себя лучше за счёт меньшей склонности к переобучению
```

