In [2]:
import numpy as np 
import pandas as pd
import scipy.stats
from tqdm import tqdm_notebook

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

import optuna
from rdkit import Chem
from molfeat.calc import FPCalculator

def canonize_smiles(smiles):
    """Return the RDKit canonical SMILES string for ``smiles``.

    Args:
        smiles: SMILES string to canonicalise.

    Returns:
        The string produced by ``Chem.MolToSmiles`` for the parsed molecule.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None; passing that on to
    # MolToSmiles would fail with an opaque argument-type error.
    if mol is None:
        raise ValueError(f"Invalid SMILES string: {smiles!r}")
    return Chem.MolToSmiles(mol)
    
import warnings
# NOTE(review): this suppresses every warning for the rest of the session, including
# deprecation notices from the libraries used in the cells below.
warnings.filterwarnings("ignore")

In [3]:
# Training database (`df`) and the external test set (`test`).
# Both CSV files are read relative to the notebook's working directory.
df = pd.read_csv('IrLumDB.csv')
test = pd.read_csv('Synthesized_complexes.csv')

In [4]:
# Drop complexes whose L1 or L3 SMILES contains the substring 'si' or 'b'
# (case-insensitive).
# NOTE(review): the 'b' test is a plain substring match, so it also removes every
# ligand containing 'Br' - confirm that this is intended.
EXCLUDED_SUBSTRINGS = ('si', 'b')
for ligand_col in ('L1', 'L3'):
    has_excluded = df[ligand_col].apply(
        lambda smi: any(token in smi.lower() for token in EXCLUDED_SUBSTRINGS)
    )
    df = df[~has_excluded]
# Keep only rows whose L3 SMILES is longer than 5 characters; `.copy()` makes `df`
# an independent frame so the column assignments below do not write to a slice.
df = df[df['L3'].apply(lambda smi: len(smi) > 5)].copy()

# Parse every ligand SMILES into an RDKit molecule, for both training and test data.
for frame in (df, test):
    for ligand_col in ('L1', 'L2', 'L3'):
        frame[f'{ligand_col}_mol'] = frame[ligand_col].apply(Chem.MolFromSmiles)

# Restrict to measurements in CH2Cl2 and keep one row per (L1, L2, L3) combination.
# Chaining returns a fresh frame instead of mutating a filtered slice in place.
df_ch2cl2 = (
    df[df['Solvent'] == 'CH2Cl2']
    .drop_duplicates(subset=['L1', 'L2', 'L3'])
    .reset_index(drop=True)
)

In [5]:
def _add_summed_fingerprint(frame, name, calc):
    """Add per-ligand fingerprint columns and their element-wise sum to ``frame`` in place."""
    per_ligand = []
    for ligand in ('L1', 'L2', 'L3'):
        column = f'{ligand}_{name}'
        frame[column] = frame[f'{ligand}_mol'].apply(calc)
        per_ligand.append(frame[column])
    # One vector per complex: the sum of the three ligand fingerprints.
    frame[f'{name}'] = np.sum(per_ligand, axis=0)


def _stack_fingerprints(frame, fingerprints):
    """Return a 2-D array whose rows concatenate the ``fingerprints`` columns of ``frame``."""
    rows = frame[fingerprints].to_numpy()
    # Building the array in one step replaces the former row-by-row pd.concat loop,
    # which was quadratic in the number of rows and yielded the same values.
    return np.array([np.hstack(row) for row in rows])


def get_finger(fingerprints):
    """
    Create fingerprint features for the training and test complexes.

    For every fingerprint name in ``fingerprints`` the three ligand molecules
    (``L1_mol``, ``L2_mol``, ``L3_mol``) are featurised with ``FPCalculator`` and the
    three vectors are summed. When several fingerprints are requested, their summed
    vectors are concatenated per complex.
    The list of available fingerprints can be viewed: FPCalculator.available_fingerprints()

    Side effects:
        Adds the columns ``L1_<fp>``, ``L2_<fp>``, ``L3_<fp>`` and ``<fp>`` to the
        notebook-level frames ``df_ch2cl2`` and ``test``, and prints the shapes of
        the two feature matrices.

    Args:
        fingerprints: list of fingerprint names accepted by ``FPCalculator``.

    Returns:
        ``(X, y, X_test, y_test)`` where ``X`` / ``X_test`` are the feature matrices
        and ``y`` / ``y_test`` are the ``Max_wavelength(nm)`` columns as arrays.
    """
    for f in fingerprints:
        calc = FPCalculator(f)
        _add_summed_fingerprint(df_ch2cl2, f, calc)
        _add_summed_fingerprint(test, f, calc)

    X = _stack_fingerprints(df_ch2cl2, fingerprints)
    y = df_ch2cl2['Max_wavelength(nm)'].to_numpy()
    print(X.shape)

    X_test = _stack_fingerprints(test, fingerprints)
    y_test = test['Max_wavelength(nm)'].to_numpy()
    print(X_test.shape)

    return X, y, X_test, y_test

In [6]:
# Build the feature matrices from the 'ecfp' fingerprint; get_finger prints both shapes.
X, y, X_test, y_test = get_finger(['ecfp'])

Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality


(785, 2048)
(33, 2048)


In [7]:
# 80/20 hold-out split used by the Optuna objective functions.
# NOTE(review): X_val is a subset of X, which cv10 later cross-validates on, so the
# hyperparameters are tuned on rows that also appear in the CV folds.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [8]:
# Shuffled, seeded 10-fold splitter shared by every cv10() call.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Optuna objective functions

In [9]:
def objective_catboost(trial):
    """Optuna objective: hold-out RMSE of a CatBoostRegressor.

    Fits on ``(X_train, y_train)`` with hyperparameters sampled from ``trial`` and
    a fixed 1000 iterations, then scores on ``(X_val, y_val)``.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        Root-mean-squared error on the validation split (lower is better).
    """
    params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 2*1e-3, 0.2, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        # NOTE(review): CatBoost may only honour min_data_in_leaf for non-default
        # grow policies - confirm that this parameter has an effect here.
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    model = CatBoostRegressor(**params, silent=True, random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    # mean_squared_error(squared=False) was deprecated in scikit-learn 1.4 and
    # removed in 1.6; taking the square root works on every version.
    rmse = np.sqrt(mean_squared_error(y_val, predictions))
    return rmse

In [10]:
def objective_lgbm(trial):
    """Optuna objective: hold-out RMSE of an LGBMRegressor.

    Fits on ``(X_train, y_train)`` with hyperparameters sampled from ``trial`` and
    a fixed 1000 estimators, then scores on ``(X_val, y_val)``.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        Root-mean-squared error on the validation split (lower is better).
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "n_estimators": 1000,
        "verbosity": -1,
        "bagging_freq": 1,
        "learning_rate": trial.suggest_float("learning_rate", 2*1e-3, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }
    # `silent` is no longer a constructor argument in current LightGBM releases;
    # logging is already disabled through "verbosity": -1 above.
    model = LGBMRegressor(**params, random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    # mean_squared_error(squared=False) was deprecated in scikit-learn 1.4 and
    # removed in 1.6; taking the square root works on every version.
    rmse = np.sqrt(mean_squared_error(y_val, predictions))
    return rmse

In [11]:
def objective_xgboost(trial):
    """Optuna objective: hold-out RMSE of an XGBRegressor.

    Fits on ``(X_train, y_train)`` with hyperparameters sampled from ``trial`` and
    a fixed 1000 estimators, then scores on ``(X_val, y_val)``.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        Root-mean-squared error on the validation split (lower is better).
    """
    params = {
        "objective": "reg:squarederror",
        "n_estimators": 1000,
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }

    # Same seed as the final model_xgb and the other objectives, so row/column
    # subsampling is evaluated under the configuration that is used afterwards.
    model = XGBRegressor(**params, random_state=42)
    model.fit(X_train, y_train, verbose=False)
    predictions = model.predict(X_val)
    # mean_squared_error(squared=False) was deprecated in scikit-learn 1.4 and
    # removed in 1.6; taking the square root works on every version.
    rmse = np.sqrt(mean_squared_error(y_val, predictions))
    return rmse

In [12]:
def objective_svr(trial):
    """Optuna objective: hold-out RMSE of an SVR.

    Samples ``C`` and ``epsilon`` on a log scale, fits on ``(X_train, y_train)`` and
    scores on ``(X_val, y_val)``.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        Root-mean-squared error on the validation split (lower is better).
    """
    params = {
        "C": trial.suggest_float('C', 1, 1000, log=True),
        "epsilon": trial.suggest_float('epsilon', 1e-3, 1, log=True),
    }

    model = SVR(**params)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    # mean_squared_error(squared=False) was deprecated in scikit-learn 1.4 and
    # removed in 1.6; taking the square root works on every version.
    rmse = np.sqrt(mean_squared_error(y_val, predictions))
    return rmse

In [13]:
def objective_knn(trial):
    """Optuna objective: hold-out RMSE of a KNeighborsRegressor.

    Samples ``n_neighbors`` on a log scale, fits on ``(X_train, y_train)`` and
    scores on ``(X_val, y_val)``.

    Args:
        trial: the ``optuna`` trial used to sample hyperparameters.

    Returns:
        Root-mean-squared error on the validation split (lower is better).
    """
    params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 1, 100, log=True),
    }

    # KNeighborsRegressor has no `random_state` parameter; passing one raised a
    # TypeError on every trial.
    model = KNeighborsRegressor(**params)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    # mean_squared_error(squared=False) was deprecated in scikit-learn 1.4 and
    # removed in 1.6; taking the square root works on every version.
    rmse = np.sqrt(mean_squared_error(y_val, predictions))
    return rmse

# Functions for training and validating models

In [14]:
def mean_confidence_interval(data, metric, confidence=0.95):
    """Print ``metric`` as ``mean ± half-width`` of a Student-t confidence interval.

    Args:
        data: sequence of numeric scores (e.g. one value per CV fold).
        metric: label printed in front of the numbers.
        confidence: two-sided confidence level of the interval.

    Returns:
        None. The summary line is written to stdout, both numbers rounded to
        two decimals.
    """
    values = np.array(data) * 1.0
    dof = len(values) - 1
    center = np.mean(values)
    std_err = scipy.stats.sem(values)
    # Half-width = standard error times the two-sided t quantile.
    half_width = std_err * scipy.stats.t.ppf((1 + confidence) / 2., dof)
    print(f'{metric}: {round(center, 2)} ± {round(half_width, 2)}')

In [15]:
def cv10(model):
    """Cross-validate ``model`` on ``(X, y)`` with the shared ``kf`` splitter.

    The model is refitted on each training fold and scored on the matching
    validation fold. MAE, RMSE and R2 are then summarised across folds by
    ``mean_confidence_interval``, which prints one line per metric.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        None. Results are printed.
    """
    mae_result = []
    rmse_result = []
    r2_result = []
    # kf.split() is a generator without a length, so the bar needs an explicit
    # total to show progress instead of "0it".
    for train, val in tqdm_notebook(kf.split(X, y), total=kf.get_n_splits()):
        model.fit(X[train], y[train])
        y_pred_val = model.predict(X[val])
        mae_result.append(mean_absolute_error(y[val], y_pred_val))
        # mean_squared_error(squared=False) was deprecated in scikit-learn 1.4 and
        # removed in 1.6; taking the square root works on every version.
        rmse_result.append(np.sqrt(mean_squared_error(y[val], y_pred_val)))
        r2_result.append(r2_score(y[val], y_pred_val))
    mean_confidence_interval(mae_result, 'MAE')
    mean_confidence_interval(rmse_result, 'RMSE')
    mean_confidence_interval(r2_result, 'R2')

In [16]:
def predict_test(model):
    """Fit ``model`` on all of ``(X, y)`` and report its scores on ``(X_test, y_test)``.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        None. MAE, RMSE and R2 are printed, each rounded to two decimals.
    """
    model.fit(X, y)
    y_pred_test = model.predict(X_test)
    mae = round(mean_absolute_error(y_test, y_pred_test), 2)
    # mean_squared_error(squared=False) was deprecated in scikit-learn 1.4 and
    # removed in 1.6; taking the square root works on every version.
    rmse = round(np.sqrt(mean_squared_error(y_test, y_pred_test)), 2)
    r2 = round(r2_score(y_test, y_pred_test), 2)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')
    print(f'R2: {r2}')

# Find the best hyperparameters

In [None]:
# catboost: 30-trial Optuna search minimising the hold-out RMSE from objective_catboost.
# The variable name is spelled with Latin letters only; the first three lines used
# to start with a Cyrillic "с", so the final `catboost_study` lookup raised NameError.
# The sampler is seeded so the trial sequence is repeatable across runs.
catboost_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
catboost_study.optimize(objective_catboost, n_trials=30)
catboost_study.best_params

In [17]:
# Hyperparameters for CatBoost, hard-coded as literals.
# 'iterations' is the value fixed in objective_catboost; it is listed explicitly so
# the configuration is complete, as in xgb_bp and lgbm_bp.
cat_bp = {'learning_rate': 0.10283991617950267,
          'depth': 8,
          'subsample': 0.5843053868788202,
          'colsample_bylevel': 0.2298254995755487,
          'min_data_in_leaf': 30,
          'iterations': 1000}

In [18]:
# CatBoost built from cat_bp, with the same seed and silent mode as objective_catboost.
model_cat = CatBoostRegressor(**cat_bp, random_state=42, silent=True)

In [19]:
# 10-fold cross-validation of the CatBoost model on (X, y).
cv10(model_cat)

0it [00:00, ?it/s]

MAE: 18.78 ± 1.47
RMSE: 27.37 ± 2.41
R2: 0.84 ± 0.05


In [20]:
# Refit on all of (X, y) and score on the external test set (X_test, y_test).
predict_test(model_cat)

MAE: 16.85
RMSE: 20.26
R2: 0.75


In [None]:
# xgboost: 30-trial Optuna search minimising the hold-out RMSE from objective_xgboost.
# The sampler is seeded so the trial sequence is repeatable across runs.
xgboost_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
xgboost_study.optimize(objective_xgboost, n_trials=30)
xgboost_study.best_params

In [21]:
# XGBoost hyperparameters, hard-coded as literals.
xgb_bp = dict(
    learning_rate=0.06238679289783574,
    max_depth=7,
    subsample=0.9982371412074009,
    colsample_bytree=0.1637075927345035,
    min_child_weight=1,
    objective='reg:squarederror',
    n_estimators=1000,
    verbosity=0,
)

In [22]:
# XGBoost built from xgb_bp. The obsolete `silent` argument is dropped: logging is
# already controlled by 'verbosity': 0 inside xgb_bp.
model_xgb = XGBRegressor(**xgb_bp, random_state=42)

In [23]:
# 10-fold cross-validation of the XGBoost model on (X, y).
cv10(model_xgb)

0it [00:00, ?it/s]

MAE: 18.36 ± 1.38
RMSE: 26.79 ± 1.76
R2: 0.85 ± 0.04


In [24]:
# Refit on all of (X, y) and score on the external test set (X_test, y_test).
predict_test(model_xgb)

MAE: 19.56
RMSE: 22.54
R2: 0.69


In [None]:
# lightgbm: 30-trial Optuna search minimising the hold-out RMSE from objective_lgbm.
# The sampler is seeded so the trial sequence is repeatable across runs.
lightgbm_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
lightgbm_study.optimize(objective_lgbm, n_trials=30)
lightgbm_study.best_params

In [25]:
# LightGBM hyperparameters, hard-coded as literals.
lgbm_bp = dict(
    learning_rate=0.008945691798973802,
    num_leaves=430,
    subsample=0.421299382581721,
    colsample_bytree=0.7782704746853497,
    min_data_in_leaf=1,
    objective='regression',
    metric='rmse',
    n_estimators=1000,
    verbosity=-1,
    bagging_freq=1,
)

In [26]:
# LightGBM built from lgbm_bp, followed by 10-fold cross-validation on (X, y).
model_lgbm = LGBMRegressor(**lgbm_bp, random_state=42)
cv10(model_lgbm)

0it [00:00, ?it/s]

MAE: 18.26 ± 1.95
RMSE: 27.32 ± 3.34
R2: 0.84 ± 0.06


In [27]:
# Refit on all of (X, y) and score on the external test set (X_test, y_test).
predict_test(model_lgbm)

MAE: 18.53
RMSE: 21.55
R2: 0.72


In [None]:
# svr: 30-trial Optuna search minimising the hold-out RMSE from objective_svr.
# The sampler is seeded so the trial sequence is repeatable across runs.
svr_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
svr_study.optimize(objective_svr, n_trials=30)
svr_study.best_params

In [28]:
# SVR hyperparameters, hard-coded as literals.
svr_bp = dict(
    C=980.3782570100385,
    epsilon=0.0010807171902425606,
)

In [29]:
# SVR built from svr_bp (C and epsilon); no other arguments are passed.
model_svr = SVR(**svr_bp)

In [30]:
# 10-fold cross-validation of the SVR model on (X, y).
cv10(model_svr)

0it [00:00, ?it/s]

MAE: 21.45 ± 1.45
RMSE: 31.54 ± 2.27
R2: 0.8 ± 0.04


In [31]:
# Refit on all of (X, y) and score on the external test set (X_test, y_test).
predict_test(model_svr)

MAE: 21.01
RMSE: 25.81
R2: 0.59


In [None]:
# knn: 30-trial Optuna search minimising the hold-out RMSE from objective_knn.
# The sampler is seeded so the trial sequence is repeatable across runs.
knn_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
knn_study.optimize(objective_knn, n_trials=30)
knn_study.best_params

In [32]:
# k-NN hyperparameter, hard-coded as a literal.
knn_bp = dict(n_neighbors=2)

In [33]:
# k-NN regressor built from knn_bp; no other arguments are passed.
model_knn = KNeighborsRegressor(**knn_bp)

In [34]:
# 10-fold cross-validation of the k-NN model on (X, y).
cv10(model_knn)

0it [00:00, ?it/s]

MAE: 25.31 ± 2.05
RMSE: 39.17 ± 4.83
R2: 0.69 ± 0.06


In [35]:
# Refit on all of (X, y) and score on the external test set (X_test, y_test).
predict_test(model_knn)

MAE: 29.3
RMSE: 36.64
R2: 0.18
