In [16]:
import numpy as np 
import pandas as pd
import scipy.stats
from tqdm import tqdm_notebook

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

import optuna
from rdkit import Chem
from molfeat.calc import FPCalculator

def canonize_smiles(smiles):
    """Return the RDKit canonical SMILES string for ``smiles``.

    Parameters
    ----------
    smiles : str
        SMILES string to canonicalize.

    Returns
    -------
    str
        The string produced by ``Chem.MolToSmiles`` for the parsed molecule.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` cannot parse ``smiles`` (it returns ``None``).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # A failed parse yields None; handing that to MolToSmiles fails with an
        # opaque argument error, so report the offending input explicitly.
        raise ValueError(f'Could not parse SMILES: {smiles!r}')
    return Chem.MolToSmiles(mol)
    
import warnings
# Blanket filter: suppresses every warning category for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment and library deprecation
# warnings raised by later cells -- consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
# Read the two input tables from the working directory: the main database (`df`)
# and the table of synthesized complexes (`test`).
df, test = (
    pd.read_csv(csv_path)
    for csv_path in ('IrLumDB.csv', 'Synthesized_complexes.csv')
)

In [4]:
# Drop rows whose L1 / L3 SMILES contain the substring 'si' or 'b'
# (case-insensitive). Filters are applied one after another in a fixed order.
# NOTE(review): this is a plain substring match, so 'b' also matches bromine
# ('Br'), not only boron -- confirm that excluding such ligands is intended.
for ligand_col, fragment in (('L1', 'si'), ('L3', 'si'), ('L1', 'b'), ('L3', 'b')):
    df = df[~df[ligand_col].apply(lambda smi: fragment in smi.lower())]

# Keep only rows whose L3 SMILES is longer than five characters. `.copy()` gives
# an independent frame, so the column assignments below never write into a
# slice of the table that was read from disk.
df = df[df['L3'].apply(lambda smi: len(smi) > 5)].copy()

# Parse each ligand SMILES into an RDKit molecule, for both tables.
for frame in (df, test):
    for ligand_col in ('L1', 'L2', 'L3'):
        frame[f'{ligand_col}_mol'] = frame[ligand_col].apply(Chem.MolFromSmiles)

# Rows measured in CH2Cl2, one row per (L1, L2, L3) combination (first occurrence
# kept), with a fresh 0..n-1 index. Built without `inplace=True` so the result is
# a new frame rather than a mutated slice of `df`.
df_ch2cl2 = (
    df[df['Solvent'] == 'CH2Cl2']
    .drop_duplicates(subset=['L1', 'L2', 'L3'])
    .reset_index(drop=True)
)

In [5]:
def get_finger(fingerprints):
    """Compute ligand fingerprints and return the PLQY feature matrix and target.

    For every fingerprint name ``f`` in ``fingerprints`` an ``FPCalculator(f)`` is
    applied to the ``L1_mol`` / ``L2_mol`` / ``L3_mol`` columns of the module-level
    frames ``df_ch2cl2`` and ``test``. The per-ligand results are stored in columns
    ``L1_{f}``, ``L2_{f}``, ``L3_{f}`` and their element-wise sum in column ``{f}``.
    The list of available fingerprints can be viewed with
    ``FPCalculator.available_fingerprints()``.

    Parameters
    ----------
    fingerprints : list of str
        Fingerprint names; also used to select the summed columns.

    Returns
    -------
    X : numpy.ndarray
        One row per selected complex; the summed fingerprints of all requested
        types are concatenated along the row.
    y : numpy.ndarray
        ``PLQY`` values of the selected rows.

    Notes
    -----
    Side effects: adds columns to the global ``df_ch2cl2`` and ``test`` frames and
    prints ``X.shape``. Only rows of ``df_ch2cl2`` with a non-null ``PLQY`` and
    ``PLQY_in_train != 0`` are returned.
    """
    ligands = ('L1', 'L2', 'L3')
    for f in fingerprints:
        calc = FPCalculator(f)
        for frame in (df_ch2cl2, test):
            for ligand in ligands:
                frame[f'{ligand}_{f}'] = frame[f'{ligand}_mol'].apply(calc)
            # Element-wise sum of the three ligand fingerprints of each complex.
            frame[f'{f}'] = np.sum([frame[f'{ligand}_{f}'] for ligand in ligands], axis=0)

    df_qy = df_ch2cl2[~df_ch2cl2['PLQY'].isna()]
    df_qy = df_qy[df_qy['PLQY_in_train'] != 0]
    X, y = df_qy[fingerprints].to_numpy(), df_qy['PLQY'].to_numpy()
    # Each row holds one array per fingerprint type; hstack joins them into a
    # single feature vector. The stacked result is already the 2-D matrix, so it
    # is returned directly instead of being rebuilt through a row-by-row
    # DataFrame concat (quadratic in the number of rows).
    X = np.array([np.hstack(row) for row in X])
    print(X.shape)

    return X, y

In [6]:
def get_plqy_class(plqy):
    """Map a PLQY value onto one of three class labels.

    Returns 0 for ``plqy <= 0.1``, 1 for ``0.1 < plqy <= 0.5`` and 2 for
    everything else.
    """
    if plqy <= 0.1:
        return 0
    return 1 if 0.1 < plqy <= 0.5 else 2

In [8]:
# Build the feature matrix from the 'ecfp' fingerprint, then replace the raw PLQY
# values in `y` by their class labels from get_plqy_class.
X, y = get_finger(['ecfp'])
y = np.array(list(map(get_plqy_class, y)))

(724, 2048)


In [9]:
# Shuffled 90/10 split with a fixed seed; the Optuna objectives fit on the
# training part and score on the validation part.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [10]:
# Shared 10-fold splitter (shuffled, fixed seed) used by cv10.
kf = KFold(
    n_splits=10,
    random_state=42,
    shuffle=True,
)

# Optuna

In [11]:
def objective_catboost(trial):
    """Optuna objective for CatBoost: validation accuracy of one sampled config.

    Samples ``learning_rate`` (log scale), ``depth`` and ``colsample_bylevel``
    from ``trial``, fits a 100-iteration ``CatBoostClassifier`` on the
    module-level ``X_train`` / ``y_train`` and returns its accuracy on
    ``X_val`` / ``y_val``.
    """
    search_space = dict(
        iterations=100,
        learning_rate=trial.suggest_float("learning_rate", 2e-3, 0.2, log=True),
        depth=trial.suggest_int("depth", 1, 12),
        colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.05, 1.0),
    )

    classifier = CatBoostClassifier(**search_space, silent=True, random_state=42)
    classifier.fit(X_train, y_train)
    return accuracy_score(y_val, classifier.predict(X_val))

In [12]:
def objective_xgboost(trial):
    """Optuna objective for XGBoost: validation accuracy of one sampled config.

    Samples ``booster``, ``eta``, ``max_depth``, ``lambda`` and ``alpha`` from
    ``trial``, fits a 100-estimator ``XGBClassifier`` on the module-level
    ``X_train`` / ``y_train`` and returns its accuracy on ``X_val`` / ``y_val``.
    """
    params = {
        "n_estimators": 100,
        'verbosity': 0,
        # The labels produced by get_plqy_class take three values (0, 1, 2), so a
        # multiclass objective is requested rather than 'binary:logistic'.
        'objective': 'multi:softprob',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear']),
        'eta': trial.suggest_float('eta', 1e-8, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'lambda': trial.suggest_float('lambda', 1e-8, 100.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 100.0, log=True),
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train, verbose=False)
    predictions = model.predict(X_val)
    return accuracy_score(y_val, predictions)

In [13]:
def objective_lgbm(trial):
    """Optuna objective for LightGBM: validation accuracy of one sampled config.

    Samples ``learning_rate`` (log scale), ``num_leaves``, ``colsample_bytree``
    and ``min_data_in_leaf`` from ``trial``, fits a 100-estimator
    ``LGBMClassifier`` on the module-level ``X_train`` / ``y_train`` and returns
    its accuracy on ``X_val`` / ``y_val``.
    """
    params = {
        "n_estimators": 100,
        # Logging is silenced here; the removed `silent` constructor argument is
        # therefore not passed any more.
        "verbosity": -1,
        # NOTE(review): no bagging fraction is set alongside bagging_freq --
        # confirm whether row subsampling was meant to be enabled.
        "bagging_freq": 1,
        "learning_rate": trial.suggest_float("learning_rate", 2*1e-3, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }
    model = LGBMClassifier(**params, random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    return accuracy_score(y_val, predictions)

In [14]:
def objective_svc(trial):
    """Optuna objective for an SVC: validation accuracy for one sampled ``C``.

    ``C`` is drawn on a log scale from [0.1, 1000]; the classifier is fitted on
    the module-level ``X_train`` / ``y_train`` and scored on ``X_val`` / ``y_val``.
    """
    penalty = trial.suggest_float("C", 0.1, 1000, log=True)

    classifier = SVC(C=penalty)
    classifier.fit(X_train, y_train)
    return accuracy_score(y_val, classifier.predict(X_val))

In [15]:
def objective_knn(trial):
    """Optuna objective for k-NN: validation accuracy for one sampled ``k``.

    ``n_neighbors`` is drawn on a log scale from [1, 100]; the classifier is
    fitted on the module-level ``X_train`` / ``y_train`` and scored on
    ``X_val`` / ``y_val``.
    """
    k = trial.suggest_int("n_neighbors", 1, 100, log=True)

    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(X_train, y_train)
    return accuracy_score(y_val, classifier.predict(X_val))

# Functions for training and validating models

In [17]:
def mean_confidence_interval(data, metric, confidence=0.95):
    """Print the mean of ``data`` and the half-width of its t-based interval.

    Parameters
    ----------
    data : sequence of numbers
        Sample values, e.g. one score per cross-validation fold.
    metric : str
        Label printed in front of the numbers.
    confidence : float, default 0.95
        Two-sided confidence level.

    Prints ``"<metric>: <mean> ± <half-width>"`` with both numbers rounded to
    three decimals. Returns nothing.
    """
    values = np.array(data) * 1.0
    dof = len(values) - 1
    # Two-sided critical value of Student's t with n - 1 degrees of freedom.
    t_crit = scipy.stats.t.ppf((1 + confidence) / 2., dof)
    half_width = round(scipy.stats.sem(values) * t_crit, 3)
    center = round(np.mean(values), 3)
    print(f'{metric}: {center} ± {half_width}')

In [18]:
def cv10(model):
    """Cross-validate ``model`` on the module-level ``X`` / ``y`` with ``kf``.

    For every fold produced by ``kf.split(X, y)`` the model is refitted on the
    training indices and evaluated on the validation indices. The per-fold
    accuracies are summarized with ``mean_confidence_interval`` (printed), and
    the confusion matrices are accumulated over all folds.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    numpy.ndarray
        3x3 confusion matrix summed over the folds; rows and columns follow the
        class labels 0, 1, 2.
    """
    class_labels = [0, 1, 2]
    conf_matrix = np.zeros((len(class_labels), len(class_labels)), dtype=int)
    acc_result = []
    # split() is a generator without a length, so the fold count is passed
    # explicitly to give the progress bar a total.
    for train, val in tqdm_notebook(kf.split(X, y), total=kf.get_n_splits()):
        model.fit(X[train], y[train])
        y_pred_class = model.predict(X[val])
        acc_result.append(accuracy_score(y[val], y_pred_class))
        # Pin the label set: without it a fold in which some class is neither
        # present nor predicted yields a smaller matrix and the += below fails.
        conf_matrix += confusion_matrix(y[val], y_pred_class, labels=class_labels)

    mean_confidence_interval(acc_result, 'Accuracy')
    return conf_matrix

# Find best params

In [None]:
# CatBoost: create a study that maximizes the objective, run 30 trials of
# objective_catboost, then display the best parameter set found.
# The study name is spelled with Latin letters on every line; a mixed
# Cyrillic/Latin spelling defines one variable and then reads another.
catboost_study = optuna.create_study(direction='maximize')
catboost_study.optimize(objective_catboost, n_trials=30)
catboost_study.best_params

In [19]:
# Keyword arguments for the final CatBoost model.
# NOTE(review): these are not the keys searched by objective_catboost
# (learning_rate, depth, colsample_bylevel) -- confirm this is intentional.
cat_bp = dict(n_estimators=1000)

In [20]:
# CatBoost classifier configured from cat_bp, with a fixed seed and no logging.
model_cat = CatBoostClassifier(
    silent=True,
    random_state=42,
    **cat_bp,
)

In [21]:
# Cross-validate the CatBoost model; cv10 prints the accuracy summary and the
# cell displays the returned confusion matrix.
cv10(model_cat)

0it [00:00, ?it/s]

Accuracy: 0.724 ± 0.041


array([[144,  47,  10],
       [ 41, 242,  40],
       [  6,  56, 138]])

In [None]:
# XGBoost: create a study that maximizes the objective, run 30 trials of
# objective_xgboost, then display the best parameter set found.
xgboost_study = optuna.create_study(direction='maximize')
xgboost_study.optimize(objective_xgboost, n_trials=30)
xgboost_study.best_params

In [22]:
# Keyword arguments for the final XGBoost model.
# NOTE(review): none of the keys searched by objective_xgboost appear here --
# confirm this is intentional.
xgb_bp = dict(n_estimators=1000)

In [23]:
# XGBoost classifier configured from xgb_bp with a fixed seed, cross-validated
# right away; the cell displays the confusion matrix returned by cv10.
model_xgb = XGBClassifier(**xgb_bp, random_state=42)
cv10(model_xgb)

0it [00:00, ?it/s]

Accuracy: 0.702 ± 0.033


array([[142,  49,  10],
       [ 39, 234,  50],
       [ 10,  58, 132]])

In [None]:
# LightGBM: create a study that maximizes the objective, run 30 trials of
# objective_lgbm, then display the best parameter set found.
lightgbm_study = optuna.create_study(direction='maximize')
lightgbm_study.optimize(objective_lgbm, n_trials=30)
lightgbm_study.best_params

In [24]:
# Keyword arguments for the final LightGBM model: values for the four keys that
# objective_lgbm searches, plus the fixed settings used by that objective.
lgbm_bp = dict(
    learning_rate=0.010108586093225807,
    num_leaves=1018,
    colsample_bytree=0.07145984259032351,
    min_data_in_leaf=2,
    n_estimators=100,
    verbosity=-1,
    bagging_freq=1,
)

In [25]:
# LightGBM classifier configured from lgbm_bp with a fixed seed. Logging is
# already silenced through lgbm_bp['verbosity'] = -1, so the removed `silent`
# constructor argument is not passed.
model_lgbm = LGBMClassifier(**lgbm_bp, random_state=42)

In [26]:
# Cross-validate the LightGBM model; cv10 prints the accuracy summary and the
# cell displays the returned confusion matrix.
cv10(model_lgbm)

0it [00:00, ?it/s]

Accuracy: 0.714 ± 0.023


array([[122,  73,   6],
       [ 17, 279,  27],
       [  6,  78, 116]])

In [None]:
# SVC: create a study that maximizes the objective, run 30 trials of
# objective_svc, then display the best parameter set found.
svc_study = optuna.create_study(direction='maximize')
svc_study.optimize(objective_svc, n_trials=30)
svc_study.best_params

In [27]:
# Keyword arguments for the final SVC model (the key searched by objective_svc).
svc_bp = dict(C=4.831903143096115)

In [28]:
# SVC configured from svc_bp; every other setting keeps the library default.
model_svc = SVC(**svc_bp)

In [29]:
# Cross-validate the SVC model; cv10 prints the accuracy summary and the cell
# displays the returned confusion matrix.
cv10(model_svc)

0it [00:00, ?it/s]

Accuracy: 0.703 ± 0.032


array([[132,  64,   5],
       [ 41, 242,  40],
       [  7,  58, 135]])

In [None]:
# k-NN: create a study that maximizes the objective, run 30 trials of
# objective_knn, then display the best parameter set found.
knn_study = optuna.create_study(direction='maximize')
knn_study.optimize(objective_knn, n_trials=30)
knn_study.best_params

In [30]:
# Keyword arguments for the final k-NN model (the key searched by objective_knn).
knn_bp = dict(n_neighbors=1)

In [31]:
# k-NN classifier configured from knn_bp; every other setting keeps the library default.
model_knn = KNeighborsClassifier(**knn_bp)

In [32]:
# Cross-validate the k-NN model; cv10 prints the accuracy summary and the cell
# displays the returned confusion matrix.
cv10(model_knn)

0it [00:00, ?it/s]

Accuracy: 0.681 ± 0.041


array([[135,  56,  10],
       [ 57, 209,  57],
       [  8,  43, 149]])