<h1 style="color:rgb(0,120,170)">Artificial Intelligence in Life Sciences</h1>
<h2 style="color:rgb(0,120,170)">QSAR and model evaluation</h2>

<b>Authors:</b> Rumetshofer, Renz, Schimunek <br>
<b>Date:</b> 24-03-2022

This file is part of the "Artificial Intelligence in Life Sciences" lecture material.
The following copyright statement applies to all code within this file.

<b>Copyright statement:</b><br>
This material, no matter whether in printed or electronic form, may be used for personal and non-commercial educational
use only. Any reproduction of this manuscript, no matter whether as a whole or in parts, no matter whether in printed
or in electronic form, requires explicit prior acceptance of the authors.

In [30]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
import optuna
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.metrics import log_loss
from sklearn.preprocessing import MultiLabelBinarizer

import os
import pandas as pd
import numpy as np
import copy

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

from rdkit import RDLogger  
RDLogger.DisableLog('rdApp.*') 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.metrics import pairwise_distances
from sklearn.cluster import AgglomerativeClustering

from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

print(rdkit.__version__)

2022.09.5


In [2]:
# Preprocessed Tox21 dataset with pre-assigned clusters
data = pd.read_csv("data_train.csv",index_col=0).reset_index(drop=True)
data

Unnamed: 0,smiles,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,CC(=O)N(C)c1cccc(-c2ccnc3c(C(=O)c4cccs4)cnn23)c1,0,0,0,0,0,0,0,-1,0,0,0
1,COc1cc(N)c(Cl)cc1C(=O)OCCCN1CCCCC1.Cl,0,0,0,0,0,0,0,-1,0,0,0
2,CCCCNc1c(C(=O)OCC)cnc2c1cnn2CC,0,0,0,0,0,0,0,0,0,1,0
3,C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1.Cl,0,0,0,0,0,0,0,-1,0,0,1
4,CC1OC2(CCCCC2Oc2cccc(Cl)c2)N=C1O,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
11995,CC(C)(C)NC[C@@H](O)COc1nsnc1N1CCOCC1,0,0,0,0,0,0,0,-1,0,0,0
11996,CCC[C@@]1(CCc2ccccc2)CC(O)=C([C@H](CC)c2cccc(N...,0,0,0,0,0,0,0,-1,0,0,0
11997,N=C(O)c1cnc(C2CC2)[nH]1,0,0,0,-1,0,0,0,0,0,0,0
11998,CN=C=O,0,0,0,0,0,0,-1,0,0,0,0


<h2 style="color:rgb(0,120,170)">Data preprocessing</h2>

In order to use the dataset for training a model we replace the missing values with `-1`.

In [3]:
# Select labels, convert to numpy array
y = data[data.columns[1:]].to_numpy()
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [9]:
y.shape

(12000, 11)

Next, we calculate Morgan fingerprints from the Smiles string for each sample.

In [4]:
# Initialize variables
fp_length = 1024
fps = np.zeros((len(data), fp_length))

# Calculate Morgan fingerprints and convert to numpy array
for i, smiles in enumerate(tqdm(data['smiles'])):
    mol = Chem.MolFromSmiles(smiles)
    fp_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=fp_length)
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_vec, arr)
    fps[i] = arr

100%|██████████| 12000/12000 [00:07<00:00, 1574.34it/s]


In [11]:
fps

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

<h1 style="color:rgb(0,120,170)">Train model on random split</h1>

In [11]:
train_fps, val_fps, y_train, y_val = train_test_split(fps, y,
                                                      test_size=0.2, random_state=1234)


In [12]:
def objective(trial):
    # Optimize the hyperparameters using Optuna
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.2, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    lgb_train = lgb.Dataset(train_fps, label=y_train[:, 0])
    lgb_val = lgb.Dataset(val_fps, label=y_val[:, 0])

    num_round = 100
    lgb_model = lgb.train(params, lgb_train, num_round, valid_sets=[lgb_val],
                          early_stopping_rounds=10, verbose_eval=False)

    # Return the binary_logloss from validation set as the objective to minimize
    return lgb_model.best_score["valid_0"]["binary_logloss"]


if __name__ == "__main__":
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50)

    print("Best trial:")
    trial = study.best_trial
    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # Train the model with the best hyperparameters
    best_params = trial.params
    best_params.update({"objective": "binary", "metric": "binary_logloss"})
    lgb_train = lgb.Dataset(train_fps, label=y_train[:, 0])
    lgb_val = lgb.Dataset(val_fps, label=y_val[:, 0])
    num_round = 100

    lgb_model = lgb.train(best_params, lgb_train, num_round, valid_sets=[lgb_train, lgb_val],
                          early_stopping_rounds=10)


[32m[I 2023-04-10 16:03:38,839][0m A new study created in memory with name: no-name-52325624-0587-4a0d-b5ff-541906d70321[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:39,352][0m Trial 0 finished with value: 0.15828870814679985 and parameters: {'num_leaves': 14, 'learning_rate': 0.00041691523050076107, 'feature_fraction': 0.7697668301022773, 'bagging_fraction': 0.7328152528063749, 'bagging_freq': 5, 'min_child_samples': 89}. Best is trial 0 with value: 0.15828870814679985.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:39,914][0m Trial 1 finished with value: 0.15353160616368425 and parameters: {'num_leaves': 24, 'learning_rate': 0.00042792163785502834, 'feature_fraction': 0.8550502363116506, 'bagging_fraction': 0.47650529772600936, 'bagging_freq': 6, 'min_child_samples': 12}. Best is trial 1 with value: 0.15353160616368425.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:40,781][0m Trial 2 finished with value: 0.1554987951463129 and parameters: {'num_leaves': 62, 'learning_rate': 0.0005456315611098472, 'feature_fraction': 0.4172094633527854, 'bagging_fraction': 0.7366541429839353, 'bagging_freq': 7, 'min_child_samples': 83}. Best is trial 1 with value: 0.15353160616368425.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:41,557][0m Trial 3 finished with value: 0.13972348527770923 and parameters: {'num_leaves': 67, 'learning_rate': 0.0010224113938887183, 'feature_fraction': 0.7552169527412081, 'bagging_fraction': 0.7490607160405172, 'bagging_freq': 6, 'min_child_samples': 42}. Best is trial 3 with value: 0.13972348527770923.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:42,217][0m Trial 4 finished with value: 0.11458489466799475 and parameters: {'num_leaves': 34, 'learning_rate': 0.0030167655409689942, 'feature_fraction': 0.48723498361498635, 'bagging_fraction': 0.7228428441574727, 'bagging_freq': 1, 'min_child_samples': 62}. Best is trial 4 with value: 0.11458489466799475.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:42,706][0m Trial 5 finished with value: 0.11265610883266913 and parameters: {'num_leaves': 37, 'learning_rate': 0.002579738760624978, 'feature_fraction': 0.682911301254642, 'bagging_fraction': 0.4134989399558773, 'bagging_freq': 2, 'min_child_samples': 14}. Best is trial 5 with value: 0.11265610883266913.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:43,526][0m Trial 6 finished with value: 0.05667429623602807 and parameters: {'num_leaves': 76, 'learning_rate': 0.01575263709318802, 'feature_fraction': 0.5505094100818214, 'bagging_fraction': 0.9619942129954822, 'bagging_freq': 1, 'min_child_samples': 60}. Best is trial 6 with value: 0.05667429623602807.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:44,142][0m Trial 7 finished with value: 0.042267052643796474 and parameters: {'num_leaves': 76, 'learning_rate': 0.030919612603084005, 'feature_fraction': 0.7838984203280872, 'bagging_fraction': 0.7379818943901397, 'bagging_freq': 3, 'min_child_samples': 20}. Best is trial 7 with value: 0.042267052643796474.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:44,732][0m Trial 8 finished with value: 0.05712517229689763 and parameters: {'num_leaves': 75, 'learning_rate': 0.016492243483559356, 'feature_fraction': 0.9964328239929403, 'bagging_fraction': 0.8496160182632138, 'bagging_freq': 6, 'min_child_samples': 62}. Best is trial 7 with value: 0.042267052643796474.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:45,290][0m Trial 9 finished with value: 0.15272325227201555 and parameters: {'num_leaves': 81, 'learning_rate': 0.0005967761058727991, 'feature_fraction': 0.6770631884567345, 'bagging_fraction': 0.5461107603244645, 'bagging_freq': 3, 'min_child_samples': 48}. Best is trial 7 with value: 0.042267052643796474.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:45,684][0m Trial 10 finished with value: 0.047647700656292674 and parameters: {'num_leaves': 99, 'learning_rate': 0.19912047475384628, 'feature_fraction': 0.883668353980444, 'bagging_fraction': 0.5869105440757193, 'bagging_freq': 4, 'min_child_samples': 32}. Best is trial 7 with value: 0.042267052643796474.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:46,038][0m Trial 11 finished with value: 0.04711186555966518 and parameters: {'num_leaves': 100, 'learning_rate': 0.17222230320375245, 'feature_fraction': 0.8942664545113046, 'bagging_fraction': 0.5831365757516519, 'bagging_freq': 4, 'min_child_samples': 30}. Best is trial 7 with value: 0.042267052643796474.[0m
[32m[I 2023-04-10 16:03:46,359][0m Trial 12 finished with value: 0.04573796851241448 and parameters: {'num_leaves': 99, 'learning_rate': 0.15983672327752205, 'feature_fraction': 0.9047714190143895, 'bagging_fraction': 0.6220066853531836, 'bagging_freq': 3, 'min_child_samples': 27}. Best is trial 7 with value: 0.042267052643796474.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570
[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:46,922][0m Trial 13 finished with value: 0.042414353958880546 and parameters: {'num_leaves': 89, 'learning_rate': 0.0378012873864016, 'feature_fraction': 0.950073814145337, 'bagging_fraction': 0.6393300256173694, 'bagging_freq': 3, 'min_child_samples': 25}. Best is trial 7 with value: 0.042267052643796474.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:47,327][0m Trial 14 finished with value: 0.16526103956397448 and parameters: {'num_leaves': 49, 'learning_rate': 0.00011739003015105583, 'feature_fraction': 0.9585409797505818, 'bagging_fraction': 0.6581890180731815, 'bagging_freq': 3, 'min_child_samples': 5}. Best is trial 7 with value: 0.042267052643796474.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:47,909][0m Trial 15 finished with value: 0.042627445851485875 and parameters: {'num_leaves': 87, 'learning_rate': 0.03745800014199516, 'feature_fraction': 0.808592091982965, 'bagging_fraction': 0.8256932745158593, 'bagging_freq': 2, 'min_child_samples': 23}. Best is trial 7 with value: 0.042267052643796474.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:48,374][0m Trial 16 finished with value: 0.041213579114061 and parameters: {'num_leaves': 54, 'learning_rate': 0.04399133587526455, 'feature_fraction': 0.9874834831481893, 'bagging_fraction': 0.6665414225921092, 'bagging_freq': 2, 'min_child_samples': 39}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:48,769][0m Trial 17 finished with value: 0.08106387699093927 and parameters: {'num_leaves': 44, 'learning_rate': 0.007132802415947579, 'feature_fraction': 0.824760559191716, 'bagging_fraction': 0.6700960157258209, 'bagging_freq': 2, 'min_child_samples': 42}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:49,182][0m Trial 18 finished with value: 0.04396862478474068 and parameters: {'num_leaves': 57, 'learning_rate': 0.0613760723073035, 'feature_fraction': 0.980947785346699, 'bagging_fraction': 0.5294250282947607, 'bagging_freq': 4, 'min_child_samples': 78}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:49,511][0m Trial 19 finished with value: 0.04248684930256256 and parameters: {'num_leaves': 67, 'learning_rate': 0.0701717669335135, 'feature_fraction': 0.6123783635070883, 'bagging_fraction': 0.8129679036428293, 'bagging_freq': 1, 'min_child_samples': 37}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:49,963][0m Trial 20 finished with value: 0.07316532147529235 and parameters: {'num_leaves': 53, 'learning_rate': 0.011675957698670789, 'feature_fraction': 0.7308055457514926, 'bagging_fraction': 0.6815949192243704, 'bagging_freq': 2, 'min_child_samples': 99}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:50,511][0m Trial 21 finished with value: 0.043640978492507886 and parameters: {'num_leaves': 89, 'learning_rate': 0.03139927992286344, 'feature_fraction': 0.9255269269705545, 'bagging_fraction': 0.6392867202235271, 'bagging_freq': 3, 'min_child_samples': 19}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:51,098][0m Trial 22 finished with value: 0.044746610939897756 and parameters: {'num_leaves': 88, 'learning_rate': 0.037726044921004835, 'feature_fraction': 0.9483393810160006, 'bagging_fraction': 0.6288129755508269, 'bagging_freq': 4, 'min_child_samples': 54}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:51,442][0m Trial 23 finished with value: 0.04850768544003736 and parameters: {'num_leaves': 70, 'learning_rate': 0.07801703783771641, 'feature_fraction': 0.9990528074408153, 'bagging_fraction': 0.705886208710587, 'bagging_freq': 3, 'min_child_samples': 9}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:51,908][0m Trial 24 finished with value: 0.04562968956202185 and parameters: {'num_leaves': 60, 'learning_rate': 0.02263393770868149, 'feature_fraction': 0.8551105530530294, 'bagging_fraction': 0.7739865095133951, 'bagging_freq': 2, 'min_child_samples': 17}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:52,444][0m Trial 25 finished with value: 0.07191136535195404 and parameters: {'num_leaves': 82, 'learning_rate': 0.008956352397669322, 'feature_fraction': 0.931653987259896, 'bagging_fraction': 0.6810126680576883, 'bagging_freq': 3, 'min_child_samples': 34}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:53,000][0m Trial 26 finished with value: 0.04481485661216057 and parameters: {'num_leaves': 93, 'learning_rate': 0.024205530882975838, 'feature_fraction': 0.9433331045510012, 'bagging_fraction': 0.7725695835559533, 'bagging_freq': 5, 'min_child_samples': 20}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:53,418][0m Trial 27 finished with value: 0.08188243453732998 and parameters: {'num_leaves': 45, 'learning_rate': 0.007131250218173555, 'feature_fraction': 0.8071363331784382, 'bagging_fraction': 0.6024754399719239, 'bagging_freq': 2, 'min_child_samples': 40}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:53,835][0m Trial 28 finished with value: 0.04295219059594939 and parameters: {'num_leaves': 77, 'learning_rate': 0.04991265298353865, 'feature_fraction': 0.8743664662114481, 'bagging_fraction': 0.6966416711378127, 'bagging_freq': 5, 'min_child_samples': 25}. Best is trial 16 with value: 0.041213579114061.[0m
[32m[I 2023-04-10 16:03:54,075][0m Trial 29 finished with value: 0.042806362956725755 and parameters: {'num_leaves': 16, 'learning_rate': 0.09291826450333436, 'feature_fraction': 0.7794589077521958, 'bagging_fraction': 0.6420463862588077, 'bagging_freq': 4, 'min_child_samples': 47}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570
[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:54,559][0m Trial 30 finished with value: 0.051491344011527745 and parameters: {'num_leaves': 71, 'learning_rate': 0.022982013169381456, 'feature_fraction': 0.902371897825401, 'bagging_fraction': 0.7184575892694547, 'bagging_freq': 1, 'min_child_samples': 72}. Best is trial 16 with value: 0.041213579114061.[0m
[32m[I 2023-04-10 16:03:54,874][0m Trial 31 finished with value: 0.04288777499386921 and parameters: {'num_leaves': 65, 'learning_rate': 0.08916677111024225, 'feature_fraction': 0.6303873351588948, 'bagging_fraction': 0.7952270491217175, 'bagging_freq': 1, 'min_child_samples': 36}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570
[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:55,373][0m Trial 32 finished with value: 0.042977045661251524 and parameters: {'num_leaves': 82, 'learning_rate': 0.04694461453435874, 'feature_fraction': 0.8452511299013505, 'bagging_fraction': 0.8562942849463608, 'bagging_freq': 1, 'min_child_samples': 38}. Best is trial 16 with value: 0.041213579114061.[0m
[32m[I 2023-04-10 16:03:55,683][0m Trial 33 finished with value: 0.043132059868418386 and parameters: {'num_leaves': 54, 'learning_rate': 0.0964235022272825, 'feature_fraction': 0.9647251353388534, 'bagging_fraction': 0.7456284410613755, 'bagging_freq': 2, 'min_child_samples': 28}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570
[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:56,163][0m Trial 34 finished with value: 0.04366499091851932 and parameters: {'num_leaves': 63, 'learning_rate': 0.03517848984096878, 'feature_fraction': 0.6227690695932991, 'bagging_fraction': 0.6981777109832703, 'bagging_freq': 1, 'min_child_samples': 48}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:56,534][0m Trial 35 finished with value: 0.04367218882280537 and parameters: {'num_leaves': 71, 'learning_rate': 0.06499181058691289, 'feature_fraction': 0.7714418486903029, 'bagging_fraction': 0.7377211447471573, 'bagging_freq': 3, 'min_child_samples': 11}. Best is trial 16 with value: 0.041213579114061.[0m
[32m[I 2023-04-10 16:03:56,813][0m Trial 36 finished with value: 0.04481318496021476 and parameters: {'num_leaves': 34, 'learning_rate': 0.11647665584269473, 'feature_fraction': 0.7152785779943672, 'bagging_fraction': 0.8019615173065532, 'bagging_freq': 2, 'min_child_samples': 52}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570
[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:57,285][0m Trial 37 finished with value: 0.04411241208786553 and parameters: {'num_leaves': 93, 'learning_rate': 0.05277718646245115, 'feature_fraction': 0.8593812691329593, 'bagging_fraction': 0.8781374780020267, 'bagging_freq': 7, 'min_child_samples': 23}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:57,773][0m Trial 38 finished with value: 0.05600258036850766 and parameters: {'num_leaves': 78, 'learning_rate': 0.013284631248551731, 'feature_fraction': 0.9227715885664639, 'bagging_fraction': 0.7267513815719528, 'bagging_freq': 1, 'min_child_samples': 16}. Best is trial 16 with value: 0.041213579114061.[0m
[32m[I 2023-04-10 16:03:58,081][0m Trial 39 finished with value: 0.043953038986850546 and parameters: {'num_leaves': 67, 'learning_rate': 0.12916819476137464, 'feature_fraction': 0.9754638817526016, 'bagging_fraction': 0.6630422320695671, 'bagging_freq': 2, 'min_child_samples': 35}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570
[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:58,458][0m Trial 40 finished with value: 0.04190628665777745 and parameters: {'num_leaves': 58, 'learning_rate': 0.07102720453864919, 'feature_fraction': 0.7549448974365621, 'bagging_fraction': 0.7512034341021296, 'bagging_freq': 3, 'min_child_samples': 45}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:58,898][0m Trial 41 finished with value: 0.041852950840467175 and parameters: {'num_leaves': 59, 'learning_rate': 0.055380632145357864, 'feature_fraction': 0.7625897681469107, 'bagging_fraction': 0.7490533948036616, 'bagging_freq': 3, 'min_child_samples': 45}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:59,305][0m Trial 42 finished with value: 0.04445800230116795 and parameters: {'num_leaves': 40, 'learning_rate': 0.03217636727658807, 'feature_fraction': 0.7547714218930719, 'bagging_fraction': 0.7617750757577768, 'bagging_freq': 3, 'min_child_samples': 58}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:03:59,762][0m Trial 43 finished with value: 0.05136871776525247 and parameters: {'num_leaves': 58, 'learning_rate': 0.020072868616861892, 'feature_fraction': 0.7991416958699786, 'bagging_fraction': 0.7170731897663603, 'bagging_freq': 4, 'min_child_samples': 45}. Best is trial 16 with value: 0.041213579114061.[0m
[32m[I 2023-04-10 16:04:00,070][0m Trial 44 finished with value: 0.04758875624563104 and parameters: {'num_leaves': 51, 'learning_rate': 0.14281343647075273, 'feature_fraction': 0.7540971907537349, 'bagging_fraction': 0.7600512477908132, 'bagging_freq': 3, 'min_child_samples': 54}. Best is trial 16 with value: 0.041213579114061.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570
[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:04:00,457][0m Trial 45 finished with value: 0.04106002329049448 and parameters: {'num_leaves': 47, 'learning_rate': 0.04920031160221542, 'feature_fraction': 0.8277746159959851, 'bagging_fraction': 0.6846852351069285, 'bagging_freq': 3, 'min_child_samples': 31}. Best is trial 45 with value: 0.04106002329049448.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:04:00,801][0m Trial 46 finished with value: 0.042280385258540505 and parameters: {'num_leaves': 47, 'learning_rate': 0.05744446437430618, 'feature_fraction': 0.8305107601883294, 'bagging_fraction': 0.6995089933573692, 'bagging_freq': 4, 'min_child_samples': 30}. Best is trial 45 with value: 0.04106002329049448.[0m
[32m[I 2023-04-10 16:04:01,031][0m Trial 47 finished with value: 0.04463498231299974 and parameters: {'num_leaves': 26, 'learning_rate': 0.1991629982394159, 'feature_fraction': 0.7334774420943122, 'bagging_fraction': 0.7431859934616132, 'bagging_freq': 3, 'min_child_samples': 68}. Best is trial 45 with value: 0.04106002329049448.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:04:01,313][0m Trial 48 finished with value: 0.044197956887882674 and parameters: {'num_leaves': 39, 'learning_rate': 0.11293228367441789, 'feature_fraction': 0.7818961213652978, 'bagging_fraction': 0.7833934259498239, 'bagging_freq': 4, 'min_child_samples': 43}. Best is trial 45 with value: 0.04106002329049448.[0m


[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570
[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570


[32m[I 2023-04-10 16:04:01,706][0m Trial 49 finished with value: 0.053576460390093804 and parameters: {'num_leaves': 43, 'learning_rate': 0.016990225967076274, 'feature_fraction': 0.7011707668540861, 'bagging_fraction': 0.6788417455320028, 'bagging_freq': 3, 'min_child_samples': 31}. Best is trial 45 with value: 0.04106002329049448.[0m


Best trial:
  Value: 0.04106002329049448
  Params: 
    num_leaves: 47
    learning_rate: 0.04920031160221542
    feature_fraction: 0.8277746159959851
    bagging_fraction: 0.6846852351069285
    bagging_freq: 3
    min_child_samples: 31
[LightGBM] [Info] Number of positive: 363, number of negative: 9237
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2048
[LightGBM] [Info] Number of data points in the train set: 9600, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037812 -> initscore=-3.236570
[LightGBM] [Info] Start training from score -3.236570
[1]	training's binary_logloss: 0.136293	valid_1's binary_logloss: 0.145274
Training until validation scores don't improve for 10 rounds
[2]	training's binary_logloss: 0.124141	valid_1's binary_logloss: 0.132846
[3]	training's binary_logloss: 0.11577	valid_1's binary_logloss: 0.124135
[4]	training's binary_log

### Testing calibration

In [31]:
def objective(trial):
    # Optimize the hyperparameters using Optuna
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.2, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    num_round = 100

    mlb = MultiLabelBinarizer()
    y_train_binarized = mlb.fit_transform(y_train)
    y_val_binarized = mlb.transform(y_val)

    logloss_list = []

    for i in range(y_train_binarized.shape[1]):
        lgb_model = LGBMClassifier(n_estimators=num_round, **params)
        calibrated_lgb_model = CalibratedClassifierCV(
            lgb_model, method="sigmoid", cv=3)
        calibrated_lgb_model.fit(train_fps, y_train_binarized[:, i])

        # Return the binary_logloss from validation set as the objective to minimize
        val_preds = calibrated_lgb_model.predict_proba(val_fps)
        logloss = log_loss(y_val_binarized[:, i], val_preds)
        logloss_list.append(logloss)

    return np.mean(logloss_list)


### Testing

In [32]:
test_data = pd.read_csv("smiles_test.csv", index_col=0).reset_index(drop=True)
test_data


Unnamed: 0,smiles
0,OC(COc1ccc(Cl)cc1)=N[C@H]1CC[C@H](N=C(O)COc2cc...
1,CCCO/N=C(/C)c1cc(C(O)=NC(Cc2cc(F)cc(F)c2)[C@@H...
2,COc1cc(Cl)ccc1Cl
3,COc1cc(C(O)=NCc2ccc(OCCN(C)C)cc2)cc(OC)c1OC
4,CCC(=O)O[C@@]1(C(=O)CCl)[C@@H](C)C[C@H]2[C@@H]...
...,...
5891,N#Cc1cc(NC(=O)C(=O)O)c(Cl)c(NC(=O)C(=O)O)c1.NC...
5892,O=c1cccc2n1C[C@@H]1CNC[C@H]2C1
5893,CSCC[C@H](N=C(O)[C@H](Cc1ccccc1)N=C(O)CN=C(O)C...
5894,CCn1cc2c3c(cc(C(O)=NC(Cc4ccccc4)[C@H](O)C[NH2+...


In [33]:
fp_length = 1024

test_fps = np.zeros((len(test_data), fp_length))
for i, smiles in enumerate(test_data['smiles']):
    mol = Chem.MolFromSmiles(smiles)
    fp_vec = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius=3, nBits=fp_length)
    arr = np.zeros((1,))
    Chem.DataStructs.ConvertToNumpyArray(fp_vec, arr)
    test_fps[i] = arr


In [34]:
new_probs = lgb_model.predict(test_fps)


In [35]:
n_tasks = y_train.shape[1]
test_probs = np.empty((test_fps.shape[0], n_tasks))

for j in range(n_tasks):
    lgb_test = lgb.Dataset(test_fps)
    test_probs[:, j] = lgb_model.predict(test_fps)


In [36]:
probs = pd.DataFrame(test_probs, columns=[
                     "task1", "task2", "task3", "task4", "task5", "task6", "task7", "task8", "task9", "task10", "task11"])
probs.index = test_data.index

probs.to_csv("test_predictions_4.csv")


We can also look at the predicted number of samples for each class.

In [108]:
unique, counts = np.unique(y_hats_class, return_counts=True)
dict(zip(unique, counts))

{0.0: 26028, 1.0: 372}

<h1 style="color:rgb(0,120,170)">Metrics</h1>

To determine the quality of the model we look at several metrics. When calculating metrics we need to remove predictions for missing values as there's no way to measure the quality of these predictions.

<h2 style="color:rgb(0,120,170)">Confusion Matrix, Precision, Recall, F1-score</h2>

Lets look at these metrics (or methods) for the first task.

In [109]:
task = 0
# Mask out unknown samples
idx = (y_test[:, task] != (-1))

### Confusion Matrix

In [110]:
cm = confusion_matrix(y_test[idx,task], y_hats_class[idx,task])
cm

array([[2188,    1],
       [  13,   86]], dtype=int64)

In [111]:
# True Negatives, False Positives, False Negatives, True Positives
cm.ravel()

array([2188,    1,   13,   86], dtype=int64)

### Precision, Recall and F1-Score

- The **precision** is the ratio $\frac{TP}{TP + FP}$ where TP is the number of true positives and FP the number of false positives. The precision is intuitively the ability of the classifier to not label negative samples as positive.

- The **recall** is the ratio $\frac{TP}{TP + FN}$ where TP is the number of true positives and FN the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.

- The **F1-score** can be interpreted as a weighted harmonic mean of the precision and recall.

In [112]:
print(classification_report(y_test[idx,task],y_hats_class[idx,task], target_names=["class 0", "class 1"]))

              precision    recall  f1-score   support

     class 0       0.99      1.00      1.00      2189
     class 1       0.99      0.87      0.92        99

    accuracy                           0.99      2288
   macro avg       0.99      0.93      0.96      2288
weighted avg       0.99      0.99      0.99      2288



<h2 style="color:rgb(0,120,170)">Area under the ROC curve (AUC)</h2>

Next, we calculate the AUC for each task and the mean over all tasks.

In [90]:
def calc_masked_AUC_per_task(prediction, target):
    auc_per_task = []
    for j in range(target.shape[1]):
        y_score = prediction[:, j]
        y_true = target[:, j]
        # Mask out unknown samples
        idx = (y_true != (-1))
        # Calculate AUC per task
        auc_per_task.append(roc_auc_score(y_true[idx], y_score[idx]))
    return auc_per_task

In [113]:
# Calculate AUC per task
auc_per_task = calc_masked_AUC_per_task(y_hats_proba, y_test)
auc_per_task

[0.9998223440434495,
 0.5305144112321529,
 0.6112104995296186,
 0.955122591943958,
 0.5633698958429347,
 0.5469617140850018,
 0.6722359040829553,
 0.6896354484441732,
 0.7863222707526398,
 0.7882040752210403,
 0.650561797752809]

In [114]:
np.mean(auc_per_task)

0.7085419048118848

### Testing predictions locally

In [115]:
target = data.iloc[:1000,1:]
target = (target +1)/2
target[target==0.5] = -1
target 

Unnamed: 0,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,1.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...
995,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
996,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,0.0,-1.0,-1.0
997,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
998,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,-1.0,0.0,-1.0,-1.0


In [126]:
target = data.iloc[:, 1:]
target = (target + 1)/2
target[target == 0.5] = -1
target


Unnamed: 0,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,1.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...
11995,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
11996,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
11997,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
11998,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0


In [130]:
target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   task1   12000 non-null  float64
 1   task2   12000 non-null  float64
 2   task3   12000 non-null  float64
 3   task4   12000 non-null  float64
 4   task5   12000 non-null  float64
 5   task6   12000 non-null  float64
 6   task7   12000 non-null  float64
 7   task8   12000 non-null  float64
 8   task9   12000 non-null  float64
 9   task10  12000 non-null  float64
 10  task11  12000 non-null  float64
dtypes: float64(11)
memory usage: 1.0 MB


In [127]:
submission = probs.iloc[:, :]
submission


Unnamed: 0,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,0.00,0.010000,0.012500,0.00,0.00,0.011429,0.010000,0.015,0.130000,0.060000,0.010
1,0.02,0.036667,0.062619,0.00,0.00,0.275000,0.025000,0.010,0.020000,0.081333,0.005
2,0.59,0.021667,0.015000,0.02,0.01,0.000000,0.000000,0.000,0.015000,0.090000,0.014
3,0.00,0.000000,0.010000,0.00,0.00,0.000000,0.000000,0.000,0.071667,0.010000,0.000
4,0.00,0.000000,0.006667,0.00,0.00,0.000000,0.010000,0.000,0.010000,0.685000,0.000
...,...,...,...,...,...,...,...,...,...,...,...
2395,0.00,0.000000,0.020000,0.00,0.00,0.000000,0.200000,0.000,0.220000,0.040000,0.000
2396,0.01,0.041667,0.050000,0.02,0.00,0.000000,0.033333,0.000,0.090000,0.130000,0.020
2397,0.00,0.010000,0.020000,0.00,0.00,0.000000,0.030000,0.000,0.010000,0.010000,0.000
2398,0.88,0.017500,0.042667,0.00,0.01,0.000000,0.020000,0.010,0.010000,0.040000,0.010


In [131]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   task1   2400 non-null   float64
 1   task2   2400 non-null   float64
 2   task3   2400 non-null   float64
 3   task4   2400 non-null   float64
 4   task5   2400 non-null   float64
 5   task6   2400 non-null   float64
 6   task7   2400 non-null   float64
 7   task8   2400 non-null   float64
 8   task9   2400 non-null   float64
 9   task10  2400 non-null   float64
 10  task11  2400 non-null   float64
dtypes: float64(11)
memory usage: 206.4 KB


In [133]:
auc_per_task = []
for j in range(target.shape[1]):
    y_score = submission.iloc[:, j]
    print(y_score.shape)
    y_true = target.iloc[:, j]
    print(y_true.shape)
        # mask out unknown samples
    idx = (y_true != (-1))
        # calculate AUC per task
    auc_per_task.append(roc_auc_score(y_true[idx], y_score[idx]))
avg_auc = np.mean(auc_per_task)
print(avg_auc)


(2400,)
(12000,)


ValueError: Found input variables with inconsistent numbers of samples: [1023, 212]

<h1 style="color:rgb(0,120,170)">Cluster Cross-Validation</h1>

The previous model was trained with samples randomly assigned to the training and test sets. However, if we want to know how well our model generalizes to future data it might be a better idea to assign the training and test samples based on structural similarity. If we cluster the samples and assign all samples of some clusters to the training set and all samples of the other clusters to the test set we avoid that very similar samples are in the training and test sets.

In [22]:
# We have 5 different cluster folds
data['cluster_folds'].unique()

array([4, 1, 2, 3, 0])

In [56]:
# We can select a list of test-folds here, to save time we only select one
test_folds = [0]

# For each test_fold we train a model on the remaining folds and calculate the AUC on the selected test fold
for test_fold in test_folds:
    # Split data
    X_train, X_test, y_train, y_test = split_data(test_fold, fps, y, data['cluster_folds'])
    y_hats_proba, y_hats_class = train_rf(X_train, y_train, X_test)

    # Calculate mean AUC
    auc_per_task = calc_masked_AUC_per_task(y_hats_proba, y_test)
    print(np.mean(auc_per_task))

100%|██████████| 12/12 [00:24<00:00,  2.08s/it]

0.7419875728081738



