In [101]:
import pickle
import sys
import pandas as pd
from os.path import join
import os
import numpy as np

In [102]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, cohen_kappa_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [103]:
from sklearn.metrics import confusion_matrix

In [104]:
import matplotlib.pyplot as plt

In [122]:
data_folder = "../data/preprocessed/"
X = pd.read_pickle(join(data_folder, "X.pk.zip"))
y = pd.read_pickle(join(data_folder, "Y.pk.zip"))

X_mat = X.values
y_vec = y.values.flatten()
indices = X.index
X_train, X_test, y_train, y_test, i_train, i_test = train_test_split(X_mat, y_vec, indices, test_size=0.11, random_state=123)


In [123]:
X_train.shape

(62932, 203)

In [124]:
sys.stdout.write(f"The number of features: {X_mat.shape[1]}\n")
sys.stdout.write(f"TRAIN \
0: {np.sum(y_train == 0)}, 1: {np.sum(y_train == 1)}, 2: {np.sum(y_train == 2)}\n")

sys.stdout.write(f"TEST  \
0: {np.sum(y_test == 0)}, 1: {np.sum(y_test == 1)}, 2: {np.sum(y_test == 2)}\n")

The number of features: 203
TRAIN 0: 1823, 1: 2509, 2: 58600
TEST  0: 218, 1: 326, 2: 7235


30

# Model

## SKlearn - like

In [125]:
classifiers = {
    "Tree w": DecisionTreeClassifier(class_weight='balanced'),
    "Tree cw": DecisionTreeClassifier(criterion = 'log_loss',
                                     class_weight='balanced'),
    # "LGBMC": LGBMClassifier(class_weight={0: 5, 1: 5, 2: 1}),
    "LGBMC": LGBMClassifier(),
    "LGBMC bo": LGBMClassifier(
        class_weight='balanced',
    objective = 'multiclass')
}

In [10]:
# preprocessor.

In [126]:
preprocessor = StandardScaler()
for cla in classifiers:
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor)
        , ('classifier', classifiers[cla])
    ])
    model = pipeline.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = f1_score(predictions, y_test, average=None)
    score2 = calc_custom_kappa(y_test, predictions)
    print(f'Model:{cla}; score:{score2}; all_score:{score}')

Model:Tree w; score:0.006499323611470209; all_score:[0.01716738 0.06       0.92634797]
Model:Tree cw; score:0.017006835590186675; all_score:[0.0373444  0.07952872 0.92741543]
Model:LGBMC; score:0.016836367508788697; all_score:[0.01769912 0.         0.96367877]
Model:LGBMC bo; score:0.052556191508088435; all_score:[0.0669145  0.07746479 0.85800015]


## lgb-like

In [127]:
def kappa_metric(dy_true, dy_pred):
    """An eval metric that always returns the same value"""
    metric_name = 'constant_metric'
    preds = dy_pred.reshape(3, -1).T
    preds = preds.argmax(axis = 1)
    value = cohen_kappa_score(dy_true, dy_pred)
    is_higher_better = True
    return metric_name, value, is_higher_better

evals_result = {}


def calc_custom_kappa(actuals, preds, n=3):
    # from here
    # https://www.kaggle.com/code/aroraaman/quadratic-kappa-metric-explained-in-5-simple-steps/notebook
    # but later it's better to move to something like that
    # https://www.kaggle.com/code/tsmith26/lightgbm-custom-objective-for-weighted-kappa
    conf_m = confusion_matrix(actuals, preds)
    w = np.zeros((n, n))
    for i_ in range(len(w)):
        for j_ in range(len(w)):
            w[i_][j_] = float(((i_ - j_) ** 2) / (n - 1) ** 2)

    act_hist = np.zeros([n])
    for item in actuals:
        act_hist[item] += 1

    pred_hist = np.zeros([n])
    for item in preds:
        pred_hist[item] += 1
    e_mat = np.outer(act_hist, pred_hist)

    e_mat = e_mat / e_mat.sum()
    conf_m = conf_m / conf_m.sum()

    num = 0
    den = 0
    for i_ in range(len(w)):
        for j_ in range(len(w)):
            num += w[i_][j_] * conf_m[i_][j_]
            den += w[i_][j_] * e_mat[i_][j_]

    weighted_kappa = (1 - (num / den))
    return weighted_kappa




def evalerror(preds, dtrain):

    labels = dtrain.get_label()
    preds = preds.reshape(3, -1).T
    preds = preds.argmax(axis = 1)
    f_score = f1_score(labels , preds,  average = 'weighted')
    return 'f1_score', f_score, True

In [128]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)
train_data = lgb.Dataset(X_train_sc, label=y_train)
validation_data = lgb.Dataset(X_test_sc, label=y_test)

In [129]:
params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'multiclassova',
    'metric': 'multi_logloss',
    'num_class': 3,
    'is_unbalance': True
}



In [130]:
scores = []
bst = lgb.train(params, train_data, 750)  # , feval=kappa_metric)
y_predicted = bst.predict(X_test_sc)
ypr = [np.argmax(line) for line in y_predicted]
k_score = cohen_kappa_score(y_test, ypr)
kk_score = calc_custom_kappa(y_test, ypr)
f1_scores = f1_score(y_test, ypr, average=None)
s = np.append(f1_scores, k_score)
s = np.append(s, kk_score)
scores.append(s)

[LightGBM] [Info] Number of positive: 1823, number of negative: 61109
[LightGBM] [Info] Number of positive: 2509, number of negative: 60423
[LightGBM] [Info] Number of positive: 58600, number of negative: 4332
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19126
[LightGBM] [Info] Number of data points in the train set: 62932, number of used features: 196
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028968 -> initscore=-3.512176
[LightGBM] [Info] Start training from score -3.512176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039868 -> initscore=-3.181486
[LightGBM] [Info] Start training from score -3.181486
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.931164 -> initscore=2.604705
[LightGBM] [Info] Start training from score 2.604705


In [131]:
calc_custom_kappa(y_test, ypr)
    

0.0707856784303984

In [132]:
scores = np.array(scores)
print(scores)

[[0.07936508 0.07048458 0.89146669 0.04171179 0.07078568]]


### Check what is wrongly predicted

In [133]:
df_smiles = pd.read_csv("../data/raw/train.csv", index_col=0)

In [134]:
df_smiles.loc['EOS2088',:].values[0]

'Cc1ccc(C(=O)NC2CCCC2)cc1S(=O)(=O)N1CCOCC1'

In [135]:
j = 0
i = 0

true_target = 0


while j < 30:
    diff = train_data.get_label()[i] - ypr[i]
    if np.abs(diff) > 0.5 and true_target == int(train_data.get_label()[i]):
        molid = i_train[i]
        smiles = df_smiles.loc[molid, :].values[0]
        print(f'{smiles}\t{ypr[i]}')
        j+=1
    i+=1

COc1ccc(Nc2nc(N3CCN(S(C)(=O)=O)CC3)nc3ccccc23)cc1	2
COc1ccc(N2CCCn3c2nc2c3c(=O)n(CCC(C)C)c(=O)n2C)cc1	2
CCC(C)(C)N1CCN(C(=O)N[C@@H](C)c2ccc(-n3ccnn3)cc2)CC1	2
CCN(Cc1ccc(OC)c(F)c1)S(=O)(=O)c1ccc2c(c1)CCC(=O)N2	2
CCOc1ccc(NC(=O)CN2C(=O)NC3(CCC(C)CC3)C2=O)cc1	2
Cc1[nH]c(=O)[nH]c1C(=O)N1Cc2sccc2CC1(C)C	2
Cc1ccc(C)c(C(=O)C2=C(O)C(=O)N(CCN3CCOCC3)[C@H]2c2ccco2)c1	1
Cl.N[C@@H]1CCN(C(=O)CCCOc2ccc(F)cc2)C1	2
Cc1cc(NC(=O)CN2CCN(S(=O)(=O)Cc3ccccc3)CC2)n(C)n1	2
C[C@H]1CCCCN1Cc1ccccc1CNC(=O)Cn1ccc(=O)[nH]c1=O	2
Cc1nn(C)c(C)c1S(=O)(=O)N(C)[C@@H](C)c1ccc(F)cc1	2
COc1cccc2c1CN(C(=O)NC[C@@H]1CC(=O)N(C)[C@H]1c1cnn(C)c1C)C2	2
CC(=O)N1C[C@H](O)C[C@@H]1C(=O)Nc1ccc(OCC2CC2)cc1	2
COCCn1c(N)c(C(=O)CN2CCN(c3ccc(O)cc3)CC2)c(=O)[nH]c1=O	2
COc1ccccc1NCCCn1c(C)cn2c3c(=O)[nH]c(=O)n(C)c3nc12	2
CC1(C)C(=O)Nc2ccc(C(=O)N[C@H]3CCCn4ncnc43)cc21	2
COc1cccc([C@@H]2CCCN2C(=O)C2CCCC2)c1	2
CS(=O)(=O)c1nc(C(=O)Nc2ccc3ccccc3n2)c2ccccn12	1
CCOc1ccc(-c2nc3ccccc3c(=O)o2)cc1	2
Cc1ccc(CN(C(=O)C[S@](=O)c2ccc(C)cn2)C2CC2)cc1	2
O=C(NC