In [None]:
!pip install catboost BorutaShap

In [None]:
import pandas as pd
import numpy as np
import sklearn
import warnings
warnings.filterwarnings('ignore') # Suppress every warning (including runtime and deprecation warnings) for the whole session.

# Load Data

Data from https://github.com/cosylabiiit/bittersweet

Dragon3D data

In [None]:
# Load the four raw Dragon 3D tables. Every file is tab-separated; the
# "model-data" files become the *_train frames and the "gold-standard"
# files become the *_test frames.
def _read_tsv(path):
    """Read one tab-separated file into a DataFrame."""
    return pd.read_csv(path, sep='\t')

sweet_train = _read_tsv('manuscript - experiments/sweet-data/model-data-dragon_3d.tsv')
sweet_test = _read_tsv('manuscript - experiments/sweet-data/gold-standard-dragon_3d.tsv')
bitter_train = _read_tsv('manuscript - experiments/bitter-data/model-data-dragon_3d.tsv')
bitter_test = _read_tsv('manuscript - experiments/bitter-data/gold-standard-dragon_3d.tsv')

In [None]:
# Show (rows, columns) of each raw table: bitter train/test, then sweet train/test.
for raw_table in (bitter_train, bitter_test, sweet_train, sweet_test):
    print(raw_table.shape)

(2135, 5096)
(171, 5097)
(2090, 5096)
(154, 5097)


In [None]:
# Give all four tables the same label/structure column names:
# 'taste' becomes 'Target' and 'smiles' becomes 'SMILES' (renamed in place).
for table in (sweet_train, sweet_test, bitter_train, bitter_test):
    table.rename(columns={'taste': 'Target', 'smiles': 'SMILES'}, inplace=True)

In [None]:
# Non-null count per column, followed by the rows (if any) whose SMILES is missing.
print(sweet_train.count())
sweet_train.loc[sweet_train['SMILES'].isnull()]

name            1993
orig_taste      2090
reference       2090
SMILES          2090
can-smiles      2090
                ... 
CATS3D_15_LL    2090
CATS3D_16_LL    2090
CATS3D_17_LL    2090
CATS3D_18_LL    2090
CATS3D_19_LL    2090
Length: 5096, dtype: int64


Unnamed: 0,name,orig_taste,reference,SMILES,can-smiles,Target,orig_idx,ITH,ISH,HIC,...,CATS3D_10_LL,CATS3D_11_LL,CATS3D_12_LL,CATS3D_13_LL,CATS3D_14_LL,CATS3D_15_LL,CATS3D_16_LL,CATS3D_17_LL,CATS3D_18_LL,CATS3D_19_LL


In [None]:
# Non-null count per column, followed by the rows (if any) whose SMILES is missing.
print(sweet_test.count())
sweet_test.loc[sweet_test['SMILES'].isnull()]

name            154
orig_taste      154
reference       154
SMILES          154
can-smiles      154
               ... 
CATS3D_15_LL    154
CATS3D_16_LL    154
CATS3D_17_LL    154
CATS3D_18_LL    154
CATS3D_19_LL    154
Length: 5097, dtype: int64


Unnamed: 0,name,orig_taste,reference,SMILES,can-smiles,In Bitter Domain,Target,orig_idx,ITH,ISH,...,CATS3D_10_LL,CATS3D_11_LL,CATS3D_12_LL,CATS3D_13_LL,CATS3D_14_LL,CATS3D_15_LL,CATS3D_16_LL,CATS3D_17_LL,CATS3D_18_LL,CATS3D_19_LL


In [None]:
# Non-null count per column, followed by the rows (if any) whose SMILES is missing.
print(bitter_train.count())
bitter_train.loc[bitter_train['SMILES'].isnull()]

name            2057
orig_taste      2135
reference       2135
SMILES          2135
can-smiles      2135
                ... 
CATS3D_15_LL    2135
CATS3D_16_LL    2135
CATS3D_17_LL    2135
CATS3D_18_LL    2135
CATS3D_19_LL    2135
Length: 5096, dtype: int64


Unnamed: 0,name,orig_taste,reference,SMILES,can-smiles,Target,orig_idx,ITH,ISH,HIC,...,CATS3D_10_LL,CATS3D_11_LL,CATS3D_12_LL,CATS3D_13_LL,CATS3D_14_LL,CATS3D_15_LL,CATS3D_16_LL,CATS3D_17_LL,CATS3D_18_LL,CATS3D_19_LL


In [None]:
# Non-null count per column, followed by the rows (if any) whose SMILES is missing.
print(bitter_test.count())
bitter_test.loc[bitter_test['SMILES'].isnull()]

name            115
orig_taste      171
reference       171
SMILES          171
can-smiles      171
               ... 
CATS3D_15_LL    171
CATS3D_16_LL    171
CATS3D_17_LL    171
CATS3D_18_LL    171
CATS3D_19_LL    171
Length: 5097, dtype: int64


Unnamed: 0,name,orig_taste,reference,SMILES,can-smiles,In Bitter Domain,Target,orig_idx,ITH,ISH,...,CATS3D_10_LL,CATS3D_11_LL,CATS3D_12_LL,CATS3D_13_LL,CATS3D_14_LL,CATS3D_15_LL,CATS3D_16_LL,CATS3D_17_LL,CATS3D_18_LL,CATS3D_19_LL


In [None]:
# Store the sweet labels as integers in both the training and the test table.
for table in (sweet_train, sweet_test):
    table['Target'] = table['Target'].astype(int)

In [None]:
# Store the bitter labels as integers in both the training and the test table.
for table in (bitter_train, bitter_test):
    table['Target'] = table['Target'].astype(int)

In [None]:
# Columns that identify a molecule instead of describing it. 'Target' is not
# listed, so it remains in both resulting frames. The test table carries one
# extra bookkeeping column ('In Bitter Domain').
sweet_meta_cols = ['name', 'orig_taste', 'reference', 'SMILES', 'can-smiles', 'orig_idx']
sweet_train_keep = sweet_train.columns.difference(sweet_meta_cols)
sweet_test_keep = sweet_test.columns.difference(sweet_meta_cols + ['In Bitter Domain'])

df_sweet_features_train = sweet_train[sweet_train_keep]
df_sweet_features_test = sweet_test[sweet_test_keep]


In [None]:
# List the distinct values of the 'reference' column in the bitter test table.
bitter_test_reference = bitter_test['reference']
bitter_test_reference.unique()

array(['Phyto-Dictionary', 'Bitter-New', 'UNIMI'], dtype=object)

In [None]:

# Columns that identify a molecule instead of describing it. 'Target' is not
# listed, so it remains in every frame built here. The test table carries one
# extra bookkeeping column ('In Bitter Domain').
bitter_meta_cols = ['name', 'orig_taste', 'reference', 'SMILES', 'can-smiles', 'orig_idx']
bitter_test_meta_cols = bitter_meta_cols + ['In Bitter Domain']

df_bitter_features_train = bitter_train[bitter_train.columns.difference(bitter_meta_cols)]

# Split the bitter test table into one frame per value of 'reference'.
df_bitter_test_phyto = bitter_test[bitter_test['reference'] == 'Phyto-Dictionary']
df_bitter_test_bitternew = bitter_test[bitter_test['reference'] == 'Bitter-New']
df_bitter_test_unimi = bitter_test[bitter_test['reference'] == 'UNIMI']

# Drop the metadata columns from each subset, always using that subset's OWN
# column index (the Bitter-New line used to borrow the Phyto-Dictionary one).
df_bitter_test_phyto = df_bitter_test_phyto[df_bitter_test_phyto.columns.difference(bitter_test_meta_cols)]
df_bitter_test_bitternew = df_bitter_test_bitternew[df_bitter_test_bitternew.columns.difference(bitter_test_meta_cols)]
df_bitter_test_unimi = df_bitter_test_unimi[df_bitter_test_unimi.columns.difference(bitter_test_meta_cols)]



In [None]:
# Show (rows, columns) after the metadata columns were removed.
for feature_table in (
    df_bitter_features_train,
    df_bitter_test_phyto,
    df_sweet_features_train,
    df_sweet_features_test,
):
    print(feature_table.shape)

(2135, 5090)
(88, 5090)
(2090, 5090)
(154, 5090)


In [None]:
# Remove the label column so that only descriptor columns remain.
# The sweet and bitter training frames are rebound to the label-free version.
df_sweet_features_train = df_sweet_features_train.drop(columns='Target')
df_sweet_features_test = df_sweet_features_test.drop(columns='Target')
df_bitter_features_train = df_bitter_features_train.drop(columns='Target')

# The three bitter test subsets keep 'Target' in df_bitter_test_*; the
# label-free copies get new df_bitter_features_test_* names.
df_bitter_features_test_pytho = df_bitter_test_phyto.drop(columns='Target')
df_bitter_features_test_bitternew = df_bitter_test_bitternew.drop(columns='Target')
df_bitter_features_test_unimi = df_bitter_test_unimi.drop(columns='Target')



In [None]:
def _coerce_numeric(features):
    """Convert every column of ``features`` to a numeric dtype.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``).

    Returns
    -------
    (numeric, mask)
        ``numeric`` is the converted frame; ``mask`` is a boolean frame of the
        same shape that is True wherever ``numeric`` holds a value (not NaN).
    """
    numeric = features.apply(lambda column: pd.to_numeric(column, errors='coerce'))
    return numeric, numeric.notna()


# The previous version called pd.to_numeric but threw the result away, then
# blanked non-numeric cells with a per-element isinstance() scan. Keeping the
# coerced frame does the conversion in one vectorised pass. The mask_* names
# are still defined in case other cells read them.
# Behaviour differences: strings that parse as numbers are now converted
# instead of being replaced by NaN, and the masks are False for NaN cells.
df_sweet_features_train, mask_sweet_train = _coerce_numeric(df_sweet_features_train)
df_sweet_features_test, mask_sweet_test = _coerce_numeric(df_sweet_features_test)

df_bitter_features_train, mask_bitter_train = _coerce_numeric(df_bitter_features_train)

# As before, mask_bitter_test is overwritten for each test subset and ends up
# describing the Bitter-New subset.
df_bitter_features_test_pytho, mask_bitter_test = _coerce_numeric(df_bitter_features_test_pytho)
df_bitter_features_test_unimi, mask_bitter_test = _coerce_numeric(df_bitter_features_test_unimi)
df_bitter_features_test_bitternew, mask_bitter_test = _coerce_numeric(df_bitter_features_test_bitternew)


In [None]:
# Pair each feature frame (X_*) with a one-column 'Target' frame (y_*).

# Sweet task.
X_train_sweet, y_train_sweet = df_sweet_features_train, sweet_train[['Target']]
X_test_sweet, y_test_sweet = df_sweet_features_test, sweet_test[['Target']]

# Bitter task: one training set ...
X_train_bitter, y_train_bitter = df_bitter_features_train, bitter_train[['Target']]

# ... and one test set per reference source.
X_test_bitter_pytho, y_test_bitter_pytho = df_bitter_features_test_pytho, df_bitter_test_phyto[['Target']]
X_test_bitter_bitternew, y_test_bitter_bitternew = df_bitter_features_test_bitternew, df_bitter_test_bitternew[['Target']]
X_test_bitter_unimi, y_test_bitter_unimi = df_bitter_features_test_unimi, df_bitter_test_unimi[['Target']]


In [None]:
# Show (rows, columns) of the label-free feature frames.
for feature_table in (
    df_bitter_features_train,
    X_test_bitter_bitternew,
    df_sweet_features_train,
    df_sweet_features_test,
):
    print(feature_table.shape)

(2135, 5089)
(27, 5089)
(2090, 5089)
(154, 5089)


In [None]:
# Summarise the sweet training data: size, missing values and class balance.
print("Total number of features: " + str(X_train_sweet.shape[1]))
print("Total number of samples: " + str(X_train_sweet.shape[0]))
print("To check if there are any null values in the features and how many: ",
      X_train_sweet.isnull().values.any(), X_train_sweet.isnull().sum().sum())
print("To check if there are any null values in the target and how many:",
      y_train_sweet.isnull().values.any(), y_train_sweet.isnull().sum().sum())
# Count the rows of each class with a boolean sum. The earlier `.count()[0]`
# looked up a label-indexed Series with a positional integer key, which newer
# pandas versions deprecate.
# NOTE(review): Target == 0 is reported as "Bitter"; in the sweet data it may
# simply mean "not sweet" - confirm against the data source.
print("The total number of Sweet targets: " + str((y_train_sweet.Target == 1).sum()))
print("The total number of Bitter targets: " + str((y_train_sweet.Target == 0).sum()))


Total number of features: 5089
Total number of samples: 2090
To check if there are any null values in the features and how many:  False 0
To check if there are any null values in the target and how many: False 0
The total number of Sweet targets: 1126
The total number of Bitter targets: 964


In [None]:
# Summarise the bitter training data: size, missing values and class balance.
print("Total number of features: " + str(X_train_bitter.shape[1]))
print("Total number of samples: " + str(X_train_bitter.shape[0]))
print("To check if there are any null values in the features and how many: ",
      X_train_bitter.isnull().values.any(), X_train_bitter.isnull().sum().sum())
print("To check if there are any null values in the target and how many:",
      y_train_bitter.isnull().values.any(), y_train_bitter.isnull().sum().sum())
# Count the rows of each class with a boolean sum. The earlier `.count()[0]`
# looked up a label-indexed Series with a positional integer key, which newer
# pandas versions deprecate.
# Target here is the renamed 'taste' column of the bitter data set, so the
# positive class is reported as "Bitter" (it used to be printed as "Sweet").
# NOTE(review): assumes Target == 1 means bitter and 0 means non-bitter - confirm.
print("The total number of Bitter targets: " + str((y_train_bitter.Target == 1).sum()))
print("The total number of Non-bitter targets: " + str((y_train_bitter.Target == 0).sum()))


Total number of features: 5089
Total number of samples: 2135
To check if there are any null values in the features and how many:  False 0
To check if there are any null values in the target and how many: False 0
The total number of Sweet targets: 728
The total number of Bitter targets: 1407


# Preprocess 


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Sweet task: fit the scaler on the training features only, then apply the
# same fitted scaler to both the training and the test features.
scaler_sweet = MinMaxScaler().fit(X_train_sweet)
X_train_sweet_scaled = scaler_sweet.transform(X_train_sweet)
X_test_sweet_scaled = scaler_sweet.transform(X_test_sweet)

# Bitter task: one scaler fitted on the training features and reused for
# each of the three test subsets.
scaler_bitter = MinMaxScaler().fit(X_train_bitter)
X_train_bitter_scaled = scaler_bitter.transform(X_train_bitter)
X_test_bitter_scaled_pytho = scaler_bitter.transform(X_test_bitter_pytho)
X_test_bitter_scaled_bitternew = scaler_bitter.transform(X_test_bitter_bitternew)
X_test_bitter_scaled_unimi = scaler_bitter.transform(X_test_bitter_unimi)


In [None]:
# Put the scaled values back into DataFrames carrying the original column labels.
# NOTE(review): no row index is passed, so each new frame gets a fresh default
# index; the bitter test y_* frames still carry their original row labels.
s_cols, st_cols = X_train_sweet.columns, X_test_sweet.columns
X_train_sweet_scaled = pd.DataFrame(X_train_sweet_scaled, columns=s_cols)
X_test_sweet_scaled = pd.DataFrame(X_test_sweet_scaled, columns=st_cols)

# All bitter frames (train and the three test subsets) are labelled with the
# training columns.
b_cols = bt_cols = X_train_bitter.columns
X_train_bitter_scaled = pd.DataFrame(X_train_bitter_scaled, columns=b_cols)
X_test_bitter_scaled_pytho = pd.DataFrame(X_test_bitter_scaled_pytho, columns=bt_cols)
X_test_bitter_scaled_unimi = pd.DataFrame(X_test_bitter_scaled_unimi, columns=bt_cols)
X_test_bitter_scaled_bitternew = pd.DataFrame(X_test_bitter_scaled_bitternew, columns=bt_cols)


In [None]:
# Summarise the scaled sweet training data: size, missing values and class balance.
print("Total number of features: " + str(X_train_sweet_scaled.shape[1]))
print("Total number of samples: " + str(X_train_sweet_scaled.shape[0]))
print("To check if there are any null values in the features and how many: ",
      X_train_sweet_scaled.isnull().values.any(), X_train_sweet_scaled.isnull().sum().sum())
print("To check if there are any null values in the target and how many:",
      y_train_sweet.isnull().values.any(), y_train_sweet.isnull().sum().sum())
# Count the rows of each class with a boolean sum. The earlier `.count()[0]`
# looked up a label-indexed Series with a positional integer key, which newer
# pandas versions deprecate.
# NOTE(review): Target == 0 is reported as "Bitter"; in the sweet data it may
# simply mean "not sweet" - confirm against the data source.
print("The total number of Sweet targets: " + str((y_train_sweet.Target == 1).sum()))
print("The total number of Bitter targets: " + str((y_train_sweet.Target == 0).sum()))


Total number of features: 5089
Total number of samples: 2090
To check if there are any null values in the features and how many:  False 0
To check if there are any null values in the target and how many: False 0
The total number of Sweet targets: 1126
The total number of Bitter targets: 964


In [None]:
# Summarise the scaled bitter training data: size, missing values and class balance.
print("Total number of features: " + str(X_train_bitter_scaled.shape[1]))
print("Total number of samples: " + str(X_train_bitter_scaled.shape[0]))
print("To check if there are any null values in the features and how many: ",
      X_train_bitter_scaled.isnull().values.any(), X_train_bitter_scaled.isnull().sum().sum())
print("To check if there are any null values in the target and how many:",
      y_train_bitter.isnull().values.any(), y_train_bitter.isnull().sum().sum())
# Count the rows of each class with a boolean sum. The earlier `.count()[0]`
# looked up a label-indexed Series with a positional integer key, which newer
# pandas versions deprecate.
# Target here is the renamed 'taste' column of the bitter data set, so the
# positive class is reported as "Bitter" (it used to be printed as "Sweet").
# NOTE(review): assumes Target == 1 means bitter and 0 means non-bitter - confirm.
print("The total number of Bitter targets: " + str((y_train_bitter.Target == 1).sum()))
print("The total number of Non-bitter targets: " + str((y_train_bitter.Target == 0).sum()))


Total number of features: 5089
Total number of samples: 2135
To check if there are any null values in the features and how many:  False 0
To check if there are any null values in the target and how many: False 0
The total number of Sweet targets: 728
The total number of Bitter targets: 1407


In [None]:
# Strip the characters '[', ']' and '<' from every feature name.
# NOTE(review): presumably needed because one of the model libraries rejects
# these characters in column names - confirm.
import string


def _drop_bracket_chars(names):
    """Return a list of ``names`` with all '[', ']' and '<' characters removed."""
    banned = ('[', ']', '<')
    cleaned = []
    for name in names:
        cleaned.append(''.join(ch for ch in name if ch not in banned))
    return cleaned


# Sweet train / test.
s_cols = X_train_sweet_scaled.columns
s_list = _drop_bracket_chars(s_cols)
st_cols = X_test_sweet_scaled.columns
st_list = _drop_bracket_chars(st_cols)

# Bitter train / test (the Phyto-Dictionary frame supplies the test names).
b_cols = X_train_bitter_scaled.columns
b_list = _drop_bracket_chars(b_cols)
bt_cols = X_test_bitter_scaled_pytho.columns
bt_list = _drop_bracket_chars(bt_cols)



In [None]:
# Install the cleaned names on every scaled frame; the three bitter test
# subsets all receive the same cleaned list.
for frame, clean_names in (
    (X_train_sweet_scaled, s_list),
    (X_test_sweet_scaled, st_list),
    (X_train_bitter_scaled, b_list),
    (X_test_bitter_scaled_pytho, bt_list),
    (X_test_bitter_scaled_bitternew, bt_list),
    (X_test_bitter_scaled_unimi, bt_list),
):
    frame.columns = clean_names

# Model Training

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, average_precision_score , roc_auc_score , classification_report, f1_score, recall_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
def get_metrics(model, X_test, y_test):
    """Print classification metrics of a fitted model on a test set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
        Column 1 of ``predict_proba`` is used as the score for the ranking
        metrics (average precision and AUROC).
    X_test : test features passed to the model.
    y_test : true labels. Specificity is computed on the logical negation of
        labels and predictions, so labels are presumably 0/1 - confirm.

    Returns
    -------
    None. Every metric is printed.
    """
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1]

    print("Accuracy:",accuracy_score(y_test,y_pred))
    print("Precision:",precision_score(y_test,y_pred))
    print("Average Precision:",average_precision_score(y_test,y_pred_proba))
    try:
        print("AUROC:",roc_auc_score(y_test,y_pred_proba))
    except ValueError:
        # Only the expected failure (a single class in y_true) is handled
        # here; anything else, including KeyboardInterrupt, now propagates
        # instead of being swallowed by a bare `except`.
        print("AUROC: N/A (Only 1 class in y_true)")
    print("Sensitivity", recall_score(y_test, y_pred))
    # Specificity is the recall of the negative class: negate both vectors so
    # that the negatives become the class being scored.
    print("Specificity", recall_score(np.logical_not(y_test),np.logical_not(y_pred)))
    print("F1-Score", f1_score(y_test, y_pred))
    print("Classification Report")
    print(classification_report(y_test, y_pred))
    print('-----------------\r\n')
    
def evaluate(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data, then print its test-set metrics.

    The model is fitted in place; nothing is returned.
    """
    model.fit(X_train, y_train)
    get_metrics(model=model, X_test=X_test, y_test=y_test)

def evaluate_all(models, X_train, y_train, X_test, y_test):
    """Fit and report every model in ``models`` on the same train/test split.

    ``models`` is an iterable of dicts with a display ``'name'`` and an
    estimator under ``'model'``. Each estimator is fitted anew on every call.
    """
    for entry in models:
        print("Model: {}".format(entry['name']))
        estimator = entry['model']
        evaluate(estimator, X_train, y_train, X_test, y_test)

In [None]:
# Sweet-task classifiers trained on the full feature set, with fixed
# hyper-parameters.
clf_xgboost_sweet = XGBClassifier(
    colsample_bytree=0.3,
    learning_rate=0.01,
    max_depth=10,
    n_estimators=1000
)
clf_catboost_sweet = CatBoostClassifier(
    depth=6,
    iterations=250, 
    learning_rate=0.2, 
    logging_level='Silent'
)
clf_rf_sweet = RandomForestClassifier(
    bootstrap=False,
    max_depth=None,
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is
    # deprecated and rejected by newer scikit-learn releases.
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=10,
    n_estimators=400,
    # Fixed seed so that repeated fits build the same forest.
    random_state=42
)
# Each entry pairs an estimator with the label printed by evaluate_all.
sweet_models_full = [
    {'model': clf_xgboost_sweet, 'name': 'XGBoost (No Feature Selection)'},
    {'model': clf_catboost_sweet, 'name': 'CatBoost (No Feature Selection)'},
    {'model': clf_rf_sweet, 'name': 'Random Forest (No Feature Selection)'},
]

# Sweet Classifier (No Feature Selection)

In [None]:
# Fit every model in sweet_models_full on the scaled sweet training set and
# print its metrics on the scaled sweet test set.
evaluate_all(
    models=sweet_models_full,
    X_train=X_train_sweet_scaled,
    y_train=y_train_sweet,
    X_test=X_test_sweet_scaled,
    y_test=y_test_sweet,
)

Model: XGBoost (No Feature Selection)
Accuracy: 0.8051948051948052
Precision: 0.9120879120879121
Average Precision: 0.9462987062479548
AUROC: 0.8752186588921282
Sensitivity 0.7904761904761904
Specificity 0.8367346938775511
F1-Score 0.8469387755102041
Classification Report
              precision    recall  f1-score   support

           0       0.65      0.84      0.73        49
           1       0.91      0.79      0.85       105

    accuracy                           0.81       154
   macro avg       0.78      0.81      0.79       154
weighted avg       0.83      0.81      0.81       154

-----------------

Model: CatBoost (No Feature Selection)
Accuracy: 0.7792207792207793
Precision: 0.8901098901098901
Average Precision: 0.9443855366176224
AUROC: 0.8744412050534498
Sensitivity 0.7714285714285715
Specificity 0.7959183673469388
F1-Score 0.8265306122448981
Classification Report
              precision    recall  f1-score   support

           0       0.62      0.80      0.70        4

# Bitter Classifier (No Feature Selection)

In [None]:
# Bitter-task classifiers trained on the full feature set, with fixed
# hyper-parameters.
clf_xgboost_bitter = XGBClassifier(
    colsample_bytree=0.3,
    learning_rate=0.01,
    max_depth=10,
    n_estimators=500
)
clf_catboost_bitter = CatBoostClassifier(
    depth=10,
    iterations=350, 
    learning_rate=0.1, 
    logging_level='Silent'
)
clf_rf_bitter = RandomForestClassifier(
    bootstrap=False,
    max_depth=100,
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is
    # deprecated and rejected by newer scikit-learn releases.
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=400,
    # Fixed seed so that repeated fits build the same forest.
    random_state=42
)

# Each entry pairs an estimator with the label printed by evaluate_all.
bitter_models_full = [
    {'model': clf_xgboost_bitter, 'name': 'XGBoost (No Feature Selection)'},
    {'model': clf_catboost_bitter, 'name': 'CatBoost (No Feature Selection)'},
    {'model': clf_rf_bitter, 'name': 'Random Forest (No Feature Selection)'},
]

Phyto-Dictionary

In [None]:
# Fit every model in bitter_models_full on the scaled bitter training set and
# print its metrics on the Phyto-Dictionary test subset.
evaluate_all(
    models=bitter_models_full,
    X_train=X_train_bitter_scaled,
    y_train=y_train_bitter,
    X_test=X_test_bitter_scaled_pytho,
    y_test=y_test_bitter_pytho,
)

Model: XGBoost (No Feature Selection)
Accuracy: 0.875
Precision: 0.9074074074074074
Average Precision: 0.9600489958949988
AUROC: 0.9294765840220386
Sensitivity 0.8909090909090909
Specificity 0.8484848484848485
F1-Score 0.8990825688073394
Classification Report
              precision    recall  f1-score   support

           0       0.82      0.85      0.84        33
           1       0.91      0.89      0.90        55

    accuracy                           0.88        88
   macro avg       0.87      0.87      0.87        88
weighted avg       0.88      0.88      0.88        88

-----------------

Model: CatBoost (No Feature Selection)
Accuracy: 0.8863636363636364
Precision: 0.9245283018867925
Average Precision: 0.9644389389061261
AUROC: 0.9349862258953168
Sensitivity 0.8909090909090909
Specificity 0.8787878787878788
F1-Score 0.9074074074074073
Classification Report
              precision    recall  f1-score   support

           0       0.83      0.88      0.85        33
           


Bitter-New

In [None]:
# Fit every model in bitter_models_full on the scaled bitter training set and
# print its metrics on the Bitter-New test subset.
evaluate_all(
    models=bitter_models_full,
    X_train=X_train_bitter_scaled,
    y_train=y_train_bitter,
    X_test=X_test_bitter_scaled_bitternew,
    y_test=y_test_bitter_bitternew,
)

Model: XGBoost (No Feature Selection)
Accuracy: 0.5925925925925926
Precision: 1.0
Average Precision: 1.0
AUROC: N/A (Only 1 class in y_true)
Sensitivity 0.5925925925925926
Specificity 0.0
F1-Score 0.7441860465116279
Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.59      0.74        27

    accuracy                           0.59        27
   macro avg       0.50      0.30      0.37        27
weighted avg       1.00      0.59      0.74        27

-----------------

Model: CatBoost (No Feature Selection)
Accuracy: 0.5925925925925926
Precision: 1.0
Average Precision: 1.0
AUROC: N/A (Only 1 class in y_true)
Sensitivity 0.5925925925925926
Specificity 0.0
F1-Score 0.7441860465116279
Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.59      0.74        27

    accuracy 

UNIMI

In [None]:
# Fit every model in bitter_models_full on the scaled bitter training set and
# print its metrics on the UNIMI test subset.
evaluate_all(
    models=bitter_models_full,
    X_train=X_train_bitter_scaled,
    y_train=y_train_bitter,
    X_test=X_test_bitter_scaled_unimi,
    y_test=y_test_bitter_unimi,
)

Model: XGBoost (No Feature Selection)
Accuracy: 0.8392857142857143
Precision: 0.8181818181818182
Average Precision: 0.8629804203273626
AUROC: 0.8893280632411067
Sensitivity 0.782608695652174
Specificity 0.8787878787878788
F1-Score 0.8
Classification Report
              precision    recall  f1-score   support

           0       0.85      0.88      0.87        33
           1       0.82      0.78      0.80        23

    accuracy                           0.84        56
   macro avg       0.84      0.83      0.83        56
weighted avg       0.84      0.84      0.84        56

-----------------

Model: CatBoost (No Feature Selection)
Accuracy: 0.6964285714285714
Precision: 0.625
Average Precision: 0.8006447136837629
AUROC: 0.8405797101449275
Sensitivity 0.6521739130434783
Specificity 0.7272727272727273
F1-Score 0.6382978723404256
Classification Report
              precision    recall  f1-score   support

           0       0.75      0.73      0.74        33
           1       0.62    

# Sweet Classifier (Boruta Shap)

In [None]:
from BorutaShap import BorutaShap

In [None]:
# Fresh sweet-task classifiers for the BorutaShap-selected features, with the
# same fixed hyper-parameters as the no-feature-selection models.
clf_xgboost_sweet_ba = XGBClassifier(
    colsample_bytree=0.3,
    learning_rate=0.01,
    max_depth=10,
    n_estimators=1000
)
clf_catboost_sweet_ba = CatBoostClassifier(
    depth=6,
    iterations=250, 
    learning_rate=0.2, 
    logging_level='Silent'
)
clf_rf_sweet_ba = RandomForestClassifier(
    bootstrap=False,
    max_depth=None,
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is
    # deprecated and rejected by newer scikit-learn releases.
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=10,
    n_estimators=400,
    # Fixed seed so that repeated fits build the same forest.
    random_state=42
)
# NOTE: this rebinds sweet_models_full, which from here on refers to the
# BorutaShap model list rather than the no-feature-selection one.
sweet_models_full = [
    {'model': clf_xgboost_sweet_ba, 'name': 'XGBoost (BorutaShap)'},
    {'model': clf_catboost_sweet_ba, 'name': 'CatBoost (BorutaShap)'},
    {'model': clf_rf_sweet_ba, 'name': 'Random Forest (BorutaShap)'},
]

In [None]:
# Fresh bitter-task classifiers for the BorutaShap-selected features, with the
# same fixed hyper-parameters as the no-feature-selection models.
clf_xgboost_bitter_ba = XGBClassifier(
    colsample_bytree=0.3,
    learning_rate=0.01,
    max_depth=10,
    n_estimators=500
)
clf_catboost_bitter_ba = CatBoostClassifier(
    depth=10,
    iterations=350, 
    learning_rate=0.1, 
    logging_level='Silent'
)
clf_rf_bitter_ba = RandomForestClassifier(
    bootstrap=False,
    max_depth=100,
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is
    # deprecated and rejected by newer scikit-learn releases.
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=400,
    # Fixed seed so that repeated fits build the same forest.
    random_state=42
)

# NOTE: this rebinds bitter_models_full, which from here on refers to the
# BorutaShap model list rather than the no-feature-selection one.
bitter_models_full = [
    {'model': clf_xgboost_bitter_ba, 'name': 'XGBoost (BorutaShap)'},
    {'model': clf_catboost_bitter_ba, 'name': 'CatBoost (BorutaShap)'},
    {'model': clf_rf_bitter_ba, 'name': 'Random Forest (BorutaShap)'},
]

In [None]:
# Feature selector for the sweet task: classification mode, SHAP importances.
sweet_selector = BorutaShap(
    classification=True,
    importance_measure='shap',
)

In [None]:
# Run the selector on the scaled sweet training data with a fixed seed.
sweet_fit_options = dict(n_trials=500, sample=False, verbose=True, random_state=42)
sweet_selector.fit(
    X=X_train_sweet_scaled,
    y=y_train_sweet['Target'],
    **sweet_fit_options
)

  0%|          | 0/500 [00:00<?, ?it/s]

275 attributes confirmed important: ['Mor17p', 'CATS2D_07_DL', 'F03C-N', 'SM13_EA(bo)', 'Chi_D/Dt', 'SpMin3_Bh(v)', 'LOC', 'SM5_B(p)', 'SpMax_B(v)', 'SpMax_AEA(bo)', 'Mor14v', 'SpMax_B(m)', 'SM11_EA(bo)', 'TDB01i', 'MATS4e', 'Hy', 'MWC05', 'GATS4e', 'CATS2D_05_DN', 'ATSC4i', 'SpPosA_G/D', 'H4s', 'Eig01_AEA(ed)', 'Eig04_AEA(ed)', 'SM5_H2', 'TDB05s', 'MWC06', 'RDF010s', 'J_Dt', 'SM5_B(v)', 'SM08_EA(bo)', 'SM4_B(v)', 'ATS3v', 'Eig01_AEA(ri)', 'R5s+', 'SpDiam_H2', 'CATS2D_07_AL', 'Eig01_AEA(dm)', 'SM11_EA(ed)', 'SM6_H2', 'Eig01_EA(ed)', 'P_VSA_LogP_5', 'HATS6v', 'MATS6p', 'SpMAD_B(i)', 'SM14_EA(ri)', 'piPC07', 'SM6_B(s)', 'RBF', 'SpMaxA_D/Dt', 'Mor14p', 'CATS3D_03_DL', 'SpDiam_AEA(ed)', 'SpMax_B(p)', 'ATSC2e', 'SpAD_Dz(m)', 'MATS2e', 'AVS_B(i)', 'RDF020m', 'SpMax_B(s)', 'ATSC4s', 'SM08_AEA(bo)', 'SM15_EA(ed)', 'RDF020u', 'CATS2D_09_AL', 'SM6_L', 'ATSC5i', 'J_G', 'SM06_EA(bo)', 'SM6_B(p)', 'SpDiam_EA(ri)', 'ATS3s', 'F03C-O', 'AVS_B(v)', 'VR2_B(p)', 'B01C-N', 'ATS6s', 'Eig03_EA(bo)', 'VE1_B(

In [None]:
# Take the column labels of the selector's subset and use them to restrict
# the scaled sweet training and test frames to the same features.
X_sweet_subset = sweet_selector.Subset()
X_sweet_subset_cols = X_sweet_subset.columns

X_train_sweet_subset, X_test_sweet_subset = (
    scaled[X_sweet_subset_cols]
    for scaled in (X_train_sweet_scaled, X_test_sweet_scaled)
)

In [None]:
# Write the selected sweet feature names to a CSV file in the working directory.
X_sweet_borutashap = pd.DataFrame(data=X_sweet_subset_cols)
X_sweet_borutashap.to_csv(path_or_buf="X_sweet_borutashap.csv")

In [None]:
# Fit every model in sweet_models_full on the selected sweet features and
# print its metrics on the sweet test set restricted to the same features.
evaluate_all(
    models=sweet_models_full,
    X_train=X_train_sweet_subset,
    y_train=y_train_sweet,
    X_test=X_test_sweet_subset,
    y_test=y_test_sweet,
)

Model: XGBoost (BorutaShap)
Accuracy: 0.7987012987012987
Precision: 0.9111111111111111
Average Precision: 0.9540230290360356
AUROC: 0.8942662779397473
Sensitivity 0.780952380952381
Specificity 0.8367346938775511
F1-Score 0.841025641025641
Classification Report
              precision    recall  f1-score   support

           0       0.64      0.84      0.73        49
           1       0.91      0.78      0.84       105

    accuracy                           0.80       154
   macro avg       0.78      0.81      0.78       154
weighted avg       0.83      0.80      0.80       154

-----------------

Model: CatBoost (BorutaShap)
Accuracy: 0.7727272727272727
Precision: 0.8804347826086957
Average Precision: 0.949677001167498
AUROC: 0.8841593780369291
Sensitivity 0.7714285714285715
Specificity 0.7755102040816326
F1-Score 0.8223350253807107
Classification Report
              precision    recall  f1-score   support

           0       0.61      0.78      0.68        49
           1       0.

# Bitter Classifier (Boruta Shap)



In [None]:
# Feature selector for the bitter task: classification mode, SHAP importances.
bitter_selector = BorutaShap(
    classification=True,
    importance_measure='shap',
)

In [None]:
# Run the selector on the scaled bitter training data with a fixed seed.
bitter_fit_options = dict(n_trials=500, sample=False, verbose=True, random_state=42)
bitter_selector.fit(
    X=X_train_bitter_scaled,
    y=y_train_bitter['Target'],
    **bitter_fit_options
)

  0%|          | 0/500 [00:00<?, ?it/s]

270 attributes confirmed important: ['Eig08_EA(ri)', 'GATS5m', 'SPAM', 'Mv', 'R5p', 'Mor16p', 'ATSC4s', 'P_VSA_ppp_cyc', 'PCD', 'ATSC5v', 'O%', 'SpMax3_Bh(e)', 'EE_B(i)', 'R6m', 'RDF020s', 'AVS_B(i)', 'P_VSA_e_3', 'H0m', 'SsssCH', 'MATS2i', 'SpMAD_EA(bo)', 'Mor15m', 'nO', 'GATS6e', 'RDF040i', 'SpMax1_Bh(m)', 'SpMax3_Bh(v)', 'SM11_AEA(ri)', 'CATS3D_02_AA', 'piPC10', 'SM08_AEA(bo)', 'B03C-N', 'Mor12e', 'ATS4p', 'SpMin1_Bh(p)', 'CATS2D_08_AL', 'SM13_EA(ed)', 'SpMax_EA', 'F01C-O', 'VE3_RG', 'GATS2p', 'P_VSA_ppp_ter', 'TDB04u', 'ATSC5i', 'R5s+', 'CATS2D_09_AL', 'SpMax_EA(ed)', 'WiA_D/Dt', 'ATSC1m', 'SM15_EA(ed)', 'R6v', 'RDF020p', 'R3s+', 'HATS6m', 'Mor03i', 'SpDiam_B(p)', 'GATS6m', 'MAXDN', 'CATS2D_04_AA', 'GATS1p', 'SpMax1_Bh(e)', 'B01C-N', 'SpMax_AEA(ri)', 'SpMax_AEA(ed)', 'ATS5p', 'MATS4e', 'H_Dz(p)', 'Eig06_AEA(bo)', 'SM08_EA(ed)', 'HATS6v', 'TDB03p', 'SpDiam_B(v)', 'ATSC5p', 'SpDiam_B(e)', 'SpMax_L', 'ATSC4p', 'SM5_RG', 'P_VSA_MR_6', 'P_VSA_ppp_A', 'ATSC3v', 'P_VSA_m_3', 'ATSC3p', 'Mi

In [None]:

# Ask the fitted selector for its feature subset.
# NOTE(review): presumably the training frame restricted to the features
# BorutaShap accepted - confirm against the BorutaShap documentation.
X_bitter_subset = bitter_selector.Subset()

In [None]:

# Column labels of the selector's subset.
X_bitter_subset_cols = X_bitter_subset.columns

# Write the selected bitter feature names to a CSV file in the working directory.
X_bitter_borutashap = pd.DataFrame(data=X_bitter_subset_cols)
X_bitter_borutashap.to_csv(path_or_buf="X_bitter_borutashap500.csv")

# Restrict the scaled training frame and the three scaled test subsets to the
# selected features.
(
    X_train_bitter_subset,
    X_test_bitter_phyto_subset,
    X_test_bitter_bitternew_subset,
    X_test_bitter_unimi_subset,
) = (
    scaled[X_bitter_subset_cols]
    for scaled in (
        X_train_bitter_scaled,
        X_test_bitter_scaled_pytho,
        X_test_bitter_scaled_bitternew,
        X_test_bitter_scaled_unimi,
    )
)


Phyto-Dictionary

In [None]:
# Fit every model in bitter_models_full on the selected bitter features and
# print its metrics on the Phyto-Dictionary test subset.
evaluate_all(
    models=bitter_models_full,
    X_train=X_train_bitter_subset,
    y_train=y_train_bitter,
    X_test=X_test_bitter_phyto_subset,
    y_test=y_test_bitter_pytho,
)

Model: XGBoost (BorutaShap)
Accuracy: 0.8977272727272727
Precision: 0.9423076923076923
Average Precision: 0.9611983572363016
AUROC: 0.9272727272727272
Sensitivity 0.8909090909090909
Specificity 0.9090909090909091
F1-Score 0.9158878504672897
Classification Report
              precision    recall  f1-score   support

           0       0.83      0.91      0.87        33
           1       0.94      0.89      0.92        55

    accuracy                           0.90        88
   macro avg       0.89      0.90      0.89        88
weighted avg       0.90      0.90      0.90        88

-----------------

Model: CatBoost (BorutaShap)
Accuracy: 0.8977272727272727
Precision: 0.9259259259259259
Average Precision: 0.969051752074148
AUROC: 0.9393939393939394
Sensitivity 0.9090909090909091
Specificity 0.8787878787878788
F1-Score 0.9174311926605504
Classification Report
              precision    recall  f1-score   support

           0       0.85      0.88      0.87        33
           1       

Bitter-New

In [None]:
# Fit every model in bitter_models_full on the selected bitter features and
# print its metrics on the Bitter-New test subset.
evaluate_all(
    models=bitter_models_full,
    X_train=X_train_bitter_subset,
    y_train=y_train_bitter,
    X_test=X_test_bitter_bitternew_subset,
    y_test=y_test_bitter_bitternew,
)

Model: XGBoost (BorutaShap)
Accuracy: 0.5185185185185185
Precision: 1.0
Average Precision: 1.0
AUROC: N/A (Only 1 class in y_true)
Sensitivity 0.5185185185185185
Specificity 0.0
F1-Score 0.6829268292682926
Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.52      0.68        27

    accuracy                           0.52        27
   macro avg       0.50      0.26      0.34        27
weighted avg       1.00      0.52      0.68        27

-----------------

Model: CatBoost (BorutaShap)
Accuracy: 0.5185185185185185
Precision: 1.0
Average Precision: 1.0
AUROC: N/A (Only 1 class in y_true)
Sensitivity 0.5185185185185185
Specificity 0.0
F1-Score 0.6829268292682926
Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.52      0.68        27

    accuracy                     

UNIMI

In [None]:
# Fit every model in bitter_models_full on the selected bitter features and
# print its metrics on the UNIMI test subset.
evaluate_all(
    models=bitter_models_full,
    X_train=X_train_bitter_subset,
    y_train=y_train_bitter,
    X_test=X_test_bitter_unimi_subset,
    y_test=y_test_bitter_unimi,
)

Model: XGBoost (BorutaShap)
Accuracy: 0.75
Precision: 0.6666666666666666
Average Precision: 0.8208714464839438
AUROC: 0.8656126482213438
Sensitivity 0.782608695652174
Specificity 0.7272727272727273
F1-Score 0.72
Classification Report
              precision    recall  f1-score   support

           0       0.83      0.73      0.77        33
           1       0.67      0.78      0.72        23

    accuracy                           0.75        56
   macro avg       0.75      0.75      0.75        56
weighted avg       0.76      0.75      0.75        56

-----------------

Model: CatBoost (BorutaShap)
Accuracy: 0.7321428571428571
Precision: 0.6428571428571429
Average Precision: 0.7965308820871808
AUROC: 0.8432147562582344
Sensitivity 0.782608695652174
Specificity 0.696969696969697
F1-Score 0.7058823529411765
Classification Report
              precision    recall  f1-score   support

           0       0.82      0.70      0.75        33
           1       0.64      0.78      0.71      