In [None]:
!pip install catboost BorutaShap

In [None]:
!pip install scikit-learn==1.2.0

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd "/content/drive/MyDrive/Colab Notebooks/Machine Learning Projects/sweetboost"

/content/drive/MyDrive/Colab Notebooks/Machine Learning Projects/sweetboost


Data from https://github.com/cosylabiiit/bittersweet

In [4]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore') # To suppress runtime and deprication warnings. 

In [5]:
import sklearn

In [6]:
sklearn.__version__

'1.0.2'

# Load Data

Dragon3D data

In [7]:
# Load raw data.
bitter_train = pd.read_csv('manuscript - experiments/bitter-data/model-data-dragon_3d.tsv', sep='\t')
bitter_test = pd.read_csv('manuscript - experiments/bitter-data/gold-standard-dragon_3d.tsv', sep='\t')
sweet_test = pd.read_csv('manuscript - experiments/sweet-data/gold-standard-dragon_3d.tsv', sep='\t')
sweet_train = pd.read_csv('manuscript - experiments/sweet-data/model-data-dragon_3d.tsv', sep='\t')

In [8]:
print(bitter_train.shape)
print(bitter_test.shape)
print(sweet_train.shape)
print(sweet_test.shape)

(2135, 5096)
(171, 5097)
(2090, 5096)
(154, 5097)


In [9]:
# Rename taste column as Target.
sweet_train.rename(columns={'taste':'Target'}, inplace=True)
sweet_test.rename(columns={'taste':'Target'}, inplace=True)
bitter_train.rename(columns={'taste':'Target'}, inplace=True)
bitter_test.rename(columns={'taste':'Target'}, inplace=True)


sweet_train.rename(columns={'smiles':'SMILES'}, inplace=True)
sweet_test.rename(columns={'smiles':'SMILES'}, inplace=True)
bitter_train.rename(columns={'smiles':'SMILES'}, inplace=True)
bitter_test.rename(columns={'smiles':'SMILES'}, inplace=True)

In [10]:
print(sweet_train.count())
sweet_train[sweet_train['SMILES'].isna()] # check if any of the SMILES is a NaN.

name            1993
orig_taste      2090
reference       2090
SMILES          2090
can-smiles      2090
                ... 
CATS3D_15_LL    2090
CATS3D_16_LL    2090
CATS3D_17_LL    2090
CATS3D_18_LL    2090
CATS3D_19_LL    2090
Length: 5096, dtype: int64


Unnamed: 0,name,orig_taste,reference,SMILES,can-smiles,Target,orig_idx,ITH,ISH,HIC,...,CATS3D_10_LL,CATS3D_11_LL,CATS3D_12_LL,CATS3D_13_LL,CATS3D_14_LL,CATS3D_15_LL,CATS3D_16_LL,CATS3D_17_LL,CATS3D_18_LL,CATS3D_19_LL


In [11]:
print(sweet_test.count())
sweet_test[sweet_test['SMILES'].isna()] # check if any of the SMILES is a NaN.

name            154
orig_taste      154
reference       154
SMILES          154
can-smiles      154
               ... 
CATS3D_15_LL    154
CATS3D_16_LL    154
CATS3D_17_LL    154
CATS3D_18_LL    154
CATS3D_19_LL    154
Length: 5097, dtype: int64


Unnamed: 0,name,orig_taste,reference,SMILES,can-smiles,In Bitter Domain,Target,orig_idx,ITH,ISH,...,CATS3D_10_LL,CATS3D_11_LL,CATS3D_12_LL,CATS3D_13_LL,CATS3D_14_LL,CATS3D_15_LL,CATS3D_16_LL,CATS3D_17_LL,CATS3D_18_LL,CATS3D_19_LL


In [12]:
print(bitter_train.count())
bitter_train[bitter_train['SMILES'].isna()] # check if any of the SMILES is a NaN.

name            2057
orig_taste      2135
reference       2135
SMILES          2135
can-smiles      2135
                ... 
CATS3D_15_LL    2135
CATS3D_16_LL    2135
CATS3D_17_LL    2135
CATS3D_18_LL    2135
CATS3D_19_LL    2135
Length: 5096, dtype: int64


Unnamed: 0,name,orig_taste,reference,SMILES,can-smiles,Target,orig_idx,ITH,ISH,HIC,...,CATS3D_10_LL,CATS3D_11_LL,CATS3D_12_LL,CATS3D_13_LL,CATS3D_14_LL,CATS3D_15_LL,CATS3D_16_LL,CATS3D_17_LL,CATS3D_18_LL,CATS3D_19_LL


In [13]:
print(bitter_test.count())
bitter_test[bitter_test['SMILES'].isna()] # check if any of the SMILES is a NaN.

name            115
orig_taste      171
reference       171
SMILES          171
can-smiles      171
               ... 
CATS3D_15_LL    171
CATS3D_16_LL    171
CATS3D_17_LL    171
CATS3D_18_LL    171
CATS3D_19_LL    171
Length: 5097, dtype: int64


Unnamed: 0,name,orig_taste,reference,SMILES,can-smiles,In Bitter Domain,Target,orig_idx,ITH,ISH,...,CATS3D_10_LL,CATS3D_11_LL,CATS3D_12_LL,CATS3D_13_LL,CATS3D_14_LL,CATS3D_15_LL,CATS3D_16_LL,CATS3D_17_LL,CATS3D_18_LL,CATS3D_19_LL


In [14]:
sweet_train['Target'] = sweet_train['Target'].astype(int) # Convert to numeric
sweet_test['Target'] = sweet_test['Target'].astype(int) # Convert to numeric

In [15]:
bitter_train['Target'] = bitter_train['Target'].astype(int) # Convert to numeric
bitter_test['Target'] = bitter_test['Target'].astype(int) # Convert to numeric

In [16]:
df_sweet_features_train = sweet_train[sweet_train.columns.difference(['name', 'orig_taste','reference','SMILES','can-smiles','orig_idx'])]
df_sweet_features_test = sweet_test[sweet_test.columns.difference(['name', 'orig_taste','reference','SMILES','can-smiles','orig_idx','In Bitter Domain'])]

df_bitter_features_train = bitter_train[bitter_train.columns.difference(['name', 'orig_taste','reference','SMILES','can-smiles','orig_idx'])]
df_bitter_features_test = bitter_test[bitter_test.columns.difference(['name', 'orig_taste','reference','SMILES','can-smiles','orig_idx','In Bitter Domain'])]


In [17]:
print(df_bitter_features_train.shape)
print(df_bitter_features_test.shape)
print(df_sweet_features_train.shape)
print(df_sweet_features_test.shape)

(2135, 5090)
(171, 5090)
(2090, 5090)
(154, 5090)


In [18]:
df_sweet_features_train = df_sweet_features_train.drop('Target', axis=1)
df_sweet_features_test = df_sweet_features_test.drop('Target', axis=1)

df_bitter_features_train = df_bitter_features_train.drop('Target', axis=1)
df_bitter_features_test = df_bitter_features_test.drop('Target', axis=1)

In [19]:
df_sweet_features_train.apply(lambda x: pd.to_numeric(x, errors='coerce') )
mask_sweet_train = df_sweet_features_train.applymap(lambda x: isinstance(x, (int, float)))
df_sweet_features_train = df_sweet_features_train.where(mask_sweet_train)

df_sweet_features_test.apply(lambda x: pd.to_numeric(x, errors='coerce') )
mask_sweet_test = df_sweet_features_test.applymap(lambda x: isinstance(x, (int, float)))
df_sweet_features_test = df_sweet_features_test.where(mask_sweet_test)


df_bitter_features_train.apply(lambda x: pd.to_numeric(x, errors='coerce') )
mask_bitter_train = df_bitter_features_train.applymap(lambda x: isinstance(x, (int, float)))
df_bitter_features_train = df_bitter_features_train.where(mask_bitter_train)


df_bitter_features_test.apply(lambda x: pd.to_numeric(x, errors='coerce') )
mask_bitter_test = df_bitter_features_test.applymap(lambda x: isinstance(x, (int, float)))
df_bitter_features_test = df_bitter_features_test.where(mask_bitter_test)

In [20]:
print(df_bitter_features_train.shape)
print(df_bitter_features_test.shape)
print(df_sweet_features_train.shape)
print(df_sweet_features_test.shape)

(2135, 5089)
(171, 5089)
(2090, 5089)
(154, 5089)


In [21]:
X_train_sweet = df_sweet_features_train
y_train_sweet = sweet_train[['Target']]

X_test_sweet = df_sweet_features_test
y_test_sweet = sweet_test[['Target']]

X_train_bitter = df_bitter_features_train
y_train_bitter = bitter_train[['Target']]

X_test_bitter = df_bitter_features_test
y_test_bitter = bitter_test[['Target']] 

In [22]:
print(df_bitter_features_train.shape)
print(df_bitter_features_test.shape)
print(df_sweet_features_train.shape)
print(df_sweet_features_test.shape)

(2135, 5089)
(171, 5089)
(2090, 5089)
(154, 5089)


In [23]:
# Print some metrics about the data.
print ("Total number of features: " + str(X_train_sweet.shape[1]))
print ("Total number of samples: "+ str(X_train_sweet.shape[0]))
print ("To check if there are any null values in the features and how many: ", \
       X_train_sweet.isnull().values.any(), X_train_sweet.isnull().sum().sum())
print ("To check if there are any null values in the target and how many:", \
       y_train_sweet.isnull().values.any(), y_train_sweet.isnull().sum().sum()) 
print ("The total number of Sweet targets: " + str(y_train_sweet[y_train_sweet.Target == 1].count()[0]))
print ("The total number of Bitter targets: " + str(y_train_sweet[y_train_sweet.Target == 0].count()[0]))


Total number of features: 5089
Total number of samples: 2090
To check if there are any null values in the features and how many:  False 0
To check if there are any null values in the target and how many: False 0
The total number of Sweet targets: 1126
The total number of Bitter targets: 964


In [24]:
# Print some metrics about the data.
print ("Total number of features: " + str(X_train_bitter.shape[1]))
print ("Total number of samples: "+ str(X_train_bitter.shape[0]))
print ("To check if there are any null values in the features and how many: ", \
       X_train_bitter.isnull().values.any(), X_train_bitter.isnull().sum().sum())
print ("To check if there are any null values in the target and how many:", \
       y_train_bitter.isnull().values.any(), y_train_bitter.isnull().sum().sum()) 
print ("The total number of Sweet targets: " + str(y_train_bitter[y_train_bitter.Target == 1].count()[0]))
print ("The total number of Bitter targets: " + str(y_train_bitter[y_train_bitter.Target == 0].count()[0]))


Total number of features: 5089
Total number of samples: 2135
To check if there are any null values in the features and how many:  False 0
To check if there are any null values in the target and how many: False 0
The total number of Sweet targets: 728
The total number of Bitter targets: 1407


# Preprocess 


In [25]:
from sklearn.preprocessing import MinMaxScaler
scaler_sweet = MinMaxScaler()
X_train_sweet_scaled = scaler_sweet.fit_transform(X_train_sweet)
X_test_sweet_scaled = scaler_sweet.transform(X_test_sweet)


scaler_bitter = MinMaxScaler()
X_train_bitter_scaled = scaler_bitter.fit_transform(X_train_bitter)
X_test_bitter_scaled = scaler_bitter.transform(X_test_bitter)


In [26]:
print("sweet train shape")
print(X_train_sweet_scaled.shape)
print("sweet test shape")
print(X_test_sweet_scaled.shape)
print("bitter train shape")
print(X_train_bitter_scaled.shape)
print("bitter test shape")
print(X_test_bitter_scaled.shape)

sweet train shape
(2090, 5089)
sweet test shape
(154, 5089)
bitter train shape
(2135, 5089)
bitter test shape
(171, 5089)


In [27]:
s_cols = X_train_sweet.columns
X_train_sweet_scaled = pd.DataFrame(X_train_sweet_scaled , columns = s_cols)
st_cols = X_test_sweet.columns
X_test_sweet_scaled = pd.DataFrame(X_test_sweet_scaled , columns = st_cols)

b_cols = X_train_bitter.columns
X_train_bitter_scaled = pd.DataFrame(X_train_bitter_scaled , columns = b_cols)
bt_cols = X_train_bitter.columns
X_test_bitter_scaled = pd.DataFrame(X_test_bitter_scaled , columns = bt_cols)


In [28]:
# Print some metrics about the data.
print ("Total number of features: " + str(X_train_sweet_scaled.shape[1]))
print ("Total number of samples: "+ str(X_train_sweet_scaled.shape[0]))
print ("To check if there are any null values in the features and how many: ", \
       X_train_sweet_scaled.isnull().values.any(), X_train_sweet_scaled.isnull().sum().sum())
print ("To check if there are any null values in the target and how many:", \
       y_train_sweet.isnull().values.any(), y_train_sweet.isnull().sum().sum()) 
print ("The total number of Sweet targets: " + str(y_train_sweet[y_train_sweet.Target == 1].count()[0]))
print ("The total number of Bitter targets: " + str(y_train_sweet[y_train_sweet.Target == 0].count()[0]))


Total number of features: 5089
Total number of samples: 2090
To check if there are any null values in the features and how many:  False 0
To check if there are any null values in the target and how many: False 0
The total number of Sweet targets: 1126
The total number of Bitter targets: 964


In [29]:
# Print some metrics about the data.
print ("Total number of features: " + str(X_train_bitter_scaled.shape[1]))
print ("Total number of samples: "+ str(X_train_bitter_scaled.shape[0]))
print ("To check if there are any null values in the features and how many: ", \
       X_train_bitter_scaled.isnull().values.any(), X_train_bitter_scaled.isnull().sum().sum())
print ("To check if there are any null values in the target and how many:", \
       y_train_bitter.isnull().values.any(), y_train_bitter.isnull().sum().sum()) 
print ("The total number of Sweet targets: " + str(y_train_bitter[y_train_bitter.Target == 1].count()[0]))
print ("The total number of Bitter targets: " + str(y_train_bitter[y_train_bitter.Target == 0].count()[0]))


Total number of features: 5089
Total number of samples: 2135
To check if there are any null values in the features and how many:  False 0
To check if there are any null values in the target and how many: False 0
The total number of Sweet targets: 728
The total number of Bitter targets: 1407


In [30]:
# remove punctuations for boruta shap features
import string
#sweet train
s_cols = X_train_sweet_scaled.columns
s_list = [''.join(char for char in item
                    if char not in ['[',']','<'])
            for item in s_cols]
print(s_list) 

#sweet test
st_cols = X_test_sweet_scaled.columns
st_list = [''.join(char for char in item
                    if char not in ['[',']','<'])
            for item in st_cols]


#bitter train
b_cols = X_train_bitter_scaled.columns
b_list = [''.join(char for char in item
                    if char not in ['[',']','<'])
            for item in b_cols]

#bitter test
bt_cols = X_test_bitter_scaled.columns
bt_list = [''.join(char for char in item
                    if char not in ['[',']','<'])
            for item in bt_cols]



['AAC', 'AECC', 'AMW', 'AROM', 'ARR', 'ASP', 'ATS1e', 'ATS1i', 'ATS1m', 'ATS1p', 'ATS1s', 'ATS1v', 'ATS2e', 'ATS2i', 'ATS2m', 'ATS2p', 'ATS2s', 'ATS2v', 'ATS3e', 'ATS3i', 'ATS3m', 'ATS3p', 'ATS3s', 'ATS3v', 'ATS4e', 'ATS4i', 'ATS4m', 'ATS4p', 'ATS4s', 'ATS4v', 'ATS5e', 'ATS5i', 'ATS5m', 'ATS5p', 'ATS5s', 'ATS5v', 'ATS6e', 'ATS6i', 'ATS6m', 'ATS6p', 'ATS6s', 'ATS6v', 'ATS7e', 'ATS7i', 'ATS7m', 'ATS7p', 'ATS7s', 'ATS7v', 'ATS8e', 'ATS8i', 'ATS8m', 'ATS8p', 'ATS8s', 'ATS8v', 'ATSC1e', 'ATSC1i', 'ATSC1m', 'ATSC1p', 'ATSC1s', 'ATSC1v', 'ATSC2e', 'ATSC2i', 'ATSC2m', 'ATSC2p', 'ATSC2s', 'ATSC2v', 'ATSC3e', 'ATSC3i', 'ATSC3m', 'ATSC3p', 'ATSC3s', 'ATSC3v', 'ATSC4e', 'ATSC4i', 'ATSC4m', 'ATSC4p', 'ATSC4s', 'ATSC4v', 'ATSC5e', 'ATSC5i', 'ATSC5m', 'ATSC5p', 'ATSC5s', 'ATSC5v', 'ATSC6e', 'ATSC6i', 'ATSC6m', 'ATSC6p', 'ATSC6s', 'ATSC6v', 'ATSC7e', 'ATSC7i', 'ATSC7m', 'ATSC7p', 'ATSC7s', 'ATSC7v', 'ATSC8e', 'ATSC8i', 'ATSC8m', 'ATSC8p', 'ATSC8s', 'ATSC8v', 'AVS_B(e)', 'AVS_B(i)', 'AVS_B(m)', 'AVS_B(

In [31]:
#double check if there is duplicates
# find the length of the list
print(len(s_list))
# create a set from the list
myset = set(s_list)
# find the length of the Python set variable myset
print(len(myset))


#double check if there is duplicates
# find the length of the list
print(len(st_list))
# create a set from the list
myset = set(st_list)
# find the length of the Python set variable myset
print(len(myset))


5089
5089
5089
5089


In [32]:
#double check if there is duplicates
# find the length of the list
print(len(b_list))
# create a set from the list
myset = set(b_list)
# find the length of the Python set variable myset
print(len(myset))


#double check if there is duplicates
# find the length of the list
print(len(bt_list))
# create a set from the list
myset = set(bt_list)
# find the length of the Python set variable myset
print(len(myset))


5089
5089
5089
5089


In [33]:
X_train_sweet_scaled.columns = s_list
X_test_sweet_scaled.columns = st_list

X_train_bitter_scaled.columns = b_list
X_test_bitter_scaled.columns = bt_list

In [34]:
# Print some metrics about the data.
print ("Total number of features: " + str(X_train_sweet_scaled.shape[1]))
print ("Total number of samples: "+ str(X_train_sweet_scaled.shape[0]))
print ("To check if there are any null values in the features and how many: ", \
       X_train_sweet_scaled.isnull().values.any(), X_train_sweet_scaled.isnull().sum().sum())
print ("To check if there are any null values in the target and how many:", \
       y_train_sweet.isnull().values.any(), y_train_sweet.isnull().sum().sum()) 
print ("The total number of Sweet targets: " + str(y_train_sweet[y_train_sweet.Target == 1].count()[0]))
print ("The total number of Bitter targets: " + str(y_train_sweet[y_train_sweet.Target == 0].count()[0]))


Total number of features: 5089
Total number of samples: 2090
To check if there are any null values in the features and how many:  False 0
To check if there are any null values in the target and how many: False 0
The total number of Sweet targets: 1126
The total number of Bitter targets: 964


In [35]:
# Print some metrics about the data.
print ("Total number of features: " + str(X_train_bitter_scaled.shape[1]))
print ("Total number of samples: "+ str(X_train_bitter_scaled.shape[0]))
print ("To check if there are any null values in the features and how many: ", \
       X_train_bitter_scaled.isnull().values.any(), X_train_bitter_scaled.isnull().sum().sum())
print ("To check if there are any null values in the target and how many:", \
       y_train_bitter.isnull().values.any(), y_train_bitter.isnull().sum().sum()) 
print ("The total number of Sweet targets: " + str(y_train_bitter[y_train_bitter.Target == 1].count()[0]))
print ("The total number of Bitter targets: " + str(y_train_bitter[y_train_bitter.Target == 0].count()[0]))


Total number of features: 5089
Total number of samples: 2135
To check if there are any null values in the features and how many:  False 0
To check if there are any null values in the target and how many: False 0
The total number of Sweet targets: 728
The total number of Bitter targets: 1407


# Model Training

In [36]:
from sklearn.metrics import accuracy_score, precision_score, average_precision_score , roc_auc_score , classification_report

In [37]:
def evaluate(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("Accuracy:",accuracy_score(y_test,y_pred))
    print("Precision:",precision_score(y_test,y_pred))
    print("Average Precision:",average_precision_score(y_test,y_pred))
    print("AUROC:",roc_auc_score(y_test,y_pred))
    print("Classification Report")
    print(classification_report(y_test, y_pred))

In [38]:
from xgboost import XGBClassifier

clf_xgboost = XGBClassifier()

In [39]:
from catboost import CatBoostClassifier

clf_catboost = CatBoostClassifier(
    iterations=100, 
    learning_rate=0.2, 
    logging_level='Silent'
)

In [40]:
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(
    n_estimators=250
)

In [None]:
 #recall of the positive class is also known as “sensitivity”; recall of the negative class is “specificity”.

In [None]:
print("Evaluating XGBoost")
evaluate(clf_xgboost, X_train_sweet_scaled, y_train_sweet, X_test_sweet_scaled, y_test_sweet)

print("Evaluating CatBoost")
evaluate(clf_catboost,X_train_sweet_scaled, y_train_sweet, X_test_sweet_scaled, y_test_sweet)

print("Evaluating RandomForest")
evaluate(clf_rf, X_train_sweet_scaled, y_train_sweet, X_test_sweet_scaled, y_test_sweet)

Evaluating XGBoost
Accuracy: 0.7467532467532467
Precision: 0.8928571428571429
Average Precision: 0.8325602968460112
AUROC: 0.7653061224489797
Classification Report
              precision    recall  f1-score   support

           0       0.57      0.82      0.67        49
           1       0.89      0.71      0.79       105

    accuracy                           0.75       154
   macro avg       0.73      0.77      0.73       154
weighted avg       0.79      0.75      0.76       154

Evaluating CatBoost
Accuracy: 0.7792207792207793
Precision: 0.898876404494382
Average Precision: 0.8471958752857629
AUROC: 0.7891156462585034
Classification Report
              precision    recall  f1-score   support

           0       0.62      0.82      0.70        49
           1       0.90      0.76      0.82       105

    accuracy                           0.78       154
   macro avg       0.76      0.79      0.76       154
weighted avg       0.81      0.78      0.79       154

Evaluating RandomF

In [None]:
print("Evaluating XGBoost")
evaluate(clf_xgboost, X_train_bitter_scaled, y_train_bitter, X_test_bitter_scaled, y_test_bitter)

print("Evaluating CatBoost")
evaluate(clf_catboost,X_train_bitter_scaled, y_train_bitter, X_test_bitter_scaled, y_test_bitter)

print("Evaluating RandomForest")
evaluate(clf_rf, X_train_bitter_scaled, y_train_bitter, X_test_bitter_scaled, y_test_bitter)

Evaluating XGBoost
Accuracy: 0.8070175438596491
Precision: 0.9
Average Precision: 0.8346365914786967
AUROC: 0.8175324675324676
Classification Report
              precision    recall  f1-score   support

           0       0.70      0.86      0.78        66
           1       0.90      0.77      0.83       105

    accuracy                           0.81       171
   macro avg       0.80      0.82      0.80       171
weighted avg       0.82      0.81      0.81       171

Evaluating CatBoost
Accuracy: 0.7894736842105263
Precision: 0.8709677419354839
Average Precision: 0.8122402781146415
AUROC: 0.7948051948051947
Classification Report
              precision    recall  f1-score   support

           0       0.69      0.82      0.75        66
           1       0.87      0.77      0.82       105

    accuracy                           0.79       171
   macro avg       0.78      0.79      0.78       171
weighted avg       0.80      0.79      0.79       171

Evaluating RandomForest
Accuracy

# Boruta Shap

In [41]:
from BorutaShap import BorutaShap

Sweet

In [None]:
# If no model is selected default is the Random Forest
# If classification is True it is a classification problem
model = XGBClassifier()
Feature_Selector = BorutaShap(model=model,importance_measure='shap', classification=True,percentile=90)
#Feature_Selector = BorutaShap(importance_measure='shap', classification=True)

#Feature_Selector.fit(X=X_train, y=y_train['Target'], n_trials=50, random_state=0)


#Feature_Selector.fit(X=X_train, y=y_train['Target'], n_trials=500, sample=False,
#            	     train_or_test = 'test', normalize=True, verbose=True,random_state=32)

Feature_Selector.fit(X=X_train_sweet_scaled, y=y_train_sweet['Target'], n_trials=100, sample=False, verbose=True,random_state=32)

  0%|          | 0/100 [00:00<?, ?it/s]

95 attributes confirmed important: ['SpDiam_L', 'CATS2D_05_DN', 'Di', 'RDF020p', 'NNRS', 'MLOGP2', 'CATS2D_09_AL', 'N%', 'BIC5', 'C-008', 'SpDiam_EA(dm)', 'MATS6m', 'HATS3e', 'H_Dz(p)', 'RDF040s', 'CATS3D_02_DL', 'MAXDN', 'P_VSA_LogP_2', 'SpDiam_B(p)', 'TDB01i', 'TPSA(NO)', 'ATSC3v', 'CATS2D_07_AL', 'CATS3D_06_AL', 'SpDiam_B(s)', 'R2i', 'MAXDP', 'ATS6s', 'GGI8', 'CATS2D_07_LL', 'CATS2D_03_DL', 'MLOGP', 'SpMin1_Bh(v)', 'SpMax3_Bh(s)', 'Eig15_EA(ri)', 'MATS6v', 'piID', 'GATS2e', 'RDF020s', 'CATS2D_04_PN', 'SpDiam_EA(ed)', 'rGes', 'DISPs', 'RDF040m', 'CATS2D_05_DA', 'CATS3D_03_DL', 'S3K', 'SpMax1_Bh(s)', 'P_VSA_i_3', 'VE1sign_RG', 'ATS4s', 'HATS7i', 'Eig01_AEA(ed)', 'SpMin1_Bh(s)', 'VE1_RG', 'SpMax_B(i)', 'Eig09_EA(ed)', 'R3v+', 'SpMax1_Bh(m)', 'R8s', 'JGI8', 'CATS2D_02_DL', 'RDF070m', 'GATS4m', 'TDB04v', 'NdssC', 'Mor11m', 'PCR', 'J_D/Dt', 'HATS6v', 'VE2_B(s)', 'MATS2s', 'HATS7p', 'SPH', 'Mor12m', 'DBI', 'GATS4s', 'ISH', 'MATS5m', 'Mor19m', 'TDB05e', 'totalcharge', 'R4s+', 'ASP', 'HATS5p

In [None]:
X_sweet_subset = Feature_Selector.Subset()

In [None]:
X_sweet_subset_cols = X_sweet_subset.columns
#print(X_sweet_subset_cols)

X_sweet_test_subset = X_test_sweet_scaled[X_sweet_subset_cols]

In [None]:
print("Evaluating XGBoost")
evaluate(clf_xgboost, X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

print("Evaluating CatBoost")
evaluate(clf_catboost,X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

print("Evaluating RandomForest")
evaluate(clf_rf,X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

Evaluating XGBoost
Accuracy: 0.7337662337662337
Precision: 0.8902439024390244
Average Precision: 0.8267236828212439
AUROC: 0.7557823129251701
Classification Report
              precision    recall  f1-score   support

           0       0.56      0.82      0.66        49
           1       0.89      0.70      0.78       105

    accuracy                           0.73       154
   macro avg       0.72      0.76      0.72       154
weighted avg       0.78      0.73      0.74       154

Evaluating CatBoost
Accuracy: 0.7467532467532467
Precision: 0.875
Average Precision: 0.8234848484848485
AUROC: 0.754421768707483
Classification Report
              precision    recall  f1-score   support

           0       0.58      0.78      0.66        49
           1       0.88      0.73      0.80       105

    accuracy                           0.75       154
   macro avg       0.73      0.75      0.73       154
weighted avg       0.78      0.75      0.75       154

Evaluating RandomForest
Accurac

In [None]:
model = CatBoostClassifier(iterations=50, 
    learning_rate=0.1, 
    logging_level='Silent')

# no model selected default is Random Forest, if classification is False it is a Regression problem
Feature_Selector = BorutaShap(model=model,
                              importance_measure='shap',
                              classification=True,percentile=90)

Feature_Selector.fit(X=X_train_sweet_scaled, y=y_train_sweet['Target'], n_trials=200, random_state=0)

  0%|          | 0/200 [00:00<?, ?it/s]

58 attributes confirmed important: ['SpPosA_G/D', 'SIC3', 'SpMax1_Bh(i)', 'Eig01_EA(ri)', 'B02C-N', 'P_VSA_m_2', 'SpDiam_B(s)', 'CATS2D_05_DN', 'RBF', 'B01C-N', 'MLOGP2', 'GATS4s', 'MATS5m', 'totalcharge', 'piPC03', 'VE1_RG', 'N%', 'MAXDP', 'R7e', 'GATS4e', 'R4s+', 'DISPs', 'piPC04', 'SpMax7_Bh(s)', 'P_VSA_LogP_5', 'CATS2D_03_DL', 'P_VSA_i_4', 'CATS2D_07_DL', 'SpMax1_Bh(m)', 'SpDiam_EA(ri)', 'MATS6v', 'piID', 'Mor16e', 'P_VSA_s_6', 'SpMax_B(m)', 'CATS3D_02_DL', 'GATS6m', 'GATS5m', 'RDF040s', 'CATS2D_02_DL', 'SIC4', 'piPC07', 'P_VSA_MR_3', 'SpMAD_G/D', 'F01C-N', 'nN', 'J_Dt', 'P_VSA_LogP_2', 'SpMax_B(e)', 'F02C-N', 'HATS6v', 'TDB01i', 'F03C-N', 'CATS2D_04_AP', 'SpMAD_D/Dt', 'rGes', 'HATS6p', 'CATS2D_04_DL']
4905 attributes confirmed unimportant: ['B07O-X', 'nIsoxazoles', 'HATS6e', 'RDF125m', 'C-025', 'SP05', 'nRCOX', 'Mor07u', 'B06Si-X', 'SpMax5_Bh(p)', 'B04S-Br', 'SpDiam_AEA(dm)', 'SM3_Dz(Z)', 'B04O-B', 'F10O-Br', 'B01Si-X', 'H6s', 'SM09_EA(bo)', 'Eig12_AEA(dm)', 'CATS2D_04_PL', 'B06C-

In [None]:
X_sweet_subset = Feature_Selector.Subset()

In [None]:
X_sweet_subset_cols = X_sweet_subset.columns
#print(X_sweet_subset_cols)

X_sweet_test_subset = X_test_sweet_scaled[X_sweet_subset_cols]

In [None]:
print("Evaluating XGBoost")
evaluate(clf_xgboost, X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

print("Evaluating CatBoost")
evaluate(clf_catboost,X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

print("Evaluating RandomForest")
evaluate(clf_rf,X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

Evaluating XGBoost
Accuracy: 0.7467532467532467
Precision: 0.8928571428571429
Average Precision: 0.8325602968460112
AUROC: 0.7653061224489797
Classification Report
              precision    recall  f1-score   support

           0       0.57      0.82      0.67        49
           1       0.89      0.71      0.79       105

    accuracy                           0.75       154
   macro avg       0.73      0.77      0.73       154
weighted avg       0.79      0.75      0.76       154

Evaluating CatBoost
Accuracy: 0.7532467532467533
Precision: 0.8764044943820225
Average Precision: 0.8263680140084635
AUROC: 0.7591836734693878
Classification Report
              precision    recall  f1-score   support

           0       0.58      0.78      0.67        49
           1       0.88      0.74      0.80       105

    accuracy                           0.75       154
   macro avg       0.73      0.76      0.74       154
weighted avg       0.78      0.75      0.76       154

Evaluating Random

Random Forest

In [None]:

# no model selected default is Random Forest, if classification is False it is a Regression problem
Feature_Selector = BorutaShap(
                              importance_measure='shap',
                              classification=True,percentile=80)

Feature_Selector.fit(X=X_train_sweet_scaled, y=y_train_sweet['Target'], n_trials=200, random_state=0)

  0%|          | 0/200 [00:00<?, ?it/s]

722 attributes confirmed important: ['R6v', 'L2m', 'Mor18p', 'B02C-N', 'AVS_B(p)', 'CATS2D_05_DN', 'Eig07_EA(bo)', 'RDF020p', 'Mor14e', 'Mor24e', 'Mor03i', 'MPC06', 'BIC3', 'HNar', 'SpDiam_AEA(dm)', 'SpPosA_Dz(m)', 'TDB07m', 'SM6_L', 'SM09_EA(bo)', 'CATS2D_04_PL', 'SM6_B(p)', 'VR2_B(v)', 'SpMAD_EA', 'GATS6m', 'piPC09', 'Mor20s', 'Mor29p', 'Mor24u', 'O-060', 'SM10_EA(dm)', 'SpMax_B(p)', 'VR2_B(p)', 'MATS7p', 'ATS4m', 'JGI3', 'H_Dz(i)', 'SM09_EA(dm)', 'Mor15p', 'SM06_AEA(ri)', 'Mor03p', 'CATS3D_02_AA', 'TDB05s', 'Mor29v', 'VR2_B(m)', 'SpDiam_B(s)', 'R4i', 'Mor28m', 'SpPosA_RG', 'DISPe', 'SIC0', 'Chi_Dt', 'HATSe', 'SM03_AEA(ri)', 'SpPosA_B(e)', 'MAXDP', 'HATS6s', 'Eig10_EA', 'ATSC4e', 'AVS_B(v)', 'HATSp', 'GATS2s', 'CATS2D_03_DL', 'R3s+', 'O%', 'Mor30v', 'Eig01_AEA(dm)', 'SpMin1_Bh(e)', 'P_VSA_MR_2', 'ATSC1e', 'VR2_B(s)', 'TDB02v', 'P_VSA_s_3', 'Mor10e', 'CATS2D_04_PN', 'TDB03u', 'SpMAD_B(m)', 'GATS3v', 'GATS6s', 'SM12_EA', 'DISPs', 'SM3_B(p)', 'Mor24m', 'Eig06_EA(bo)', 'TDB07v', 'F03O-O'

In [None]:
X_sweet_subset = Feature_Selector.Subset()

In [None]:
X_sweet_subset_cols = X_sweet_subset.columns
#print(X_sweet_subset_cols)

X_sweet_test_subset = X_test_sweet_scaled[X_sweet_subset_cols]

In [None]:
print("Evaluating XGBoost")
evaluate(clf_xgboost, X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

print("Evaluating CatBoost")
evaluate(clf_catboost,X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

print("Evaluating RandomForest")
evaluate(clf_rf,X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

Evaluating XGBoost
Accuracy: 0.7597402597402597
Precision: 0.8953488372093024
Average Precision: 0.8384073291050036
AUROC: 0.7748299319727892
Classification Report
              precision    recall  f1-score   support

           0       0.59      0.82      0.68        49
           1       0.90      0.73      0.81       105

    accuracy                           0.76       154
   macro avg       0.74      0.77      0.75       154
weighted avg       0.80      0.76      0.77       154

Evaluating CatBoost
Accuracy: 0.7857142857142857
Precision: 0.8913043478260869
Average Precision: 0.845416901938641
AUROC: 0.7884353741496598
Classification Report
              precision    recall  f1-score   support

           0       0.63      0.80      0.70        49
           1       0.89      0.78      0.83       105

    accuracy                           0.79       154
   macro avg       0.76      0.79      0.77       154
weighted avg       0.81      0.79      0.79       154

Evaluating RandomF

Catboost

In [42]:
model = CatBoostClassifier(iterations=50, 
    learning_rate=0.2, 
    logging_level='Silent')

# no model selected default is Random Forest, if classification is False it is a Regression problem
Feature_Selector = BorutaShap(model = model,
                              importance_measure='shap',
                              classification=True,percentile=80)

Feature_Selector.fit(X=X_train_sweet_scaled, y=y_train_sweet['Target'], n_trials=200, random_state=0)

  0%|          | 0/200 [00:00<?, ?it/s]

45 attributes confirmed important: ['P_VSA_LogP_2', 'RDF040s', 'Mor24s', 'rGes', 'RBF', 'GATS6m', 'MATS5m', 'DISPp', 'P_VSA_LogP_5', 'HATS6p', 'R5s+', 'S3K', 'MATS6m', 'GATS4s', 'totalcharge', 'CATS2D_05_DN', 'DISPs', 'GATS4e', 'DISPe', 'J_Dt', 'CATS3D_02_DL', 'F01C-N', 'CATS2D_03_DL', 'TDB01i', 'SpMax1_Bh(i)', 'Eig01_EA(ri)', 'N%', 'SpDiam_B(s)', 'GATS5m', 'Mor16e', 'GATS2m', 'CATS2D_07_DL', 'Mor04s', 'nN', 'JGI8', 'MATS6v', 'SpMax1_Bh(m)', 'SpMax7_Bh(s)', 'R4s+', 'Mor19m', 'CATS2D_08_AL', 'RDF020m', 'MLOGP2', 'VE1_RG', 'SpDiam_EA(ri)']
4916 attributes confirmed unimportant: ['Mor25e', 'RDF120i', 'B05F-B', 'B02F-X', 'CATS3D_08_NL', 'Mor16u', 'CATS3D_12_DP', 'MATS8e', 'Eig11_EA', 'F02O-F', 'F10I-I', 'F06P-Cl', 'SpAbs_B(p)', 'MPC08', 'H6p', 'R7i+', 'SpAD_G/D', 'R4m', 'R4v+', 'F01Br-Br', 'F03Cl-Br', 'Mor17e', 'GATS6v', 'R8e+', 'SpPos_Dz(Z)', 'CATS2D_01_PP', 'H0i', 'Mor08e', 'B06I-I', 'Mor27p', 'G(N..S)', 'Mor14e', 'B02S-Si', 'CATS3D_14_DL', 'StsC', 'Mor24m', 'F01C-X', 'SM5_Dz(i)', 'Eig02

In [None]:
X_sweet_subset = Feature_Selector.Subset()

In [55]:
X_sweet_subset_cols = X_sweet_subset.columns
#print(X_sweet_subset_cols)

X_sweet_test_subset = X_test_sweet_scaled[X_sweet_subset_cols]

X_sweet_subset = X_train_sweet_scaled[X_sweet_subset_cols]

In [45]:
print("Evaluating XGBoost")
evaluate(clf_xgboost, X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

print("Evaluating CatBoost")
evaluate(clf_catboost,X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

print("Evaluating RandomForest")
evaluate(clf_rf,X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

Evaluating XGBoost
Accuracy: 0.7532467532467533
Precision: 0.8941176470588236
Average Precision: 0.8354825566590273
AUROC: 0.7700680272108844
Classification Report
              precision    recall  f1-score   support

           0       0.58      0.82      0.68        49
           1       0.89      0.72      0.80       105

    accuracy                           0.75       154
   macro avg       0.74      0.77      0.74       154
weighted avg       0.79      0.75      0.76       154

Evaluating CatBoost
Accuracy: 0.7597402597402597
Precision: 0.8777777777777778
Average Precision: 0.8292544492544491
AUROC: 0.7639455782312925
Classification Report
              precision    recall  f1-score   support

           0       0.59      0.78      0.67        49
           1       0.88      0.75      0.81       105

    accuracy                           0.76       154
   macro avg       0.74      0.76      0.74       154
weighted avg       0.79      0.76      0.77       154

Evaluating Random

In [None]:
model = CatBoostClassifier(iterations=100, 
    learning_rate=0.1, 
    logging_level='Silent')

# no model selected default is Random Forest, if classification is False it is a Regression problem
Feature_Selector = BorutaShap(model = model,
                              importance_measure='shap',
                              classification=True)

Feature_Selector.fit(X=X_train_sweet_scaled, y=y_train_sweet['Target'], n_trials=50, random_state=0)

  0%|          | 0/50 [00:00<?, ?it/s]

adaboost

In [58]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

tree = DecisionTreeClassifier(
          max_depth=1,
          random_state=42)

ada = AdaBoostClassifier(
           base_estimator=tree,
           n_estimators=500,
           learning_rate=0.5,
           random_state=42)

In [61]:
X_sweet_subset_cols = ['SpPosA_G/D', 'SIC3', 'SpMax1_Bh(i)', 'Eig01_EA(ri)', 'B02C-N', 'P_VSA_m_2', 'SpDiam_B(s)', 'CATS2D_05_DN', 'RBF', 'B01C-N', 'MLOGP2', 'GATS4s', 'MATS5m', 'totalcharge', 'piPC03', 'VE1_RG', 'N%', 'MAXDP', 'R7e', 'GATS4e', 'R4s+', 'DISPs', 'piPC04', 'SpMax7_Bh(s)', 'P_VSA_LogP_5', 'CATS2D_03_DL', 'P_VSA_i_4', 'CATS2D_07_DL', 'SpMax1_Bh(m)', 'SpDiam_EA(ri)', 'MATS6v', 'piID', 'Mor16e', 'P_VSA_s_6', 'SpMax_B(m)', 'CATS3D_02_DL', 'GATS6m', 'GATS5m', 'RDF040s', 'CATS2D_02_DL', 'SIC4', 'piPC07', 'P_VSA_MR_3', 'SpMAD_G/D', 'F01C-N', 'nN', 'J_Dt', 'P_VSA_LogP_2', 'SpMax_B(e)', 'F02C-N', 'HATS6v', 'TDB01i', 'F03C-N', 'CATS2D_04_AP', 'SpMAD_D/Dt', 'rGes', 'HATS6p', 'CATS2D_04_DL']

In [62]:
#X_sweet_subset_cols = X_sweet_subset.columns
#print(X_sweet_subset_cols)

X_sweet_test_subset = X_test_sweet_scaled[X_sweet_subset_cols]

X_sweet_subset = X_train_sweet_scaled[X_sweet_subset_cols]

In [63]:
print("Evaluating adaBoost")
evaluate(ada, X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)


Evaluating adaBoost
Accuracy: 0.7077922077922078
Precision: 0.8260869565217391
Average Precision: 0.786241294936947
AUROC: 0.698639455782313
Classification Report
              precision    recall  f1-score   support

           0       0.53      0.67      0.59        49
           1       0.83      0.72      0.77       105

    accuracy                           0.71       154
   macro avg       0.68      0.70      0.68       154
weighted avg       0.73      0.71      0.72       154



In [53]:

# no model selected default is Random Forest, if classification is False it is a Regression problem
Feature_Selector = BorutaShap(model = ada,
                              importance_measure='shap',
                              classification=True,percentile=80)

Feature_Selector.fit(X=X_train_sweet_scaled, y=y_train_sweet['Target'], n_trials=200, random_state=0)

  0%|          | 0/200 [00:00<?, ?it/s]

InvalidModelError: ignored

In [None]:
X_sweet_subset = Feature_Selector.Subset()

In [None]:
X_sweet_subset_cols = X_sweet_subset.columns
#print(X_sweet_subset_cols)

X_sweet_test_subset = X_test_sweet_scaled[X_sweet_subset_cols]

X_sweet_subset = X_train_sweet_scaled[X_sweet_subset_cols]

In [None]:
print("Evaluating XGBoost")
evaluate(clf_xgboost, X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

print("Evaluating CatBoost")
evaluate(clf_catboost,X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

print("Evaluating RandomForest")
evaluate(clf_rf,X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

Bitter

In [None]:
# If no model is selected default is the Random Forest
# If classification is True it is a classification problem
model = XGBClassifier()

from sklearn.ensemble import AdaBoostClassifier

from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(
          max_depth=1,
          random_state=42)

ada = AdaBoostClassifier(
           n_estimators=500,
           learning_rate=0.5,
           random_state=42)

Feature_Selector = BorutaShap(model=ada,importance_measure='shap', classification=True)
#Feature_Selector = BorutaShap(importance_measure='shap', classification=True)

#Feature_Selector.fit(X=X_train, y=y_train['Target'], n_trials=50, random_state=0)


#Feature_Selector.fit(X=X_train, y=y_train['Target'], n_trials=500, sample=False,
#            	     train_or_test = 'test', normalize=True, verbose=True,random_state=32)

Feature_Selector.fit(X=X_train_bitter_scaled, y=y_train_bitter['Target'], n_trials=50, sample=False, verbose=True,random_state=32)

In [None]:
X_bitter_subset = Feature_Selector.Subset()

In [None]:
X_bitter_subset_cols = X_bitter_subset.columns
print(X_bitter_subset_cols)

Index(['MLOGP2', 'N%', 'CATS2D_05_DN', 'SpMax1_Bh(m)', 'totalcharge',
       'SpDiam_RG', 'TDB03p', 'CATS2D_02_DL'],
      dtype='object')


In [None]:
X_bitter_test_subset = X_test_bitter_scaled[X_bitter_subset_cols]

In [None]:
print("Evaluating XGBoost")
evaluate(clf_xgboost, X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

print("Evaluating CatBoost")
evaluate(clf_catboost,X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

print("Evaluating RandomForest")
evaluate(clf_rf,X_sweet_subset, y_train_sweet, X_sweet_test_subset, y_test_sweet)

Evaluating XGBoost
Accuracy: 0.7142857142857143
Precision: 0.8674698795180723
Average Precision: 0.8091222030981068
AUC: 0.7306122448979592
Evaluating CatBoost
Accuracy: 0.7272727272727273
Precision: 0.8705882352941177
Average Precision: 0.8148561242678889
AUC: 0.7401360544217688
Evaluating RandomForest
Accuracy: 0.7402597402597403
Precision: 0.891566265060241
Average Precision: 0.829640640484014
AUC: 0.7605442176870749
