In [33]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [34]:
df_sweet_train = pd.read_csv('./mordred/sweet_descriptor_train.csv')
df_sweet_test = pd.read_csv('./mordred/sweet_descriptor_test.csv')
df_bitter_train = pd.read_csv('./mordred/bitter_descriptor_train.csv')
df_bitter_test = pd.read_csv('./mordred/bitter_descriptor_test.csv')

In [35]:
non_feature_columns = ['Name', 'Reference', 'SMILES', 'Canonical SMILES', 'Target']

def fix_feature_dtype(df):
    df_features = df[df.columns.difference(non_feature_columns)]
    
    df_features.apply(lambda x: pd.to_numeric(x, errors='coerce'))
    mask = df_features.applymap(lambda x: isinstance(x, (int, float)))
    df_features = df_features.where(mask)
    
    df_set = pd.concat([df[non_feature_columns], df_features], axis=1)
    return df_set

In [36]:
df_sweet_set_train = fix_feature_dtype(df_sweet_train)
df_sweet_set_test = fix_feature_dtype(df_sweet_test)
df_bitter_set_train = fix_feature_dtype(df_bitter_train)
df_bitter_set_test = fix_feature_dtype(df_bitter_test)

In [37]:
X_train_sweet = df_sweet_set_train[df_sweet_set_train.columns.difference(non_feature_columns)]
y_train_sweet = df_sweet_set_train[['Target']]
X_test_sweet = df_sweet_set_test[df_sweet_set_test.columns.difference(non_feature_columns)]
y_test_sweet = df_sweet_set_test[['Target']]

In [38]:
X_train_bitter = df_bitter_set_train[df_bitter_set_train.columns.difference(non_feature_columns)]
y_train_bitter = df_bitter_set_train[['Target']]

In [39]:
df_bitter_test['Reference'].unique()

array(['Wiener et al. (2017) - Phyto-Dictionary',
       'Wiener et al. (2017) - Bitter-New',
       'Wiener et al. (2017) - UNIMI'], dtype=object)

In [40]:
df_bitter_test_phyto = df_bitter_set_test[df_bitter_set_test['Reference'] == 'Wiener et al. (2017) - Phyto-Dictionary']
df_bitter_test_bitternew = df_bitter_set_test[df_bitter_set_test['Reference'] == 'Wiener et al. (2017) - Bitter-New']
df_bitter_test_unimi = df_bitter_set_test[df_bitter_set_test['Reference'] == 'Wiener et al. (2017) - UNIMI']

In [41]:
X_test_bitter_phyto = df_bitter_test_phyto[df_bitter_test_phyto.columns.difference(non_feature_columns)]
y_test_bitter_phyto = df_bitter_test_phyto[['Target']]
X_test_bitter_bitternew = df_bitter_test_bitternew[df_bitter_test_bitternew.columns.difference(non_feature_columns)]
y_test_bitter_bitternew = df_bitter_test_bitternew[['Target']]
X_test_bitter_unimi = df_bitter_test_unimi[df_bitter_test_unimi.columns.difference(non_feature_columns)]
y_test_bitter_unimi = df_bitter_test_unimi[['Target']]

In [42]:
X_train_sweet

Unnamed: 0,AATS0Z,AATS0are,AATS0d,AATS0dv,AATS0i,AATS0m,AATS0p,AATS0pe,AATS0s,AATS0se,...,piPC1,piPC10,piPC2,piPC3,piPC4,piPC5,piPC6,piPC7,piPC8,piPC9
0,25.733333,7.027333,3.155556,9.066667,169.548550,101.537130,1.118301,6.99288,,8.559139,...,3.218876,4.060443,3.610918,3.951244,4.143135,4.304065,4.317488,4.262680,4.262680,4.234107
1,43.571429,6.880636,3.309524,7.971781,167.215625,180.198242,1.459935,7.014657,,8.599467,...,3.218876,4.060443,3.610918,3.951244,4.143135,4.304065,4.317488,4.262680,4.262680,4.234107
2,24.102564,6.531277,2.923077,8.461538,165.471161,95.133819,1.350863,6.559133,,8.042601,...,3.332205,4.627421,3.749504,4.020877,4.312476,4.722398,4.452165,4.599529,4.819273,4.884694
3,25.500000,7.045,3.000000,8.833333,170.520236,100.566063,1.080332,7.004025,,8.58229,...,2.564949,0.000000,2.944439,3.178054,3.135494,3.178054,2.890372,2.079442,0.693147,0.000000
4,25.733333,7.027333,3.155556,9.066667,169.548550,101.537130,1.118301,6.99288,,8.559139,...,3.218876,4.276666,3.610918,3.912023,4.043051,4.143135,4.077537,3.988984,4.077537,4.219508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2178,20.777778,6.054717,2.722222,6.333333,163.806151,81.730971,1.409834,6.119594,,7.600174,...,2.564949,0.000000,3.068053,3.504055,3.944006,4.328263,3.840795,3.056357,0.000000,0.000000
2179,22.823529,6.292353,2.823529,7.176471,161.047311,89.994904,1.433271,6.347335,,7.837339,...,2.564949,0.000000,3.068053,3.504055,3.899444,4.385925,3.876396,2.784239,0.000000,0.000000
2180,24.086957,6.09717,3.347826,8.260870,155.760108,95.325134,1.709631,6.202835,,7.587205,...,3.135494,5.731874,3.663562,4.226834,4.711780,5.192262,5.029621,5.364222,5.432903,5.614496
2181,20.173913,6.036087,2.695652,6.434783,162.220157,79.239385,1.379211,6.098683,,7.616941,...,2.740840,0.000000,3.228826,3.522677,3.962003,3.852804,3.610918,3.725693,3.725693,0.000000


In [54]:
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
# sweet_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# bitter_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
sweet_imputer = KNNImputer(n_neighbors=8, weights="uniform")
bitter_imputer = KNNImputer(n_neighbors=8, weights="uniform")

In [58]:
X_train_sweet.shape

(2183, 1826)

In [60]:
len(sweet_imputer.get_feature_names_out())

972

In [56]:
X_train_sweet_tf = sweet_imputer.fit_transform(X_train_sweet)
X_test_sweet_tf = sweet_imputer.transform(X_test_sweet)

X_train_bitter_tf = bitter_imputer.fit_transform(X_train_bitter)
X_test_bitter_phyto_tf = bitter_imputer.transform(X_test_bitter_phyto)
X_test_bitter_bitternew_tf = bitter_imputer.transform(X_test_bitter_bitternew)
X_test_bitter_unimi_tf = bitter_imputer.transform(X_test_bitter_unimi)

In [65]:
X_train_sweet_tf = pd.DataFrame(X_train_sweet_tf,
                                columns=sweet_imputer.get_feature_names_out())
X_test_sweet_tf = pd.DataFrame(X_test_sweet_tf,
                                columns=sweet_imputer.get_feature_names_out())
X_train_bitter_tf = pd.DataFrame(X_train_bitter_tf,
                                columns=bitter_imputer.get_feature_names_out())
X_test_bitter_phyto_tf = pd.DataFrame(X_test_bitter_phyto_tf,
                                columns=bitter_imputer.get_feature_names_out())
X_test_bitter_bitternew_tf = pd.DataFrame(X_test_bitter_bitternew_tf,
                                columns=bitter_imputer.get_feature_names_out())
X_test_bitter_unimi_tf = pd.DataFrame(X_test_bitter_unimi_tf,
                                columns=bitter_imputer.get_feature_names_out())

## Model Training

In [150]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, average_precision_score , roc_auc_score , classification_report, f1_score, recall_score

In [152]:
def get_metrics(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1]

    print("Accuracy:",accuracy_score(y_test,y_pred))
    print("Precision:",precision_score(y_test,y_pred))
    print("Average Precision:",average_precision_score(y_test,y_pred_proba))
    try:
        print("AUROC:",roc_auc_score(y_test,y_pred_proba))
    except:
        print("AUROC: N/A (Only 1 class in y_true)")
    print("Sensitivity", recall_score(y_test, y_pred))
    print("Specificity", recall_score(np.logical_not(y_test),np.logical_not(y_pred)))
    print("F1-Score", f1_score(y_test, y_pred))
    print("Classification Report")
    print(classification_report(y_test, y_pred))
    print('-----------------\r\n')
    
def evaluate(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    get_metrics(model, X_test, y_test)

def evaluate_all(models, X_train, y_train, X_test, y_test):
    for model in models:
        print(f"Model: {model['name']}")
        evaluate(model['model'], X_train, y_train, X_test, y_test)

In [69]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

## Parameter Tuning
In this section, we tune the parameters for each of the models used

### Sweet Classifier

In [70]:
clf_xgboost = XGBClassifier()
clf_catboost = CatBoostClassifier(
    logging_level='Silent'
)
clf_rf = RandomForestClassifier(
)

In [99]:
xgb_params = { 'max_depth': [3,6,10],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 500, 1000],
    'colsample_bytree': [0.3, 0.7]
    }
catboost_params = {'depth': [3,6,10],
    'learning_rate' : [0.01, 0.1, 0.2],
    'iterations'    : [100, 250, 350]
    }
rf_params = {'bootstrap': [True, False],
    'max_depth': [10, 100, None],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400]
    }

In [18]:
clf_xgboost_cv = GridSearchCV(estimator=clf_xgboost, 
                    param_grid=xgb_params,
                    scoring='f1', 
                    n_jobs=-1,
                    verbose=1)

clf_catboost_cv = GridSearchCV(estimator=clf_catboost, 
                    param_grid=catboost_params,
                    scoring='f1', 
                    n_jobs=-1,
                    verbose=1)

clf_rf_cv = GridSearchCV(estimator=clf_rf, 
                    param_grid=rf_params,
                    scoring='f1', 
                    n_jobs=-1,
                    verbose=1)

In [None]:
print("Evaluating XGBoost")
evaluate(clf_xgboost_cv, X_train_sweet_tf, y_train_sweet, X_test_sweet_tf, y_test_sweet)

print("Evaluating CatBoost")
evaluate(clf_catboost_cv, X_train_sweet_tf, y_train_sweet, X_test_sweet_tf, y_test_sweet)

print("Evaluating RandomForest")
evaluate(clf_rf_cv, X_train_sweet_tf, y_train_sweet, X_test_sweet_tf, y_test_sweet)

In [27]:
print('XGBoost Params:', clf_xgboost_cv.best_params_)
print('CatBoost Params:', clf_catboost_cv.best_params_)
print('Random Forest Params:', clf_rf_cv.best_params_)

XGBoost Params: {'colsample_bytree': 0.3, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 1000}
CatBoost Params: {'depth': 6, 'iterations': 250, 'learning_rate': 0.2}
Random Forest Params: {'bootstrap': False, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 400}


## Bitter Classifier

In [118]:
clf_xgboost_bitter = XGBClassifier()
clf_catboost_bitter = CatBoostClassifier(logging_level='Silent')
clf_rf_bitter = RandomForestClassifier()

In [100]:
clf_xgboost_bitter_cv = GridSearchCV(estimator=clf_xgboost_bitter, 
                    param_grid=xgb_params,
                    scoring='f1', 
                    n_jobs=-1,
                    verbose=1)

clf_catboost_bitter_cv = GridSearchCV(estimator=clf_catboost_bitter, 
                    param_grid=catboost_params,
                    scoring='f1', 
                    n_jobs=-1,
                    verbose=1)

clf_rf_bitter_cv = GridSearchCV(estimator=clf_rf_bitter, 
                    param_grid=rf_params,
                    scoring='f1', 
                    n_jobs=-1,
                    verbose=1)

In [103]:
print("Evaluating XGBoost")
evaluate(clf_xgboost_bitter_cv, X_train_bitter_tf, y_train_bitter, X_test_bitter_phyto_tf, y_test_bitter_phyto)

print("Evaluating CatBoost")
evaluate(clf_catboost_bitter_cv, X_train_bitter_tf, y_train_bitter, X_test_bitter_phyto_tf, y_test_bitter_phyto)

print("Evaluating RandomForest")
evaluate(clf_rf_bitter_cv, X_train_bitter_tf, y_train_bitter, X_test_bitter_phyto_tf, y_test_bitter_phyto)

Evaluating XGBoost
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Accuracy: 0.9186046511627907
Precision: 0.96
Average Precision: 0.9683075828134113
AUROC: 0.953973699256718
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.94      0.90        33
           1       0.96      0.91      0.93        53

    accuracy                           0.92        86
   macro avg       0.91      0.92      0.92        86
weighted avg       0.92      0.92      0.92        86

-----------------

Evaluating CatBoost
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Accuracy: 0.9069767441860465
Precision: 0.9591836734693877
Average Precision: 0.9719084276149621
AUROC: 0.9568324757004002
Classification Report
              precision    recall  f1-score   support

           0       0.84      0.94      0.89        33
           1       0.96      0.89      0.92        53

    accuracy                           0.91        8

  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)
  warn(
  estimator.fit(X_train, y_train, **fit_params)


Accuracy: 0.9069767441860465
Precision: 0.9411764705882353
Average Precision: 0.9164506827318755
AUROC: 0.9376786735277302
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.91      0.88        33
           1       0.94      0.91      0.92        53

    accuracy                           0.91        86
   macro avg       0.90      0.91      0.90        86
weighted avg       0.91      0.91      0.91        86

-----------------



In [105]:
print('XGBoost Params:', clf_xgboost_bitter_cv.best_params_)
print('CatBoost Params:', clf_catboost_bitter_cv.best_params_)
print('Random Forest Params:', clf_rf_bitter_cv.best_params_)

XGBoost Params: {'colsample_bytree': 0.3, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 500}
CatBoost Params: {'depth': 10, 'iterations': 350, 'learning_rate': 0.1}
Random Forest Params: {'bootstrap': False, 'max_depth': 100, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}


### Summary: Best Parameters for Sweet Classifier
```
XGBoost Params: {'colsample_bytree': 0.3, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 1000}
CatBoost Params: {'depth': 6, 'iterations': 250, 'learning_rate': 0.2}
Random Forest Params: {'bootstrap': False, 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 400}
```

### Summary: Best Parameters for Bitter Classifier
```
XGBoost Params: {'colsample_bytree': 0.3, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 500}
CatBoost Params: {'depth': 10, 'iterations': 350, 'learning_rate': 0.1}
Random Forest Params: {'bootstrap': False, 'max_depth': 100, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
```

## Evaluation (No Feature Selection)
In this section, we evaluate the models without applying feature selection yet

In [133]:
clf_xgboost_sweet = XGBClassifier(
    colsample_bytree=0.3,
    learning_rate=0.01,
    max_depth=10,
    n_estimators=1000
)
clf_catboost_sweet = CatBoostClassifier(
    depth=6,
    iterations=250, 
    learning_rate=0.2, 
    logging_level='Silent'
)
clf_rf_sweet = RandomForestClassifier(
    bootstrap=False,
    max_depth=None,
    max_features='auto',
    min_samples_leaf=1,
    min_samples_split=10,
    n_estimators=400
)
sweet_models_full = [
    {'model': clf_xgboost_sweet, 'name': 'XGBoost (No Feature Selection)'},
    {'model': clf_catboost_sweet, 'name': 'CatBoost (No Feature Selection)'},
    {'model': clf_rf_sweet, 'name': 'Random Forest (No Feature Selection)'},
]

In [163]:
clf_xgboost_bitter = XGBClassifier(
    colsample_bytree=0.3,
    learning_rate=0.01,
    max_depth=10,
    n_estimators=500
)
clf_catboost_bitter = CatBoostClassifier(
    depth=10,
    iterations=350, 
    learning_rate=0.1, 
    logging_level='Silent'
)
clf_rf_bitter = RandomForestClassifier(
    bootstrap=False,
    max_depth=100,
    max_features='auto',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=400
)

bitter_models_full = [
    {'model': clf_xgboost_bitter, 'name': 'XGBoost (No Feature Selection)'},
    {'model': clf_catboost_bitter, 'name': 'CatBoost (No Feature Selection)'},
    {'model': clf_rf_bitter, 'name': 'Random Forest (No Feature Selection)'},
]

### Sweet Classifier

In [164]:
evaluate_all(sweet_models_full, X_train_sweet_tf, y_train_sweet, X_test_sweet_tf, y_test_sweet)

Model: XGBoost (BorutaShap)
Accuracy: 0.7564102564102564
Precision: 0.8777777777777778
Average Precision: 0.9324446028595318
AUROC: 0.8511320754716982
Sensitivity 0.7452830188679245
Specificity 0.78
F1-Score 0.8061224489795918
Classification Report
              precision    recall  f1-score   support

           0       0.59      0.78      0.67        50
           1       0.88      0.75      0.81       106

    accuracy                           0.76       156
   macro avg       0.73      0.76      0.74       156
weighted avg       0.79      0.76      0.76       156

-----------------

Model: CatBoost (BorutaShap)
Accuracy: 0.717948717948718
Precision: 0.8604651162790697
Average Precision: 0.9235184805138908
AUROC: 0.8311320754716982
Sensitivity 0.6981132075471698
Specificity 0.76
F1-Score 0.7708333333333334
Classification Report
              precision    recall  f1-score   support

           0       0.54      0.76      0.63        50
           1       0.86      0.70      0.77    

### Bitter Classifier

#### Phyto-Dictionary

In [165]:
evaluate_all(bitter_models_full, X_train_bitter_tf, y_train_bitter, X_test_bitter_phyto_tf, y_test_bitter_phyto)

Model: XGBoost (No Feature Selection)
Accuracy: 0.9186046511627907
Precision: 0.96
Average Precision: 0.9683075828134113
AUROC: 0.953973699256718
Sensitivity 0.9056603773584906
Specificity 0.9393939393939394
F1-Score 0.9320388349514563
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.94      0.90        33
           1       0.96      0.91      0.93        53

    accuracy                           0.92        86
   macro avg       0.91      0.92      0.92        86
weighted avg       0.92      0.92      0.92        86

-----------------

Model: CatBoost (No Feature Selection)
Accuracy: 0.9069767441860465
Precision: 0.9591836734693877
Average Precision: 0.9719084276149621
AUROC: 0.9568324757004002
Sensitivity 0.8867924528301887
Specificity 0.9393939393939394
F1-Score 0.9215686274509803
Classification Report
              precision    recall  f1-score   support

           0       0.84      0.94      0.89        33
           1 

#### UNIMI

In [166]:
evaluate_all(bitter_models_full, X_train_bitter_tf, y_train_bitter, X_test_bitter_unimi_tf, y_test_bitter_unimi)

Model: XGBoost (No Feature Selection)
Accuracy: 0.7254901960784313
Precision: 0.8333333333333334
Average Precision: 0.6559732162763309
AUROC: 0.8055555555555556
Sensitivity 0.2777777777777778
Specificity 0.9696969696969697
F1-Score 0.4166666666666667
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.97      0.82        33
           1       0.83      0.28      0.42        18

    accuracy                           0.73        51
   macro avg       0.77      0.62      0.62        51
weighted avg       0.75      0.73      0.68        51

-----------------

Model: CatBoost (No Feature Selection)
Accuracy: 0.7058823529411765
Precision: 0.8
Average Precision: 0.6868716915127415
AUROC: 0.8173400673400674
Sensitivity 0.2222222222222222
Specificity 0.9696969696969697
F1-Score 0.3478260869565218
Classification Report
              precision    recall  f1-score   support

           0       0.70      0.97      0.81        33
           1 

#### Bitter-New

In [156]:
evaluate_all(bitter_models_full, X_train_bitter_tf, y_train_bitter, X_test_bitter_bitternew_tf, y_test_bitter_bitternew)

Model: XGBoost (BorutaShap)
Accuracy: 0.4074074074074074
Precision: 1.0
Average Precision: 0.9999999999999998
AUROC: N/A (Only 1 class in y_true)
Sensitivity 0.4074074074074074
Specificity 0.0
F1-Score 0.5789473684210525
Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.41      0.58        27

    accuracy                           0.41        27
   macro avg       0.50      0.20      0.29        27
weighted avg       1.00      0.41      0.58        27

-----------------

Model: CatBoost (BorutaShap)
Accuracy: 0.3333333333333333
Precision: 1.0
Average Precision: 0.9999999999999998
AUROC: N/A (Only 1 class in y_true)
Sensitivity 0.3333333333333333
Specificity 0.0
F1-Score 0.5
Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.33      0.50        27

    accuracy      

## Feature Selection using BorutaShap
In this section, we select only the important features using BorutaShap algorithm

In [None]:
!pip install scikit-learn==1.1.3

In [None]:
!pip install catboost BorutaShap

In [2]:
from BorutaShap import BorutaShap

In [1]:
import sklearn
sklearn.__version__

'1.1.3'

In [182]:
sweet_selector = BorutaShap(importance_measure='shap', classification=True,percentile=100)
bitter_selector = BorutaShap(importance_measure='shap', classification=True,percentile=100)

In [198]:
X_train_sweet_tf.shape

(2183, 972)

In [199]:
X_train_bitter_tf.shape

(2229, 975)

In [183]:
sweet_selector.fit(
    X=X_train_sweet_tf,
    y=y_train_sweet['Target'],
    n_trials=500,
    sample=False,
    verbose=True,
    random_state=42
)

  0%|          | 0/500 [00:00<?, ?it/s]

234 attributes confirmed important: ['MPC5', 'AMW', 'ATSC1d', 'ATSC5are', 'GATS3se', 'VSA_EState1', 'BertzCT', 'MWC09', 'CIC4', 'TMPC10', 'GATS4se', 'AATSC4pe', 'MWC10', 'MWC04', 'ZMIC5', 'PEOE_VSA8', 'AATSC1v', 'Mp', 'FCSP3', 'ATS3m', 'ATSC0dv', 'MPC6', 'AXp-1d', 'SRW08', 'Xpc-5dv', 'ATSC6m', 'AATS1p', 'MATS3m', 'GATS2are', 'JGI3', 'CIC5', 'ATS5p', 'MATS2p', 'MATS3are', 'CIC1', 'AATSC2d', 'GATS1are', 'AATS3m', 'ATS4m', 'VSA_EState3', 'AATSC3dv', 'AATSC2se', 'MATS1se', 'SsssCH', 'GATS1m', 'GATS2i', 'GATS1p', 'Xp-6d', 'BIC3', 'Xp-5d', 'AATS3v', 'AATSC4se', 'GATS4pe', 'ATS5m', 'AATS2d', 'Xp-7d', 'AATSC1dv', 'AATS4Z', 'piPC1', 'ATSC5c', 'MATS4se', 'VSA_EState6', 'SlogP_VSA2', 'ATSC5d', 'AATS1m', 'ATSC6p', 'ATSC4dv', 'GATS2v', 'RotRatio', 'AXp-2d', 'ATSC4c', 'VSA_EState4', 'SMR_VSA1', 'GATS2pe', 'ATSC1i', 'AXp-2dv', 'IC5', 'SlogP_VSA3', 'AATS4v', 'AATS0Z', 'AATS4m', 'ATSC4v', 'AATSC4c', 'AATSC3v', 'nC', 'AATSC0i', 'AATSC1d', 'MIC0', 'SsOH', 'MZ', 'Xpc-4d', 'GATS2Z', 'ATSC3are', 'AATS0m', '

In [184]:
bitter_selector.fit(
    X=X_train_bitter_tf,
    y=y_train_bitter['Target'],
    n_trials=500,
    sample=False,
    verbose=True,
    random_state=42
)

  0%|          | 0/500 [00:00<?, ?it/s]

247 attributes confirmed important: ['AATS4pe', 'ATSC1d', 'MATS1i', 'VSA_EState1', 'ATS1p', 'PetitjeanIndex', 'CIC3', 'ATSC6dv', 'AATSC0c', 'BertzCT', 'CIC4', 'MWC09', 'MWC10', 'AATSC4pe', 'SsCH3', 'AATSC3se', 'ZMIC5', 'PEOE_VSA8', 'VSA_EState8', 'AATSC1v', 'Mp', 'FCSP3', 'ATS4are', 'AXp-1d', 'AXp-3d', 'Xpc-5dv', 'AATS1p', 'ATSC6m', 'Xc-5dv', 'MATS3m', 'CIC5', 'ATS5p', 'MATS2p', 'MATS3are', 'AATSC2d', 'Sp', 'ATS2i', 'VSA_EState7', 'VSA_EState3', 'GATS1m', 'Xpc-5d', 'ATSC0v', 'GATS1p', 'SaaN', 'AATS3v', 'ATSC2dv', 'AATSC0dv', 'AATSC4se', 'apol', 'AATS2d', 'ATSC0p', 'AATSC1dv', 'AATS4Z', 'AATSC1i', 'piPC1', 'VSA_EState6', 'VSA_EState9', 'MATS4se', 'SlogP_VSA2', 'ATSC5d', 'VSA_EState2', 'ATSC4dv', 'SMR_VSA6', 'AATS4se', 'ATSC6p', 'ATS5d', 'GATS2v', 'ATSC5v', 'RotRatio', 'ATSC4c', 'AXp-2d', 'VSA_EState4', 'SMR_VSA1', 'NaaN', 'ATSC1i', 'ATSC4se', 'SsssN', 'SlogP_VSA3', 'AATSC1m', 'ATSC5Z', 'ATSC0d', 'AATS4m', 'GATS4i', 'AATSC4c', 'ATS4se', 'ATS4pe', 'AATSC0i', 'MIC0', 'Xpc-4d', 'GGI3', 'MZ'

In [185]:
X_sweet_subset = sweet_selector.Subset()
X_bitter_subset = bitter_selector.Subset()

In [186]:
X_sweet_subset.shape

(2183, 234)

In [187]:
X_bitter_subset.shape

(2229, 247)

In [188]:
X_sweet_subset_cols = X_sweet_subset.columns
X_bitter_subset_cols = X_bitter_subset.columns

X_train_sweet_subset = X_train_sweet_tf[X_sweet_subset_cols]
X_test_sweet_subset = X_test_sweet_tf[X_sweet_subset_cols]

X_train_bitter_subset = X_train_bitter_tf[X_bitter_subset_cols]
X_test_bitter_phyto_subset = X_test_bitter_phyto_tf[X_bitter_subset_cols]
X_test_bitter_bitternew_subset = X_test_bitter_bitternew_tf[X_bitter_subset_cols]
X_test_bitter_unimi_subset = X_test_bitter_unimi_tf[X_bitter_subset_cols]

## Evaluation (With Feature Selection using BorutaShap)
In this section, we evaluate the models without applying feature selection yet

In [189]:
clf_xgboost_sweet_ba = XGBClassifier(
    colsample_bytree=0.3,
    learning_rate=0.01,
    max_depth=10,
    n_estimators=1000
)
clf_catboost_sweet_ba = CatBoostClassifier(
    depth=6,
    iterations=250, 
    learning_rate=0.2, 
    logging_level='Silent'
)
clf_rf_sweet_ba = RandomForestClassifier(
    bootstrap=False,
    max_depth=None,
    max_features='auto',
    min_samples_leaf=1,
    min_samples_split=10,
    n_estimators=400
)
sweet_models_full = [
    {'model': clf_xgboost_sweet_ba, 'name': 'XGBoost (BorutaShap)'},
    {'model': clf_catboost_sweet_ba, 'name': 'CatBoost (BorutaShap)'},
    {'model': clf_rf_sweet_ba, 'name': 'Random Forest (BorutaShap)'},
]

In [190]:
clf_xgboost_bitter_ba = XGBClassifier(
    colsample_bytree=0.3,
    learning_rate=0.01,
    max_depth=10,
    n_estimators=500
)
clf_catboost_bitter_ba = CatBoostClassifier(
    depth=10,
    iterations=350, 
    learning_rate=0.1, 
    logging_level='Silent'
)
clf_rf_bitter_ba = RandomForestClassifier(
    bootstrap=False,
    max_depth=100,
    max_features='auto',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=400
)

bitter_models_full = [
    {'model': clf_xgboost_bitter_ba, 'name': 'XGBoost (BorutaShap)'},
    {'model': clf_catboost_bitter_ba, 'name': 'CatBoost (BorutaShap)'},
    {'model': clf_rf_bitter_ba, 'name': 'Random Forest (BorutaShap)'},
]

### Sweet Classifier

In [191]:
evaluate_all(sweet_models_full, X_train_sweet_subset, y_train_sweet, X_test_sweet_subset, y_test_sweet)

Model: XGBoost (BorutaShap)
Accuracy: 0.75
Precision: 0.8764044943820225
Average Precision: 0.923719143259935
AUROC: 0.8320754716981131
Sensitivity 0.7358490566037735
Specificity 0.78
F1-Score 0.8
Classification Report
              precision    recall  f1-score   support

           0       0.58      0.78      0.67        50
           1       0.88      0.74      0.80       106

    accuracy                           0.75       156
   macro avg       0.73      0.76      0.73       156
weighted avg       0.78      0.75      0.76       156

-----------------

Model: CatBoost (BorutaShap)
Accuracy: 0.7435897435897436
Precision: 0.8666666666666667
Average Precision: 0.9247911506004936
AUROC: 0.831132075471698
Sensitivity 0.7358490566037735
Specificity 0.76
F1-Score 0.7959183673469387
Classification Report
              precision    recall  f1-score   support

           0       0.58      0.76      0.66        50
           1       0.87      0.74      0.80       106

    accuracy          

### Bitter Classifier

#### Phyto-Dictionary

In [192]:
evaluate_all(bitter_models_full, X_train_bitter_subset, y_train_bitter, X_test_bitter_phyto_subset, y_test_bitter_phyto)

Model: XGBoost (BorutaShap)
Accuracy: 0.8837209302325582
Precision: 0.9574468085106383
Average Precision: 0.9542256269344968
AUROC: 0.9448256146369354
Sensitivity 0.8490566037735849
Specificity 0.9393939393939394
F1-Score 0.9
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.94      0.86        33
           1       0.96      0.85      0.90        53

    accuracy                           0.88        86
   macro avg       0.88      0.89      0.88        86
weighted avg       0.90      0.88      0.89        86

-----------------

Model: CatBoost (BorutaShap)
Accuracy: 0.9069767441860465
Precision: 0.9591836734693877
Average Precision: 0.9708588250537533
AUROC: 0.9556889651229274
Sensitivity 0.8867924528301887
Specificity 0.9393939393939394
F1-Score 0.9215686274509803
Classification Report
              precision    recall  f1-score   support

           0       0.84      0.94      0.89        33
           1       0.96      0.89

#### UNIMI

In [193]:
evaluate_all(bitter_models_full, X_train_bitter_subset, y_train_bitter, X_test_bitter_unimi_subset, y_test_bitter_unimi)

Model: XGBoost (BorutaShap)
Accuracy: 0.7647058823529411
Precision: 0.875
Average Precision: 0.7493512822460191
AUROC: 0.8644781144781144
Sensitivity 0.3888888888888889
Specificity 0.9696969696969697
F1-Score 0.5384615384615385
Classification Report
              precision    recall  f1-score   support

           0       0.74      0.97      0.84        33
           1       0.88      0.39      0.54        18

    accuracy                           0.76        51
   macro avg       0.81      0.68      0.69        51
weighted avg       0.79      0.76      0.73        51

-----------------

Model: CatBoost (BorutaShap)
Accuracy: 0.7058823529411765
Precision: 0.6666666666666666
Average Precision: 0.6597369988760181
AUROC: 0.755050505050505
Sensitivity 0.3333333333333333
Specificity 0.9090909090909091
F1-Score 0.4444444444444444
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.91      0.80        33
           1       0.67      0.3

#### Bitter-New

In [194]:
evaluate_all(bitter_models_full, X_train_bitter_subset, y_train_bitter, X_test_bitter_bitternew_subset, y_test_bitter_bitternew)

Model: XGBoost (BorutaShap)
Accuracy: 0.4444444444444444
Precision: 1.0
Average Precision: 0.9999999999999998
AUROC: N/A (Only 1 class in y_true)
Sensitivity 0.4444444444444444
Specificity 0.0
F1-Score 0.6153846153846153
Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.44      0.62        27

    accuracy                           0.44        27
   macro avg       0.50      0.22      0.31        27
weighted avg       1.00      0.44      0.62        27

-----------------

Model: CatBoost (BorutaShap)
Accuracy: 0.4074074074074074
Precision: 1.0
Average Precision: 0.9999999999999998
AUROC: N/A (Only 1 class in y_true)
Sensitivity 0.4074074074074074
Specificity 0.0
F1-Score 0.5789473684210525
Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.41      0.58        27

   