In [1]:
# Importing packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pycaret.classification import *
import mlflow
import logging
import sys

In [2]:
# Configure the root logger so that messages are written both to the notebook output (stdout)
# and to the file 'logs.log' in the current working directory.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s')

# Re-running this cell would otherwise attach a second pair of handlers to the root logger, so every
# message would be emitted twice and the previous file handle would stay open. Remove (and close)
# only the handlers this cell installed earlier, recognised by name, so that handlers registered by
# other code are left untouched.
_notebook_handler_names = ('notebook-stdout', 'notebook-file')
for _stale_handler in [h for h in logger.handlers if h.get_name() in _notebook_handler_names]:
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

stdout_handler = logging.StreamHandler(sys.stdout)
stdout_handler.set_name('notebook-stdout')
stdout_handler.setLevel(logging.DEBUG)
stdout_handler.setFormatter(formatter)

file_handler = logging.FileHandler('logs.log')
file_handler.set_name('notebook-file')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)


logger.addHandler(file_handler)
logger.addHandler(stdout_handler)

In [3]:
# Show matplotlib figures inline in the notebook output
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # request the 'retina' format for inline figures

In [4]:
# Project folders, all resolved relative to the parent of the current working directory
fasta_folder, interim_folder, models_folder, processed_folder, img_folder = (
    os.path.join(os.getcwd(), os.pardir, *sub_path)
    for sub_path in (
        ('data', 'raw-data', 'fasta'),   # raw FASTA files
        ('data', 'interim'),             # intermediate data
        ('models',),                     # saved models
        ('data', 'processed'),           # processed data / results
        ('reports', 'figures'),          # figures
    )
)

In [5]:
# Names of the organism datasets processed by this notebook (order is preserved when iterating)
dataset_names = (
    'Arabidopsis_non_tata',
    'Arabidopsis_tata',
    'Bacillus',
    'Ecoli',
    'Human_non_tata',
    'Mouse_non_tata',
    'Mouse_tata',
)

In [6]:
# Utility functions

def get_organism_props_dataframes(organism_path: str, extension: str = 'feather'):
    """Load the per-property dataframes stored in an organism folder.

    Property files are looked up as ``df_data_prop_<idx>.<extension>`` inside ``organism_path``.
    Indices from 0 up to the highest index present (for the requested extension) are visited in
    order; a missing index is reported on stdout and skipped, so files after a gap are still loaded.

    Args:
        organism_path: Folder containing the property files.
        extension: File format to read, either ``'feather'`` or ``'csv'``.

    Returns:
        List of dataframes, ordered by property index (missing indices are skipped).

    Raises:
        ValueError: If ``extension`` is neither ``'feather'`` nor ``'csv'``.
    """
    # Feather is faster to read; CSV is more compatible with other libraries.
    readers = {
        'feather': pd.read_feather,
        'csv': lambda path: pd.read_csv(path, index_col=False),
    }
    # Validate up front so an unsupported extension fails even when no matching file exists.
    if extension not in readers:
        raise ValueError('Extension not supported.')
    read_prop_file = readers[extension]

    # Collect the property indices available for this extension only, so that files in another
    # format (or unrelated files sharing the prefix) do not distort the range of indices visited.
    prefix, suffix = 'df_data_prop_', f'.{extension}'
    found_indices = set()
    for file in os.listdir(organism_path):
        if file.startswith(prefix) and file.endswith(suffix):
            idx_text = file[len(prefix):-len(suffix)]
            if idx_text.isdecimal():
                found_indices.add(int(idx_text))

    if not found_indices:
        print(f'No {extension} property files found in {organism_path}.')
        return []

    # Get dataframes for each property
    props = []
    for prop_idx in range(max(found_indices) + 1):
        prop_file = f'{organism_path}/df_data_prop_{prop_idx}.{extension}'  # Get property file path
        if os.path.exists(prop_file):  # Check if property file exists
            props.append(read_prop_file(prop_file))  # Append dataframe to list
        else:
            print(f'Property {prop_idx} {extension} file not found.')

    return props

def get_bp_positions(seq_len: int, step: int = 10):
    """Return the base-pair positions (upstream, TSS, downstream) for a sequence length.

    Args:
        seq_len: Sequence length; only 79, 80, 249 and 250 are supported.
        step: Spacing between consecutive positions.

    Returns:
        ``np.arange(start, stop, step)`` for the bounds associated with ``seq_len``.

    Raises:
        ValueError: If ``seq_len`` is not one of the supported lengths.
    """
    # (start, stop) bounds for every supported sequence length
    bounds_by_length = {
        80: (-60, 20),
        79: (-60, 19),
        250: (-200, 50),
        249: (-200, 49),
    }
    if seq_len not in bounds_by_length:
        raise ValueError('Sequence length not supported.')

    start, stop = bounds_by_length[seq_len]
    return np.arange(start, stop, step)

def get_props_names(kmer_type: str = 'dinuc'):
    """Return the property names listed in the reference table of a k-mer type.

    The names are the values of the first column (used as the index) of the tab-separated file
    ``original-<kmer_type>.tsv``, located relative to the parent of the current working directory.

    Args:
        kmer_type: K-mer type used to select the reference file (e.g. ``'dinuc'``).

    Returns:
        List with the index labels of the reference table.
    """
    reference_tsv = os.path.join(
        os.getcwd(), os.pardir, 'data', 'raw-data',
        'physicochemical-properties-reference', f'original-{kmer_type}.tsv',
    )
    reference_table = pd.read_csv(reference_tsv, sep='\t', index_col=0)
    return reference_table.index.to_list()

In [7]:
# K-mer types handled in this notebook
kmer_types = ('dinuc', 'trinuc')

# Property names of each k-mer type, and how many properties each type has
props_names = {kmer: get_props_names(kmer_type=kmer) for kmer in kmer_types}
props_num = {kmer: len(names) for kmer, names in props_names.items()}

In [8]:
# Show the number and names of the properties of each k-mer type, separated by a blank line
print(
    f"Dinucleotide properties ({props_num['dinuc']}):\n {props_names['dinuc']}",
    f"Trinucleotide properties ({props_num['trinuc']}):\n {props_names['trinuc']}",
    sep='\n\n',
)

Dinucleotide properties (38):
 ['Base stacking', 'Protein induced deformability', 'B-DNA twist', 'Dinucleotide GC Content', 'A-philicity', 'Propeller twist', 'Duplex stability(free energy)', 'Duplex tability(disrupt energy)', 'DNA denaturation', 'Bending stiffness', 'Protein DNA twist', 'Stabilising energy of Z-DNA', 'Aida_BA_transition', 'Breslauer_dG', 'Breslauer_dH', 'Breslauer_dS', 'Electron_interaction', 'Hartman_trans_free_energy', 'Helix-Coil_transition', 'Ivanov_BA_transition', 'Lisser_BZ_transition', 'Polar_interaction', 'SantaLucia_dG', 'SantaLucia_dH', 'SantaLucia_dS', 'Sarai_flexibility', 'Stability', 'Stacking_energy', 'Sugimoto_dG', 'Sugimoto_dH', 'Sugimoto_dS', 'Watson-Crick_interaction', 'Twist', 'Tilt', 'Roll', 'Shift', 'Slide', 'Rise']

Trinucleotide properties (12):
 ['Bendability (DNAse) ', 'Bendability (consensus)', 'Trinucleotide GC Content', 'Nucleosome positioning', 'Consensus_roll', 'Consensus_Rigid', 'Dnase I', 'Dnase I-Rigid', 'MW-Daltons', 'MW-kg', 'Nucleoso

Load the feature dataframes for every organism and every k-mer type ('dinuc' and 'trinuc') and store them in a
 nested dictionary, `features[organism][kmer_type]`, whose values are lists with one dataframe per property

In [9]:
# Build features[organism][kmer_type]: the list of per-property dataframes read from
# <interim_folder>/<kmer_type>/<organism>-original
features = {name: dict.fromkeys(('dinuc', 'trinuc')) for name in dataset_names}   # every slot starts as None
for dataset_name in dataset_names:
    for kmer_type in kmer_types:
        feature_folder = os.path.join(interim_folder, kmer_type, f'{dataset_name}-original')
        features[dataset_name].update({kmer_type: get_organism_props_dataframes(feature_folder)})
        # Report how many property dataframes were loaded for this organism / k-mer type
        print(f'{dataset_name} - {kmer_type} - {len(features[dataset_name][kmer_type])}')

Arabidopsis_non_tata - dinuc - 38
Arabidopsis_non_tata - trinuc - 12
Arabidopsis_tata - dinuc - 38
Arabidopsis_tata - trinuc - 12
Bacillus - dinuc - 38
Bacillus - trinuc - 12
Ecoli - dinuc - 38
Ecoli - trinuc - 12
Human_non_tata - dinuc - 38
Human_non_tata - trinuc - 12
Mouse_non_tata - dinuc - 38
Mouse_non_tata - trinuc - 12
Mouse_tata - dinuc - 38
Mouse_tata - trinuc - 12


# Experiment to collect data for all properties and all organisms using PyCaret models
The experiments are run with PyCaret and tracked with MLflow: runs are logged to the MLflow server
(http://localhost:5000) and artifacts are stored in the local folder (mlruns).

In [None]:
# Run one PyCaret experiment per (organism, k-mer type, property): compare models, tune the best one,
# save it, and record the outcome both in MLflow and in a results CSV.

# Column layout of the results CSV. Every appended row is reindexed to exactly this layout so that the
# values always line up with the header written below.
result_columns = ['Organism', 'ModelName', 'ModelID', 'K-mer', 'PropertyIdx', 'PropertyName',
                  'Accuracy_avg', 'AUC_avg', 'F1_avg', 'Precision_avg', 'Recall_avg', 'Kappa_avg',
                  'MCC_avg', 'MCC_max', 'MCC_min', 'TT (s)']

iter_idx = 0
results_df = pd.DataFrame(columns=result_columns)

# Make sure the output folders exist before anything is written to them
os.makedirs(processed_folder, exist_ok=True)
os.makedirs(models_folder, exist_ok=True)

results_path = os.path.join(processed_folder, 'exp-results.csv')
results_df.to_csv(results_path, index=False, mode='w')  # (re)create the file with only the header row

for dataset_idx, dataset_name in enumerate(dataset_names):
    log_msg = f'> Organism {dataset_idx} - {dataset_name}'
    logger.info(log_msg)

    for k_mer_type in kmer_types:
        for prop_idx, prop_name in enumerate(props_names[k_mer_type]):
            # Get dataframe with features for a given property and organism (dataset) and kmer type (dinuc or trinuc)
            input_features = features[dataset_name][k_mer_type][prop_idx]

            # One MLflow experiment per organism; each property is identified by the run name set below
            exp_name = f'{dataset_name}'

            logger.info(f'>> ({iter_idx}) Starting experiment: {exp_name}...')

            iter_idx += 1 # increment experiment index

            # Initialize the PyCaret experiment with MLflow logging enabled (log_experiment=True).
            # NOTE(review): no tracking URI is set in this cell, so MLflow's configured/default tracking
            # location is used - confirm it points to the intended server (http://localhost:5000).
            exp = setup(data=input_features, target='y', session_id=123, verbose=False, log_experiment=True,
                        experiment_name=exp_name, html=False, log_plots=True, fold_strategy='stratifiedkfold',
                        fold=5, preprocess=False) # log_profile=True, profile=True, profile_kwargs={'silent': True}

            # Set experiment tags for MLFlow ('Dataset' must be the plain name, not a one-element set)
            tags = {'Dataset': dataset_name, 'kmer': k_mer_type, 'prop_idx': prop_idx, 'prop_name': prop_name}
            mlflow.set_tags(tags) # set experiment tags for MLFlow
            mlflow.set_tag("mlflow.runName", f"{k_mer_type}-{prop_idx}") # set run name

            # Not used below; kept because it is a notebook-level variable that later cells may read
            models_types_df = models() # get models dataframe (PyCaret) with model types and names

            # compare models and select the best (PyCaret), ranked by MCC
            best_model = compare_models(
                sort='MCC',
                experiment_custom_tags={'prop': f'{k_mer_type}_{prop_idx}'}
            )

            metrics_df = pull() # get metrics dataframe of the model comparison
            best_row = metrics_df.iloc[0] # first row = top-ranked model
            model_idx = best_row.name # get index (model ID) of best model
            model_name = best_row['Model'] # get name of best model
            model_mcc_avg = best_row['MCC'] # get average MCC of best model
            # Highest / lowest MCC over ALL compared models (not only the best one)
            model_mcc_max = metrics_df.loc[:,'MCC'].max()
            model_mcc_min = metrics_df.loc[:,'MCC'].min()

            logger.info(f'Best model: {model_name} - {model_idx} - MCC AVG: {model_mcc_avg}')

            # tune model (PyCaret) - optimize MCC with 5-fold CV and 10 Optuna (TPE) iterations
            tuned_model = tune_model(
                best_model,
                optimize='MCC',
                fold=5,
                n_iter=10,
                search_library='optuna',
                search_algorithm='tpe',
                verbose=False,
                tuner_verbose=False,
            )

            tunned_metrics_df = pull() # get metrics dataframe for tunned model
            # Assumes the last two rows of the tuning grid are the across-fold summary rows (mean, then
            # standard deviation) and every row before them is one fold - TODO confirm for the installed
            # PyCaret version.
            tunned_mcc_avg = tunned_metrics_df.iloc[-2,:]['MCC'] # MCC in the second-to-last (mean) row
            tunned_mcc_max = tunned_metrics_df.iloc[:-2,:]['MCC'].max() # max MCC over the fold rows
            tunned_mcc_min = tunned_metrics_df.iloc[:-2,:]['MCC'].min() # min MCC over the fold rows

            print(f'Tunned model Average MCC: {tunned_mcc_avg}')

            tags = {'Model Name': model_name, 'Model ID': model_idx, 'Prop Name': prop_name}
            mlflow.set_tags(tags) # set model tags for MLFlow

            mlflow.log_metric('MCC-avg', tunned_mcc_avg) # log tunned avg MCC
            mlflow.log_metric('MCC-max', tunned_mcc_max) # log tunned max MCC
            mlflow.log_metric('MCC-min', tunned_mcc_min) # log tunned min MCC

            # Save best tuned model; the file name carries the organism, property and tuned MCC
            tunned_model_path = f'{exp_name}-{k_mer_type}-{prop_idx}-{model_idx}-tunned-{str(tunned_mcc_avg)}'
            tunned_model_path = os.path.join(models_folder, tunned_model_path)
            save_model(tuned_model, tunned_model_path) # save best tuned model (PyCaret)
            mlflow.log_artifact(f'{tunned_model_path}.pkl') # log the saved model file as an MLflow artifact

            # Append one row to the results CSV using the header's column layout. The MCC values are the
            # ones from the model comparison (before tuning), as in the values previously written.
            # Metrics are read with .get() so a column missing from the comparison grid yields an empty
            # cell instead of a KeyError; 'TT (s)' falls back to 0 when no training-time column exists.
            row_df = pd.DataFrame([{
                'Organism': dataset_name,
                'ModelName': model_name,
                'ModelID': model_idx,
                'K-mer': k_mer_type,
                'PropertyIdx': prop_idx,
                'PropertyName': prop_name,
                'Accuracy_avg': best_row.get('Accuracy'),
                'AUC_avg': best_row.get('AUC'),
                'F1_avg': best_row.get('F1'),
                'Precision_avg': best_row.get('Prec.'),
                'Recall_avg': best_row.get('Recall'),
                'Kappa_avg': best_row.get('Kappa'),
                'MCC_avg': model_mcc_avg,
                'MCC_max': model_mcc_max,
                'MCC_min': model_mcc_min,
                'TT (s)': best_row.get('TT (Sec)', 0),
            }]).reindex(columns=result_columns)
            row_df.to_csv(results_path, mode='a', header=False, index=False) # save results to CSV file

            logger.info(f'>> Ending experiment: {exp_name}...')

    log_msg = f'> Ending organism {dataset_idx} - {dataset_name}...'
    logger.info(log_msg)

2023-05-19 01:18:05,025 | INFO | > Organism 0 - Arabidopsis_non_tata
2023-05-19 01:18:05,025 | INFO | >> (0) Starting experiment: Arabidopsis_non_tata...


2023/05/19 01:18:05 INFO mlflow.tracking.fluent: Experiment with name 'Arabidopsis_non_tata' does not exist. Creating a new experiment.
                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8888  0.9479  0.7830  0.8770   
gbc          Gradient Boosting Classifier    0.8611  0.9258  0.7058  0.8608   
ada                  Ada Boost Classifier    0.8319  0.8953  0.7116  0.7759   
lr                    Logistic Regression    0.8099  0.8653  0.6526  0.7554   
lda          Linear Discriminant Analysis    0.8103  0.8654  0.6475  0.7592   
ridge                    Ridge Classifier    0.8105  0.0000  0.6371  0.7664   
qda       Quadratic Discriminant Analysis    0.8100  0.8816  0.5805  0.8064   
nb                            Naive Bayes    0.7919  0.8456  0.6886  0.6961   
rf               Random Forest Classifier    0.7847  0.8983  0.3987  0.9268   
svm                   SVM - Linear Kernel    0.7777  0.0000  0.5245  0.7973   
dt               Decision Tree Classifier    0.7289  0.7030  0.6223  0.5976   
et                 Extra Trees Classifier    0.7346 



Tunned model Average MCC: 0.7626
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:19:15,218 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:19:15,219 | INFO | >> (1) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8785  0.9436  0.8055  0.8323   
gbc          Gradient Boosting Classifier    0.8523  0.9212  0.7781  0.7861   
ada                  Ada Boost Classifier    0.8405  0.9003  0.7469  0.7757   
rf               Random Forest Classifier    0.8355  0.8976  0.6528  0.8273   
lda          Linear Discriminant Analysis    0.8082  0.8715  0.7084  0.7223   
ridge                    Ridge Classifier    0.8077  0.0000  0.6988  0.7258   
lr                    Logistic Regression    0.8078  0.8717  0.6867  0.7317   
nb                            Naive Bayes    0.7920  0.8552  0.7590  0.6720   
et                 Extra Trees Classifier    0.7949  0.8897  0.4612  0.8775   
qda       Quadratic Discriminant Analysis    0.7708  0.8257  0.5059  0.7379   
dt               Decision Tree Classifier    0.7461  0.7215  0.6446  0.6224   
svm                   SVM - Linear Kernel    0.7518 



Tunned model Average MCC: 0.7439
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:20:13,764 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:20:13,764 | INFO | >> (2) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8739  0.9332  0.7721  0.8440   
gbc          Gradient Boosting Classifier    0.8501  0.9122  0.7680  0.7865   
ada                  Ada Boost Classifier    0.8177  0.8776  0.6823  0.7579   
dt               Decision Tree Classifier    0.7214  0.6917  0.5988  0.5889   
rf               Random Forest Classifier    0.7143  0.8483  0.1735  0.9275   
nb                            Naive Bayes    0.6765  0.6624  0.2903  0.5463   
et                 Extra Trees Classifier    0.6782  0.8205  0.0569  0.9462   
qda       Quadratic Discriminant Analysis    0.6641  0.6410  0.2853  0.5113   
lda          Linear Discriminant Analysis    0.6523  0.5955  0.1539  0.4659   
ridge                    Ridge Classifier    0.6549  0.0000  0.1338  0.4729   
lr                    Logistic Regression    0.6531  0.5628  0.1096  0.4570   
svm                   SVM - Linear Kernel    0.5410 



Tunned model Average MCC: 0.7531
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:21:39,012 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:21:39,012 | INFO | >> (3) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8238  0.8882  0.6276  0.8121   
gbc          Gradient Boosting Classifier    0.8055  0.8748  0.5584  0.8112   
qda       Quadratic Discriminant Analysis    0.8023  0.8635  0.5558  0.8022   
nb                            Naive Bayes    0.7797  0.8376  0.6964  0.6692   
lda          Linear Discriminant Analysis    0.7873  0.8393  0.5952  0.7297   
ridge                    Ridge Classifier    0.7877  0.0000  0.5809  0.7391   
lr                    Logistic Regression    0.7866  0.8395  0.5996  0.7253   
ada                  Ada Boost Classifier    0.7749  0.8205  0.5841  0.7038   
svm                   SVM - Linear Kernel    0.7527  0.0000  0.6661  0.6480   
rf               Random Forest Classifier    0.7612  0.8528  0.3610  0.8517   
et                 Extra Trees Classifier    0.7570  0.8530  0.3409  0.8606   
knn                K Neighbors Classifier    0.7425 



Tunned model Average MCC: 0.6318
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:23:03,280 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:23:03,281 | INFO | >> (4) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8759  0.9387  0.7723  0.8493   
gbc          Gradient Boosting Classifier    0.8486  0.9124  0.7116  0.8196   
ada                  Ada Boost Classifier    0.8177  0.8789  0.6729  0.7630   
rf               Random Forest Classifier    0.7685  0.8527  0.3719  0.8765   
dt               Decision Tree Classifier    0.7242  0.6990  0.6201  0.5899   
nb                            Naive Bayes    0.7257  0.7658  0.5449  0.6075   
lda          Linear Discriminant Analysis    0.7137  0.7411  0.4346  0.6111   
lr                    Logistic Regression    0.7134  0.7416  0.4360  0.6100   
ridge                    Ridge Classifier    0.7139  0.0000  0.4150  0.6181   
svm                   SVM - Linear Kernel    0.6867  0.0000  0.4958  0.5910   
qda       Quadratic Discriminant Analysis    0.7005  0.7048  0.3530  0.6017   
et                 Extra Trees Classifier    0.6917 



Tunned model Average MCC: 0.7256
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:24:46,700 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:24:46,700 | INFO | >> (5) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8838  0.9434  0.7985  0.8509   
gbc          Gradient Boosting Classifier    0.8673  0.9283  0.7776  0.8227   
ada                  Ada Boost Classifier    0.8349  0.8978  0.7360  0.7687   
lr                    Logistic Regression    0.8188  0.8840  0.6932  0.7541   
lda          Linear Discriminant Analysis    0.8179  0.8840  0.7026  0.7469   
nb                            Naive Bayes    0.8144  0.8793  0.7360  0.7232   
ridge                    Ridge Classifier    0.8180  0.0000  0.6913  0.7532   
rf               Random Forest Classifier    0.8148  0.9076  0.5178  0.8927   
qda       Quadratic Discriminant Analysis    0.7968  0.8649  0.5492  0.7897   
svm                   SVM - Linear Kernel    0.7414  0.0000  0.7377  0.6236   
et                 Extra Trees Classifier    0.7650  0.8981  0.3448  0.9060   
dt               Decision Tree Classifier    0.7336 



Tunned model Average MCC: 0.7471
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:26:02,790 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:26:02,791 | INFO | >> (6) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8913  0.9515  0.7793  0.8874   
gbc          Gradient Boosting Classifier    0.8665  0.9331  0.7189  0.8661   
ada                  Ada Boost Classifier    0.8327  0.8962  0.7162  0.7751   
rf               Random Forest Classifier    0.7971  0.9079  0.4454  0.9139   
nb                            Naive Bayes    0.7703  0.8291  0.6813  0.6562   
lr                    Logistic Regression    0.7768  0.8223  0.5804  0.7105   
lda          Linear Discriminant Analysis    0.7772  0.8221  0.5749  0.7143   
ridge                    Ridge Classifier    0.7766  0.0000  0.5630  0.7192   
dt               Decision Tree Classifier    0.7448  0.7178  0.6332  0.6230   
qda       Quadratic Discriminant Analysis    0.7578  0.7956  0.4936  0.7062   
et                 Extra Trees Classifier    0.7477  0.8925  0.2756  0.9407   
svm                   SVM - Linear Kernel    0.7120 



Tunned model Average MCC: 0.7692
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:27:46,969 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:27:46,970 | INFO | >> (7) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8581  0.9244  0.7101  0.8478   
gbc          Gradient Boosting Classifier    0.8139  0.8913  0.5848  0.8160   
ada                  Ada Boost Classifier    0.7844  0.8517  0.6194  0.7103   
rf               Random Forest Classifier    0.7397  0.8538  0.2724  0.8781   
lr                    Logistic Regression    0.7230  0.7495  0.4481  0.6304   
lda          Linear Discriminant Analysis    0.7226  0.7486  0.4433  0.6313   
ridge                    Ridge Classifier    0.7216  0.0000  0.4239  0.6360   
dt               Decision Tree Classifier    0.6970  0.6637  0.5596  0.5541   
nb                            Naive Bayes    0.6886  0.7279  0.5734  0.5399   
et                 Extra Trees Classifier    0.7050  0.8307  0.1461  0.9157   
svm                   SVM - Linear Kernel    0.5971  0.0000  0.7245  0.4941   
qda       Quadratic Discriminant Analysis    0.6914 



Tunned model Average MCC: 0.7028
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:29:30,528 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:29:30,528 | INFO | >> (8) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8957  0.9531  0.8188  0.8673   
gbc          Gradient Boosting Classifier    0.8642  0.9349  0.7561  0.8297   
ada                  Ada Boost Classifier    0.8493  0.9129  0.7430  0.8002   
rf               Random Forest Classifier    0.8316  0.9014  0.5836  0.8814   
ridge                    Ridge Classifier    0.8008  0.0000  0.6252  0.7479   
lda          Linear Discriminant Analysis    0.7995  0.8594  0.6354  0.7387   
nb                            Naive Bayes    0.7922  0.8591  0.6961  0.6940   
qda       Quadratic Discriminant Analysis    0.7994  0.8742  0.5405  0.8054   
lr                    Logistic Regression    0.7929  0.8523  0.6293  0.7256   
dt               Decision Tree Classifier    0.7445  0.7189  0.6388  0.6210   
et                 Extra Trees Classifier    0.7387  0.8786  0.2485  0.9370   
svm                   SVM - Linear Kernel    0.6983 



Tunned model Average MCC: 0.7851
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:30:44,366 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:30:44,366 | INFO | >> (9) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8493  0.9203  0.6821  0.8455   
gbc          Gradient Boosting Classifier    0.8227  0.9035  0.5875  0.8439   
ada                  Ada Boost Classifier    0.7943  0.8518  0.6349  0.7258   
lda          Linear Discriminant Analysis    0.7747  0.8218  0.5700  0.7101   
lr                    Logistic Regression    0.7737  0.8215  0.5734  0.7059   
ridge                    Ridge Classifier    0.7736  0.0000  0.5565  0.7146   
rf               Random Forest Classifier    0.7644  0.8592  0.3617  0.8700   
nb                            Naive Bayes    0.7492  0.8123  0.6676  0.6223   
qda       Quadratic Discriminant Analysis    0.7659  0.8184  0.5086  0.7210   
et                 Extra Trees Classifier    0.7439  0.8642  0.2768  0.9033   
svm                   SVM - Linear Kernel    0.7191  0.0000  0.6155  0.6174   
knn                K Neighbors Classifier    0.6792 



Tunned model Average MCC: 0.695
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:32:00,236 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:32:00,237 | INFO | >> (10) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8860  0.9459  0.8045  0.8523   
gbc          Gradient Boosting Classifier    0.8614  0.9250  0.7774  0.8083   
ada                  Ada Boost Classifier    0.8534  0.9139  0.7561  0.8019   
rf               Random Forest Classifier    0.8360  0.8976  0.6230  0.8554   
nb                            Naive Bayes    0.8026  0.8680  0.7750  0.6856   
lda          Linear Discriminant Analysis    0.7891  0.8461  0.6598  0.7024   
ridge                    Ridge Classifier    0.7890  0.0000  0.6463  0.7082   
et                 Extra Trees Classifier    0.7666  0.8818  0.3499  0.9061   
lr                    Logistic Regression    0.7531  0.8058  0.6097  0.6454   
dt               Decision Tree Classifier    0.7369  0.7108  0.6291  0.6099   
qda       Quadratic Discriminant Analysis    0.7299  0.7799  0.4179  0.6637   
svm                   SVM - Linear Kernel    0.6911 



Tunned model Average MCC: 0.7687
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:33:19,779 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:33:19,779 | INFO | >> (11) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8813  0.9424  0.7934  0.8479   
gbc          Gradient Boosting Classifier    0.8609  0.9234  0.7856  0.8018   
nb                            Naive Bayes    0.8438  0.9048  0.7612  0.7755   
ada                  Ada Boost Classifier    0.8305  0.8945  0.7418  0.7553   
rf               Random Forest Classifier    0.8300  0.9002  0.5802  0.8792   
lr                    Logistic Regression    0.8248  0.8886  0.7051  0.7620   
lda          Linear Discriminant Analysis    0.8239  0.8890  0.7113  0.7564   
ridge                    Ridge Classifier    0.8240  0.0000  0.7031  0.7611   
svm                   SVM - Linear Kernel    0.7873  0.0000  0.7568  0.6913   
qda       Quadratic Discriminant Analysis    0.7856  0.8557  0.5047  0.7887   
et                 Extra Trees Classifier    0.7783  0.8932  0.3881  0.9069   
dt               Decision Tree Classifier    0.7313 



Tunned model Average MCC: 0.7661
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:34:51,011 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:34:51,011 | INFO | >> (12) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8841  0.9424  0.7929  0.8560   
gbc          Gradient Boosting Classifier    0.8563  0.9175  0.7404  0.8202   
ada                  Ada Boost Classifier    0.8395  0.8961  0.7280  0.7848   
rf               Random Forest Classifier    0.7751  0.9009  0.3641  0.9350   
dt               Decision Tree Classifier    0.7531  0.7264  0.6426  0.6357   
nb                            Naive Bayes    0.7505  0.7941  0.5575  0.6572   
lr                    Logistic Regression    0.7194  0.7351  0.4329  0.6269   
lda          Linear Discriminant Analysis    0.7183  0.7346  0.4270  0.6259   
ridge                    Ridge Classifier    0.7188  0.0000  0.4096  0.6342   
svm                   SVM - Linear Kernel    0.6988  0.0000  0.4418  0.5835   
et                 Extra Trees Classifier    0.6937  0.8644  0.1031  0.9638   
qda       Quadratic Discriminant Analysis    0.6654 



Tunned model Average MCC: 0.7388
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:36:36,698 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:36:36,698 | INFO | >> (13) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8609  0.9292  0.7193  0.8488   
gbc          Gradient Boosting Classifier    0.8218  0.8970  0.6049  0.8244   
ada                  Ada Boost Classifier    0.7967  0.8590  0.6460  0.7263   
rf               Random Forest Classifier    0.7615  0.8809  0.3324  0.9082   
dt               Decision Tree Classifier    0.7117  0.6800  0.5809  0.5755   
et                 Extra Trees Classifier    0.7249  0.8764  0.2045  0.9387   
lr                    Logistic Regression    0.7187  0.7465  0.4375  0.6230   
nb                            Naive Bayes    0.6965  0.7352  0.5807  0.5513   
lda          Linear Discriminant Analysis    0.7180  0.7451  0.4336  0.6225   
ridge                    Ridge Classifier    0.7183  0.0000  0.4154  0.6299   
svm                   SVM - Linear Kernel    0.6582  0.0000  0.4310  0.6379   
qda       Quadratic Discriminant Analysis    0.6798 



Tunned model Average MCC: 0.6747
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:38:00,447 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:38:00,447 | INFO | >> (14) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8878  0.9458  0.8072  0.8552   
gbc          Gradient Boosting Classifier    0.8654  0.9278  0.7793  0.8167   
ada                  Ada Boost Classifier    0.8342  0.8965  0.7189  0.7772   
rf               Random Forest Classifier    0.8047  0.9037  0.4684  0.9167   
nb                            Naive Bayes    0.8075  0.8586  0.6218  0.7678   
dt               Decision Tree Classifier    0.7464  0.7190  0.6334  0.6257   
lr                    Logistic Regression    0.7528  0.7818  0.5178  0.6792   
lda          Linear Discriminant Analysis    0.7518  0.7824  0.5137  0.6786   
ridge                    Ridge Classifier    0.7509  0.0000  0.4977  0.6838   
et                 Extra Trees Classifier    0.7158  0.8739  0.1735  0.9496   
qda       Quadratic Discriminant Analysis    0.7017  0.7033  0.3542  0.6050   
svm                   SVM - Linear Kernel    0.6184 



Tunned model Average MCC: 0.0
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:39:21,072 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:39:21,072 | INFO | >> (15) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8806  0.9396  0.8014  0.8404   
gbc          Gradient Boosting Classifier    0.8605  0.9222  0.7767  0.8063   
nb                            Naive Bayes    0.8362  0.8914  0.6947  0.7977   
ada                  Ada Boost Classifier    0.8237  0.8895  0.7080  0.7580   
rf               Random Forest Classifier    0.8050  0.8950  0.4832  0.8954   
ridge                    Ridge Classifier    0.8045  0.0000  0.6383  0.7499   
lda          Linear Discriminant Analysis    0.8036  0.8580  0.6475  0.7423   
lr                    Logistic Regression    0.8033  0.8568  0.6463  0.7425   
dt               Decision Tree Classifier    0.7427  0.7186  0.6431  0.6169   
qda       Quadratic Discriminant Analysis    0.7379  0.7723  0.4232  0.6860   
et                 Extra Trees Classifier    0.7308  0.8729  0.2240  0.9352   
svm                   SVM - Linear Kernel    0.6840 



Tunned model Average MCC: 0.6862
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:41:14,320 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:41:14,320 | INFO | >> (16) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8238  0.8882  0.6276  0.8121   
gbc          Gradient Boosting Classifier    0.8055  0.8748  0.5584  0.8112   
qda       Quadratic Discriminant Analysis    0.8022  0.8635  0.5555  0.8021   
nb                            Naive Bayes    0.7797  0.8376  0.6964  0.6692   
lda          Linear Discriminant Analysis    0.7873  0.8393  0.5952  0.7297   
ridge                    Ridge Classifier    0.7872  0.0000  0.5814  0.7374   
lr                    Logistic Regression    0.7851  0.8370  0.5962  0.7232   
ada                  Ada Boost Classifier    0.7749  0.8205  0.5841  0.7038   
svm                   SVM - Linear Kernel    0.7341  0.0000  0.6475  0.6496   
rf               Random Forest Classifier    0.7612  0.8528  0.3610  0.8517   
et                 Extra Trees Classifier    0.7570  0.8530  0.3409  0.8606   
knn                K Neighbors Classifier    0.7416 



Tunned model Average MCC: 0.577
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:42:38,435 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:42:38,435 | INFO | >> (17) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8887  0.9490  0.8050  0.8590   
nb                            Naive Bayes    0.8825  0.9387  0.8035  0.8438   
gbc          Gradient Boosting Classifier    0.8592  0.9292  0.7750  0.8042   
ada                  Ada Boost Classifier    0.8536  0.9154  0.7689  0.7942   
rf               Random Forest Classifier    0.8512  0.9011  0.6755  0.8569   
lda          Linear Discriminant Analysis    0.8418  0.9042  0.7310  0.7885   
ridge                    Ridge Classifier    0.8420  0.0000  0.7230  0.7943   
lr                    Logistic Regression    0.8389  0.9040  0.7271  0.7838   
qda       Quadratic Discriminant Analysis    0.8287  0.9015  0.6056  0.8467   
et                 Extra Trees Classifier    0.7804  0.9024  0.3835  0.9292   
svm                   SVM - Linear Kernel    0.7389  0.0000  0.6764  0.7290   
dt               Decision Tree Classifier    0.7443 



Tunned model Average MCC: 0.7936
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:44:04,809 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:44:04,810 | INFO | >> (18) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8609  0.9292  0.7193  0.8488   
gbc          Gradient Boosting Classifier    0.8218  0.8970  0.6049  0.8244   
ada                  Ada Boost Classifier    0.7967  0.8590  0.6460  0.7263   
rf               Random Forest Classifier    0.7615  0.8809  0.3324  0.9082   
dt               Decision Tree Classifier    0.7117  0.6800  0.5809  0.5755   
et                 Extra Trees Classifier    0.7246  0.8672  0.2013  0.9477   
lr                    Logistic Regression    0.7187  0.7469  0.4389  0.6225   
nb                            Naive Bayes    0.6955  0.7341  0.5843  0.5495   
lda          Linear Discriminant Analysis    0.7178  0.7455  0.4319  0.6223   
ridge                    Ridge Classifier    0.7184  0.0000  0.4157  0.6304   
qda       Quadratic Discriminant Analysis    0.6806  0.6759  0.3656  0.5464   
svm                   SVM - Linear Kernel    0.6826 



Tunned model Average MCC: 0.503
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:45:54,067 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:45:54,067 | INFO | >> (19) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8759  0.9387  0.7723  0.8493   
gbc          Gradient Boosting Classifier    0.8486  0.9124  0.7116  0.8196   
ada                  Ada Boost Classifier    0.8177  0.8789  0.6729  0.7630   
rf               Random Forest Classifier    0.7685  0.8527  0.3719  0.8765   
dt               Decision Tree Classifier    0.7242  0.6990  0.6201  0.5899   
nb                            Naive Bayes    0.7257  0.7658  0.5449  0.6075   
lda          Linear Discriminant Analysis    0.7137  0.7411  0.4346  0.6111   
lr                    Logistic Regression    0.7134  0.7416  0.4360  0.6100   
ridge                    Ridge Classifier    0.7139  0.0000  0.4150  0.6181   
svm                   SVM - Linear Kernel    0.6867  0.0000  0.4958  0.5910   
qda       Quadratic Discriminant Analysis    0.7005  0.7048  0.3530  0.6017   
et                 Extra Trees Classifier    0.6917 



Tunned model Average MCC: 0.7363
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:46:59,669 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:46:59,669 | INFO | >> (20) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8887  0.9490  0.8050  0.8590   
nb                            Naive Bayes    0.8823  0.9386  0.8033  0.8434   
gbc          Gradient Boosting Classifier    0.8592  0.9292  0.7750  0.8042   
ada                  Ada Boost Classifier    0.8536  0.9154  0.7689  0.7942   
rf               Random Forest Classifier    0.8514  0.9010  0.6760  0.8570   
lda          Linear Discriminant Analysis    0.8414  0.9039  0.7310  0.7877   
ridge                    Ridge Classifier    0.8417  0.0000  0.7213  0.7945   
lr                    Logistic Regression    0.8386  0.9036  0.7256  0.7839   
qda       Quadratic Discriminant Analysis    0.8288  0.9015  0.6071  0.8459   
svm                   SVM - Linear Kernel    0.7842  0.0000  0.5967  0.7983   
et                 Extra Trees Classifier    0.7827  0.9021  0.3883  0.9342   
dt               Decision Tree Classifier    0.7442 



Tunned model Average MCC: 0.6719
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:48:24,784 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:48:24,785 | INFO | >> (21) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8238  0.8882  0.6276  0.8121   
gbc          Gradient Boosting Classifier    0.8055  0.8748  0.5584  0.8112   
qda       Quadratic Discriminant Analysis    0.8021  0.8635  0.5555  0.8018   
nb                            Naive Bayes    0.7796  0.8376  0.6964  0.6690   
lr                    Logistic Regression    0.7870  0.8393  0.6003  0.7262   
lda          Linear Discriminant Analysis    0.7873  0.8393  0.5952  0.7297   
ridge                    Ridge Classifier    0.7876  0.0000  0.5812  0.7387   
ada                  Ada Boost Classifier    0.7749  0.8205  0.5841  0.7038   
svm                   SVM - Linear Kernel    0.7529  0.0000  0.6315  0.6565   
rf               Random Forest Classifier    0.7602  0.8531  0.3588  0.8494   
et                 Extra Trees Classifier    0.7570  0.8530  0.3409  0.8606   
knn                K Neighbors Classifier    0.7437 



Tunned model Average MCC: 0.5982
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:49:37,537 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:49:37,537 | INFO | >> (22) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8976  0.9553  0.8067  0.8826   
gbc          Gradient Boosting Classifier    0.8781  0.9408  0.7547  0.8699   
ada                  Ada Boost Classifier    0.8511  0.9128  0.7401  0.8065   
rf               Random Forest Classifier    0.7951  0.9025  0.4372  0.9168   
qda       Quadratic Discriminant Analysis    0.7907  0.8573  0.5217  0.7917   
nb                            Naive Bayes    0.7759  0.8322  0.6823  0.6665   
lr                    Logistic Regression    0.7801  0.8342  0.5911  0.7131   
lda          Linear Discriminant Analysis    0.7801  0.8345  0.5841  0.7169   
ridge                    Ridge Classifier    0.7807  0.0000  0.5722  0.7248   
dt               Decision Tree Classifier    0.7585  0.7353  0.6630  0.6399   
svm                   SVM - Linear Kernel    0.6911  0.0000  0.7673  0.5728   
et                 Extra Trees Classifier    0.7473 



Tunned model Average MCC: 0.7719
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:51:02,394 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:51:02,395 | INFO | >> (23) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8738  0.9353  0.7796  0.8380   
gbc          Gradient Boosting Classifier    0.8541  0.9154  0.7496  0.8080   
ada                  Ada Boost Classifier    0.8237  0.8831  0.6838  0.7719   
dt               Decision Tree Classifier    0.7211  0.6919  0.6008  0.5880   
rf               Random Forest Classifier    0.7268  0.8589  0.2098  0.9414   
nb                            Naive Bayes    0.7124  0.7338  0.5016  0.5911   
lr                    Logistic Regression    0.6846  0.6791  0.3121  0.5657   
lda          Linear Discriminant Analysis    0.6839  0.6793  0.3136  0.5634   
ridge                    Ridge Classifier    0.6834  0.0000  0.2908  0.5671   
qda       Quadratic Discriminant Analysis    0.6735  0.6587  0.3196  0.5331   
et                 Extra Trees Classifier    0.6773  0.8140  0.0537  0.9534   
svm                   SVM - Linear Kernel    0.6378 



Tunned model Average MCC: 0.751
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:52:33,671 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:52:33,672 | INFO | >> (24) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8760  0.9399  0.7772  0.8460   
gbc          Gradient Boosting Classifier    0.8581  0.9209  0.7406  0.8247   
ada                  Ada Boost Classifier    0.8104  0.8764  0.6608  0.7519   
nb                            Naive Bayes    0.7805  0.8281  0.6172  0.7013   
dt               Decision Tree Classifier    0.7429  0.7185  0.6422  0.6174   
ridge                    Ridge Classifier    0.7415  0.0000  0.4820  0.6658   
lda          Linear Discriminant Analysis    0.7402  0.7784  0.4950  0.6567   
lr                    Logistic Regression    0.7383  0.7779  0.4929  0.6527   
rf               Random Forest Classifier    0.7300  0.8648  0.2231  0.9301   
qda       Quadratic Discriminant Analysis    0.7010  0.7097  0.3700  0.5973   
et                 Extra Trees Classifier    0.6918  0.8435  0.1007  0.9366   
svm                   SVM - Linear Kernel    0.5859 



Tunned model Average MCC: 0.7227
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:53:59,076 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:53:59,076 | INFO | >> (25) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8823  0.9452  0.7861  0.8563   
gbc          Gradient Boosting Classifier    0.8663  0.9308  0.7655  0.8285   
ada                  Ada Boost Classifier    0.8387  0.9042  0.7358  0.7781   
rf               Random Forest Classifier    0.7917  0.9030  0.4222  0.9237   
nb                            Naive Bayes    0.7620  0.8331  0.7174  0.6323   
lr                    Logistic Regression    0.7731  0.8251  0.5867  0.6980   
ridge                    Ridge Classifier    0.7721  0.0000  0.5592  0.7093   
lda          Linear Discriminant Analysis    0.7707  0.8242  0.5730  0.6987   
dt               Decision Tree Classifier    0.7310  0.7050  0.6235  0.6009   
qda       Quadratic Discriminant Analysis    0.7377  0.7807  0.5103  0.6448   
et                 Extra Trees Classifier    0.7350  0.8835  0.2349  0.9436   
svm                   SVM - Linear Kernel    0.7234 



Tunned model Average MCC: 0.7356
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:55:33,782 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:55:33,782 | INFO | >> (26) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8863  0.9479  0.7810  0.8714   
gbc          Gradient Boosting Classifier    0.8646  0.9305  0.7068  0.8711   
ada                  Ada Boost Classifier    0.8338  0.8966  0.7160  0.7778   
lda          Linear Discriminant Analysis    0.8177  0.8739  0.6605  0.7706   
ridge                    Ridge Classifier    0.8177  0.0000  0.6526  0.7760   
lr                    Logistic Regression    0.8159  0.8730  0.6663  0.7625   
nb                            Naive Bayes    0.8042  0.8665  0.7092  0.7133   
rf               Random Forest Classifier    0.7914  0.9017  0.4295  0.9092   
qda       Quadratic Discriminant Analysis    0.7810  0.8351  0.5175  0.7619   
dt               Decision Tree Classifier    0.7443  0.7205  0.6460  0.6189   
svm                   SVM - Linear Kernel    0.6944  0.0000  0.5708  0.7201   
et                 Extra Trees Classifier    0.7350 



Tunned model Average MCC: 0.7909
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:57:18,236 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:57:18,236 | INFO | >> (27) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8881  0.9476  0.7820  0.8757   
gbc          Gradient Boosting Classifier    0.8611  0.9258  0.7058  0.8608   
ada                  Ada Boost Classifier    0.8319  0.8953  0.7116  0.7759   
lda          Linear Discriminant Analysis    0.8103  0.8650  0.6472  0.7595   
ridge                    Ridge Classifier    0.8104  0.0000  0.6356  0.7671   
lr                    Logistic Regression    0.8089  0.8649  0.6513  0.7536   
qda       Quadratic Discriminant Analysis    0.8099  0.8814  0.5802  0.8063   
nb                            Naive Bayes    0.7917  0.8449  0.6886  0.6956   
rf               Random Forest Classifier    0.7794  0.8925  0.3837  0.9226   
svm                   SVM - Linear Kernel    0.7672  0.0000  0.5039  0.7916   
dt               Decision Tree Classifier    0.7313  0.7049  0.6223  0.6020   
et                 Extra Trees Classifier    0.7362 



Tunned model Average MCC: 0.3811
Transformation Pipeline and Model Successfully Saved
2023-05-19 01:58:57,355 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 01:58:57,356 | INFO | >> (28) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

                                    Model  Accuracy     AUC  Recall   Prec.  \
lightgbm  Light Gradient Boosting Machine    0.8880  0.9501  0.7733  0.8829   
gbc          Gradient Boosting Classifier    0.8679  0.9336  0.7210  0.8682   
ada                  Ada Boost Classifier    0.8327  0.8962  0.7162  0.7751   
rf               Random Forest Classifier    0.7960  0.9053  0.4411  0.9152   
nb                            Naive Bayes    0.7753  0.8372  0.6862  0.6642   
lda          Linear Discriminant Analysis    0.7811  0.8263  0.5843  0.7196   
ridge                    Ridge Classifier    0.7814  0.0000  0.5727  0.7266   
lr                    Logistic Regression    0.7799  0.8265  0.5850  0.7160   
svm                   SVM - Linear Kernel    0.7456  0.0000  0.5877  0.6835   
et                 Extra Trees Classifier    0.7505  0.8916  0.2829  0.9449   
dt               Decision Tree Classifier    0.7412  0.7139  0.6284  0.6174   
qda       Quadratic Discriminant Analysis    0.7555 



Tunned model Average MCC: 0.776
Transformation Pipeline and Model Successfully Saved
2023-05-19 02:00:20,280 | INFO | >> Ending experiment: Arabidopsis_non_tata...
2023-05-19 02:00:20,281 | INFO | >> (29) Starting experiment: Arabidopsis_non_tata...


                                                                                                                        

# PyCaret experiment - Base case

In [None]:
# Define property, dataset and kmer type to be used
prop_idx = 0
dataset_name = 'Bacillus'
kmer_type = 'dinuc'
# Look the property name up under the selected k-mer type (not a hard-coded one),
# so the printed name always matches the dataframe selected below.
print(f'Property {prop_idx} - {props_names[kmer_type][prop_idx]}')

# Select property dataframe for a given organism and kmer type.
# Work on a copy: the column rename below must not alter the frame stored in `features`.
data = features[dataset_name][kmer_type][prop_idx].copy()

# New column names: one label per value returned by get_bp_positions for every
# column except the last, which becomes the target column 'y'.
new_cols = [f'{x}' for x in get_bp_positions(data.shape[1]-1, step=1)] + ['y']
data.columns = new_cols

data

In [None]:
# Initialise the PyCaret experiment; its name is "<dataset>-<kmer type>-<property index>"
exp_name = '-'.join(str(part) for part in (dataset_name, kmer_type, prop_idx))
exp = setup(
    data=data,
    target='y',
    session_id=123,
    verbose=True,
    log_experiment=True,
    experiment_name=exp_name,
)

In [None]:
# View all models available in the library (pycaret.classification).
# `models` comes from the star import of pycaret.classification at the top of the notebook;
# as the last expression of the cell, its return value is what gets displayed.
models()

In [None]:
# Compare models and keep the 5 best ones (n_select=5), ranked by MCC
best_models = compare_models(sort='MCC', n_select=5)

In [None]:
# Pick one of the models returned by compare_models and tune it for MCC.
# NOTE(review): index 1 is the SECOND entry of best_models; the variable name assumes
# that entry is LightGBM, which the comparison order does not guarantee — confirm, or
# select the model explicitly.
lightgbm = best_models[1]
lightgbm  # no visible effect here: only the last expression of a cell is displayed
lightgbm_tuned = tune_model(lightgbm, optimize='MCC')
# Display whatever pull() returns — presumably the score grid of the tune_model call above; confirm.
pull()

In [None]:
# Keep the frame returned by pull() under a name so it can be inspected further.
# NOTE(review): presumably this is still the tuning score grid (no PyCaret call is made
# between the previous pull() and this one) — confirm when running cells out of order.
metrics_df = pull()
metrics_df
# predict_model(lightgbm_tuned)
# pull()

In [None]:
# Smallest value in the last column of metrics_df, ignoring its final two rows.
# NOTE(review): presumably the last column is MCC and the two skipped rows are the
# Mean/Std summary rows of the cross-validation grid — confirm against metrics_df.
metrics_df.iloc[:-2, -1].min()

# Interpretation of the model

In [None]:
# Draw the 'dimension' plot for the tuned model, then run interpret_model on it.
# Both calls take the estimator produced by tune_model in the cell above.
plot_model(lightgbm_tuned, 'dimension')
interpret_model(lightgbm_tuned)

In [None]:
# Uncomment to run the `mlflow ui` shell command from the notebook
# (kept disabled so that running all cells does not start it).
# !mlflow ui

In [None]:
# Toy data: mean value per x position together with its minimum and maximum
x = np.arange(5)
y_mean = np.array([3, 5, 4, 7, 6])
y_min = np.array([2, 4, 3, 6, 5])
y_max = np.array([4, 6, 5, 8, 7])
# Asymmetric errors as a (2, N) array: first row is the distance below the mean, second row above
y_error = np.array([y_mean - y_min, y_max - y_mean])

# Draw only the vertical min-max segments: markersize=0 hides the marker and the caps are disabled.
# The marker is specified once, through fmt; passing marker='o' as well makes matplotlib
# warn that the marker is redundantly defined.
plt.errorbar(x, y_mean, yerr=y_error, fmt='o', color='black', capsize=0, capthick=0,
             markersize=0, mec='cornflowerblue', mfc='lightblue', ecolor='cornflowerblue',
             elinewidth=1)

# Customize the markers for the minimum, maximum, and mean values
plt.plot(x, y_min, 'v', color='royalblue', markersize=8, label='Min')
plt.plot(x, y_max, '^', color='royalblue', markersize=8, label='Max')
plt.plot(x, y_mean, 'o', color='royalblue', markersize=6, label='Mean')

plt.grid(False)

plt.legend()
plt.show()

In [None]:
# Example data
score_name = 'MCC'
datasets = ['D1', 'D2', 'D3', 'D4']
algorithms = ['SVM', 'Random Forest']
algorithms_colors = ['cornflowerblue', 'sandybrown']
# One row per dataset, one column per algorithm (same order as the lists above)
means = np.array([[0.85, 0.89], [0.75, 0.82], [0.93, 0.95], [0.91, 0.94]])
mins = np.array([[0.80, 0.85], [0.65, 0.75], [0.90, 0.92], [0.88, 0.90]])
maxs = np.array([[0.90, 0.92], [0.80, 0.88], [0.96, 0.98], [0.94, 0.96]])

# Set up plot
fig, ax = plt.subplots()
width = 0.35  # horizontal offset between the algorithms inside one dataset group
ind = np.arange(len(datasets))

# Draw each algorithm's mean as a marker with asymmetric (mean - min, max - mean) error bars
for alg_idx, alg_name in enumerate(algorithms):
    means_alg = means[:, alg_idx]
    mins_alg = mins[:, alg_idx]
    maxs_alg = maxs[:, alg_idx]
    _color = algorithms_colors[alg_idx]
    # The marker is given once, through fmt; repeating it as marker='o' makes matplotlib
    # warn that the marker is redundantly defined.
    ax.errorbar(ind + alg_idx*width, means_alg, yerr=[means_alg - mins_alg, maxs_alg - means_alg],
                fmt='o', capsize=3, capthick=1, ecolor=_color, label=alg_name, elinewidth=1,
                markeredgewidth=1, mec=_color, mfc='white', markersize=6)

# Add labels, title, legend, and grid
ax.set_ylabel(f'{score_name}')
ax.set_xticks(ind + width / 2)  # centre each tick between the two algorithms' markers
ax.set_xticklabels(datasets)
ax.set_title(f'{score_name} by Dataset and Algorithm')
# Legend entries come from the label= given to each errorbar call, so the
# handle/label pairing is explicit instead of relying on positional order.
ax.legend(loc='best', fancybox=True, shadow=True)
ax.grid(False)

plt.show()