<a href="https://colab.research.google.com/github/isabellaloren4/isabellaloren4/blob/main/ml_evaluation_acp_new_acp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages for this notebook (notebook shell commands).
# scikit-optimize is the distribution that provides the `skopt` module.
# NOTE(review): pytoda and openpyxl are not imported in the visible cells --
# confirm they are still required.
!pip install numpy
!pip install pytoda
!pip install catboost
!pip install scikit-optimize
!pip install xgboost
!pip install lightgbm
!pip install openpyxl



In [None]:
import pandas as pd
from numpy import mean
from numpy import std
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import *
from threading import Thread
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.base import BaseEstimator, ClassifierMixin
from csv import writer

from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed

In [None]:
import os
import threading

# experiment() is submitted to a thread pool, so several workers can try to
# append to the same per-dataset CSV at once; this lock serializes the writes.
_RESULTS_LOCK = threading.Lock()


def experiment(data_name, model_name, model, params, X_train, y_train, X_test, y_test, i):
    """Tune ``model`` on the training split, score it on the test split, log the row.

    A ``BayesSearchCV`` (10 iterations, accuracy scoring, 3-fold stratified
    inner CV, ``random_state=1``) is fitted on ``X_train``/``y_train``. The
    refitted best estimator then predicts ``X_test`` and the predictions are
    scored against ``y_test``.

    One row is appended to ``results/<data_name>_results.csv`` with the layout
    ``data_name, model_name, i, accuracy, recall, precision, f1, mcc,
    best_score_, best_params_`` (recall comes before precision), and a
    one-line progress message is printed.

    Parameters
    ----------
    data_name : str
        Label of the feature set; also used to build the CSV file name.
    model_name : str
        Label of the model, written to the CSV and the progress line.
    model : estimator
        Estimator handed to ``BayesSearchCV``.
    params : dict
        Search space handed to ``BayesSearchCV``.
    X_train, y_train, X_test, y_test
        Train / test split for this outer fold.
    i : int
        Index of the outer fold, stored with the results.

    Returns
    -------
    None
    """
    # inner cross-validation used by the hyper-parameter search
    cv_inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    search = BayesSearchCV(model, params, scoring='accuracy', cv=cv_inner, n_iter=10, refit=True, random_state=1, n_jobs=1)

    # run the search; refit=True leaves the best configuration fitted on the
    # whole training split
    result = search.fit(X_train, y_train)
    best_model = result.best_estimator_

    # evaluate on the held-out split
    yhat = best_model.predict(X_test)
    acc = accuracy_score(y_test, yhat)
    prec = precision_score(y_test, yhat)
    rec = recall_score(y_test, yhat)
    f1 = f1_score(y_test, yhat)
    mcc = matthews_corrcoef(y_test, yhat)

    results = [data_name, model_name, i, acc, rec, prec, f1, mcc, result.best_score_, result.best_params_]

    results_path = 'results/'+data_name+'_results.csv'
    with _RESULTS_LOCK:
        # create the output folder on first use instead of failing with
        # FileNotFoundError after the (expensive) search has already run
        os.makedirs(os.path.dirname(results_path), exist_ok=True)
        # newline='' is what the csv module expects; without it every row is
        # terminated with '\r\r\n' on Windows
        with open(results_path, 'a', newline='') as f_object:
            writer(f_object).writerow(results)

    # report progress
    print(f"{data_name}, {model_name} {i} > acc={acc:.2f}, est={result.best_score_:.2f}, cfg={result.best_params_}")

In [None]:
# Model catalogue: short model key -> {'model': estimator instance,
# 'params': hyper-parameter search space built from Real / Integer / Categorical}.
# Entry order is the order in which the models are iterated.
model_params = {
    # logistic regression
    'lr': {
        'model': LogisticRegression(),
        'params': {
            'C': Real(1e-4, 1e4, prior='log-uniform'),
            'fit_intercept': Categorical([True, False]),
            'solver': Categorical(['newton-cg', 'liblinear', 'sag', 'saga']),
        },
    },

    # k-nearest neighbours
    'knn': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': Integer(1, 50),
            'weights': Categorical(['uniform', 'distance']),
            'algorithm': Categorical(['auto', 'ball_tree', 'kd_tree', 'brute']),
            'p': Integer(1, 5),
        },
    },

    # Gaussian naive Bayes
    'nb': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': Real(1e-10, 1e-1, prior='log-uniform'),
        },
    },

    # decision tree
    'dt': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': Categorical(['gini', 'entropy']),
            'splitter': Categorical(['best', 'random']),
            'max_depth': Integer(3, 30),
            'min_samples_split': Integer(2, 10),
            'min_samples_leaf': Integer(1, 10),
            'max_features': Real(0.1, 1.0, prior='uniform'),
        },
    },

    # linear support vector classifier
    'svm': {
        'model': LinearSVC(),
        'params': {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'loss': Categorical(['hinge', 'squared_hinge']),
            'tol': Real(1e-6, 1e-2, prior='log-uniform'),
        },
    },

    # Gaussian process classifier
    'gpc': {
        'model': GaussianProcessClassifier(),
        'params': {
            'optimizer': Categorical(['fmin_l_bfgs_b', None]),
            'n_restarts_optimizer': Integer(0, 10),
            'max_iter_predict': Integer(100, 1000),
        },
    },

    # multi-layer perceptron
    'mlp': {
        'model': MLPClassifier(),
        'params': {
            'hidden_layer_sizes': Integer(10, 100),
            'activation': Categorical(['identity', 'logistic', 'tanh', 'relu']),
            'solver': Categorical(['sgd', 'adam']),
            'alpha': Real(1e-5, 1e-1, prior='log-uniform'),
            'learning_rate': Categorical(['constant', 'invscaling', 'adaptive']),
            'learning_rate_init': Real(1e-4, 1e-1, prior='log-uniform'),
            # two-value range; presumably a way to pin max_iter at ~1000
            # from inside the search space -- confirm intent
            'max_iter': Integer(1000, 1001),
        },
    },

    # ridge classifier
    'ridge': {
        'model': RidgeClassifier(),
        'params': {
            'alpha': Real(1e-4, 1e4, prior='log-uniform'),
            'fit_intercept': Categorical([True, False]),
            'solver': Categorical(['auto', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']),
        },
    },

    # random forest
    'rf': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': Integer(10, 500),
            'criterion': Categorical(['gini', 'entropy']),
            'max_depth': Integer(3, 30),
            'min_samples_split': Integer(2, 10),
            'min_samples_leaf': Integer(1, 10),
            'max_features': Real(0.1, 1.0, prior='uniform'),
            'bootstrap': Categorical([True, False]),
            'class_weight': Categorical(['balanced', 'balanced_subsample', None]),
        },
    },

    # quadratic discriminant analysis
    'qda': {
        'model': QuadraticDiscriminantAnalysis(),
        'params': {
            'reg_param': Real(0, 1, prior='uniform'),
            'store_covariance': Categorical([True, False]),
            'tol': Real(1e-5, 1e-1, prior='log-uniform'),
        },
    },

    # AdaBoost
    'ada': {
        'model': AdaBoostClassifier(),
        'params': {
            'n_estimators': Integer(10, 500),
            'learning_rate': Real(1e-3, 1, prior='log-uniform'),
            'algorithm': Categorical(['SAMME', 'SAMME.R']),
        },
    },

    # gradient boosting (scikit-learn)
    'gbc': {
        'model': GradientBoostingClassifier(),
        'params': {
            'n_estimators': Integer(10, 500),
            'learning_rate': Real(1e-3, 1, prior='log-uniform'),
            'max_depth': Integer(3, 10),
            'min_samples_split': Integer(2, 10),
            'min_samples_leaf': Integer(1, 10),
            'max_features': Real(0.1, 1.0, prior='uniform'),
            'subsample': Real(0.1, 1.0, prior='uniform'),
        },
    },

    # linear discriminant analysis
    'lda': {
        'model': LinearDiscriminantAnalysis(),
        'params': {
            'solver': Categorical(['lsqr', 'eigen']),
            'shrinkage': Real(0, 1, prior='uniform'),
            'tol': Real(1e-6, 1e-4, prior='log-uniform'),
        },
    },

    # extra trees
    'et': {
        'model': ExtraTreesClassifier(),
        'params': {
            'n_estimators': Integer(10, 500),
            'criterion': Categorical(['gini', 'entropy']),
            'max_depth': Integer(3, 30),
            'min_samples_split': Integer(2, 10),
            'min_samples_leaf': Integer(1, 10),
            'max_features': Real(0.1, 1.0, prior='uniform'),
            'bootstrap': Categorical([True, False]),
            'class_weight': Categorical(['balanced', 'balanced_subsample', None]),
        },
    },

    # XGBoost
    'xgboost': {
        'model': XGBClassifier(),
        'params': {
            'learning_rate': Real(0.01, 0.3, prior='uniform'),
            'n_estimators': Integer(50, 500),
            'max_depth': Integer(3, 10),
            'min_child_weight': Integer(1, 10),
            'gamma': Real(0, 1, prior='uniform'),
            'subsample': Real(0.5, 1, prior='uniform'),
            'colsample_bytree': Real(0.5, 1, prior='uniform'),
            'reg_alpha': Real(0, 1, prior='uniform'),
            'reg_lambda': Real(1, 3, prior='uniform'),
            'scale_pos_weight': Real(1, 5, prior='uniform'),
        },
    },

    # LightGBM (verbose=-1 passed at construction)
    'lightgbm': {
        'model': LGBMClassifier(verbose=-1),
        'params': {
            'learning_rate': Real(1e-3, 1, prior='log-uniform'),
            'n_estimators': Integer(10, 500),
            'num_leaves': Integer(2, 100),
            'max_depth': Integer(3, 10),
            'min_child_samples': Integer(1, 50),
            'min_child_weight': Real(1e-5, 1e-3, prior='log-uniform'),
            'subsample': Real(0.1, 1.0, prior='uniform'),
            'colsample_bytree': Real(0.1, 1.0, prior='uniform'),
            'reg_alpha': Real(0, 1, prior='uniform'),
            'reg_lambda': Real(0, 1, prior='uniform'),
        },
    },

    # CatBoost (verbose=0 passed at construction)
    'catboost': {
        'model': CatBoostClassifier(verbose=0),
        'params': {
            'learning_rate': Real(1e-3, 1, prior='log-uniform'),
            'iterations': Integer(10, 500),
            'depth': Integer(3, 10),
            'l2_leaf_reg': Real(1, 10, prior='uniform'),
            'border_count': Integer(1, 255),
            'bagging_temperature': Real(0, 1, prior='uniform'),
            'random_strength': Real(1e-9, 10, prior='log-uniform'),
        },
    },
}

In [None]:
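# Disabled alternative definition of models and parameters (MLP entry only);
# uncommenting it would replace model_params with this single-model dictionary.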
#model_params = {
          #'mlp': {'model': MLPClassifier(),
                  #'params': {
                            #'hidden_layer_sizes': Integer(10,100),
                            #'activation': Categorical(['identity', 'logistic', 'tanh', 'relu']),
                            #'solver': Categorical(['sgd', 'adam']),
                            #'alpha': Real(1e-5, 1e-1, prior='log-uniform'),
                            #'learning_rate': Categorical(['constant', 'invscaling', 'adaptive']),
                            #'learning_rate_init': Real(1e-4, 1e-1, prior='log-uniform'),
                            #'max_iter': Integer(1000,1001)}}
#}

In [None]:
# Read the raw lines of both input files. The context manager closes the
# handles even when reading fails.
with open("acp.txt", 'r') as facp, open("nacp.txt", 'r') as fnacp:
    acp = facp.readlines()
    nacp = fnacp.readlines()

# X: one entry per input line, acp.txt entries first, then nacp.txt entries.
# Only the line terminator is stripped; dropping the last character
# unconditionally would truncate the final record of a file that does not end
# with a newline, and would leave a stray '\r' on CRLF files.
X = np.array([line.rstrip('\r\n') for line in acp + nacp])

# y: label 1 for every acp.txt line and 0 for every nacp.txt line, in the same
# order as X.
y = np.array([1] * len(acp) + [0] * len(nacp))

In [None]:
# T is the list of transformed datasets used in the experiments; each element
# is a (name, array) pair where the array is the header-less CSV loaded through
# pandas and converted with to_numpy().
# NOTE(review): rows are presumably aligned with the label vector y -- confirm.
T = []

# Load the transformed data from the feature files. Lines that are commented
# out are feature sets that are currently disabled.

#---1D DeepChem
#T.append( ('maccskeys', pd.read_csv('features/maccskeys.csv', header=None).to_numpy()) )
#T.append( ('circular', pd.read_csv('features/circular.csv', header=None).to_numpy()) )
#T.append( ('mol2vec', pd.read_csv('features/mol2vec.csv', header=None).to_numpy()) )
#T.append( ('rdkit', pd.read_csv('features/rdkit.csv', header=None).to_numpy()) )
#T.append( ('bpsymmetry', pd.read_csv('features/bpsymmetry.csv', header=None).to_numpy()) )
# Active feature sets, loaded in the listed order.
T.extend(
    (feature_name, pd.read_csv(csv_path, header=None).to_numpy())
    for feature_name, csv_path in (
        ('modlamp', 'features/modlamp.csv'),
        ('fastatoseq', 'features/fastatoseq.csv'),
        ('smilestoseq', 'features/smilestoseq.csv'),
    )
)
#T.append( ('mordred', normalize(pd.read_csv('features/mordred.csv', header=None).to_numpy())) )

#---1D protPy
#T.append( ('AAC', pd.read_csv('features/AAC.csv', header=None).to_numpy()) )
#T.append( ('PAAC', pd.read_csv('features/PAAC.csv', header=None).to_numpy()) )
#T.append( ('APAAC', pd.read_csv('features/APAAC.csv', header=None).to_numpy()) )
#T.append( ('CTD', pd.read_csv('features/CTD.csv', header=None).to_numpy()) )
#T.append( ('CTriad', pd.read_csv('features/CTriad.csv', header=None).to_numpy()) )
#T.append( ('DPC', pd.read_csv('features/DPC.csv', header=None).to_numpy()) )
#T.append( ('TPC', pd.read_csv('features/TPC.csv', header=None).to_numpy()) )


#---2D DeepChem
#T.append( ('coulombmatrix', pd.read_csv('features/coulombmatrix.csv', low_memory=False, header=None).to_numpy()) )
#T.append( ('onehot', pd.read_csv('features/onehot.csv', low_memory=False, header=None).to_numpy()) )
#T.append( ('smiles2image', pd.read_csv('features/smiles2image.csv', low_memory=False, header=None).to_numpy()) )



In [None]:

import time

# Outer cross-validation: 2 stratified folds repeated 5 times, i.e. 10
# train/test splits per dataset.
cv_outer = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=1)

# Futures of every submitted experiment (the name `threads` is kept as-is).
threads = []
# Maps each future to a readable label so a failure can be attributed.
labels = {}
# (label, exception) pairs of the experiments that raised.
failures = []

# Thread pool running up to 8 experiments at a time.
pool = ThreadPoolExecutor(max_workers=8)
try:
    for data_name, X_ in T:

        for i, (train_ix, test_ix) in enumerate(cv_outer.split(X_, y)):

            # split data for this outer fold
            X_train, X_test = X_[train_ix, :], X_[test_ix, :]
            y_train, y_test = y[train_ix], y[test_ix]

            for model_name, mp in model_params.items():

                # queue the experiment; submit() does not block
                exp = pool.submit(experiment, data_name, model_name, mp['model'], mp['params'], X_train, y_train, X_test, y_test, i)

                # keep the future so completion and errors can be collected
                threads.append(exp)
                labels[exp] = f"{data_name}, {model_name} {i}"

                # NOTE(review): short pause between submissions kept from the
                # original; presumably staggers the start of the workers.
                time.sleep(0.1)

    # Wait for every experiment. A failing configuration is reported and
    # recorded instead of aborting the collection of all remaining results.
    for exp in as_completed(threads):
        try:
            exp.result()
        except Exception as err:
            failures.append((labels[exp], err))
            print(f"{labels[exp]} > FAILED: {err!r}")
finally:
    # Only matters when the block is interrupted: drop experiments that are
    # still queued (cancel() is a no-op for running or finished futures) so
    # that shutting the pool down does not wait for the whole remaining grid.
    for exp in threads:
        exp.cancel()
    pool.shutdown(wait=True)

# Still fail loudly once everything has run, so errors are not overlooked.
if failures:
    raise RuntimeError(
        f"{len(failures)} of {len(threads)} experiments failed; first failure: {failures[0][0]}"
    ) from failures[0][1]


modlamp, nb 0 > acc=0.64, est=0.65, cfg=OrderedDict([('var_smoothing', 0.0029757134630289256)])
modlamp, lr 0 > acc=0.64, est=0.66, cfg=OrderedDict([('C', 5502.07329435664), ('fit_intercept', False), ('solver', 'sag')])
modlamp, dt 0 > acc=0.62, est=0.65, cfg=OrderedDict([('criterion', 'gini'), ('max_depth', 15), ('max_features', 0.6380599656985619), ('min_samples_leaf', 3), ('min_samples_split', 7), ('splitter', 'best')])
modlamp, qda 0 > acc=0.65, est=0.66, cfg=OrderedDict([('reg_param', 0.31614545298544944), ('store_covariance', True), ('tol', 0.019958994474678478)])
modlamp, svm 0 > acc=0.63, est=0.66, cfg=OrderedDict([('C', 0.006219356272098938), ('loss', 'squared_hinge'), ('tol', 0.0019958994474678477)])
modlamp, knn 0 > acc=0.67, est=0.68, cfg=OrderedDict([('algorithm', 'brute'), ('n_neighbors', 17), ('p', 1), ('weights', 'distance')])
modlamp, ridge 0 > acc=0.64, est=0.67, cfg=OrderedDict([('alpha', 41.79842712265741), ('fit_intercept', False), ('solver', 'lsqr')])
modlamp, lda