# Modelos QSAR para a protease principal (3C-like protease, M<sup>pro</sup>) de SARS-CoV

- Os modelos neste fluxo de trabalho foram gerados usando descritores 2D sirms.py e PaDEL-Descriptor.
- Foram coletados 114 pontos de dados para 113 compostos testados em SARS-CoV Mpro (CHEMBL3927). Os dados foram cuidadosamente selecionados seguindo os protocolos desenvolvidos por Fourches et al. Após a curadoria, 91 compostos (27 ativos e 64 inativos) foram mantidos para modelagem.
- Devido ao tamanho limitado do conjunto de dados, validamos os modelos apenas por meio de validação cruzada externa em 5 partições (5-fold) e aplicamos um limiar de confiança de 70% para avaliar os modelos e selecionar resultados para validação experimental.

## Importando Módulos e Funções
         
    

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import _pickle as cPickle
import gzip
from time import time
from sklearn.preprocessing import StandardScaler
#BalanceBySim: função de balanceamento de conjunto de dados (Equilibre os dados usando 1/2 similaridade e 1/2 aleatória)
from BalanceBySim import *

#Stats: Calculos estatisticos referente ao modelo Kappa	CCR	Sensitivity	PPV	Specificity	NPV	Coverage
from stats import *

from collections import Counter

#Rdkit: coleção de quiminformática e software de aprendizado de máquina escrito em C++ e Python de Código Aberto.
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

#Sklearn: Bibliotecas p/ Machine learning de Código Aberto
from sklearn import metrics
from sklearn.svm import SVC # biblioteca SVM para Classificação
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV, cross_val_score # biblioteca GridSearch e cross_validate
from sklearn.model_selection import permutation_test_score, StratifiedKFold
from sklearn.metrics import classification_report

# No-op stand-in that is installed below in place of warnings.warn.
def warn(*args, **kwargs):
    """Accept any arguments and do nothing."""
    pass
import warnings
# Silence warnings twice over: through the filter and by rebinding
# warnings.warn to the no-op defined above.
warnings.filterwarnings("ignore")
warnings.warn = warn

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
%reload_ext autoreload
%autoreload 2
# Font face and size RDKit uses for atom labels in molecule drawings.
Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
Draw.DrawingOptions.atomLabelFontSize = 18

from IPython.core.display import display, HTML
# Inject CSS so the notebook container takes 90% of the page width.
display(HTML("<style>.container { width:90% !important; }</style>"))
# Search-strategy switch; the author lists the options below. It is not read
# anywhere in the visible part of the notebook.
hiper_parametros = 'grid_search' #grid_search, random_search

## Carregar dados

In [None]:
# Location of the curated, gzip-compressed SDF file
file = 'datasets/curated_data/chembl-pdb-sars-cov-3C-like-proteinase.sdf.gz'

# Keyword arguments forwarded to LoadSDF (SMILES column and molecule column names)
sdfInfo = {'smilesName': 'SMILES', 'molColName': 'ROMol'}

# Read the SDF into a DataFrame
moldf = PandasTools.LoadSDF(file, **sdfInfo)
print('Dados originais: ', moldf.shape)

# Rename the molecule column, then keep only rows that hold an RDKit molecule
moldf = moldf.rename(columns={'ROMol': 'Mol'})
moldf = moldf.loc[moldf['Mol'].notna()]

# The standardizer bookkeeping column is not needed downstream
if 'StandardizerResult' in moldf.columns:
    moldf = moldf.drop(columns=['StandardizerResult'])

# Summary of what was kept
print('Dados mantidos: ', moldf.shape)
moldf.describe()

## Forma dos dados

In [None]:
# Author's note on data provenance:
#   91 compounds (27 active, 64 inactive) processed with ChemAxon Standardizer
#   22 compounds (13 active, 9 inactive) obtained from PDB-derived sources
# Encode the activity labels as integers: Active -> 1, Inactive -> 0
moldf['Outcome'] = moldf['Outcome'].replace({'Active': 1, 'Inactive': 0})

# Count compounds per class and print the breakdown
classes = Counter(moldf['Outcome'])
print('\033[1m' + 'Forma do conjunto de treinamento:' + '\n' + '\033[0m')
for label, count in classes.items():
    print('\t\t Classe %d: %d' % (label, count))
print('\t\t Número total de compostos: %d' % (len(moldf)))

# SiRMS

## Descritores de importação

Os descritores foram calculados externamente usando o SiRMS.py. Descritores com baixa variância e descritores correlacionados foram removidos usando o módulo "Métodos de Análise de Dados", disponível em http://www.qsar4u.com/pages/sirms.php.

In [None]:
# Read the tab-separated SiRMS descriptor table and discard its first two columns
desc = pd.read_csv('descriptors/sirms_descritores.csv', sep='\t')
desc = desc.drop(columns=desc.columns[:2])

# Descriptor names = columns of `desc` that are not already columns of `moldf`
descriptors = list(desc.columns.difference(moldf.columns))
desc.head()

In [None]:
# Put the descriptor columns next to the molecule table (joined on the index)
moldf_desc = pd.concat([moldf, desc], axis=1)
balance_data = 'no'

if balance_data != 'yes':
    # No balancing requested: every compound goes to the training set
    moldf_desc['Set'] = 'train'
    # Report the class counts of each set
    print('Forma do conjunto de treinamento: %s' % Counter(moldf_desc['Outcome']))
    print('Forma externa definida: %s' % Counter(moldf_desc.loc[moldf_desc['Set'] == 'ext', 'Outcome']))
else:
    # Balance the classes (author's note: half by similarity, half at random)
    moldf_desc = BalanceBySim(moldf_desc, 'Outcome', 2)
    # Report the class counts of each set
    print('Forma do conjunto de treinamento: %s' % Counter(moldf_desc.loc[moldf_desc['Set'] == 'train', 'Outcome']))
    print('Forma externa definida: %s' % Counter(moldf_desc.loc[moldf_desc['Set'] == 'ext', 'Outcome']))

In [None]:
# Rows flagged as training data
moldf_train = moldf_desc.loc[moldf_desc['Set'] == 'train']

# Dependent variable: the encoded activity outcome
y_train = moldf_train['Outcome'].to_numpy()

# Independent variables: the computed structural descriptors
X_train = moldf_train.loc[:, descriptors]
X_train.shape

##### Remova variáveis constantes e quase constantes

In [None]:
# Keep non-object columns only, drop every column containing a missing value,
# then zero-fill whatever is left
X_train = (
    X_train
    .select_dtypes(exclude=['object'])
    .dropna(axis=1, how='any')
    .fillna(0)
)


def variance_filter(data, threshold=0.1):
    """Return the columns of `data` that VarianceThreshold(threshold) keeps."""
    selector = VarianceThreshold(threshold).fit(data)
    kept_positions = selector.get_support(indices=True)
    kept_names = data.columns[kept_positions]
    return data[kept_names]


# Apply the low-variance filter with its default threshold
X_train = variance_filter(X_train)

##### Remover variáveis correlacionadas

In [None]:
%%time
# Pairwise correlation between the remaining descriptors
correlation_matrix = X_train.corr()

# Strictly-lower triangle: entry (i, j) with j < i. A column is flagged when its
# absolute correlation with ANY earlier column exceeds 0.9, which is the same
# rule the former nested Python loop applied, but evaluated in one vectorized
# pass. NaN correlations compare as False and therefore never flag a column.
high_corr = np.tril(correlation_matrix.abs().to_numpy() > 0.9, k=-1)
correlated_features = set(correlation_matrix.columns[high_corr.any(axis=1)])

# Reassign instead of dropping in place: X_train is derived from another frame
X_train = X_train.drop(columns=list(correlated_features))

X_train.shape

In [None]:
# Write the filtered descriptor matrix as a tab-separated file without the index column
X_train.to_csv('descriptors/sirms-chembl-sars-cov-3C-like-proteinase-processed.txt', sep='\t', index=False)

In [None]:
def report(results, n_top=3):
    """Print the candidates ranked 1..n_top in a search's ``cv_results_``.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', indexable by candidate position.
    n_top : int, default 3
        Highest rank to print. Candidates sharing a rank are all printed.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        for position in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][position]
            std_score = results['std_test_score'][position]
            print("Modelo com classificação: {0}".format(rank))
            print("Escore médio de validação: {0:.3f} (std: {1:.3f})"
                  .format(mean_score, std_score))
            print("Parametros: {0}".format(results['params'][position]))
            print("")

## Construção de modelo

### Dados de modelagem

##### Encontre os melhores parâmetros

### Pesquisa em grade (Grid Search)

In [None]:
def grid_search(clf_rf, rf_params, X_train, y_train):
    """Run an exhaustive 5-fold grid search scored by ROC AUC and report it.

    Parameters
    ----------
    clf_rf : estimator
        Estimator passed to GridSearchCV.
    rf_params : dict or list of dict
        Parameter grid to evaluate.
    X_train, y_train
        Training features and labels passed to ``fit``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    grid = GridSearchCV(clf_rf, rf_params, scoring='roc_auc', cv=5, n_jobs=-1, verbose=1)

    start = time()
    grid.fit(X_train, y_train)
    elapsed = time() - start

    # Report the number of candidates that were actually evaluated rather than
    # a hard-coded figure unrelated to the grid size.
    n_candidates = len(grid.cv_results_['params'])
    print("GridSearchCV levou %.2f segundos para %d candidatos"
      " configurações de parâmetros." % (elapsed, n_candidates))
    report(grid.cv_results_)

    return grid

### Pesquisa aleatória (Random Search)

In [None]:
def random_search(clf_rf, param_dist, X_train, y_train, n_iter=80,
                  scoring=None, random_state=None):
    """Run a randomized hyper-parameter search and report the top candidates.

    Parameters
    ----------
    clf_rf : estimator
        Estimator passed to RandomizedSearchCV.
    param_dist : dict
        Parameter distributions or lists to sample from.
    X_train, y_train
        Training features and labels passed to ``fit``.
    n_iter : int, default 80
        Requested number of sampled settings (the former hard-coded value).
    scoring : str or callable, optional
        Forwarded to RandomizedSearchCV. The default ``None`` keeps the
        previous behaviour. NOTE(review): ``grid_search`` scores with
        'roc_auc'; pass ``scoring='roc_auc'`` here if the two searches are
        meant to be compared on the same metric.
    random_state : int, optional
        Seed for the sampling; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    # Named `search` so the local does not shadow this function's own name.
    search = RandomizedSearchCV(clf_rf, param_distributions=param_dist,
                                n_iter=n_iter, scoring=scoring,
                                random_state=random_state,
                                verbose=1, n_jobs=-1)

    start = time()
    search.fit(X_train, y_train)
    elapsed = time() - start

    # Fewer than n_iter settings may have been run (e.g. a small all-list
    # grid), so report the count that was actually evaluated.
    n_candidates = len(search.cv_results_['params'])
    print("RandomizedSearchCV levou %.2f segundos para %d candidatos"
      " configurações de parâmetros." % (elapsed, n_candidates))
    report(search.cv_results_)

    return search

In [None]:
# Candidate values for the coarse randomized search
kernel = ['rbf', 'linear', 'sigmoid', 'poly']
C = [1, 1000]
gamma = [1, 0.00001]
random_state = [0, 24]

# Parameter space sampled by the randomized search
random_param = {'kernel': kernel,
              'C': C,
              'gamma': gamma,
              'random_state': random_state}

# Base estimator (features are used unscaled, as in the original workflow)
clf_svc_sirms = SVC(random_state=24, probability=True).fit(X_train, y_train)

svc_random_sirms = random_search(clf_svc_sirms, random_param, X_train, y_train);

# Refine around the best randomized-search setting. The fixed offsets can push
# C and gamma to zero or below (e.g. C=1 - 150, gamma=1e-5 - 0.1), which SVC
# does not accept, so only strictly positive values are kept.
best_gamma = svc_random_sirms.best_params_['gamma']
best_C = svc_random_sirms.best_params_['C']

grid_params_sirms = {
    'kernel': [svc_random_sirms.best_params_['kernel']],
    'random_state': [svc_random_sirms.best_params_['random_state']],
    'gamma': [g for g in (best_gamma - 0.1,
                          best_gamma - 0.01,
                          best_gamma,
                          best_gamma + 0.1,
                          best_gamma + 0.01) if g > 0] + ['auto'],
    'C': [c for c in (best_C - 150,
                      best_C - 100,
                      best_C,
                      best_C + 100,
                      best_C + 150) if c > 0]
}

svc_grid_sirms = grid_search(clf_svc_sirms, grid_params_sirms, X_train, y_train);

In [None]:
# Cross-validated score of the untuned estimator (no `scoring` given, so the
# estimator's default scorer is used)
baseline = cross_val_score(clf_svc_sirms, X_train, y_train).mean()
# Running maximum of the mean CV score over the candidates of each search
y_gs = np.maximum.accumulate(svc_grid_sirms.cv_results_['mean_test_score'])
y_rs = np.maximum.accumulate(svc_random_sirms.cv_results_['mean_test_score'])

print(f'Baseline = {baseline:.2f}')

print()
print('Best param - Random search: %s' % svc_random_sirms.best_params_)
print(f'Random search = %.2f' % svc_random_sirms.best_score_)

print()
print('Best param - Grid search: %s' % svc_grid_sirms.best_params_)
print(f'Grid search = %.2f' % svc_grid_sirms.best_score_)

# NOTE(review): grid_search scores with 'roc_auc' while random_search is called
# without `scoring`, so the two curves may not be on the same metric - confirm.
plt.plot(y_gs, 'gs-', label='Grid search')
plt.plot(y_rs, 'rs-', label='Random search')
plt.xlabel('Iteration')
plt.ylabel('score')
plt.ylim(0, 1)
plt.title('Valor da melhor pontuação CV amostrada');
plt.legend();

##### Modelo com os melhores parâmetros

In [None]:
# Final SVM for the SiRMS descriptors, with hand-entered hyper-parameters,
# fitted on the full training matrix
svc_best = SVC(kernel='poly', C=101, gamma='auto', probability=True, random_state=0)
svc_best.fit(X_train, y_train)

##### 5-fold cross-validation

In [None]:
# Local import: `clone` is not among the notebook's top-level imports
from sklearn.base import clone

# Per-fold accumulators: predictions, class probabilities and test indices
pred = []
ad = []
index = []
cross_val = StratifiedKFold(n_splits=5)

# 5-fold loop
for train_index, test_index in cross_val.split(X_train, y_train):
    # Fit an unfitted copy on each fold. Refitting `svc_best` itself would
    # leave it trained on the last fold's subset only, and that object is the
    # one later used for external prediction and written to disk.
    fold_model = clone(svc_best).fit(X_train.iloc[train_index], y_train[train_index])
    fold_pred = fold_model.predict(X_train.iloc[test_index])
    fold_ad = fold_model.predict_proba(X_train.iloc[test_index])
    pred.append(fold_pred)
    ad.append(fold_ad)
    index.append(test_index)

In [None]:
# Minimum predicted-class probability for a prediction to count as inside the
# applicability domain (AD)
threshold_ad = 0.70

# Assemble the out-of-fold results
fold_index = np.concatenate(index)
fold_pred = np.concatenate(pred)
fold_ad = np.concatenate(ad)
fold_ad = (np.amax(fold_ad, axis=1) >= threshold_ad).astype(str)
five_fold_sirms = pd.DataFrame({'Prediction': fold_pred,'AD': fold_ad}, index=list(fold_index))

# AD column = the prediction when it is inside the domain, NaN otherwise.
# Built with a mask instead of chained assignment (`df.AD[mask] = ...`), which
# does not update the frame under pandas copy-on-write.
inside_ad = five_fold_sirms['AD'] == 'True'
five_fold_sirms['AD'] = five_fold_sirms['Prediction'].astype(object).where(inside_ad, np.nan)
five_fold_sirms.sort_index(inplace=True)
five_fold_sirms['y_train'] = pd.Series(y_train)
five_fold_sirms_ad = five_fold_sirms.dropna().astype(int)
cobertura_5f = len(five_fold_sirms_ad) / len(five_fold_sirms)

# SiRMS statistics over all compounds
sirms = pd.DataFrame(stats(five_fold_sirms['y_train'], five_fold_sirms['Prediction']))
sirms['Cobertura'] = 1.0

# SiRMS statistics restricted to compounds inside the AD
sirms_ad = five_fold_sirms.dropna(subset=['AD']).astype(int)
cobertura_sirms_ad = len(sirms_ad['AD']) / len(five_fold_sirms['y_train'])
sirms_ad = pd.DataFrame(stats(sirms_ad['y_train'], sirms_ad['AD']))
sirms_ad['Cobertura'] = round(cobertura_sirms_ad, 2)

# Print the statistics
print('\033[1m' + 'Características estatísticas de validação cruzada externa de 5 vezes dos modelos QSAR desenvolvidos SiRMS' + '\n' + '\033[0m')
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
sirms_5f_stats = pd.concat([sirms, sirms_ad])
sirms_5f_stats.set_index([['SiRMS', 'SiRMS AD']], drop=True, inplace=True)
sirms_5f_stats

##### Prever conjunto retido externo após o balanceamento

In [None]:
# Rows held out as the external set (empty when no balancing was done)
moldf_ext = moldf_desc[(moldf_desc['Set'] == 'ext')]
descriptor_list = list(X_train.columns.values)

if len(moldf_ext) > 0:
    y_ext = moldf_ext['Outcome'].to_numpy()
    X_ext = moldf_ext[descriptors]

    # Keep only the descriptors the model was trained on
    X_ext = X_ext[descriptor_list]

    # Predict classes and flag predictions at or above the AD threshold
    ext_set_sirms = svc_best.predict(X_ext)
    ext_set_sirms_ad = svc_best.predict_proba(X_ext)
    ext_set_sirms_ad = (np.amax(ext_set_sirms_ad, axis=1) >= threshold_ad).astype(str)

    # AD column = the prediction when inside the domain, NaN otherwise. Built
    # with a mask instead of chained assignment, which does not update the
    # frame under pandas copy-on-write.
    ext_set_sirms = pd.DataFrame({'Prediction': ext_set_sirms,'AD': ext_set_sirms_ad})
    inside_ad = ext_set_sirms['AD'] == 'True'
    ext_set_sirms['AD'] = ext_set_sirms['Prediction'].astype(object).where(inside_ad, np.nan)
    ext_set_sirms.sort_index(inplace=True)
    ext_set_sirms['y_ext'] = pd.Series(y_ext)
    ext_set_sirms_ad = ext_set_sirms.dropna().astype(int)
    cobertura_ext = len(ext_set_sirms_ad) / len(ext_set_sirms)

    # Statistics over the whole external set
    sirms_ext = pd.DataFrame(stats(ext_set_sirms['y_ext'], ext_set_sirms['Prediction']))
    sirms_ext['Cobertura'] = 1.0

    # Statistics restricted to compounds inside the AD
    sirms_ext_ad = ext_set_sirms.dropna(subset=['AD']).astype(int)
    cobertura_sirms_ext_ad = len(sirms_ext_ad['AD']) / len(ext_set_sirms['y_ext'])
    sirms_ext_ad = pd.DataFrame(stats(sirms_ext_ad['y_ext'], sirms_ext_ad['AD']))
    sirms_ext_ad['Cobertura'] = round(cobertura_sirms_ext_ad, 2)

    # Print the statistics
    print('\033[1m' + 'Características estatísticas da previsão do conjunto retido na fonte por modelos SiRMS' + '\n' + '\033[0m')
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    ext_set_sirms_stats = pd.concat([sirms_ext, sirms_ext_ad])
    ext_set_sirms_stats.set_index([['SiRMS Ext.', 'SiRMS Ext. AD']], drop=True, inplace=True)

##### Y-randomization

In [None]:
# Y-randomization: score the model on the real labels and on 20 label
# permutations (5-fold CV, balanced accuracy, fixed seed)
permutations = 20
score, permutation_scores, pvalue = permutation_test_score(svc_best, X_train, y_train,
                                                           cv=5, scoring='balanced_accuracy',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=24)
# Real-label score, mean score over the permuted labels, and the p-value
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

##### Salvar modelo

In [None]:
# Pickle the SiRMS model into a gzip-compressed file
with gzip.open('model/sars-cov-3clpro-sirms_SVM_ad_balanced.pgz', 'wb') as model_file:
    cPickle.dump(svc_best, model_file)

### Estatísticas de plotagem

In [None]:
# Collect the statistics to plot: 5-fold results, plus the external-set
# results when an external set exists
if len(moldf_ext) == 0:
    sirms_stats = sirms_5f_stats.copy()
else:
    sirms_stats = pd.concat([sirms_5f_stats, ext_set_sirms_stats], axis=0)

In [None]:
# Transpose so each statistic becomes a row and each model a column
sirms_stats_t = sirms_stats.T
sirms_stats_t = sirms_stats_t.reset_index()
sirms_stats_t = sirms_stats_t.rename(columns={'index': 'Stats'})

# Build the bar chart. The 'seaborn-colorblind' style was renamed to
# 'seaborn-v0_8-colorblind' in matplotlib 3.6 and the old name later removed,
# so fall back to the new name when the old one is unavailable.
plot_style = 'seaborn-colorblind'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-v0_8-colorblind'
plt.style.use(plot_style)
fig, ax1 = plt.subplots(figsize=(8,5), dpi=130)

sirms_stats_t.plot(kind='bar', ax=ax1, width=0.8)
ax1.set_xticklabels(labels=sirms_stats_t['Stats'].tolist(), fontsize=14, rotation=0)
# Dashed reference line at 0.6
ax1.axhline(y=.6, color='indianred', ls='dashed')
ax1.legend_.remove()
plt.title('Características estatísticas', fontsize=16)
ax1.set_yticks(np.arange(0, 1.1, 0.1))
ax1.tick_params(labelsize=9)

# Re-create the legend below the axes
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles, labels, fontsize=12,
            loc='upper center', bbox_to_anchor=(0.5, -0.09), ncol=4)
fig.tight_layout()

plt.savefig('statistics-sirms.png', bbox_inches='tight', transparent=False, format='png', dpi=300)
plt.show();

# PaDEL-Descriptor

## Descritores de importação

Os descritores foram calculados externamente usando o PaDEL-Descriptor

In [None]:
# Read the comma-separated PaDEL descriptor table and discard its first column
desc = pd.read_csv('descriptors/descritores_padel.csv', sep=',')
desc = desc.drop(columns=desc.columns[:1])

# Descriptor names = columns of `desc` that are not already columns of `moldf`
descriptors = list(desc.columns.difference(moldf.columns))
desc.head()

In [None]:
# Put the PaDEL descriptor columns next to the molecule table (joined on the index)
moldf_desc = pd.concat([moldf, desc], axis=1)
balance_data = 'no'

if balance_data != 'yes':
    # No balancing requested: every compound goes to the training set
    moldf_desc['Set'] = 'train'
    # Report the class counts of each set
    print('Forma do conjunto de treinamento: %s' % Counter(moldf_desc['Outcome']))
    print('Forma externa definida: %s' % Counter(moldf_desc.loc[moldf_desc['Set'] == 'ext', 'Outcome']))
else:
    # Balance the classes (author's note: half by similarity, half at random)
    moldf_desc = BalanceBySim(moldf_desc, 'Outcome', 2)
    # Report the class counts of each set
    print('Forma do conjunto de treinamento: %s' % Counter(moldf_desc.loc[moldf_desc['Set'] == 'train', 'Outcome']))
    print('Forma externa definida: %s' % Counter(moldf_desc.loc[moldf_desc['Set'] == 'ext', 'Outcome']))

In [None]:
# Rows flagged as training data
moldf_train = moldf_desc.loc[moldf_desc['Set'] == 'train']

# Labels and PaDEL descriptor matrix
y_train = moldf_train['Outcome'].to_numpy()
X_train = moldf_train.loc[:, descriptors]
X_train.shape

##### Remover variáveis constantes e quase constantes

In [None]:
# Keep non-object columns only, drop every column containing a missing value,
# then zero-fill whatever is left
X_train = (
    X_train
    .select_dtypes(exclude=['object'])
    .dropna(axis=1, how='any')
    .fillna(0)
)


def variance_filter(data, threshold=0.1):
    """Return the columns of `data` that VarianceThreshold(threshold) keeps."""
    selector = VarianceThreshold(threshold).fit(data)
    kept_positions = selector.get_support(indices=True)
    kept_names = data.columns[kept_positions]
    return data[kept_names]


# Apply the low-variance filter with its default threshold
X_train = variance_filter(X_train)

##### Remover variáveis correlacionadas

In [None]:
# Pairwise correlation between the remaining descriptors
correlation_matrix = X_train.corr()

# Strictly-lower triangle: entry (i, j) with j < i. A column is flagged when its
# absolute correlation with ANY earlier column exceeds 0.9, which is the same
# rule the former nested Python loop applied, but evaluated in one vectorized
# pass. NaN correlations compare as False and therefore never flag a column.
high_corr = np.tril(correlation_matrix.abs().to_numpy() > 0.9, k=-1)
correlated_features = set(correlation_matrix.columns[high_corr.any(axis=1)])

# Reassign instead of dropping in place: X_train is derived from another frame
X_train = X_train.drop(columns=list(correlated_features))

X_train.shape

In [None]:
# Write the filtered descriptor matrix as a tab-separated file without the index column
X_train.to_csv('descriptors/padel-chembl-sars-cov-3C-like-proteinase-processed.txt', sep='\t', index=False)

## Construção de modelo

### Dados de modelagem

##### Encontre os melhores parâmetros

In [None]:

# Base estimator for the PaDEL descriptors
clf_svc_padel = SVC(random_state=24, probability = True).fit(X_train, y_train)

# Coarse randomized search over `random_param` (defined in the SiRMS section)
svc_random_padel = random_search(clf_svc_padel, random_param, X_train, y_train);

# Refine around the best randomized-search setting. The fixed offsets can push
# C and gamma to zero or below (e.g. C=1 - 150, gamma=1e-5 - 0.1), which SVC
# does not accept, so only strictly positive values are kept.
best_gamma = svc_random_padel.best_params_['gamma']
best_C = svc_random_padel.best_params_['C']

grid_params_padel = {
    'kernel': [svc_random_padel.best_params_['kernel']],
    'random_state': [svc_random_padel.best_params_['random_state']],
    'gamma': [g for g in (best_gamma - 0.1,
                          best_gamma - 0.01,
                          best_gamma,
                          best_gamma + 0.1,
                          best_gamma + 0.01) if g > 0] + ['auto'],
    'C': [c for c in (best_C - 150,
                      best_C - 100,
                      best_C,
                      best_C + 100,
                      best_C + 150) if c > 0]
}

svc_grid_padel = grid_search(clf_svc_padel, grid_params_padel, X_train, y_train);

In [None]:
# Cross-validated score of the untuned estimator (no `scoring` given, so the
# estimator's default scorer is used)
baseline = cross_val_score(clf_svc_padel, X_train, y_train).mean()
# Running maximum of the mean CV score over the candidates of each search
y_gs = np.maximum.accumulate(svc_grid_padel.cv_results_['mean_test_score'])
y_rs = np.maximum.accumulate(svc_random_padel.cv_results_['mean_test_score'])

print(f'Baseline = {baseline:.2f}')

print()
print('Best param - Random search: %s' % svc_random_padel.best_params_)
print(f'Random search = %.2f' % svc_random_padel.best_score_)

print()
print('Best param - Grid search: %s' % svc_grid_padel.best_params_)
print(f'Grid search = %.2f' % svc_grid_padel.best_score_)

# NOTE(review): grid_search scores with 'roc_auc' while random_search is called
# without `scoring`, so the two curves may not be on the same metric - confirm.
plt.plot(y_gs, 'gs-', label='Grid search')
plt.plot(y_rs, 'rs-', label='Random search')
plt.xlabel('Iteration')
plt.ylabel('score')
plt.ylim(0, 1)
plt.title('Valor da melhor pontuação CV amostrada');
plt.legend();

In [None]:
# Final SVM for the PaDEL descriptors, with hand-entered hyper-parameters,
# fitted on the full training matrix
svc_best = SVC(kernel='linear', C=1, gamma=0.9, probability=True, random_state=0)
svc_best.fit(X_train, y_train)

##### Validação cruzada 5 vezes

In [None]:
# Local import: `clone` is not among the notebook's top-level imports
from sklearn.base import clone

# Per-fold accumulators: predictions, class probabilities and test indices
pred = []
ad = []
index = []
cross_val = StratifiedKFold(n_splits=5)

# 5-fold loop
for train_index, test_index in cross_val.split(X_train, y_train):
    # Fit an unfitted copy on each fold. Refitting `svc_best` itself would
    # leave it trained on the last fold's subset only, and that object is the
    # one later used for external prediction and written to disk.
    fold_model = clone(svc_best).fit(X_train.iloc[train_index], y_train[train_index])
    fold_pred = fold_model.predict(X_train.iloc[test_index])
    fold_ad = fold_model.predict_proba(X_train.iloc[test_index])
    pred.append(fold_pred)
    ad.append(fold_ad)
    index.append(test_index)

# Minimum predicted-class probability for a prediction to count as inside the
# applicability domain (AD)
threshold_ad = 0.70

# Assemble the out-of-fold results
fold_index = np.concatenate(index)
fold_pred = np.concatenate(pred)
fold_ad = np.concatenate(ad)
fold_ad = (np.amax(fold_ad, axis=1) >= threshold_ad).astype(str)
five_fold_padel = pd.DataFrame({'Prediction': fold_pred,'AD': fold_ad}, index=list(fold_index))

# AD column = the prediction when it is inside the domain, NaN otherwise.
# Built with a mask instead of chained assignment (`df.AD[mask] = ...`), which
# does not update the frame under pandas copy-on-write.
inside_ad = five_fold_padel['AD'] == 'True'
five_fold_padel['AD'] = five_fold_padel['Prediction'].astype(object).where(inside_ad, np.nan)
five_fold_padel.sort_index(inplace=True)
five_fold_padel['y_train'] = pd.Series(y_train)
five_fold_padel_ad = five_fold_padel.dropna().astype(int)
cobertura_5f = len(five_fold_padel_ad) / len(five_fold_padel)

# PaDEL statistics over all compounds
padel = pd.DataFrame(stats(five_fold_padel['y_train'], five_fold_padel['Prediction']))
padel['Cobertura'] = 1.0

# PaDEL statistics restricted to compounds inside the AD
padel_ad = five_fold_padel.dropna(subset=['AD']).astype(int)
cobertura_padel_ad = len(padel_ad['AD']) / len(five_fold_padel['y_train'])
padel_ad = pd.DataFrame(stats(padel_ad['y_train'], padel_ad['AD']))
padel_ad['Cobertura'] = round(cobertura_padel_ad, 2)

# Print the statistics
print('\033[1m' + 'Características estatísticas de validação cruzada externa de 5 vezes dos modelos QSAR desenvolvidos PaDEL' + '\n' + '\033[0m')
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
padel_5f_stats = pd.concat([padel, padel_ad])
padel_5f_stats.set_index([['PaDEL', 'PaDEL AD']], drop=True, inplace=True)
padel_5f_stats

##### Prever conjunto retido externo após o balanceamento

In [None]:
# Rows held out as the external set (empty when no balancing was done)
moldf_ext = moldf_desc[(moldf_desc['Set'] == 'ext')]
descriptor_list = list(X_train.columns.values)

if len(moldf_ext) > 0:
    y_ext = moldf_ext['Outcome'].to_numpy()
    X_ext = moldf_ext[descriptors]

    # Keep only the descriptors the model was trained on
    X_ext = X_ext[descriptor_list]

    # Predict classes and flag predictions at or above the AD threshold
    ext_set_padel = svc_best.predict(X_ext)
    ext_set_padel_ad = svc_best.predict_proba(X_ext)
    ext_set_padel_ad = (np.amax(ext_set_padel_ad, axis=1) >= threshold_ad).astype(str)

    # AD column = the prediction when inside the domain, NaN otherwise. Built
    # with a mask instead of chained assignment, which does not update the
    # frame under pandas copy-on-write.
    ext_set_padel = pd.DataFrame({'Prediction': ext_set_padel,'AD': ext_set_padel_ad})
    inside_ad = ext_set_padel['AD'] == 'True'
    ext_set_padel['AD'] = ext_set_padel['Prediction'].astype(object).where(inside_ad, np.nan)
    ext_set_padel.sort_index(inplace=True)
    ext_set_padel['y_ext'] = pd.Series(y_ext)
    ext_set_padel_ad = ext_set_padel.dropna().astype(int)
    cobertura_ext = len(ext_set_padel_ad) / len(ext_set_padel)

    # Statistics over the whole external set
    padel_ext = pd.DataFrame(stats(ext_set_padel['y_ext'], ext_set_padel['Prediction']))
    padel_ext['Cobertura'] = 1.0

    # Statistics restricted to compounds inside the AD
    padel_ext_ad = ext_set_padel.dropna(subset=['AD']).astype(int)
    cobertura_padel_ext_ad = len(padel_ext_ad['AD']) / len(ext_set_padel['y_ext'])
    padel_ext_ad = pd.DataFrame(stats(padel_ext_ad['y_ext'], padel_ext_ad['AD']))
    padel_ext_ad['Cobertura'] = round(cobertura_padel_ext_ad, 2)

    # Print the statistics
    print('\033[1m' + 'Características estatísticas da previsão do conjunto retido na fonte por modelos PaDEL' + '\n' + '\033[0m')
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    ext_set_padel_stats = pd.concat([padel_ext, padel_ext_ad])
    ext_set_padel_stats.set_index([['PaDEL Ext.', 'PaDEL Ext. AD']], drop=True, inplace=True)

##### Y-randomization

In [None]:
# Y-randomization: score the model on the real labels and on 20 label
# permutations (5-fold CV, balanced accuracy, fixed seed)
permutations = 20
score, permutation_scores, pvalue = permutation_test_score(svc_best, X_train, y_train,
                                                           cv=5, scoring='balanced_accuracy',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=24)
# Real-label score, mean score over the permuted labels, and the p-value
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

##### Salvar modelo

In [None]:
# Pickle the PaDEL model into a gzip-compressed file
with gzip.open('model/sars-cov-3clpro-padel_SVM_ad_balanced.pgz', 'wb') as model_file:
    cPickle.dump(svc_best, model_file)

### Plotar estatísticas

In [None]:
# Collect the statistics to plot: 5-fold results, plus the external-set
# results when an external set exists
if len(moldf_ext) == 0:
    padel_stats = padel_5f_stats.copy()
else:
    padel_stats = pd.concat([padel_5f_stats, ext_set_padel_stats], axis=0)

In [None]:
# Transpose so each statistic becomes a row and each model a column
padel_stats_t = padel_stats.T
padel_stats_t = padel_stats_t.reset_index()
padel_stats_t = padel_stats_t.rename(columns={'index': 'Stats'})

# Build the bar chart. The 'seaborn-colorblind' style was renamed to
# 'seaborn-v0_8-colorblind' in matplotlib 3.6 and the old name later removed,
# so fall back to the new name when the old one is unavailable.
plot_style = 'seaborn-colorblind'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-v0_8-colorblind'
plt.style.use(plot_style)
fig, ax1 = plt.subplots(figsize=(8,5), dpi=130)

padel_stats_t.plot(kind='bar', ax=ax1, width=0.8)
ax1.set_xticklabels(labels=padel_stats_t['Stats'].tolist(), fontsize=14, rotation=0)
# Dashed reference line at 0.6
ax1.axhline(y=.6, color='indianred', ls='dashed')
ax1.legend_.remove()
plt.title('Características estatísticas', fontsize=16)
ax1.set_yticks(np.arange(0, 1.1, 0.1))
ax1.tick_params(labelsize=12)

# Re-create the legend below the axes
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles, labels, fontsize=12,
            loc='upper center', bbox_to_anchor=(0.5, -0.09), ncol=4)
fig.tight_layout()

plt.savefig('statistics-padel.png', bbox_inches='tight', transparent=False, format='png', dpi=300)
plt.show();

# Consenso

In [None]:
# Give each model's out-of-fold columns model-specific names.
# The former `drop(columns='y_train')` assignments were overwritten by the very
# next line and never took effect, so they are removed. 'y_train' therefore
# stays in both frames, as before; the consensus cell further down drops it
# from the merged table.
results_sirms = five_fold_sirms.rename(columns={'Prediction':'sirms', 'AD':'sirms_ad'})
results_padel = five_fold_padel.rename(columns={'Prediction':'padel', 'AD':'padel_ad'})

In [None]:
# Restrict the training table to the original molecule columns
var = moldf.columns.tolist()
moldf_train = moldf_train.loc[:, var]

# Re-number all three tables from zero so they line up row by row, then join
results_sirms = results_sirms.reset_index(drop=True)
results_padel = results_padel.reset_index(drop=True)
predictions = pd.concat(
    [moldf_train.reset_index(drop=True), results_sirms, results_padel],
    axis=1,
)

#### Previsões de consenso

In [None]:
# Consensus over all compounds: 1 only when the mean of both predictions exceeds 0.5
predictions['consensus'] = (predictions.sirms + predictions.padel)/2
predictions['consensus'] = np.where(predictions['consensus'] > 0.5, 1, 0)

# AD-restricted votes as float arrays (NaN = outside that model's domain).
# Vectorized replacement for the former row-by-row loops with chained indexing.
sirms_vote = predictions['sirms_ad'].astype(float).to_numpy()
padel_vote = predictions['padel_ad'].astype(float).to_numpy()
sirms_known = ~np.isnan(sirms_vote)
padel_known = ~np.isnan(padel_vote)
both_known = sirms_known & padel_known

# Combined vote where both models are inside their domain
agreed = np.where((sirms_vote + padel_vote) / 2 > 0.5, 1.0, 0.0)

# Consensus AD: combined vote when both are available, otherwise whichever
# single model is inside its domain, otherwise NaN. np.select takes the first
# matching condition, so the single-model cases only apply when `both_known`
# is False.
predictions['consensus_ad'] = np.select(
    [both_known, padel_known, sirms_known],
    [agreed, padel_vote, sirms_vote],
    default=np.nan,
)

# Strict consensus: defined only when both models are inside their domain
predictions['consensus_rigor'] = np.where(both_known, agreed, np.nan)

predictions.drop(columns=['y_train', 'ID'], inplace=True)

In [None]:
##### SiRMS
# Fill missing predictions with the column median before scoring
predictions['sirms'] = predictions['sirms'].fillna(predictions['sirms'].median())
# SiRMS statistics over all compounds
sirms = pd.DataFrame(stats(predictions['Outcome'], predictions['sirms']))
sirms['Cobertura'] = 1.0

# SiRMS statistics inside the applicability domain
sirms_ad = predictions.dropna(subset=['sirms_ad'])
cobertura_sirms_ad = len(sirms_ad) / len(predictions)
sirms_ad = pd.DataFrame(stats(sirms_ad['Outcome'], sirms_ad['sirms_ad'].astype(int)))
sirms_ad['Cobertura'] = round(cobertura_sirms_ad, 2)

##### PaDEL
predictions['padel'] = predictions['padel'].fillna(predictions['padel'].median())
# PaDEL statistics over all compounds
padel = pd.DataFrame(stats(predictions['Outcome'], predictions['padel']))
padel['Cobertura'] = 1.0

# PaDEL statistics inside the applicability domain
padel_ad = predictions.dropna(subset=['padel_ad'])
cobertura_padel_ad = len(padel_ad) / len(predictions)
padel_ad = pd.DataFrame(stats(padel_ad['Outcome'], padel_ad['padel_ad'].astype(int)))
padel_ad['Cobertura'] = round(cobertura_padel_ad, 2)

##### Consensus

# Consensus statistics over all compounds
consensus = pd.DataFrame(stats(predictions['Outcome'], predictions['consensus']))
consensus['Cobertura'] = 1.0

# Consensus statistics inside the applicability domain
consensus_ad = predictions.dropna(subset=['consensus_ad'])
cobertura_consensus_ad = len(consensus_ad) / len(predictions)

consensus_ad = pd.DataFrame(stats(consensus_ad['Outcome'], consensus_ad['consensus_ad'].astype(int)))
consensus_ad['Cobertura'] = round(cobertura_consensus_ad, 2)

# Strict-consensus statistics
consensus_rigor = predictions.dropna(subset=['consensus_rigor'])
cobertura_consensus_rigor = len(consensus_rigor) / len(predictions)
consensus_rigor = pd.DataFrame(stats(consensus_rigor['Outcome'], consensus_rigor['consensus_rigor'].astype(int)))
consensus_rigor['Cobertura'] = round(cobertura_consensus_rigor, 2)

##### Previsões de exportação

In [None]:
# Export every prediction column except the RDKit molecule objects
pred_exp = predictions.drop(columns=['Mol'])

# Single-sheet workbook, written without the index column
pred_exp.to_excel('predictions-sirms-padel.xlsx', sheet_name='sirms-padel', index=False)

##### Estatísticas

In [None]:
stats = pd.concat([sirms_ad, padel_ad, consensus, consensus_ad, consensus_rigor], axis=0)
stats.set_index([['SiRMS', 'PaDEL', 'Consenso', 'Consenso (AD)', 'Consenso com rigor']], drop=True, inplace=True)

stats

In [None]:
# Estatísticas de transposição
stats_t = stats.T
stats_t = stats_t.reset_index()
stats_t = stats_t.rename(columns={'index': 'Stats'})

# Fazer plot
plt.style.use('seaborn-colorblind')
fig, ax1 = plt.subplots(figsize=(8,5), dpi=130)

stats_t.plot(kind='bar', ax=ax1, width=0.8)
ax1.set_xticklabels(labels=stats_t['Stats'].tolist(), fontsize=14, rotation=0)
ax1.axhline(y=.6, color='indianred', ls='dashed')
ax1.legend_.remove()
plt.title('Características estatísticas do QSAR ', fontsize=16)
ax1.set_yticks(np.arange(0, 1.1, 0.1))
ax1.tick_params(labelsize=9)

handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles, labels, fontsize=10,
            loc='upper center', bbox_to_anchor=(0.5, -0.09), ncol=5)
fig.tight_layout()

plt.savefig('statistics-sirms-padel-5f.png', bbox_inches='tight', transparent=False, format='png', dpi=300)
plt.show();