# Modelos QSAR-SVM para a protease principal (3C-like protease, M<sup>pro</sup>) de SARS-CoV

Os modelos neste fluxo de trabalho foram criados usando os descritores PaDEL-Descriptor, impressões digitais de Morgan, SiRMS (Simplex Representation of Molecular Structure) e Dragon 7.0, com o SVM do scikit-learn.

## Importando Módulos e Funções

In [1]:
# Funções
from functions.BalanceBySim import *
from functions.DescritoresMorgan import morgan_descriptors
from functions.DescritoresPaDEL import padel_descriptors
from functions.DescritoresSiRMS import sirms_descriptors
from functions.DescritoresDragon import dragon_descriptors
from functions.stats import stats
from functions.RocAUC import roc_auc
from functions.YRandomization import y_randomization
from functions.BestModel import grid_search, random_search, best_grid_search, best_random_search
from functions.Evaluation import statistics
from functions.StatsChart import print_stats
from functions.CarregarDados import carregar_dados
from functions.Consenso import statistics_consenso
from functions.ConsensoChart import print_consenso
from functions.ComparisonHyperparameters import compare
import scipy

from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold

## Carregar compostos inibidores (Y)

In [2]:
# Load the inhibitor dataset through the project helper; the class summary
# shown in the cell output is printed during this call.
moldf = carregar_dados()

Original data:  (113, 11)
Dados mantidos:  (113, 11)
[1mForma do conjunto de treinamento:
[0m
		 Classe 1: 40
		 Classe 0: 73
		 Número total de compostos: 113
Class labels: [Counter({0: 73, 1: 40})]


## Gerando conjunto de treinamento e teste a partir dos descritores (X)

#### Função Fingerprints de Harry Morgan

In [3]:
# Compute the Morgan fingerprint descriptors for the loaded compounds and
# pull the training labels / feature matrix out of the returned mapping.
data_morgan = morgan_descriptors(moldf)
Y_train_morgan, X_train_morgan = data_morgan['Y_train'], data_morgan['X_train']
# Last expression of the cell: displays the shape of the feature matrix.
X_train_morgan.shape

[1mForma do conjunto de treinamento:
[0m
		 Classe 1: 40
		 Classe 0: 73
		 Número total de compostos: 113
Class labels: [Counter({0: 73, 1: 40})]
Forma do conjunto de treinamento: Counter({0: 73, 1: 40})
Forma externa definida: Counter()


(113, 2048)

#### PaDEL-Descriptor

In [None]:
# Compute the PaDEL descriptors for the loaded compounds and pull the
# training labels / feature matrix out of the returned mapping.
data_padel = padel_descriptors(moldf)
Y_train_padel, X_train_padel = data_padel['Y_train'], data_padel['X_train']
# Last expression of the cell: displays the shape of the feature matrix.
X_train_padel.shape

#### SiRMS (Simplex Representation of Molecular Structure)

In [None]:
# Compute the SiRMS descriptors for the loaded compounds and pull the
# training labels / feature matrix out of the returned mapping.
data_sirms = sirms_descriptors(moldf)
Y_train_sirms, X_train_sirms = data_sirms['Y_train'], data_sirms['X_train']
# Last expression of the cell: displays the shape of the feature matrix.
X_train_sirms.shape

#### Dragon 7.0

In [None]:
# Compute the Dragon descriptors for the loaded compounds and pull the
# training labels / feature matrix out of the returned mapping.
data_dragon = dragon_descriptors(moldf)
Y_train_dragon, X_train_dragon = data_dragon['Y_train'], data_dragon['X_train']
# Last expression of the cell: displays the shape of the feature matrix.
X_train_dragon.shape

## Parâmetros p/ geração dos modelos

#### Parâmetros SVM

In [4]:
# SVM hyperparameter search spaces.
# Import the distribution explicitly: a bare `import scipy` is not guaranteed
# to load the `scipy.stats` submodule, and the name `stats` is already taken
# in this notebook by `from functions.stats import stats`.
from scipy.stats import expon

# Discrete grid for the exhaustive (grid) search.
svm_param = {
    "C": [.01, .1, 1, 5, 10, 100],
    # gamma=0 is rejected by current scikit-learn SVC (it only used to be a
    # legacy alias for 1 / n_features); 'auto' requests that value explicitly,
    # so no grid point is wasted on a fit that errors out.
    "gamma": ['auto', .01, .1, 1, 5, 10, 100],
    "kernel": ['rbf', 'linear', 'sigmoid', 'poly'],
    "random_state": [1]
}

# Continuous distributions for the randomized search: C and gamma are drawn
# from exponential distributions with scale 0.01.
svm_dist = {
    "C": expon(scale=.01),
    "gamma": expon(scale=.01),
    "kernel": ['rbf', 'linear', 'sigmoid', 'poly'],
    "random_state": [1]
}

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from rdkit import Chem
from math import floor
from time import time
import pylab as pl
import itertools
import matplotlib.pyplot as plt 
from functions.Hyperparameter import procurar
from functions.PlotConfusionMatrix import plot_confusion_matrix
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss

# SEARCH FOR PARAMETERS
def compare(estimator, X_train, Y_train, param, dist):
    """Run a grid search and a random search and show their results together.

    Both searches are delegated to ``procurar``. For each one, the best
    parameters and best score are printed, and a confusion matrix of the
    search's predictions on the training data is plotted.

    Args:
        estimator: Estimator instance handed to ``procurar``.
        X_train: Training features; also the data the predictions are made on.
        Y_train: Training labels; used as ground truth for the matrices.
        param: Search space passed to the "grid" search.
        dist: Search space passed to the "random" search.

    Returns:
        None. Results are printed and drawn with matplotlib.
    """

    def _search_and_report(space, mode, banner):
        # NOTE(review): ``procurar`` is assumed to return a fitted search
        # object exposing predict / best_params_ / best_score_ - confirm
        # against functions.Hyperparameter.
        found = procurar(estimator, X_train, Y_train, space, mode)
        matrix = confusion_matrix(y_true=Y_train, y_pred=found.predict(X_train))
        print(banner)
        print("Melhores Parâmetros: %s" % (found.best_params_))
        print("Melhor precisão de treinamento:\t", found.best_score_)
        return matrix

    grid_matrix = _search_and_report(param, "grid", "**Resultados Grid Search**")
    random_matrix = _search_and_report(dist, "random", "**Resultados Random Search**")

    # Two side-by-side panels. The random-search matrix is drawn first, on
    # whichever axes pyplot treats as current right after subplots(); the
    # grid-search matrix is then drawn on panel 121, selected explicitly.
    plt.subplots(1, 2)
    plt.subplots_adjust(left=-0.5, wspace=0.5)
    plot_confusion_matrix(random_matrix, title="Matriz de confusão Random Search")
    plt.subplot(121)
    plot_confusion_matrix(grid_matrix, title="Matriz de confusão Grid Search")

In [8]:
# Compare grid search vs. random search for an SVC on the Morgan descriptors,
# using the grid (svm_param) and distribution (svm_dist) search spaces.
compare(SVC(), X_train_morgan, Y_train_morgan, svm_param, svm_dist)

**Resultados Grid Search**
Melhores Parâmetros: {'C': 0.01, 'gamma': 0.1, 'kernel': 'poly', 'random_state': 1}
Melhor precisão de treinamento:	 0.7977272727272727


NameError: name 'acc' is not defined

## Otimizando hiperparâmetros

#### Grid search

In [None]:
# Grid Search - SVM
#grid_svm_morgan = best_grid_search(SVC(), param_grid_svm, X_train_morgan, Y_train_morgan, 'Morgan', 'SVM')
#grid_svm_sirms = best_grid_search(SVC(), param_grid_svm, X_train_sirms, Y_train_sirms, 'Sirms', 'SVM')
#grid_svm_dragon = best_grid_search(SVC(), param_grid_svm, X_train_dragon, Y_train_dragon, 'Dragon', 'SVM')
#grid_svm_padel = best_grid_search(SVC(), param_grid_svm, X_train_padel, Y_train_padel, 'Padel', 'SVM')

#### Random Search

In [None]:
# Random Search - SVM
# One randomized search per descriptor set. The search space is `svm_dist`
# (declared with the SVM parameters earlier in this notebook); the name
# `param_grid_svm` used here before is not defined in any visible cell.
# NOTE(review): confirm that best_random_search expects the search space as
# its second argument.
random_svm_morgan = best_random_search(SVC(), svm_dist, X_train_morgan, Y_train_morgan, 'Morgan', 'SVM')
random_svm_sirms = best_random_search(SVC(), svm_dist, X_train_sirms, Y_train_sirms, 'Sirms', 'SVM')
random_svm_dragon = best_random_search(SVC(), svm_dist, X_train_dragon, Y_train_dragon, 'Dragon', 'SVM')
random_svm_padel = best_random_search(SVC(), svm_dist, X_train_padel, Y_train_padel, 'Padel', 'SVM')

#### Criando modelos a partir dos melhores parâmetros

In [None]:
# Build one SVC per descriptor set with hard-coded hyperparameters and
# probability estimates enabled.
svm_best_morgan = SVC(kernel='poly', C=1, gamma=1, probability=True, random_state=0)
svm_best_sirms = SVC(kernel='linear', C=1, gamma=1, probability=True, random_state=0)
svm_best_dragon = SVC(kernel='linear', C=1, gamma=1, probability=True, random_state=0)
svm_best_padel = SVC(kernel='linear', C=1, gamma=0.9, probability=True, random_state=0)

# Fit every model on the training data of its own descriptor set.
svm_best_morgan.fit(X_train_morgan, Y_train_morgan)
svm_best_sirms.fit(X_train_sirms, Y_train_sirms)
svm_best_dragon.fit(X_train_dragon, Y_train_dragon)
svm_best_padel.fit(X_train_padel, Y_train_padel)

## Validação dos modelos

####  Área sob a curva ROC (AUC) 

In [None]:
# n_splits: int, default = 5. Number of folds; must be at least 2.
# shuffle=False is the scikit-learn default, spelled out here.
cross_val = StratifiedKFold(n_splits=5, shuffle=False)

In [None]:
# Cross-validated ROC / AUC for each descriptor-specific SVM.
# The single call that stood here used `svm_best`, `X_train` and `y_train`,
# none of which is defined in this notebook; the models and training sets
# that do exist carry a per-descriptor suffix.
roc_auc(svm_best_morgan, cross_val, X_train_morgan, Y_train_morgan)
roc_auc(svm_best_sirms, cross_val, X_train_sirms, Y_train_sirms)
roc_auc(svm_best_dragon, cross_val, X_train_dragon, Y_train_dragon)
roc_auc(svm_best_padel, cross_val, X_train_padel, Y_train_padel)

#### Estatísticas e AD

In [None]:
# Evaluate the Morgan model with the shared cross-validation splitter and
# keep both the summary statistics and the per-fold results.
data_stats_morgan = statistics(
    svm_best_morgan,
    X_train_morgan,
    Y_train_morgan,
    cross_val,
    data_morgan['moldf_desc'],
    data_morgan['moldf_train'],
    'Morgan',
)
stats_morgan, five_fold_morgan = data_stats_morgan['stats'], data_stats_morgan['five_fold']
# Last expression of the cell: displays the statistics.
stats_morgan

In [None]:
# Evaluate the SiRMS model with the shared cross-validation splitter and
# keep both the summary statistics and the per-fold results.
data_stats_sirms = statistics(
    svm_best_sirms,
    X_train_sirms,
    Y_train_sirms,
    cross_val,
    data_sirms['moldf_desc'],
    data_sirms['moldf_train'],
    'Sirms',
)
stats_sirms, five_fold_sirms = data_stats_sirms['stats'], data_stats_sirms['five_fold']
# Last expression of the cell: displays the statistics.
stats_sirms

In [None]:
# Evaluate the Dragon model with the shared cross-validation splitter and
# keep both the summary statistics and the per-fold results.
data_stats_dragon = statistics(
    svm_best_dragon,
    X_train_dragon,
    Y_train_dragon,
    cross_val,
    data_dragon['moldf_desc'],
    data_dragon['moldf_train'],
    'Dragon',
)
stats_dragon, five_fold_dragon = data_stats_dragon['stats'], data_stats_dragon['five_fold']
# Last expression of the cell: displays the statistics.
stats_dragon

In [None]:
# Evaluate the PaDEL model with the shared cross-validation splitter and
# keep both the summary statistics and the per-fold results.
data_stats_padel = statistics(
    svm_best_padel,
    X_train_padel,
    Y_train_padel,
    cross_val,
    data_padel['moldf_desc'],
    data_padel['moldf_train'],
    'Padel',
)
stats_padel, five_fold_padel = data_stats_padel['stats'], data_stats_padel['five_fold']
# Last expression of the cell: displays the statistics.
stats_padel

In [None]:
# NOTE(review): as far as the visible cells show, `stats` is the function
# imported with `from functions.stats import stats`; no combined statistics
# object is ever assigned to that name. Presumably this call should receive
# the per-descriptor results (stats_morgan, stats_sirms, stats_dragon,
# stats_padel) merged into one object - confirm what print_stats expects.
print_stats(stats)

#### Y-randomization

In [None]:
# Y-randomization check for each descriptor-specific SVM.
# The single call that stood here used `svm_best`, `X_train` and `y_train`,
# none of which is defined in this notebook; the models and training sets
# that do exist carry a per-descriptor suffix.
y_randomization(svm_best_morgan, X_train_morgan, Y_train_morgan)
y_randomization(svm_best_sirms, X_train_sirms, Y_train_sirms)
y_randomization(svm_best_dragon, X_train_dragon, Y_train_dragon)
y_randomization(svm_best_padel, X_train_padel, Y_train_padel)

## Consenso

#### Obter consenso

In [None]:
# Build the consensus statistics from the per-fold results of the four
# descriptor-specific models.
# NOTE(review): the last argument, `stats`, resolves to the function imported
# with `from functions.stats import stats` unless a combined statistics object
# is assigned to that name elsewhere - confirm what statistics_consenso
# expects here.
stats_consenso = statistics_consenso(moldf, five_fold_morgan, five_fold_sirms,five_fold_dragon,five_fold_padel, stats)
# Last expression of the cell: displays the consensus statistics.
stats_consenso

#### Plotar consenso

In [None]:
# Plot the consensus statistics, labelled with the model family name.
print_consenso(stats_consenso, 'SVM')

## Salvar modelos

### Random Forest

In [None]:
# Write each Random Forest model to its own gzip-compressed pickle file.
# NOTE(review): `gzip` and `cPickle` are not imported in any visible cell, and
# the `rf_best_*` models are not created anywhere in this SVM notebook - this
# cell looks carried over from the Random Forest workflow; confirm whether it
# belongs here.
with gzip.GzipFile('model/sars-cov-3clpro-morgan_RF_ad_balanced.pgz', 'w') as f:
    cPickle.dump(rf_best_morgan, f)
with gzip.GzipFile('model/sars-cov-3clpro-sirms_RF_ad_balanced.pgz', 'w') as f:
    cPickle.dump(rf_best_sirms, f)
with gzip.GzipFile('model/sars-cov-3clpro-dragon_RF_ad_balanced.pgz', 'w') as f:
    cPickle.dump(rf_best_dragon, f)
with gzip.GzipFile('model/sars-cov-3clpro-padel_RF_ad_balanced.pgz', 'w') as f:
    cPickle.dump(rf_best_padel, f)

### SVM

In [None]:
# Write each fitted SVM to its own gzip-compressed pickle file.
# Imported locally because no visible cell imports gzip or a pickle module.
import gzip
import pickle

# Each file now receives the model trained on the matching descriptor set;
# before, all four dumps wrote the same, undefined `svm_best`.
# The output paths assume a `model/` directory already exists.
# Only load these pickles back from trusted sources.
with gzip.GzipFile('model/sars-cov-3clpro-morgan_SVM_ad_balanced.pgz', 'w') as f:
    pickle.dump(svm_best_morgan, f)
with gzip.GzipFile('model/sars-cov-3clpro-sirms_SVM_ad_balanced.pgz', 'w') as f:
    pickle.dump(svm_best_sirms, f)
with gzip.GzipFile('model/sars-cov-3clpro-dragon_SVM_ad_balanced.pgz', 'w') as f:
    pickle.dump(svm_best_dragon, f)
with gzip.GzipFile('model/sars-cov-3clpro-padel_SVM_ad_balanced.pgz', 'w') as f:
    pickle.dump(svm_best_padel, f)

## Exportando Predições

### Random Forest

In [None]:
# Export the Morgan predictions to an Excel workbook, sheet "morgan", without
# the index column.
# NOTE(review): `pred_morgan` is not defined in any visible cell - confirm
# where the predictions table is produced before running this.
with pd.ExcelWriter('predictions-morgan.xlsx') as writer:
    pred_morgan.to_excel(writer, sheet_name='morgan', index=False)

### SVM