# Modelo para prever se o steroid é ou não derivado de testosterona

In [8]:
# data structuring
import pandas as pd
import numpy as np
from SpectraFP import SpectraFP
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
from rdkit import Chem

# build-models
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# avaluated models
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import cross_val_score,StratifiedKFold, LeaveOneOut, KFold

pd. set_option("display.max_rows", None)




# Estruturando os dados

In [9]:
# import data descriptor spectral MS
df_descriptor = pd.read_csv('df_esteroids_MB_IN_AR.csv', sep=';')
df_descriptor.columns = df_descriptor.columns.astype(str)

df_descriptor

Unnamed: 0,Classe,[M]+,[M-15]+,[M-29]+,[M-90]+,[M-2·90]+,[M-3·90],[M-90-15]+,[M-2·90-15]+,[M-3·90-15],...,[M-140],[M-157],[M-144],m/z 103,m/z 129,m/z 143,m/z 169,m/z 244,m/z 218,m/z 231
0,2,3,3,0,0,0,0,3,3,0,...,0,0,0,0,1,1,2,0,0,0
1,0,3,0,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2,3,1,0,0,0,0,0,0,0,...,1,0,0,0,1,3,0,0,0,0
5,0,3,3,0,0,0,0,2,0,2,...,0,0,0,0,1,0,3,0,0,0
6,0,3,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,1,0,0,1
7,0,3,0,0,1,0,0,0,0,0,...,0,0,0,0,1,1,1,0,0,0
8,0,3,0,0,1,0,1,0,0,0,...,0,0,0,0,1,3,0,0,0,0
9,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [10]:
df_descriptor.columns = df_descriptor.columns.str.replace('[^a-zA-Z0-9]', '_')
df_descriptor.columns = ["".join(c if c.isalnum() else "_" for c in str(col)) for col in df_descriptor.columns]

df_descriptor

Unnamed: 0,Classe,__M__,__M_15__,_M_29__,_M_90__,_M_2_90__,_M_3_90_,_M_90_15__,_M_2_90_15__,_M_3_90_15_,...,_M_140_,_M_157_,_M_144_,m_z_103,m_z_129,m_z_143,m_z_169,m_z_244,_m_z_218,m_z_231
0,2,3,3,0,0,0,0,3,3,0,...,0,0,0,0,1,1,2,0,0,0
1,0,3,0,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2,3,1,0,0,0,0,0,0,0,...,1,0,0,0,1,3,0,0,0,0
5,0,3,3,0,0,0,0,2,0,2,...,0,0,0,0,1,0,3,0,0,0
6,0,3,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,1,0,0,1
7,0,3,0,0,1,0,0,0,0,0,...,0,0,0,0,1,1,1,0,0,0
8,0,3,0,0,1,0,1,0,0,0,...,0,0,0,0,1,3,0,0,0,0
9,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [11]:
# classes de esteroides
df_descriptor['Classe'].value_counts()

Classe
0    49
2    43
4    27
5    12
3    10
1    10
6     7
Name: count, dtype: int64

In [12]:
# Xi and y
xi = df_descriptor.iloc[:,1::]

In [18]:
df = pd.DataFrame(df_descriptor.iloc[:,0])
df.value_counts()


Classe
0         49
2         43
4         27
5         12
1         10
3         10
6          7
Name: count, dtype: int64

In [24]:
# separando classe 0(nao-derivado de esteroide) e 1
from collections import Counter

# Supondo que df seja o seu DataFrame

# Inicializando uma nova coluna 'Classe_bin' com valor padrão 0
df['Classe_AAS'] = 0

# Atribuindo valor 1 para o intervalo [91:,:]
df.loc[92:, 'Classe_AAS'] = 1

# Exibindo as contagens após a separação
counter = Counter(df['Classe_AAS'])
yi = df['Classe_AAS']
yi = np.array(yi)
yi


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1])

In [27]:
# balanceamento
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Supondo que 'd' é o seu conjunto de dados e está no formato de um DataFrame do pandas

# Separando classe 0 (não derivado de esteroide) e 1
yi = np.where(df['Classe_AAS'].values == 0, 0, 1)
yi = np.array(yi)
yi = yi.ravel()
# Verificando a contagem inicial das classes
print("Contagem inicial das classes:", Counter(yi))

# Aplicando UnderSampler usando RandomUnderSampler
rs = RandomUnderSampler(random_state=42)
xi_resampled, yi_resampled = rs.fit_resample(xi, yi)

# Verificando a contagem após o UnderSampler
print("Contagem após UnderSampler:", Counter(yi_resampled))

Contagem inicial das classes: Counter({0: 92, 1: 66})
Contagem após UnderSampler: Counter({0: 66, 1: 66})


# Treino de Modelos

## RandomForest Classifier

In [28]:
# model RFC tunning
#resampling_balaced = 1, train_split = 29, random_state = 1, n_stimator = 11, max_depth = 6, accuracy : 0.9772727272727273 mcc : 0.9555330859059091

# modelo de Classificação é ou n anabolizante
for w in tqdm(range(1,100)):
    rs = RandomUnderSampler(random_state=w)
    xi_resampled, yi_resampled = rs.fit_resample(xi, yi)
    for j in range(29,100): # train test split
        for z in range(1,100): # random state
            for i in range(7,75): # n_estimators
                for f in range(3,15): # max_depth
                    
                    X_train, X_test, y_train, y_test = train_test_split(xi_resampled, yi_resampled, test_size=0.33, random_state=j, shuffle=True, stratify=yi_resampled)
                    RFC = RandomForestClassifier(random_state=z, n_estimators = i, max_depth=f, n_jobs=24)
                    model_RFC = RFC.fit(X_train,y_train.ravel())
                    predRFC = model_RFC.predict(X_test)
                    accuracy_RFC = accuracy_score(y_test, predRFC)
                    mcc_RFC = matthews_corrcoef(y_test, predRFC)
                    if accuracy_RFC >= 0.95:
                         print("resampling_balaced = {w}, train_split = {j}, random_state = {z}, n_stimator = {i}, max_depth = {f}, accuracy : {acc} mcc : {mcc}"
                               .format(mcc = mcc_RFC, acc = accuracy_RFC, i = i, z = z, f = f, j = j, w = w))
                 

  0%|                                                    | 0/99 [00:00<?, ?it/s]

resampling_balaced = 1, train_split = 29, random_state = 1, n_stimator = 7, max_depth = 4, accuracy : 0.9545454545454546 mcc : 0.9090909090909091
resampling_balaced = 1, train_split = 29, random_state = 1, n_stimator = 7, max_depth = 7, accuracy : 0.9545454545454546 mcc : 0.9090909090909091
resampling_balaced = 1, train_split = 29, random_state = 1, n_stimator = 7, max_depth = 8, accuracy : 0.9545454545454546 mcc : 0.9090909090909091
resampling_balaced = 1, train_split = 29, random_state = 1, n_stimator = 7, max_depth = 11, accuracy : 0.9545454545454546 mcc : 0.9090909090909091
resampling_balaced = 1, train_split = 29, random_state = 1, n_stimator = 7, max_depth = 12, accuracy : 0.9545454545454546 mcc : 0.9090909090909091
resampling_balaced = 1, train_split = 29, random_state = 1, n_stimator = 7, max_depth = 13, accuracy : 0.9545454545454546 mcc : 0.9090909090909091
resampling_balaced = 1, train_split = 29, random_state = 1, n_stimator = 7, max_depth = 14, accuracy : 0.9545454545454546

  0%|                                                    | 0/99 [00:24<?, ?it/s]

resampling_balaced = 1, train_split = 29, random_state = 1, n_stimator = 57, max_depth = 13, accuracy : 0.9772727272727273 mcc : 0.9555330859059091
resampling_balaced = 1, train_split = 29, random_state = 1, n_stimator = 57, max_depth = 14, accuracy : 0.9772727272727273 mcc : 0.9555330859059091





KeyboardInterrupt: 

In [31]:
# cross validation RFC
rs = RandomUnderSampler(random_state=2)
xi_resampled, yi_resampled = rs.fit_resample(xi, yi)

X_train, X_test, y_train, y_test = train_test_split(xi_resampled, yi_resampled, test_size=0.33, random_state=36, shuffle=True, stratify=yi_resampled)
model_RFC = RandomForestClassifier(random_state=49, n_estimators=11, max_depth=5, n_jobs=24)

stratifiedkf=StratifiedKFold(n_splits=7)
# score
score_RFC=cross_val_score(model_RFC,xi_resampled,yi_resampled,cv=stratifiedkf)
#score_GBC=cross_val_score(model_GBC,X,Y,cv=stratifiedkf)
#score_NN=cross_val_score(model_NN,X,Y,cv=stratifiedkf)

#accuracy
#accuracy_RFC = accuracy_score(y_test, y_pred)

# mcc
mcc_RFC= matthews_corrcoef(y_test, y_pred)

# hamming loss
#hamming_loss_RFC = hamming_loss(y_test, y_pred)

print("Cross Validation Score: \n RFC: {rf} MCC: {mcc}".format(rf=score_RFC.mean(), mcc=mcc_RFC.mean()))


NameError: name 'y_pred' is not defined

## Extreme Gradient Boosting Classifier

In [33]:
# XGBC model tunning

# resampling_balaced = 3, train_split = 3, random_state = 2, n_stimator = 70, max_depth = 1, accuracy : 0.9772727272727273 mcc : 0.9555330859059091
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
# modelo de Classificação é ou n anabolizante
for w in tqdm(range(3, 100)):
    rs = RandomUnderSampler(random_state=w)
    xi_resampled, yi_resampled = rs.fit_resample(xi, yi)

    for j in range(1, 100):  # train test split
        for z in range(1, 100):  # random state
            for i in range(1, 75):  # n_estimators
                for f in range(1, 15):  # max_depth
                    X_train, X_test, y_train, y_test = train_test_split(xi_resampled, yi_resampled, test_size=0.33, random_state=j, shuffle=True, stratify=yi_resampled)

                    xgb = XGBClassifier(n_estimators=i, max_depth=f, random_state=z, n_jobs=-1)
                    model_xgb = xgb.fit(X_train, y_train)
                    pred_xgb = model_xgb.predict(X_test)

                    accuracy_xgb = accuracy_score(y_test, pred_xgb)
                    mcc_xgb = matthews_corrcoef(y_test, pred_xgb)

                    if mcc_xgb >= 0.93:
                        print("resampling_balaced = {w}, train_split = {j}, random_state = {z}, n_stimator = {i}, max_depth = {f}, accuracy : {acc} mcc : {mcc}"
                            .format(mcc=mcc_xgb, acc=accuracy_xgb, i=i, z=z, f=f, j=j, w=w))

  0%|                                                    | 0/97 [00:00<?, ?it/s]

resampling_balaced = 3, train_split = 3, random_state = 1, n_stimator = 3, max_depth = 1, accuracy : 0.9772727272727273 mcc : 0.9555330859059091
resampling_balaced = 3, train_split = 3, random_state = 1, n_stimator = 4, max_depth = 1, accuracy : 0.9772727272727273 mcc : 0.9555330859059091
resampling_balaced = 3, train_split = 3, random_state = 1, n_stimator = 11, max_depth = 2, accuracy : 0.9772727272727273 mcc : 0.9555330859059091
resampling_balaced = 3, train_split = 3, random_state = 1, n_stimator = 12, max_depth = 2, accuracy : 0.9772727272727273 mcc : 0.9555330859059091
resampling_balaced = 3, train_split = 3, random_state = 1, n_stimator = 13, max_depth = 2, accuracy : 0.9772727272727273 mcc : 0.9555330859059091
resampling_balaced = 3, train_split = 3, random_state = 1, n_stimator = 14, max_depth = 1, accuracy : 0.9772727272727273 mcc : 0.9555330859059091
resampling_balaced = 3, train_split = 3, random_state = 1, n_stimator = 14, max_depth = 2, accuracy : 0.9772727272727273 mcc :

  0%|                                                  | 0/97 [7:46:34<?, ?it/s]


KeyboardInterrupt: 

# Save Models

In [13]:
import pickle
rs = RandomUnderSampler(random_state=1)
xi_resampled, yi_resampled = rs.fit_resample(xi, yi)

X_train, X_test, y_train, y_test = train_test_split(xi_resampled, yi_resampled, test_size=0.33, random_state=36, shuffle=True, stratify=yi_resampled)
model_GBC = GradientBoostingClassifier(random_state=26, n_estimators=11, max_depth=8,learning_rate)
# save the iris classification model as a pickle file
model_pkl_file = "GradientBoosting_anabolizante_model.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(model_GBC, file)

In [None]:
erwer 

In [None]:
# Carregando bibliotecas
library(dplyr)

# Função para verificar se é um número
is_number <- function(s) {
  !is.na(as.numeric(s))
}

# Função para concatenar ao longo de uma dimensão
cat <- function(dim, ...) {
  args <- list(...)
  valid_args <- lapply(args, function(r) nrow(r) > 0)
  cat_args <- args[unlist(valid_args)]
  do.call(rbind, cat_args)
}

# Função para ordenar as linhas
sortrows <- function(a, i) {
  a[order(a[, i]), ]
}

# Exemplo de utilização:
# data <- sortrows(data, i)  # Onde 'data' é a matriz e 'i' é o índice da coluna pelo qual ordenar
