# Modelos para identificar genes de resistência

Como construir o melhor bancos de dados?

Não balanceado: 66k de sequências de proteínas vindas de genes não resistêntes + 17k de genes resistentes?

Balanceado: tamanhos iguais para sequências positivas de negativa. Downsampling or upsampling?

![Dataset](/home/tiago/documents/PhD-Tiago/pics/Databaseconstruction.png)

## Identificador de genes de resistência

In [5]:
import numpy as np
from Bio import SeqIO
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
def load_training(negative_dataset:str,positive_dataset:str):
    """Load and prepare datases for model training, test and evaluation.
    
    Returns:
        negative_data:list -> proteins sequences that are not taken as antibiotic resistance proteins
        negative_lavel:array-> a array of zeros representing non resistance label
        positive_data:list -> resistance proteins sequences
        positive_label:array -> a array of 1 representing resistance label
    """
    #Loading nonargs data
    negative_data = [str(info.seq) for info in SeqIO.parse(negative_dataset, "fasta")]
    negative_data = list(map(" ".join,negative_data))
    negative_label = np.zeros((len(negative_data),1), float)
    #Loading ARG data
    positive_data = [str(info.seq) for info in SeqIO.parse(positive_dataset, "fasta")]
    positive_data = list(map(" ".join,positive_data))
    positive_label = np.ones((len(positive_data),1), float)
    return negative_data,negative_label,positive_data,positive_label


In [5]:
neg_proteins_seq, neg_proteins_tags,pos_proteins_seq,pos_proteins_tags =load_training(
    negative_dataset = "datasets/uniprot/negative.db.fasta",
    positive_dataset = "datasets/hmd/arg_v5.fasta")

In [6]:
np.random.seed(0)
X_p_sub = np.random.choice(pos_proteins_seq, size = int(len(pos_proteins_seq)*0.10), replace = False)
y_p_sub = np.random.choice(pos_proteins_tags.ravel(), size = int(pos_proteins_tags.shape[0]*0.10), replace = False)
X_n_sub = np.random.choice(neg_proteins_seq, size = int(len(neg_proteins_seq)*0.10), replace = False)
y_n_sub = np.random.choice(neg_proteins_tags.ravel(), size = int(neg_proteins_tags.shape[0]*0.10), replace = False)

In [7]:
del neg_proteins_seq, neg_proteins_tags, pos_proteins_seq, pos_proteins_tags 

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
#Samples split
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_p_sub,y_p_sub, random_state = 42)
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(X_n_sub,y_n_sub, random_state = 42)

In [10]:
del X_n_sub ,X_p_sub, y_n_sub, y_p_sub

In [11]:
#Merged samples
X_train, X_test, y_train, y_test = (
    np.concatenate((X_train_n, X_train_p), axis = 0).reshape(-1,1),
    np.concatenate((X_test_n, X_test_p), axis = 0).reshape(-1,1),
    np.concatenate((y_train_n, y_train_p), axis = 0).reshape(-1,1),
    np.concatenate((y_test_n, y_test_p), axis = 0).reshape(-1,1)
    )

In [12]:
del X_train_p, X_test_p, y_train_p, y_test_p, X_train_n, X_test_n, y_train_n, y_test_n

In [13]:
X_train.shape,X_test.shape

((6281, 1), (2094, 1))

### Information on protein sequences

In [14]:
#Vocabulary -> number of different amoniacids in my database
aminoacids = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
longest_protein = sorted(list(map(len,list(X_train.ravel()))))[-1]
print(len(aminoacids))
print(aminoacids)
print(longest_protein)


26
['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
68699


In [15]:
from sklearn.preprocessing import LabelEncoder
encoder_obj = LabelEncoder()

In [16]:
def sequence_encoder(array, classes_array,encoder_type):
    """"""
    encoder_type.fit(classes_array)
    encoded_proteins = []
    for n in range(array.shape[0]):
        encoded_proteins.append(list(encoder_obj.transform(array[n][0].split(" "))))
    return encoded_proteins

In [17]:
X_train_padded = sequence_encoder(X_train,aminoacids,encoder_obj)
X_train_padded = pad_sequences(X_train_padded, maxlen = longest_protein, padding='post',value=27) #sets the same lenght for all vector
X_test_padded = sequence_encoder(X_test, aminoacids,encoder_obj)
X_test_padded = pad_sequences(X_test_padded, maxlen = longest_protein, padding = "post", value = 27 )

In [18]:
X_train_padded[0]

array([12,  0,  0, ..., 27, 27, 27], dtype=int32)

In [19]:
del aminoacids, X_train, X_test

In [20]:
X_train_padded.shape, X_test_padded.shape,y_train.shape, y_test.shape

((6281, 68699), (2094, 68699), (6281, 1), (2094, 1))

In [25]:
CNN = Sequential([
    tf.keras.layers.Embedding(input_dim = 26+2, output_dim = 1, input_length = longest_protein),

    tf.keras.layers.Conv1D(filters = 32,kernel_size = 40*4),  #Conv1
    tf.keras.layers.MaxPooling1D(pool_size = 5*2), 

    tf.keras.layers.Conv1D(filters = 64,kernel_size = 30*4),  #Conv2

    tf.keras.layers.Conv1D(filters = 128,kernel_size = 30*4), #Conv3
    tf.keras.layers.MaxPooling1D(pool_size = 5*2),

    tf.keras.layers.Conv1D(filters = 256,kernel_size = 20*3), #Conv4


    tf.keras.layers.Conv1D(filters = 256,kernel_size = 20*3), #Conv5
    tf.keras.layers.MaxPooling1D(pool_size = 4),

    tf.keras.layers.Conv1D(filters = 256,kernel_size = 20*3),    #Conv6
    tf.keras.layers.MaxPooling1D(pool_size = 2),

    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.42),
    
    tf.keras.layers.Dense(12288, activation = "relu"),
    tf.keras.layers.Dense(1024, activation = "relu"),

    tf.keras.layers.Dense(1, activation = "sigmoid")
])
CNN.compile(
    optimizer = tf.optimizers.Adam(),
    loss = tf.losses.BinaryCrossentropy(),
    metrics = [
        tf.metrics.BinaryAccuracy(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall()
    ]
)
print(CNN.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 68699, 1)          28        
                                                                 
 conv1d_18 (Conv1D)          (None, 68540, 32)         5152      
                                                                 
 max_pooling1d_12 (MaxPoolin  (None, 6854, 32)         0         
 g1D)                                                            
                                                                 
 conv1d_19 (Conv1D)          (None, 6735, 64)          245824    
                                                                 
 conv1d_20 (Conv1D)          (None, 6616, 128)         983168    
                                                                 
 max_pooling1d_13 (MaxPoolin  (None, 661, 128)         0         
 g1D)                                                 

In [None]:
trainer = CNN.fit(
     X_train_padded,
     y_train,
     epochs = 1,
     batch_size = 10,
     validation_data = (X_test_padded,y_test),  
     verbose = 1)

: 

In [None]:
def metrics(plot_name:str):

    #Plot loss
    plt.figure(figsize = (10,10))
    plt.subplot(221)
    plt.title('Loss')
    plt.plot(trainer.history['loss'], label='train')
    plt.plot(trainer.history['val_loss'], label='test')
    plt.legend()
    #Plot accuracy during training
    plt.subplot(222)
    plt.title('Accuracy')
    plt.plot(trainer.history['binary_accuracy'], label='train')
    plt.plot(trainer.history['val_binary_accuracy'], label='test')
    plt.legend()
    #Plot Precison
    plt.subplot(223)
    plt.title('Precison')
    plt.plot(trainer.history['precision_1'], label='train')
    plt.plot(trainer.history['val_precision_1'], label='test')
    plt.legend()
    #Plot Recall
    plt.subplot(224)
    plt.title('Recall')
    plt.plot(trainer.history['recall_1'], label='train')
    plt.plot(trainer.history['val_recall_1'], label='test')
    plt.legend()
    plt.savefig(plot_name)

In [None]:
y_hat = (CNN.predict(X_test_padded) > 0.5).astype("float")


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [None]:
cm = confusion_matrix(y_test, y_hat)  

In [None]:
plt.figure(figsize = (6,6))
sns.heatmap(cm, annot = True, fmt = ".5g", cmap = "coolwarm",linewidths=.5)
plt.ylabel('Actal Values',fontsize = 14)
plt.xlabel('Predicted Values')
plt.savefig("pics/identifier_cm.png")


In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
tn, fp, fn, tp

$ Precision = \frac{tp}{tp+fp} $

In [None]:
precision = tp/(tp+fp)
print(f"Precision: {np.round(precision,2)}")

$ Recall = \frac{tp}{tp+fn} $

In [None]:
recall = tp/(tp+fn)
print(f"Recall: {np.round(recall,2)}")

$ F1-Score = 2\frac{precision*recall}{precision+recall} $

In [None]:
f1_score = np.round(2*((precision*recall)/(precision+recall)),2)
f1_score

## Classificador

Os genes de resistência identificador vão passar por uma classificação

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from Bio import SeqIO

In [None]:
def label_encoder(array,integers):
    encoder_obj = LabelEncoder()
    encoder_obj.fit(integers)
    encoded_proteins = []
    for n in range(array.shape[0]):
        encoded_proteins.append(list(encoder_obj.transform(array[n][0].split(" "))))
    return encoded_proteins


### Classes

#### Classificador sem alterações

- Modelo com overfitting
- 33 Classes
- 1 Camada convolucional

In [None]:
pos_proteins_seq =  [str(info.seq) for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta")]
pos_proteins_seq = list(map(" ".join,pos_proteins_seq))
mechanisms = [info.description.split("|")[5] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta")]
classes = [info.description.split("|")[3] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta")]
classes_to_fit = list(set(classes))
print(sorted(classes_to_fit))
X_train, X_test, y_train, y_test = train_test_split(pos_proteins_seq,classes, random_state = 42)
X_train, X_test, y_train, y_test = (
    np.array(X_train).reshape(-1,1),
    np.array(X_test).reshape(-1,1),
    np.array(y_train).reshape(-1,1),
    np.array(y_test).reshape(-1,1)
)
longest_protein = sorted(list(map(len,list(X_train.ravel()))))[-1]
print(longest_protein)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
aminoacids = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
X_train_encoded = label_encoder(X_train,aminoacids)
X_train_padded = pad_sequences(X_train_encoded, maxlen = longest_protein, padding='post', value = 27) #sets the same lenght for all vector
X_train_padded  =  np.asarray(X_train_padded).astype('float32')
X_test_encoded = label_encoder(X_test,aminoacids)
X_test_padded = pad_sequences(X_test_encoded, maxlen = longest_protein, padding='post', value = 27) #sets the same lenght for all vector
print(X_train_padded.shape, X_test_padded.shape, y_train.shape, y_test.shape)
encoder = LabelEncoder()
encoder.fit(classes_to_fit)
y_train_encodded = encoder.transform(y_train)
y_train_encodded = y_train_encodded.reshape(-1,1)
y_test_encodded = encoder.transform(y_test)
y_test_encodded = y_test_encodded.reshape(-1,1)
classificador = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim = 26+2,output_dim = 8,input_length=longest_protein),
    tf.keras.layers.Conv1D(filters = 32,kernel_size = 12),
    tf.keras.layers.MaxPooling1D(pool_size = 4),
    tf.keras.layers.Flatten(),
    #tf.keras.layers.Dropout(0.42),
    tf.keras.layers.Dense(len(classes_to_fit),activation = "softmax")
    ])
classificador.compile(
    optimizer = tf.optimizers.Adam(),
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics = [
        tf.keras.metrics.SparseCategoricalAccuracy(),
    ]
)
print(classificador.summary())
classes_models_trainer = classificador.fit(
    X_train_padded,
    y_train_encodded,
    validation_data = (X_test_padded,y_test_encodded),
    epochs = 40,
    verbose = 0)


In [None]:
def translate_predicted(array,all_classes):
    tranlated = []
    for i in array:
        tranlated.append((all_classes[np.where(i == np.max(i))[0][0]]))
    return tranlated
Y_hat_classes = classificador.predict(X_test_padded)
Y_hat_classes_tran = translate_predicted(Y_hat_classes, classes_to_fit)


In [None]:
len(set(y_test.ravel()))

In [None]:
len(set(y_train.ravel()))

In [None]:
set(y_train.ravel()).difference(y_test.ravel())

In [None]:
cm = confusion_matrix(list(y_test.ravel()),Y_hat_classes_tran)
cm_df = pd.DataFrame(cm,index = classes_to_fit,columns = classes_to_fit)

In [None]:
plt.figure()
pd.Series(mechanisms).value_counts().plot.bar(color = ["orange","green","blue","red","purple"])
plt.xticks(rotation = 30, ha = "right", fontsize = 14)
plt.yticks(fontsize = 16)
#plt.savefig("pics/mech_bar.png",dpi = 400)

In [None]:
plt.figure()
pd.get_dummies(pd.DataFrame({" ": mechanisms}, index = classes)).groupby(level = 0).sum().plot.barh(stacked = True,figsize=(18,12)).legend(bbox_to_anchor=(.42, 1),fontsize = 16).set_title("Mechanisms")
plt.title("Classes distribution", fontsize = 20)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
#plt.savefig("pics/classes_dist.png", dpi = 400)

In [None]:
import matplotlib.pyplot as plt
#Plot loss
plt.figure(figsize = (8,12))
plt.subplot(211)
plt.plot(classes_models_trainer.history['loss'], label='train')
plt.plot(classes_models_trainer.history['val_loss'], label='test')
plt.title('Loss', fontsize = 24)
plt.xticks(fontsize = 18)
plt.xlabel("epochs",fontsize = 15)
plt.yticks(fontsize = 18)
plt.ylim(0,.6)
plt.legend(fontsize = 18)
#Plot accuracy during training
plt.subplot(212)
plt.title('Accuracy',fontsize = 24)
plt.plot(classes_models_trainer.history['sparse_categorical_accuracy'], label='train')
plt.plot(classes_models_trainer.history['val_sparse_categorical_accuracy'], label='test')
plt.xlabel("epochs",fontsize = 15)
plt.xticks(fontsize = 18)
plt.yticks(fontsize = 18)
plt.ylim(.75,1)
plt.legend(fontsize = 18)
plt.tight_layout()
#plt.savefig("classes_metrics_14.04.2022.png")


#### Classificador - Funcionando

- Número de classes reduzido para 15
- Sem overfitting
- 4 camadas convolucionais

In [6]:
classes = [info.description.split("|")[3] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta")]
mechanisms = [info.description.split("|")[5] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta")]
main_classes = pd.Series(classes).value_counts()[:14].sort_values(ascending = False)
pos_proteins_seq =  [str(info.seq) for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta") if info.description.split("|")[3] in main_classes]
pos_proteins_seq = list(map(" ".join,pos_proteins_seq))
classes = [info.description.split("|")[3] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta") if info.description.split("|")[3] in main_classes]
mechanisms = [info.description.split("|")[5] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta") if info.description.split("|")[3] in main_classes]
classes_to_fit = list(set(classes))
print(sorted(classes_to_fit))

X_train, X_test, y_train, y_test = train_test_split(pos_proteins_seq,classes, random_state = 42)
X_train, X_test, y_train, y_test = (
    np.array(X_train).reshape(-1,1),
    np.array(X_test).reshape(-1,1),
    np.array(y_train).reshape(-1,1),
    np.array(y_test).reshape(-1,1)
)

longest_protein = sorted(list(map(len,list(X_train.ravel()))))[-1]
print(longest_protein)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
aminoacids = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
X_train_encoded = label_encoder(X_train,aminoacids)
X_train_padded = pad_sequences(X_train_encoded, maxlen = longest_protein, padding='post', value = 27) #sets the same lenght for all vector
X_train_padded  =  np.asarray(X_train_padded).astype('float32')
X_test_encoded = label_encoder(X_test,aminoacids)
X_test_padded = pad_sequences(X_test_encoded, maxlen = longest_protein, padding='post', value = 27) #sets the same lenght for all vector
print(X_train_padded.shape, X_test_padded.shape, y_train.shape, y_test.shape)

encoder = LabelEncoder()
encoder.fit(classes_to_fit)
y_train_encodded = encoder.transform(y_train)
y_train_encodded = y_train_encodded.reshape(-1,1)
y_test_encodded = encoder.transform(y_test)
y_test_encodded = y_test_encodded.reshape(-1,1)
classificador_train = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim = 26+2,output_dim = 8,input_length=longest_protein),
    tf.keras.layers.Conv1D(filters = 64,kernel_size = 24),
    tf.keras.layers.MaxPooling1D(pool_size = 8),
    tf.keras.layers.Dropout(0.42),
    tf.keras.layers.Conv1D(filters = 32,kernel_size = 12),
    tf.keras.layers.MaxPooling1D(pool_size = 4),
    tf.keras.layers.Dropout(0.42),
    tf.keras.layers.Conv1D(filters = 16,kernel_size = 6),
    tf.keras.layers.MaxPooling1D(pool_size = 2),
    tf.keras.layers.Dropout(0.42),
    tf.keras.layers.Conv1D(filters = 8,kernel_size = 4),
    tf.keras.layers.MaxPooling1D(pool_size = 1),   
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.42),
    tf.keras.layers.Dense(len(classes_to_fit),activation = "softmax")
    ])
classificador_train.compile(
    optimizer = tf.optimizers.Adam(),
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics = [
        tf.keras.metrics.SparseCategoricalAccuracy(),
    ]
)
print(classificador_train.summary())
classes_models_trainer = classificador_train.fit(
    X_train_padded,
    y_train_encodded,
    validation_data = (X_test_padded,y_test_encodded),
    epochs = 5,
    verbose = 0)

['aminoglycoside', 'bacitracin', 'beta_lactam', 'chloramphenicol', 'fosfomycin', 'glycopeptide', 'macrolide-lincosamide-streptogramin', 'multidrug', 'polymyxin', 'quinolone', 'rifampin', 'sulfonamide', 'tetracycline', 'trimethoprim']
3151
(12797, 1) (4266, 1) (12797, 1) (4266, 1)


NameError: name 'label_encoder' is not defined

In [None]:
#Plot loss
plt.figure(figsize = (8,12))
plt.subplot(211)
plt.plot(classes_models_trainer.history['loss'], label='train')
plt.plot(classes_models_trainer.history['val_loss'], label='test')
plt.title('Loss', fontsize = 24)
plt.xticks(fontsize = 18)
plt.xlabel("epochs",fontsize = 15)
plt.yticks(fontsize = 18)
plt.ylim(0,.6)
plt.legend(fontsize = 18)
#Plot accuracy during training
plt.subplot(212)
plt.title('Accuracy',fontsize = 24)
plt.plot(classes_models_trainer.history['sparse_categorical_accuracy'], label='train')
plt.plot(classes_models_trainer.history['val_sparse_categorical_accuracy'], label='test')
plt.xlabel("epochs",fontsize = 15)
plt.xticks(fontsize = 18)
plt.yticks(fontsize = 18)
plt.ylim(.75,1)
plt.legend(fontsize = 18)
plt.tight_layout()
#plt.savefig("Main_classes_metrics_18.04.2022.png")


In [None]:
list(encoder.classes_)

In [None]:
sorted(classes_to_fit)

In [None]:
def translate_predicted(array,all_classes):
    tranlated = []
    for i in array:
        tranlated.append((all_classes[np.where(i == np.max(i))[0][0]]))
    return tranlated
    
Y_hat_classes_tran = translate_predicted(Y_hat_classes, list(encoder.classes_))
cm = confusion_matrix(list(y_test.ravel()),Y_hat_classes_tran)
cm_df = pd.DataFrame(cm,index = list(encoder.classes_),columns = list(encoder.classes_))

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(cm_df, annot=True, fmt = ".4g", linewidths=.5, cmap="coolwarm")
plt.title('Confusion Matrix - Classes',fontsize = 28)
plt.ylabel('Actual Values',fontsize = 18)
plt.xlabel('Predicted Values',fontsize = 18)
plt.yticks(fontsize = 16)
plt.xticks(fontsize = 16)
#plt.savefig("pics/cm_classes_main_classes.png", dpi = 300)
plt.show()

In [None]:
plt.figure()
pd.get_dummies(pd.DataFrame({" ": mechanisms}, index = classes)).groupby(level = 0).sum().plot.barh(stacked = True,figsize=(18,12)).legend(fontsize = 16).set_title("Mechanisms")
plt.title("Classes distribution", fontsize = 20)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.savefig("pics/main_classes_dist.png", dpi = 400)

In [None]:
plt.figure()
pd.Series(mechanisms).value_counts().plot.bar(color = ["orange","green","blue","red","purple"])
plt.xticks(rotation = 30, ha = "right", fontsize = 14)
plt.yticks(fontsize = 16)
plt.savefig("pics/main_classes_mech_bar.png",dpi = 400)

##### Production

In [None]:
classes = [info.description.split("|")[3] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta")]
mechanisms = [info.description.split("|")[5] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta")]
main_classes = pd.Series(classes).value_counts()[:14].sort_values(ascending = False)
pos_proteins_seq =  [str(info.seq) for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta") if info.description.split("|")[3] in main_classes]
pos_proteins_seq = list(map(" ".join,pos_proteins_seq))
classes = [info.description.split("|")[3] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta") if info.description.split("|")[3] in main_classes]
mechanisms = [info.description.split("|")[5] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta") if info.description.split("|")[3] in main_classes]
classes_to_fit = list(set(classes))
print(sorted(classes_to_fit))
X = np.array(pos_proteins_seq).reshape(-1,1)
y = np.array(classes).reshape(-1,1)
longest_protein = sorted(list(map(len,list(X.ravel()))))[-1]
print(longest_protein)
aminoacids = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
X_encoded = label_encoder(X,aminoacids)
X_padded  = pad_sequences(X_encoded, maxlen = longest_protein, padding='post', value = 27) #sets the same lenght for all vector


In [None]:
X_padded  =  np.asarray(X_padded).astype('float32')
print(X_train_padded.shape, X_test_padded.shape, y_train.shape, y_test.shape)
encoder = LabelEncoder()
encoder.fit(classes_to_fit)
y_encodded = encoder.transform(y)
y_encodded = y_encodded.reshape(-1,1)

classificador = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim = 26+2,output_dim = 8,input_length=longest_protein),
    tf.keras.layers.Conv1D(filters = 64,kernel_size = 24),
    tf.keras.layers.MaxPooling1D(pool_size = 8),
    tf.keras.layers.Dropout(0.42),
    tf.keras.layers.Conv1D(filters = 32,kernel_size = 12),
    tf.keras.layers.MaxPooling1D(pool_size = 4),
    tf.keras.layers.Dropout(0.42),
    tf.keras.layers.Conv1D(filters = 16,kernel_size = 6),
    tf.keras.layers.MaxPooling1D(pool_size = 2),
    tf.keras.layers.Dropout(0.42),
    tf.keras.layers.Conv1D(filters = 8,kernel_size = 4),
    tf.keras.layers.MaxPooling1D(pool_size = 1),   
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.42),
    tf.keras.layers.Dense(len(classes_to_fit),activation = "softmax")
    ])
classificador.compile(
    optimizer = tf.optimizers.Adam(),
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics = [
        tf.keras.metrics.SparseCategoricalAccuracy(),
    ]
)
print(classificador.summary())
classes_models_trainer = classificador.fit(
    X_padded,
    y_encodded,
    validation_data = (X_test_padded,y_test_encodded),
    epochs = 40,
    verbose = 0)

In [None]:
classificador.save("trained_models/classifier_production")

#### Classificador Overbalanced

- Balanceamentos das classes

In [None]:
classes = [info.description.split("|")[3] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta")]
mechanisms = [info.description.split("|")[5] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta")]
main_classes = pd.Series(classes).value_counts()[:14].sort_values(ascending = False)
pos_proteins_seq =  [str(info.seq) for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta") if info.description.split("|")[3] in main_classes]
pos_proteins_seq = list(map(" ".join,pos_proteins_seq))
classes = [info.description.split("|")[3] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta") if info.description.split("|")[3] in main_classes]
mechanisms = [info.description.split("|")[5] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta") if info.description.split("|")[3] in main_classes]
classes_to_fit = list(set(classes))
print(sorted(classes_to_fit))
X_train, X_test, y_train, y_test = train_test_split(pos_proteins_seq,classes, random_state = 42)
X_train, X_test, y_train, y_test = (
    np.array(X_train).reshape(-1,1),
    np.array(X_test).reshape(-1,1),
    np.array(y_train).reshape(-1,1),
    np.array(y_test).reshape(-1,1)
)
longest_protein = sorted(list(map(len,list(X_train.ravel()))))[-1]
print(longest_protein)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
aminoacids = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
X_train_encoded = label_encoder(X_train,aminoacids)
X_train_padded = pad_sequences(X_train_encoded, maxlen = longest_protein, padding='post', value = 27) #sets the same lenght for all vector
X_train_padded  =  np.asarray(X_train_padded).astype('float32')
X_test_encoded = label_encoder(X_test,aminoacids)
X_test_padded = pad_sequences(X_test_encoded, maxlen = longest_protein, padding='post', value = 27) #sets the same lenght for all vector
print(X_train_padded.shape, X_test_padded.shape, y_train.shape, y_test.shape)
encoder = LabelEncoder()
encoder.fit(classes_to_fit)
y_train_encodded = encoder.transform(y_train)
y_train_encodded = y_train_encodded.reshape(-1,1)
y_test_encodded = encoder.transform(y_test)
y_test_encodded = y_test_encodded.reshape(-1,1)

In [None]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE


In [None]:
#before balance
print(sorted(Counter(y_train_encodded.ravel()).items()))


In [None]:
#Oversampling
oversampler = RandomOverSampler(random_state = 0)
X_train_padded, y_train_encodded =  SMOTE().fit_resample(X_train_padded,y_train_encodded)

In [None]:
#After oversampling
print(sorted(Counter(y_train_encodded.ravel()).items()))

In [None]:
classificador = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim = 26+2,output_dim = 8,input_length=longest_protein),
    tf.keras.layers.Conv1D(filters = 32,kernel_size = 12),
    tf.keras.layers.MaxPooling1D(pool_size = 4),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.42),
    tf.keras.layers.Dense(len(classes_to_fit),activation = "softmax")
    ])
classificador.compile(
    optimizer = tf.optimizers.Adam(),
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics = [
        tf.keras.metrics.SparseCategoricalAccuracy(),
    ]
)
print(classificador.summary())
classes_models_trainer = classificador.fit(
    X_train_padded,
    y_train_encodded,
    validation_data = (X_test_padded,y_test_encodded),
    epochs = 40,
    verbose = 0)

In [None]:
#Plot loss
plt.figure(figsize = (8,12))
plt.subplot(211)
plt.plot(classes_models_trainer.history['loss'], label='train')
plt.plot(classes_models_trainer.history['val_loss'], label='test')
plt.title('Loss', fontsize = 24)
plt.xticks(fontsize = 18)
plt.xlabel("epochs",fontsize = 15)
plt.yticks(fontsize = 18)
plt.ylim(0,.6)
plt.legend(fontsize = 18)
#Plot accuracy during training
plt.subplot(212)
plt.title('Accuracy',fontsize = 24)
plt.plot(classes_models_trainer.history['sparse_categorical_accuracy'], label='train')
plt.plot(classes_models_trainer.history['val_sparse_categorical_accuracy'], label='test')
plt.xlabel("epochs",fontsize = 15)
plt.xticks(fontsize = 18)
plt.yticks(fontsize = 18)
plt.ylim(.75,1)
plt.legend(fontsize = 18)
plt.tight_layout()
#plt.savefig("Main_classes_metrics_18.04.2022.png")


In [None]:
Y_hat_classes = classificador.predict(X_test_padded)
Y_hat_classes_tran = translate_predicted(Y_hat_classes, classes_to_fit)
cm = confusion_matrix(list(y_test.ravel()),Y_hat_classes_tran)
cm_df = pd.DataFrame(cm,index = classes_to_fit,columns = classes_to_fit)

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(cm_df, annot=True, fmt = ".4g", linewidths=.5, cmap="coolwarm")
plt.title('Confusion Matrix - Classes',fontsize = 28)
plt.ylabel('Actual Values',fontsize = 18)
plt.xlabel('Predicted Values',fontsize = 18)
plt.yticks(fontsize = 16)
plt.xticks(fontsize = 16)
#plt.savefig("pics/cm_classes_main_classes.png", dpi = 300)
plt.show()

### Mecanismos

In [None]:
pos_proteins_seq =  [str(info.seq) for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta")]
pos_proteins_seq = list(map(" ".join,pos_proteins_seq))
mechanisms = [info.description.split("|")[5] for info in SeqIO.parse("datasets/hmd/arg_v5.fasta", "fasta")]
mechanisms_to_fit = list(set(mechanisms))
print(sorted(mechanisms_to_fit))
X_train, X_test, y_train, y_test = train_test_split(pos_proteins_seq,mechanisms, random_state = 42)
X_train, X_test, y_train, y_test = (
    np.array(X_train).reshape(-1,1),
    np.array(X_test).reshape(-1,1),
    np.array(y_train).reshape(-1,1),
    np.array(y_test).reshape(-1,1)
)
longest_protein = sorted(list(map(len,list(X_train.ravel()))))[-1]
print(longest_protein)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
aminoacids = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
X_train_encoded = label_encoder(X_train,aminoacids)
X_train_padded = pad_sequences(X_train_encoded, maxlen = longest_protein, padding='post', value = 27) #sets the same lenght for all vector
X_train_padded  =  np.asarray(X_train_padded).astype('float32')
X_test_encoded = label_encoder(X_test,aminoacids)
X_test_padded = pad_sequences(X_test_encoded, maxlen = longest_protein, padding='post', value = 27) #sets the same lenght for all vector
print(X_train_padded.shape, X_test_padded.shape, y_train.shape, y_test.shape)
encoder = LabelEncoder()
encoder.fit(mechanisms_to_fit)
y_train_encodded = encoder.transform(y_train)
y_train_encodded = y_train_encodded.reshape(-1,1)
y_test_encodded = encoder.transform(y_test)
y_test_encodded = y_test_encodded.reshape(-1,1)
classificador = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim = 26+2,output_dim = 8,input_length=longest_protein),
    tf.keras.layers.Conv1D(filters = 32,kernel_size = 12),
    tf.keras.layers.MaxPooling1D(pool_size = 4),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.42),
    tf.keras.layers.Dense(len(mechanisms_to_fit),activation = "softmax")
    ])
classificador.compile(
    optimizer = tf.optimizers.Adam(),
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics = [
        tf.keras.metrics.SparseCategoricalAccuracy(),
    ]
)
print(classificador.summary())
mech_models_trainer = classificador.fit(
    X_train_padded,
    y_train_encodded,
    validation_data = (X_test_padded,y_test_encodded),
    epochs = 40,
    verbose = 0)


In [None]:
def translate_predicted(array,all_classes):
    tranlated = []
    for i in array:
        tranlated.append((all_classes[np.where(i == np.max(i))[0][0]]))
    return tranlated
Y_hat_mech = classificador.predict(X_test_padded)
Y_hat_mech_tran = translate_predicted(Y_hat_mech, mechanisms_to_fit)
cm = confusion_matrix(list(y_test.ravel()),Y_hat_mech_tran)
cm_df = pd.DataFrame(cm,index = mechanisms_to_fit,columns = mechanisms_to_fit)

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(cm_df, annot=True, fmt = ".4g", linewidths=.5, cmap="coolwarm")
plt.title('Confusion Matrix - Mechanisms',fontsize = 28)
plt.ylabel('Actual Values',fontsize = 18)
plt.xlabel('Predicted Values',fontsize = 18)
plt.yticks(fontsize = 16)
plt.xticks(fontsize = 16)
plt.savefig("pics/cm_mechanisms.png", dpi = 300)
plt.show()

In [None]:
import matplotlib.pyplot as plt
#Plot loss
plt.figure(figsize = (8,12))
plt.subplot(211)
plt.plot(mech_models_trainer.history['loss'], label='train')
plt.plot(mech_models_trainer.history['val_loss'], label='test')
plt.title('Loss', fontsize = 24)
plt.xticks(fontsize = 18)
plt.xlabel("epochs",fontsize = 15)
plt.yticks(fontsize = 18)
plt.ylim(0,.6)
plt.legend(fontsize = 18)
#Plot accuracy during training
plt.subplot(212)
plt.title('Accuracy',fontsize = 24)
plt.plot(mech_models_trainer.history['sparse_categorical_accuracy'], label='train')
plt.plot(mech_models_trainer.history['val_sparse_categorical_accuracy'], label='test')
plt.xlabel("epochs",fontsize = 15)
plt.yticks(fontsize = 18)
plt.ylim(.75,1)
plt.legend(fontsize = 18)
plt.tight_layout()
plt.savefig("mechamisms_metrics_14.04.2022.png")


In [None]:
classificador.save("trained_models/model_mechanisms.25.03.2020")

In [10]:
import Data

In [11]:
my_data = Data.Data(
    negative_dataset = "datasets/uniprot/negative.db.fasta",
    positive_dataset = "datasets/hmd/arg_v5.fasta")

In [12]:
neg_proteins_seq, neg_proteins_tags,pos_proteins_seq,pos_proteins_tags = my_data.load_data()

In [13]:
#my_data.get_longest_protein()

In [14]:
sub_set_data = Data.Subset_data(
    negative_dataset = "datasets/uniprot/negative.db.fasta",
    positive_dataset = "datasets/hmd/arg_v5.fasta"
    )

In [15]:
a,c,b,d  = sub_set_data.load_data()

In [16]:
len(a)

6647

In [21]:
print(a.dtype)

<U70425


In [22]:
import numpy as np

arr = np.array(['apple', 'banana', 'cherry'])

print(arr.dtype)

<U6
