### requirements for the following codings


In [1]:
### packages required 
# !pip install fair-esm 
# !pip install torch
# !pip install tensorflow
# !pip install sklearn

### peptide embeddings with esm2_t6_8M_UR50D pretrained models
6 layers, 8M parameters, dataset: UR50/D 2021_04, embedding dimension: 320
mode download URL: https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t6_8M_UR50D.pt

In [2]:
def esm_embeddings(peptide_sequence_list):
  # NOTICE: ESM for embeddings is quite RAM usage, if your sequence is too long, 
  #         or you have too many sequences for transformation in a single converting, 
  #         you conputer might automatically kill the job.
  import torch
  import esm
  import collections
  # load the model
  # NOTICE: if the model was not downloaded in your local environment, it will automatically download it.
  model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
  batch_converter = alphabet.get_batch_converter()
  model.eval()  # disables dropout for deterministic results

  # load the peptide sequence list into the bach_converter
  batch_labels, batch_strs, batch_tokens = batch_converter(peptide_sequence_list)
  batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)
  ## batch tokens are the embedding results of the whole data set

  # Extract per-residue representations (on CPU)
  with torch.no_grad():
      # Here we export the last layer of the EMS model output as the representation of the peptides
      # model'esm2_t6_8M_UR50D' only has 6 layers, and therefore repr_layers parameters is equal to 6
      results = model(batch_tokens, repr_layers=[6], return_contacts=True)  
  token_representations = results["representations"][6]

  # Generate per-sequence representations via averaging
  # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1.
  sequence_representations = []
  for i, tokens_len in enumerate(batch_lens):
      sequence_representations.append(token_representations[i, 1 : tokens_len - 1].mean(0))
  # save dataset
  # sequence_representations is a list and each element is a tensor
  embeddings_results = collections.defaultdict(list)
  for i in range(len(sequence_representations)):
      # tensor can be transformed as numpy sequence_representations[0].numpy() or sequence_representations[0].to_list
      each_seq_rep = sequence_representations[i].tolist()
      for each_element in each_seq_rep:
          embeddings_results[i].append(each_element)
  embeddings_results = pd.DataFrame(embeddings_results).T
  return embeddings_results

### data loading and embeddings

In [3]:
import numpy as np
import pandas as pd

In [5]:
# training dataset loading
dataset = pd.read_excel('APP_train.xlsx',na_filter = False) # take care the NA sequence problem
sequence_list = dataset['sequence'] 

embeddings_results = pd.DataFrame()
for seq in sequence_list:
    format_seq = [seq,seq] # the setting is just following the input format setting in ESM model, [name,sequence]
    tuple_sequence = tuple(format_seq)
    peptide_sequence_list = []
    peptide_sequence_list.append(tuple_sequence) # build a summarize list variable including all the sequence information
    # employ ESM model for converting and save the converted data in csv format
    one_seq_embeddings = esm_embeddings(peptide_sequence_list)
    embeddings_results= pd.concat([embeddings_results,one_seq_embeddings])

embeddings_results.to_csv('APP_train_esm2_t6_8M_UR50D_unified_320_dimension.csv')

# loading the y dataset for model development 
y_train = dataset['label']
y_train = np.array(y_train) # transformed as np.array for CNN model

Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t6_8M_UR50D.pt" to C:\Users\harel/.cache\torch\hub\checkpoints\esm2_t6_8M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t6_8M_UR50D-contact-regression.pt" to C:\Users\harel/.cache\torch\hub\checkpoints\esm2_t6_8M_UR50D-contact-regression.pt


In [6]:
# test dataset loading
dataset = pd.read_excel('APP_test.xlsx',na_filter = False) # take care the NA sequence problem
sequence_list = dataset['sequence'] 
embeddings_results = pd.DataFrame()
# embedding all the peptide one by one
for seq in sequence_list:
    format_seq = [seq,seq] # the setting is just following the input format setting in ESM model, [name,sequence]
    tuple_sequence = tuple(format_seq)
    peptide_sequence_list = []
    peptide_sequence_list.append(tuple_sequence) # build a summarize list variable including all the sequence information
    # employ ESM model for converting and save the converted data in csv format
    one_seq_embeddings = esm_embeddings(peptide_sequence_list)
    embeddings_results= pd.concat([embeddings_results,one_seq_embeddings])

embeddings_results.to_csv('APP_test_esm2_t6_8M_UR50D_unified_320_dimension.csv')


# loading the y dataset for model development 
y_test = dataset['label']
y_test = np.array(y_test) # transformed as np.array for CNN model

In [7]:
# assign the dataset 
X_train_data_name = 'APP_train_esm2_t6_8M_UR50D_unified_320_dimension.csv'
X_train_data = pd.read_csv(X_train_data_name,header=0, index_col = 0,delimiter=',')

X_test_data_name = 'APP_test_esm2_t6_8M_UR50D_unified_320_dimension.csv'
X_test_data = pd.read_csv(X_test_data_name,header=0, index_col = 0,delimiter=',')

X_train = np.array(X_train_data)
X_test = np.array(X_test_data)

# normalize the X data range
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train) # normalize X to 0-1 range 
X_test = scaler.transform(X_test)

In [8]:
# check the dimension of the dataset before model development
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2118, 320)
(92, 320)
(2118,)
(92,)


In [28]:
# training dataset loading
dataset = pd.read_excel('APP_train.xlsx',na_filter = False) # take care the NA sequence problem
sequence_list = dataset['sequence'] 

# loading the y dataset for model development 
y_train = dataset['label']
y_train = np.array(y_train) # transformed as np.array for CNN model
# test dataset loading
dataset = pd.read_excel('APP_test.xlsx',na_filter = False) # take care the NA sequence problem
sequence_list = dataset['sequence'] 
# loading the y dataset for model development 
y_test = dataset['label']
y_test = np.array(y_test) # transformed as np.array for CNN model
# assign the dataset 
X_train_data_name = 'APP_train_esm2_t6_8M_UR50D_unified_320_dimension.csv'
X_train_data = pd.read_csv(X_train_data_name,header=0, index_col = 0,delimiter=',')

X_test_data_name = 'APP_test_esm2_t6_8M_UR50D_unified_320_dimension.csv'
X_test_data = pd.read_csv(X_test_data_name,header=0, index_col = 0,delimiter=',')

X_train = np.array(X_train_data)
X_test = np.array(X_test_data)

# normalize the X data range
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train) # normalize X to 0-1 range 
X_test = scaler.transform(X_test)
# check the dimension of the dataset before model development
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2118, 320)
(92, 320)
(2118,)
(92,)


In [37]:
# Shuffle y_train
y_train = np.random.permutation(y_train)
# Shuffle y_test
y_test = np.random.permutation(y_test)


In [38]:
y_train

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

### dataset undersampling

NearMiss-1 selects examples from the majority class that have the smallest average distance to the three closest examples from the minority class. NearMiss-2 selects examples from the majority class that have the smallest average distance to the three furthest examples from the minority class. NearMiss-3 involves selecting a given number of majority class examples for each example in the minority class that are closest.

#### NearMiss

In [39]:
# !pip install imbalanced-learn

In [40]:
# NearMiss-1
from imblearn.under_sampling import NearMiss
# define the undersampling method
undersample = NearMiss(sampling_strategy= 1, version=1, n_neighbors=3)
# transform the dataset
X_train, y_train = undersample.fit_resample(X_train, y_train)

In [41]:
# NearMiss-2
from imblearn.under_sampling import NearMiss
# define the undersampling method
undersample = NearMiss(sampling_strategy= 1, version=2, n_neighbors=3)
# transform the dataset
X_train, y_train = undersample.fit_resample(X_train, y_train)

In [42]:
# NearMiss-3
from imblearn.under_sampling import NearMiss
# define the undersampling method
undersample = NearMiss(sampling_strategy= 1, version=3, n_neighbors=3)
# transform the dataset
X_train, y_train = undersample.fit_resample(X_train, y_train)



In [43]:
# check the dimension of the dataset before model development
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(426, 320)
(92, 320)
(426,)
(92,)


#### Random select

In [44]:
from imblearn.over_sampling import RandomOverSampler
# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy=1,random_state = 0)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [45]:
from imblearn.under_sampling import RandomUnderSampler
# define undersampling strategy
undersample = RandomUnderSampler(sampling_strategy='majority',random_state = 1)
X_train, y_train = undersample.fit_resample(X_train, y_train)

In [46]:
# check the dimension of the dataset before model development
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(np.count_nonzero(y_train == 0))
print(np.count_nonzero(y_train == 1))

(510, 320)
(92, 320)
(510,)
(92,)
255
255


#### Cluster Centroid

In [47]:
from imblearn.under_sampling import ClusterCentroids
# define ClusterCentroids strategy
CC = ClusterCentroids(sampling_strategy=1, random_state = 3)
X_train, y_train = CC.fit_resample(X_train, y_train)

  super()._check_params_vs_input(X, default_n_init=10)
  return fit_method(estimator, *args, **kwargs)


In [48]:
# check the dimension of the dataset before model development
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(np.count_nonzero(y_train == 0))
print(np.count_nonzero(y_train == 1))

(510, 320)
(92, 320)
(510,)
(92,)
255
255


### Model architecture

In [49]:
def ESM_CNN(X_train, y_train, X_test, y_test):
  from keras.layers import Input,InputLayer, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D,Conv1D
  from keras.layers import AveragePooling2D, MaxPooling2D, Dropout, AveragePooling1D, MaxPooling1D
  from keras.models import Sequential,Model
  from keras.optimizers import SGD
  from keras.callbacks import ModelCheckpoint,LearningRateScheduler, EarlyStopping
  import keras
  from keras import backend as K
  inputShape=(320,1)
  input = Input(inputShape)
  x = Conv1D(128,(3),strides = (1),name='layer_conv1',padding='same')(input)
  x = BatchNormalization()(x)
  x = Activation('relu')(x)
  x = MaxPooling1D((2), name='MaxPool1',padding="same")(x)
  x = Dropout(0.15)(x)
  x = Conv1D(32,(3),strides = (1),name='layer_conv2',padding='same')(input)
  x = BatchNormalization()(x)
  x = Activation('relu')(x)
  x = MaxPooling1D((2), name='MaxPool2',padding="same")(x)
  x = Dropout(0.15)(x)
  x = Flatten()(x)
  x = Dense(64,activation = 'relu',name='fc1')(x)
  x = Dropout(0.15)(x)
  x = Dense(2,activation = 'softmax',name='fc2')(x)
  model = Model(inputs = input,outputs = x,name='Predict')
  # define SGD optimizer
  momentum = 0.5
  sgd = SGD(lr=0.01, momentum=momentum, decay=0.0, nesterov=False)
  # compile the model
  model.compile(loss='sparse_categorical_crossentropy',optimizer=sgd, metrics=['accuracy'])
  # learning deccay setting
  import math
  def step_decay(epoch): # gradually decrease the learning rate 
      initial_lrate=0.1
      drop=0.6
      epochs_drop = 3.0
      lrate= initial_lrate * math.pow(drop,    # math.pow base raised to a power
            math.floor((1+epoch)/epochs_drop)) # math.floor Round numbers down to the nearest integer
      return lrate
  lrate = LearningRateScheduler(step_decay)

  # early stop setting
  early_stop = EarlyStopping(monitor='val_accuracy', patience = 40,restore_best_weights = True)

  # summary the callbacks_list
  callbacks_list = [ lrate , early_stop]
######################
  ######################
  # HAREL: test is used ass validation!!!!
  ######################
  ######################
  
  model_history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                            epochs=200,callbacks=callbacks_list,batch_size = 8, verbose=2)
  return model, model_history

### 10-fold cross validation

In [50]:
#Implementing 10-fold cross validation
from sklearn.model_selection import KFold
k = 10 
kf = KFold(n_splits=k, shuffle = True, random_state=1)
X_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(y_train)

# result collection list
ACC_collecton = []
BACC_collecton = []
Sn_collecton = []
Sp_collecton = []
MCC_collecton = []
AUC_collecton = []

for train_index , test_index in kf.split(y_train):
    X_train_CV , X_valid_CV = X_train.iloc[train_index,:],X_train.iloc[test_index,:]
    y_train_CV , y_valid_CV = y_train.iloc[train_index] , y_train.iloc[test_index]
    model, model_history = ESM_CNN(X_train_CV, y_train_CV, X_valid_CV, y_valid_CV)
    # confusion matrix 
    predicted_class= []
    predicted_protability = model.predict(X_valid_CV,batch_size=1)
    for i in range(predicted_protability.shape[0]):
      index = np.where(predicted_protability[i] == np.amax(predicted_protability[i]))[0][0]
      predicted_class.append(index)
    predicted_class = np.array(predicted_class)
    y_true = y_valid_CV    
    from sklearn.metrics import confusion_matrix
    import math
    # np.ravel() return a flatten 1D array
    TP, FP, FN, TN = confusion_matrix(y_true, predicted_class).ravel() # shape [ [True-Positive, False-positive], [False-negative, True-negative] ]
    ACC = (TP+TN)/(TP+TN+FP+FN)
    ACC_collecton.append(ACC)
    Sn_collecton.append(TP/(TP+FN))
    Sp_collecton.append(TN/(TN+FP))
    MCC = (TP*TN-FP*FN)/math.pow(((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)),0.5)
    MCC_collecton.append(MCC)
    BACC_collecton.append(0.5*TP/(TP+FN)+0.5*TN/(TN+FP))
    from sklearn.metrics import roc_auc_score
    AUC = roc_auc_score(y_valid_CV, predicted_protability[:,1])
    AUC_collecton.append(AUC)


  super().__init__(name, **kwargs)


Epoch 1/200
58/58 - 4s - loss: 0.9852 - accuracy: 0.6623 - val_loss: 0.7968 - val_accuracy: 0.4510 - lr: 0.1000 - 4s/epoch - 74ms/step
Epoch 2/200
58/58 - 1s - loss: 0.4975 - accuracy: 0.7538 - val_loss: 0.7473 - val_accuracy: 0.4510 - lr: 0.1000 - 975ms/epoch - 17ms/step
Epoch 3/200
58/58 - 1s - loss: 0.4949 - accuracy: 0.8061 - val_loss: 0.8660 - val_accuracy: 0.4510 - lr: 0.0600 - 943ms/epoch - 16ms/step
Epoch 4/200
58/58 - 1s - loss: 0.4059 - accuracy: 0.8126 - val_loss: 0.9023 - val_accuracy: 0.4510 - lr: 0.0600 - 954ms/epoch - 16ms/step
Epoch 5/200
58/58 - 1s - loss: 0.3491 - accuracy: 0.8519 - val_loss: 0.8763 - val_accuracy: 0.4510 - lr: 0.0600 - 948ms/epoch - 16ms/step
Epoch 6/200
58/58 - 1s - loss: 0.3056 - accuracy: 0.8802 - val_loss: 0.7325 - val_accuracy: 0.5490 - lr: 0.0360 - 944ms/epoch - 16ms/step
Epoch 7/200
58/58 - 1s - loss: 0.3139 - accuracy: 0.8780 - val_loss: 0.5512 - val_accuracy: 0.7059 - lr: 0.0360 - 943ms/epoch - 16ms/step
Epoch 8/200
58/58 - 1s - loss: 0.2924

  super().__init__(name, **kwargs)


Epoch 1/200
58/58 - 4s - loss: 1.3580 - accuracy: 0.6797 - val_loss: 0.6681 - val_accuracy: 0.6471 - lr: 0.1000 - 4s/epoch - 69ms/step
Epoch 2/200
58/58 - 1s - loss: 0.5191 - accuracy: 0.7647 - val_loss: 0.6831 - val_accuracy: 0.6471 - lr: 0.1000 - 954ms/epoch - 16ms/step
Epoch 3/200
58/58 - 1s - loss: 0.3822 - accuracy: 0.8214 - val_loss: 0.6795 - val_accuracy: 0.6667 - lr: 0.0600 - 807ms/epoch - 14ms/step
Epoch 4/200
58/58 - 1s - loss: 0.3542 - accuracy: 0.8540 - val_loss: 0.6405 - val_accuracy: 0.6471 - lr: 0.0600 - 821ms/epoch - 14ms/step
Epoch 5/200
58/58 - 1s - loss: 0.3315 - accuracy: 0.8540 - val_loss: 0.6168 - val_accuracy: 0.6667 - lr: 0.0600 - 771ms/epoch - 13ms/step
Epoch 6/200
58/58 - 1s - loss: 0.3034 - accuracy: 0.8627 - val_loss: 0.5288 - val_accuracy: 0.8824 - lr: 0.0360 - 779ms/epoch - 13ms/step
Epoch 7/200
58/58 - 1s - loss: 0.2618 - accuracy: 0.9063 - val_loss: 0.4326 - val_accuracy: 0.8824 - lr: 0.0360 - 823ms/epoch - 14ms/step
Epoch 8/200
58/58 - 1s - loss: 0.2628

  super().__init__(name, **kwargs)


Epoch 1/200
58/58 - 4s - loss: 2.2456 - accuracy: 0.7015 - val_loss: 3.0239 - val_accuracy: 0.5294 - lr: 0.1000 - 4s/epoch - 75ms/step
Epoch 2/200
58/58 - 1s - loss: 0.4358 - accuracy: 0.8279 - val_loss: 0.4923 - val_accuracy: 0.7451 - lr: 0.1000 - 930ms/epoch - 16ms/step
Epoch 3/200
58/58 - 1s - loss: 0.3783 - accuracy: 0.8540 - val_loss: 0.5550 - val_accuracy: 0.6667 - lr: 0.0600 - 949ms/epoch - 16ms/step
Epoch 4/200
58/58 - 1s - loss: 0.3208 - accuracy: 0.8606 - val_loss: 0.4256 - val_accuracy: 0.8039 - lr: 0.0600 - 997ms/epoch - 17ms/step
Epoch 5/200
58/58 - 1s - loss: 0.2921 - accuracy: 0.8845 - val_loss: 0.5126 - val_accuracy: 0.7451 - lr: 0.0600 - 933ms/epoch - 16ms/step
Epoch 6/200
58/58 - 1s - loss: 0.2882 - accuracy: 0.8954 - val_loss: 0.4046 - val_accuracy: 0.8235 - lr: 0.0360 - 876ms/epoch - 15ms/step
Epoch 7/200
58/58 - 1s - loss: 0.2532 - accuracy: 0.9063 - val_loss: 0.4349 - val_accuracy: 0.8235 - lr: 0.0360 - 837ms/epoch - 14ms/step
Epoch 8/200
58/58 - 1s - loss: 0.2340

  super().__init__(name, **kwargs)


Epoch 1/200
58/58 - 4s - loss: 1.1948 - accuracy: 0.6013 - val_loss: 0.6985 - val_accuracy: 0.5098 - lr: 0.1000 - 4s/epoch - 74ms/step
Epoch 2/200
58/58 - 1s - loss: 0.4735 - accuracy: 0.7800 - val_loss: 0.7688 - val_accuracy: 0.5098 - lr: 0.1000 - 933ms/epoch - 16ms/step
Epoch 3/200
58/58 - 1s - loss: 0.3661 - accuracy: 0.8388 - val_loss: 0.8353 - val_accuracy: 0.5098 - lr: 0.0600 - 954ms/epoch - 16ms/step
Epoch 4/200
58/58 - 1s - loss: 0.3255 - accuracy: 0.8606 - val_loss: 0.7805 - val_accuracy: 0.5098 - lr: 0.0600 - 888ms/epoch - 15ms/step
Epoch 5/200
58/58 - 1s - loss: 0.2951 - accuracy: 0.8780 - val_loss: 0.7310 - val_accuracy: 0.5490 - lr: 0.0600 - 846ms/epoch - 15ms/step
Epoch 6/200
58/58 - 1s - loss: 0.2728 - accuracy: 0.8954 - val_loss: 0.6714 - val_accuracy: 0.6471 - lr: 0.0360 - 862ms/epoch - 15ms/step
Epoch 7/200
58/58 - 1s - loss: 0.2616 - accuracy: 0.8911 - val_loss: 0.5710 - val_accuracy: 0.7255 - lr: 0.0360 - 1s/epoch - 18ms/step
Epoch 8/200
58/58 - 1s - loss: 0.2610 - 

  super().__init__(name, **kwargs)


Epoch 1/200
58/58 - 4s - loss: 1.2151 - accuracy: 0.6296 - val_loss: 0.7090 - val_accuracy: 0.4902 - lr: 0.1000 - 4s/epoch - 75ms/step
Epoch 2/200
58/58 - 1s - loss: 0.6685 - accuracy: 0.5272 - val_loss: 0.6927 - val_accuracy: 0.4902 - lr: 0.1000 - 987ms/epoch - 17ms/step
Epoch 3/200
58/58 - 1s - loss: 0.5097 - accuracy: 0.7560 - val_loss: 0.7290 - val_accuracy: 0.4902 - lr: 0.0600 - 991ms/epoch - 17ms/step
Epoch 4/200
58/58 - 1s - loss: 0.4147 - accuracy: 0.8148 - val_loss: 0.7409 - val_accuracy: 0.5294 - lr: 0.0600 - 1s/epoch - 18ms/step
Epoch 5/200
58/58 - 1s - loss: 0.3480 - accuracy: 0.8519 - val_loss: 0.6210 - val_accuracy: 0.6667 - lr: 0.0600 - 962ms/epoch - 17ms/step
Epoch 6/200
58/58 - 1s - loss: 0.3187 - accuracy: 0.8736 - val_loss: 0.5383 - val_accuracy: 0.6863 - lr: 0.0360 - 996ms/epoch - 17ms/step
Epoch 7/200
58/58 - 1s - loss: 0.3122 - accuracy: 0.8802 - val_loss: 0.4947 - val_accuracy: 0.7647 - lr: 0.0360 - 998ms/epoch - 17ms/step
Epoch 8/200
58/58 - 1s - loss: 0.2777 - 

  super().__init__(name, **kwargs)


Epoch 1/200
58/58 - 4s - loss: 1.4898 - accuracy: 0.7233 - val_loss: 0.7107 - val_accuracy: 0.4706 - lr: 0.1000 - 4s/epoch - 68ms/step
Epoch 2/200
58/58 - 1s - loss: 0.4541 - accuracy: 0.8301 - val_loss: 0.6995 - val_accuracy: 0.4706 - lr: 0.1000 - 905ms/epoch - 16ms/step
Epoch 3/200
58/58 - 1s - loss: 0.3281 - accuracy: 0.8627 - val_loss: 0.6899 - val_accuracy: 0.5294 - lr: 0.0600 - 967ms/epoch - 17ms/step
Epoch 4/200
58/58 - 1s - loss: 0.3024 - accuracy: 0.8715 - val_loss: 0.6835 - val_accuracy: 0.6078 - lr: 0.0600 - 961ms/epoch - 17ms/step
Epoch 5/200
58/58 - 1s - loss: 0.2847 - accuracy: 0.8954 - val_loss: 0.6397 - val_accuracy: 0.6667 - lr: 0.0600 - 841ms/epoch - 15ms/step
Epoch 6/200
58/58 - 1s - loss: 0.2688 - accuracy: 0.8889 - val_loss: 0.5681 - val_accuracy: 0.8039 - lr: 0.0360 - 900ms/epoch - 16ms/step
Epoch 7/200
58/58 - 1s - loss: 0.2427 - accuracy: 0.9063 - val_loss: 0.4322 - val_accuracy: 0.8824 - lr: 0.0360 - 928ms/epoch - 16ms/step
Epoch 8/200
58/58 - 1s - loss: 0.2569

  super().__init__(name, **kwargs)


58/58 - 3s - loss: 2.1676 - accuracy: 0.6405 - val_loss: 0.6542 - val_accuracy: 0.5686 - lr: 0.1000 - 3s/epoch - 44ms/step
Epoch 2/200
58/58 - 1s - loss: 0.4279 - accuracy: 0.8279 - val_loss: 0.6736 - val_accuracy: 0.6078 - lr: 0.1000 - 620ms/epoch - 11ms/step
Epoch 3/200
58/58 - 1s - loss: 0.3668 - accuracy: 0.8497 - val_loss: 0.5329 - val_accuracy: 0.8627 - lr: 0.0600 - 580ms/epoch - 10ms/step
Epoch 4/200
58/58 - 1s - loss: 0.3160 - accuracy: 0.8758 - val_loss: 0.4702 - val_accuracy: 0.8824 - lr: 0.0600 - 605ms/epoch - 10ms/step
Epoch 5/200
58/58 - 1s - loss: 0.3166 - accuracy: 0.8780 - val_loss: 0.5093 - val_accuracy: 0.7647 - lr: 0.0600 - 616ms/epoch - 11ms/step
Epoch 6/200
58/58 - 1s - loss: 0.2869 - accuracy: 0.8780 - val_loss: 0.4078 - val_accuracy: 0.8627 - lr: 0.0360 - 558ms/epoch - 10ms/step
Epoch 7/200
58/58 - 1s - loss: 0.2709 - accuracy: 0.8998 - val_loss: 0.4383 - val_accuracy: 0.8627 - lr: 0.0360 - 592ms/epoch - 10ms/step
Epoch 8/200
58/58 - 1s - loss: 0.2481 - accuracy:

  super().__init__(name, **kwargs)


Epoch 1/200
58/58 - 4s - loss: 0.9191 - accuracy: 0.5534 - val_loss: 0.7204 - val_accuracy: 0.4706 - lr: 0.1000 - 4s/epoch - 64ms/step
Epoch 2/200
58/58 - 1s - loss: 0.5896 - accuracy: 0.6972 - val_loss: 0.7565 - val_accuracy: 0.4706 - lr: 0.1000 - 976ms/epoch - 17ms/step
Epoch 3/200
58/58 - 1s - loss: 0.5661 - accuracy: 0.6841 - val_loss: 0.7610 - val_accuracy: 0.4706 - lr: 0.0600 - 938ms/epoch - 16ms/step
Epoch 4/200
58/58 - 1s - loss: 0.4617 - accuracy: 0.7669 - val_loss: 0.7333 - val_accuracy: 0.4902 - lr: 0.0600 - 977ms/epoch - 17ms/step
Epoch 5/200
58/58 - 1s - loss: 0.3725 - accuracy: 0.8519 - val_loss: 0.6599 - val_accuracy: 0.6275 - lr: 0.0600 - 920ms/epoch - 16ms/step
Epoch 6/200
58/58 - 1s - loss: 0.3256 - accuracy: 0.8627 - val_loss: 0.4923 - val_accuracy: 0.7451 - lr: 0.0360 - 1s/epoch - 17ms/step
Epoch 7/200
58/58 - 1s - loss: 0.3325 - accuracy: 0.8540 - val_loss: 0.4554 - val_accuracy: 0.7843 - lr: 0.0360 - 993ms/epoch - 17ms/step
Epoch 8/200
58/58 - 1s - loss: 0.2971 - 

  super().__init__(name, **kwargs)


Epoch 1/200
58/58 - 4s - loss: 2.4819 - accuracy: 0.6688 - val_loss: 0.6248 - val_accuracy: 0.5098 - lr: 0.1000 - 4s/epoch - 61ms/step
Epoch 2/200
58/58 - 1s - loss: 0.4470 - accuracy: 0.8039 - val_loss: 0.5913 - val_accuracy: 0.5098 - lr: 0.1000 - 604ms/epoch - 10ms/step
Epoch 3/200
58/58 - 1s - loss: 0.3493 - accuracy: 0.8606 - val_loss: 0.4115 - val_accuracy: 0.8627 - lr: 0.0600 - 683ms/epoch - 12ms/step
Epoch 4/200
58/58 - 1s - loss: 0.3315 - accuracy: 0.8649 - val_loss: 0.2633 - val_accuracy: 0.9216 - lr: 0.0600 - 657ms/epoch - 11ms/step
Epoch 5/200
58/58 - 1s - loss: 0.3101 - accuracy: 0.8736 - val_loss: 0.3109 - val_accuracy: 0.8431 - lr: 0.0600 - 623ms/epoch - 11ms/step
Epoch 6/200
58/58 - 1s - loss: 0.3502 - accuracy: 0.8475 - val_loss: 0.2244 - val_accuracy: 0.9216 - lr: 0.0360 - 582ms/epoch - 10ms/step
Epoch 7/200
58/58 - 1s - loss: 0.3002 - accuracy: 0.8824 - val_loss: 0.2189 - val_accuracy: 0.9216 - lr: 0.0360 - 638ms/epoch - 11ms/step
Epoch 8/200
58/58 - 1s - loss: 0.2825

  super().__init__(name, **kwargs)


58/58 - 3s - loss: 1.6166 - accuracy: 0.6449 - val_loss: 0.6299 - val_accuracy: 0.8039 - lr: 0.1000 - 3s/epoch - 44ms/step
Epoch 2/200
58/58 - 1s - loss: 0.4453 - accuracy: 0.7974 - val_loss: 0.6462 - val_accuracy: 0.5686 - lr: 0.1000 - 655ms/epoch - 11ms/step
Epoch 3/200
58/58 - 1s - loss: 0.3209 - accuracy: 0.8758 - val_loss: 0.5408 - val_accuracy: 0.7451 - lr: 0.0600 - 642ms/epoch - 11ms/step
Epoch 4/200
58/58 - 1s - loss: 0.3771 - accuracy: 0.8431 - val_loss: 0.5800 - val_accuracy: 0.6863 - lr: 0.0600 - 622ms/epoch - 11ms/step
Epoch 5/200
58/58 - 1s - loss: 0.3256 - accuracy: 0.8671 - val_loss: 0.4845 - val_accuracy: 0.7451 - lr: 0.0600 - 665ms/epoch - 11ms/step
Epoch 6/200
58/58 - 1s - loss: 0.3039 - accuracy: 0.8780 - val_loss: 0.3896 - val_accuracy: 0.8235 - lr: 0.0360 - 691ms/epoch - 12ms/step
Epoch 7/200
58/58 - 1s - loss: 0.2840 - accuracy: 0.8780 - val_loss: 0.2834 - val_accuracy: 0.8627 - lr: 0.0360 - 627ms/epoch - 11ms/step
Epoch 8/200
58/58 - 1s - loss: 0.2868 - accuracy:

In [51]:
from statistics import mean, stdev
print(mean(ACC_collecton),'±',stdev(ACC_collecton))
print(mean(BACC_collecton),'±',stdev(BACC_collecton))
print(mean(Sn_collecton),'±',stdev(Sn_collecton))
print(mean(Sp_collecton),'±',stdev(Sp_collecton))
print(mean(MCC_collecton),'±',stdev(MCC_collecton))
print(mean(AUC_collecton),'±',stdev(AUC_collecton))


0.907843137254902 ± 0.03927011473411714
0.9187015805122181 ± 0.037450078663330916
0.9861471861471861 ± 0.030935474045279358
0.85125597487725 ± 0.050911222016377794
0.8246166829868402 ± 0.07528877247879974
0.9149141633904253 ± 0.042544786753747725


In [52]:
ACC_collecton

[0.9215686274509803,
 0.9019607843137255,
 0.8235294117647058,
 0.8823529411764706,
 0.9019607843137255,
 0.9215686274509803,
 0.8823529411764706,
 0.9411764705882353,
 0.9411764705882353,
 0.9607843137254902]

### model evaluation in test dataset

In [24]:
# result collection list
ACC_collecton = []
BACC_collecton = []
Sn_collecton = []
Sp_collecton = []
MCC_collecton = []
AUC_collecton = []
######################
#HARE: Notice how test is passed on as validation!!!!!!
######################
model, model_history = ESM_CNN(X_train, y_train, X_test , y_test)
# confusion matrix 
predicted_class= []
predicted_protability = model.predict(X_test,batch_size=1)
for i in range(predicted_protability.shape[0]):
  index = np.where(predicted_protability[i] == np.amax(predicted_protability[i]))[0][0]
  predicted_class.append(index)
predicted_class = np.array(predicted_class)
y_true = y_test    
from sklearn.metrics import confusion_matrix
import math
# np.ravel() return a flatten 1D array
TP, FP, FN, TN = confusion_matrix(y_true, predicted_class).ravel() # shape [ [True-Positive, False-positive], [False-negative, True-negative] ]
ACC = (TP+TN)/(TP+TN+FP+FN)
ACC_collecton.append(ACC)
Sn_collecton.append(TP/(TP+FN))
Sp_collecton.append(TN/(TN+FP))
MCC = (TP*TN-FP*FN)/math.pow(((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)),0.5)
MCC_collecton.append(MCC)
BACC_collecton.append(0.5*TP/(TP+FN)+0.5*TN/(TN+FP))
from sklearn.metrics import roc_auc_score
AUC = roc_auc_score(y_test, predicted_protability[:,1])
AUC_collecton.append(AUC)

  super().__init__(name, **kwargs)


Epoch 1/200
64/64 - 5s - loss: 1.3020 - accuracy: 0.6490 - val_loss: 0.7436 - val_accuracy: 0.5000 - lr: 0.1000 - 5s/epoch - 81ms/step
Epoch 2/200
64/64 - 2s - loss: 0.5671 - accuracy: 0.7098 - val_loss: 0.7234 - val_accuracy: 0.5000 - lr: 0.1000 - 2s/epoch - 38ms/step
Epoch 3/200
64/64 - 2s - loss: 0.4892 - accuracy: 0.7843 - val_loss: 0.7524 - val_accuracy: 0.5000 - lr: 0.0600 - 2s/epoch - 36ms/step
Epoch 4/200
64/64 - 1s - loss: 0.4301 - accuracy: 0.8098 - val_loss: 0.7743 - val_accuracy: 0.5000 - lr: 0.0600 - 982ms/epoch - 15ms/step
Epoch 5/200
64/64 - 1s - loss: 0.3994 - accuracy: 0.8157 - val_loss: 0.7574 - val_accuracy: 0.5000 - lr: 0.0600 - 1s/epoch - 17ms/step
Epoch 6/200
64/64 - 1s - loss: 0.3726 - accuracy: 0.8118 - val_loss: 0.7089 - val_accuracy: 0.5652 - lr: 0.0360 - 1s/epoch - 19ms/step
Epoch 7/200
64/64 - 1s - loss: 0.3381 - accuracy: 0.8333 - val_loss: 0.7049 - val_accuracy: 0.5652 - lr: 0.0360 - 861ms/epoch - 13ms/step
Epoch 8/200
64/64 - 1s - loss: 0.3495 - accuracy:

In [25]:
print(ACC_collecton[0])
print(BACC_collecton[0])
print(Sn_collecton[0])
print(Sp_collecton[0])
print(MCC_collecton[0])
print(AUC_collecton[0])

0.6630434782608695
0.6888341543513957
0.6190476190476191
0.7586206896551724
0.3509312031717982
0.672022684310019


In [26]:
model.save('APP_tensorflow_model',save_format = 'tf') 
!zip -r /content/AMAP_alternative_tensorflow_model.zip /content/AMAP_alternative_tensorflow_model



INFO:tensorflow:Assets written to: APP_tensorflow_model\assets


INFO:tensorflow:Assets written to: APP_tensorflow_model\assets
'zip' is not recognized as an internal or external command,
operable program or batch file.


### t-SNE graph making

In [27]:
# loading datasset
X_train_data_name = 'BBP_train_esm2_t6_8M_UR50D_unified_320_dimension.csv'
X_train_data = pd.read_csv(X_train_data_name,header=0, index_col = 0,delimiter=',')
X_test_data_name = 'BBP_test_esm2_t6_8M_UR50D_unified_320_dimension.csv'
X_test_data = pd.read_csv(X_test_data_name,header=0, index_col = 0,delimiter=',')
X_train = np.array(X_train_data)
X_test = np.array(X_test_data)
# training dataset loading
dataset = pd.read_excel('BBP_train.xlsx',na_filter = False) # take care the NA sequence problem
# loading the y dataset for model development 
y_train = dataset['label']
y_train = np.array(y_train) # transformed as np.array for CNN model
# training dataset loading
dataset = pd.read_excel('BBP_test.xlsx',na_filter = False) # take care the NA sequence problem
# loading the y dataset for model development 
y_test = dataset['label']
y_test = np.array(y_test) # transformed as np.array for CNN model
# normalize the X data range (just )
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train) # normalize X to 0-1 range 
X_test = scaler.transform(X_test)
# concatenate the dataset
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

from sklearn.manifold import TSNE
from keras.datasets import mnist
from sklearn.datasets import load_iris
from numpy import reshape
import seaborn as sns
import pandas as pd  
tsne = TSNE(n_components=2, verbose=0, perplexity= 25, learning_rate='auto',n_iter = 5000,random_state=123)
z = tsne.fit_transform(X) 
df = pd.DataFrame()
df["comp-1"] = z[:,0]
df["comp-2"] = z[:,1]
y_new_label=[]
for i in y:
    if i == 0:
        y_new_label.append('Active')
    if i == 1:
        y_new_label.append('Inactive')
df["y"] = y_new_label
graph = sns.scatterplot(data=df, x="comp-1", y="comp-2", hue=y_new_label,
                palette='BrBG_r', legend='full')
graph_for_output = graph.get_figure()
graph_for_output.savefig('11.BBP_t-SNE.png', dpi=300)
df.to_excel('11.BBP_t-SNE.xlsx')


FileNotFoundError: [Errno 2] No such file or directory: 'BBP_train_esm2_t6_8M_UR50D_unified_320_dimension.csv'

In [1]:
# !pip install umap

Collecting umap
  Downloading umap-0.1.1.tar.gz (3.2 kB)
Building wheels for collected packages: umap
  Building wheel for umap (setup.py) ... [?25ldone
[?25h  Created wheel for umap: filename=umap-0.1.1-py3-none-any.whl size=3564 sha256=879a7899406d5aec8c819a383fa99940a5800f1e54e2ee1231bd20f2b57ac85c
  Stored in directory: /Users/zhenjiaodu/Library/Caches/pip/wheels/d4/13/91/2e752dc8dab5df027854bd33d2b65e1dc5cdc107fd1133990f
Successfully built umap
Installing collected packages: umap
Successfully installed umap-0.1.1


In [2]:
import umap
reducer = umap.UMAP()

AttributeError: module 'umap' has no attribute 'UMAP'