In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Layer
from tensorflow.keras.layers import Embedding, Input, GlobalAveragePooling1D, Dense
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential, Model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt 
import seaborn as sns
import sklearn
from Bio.Seq import Seq
from transformers import TFBertModel, BertTokenizer,BertConfig
import re
import pickle
import sys
import gc
import os

# Reproducibility: tf.keras.utils.set_random_seed seeds Python's `random`,
# NumPy and TensorFlow in one call. The commented-out lines are the manual
# per-library equivalents, kept for reference.
# np.random.seed(42)
# tf.random.set_seed(42)
# os.environ['PYTHONHASHSEED']=str(42)
tf.keras.utils.set_random_seed(42)


# Data Loading

In [2]:
# Load the pretrained ProtBert-BFD tokenizer. do_lower_case=False keeps the
# upper-case residue letters used by the sequences below.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert_bfd", do_lower_case=False)

In [3]:
# Load the CoV-AbDab export and keep only the "VHorVHH" sequence column.
df = pd.read_csv("../Data/CoV-AbDab_201222.csv")
df = df[["VHorVHH"]]
# Keep sequences of at most 138 residues and drop the 'ND' placeholder rows
# (presumably "not determined" - verify against the dataset docs) with one mask.
# `.str.len()` yields NaN for missing cells and NaN <= 138 is False, so empty
# cells are filtered out instead of raising TypeError inside `len()`.
df = df[(df["VHorVHH"].str.len() <= 138) & (df["VHorVHH"] != "ND")]
df
# df = df[["CDRH3"]]

Unnamed: 0,VHorVHH
0,EVQLVESGGGLIQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLE...
1,EVQLVQSGGGLVQPGGSLRLSCLASGLTFSSYEFNWIRQAPGKGLE...
2,QVQLVQSGAEVKRPGASVKVLCMASGYSFTNYGINWVRQAPGQGLE...
3,EVQLVQSGAEVKKPRESLKISCKGSGYNFTSYWIGWVRQMPGKGLE...
4,EVQLVESGGGLVQPGGSLRLSCAASRFTFANYWMSWVRQAPGKGLE...
...,...
11999,EVQVVESGGGLVKPGGSLRLSCAASGFTFSSYTMNWVRQAPGKGLE...
12000,QMQLVQSGPEVKRPGTSVKVSCEASGFTFSSSAILWVRQPRGQRLE...
12001,QVQLVESGGGLVKPGGSLRLSCAASGFTFSDYYMNWIRQAPGKGLE...
12002,EVQLVESGGGLVQPGGSLRLSCAASGFTFSRFAMHWVRQAPGKGLE...


In [4]:
# Build the negative ("healthy") class: read the cAb-Rep nucleotide file,
# translate each usable line to amino acids and keep a random sample of
# sequences whose length is comparable to the CoV-AbDab sequences.
head = []
# Balance the classes: collect as many healthy sequences as there are filtered
# CoV-AbDab rows (11538 in the recorded run).
n_healthy_target = len(df)

with open("../Data/cAb-rep/cAb-Rep_heavy.nt.txt") as myfile:
    dummy = list(myfile)

# Shuffle once (seeded globally above) so the sample is random but reproducible.
# Done after the file is closed; only the in-memory lines are needed.
np.random.shuffle(dummy)

for line in dummy:
    # A usable line has no ">" (presumably a FASTA-style header), no "-" gap
    # and no ambiguous "N" base. The previous `find(..) == -1 & ...` form only
    # worked because `&` binds tighter than `==`, turning it into a chained
    # comparison; the membership tests state the intent directly.
    if ">" in line or "-" in line or "N" in line:
        continue
    # Use the documented Seq(...).translate() API and store plain strings.
    aa_sequence = str(Seq(line.strip()).translate())
    # The longest CoV-AbDab sequence kept above is 138 residues, so cap at 138;
    # anything shorter than 100 residues is discarded.
    if 100 <= len(aa_sequence) <= 138:
        head.append(aa_sequence)
        if len(head) >= n_healthy_target:
            break

print(head[:5], len(head))
healthy_sequences = head



['EVQLVQSGPEVKKPGSSVKVSCKASGGTFSNFAFSWVRQAPGQGLEWMGSVILHLGTSTYAQKFQGRVTITADESTSAAFMDLNALTSDDTAVYYCARVVAVPGRVPYWFDPWGQGTLVTVSS', 'TLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARVPPTSTVTTLGDDYWGQGTLVTVSS', 'QVQLVQSGPEVKKPGASVRVSCKPSGYPFSNYGISWMRQAPGQGLEWMGWVNIDKGNTKYAQKFQDRVTMTTDTSSSTVYLELRSLRSDDTALYYCARERGGYRYGDYWGQGTLVIVSS', 'TLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEIKHSGSTNYIPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCASRAGAAAASWGQGTLVTVSS', 'SETLSLTCAVHGGSFSDYYWTWIRQPPGKGLEWIGEINHRGGTNYNPSLKSRLNILVDTSKSQFSLKLSSVTAADTAVYFCARERFILIRGLTKYYYYMDVWGKGTTVTVS'] 11538


In [5]:
# Remove the loading temporaries from the namespace. The translated sequences
# stay reachable through `healthy_sequences`; the raw file lines in `dummy`
# become collectable.
del head, myfile, dummy
gc.collect()

0

In [6]:
def _to_protbert_input(sequences):
    """Return `sequences` as a NumPy array of space-separated residue strings.

    The letters U, Z, O and B are replaced with "X" and the residues are then
    joined with single spaces, e.g. "EVB" -> "E V X". Existing spaces are
    removed first, so applying this to already-prepared sequences (for example
    when the cell is re-run) gives the same result instead of double-spacing.
    """
    return np.array(
        [" ".join(re.sub(r"[UZOB]", "X", sequence.replace(" ", ""))) for sequence in sequences]
    )


# Shuffle a copy: to_numpy() may return a view of the frame's data, and an
# in-place shuffle of a view would silently reorder `df` as well.
covid_sequences = df["VHorVHH"].to_numpy(copy=True)
np.random.shuffle(covid_sequences)

healthy_sequences = _to_protbert_input(healthy_sequences)
covid_sequences = _to_protbert_input(covid_sequences)

# Longest prepared string per class; 138 residues -> 138 + 137 spaces = 275 characters.
print(len(max(healthy_sequences, key=len)))
print(len(max(covid_sequences, key=len)))

275
275


In [7]:
# Sanity check: show the space-separated sequences that are passed to the tokenizer next.
print(healthy_sequences)

['E V Q L V Q S G P E V K K P G S S V K V S C K A S G G T F S N F A F S W V R Q A P G Q G L E W M G S V I L H L G T S T Y A Q K F Q G R V T I T A D E S T S A A F M D L N A L T S D D T A V Y Y C A R V V A V P G R V P Y W F D P W G Q G T L V T V S S'
 'T L S L T C A V Y G G S F S G Y Y W S W I R Q P P G K G L E W I G E I N H S G S T N Y N P S L K S R V T I S V D T S K N Q F S L K L S S V T A A D T A V Y Y C A R V P P T S T V T T L G D D Y W G Q G T L V T V S S'
 'Q V Q L V Q S G P E V K K P G A S V R V S C K P S G Y P F S N Y G I S W M R Q A P G Q G L E W M G W V N I D K G N T K Y A Q K F Q D R V T M T T D T S S S T V Y L E L R S L R S D D T A L Y Y C A R E R G G Y R Y G D Y W G Q G T L V I V S S'
 ...
 'Q S G A E V K K A G E S L R I S C K A S G Y S F A S Y W I G W V R Q M P G K G L E C M G I I N P G D S D T R Y S P S F Q G H V T I S V D K S V N T A Y L Q W S S L K A S D T A I Y N C S K Q I I T Y S S G W Y G F D Y W G Q G T L V T V S'
 'D V Q L L E S G G G L A Q P G G S L R L S C A A S G

In [8]:
# Pad every sequence to a fixed 140 tokens (138 residues + 2 special tokens)
# instead of to the longest sequence in this batch. With padding=True the
# width depends on the data, so it could differ from the COVID token matrix
# and from the model's 140-token input.
# NOTE(review): only `input_ids` is kept; the attention mask is discarded, so
# padding positions are not masked downstream.
ids = tokenizer.batch_encode_plus(
    healthy_sequences,
    add_special_tokens=True,
    padding="max_length",
    max_length=140,
    return_tensors="tf",
)
healthy_tokens = ids['input_ids']
print(healthy_tokens)

Metal device set to: Apple M1 Max

systemMemory: 64.00 GB
maxCacheSize: 24.00 GB

tf.Tensor(
[[ 2  9  8 ...  0  0  0]
 [ 2 15  5 ...  0  0  0]
 [ 2 18  8 ...  0  0  0]
 ...
 [ 2 18 10 ...  0  0  0]
 [ 2 14  8 ...  0  0  0]
 [ 2 18  8 ...  0  0  0]], shape=(11538, 140), dtype=int32)


2023-01-17 22:31:18.581630: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-01-17 22:31:18.581753: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [9]:
# Pad every sequence to a fixed 140 tokens (138 residues + 2 special tokens)
# instead of to the longest sequence in this batch. With padding=True the
# width depends on the data, so it could differ from the healthy token matrix
# and from the model's 140-token input.
# NOTE(review): only `input_ids` is kept; the attention mask is discarded, so
# padding positions are not masked downstream.
ids = tokenizer.batch_encode_plus(
    covid_sequences,
    add_special_tokens=True,
    padding="max_length",
    max_length=140,
    return_tensors="tf",
)
covid_tokens = ids['input_ids']
print(covid_tokens)

tf.Tensor(
[[ 2 18  8 ...  0  0  0]
 [ 2 18  8 ...  0  0  0]
 [ 2 18  8 ...  0  0  0]
 ...
 [ 2 18  8 ...  0  0  0]
 [ 2 18  8 ...  0  0  0]
 [ 2 18  8 ...  0  0  0]], shape=(11538, 140), dtype=int32)


In [10]:
# One label per tokenized sequence: 0 = healthy repertoire, 1 = CoV-AbDab.
# The counts come from the token tensors rather than a hard-coded 11538, so
# the labels always line up with the data.
healthy_lables = [0] * int(healthy_tokens.shape[0])
covid_lables = [1] * int(covid_tokens.shape[0])

In [11]:
# Stack both classes row-wise: healthy samples first, then COVID samples,
# with the labels concatenated in the same order.
X = np.concatenate([healthy_tokens, covid_tokens], axis=0)
y = np.concatenate([healthy_lables, covid_lables], axis=0)

In [12]:
# Hold out 10% of all samples as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# ... then take 10% of the remaining samples as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [13]:
# The full arrays are no longer needed once the splits exist.
del X, y
gc.collect()

0

In [14]:
# vocab_size = 20000  # Only consider the top 20k words
# maxlen = 200  # Only consider the first 200 words of each movie review

# (x_train, y_train), (x_val, y_val) = imdb.load_data(num_words=vocab_size)
# print(len(x_train), "Training sequences")
# print(len(x_val), "Validation sequences")

In [15]:
# print(x_train.shape)

In [16]:
# x_train = tf.keras.utils.pad_sequences(x_train, maxlen=maxlen)
# x_val = tf.keras.utils.pad_sequences(x_val, maxlen=maxlen)

In [17]:
# print(x_train.shape)

# Model Build and Train

In [None]:
# Structure borrowed from https://keras.io/examples/nlp/text_classification_with_transformer/

In [18]:
class TransformerBlock(Layer):
    """Single post-norm transformer encoder block.

    Self-attention followed by a two-layer feed-forward network, each wrapped
    in dropout, a residual connection and layer normalization.

    Args:
        embed_dim: Size of the last axis of the input; also the output size of
            the feed-forward network and the `key_dim` given to the attention
            layer (so every head projects to `embed_dim`, not
            `embed_dim // num_heads`).
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        rate: Dropout rate applied to the attention and feed-forward outputs.
        **kwargs: Standard Keras `Layer` arguments (`name`, `dtype`, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the hyper-parameters so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [Dense(ff_dim, activation="relu"),
             Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`.

        `training` defaults to None so the layer can also be called directly
        without the flag; Keras supplies it during fit/evaluate/predict.
        """
        # Self-attention: the same tensor is used as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [19]:
class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions in the position-embedding table; inputs
            must not be longer than this along their last axis.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Size of each embedding vector.
        **kwargs: Standard Keras `Layer` arguments (`name`, `dtype`, ...).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the hyper-parameters so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids `x` and add the embedding of each position index."""
        # Use the runtime length of the last axis rather than the constructor
        # value, so shorter inputs also work.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # The position embeddings broadcast over the leading axes of `x`.
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and re-created."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [20]:
# Hyper-parameters of the classifier.
embed_dim = 1024  # Embedding size for each token
num_heads = 16  # Number of attention heads
ff_dim = 512  # Hidden layer size in feed forward network inside transformer
# Width of the token matrices produced by the tokenizer cells (shape (11538, 140)).
maxlen = 140
# Number of rows in the token-embedding table.
# NOTE(review): presumably the Rostlab/prot_bert_bfd vocabulary size - confirm with tokenizer.vocab_size.
vocab_size = 30

# x6 greater
# NOTE(review): the note above is unexplained - state what is six times greater than what.

# Functional model: token ids -> token+position embedding -> one transformer
# block -> mean over the sequence axis -> dense head -> sigmoid probability.
inputs = Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# Average over all 140 positions; padding positions are included in the mean.
x = GlobalAveragePooling1D()(x)
x = Dropout(0.2, seed=42)(x)
x = Dense(256, activation="relu")(x)
x = Dense(64, activation="relu")(x)
x = Dropout(0.3, seed=42)(x)
# Single sigmoid unit; the labels are 0 (healthy) and 1 (COVID).
outputs = Dense(1, activation="sigmoid")(x)

model = Model(inputs=inputs, outputs=outputs)

In [21]:
# Print the layers with their output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 140)]             0         
                                                                 
 token_and_position_embeddin  (None, 140, 1024)        174080    
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_block (Transfor  (None, 140, 1024)        68213248  
 merBlock)                                                       
                                                                 
 global_average_pooling1d (G  (None, 1024)             0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_2 (Dropout)         (None, 1024)              0     

In [22]:
# Binary classification: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train for 10 epochs in batches of 32, validating on the held-out split
# after every epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
)

Epoch 1/10


2023-01-17 22:31:25.785465: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-01-17 22:31:26.470490: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-01-17 23:19:41.157358: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
# Score the trained model on the held-out test split.
results = model.evaluate(X_test, y_test, verbose=2)

# Report each metric next to its name, rounded to four decimals.
for name, value in zip(model.metrics_names, results):
    print(f"{name}: {value:.4f}")

73/73 - 129s - loss: 0.4441 - accuracy: 0.7660 - 129s/epoch - 2s/step
loss: 0.4441
accuracy: 0.7660
