In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
import seaborn as sns
import sklearn
from Bio.Seq import Seq
from transformers import T5Tokenizer, TFT5EncoderModel, AdamWeightDecay
import re
np.random.seed(42)
tf.random.set_seed(42)
import pickle
import sys
import gc
import os
from antiberty import get_weights

In [2]:
df = pd.read_csv("../Data/CoV-AbDab_201222.csv")
df = df[["VHorVHH"]]
df = df[df["VHorVHH"].apply(lambda x: len(x) <= 138)]
df = df[(df.VHorVHH != 'ND')]
df
# df = df[["CDRH3"]]

Unnamed: 0,VHorVHH
0,EVQLVESGGGLIQPGGSLRLSCAASGLTVSSNYMNWVRQAPGKGLE...
1,EVQLVQSGGGLVQPGGSLRLSCLASGLTFSSYEFNWIRQAPGKGLE...
2,QVQLVQSGAEVKRPGASVKVLCMASGYSFTNYGINWVRQAPGQGLE...
3,EVQLVQSGAEVKKPRESLKISCKGSGYNFTSYWIGWVRQMPGKGLE...
4,EVQLVESGGGLVQPGGSLRLSCAASRFTFANYWMSWVRQAPGKGLE...
...,...
11999,EVQVVESGGGLVKPGGSLRLSCAASGFTFSSYTMNWVRQAPGKGLE...
12000,QMQLVQSGPEVKRPGTSVKVSCEASGFTFSSSAILWVRQPRGQRLE...
12001,QVQLVESGGGLVKPGGSLRLSCAASGFTFSDYYMNWIRQAPGKGLE...
12002,EVQLVESGGGLVQPGGSLRLSCAASGFTFSRFAMHWVRQAPGKGLE...


In [3]:
dummy = []
head = []
with open("../Data/cAb-rep/cAb-Rep_heavy.nt.txt") as myfile:
    # count = 0
    for i in myfile:
        # if count <= 1:
        #     print(i)
        #     if i.find(">") == -1 & i.find("-") == -1:
        #         print(Seq.translate(i.strip()))
        #     count+=1
        dummy.append(i)
    np.random.shuffle(dummy)
    
    for i in dummy:
        if i.find(">") == -1 & i.find("-") == -1 & i.find("N") == -1: # These conditions must be met for a valid sequence, the longest was 141. However, there is no 141 sequence for COVID, the greatest is 138, so we go with that
            aa_sequence = Seq.translate(i.strip())
            if (len(aa_sequence) <= 138) & (len(aa_sequence) >= 100):
                head.append(aa_sequence)
                if len(head) >= 11538:
                    break
print(head[:5], len(head))
healthy_sequences = head



['EVQLVQSGPEVKKPGSSVKVSCKASGGTFSNFAFSWVRQAPGQGLEWMGSVILHLGTSTYAQKFQGRVTITADESTSAAFMDLNALTSDDTAVYYCARVVAVPGRVPYWFDPWGQGTLVTVSS', 'TLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARVPPTSTVTTLGDDYWGQGTLVTVSS', 'QVQLVQSGPEVKKPGASVRVSCKPSGYPFSNYGISWMRQAPGQGLEWMGWVNIDKGNTKYAQKFQDRVTMTTDTSSSTVYLELRSLRSDDTALYYCARERGGYRYGDYWGQGTLVIVSS', 'TLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEIKHSGSTNYIPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCASRAGAAAASWGQGTLVTVSS', 'SETLSLTCAVHGGSFSDYYWTWIRQPPGKGLEWIGEINHRGGTNYNPSLKSRLNILVDTSKSQFSLKLSSVTAADTAVYFCARERFILIRGLTKYYYYMDVWGKGTTVTVS'] 11538


In [4]:
del head
del myfile
del dummy
gc.collect()

20

In [5]:
covid_sequences = df.to_numpy()
covid_sequences = np.squeeze(covid_sequences)
np.random.shuffle(covid_sequences)
print(len(max(healthy_sequences, key=len)))
print(len(max(covid_sequences, key=len)))

138
138


In [6]:
del df
gc.collect()

0

In [7]:
healthy_lables = [0] * 11538
covid_lables = [1] * 11538

In [8]:
X = np.concatenate((healthy_sequences, covid_sequences))
y = np.concatenate((healthy_lables, covid_lables))

In [9]:
X = X.tolist()
y = y.tolist()

In [10]:
del healthy_sequences
del covid_sequences
gc.collect()

0

## Splitting the data

Since the data we're loading isn't prepared for us as a machine learning dataset, we'll have to split the data into train and test sets ourselves! We can use sklearn's function for that:

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [12]:
del X
del y
gc.collect()

0

In [13]:
X_train = [(" ".join(s)) for s in X_train]
X_test = [(" ".join(s)) for s in X_test]

In [14]:
from transformers import RobertaTokenizer, AutoTokenizer, BertTokenizer

# tokenizer = RobertaTokenizer.from_pretrained("../antiberta/antibody-tokenizer") # idk if they actually have a tokenizer
tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert_bfd")

In [15]:
X_train[0]

'E V Q L V E S G G G L V Q P G G S Q R L S C A A S G F T V S S N Y M S W I R Q A P G K G L E W V S V I Y S G G S A Y Y V D S V K G R F T I S R D N S K N T L Y L Q M N S L R P E D T A V Y Y C A R I A N Y M D V W G K G T T V T V S S'

In [16]:
tokenizer(X_train[0])

{'input_ids': [2, 9, 8, 18, 5, 8, 9, 10, 7, 7, 7, 5, 8, 18, 16, 7, 7, 10, 18, 13, 5, 10, 23, 6, 6, 10, 7, 19, 15, 8, 10, 10, 17, 20, 21, 10, 24, 11, 13, 18, 6, 16, 7, 12, 7, 5, 9, 24, 8, 10, 8, 11, 20, 10, 7, 7, 10, 6, 20, 20, 8, 14, 10, 8, 12, 7, 13, 19, 15, 11, 10, 13, 14, 17, 10, 12, 17, 15, 5, 20, 5, 18, 21, 17, 10, 5, 13, 16, 9, 14, 15, 6, 8, 20, 20, 23, 6, 13, 11, 6, 17, 20, 21, 14, 8, 24, 7, 12, 7, 15, 15, 8, 15, 8, 10, 10, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

This looks good! We can see that our sequence has been converted into `input_ids`, which is the tokenized sequence, and an `attention_mask`. The attention mask handles the case when we have sequences of variable length - in those cases, the shorter sequences are padded with blank "padding" tokens, and the attention mask is padded with 0s to indicate that those tokens should be ignored by the model.

So now, let's tokenize our whole dataset. Note that we don't need to do anything with the labels, as they're already in the format we need.

In [17]:
train_tokenized = tokenizer(X_train)
test_tokenized = tokenizer(X_test)
# val_tokenized = tokenizer(X_val)

In [18]:
from datasets import Dataset
train_dataset = Dataset.from_dict(train_tokenized)
test_dataset = Dataset.from_dict(test_tokenized)
# val_dataset = Dataset.from_dict(val_tokenized)

train_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 20768
})

This looks good, but we're missing our labels! Let's add those on as an extra column to the datasets.

In [19]:
train_dataset = train_dataset.add_column("labels", y_train) # train_labels = y_train
test_dataset = test_dataset.add_column("labels", y_test)
# val_dataset = val_dataset.add_column("labels", y_val)

train_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 20768
})

In [20]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("josephyu12/antibertyv3-finetuned-healthy-covid-classification")
# model = TFAutoModelForSequenceClassification.from_pretrained(get_weights(), num_labels=2, from_pt=True)

model.compile(optimizer=AdamWeightDecay(2e-5), metrics=["accuracy"])
model.summary()

Metal device set to: Apple M1 Max

systemMemory: 64.00 GB
maxCacheSize: 24.00 GB



2023-01-05 07:32:30.049513: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-01-05 07:32:30.049633: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
Some layers from the model checkpoint at josephyu12/antiberty-finetuned-healthy-covid-classification were not used when initializing TFBertForSequenceClassification: ['dropout_77']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertFor

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  25758720  
                                                                 
 dropout_25 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1026      
                                                                 
Total params: 25,759,746
Trainable params: 25,759,746
Non-trainable params: 0
_________________________________________________________________


In [21]:
tf_train_set = model.prepare_tf_dataset(
    train_dataset,
    batch_size=8,
    shuffle=True,
    tokenizer=tokenizer
)

# tf_val_set = model.prepare_tf_dataset(
#     val_dataset,
#     batch_size=8,
#     shuffle=False,
#     tokenizer=tokenizer
# )

tf_test_set = model.prepare_tf_dataset(
    test_dataset,
    batch_size=8,
    shuffle=False,
    tokenizer=tokenizer
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
history = model.fit(tf_train_set, validation_data=tf_test_set, epochs=3)

Epoch 1/3


2023-01-04 06:38:23.415180: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-01-04 06:38:29.623255: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-01-04 07:01:06.927384: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


In [23]:
model.label2id = {"healthy": 0, "covid": 1}
model.id2label = {val: key for key, val in model.label2id.items()}

Now we can push it to the hub as simply as...

In [24]:
model_name = "antibertyv3"
finetuned_model_name = f"{model_name}-finetuned-healthy-covid-classification"

model.push_to_hub(finetuned_model_nameqwer)
tokenizer.push_to_hub(finetuned_model_nameqwer)

NameError: name 'finetuned_model_nameqwer' is not defined

In [None]:
import os
os._exit(00)

: 

: 