In [18]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
import seaborn as sns
import sklearn
from Bio.Seq import Seq
from transformers import T5Tokenizer, TFT5EncoderModel, AdamWeightDecay
import re
np.random.seed(42)
tf.random.set_seed(42)
import pickle
import sys
import gc
import os
from antiberty import get_weights

In [19]:
df = pd.read_csv("../Data/CoV-AbDab_031022.csv")
df = df[["VHorVHH"]]
df = df[df["VHorVHH"].apply(lambda x: len(x) <= 138)]
df = df[(df.VHorVHH != 'ND')]
df
# df = df[["CDRH3"]]

Unnamed: 0,VHorVHH
5,QITLKESGPTLVKPTQTLTLTCKLSGFSVNTGGVGVGWIRQPPGKA...
32,QVQLVQSGAEVKKPGSSVKVSCKASGDTFNIYAINWVRQAPGQGLE...
33,QVQLVQSGAEVKKPGSSVKVSCKASGGTFNSYAITWVRQAPGQGLE...
34,QVQLVESGGGVVQPGRSLRLSCAASGFTFSTHGMHWVRQAPGKGLE...
35,QVQLVQSGAEVKKPGSSVKVSCKASGGTFRRYAISWVRQAPGQGLE...
...,...
11862,EVQVVESGGGLVKPGGSLRLSCAASGFTFSSYTMNWVRQAPGKGLE...
11863,QMQLVQSGPEVKRPGTSVKVSCEASGFTFSSSAILWVRQPRGQRLE...
11864,QVQLVESGGGLVKPGGSLRLSCAASGFTFSDYYMNWIRQAPGKGLE...
11865,EVQLVESGGGLVQPGGSLRLSCAASGFTFSRFAMHWVRQAPGKGLE...


In [20]:
dummy = []
head = []
with open("../Data/cAb-rep/cAb-Rep_heavy.nt.txt") as myfile:
    # count = 0
    for i in myfile:
        # if count <= 1:
        #     print(i)
        #     if i.find(">") == -1 & i.find("-") == -1:
        #         print(Seq.translate(i.strip()))
        #     count+=1
        dummy.append(i)
    np.random.shuffle(dummy)
    
    for i in dummy:
        if i.find(">") == -1 & i.find("-") == -1 & i.find("N") == -1: # These conditions must be met for a valid sequence, the longest was 141. However, there is no 141 sequence for COVID, the greatest is 138, so we go with that
            aa_sequence = Seq.translate(i.strip())
            if (len(aa_sequence) <= 138) & (len(aa_sequence) >= 100):
                head.append(aa_sequence)
                if len(head) >= 11415:
                    break
print(head[:5], len(head))
healthy_sequences = head



['EVQLVQSGPEVKKPGSSVKVSCKASGGTFSNFAFSWVRQAPGQGLEWMGSVILHLGTSTYAQKFQGRVTITADESTSAAFMDLNALTSDDTAVYYCARVVAVPGRVPYWFDPWGQGTLVTVSS', 'TLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCARVPPTSTVTTLGDDYWGQGTLVTVSS', 'QVQLVQSGPEVKKPGASVRVSCKPSGYPFSNYGISWMRQAPGQGLEWMGWVNIDKGNTKYAQKFQDRVTMTTDTSSSTVYLELRSLRSDDTALYYCARERGGYRYGDYWGQGTLVIVSS', 'TLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEIKHSGSTNYIPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCASRAGAAAASWGQGTLVTVSS', 'SETLSLTCAVHGGSFSDYYWTWIRQPPGKGLEWIGEINHRGGTNYNPSLKSRLNILVDTSKSQFSLKLSSVTAADTAVYFCARERFILIRGLTKYYYYMDVWGKGTTVTVS'] 11415


In [21]:
del head
del myfile
del dummy
gc.collect()

0

In [22]:
covid_sequences = df.to_numpy()
covid_sequences = np.squeeze(covid_sequences)
np.random.shuffle(covid_sequences)
print(len(max(healthy_sequences, key=len)))
print(len(max(covid_sequences, key=len)))

138
138


In [23]:
del df
gc.collect()

0

In [24]:
healthy_lables = [0] * 11415
covid_lables = [1] * 11415

In [25]:
X = np.concatenate((healthy_sequences, covid_sequences))
y = np.concatenate((healthy_lables, covid_lables))

In [26]:
X = X.tolist()
y = y.tolist()

In [27]:
del healthy_sequences
del covid_sequences
gc.collect()

0

## Splitting the data

Since the data we're loading isn't prepared for us as a machine learning dataset, we'll have to split the data into train and test sets ourselves! We can use sklearn's function for that:

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [29]:
del X
del y
gc.collect()

0

In [30]:
X_train = [(" ".join(s)) for s in X_train]
X_test = [(" ".join(s)) for s in X_test]

In [31]:
from transformers import RobertaTokenizer, AutoTokenizer, BertTokenizer

tokenizer = RobertaTokenizer.from_pretrained("../antiberta/antibody-tokenizer") # idk if they actually have a tokenizer
# tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert_bfd")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


In [32]:
X_train[0]

'E V Q L V E S G G G L I Q P G G S L R L S C A A S G F T V S S N Y M S W V R Q A P G K G L E W V S V I Y S G G S T Y Y A D S V K G R F T V S R D N S K N T L Y L Q M N S L R A E D T A V Y Y C A R G G R Y D Y D V F D I W G Q G T M V T V S S'

In [33]:
tokenizer(X_train[0])

{'input_ids': [0, 8, 3, 22, 3, 18, 3, 14, 3, 22, 3, 8, 3, 20, 3, 10, 3, 10, 3, 10, 3, 14, 3, 12, 3, 18, 3, 17, 3, 10, 3, 10, 3, 20, 3, 14, 3, 19, 3, 14, 3, 20, 3, 6, 3, 5, 3, 5, 3, 20, 3, 10, 3, 9, 3, 21, 3, 22, 3, 20, 3, 20, 3, 16, 3, 24, 3, 15, 3, 20, 3, 23, 3, 22, 3, 19, 3, 18, 3, 5, 3, 17, 3, 10, 3, 13, 3, 10, 3, 14, 3, 8, 3, 23, 3, 22, 3, 20, 3, 22, 3, 12, 3, 24, 3, 20, 3, 10, 3, 10, 3, 20, 3, 21, 3, 24, 3, 24, 3, 5, 3, 7, 3, 20, 3, 22, 3, 13, 3, 10, 3, 19, 3, 9, 3, 21, 3, 22, 3, 20, 3, 19, 3, 7, 3, 16, 3, 20, 3, 13, 3, 16, 3, 21, 3, 14, 3, 24, 3, 14, 3, 18, 3, 15, 3, 16, 3, 20, 3, 14, 3, 19, 3, 5, 3, 8, 3, 7, 3, 21, 3, 5, 3, 22, 3, 24, 3, 24, 3, 6, 3, 5, 3, 19, 3, 10, 3, 10, 3, 19, 3, 24, 3, 7, 3, 24, 3, 7, 3, 22, 3, 9, 3, 7, 3, 12, 3, 23, 3, 10, 3, 18, 3, 10, 3, 21, 3, 15, 3, 22, 3, 21, 3, 22, 3, 20, 3, 20, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

This looks good! We can see that our sequence has been converted into `input_ids`, which is the tokenized sequence, and an `attention_mask`. The attention mask handles the case when we have sequences of variable length - in those cases, the shorter sequences are padded with blank "padding" tokens, and the attention mask is padded with 0s to indicate that those tokens should be ignored by the model.

So now, let's tokenize our whole dataset. Note that we don't need to do anything with the labels, as they're already in the format we need.

In [34]:
train_tokenized = tokenizer(X_train)
test_tokenized = tokenizer(X_test)
val_tokenized = tokenizer(X_val)

In [35]:
from datasets import Dataset
train_dataset = Dataset.from_dict(train_tokenized)
test_dataset = Dataset.from_dict(test_tokenized)
val_dataset = Dataset.from_dict(val_tokenized)

train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 18492
})

This looks good, but we're missing our labels! Let's add those on as an extra column to the datasets.

In [36]:
train_dataset = train_dataset.add_column("labels", y_train) # train_labels = y_train
test_dataset = test_dataset.add_column("labels", y_test)
val_dataset = val_dataset.add_column("labels", y_val)

train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 18492
})

In [37]:
from transformers import TFAutoModelForSequenceClassification

# model = TFAutoModelForSequenceClassification.from_pretrained("josephyu12/antibertyv3-finetuned-healthy-covid-classification")
model = TFAutoModelForSequenceClassification.from_pretrained(get_weights(), num_labels=2, from_pt=True)

model.compile(optimizer=AdamWeightDecay(2e-5), metrics=["accuracy"])
model.summary()

2023-01-04 22:40:21.776022: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-01-04 22:40:21.776272: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1 Max

systemMemory: 64.00 GB
maxCacheSize: 24.00 GB



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids', 'cls.species.weight', 'cls.graft.bias', 'cls.chain.weight', 'cls.species.bias', 'cls.graft.weight', 'cls.chain.bias']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task 

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  25758720  
                                                                 
 dropout_25 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1026      
                                                                 
Total params: 25,759,746
Trainable params: 25,759,746
Non-trainable params: 0
_________________________________________________________________


In [38]:
tf_train_set = model.prepare_tf_dataset(
    train_dataset,
    batch_size=8,
    shuffle=True,
    tokenizer=tokenizer
)

tf_val_set = model.prepare_tf_dataset(
    val_dataset,
    batch_size=8,
    shuffle=False,
    tokenizer=tokenizer
)

tf_test_set = model.prepare_tf_dataset(
    test_dataset,
    batch_size=8,
    shuffle=False,
    tokenizer=tokenizer
)

In [39]:
history = model.fit(tf_train_set, validation_data=tf_val_set, epochs=3)

Epoch 1/3


2023-01-04 22:40:23.826264: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-01-04 22:40:30.709403: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-01-04 23:23:46.593898: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
  67/2311 [..............................] - ETA: 1:15:38 - loss: 0.4261 - accuracy: 0.7892

In [23]:
model.label2id = {"healthy": 0, "covid": 1}
model.id2label = {val: key for key, val in model.label2id.items()}

Now we can push it to the hub as simply as...

In [24]:
model_name = "antibertyv3"
finetuned_model_name = f"{model_name}-finetuned-healthy-covid-classification"

model.push_to_hub(finetuned_model_nameqwer)
tokenizer.push_to_hub(finetuned_model_nameqwer)

NameError: name 'finetuned_model_nameqwer' is not defined

In [None]:
import os
os._exit(00)

: 

: 