In [1]:
# Libraries needed, may need to do a "pip3 install [package-name]"
# To install jupyer lab: "pip3 install jupyterlab"
# "pip3 install jupyterlab torch transformers pandas scikit-learn"
# Launching jupyter lab: "jupyter lab"
import torch
import pandas as pd
from torch.optim import AdamW
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification

In [2]:
# Splitting the data and tokenizing it
data = pd.DataFrame({'text': ['sample text1', 'sample text2'], 'label': [0, 1]})
X = data['text'].values
Y = data['label'].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

train_tokens = tokenizer(list(X_train), padding=True, truncation=True, max_length=512, return_tensors='pt')
test_tokens = tokenizer(list(X_test), padding=True, truncation=True, max_length=512, return_tensors='pt')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# This class allows the data to be formatted correctly with the DataLoader in PyTorch.
# We don't need this, but it simplfies the process a lot.
# DataLoader is a utility that simplifies loading and managing datasets, especially large ones.
# DataLoader allows us to use batches, which means we can group multiple inputs together instead of
# doing this one by one which is incredibly time saving.
class TokenData(Dataset):
    def __init__(self, train = False):
        if train:
            self.text_data = X_train
            self.tokens = train_tokens
            self.labels = list(Y_train)
        else:
            self.text_data = X_test
            self.tokens = test_tokens
            self.labels = list(Y_test)

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        sample = {}
        for k, v in self.tokens.items():
            sample[k] = torch.tensor(v[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [4]:
# Loading the DataLoader, Model, Optimizer, and Loss function
train_data = TokenData(train=True)
test_data = TokenData(train=False)
train_loader = DataLoader(train_data, batch_size=8, shuffle=True)
test_loader = DataLoader(test_data, batch_size=8, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)
optimizer = AdamW(bert_model.parameters(), lr=1e-5)

In [5]:
num_epochs = 3
batch_size = 8
for epoch in range(num_epochs):
    print("Epoch: ",(epoch + 1))
    # Sets the model to train mode.
    bert_model.train()
    # Here, we iterate over each batch in the train_loader dataset.
    for i,batch in enumerate(train_loader):
        # Move each batch to the CPU.
        batch = {k: v.to(device) for k, v in batch.items()}
        # We reset the gradients from the previous step before setting them for the current step.
        optimizer.zero_grad()

        # <-- TODO --->
        outputs = bert_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        # <------------>

        # Calculating the running loss for logging purposes
        train_batch_loss = loss.item()
        train_last_loss = train_batch_loss / batch_size
        print('Training batch {} last loss: {}'.format(i + 1, train_last_loss))

    # Logging epoch-wise training loss
    print(f"\nTraining epoch {epoch + 1} loss: ",train_last_loss)
    # TRAINING BLOCK ENDS

    # Set the model to eval() mode.
    bert_model.eval()
    correct = 0
    test_pred = []
    total_test_loss = 0

    # Testing the accuracy of our code on the test data
    for i, batch in enumerate(test_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        # We don't need gradients for testing
        with torch.no_grad():
            outputs = bert_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])

        # <!-- TODO -->
            loss = outputs.loss
            logits = outputs.logits


            total_test_loss += loss.item()
            test_last_loss = total_test_loss / len(test_loader)

        # <----------->
        print('Testing batch {} loss: {}'.format(i + 1, test_last_loss))

        # Comparing the predicted target with the labels in the batch
        correct += (logits.argmax(1) == batch['labels']).sum().item()
        print("Testing accuracy: ",correct/((i + 1) * batch_size))

    print(f"\nTesting epoch {epoch + 1} last loss: ",test_last_loss)

Epoch:  1


  sample[k] = torch.tensor(v[idx])


Training batch 1 last loss: 0.09044206887483597

Training epoch 1 loss:  0.09044206887483597
Testing batch 1 loss: 0.9449033141136169
Testing accuracy:  0.0

Testing epoch 1 last loss:  0.9449033141136169
Epoch:  2
Training batch 1 last loss: 0.060301344841718674

Training epoch 2 loss:  0.060301344841718674
Testing batch 1 loss: 1.1258469820022583
Testing accuracy:  0.0

Testing epoch 2 last loss:  1.1258469820022583
Epoch:  3
Training batch 1 last loss: 0.06375452131032944

Training epoch 3 loss:  0.06375452131032944
Testing batch 1 loss: 1.1883957386016846
Testing accuracy:  0.0

Testing epoch 3 last loss:  1.1883957386016846


In [6]:
# Define a function to predict a new text input
def predict(text, model, tokenizer):
    # Tokenize the text (make sure to use padding and truncation)
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Move tokens to the same device as the model
    tokens = {k: v.to(device) for k, v in tokens.items()}

    # Put model in evaluation mode and make prediction
    model.eval()
    with torch.no_grad():
        outputs = model(**tokens)

    # Extract logits and find the predicted class
    logits = outputs.logits
    predicted_class = logits.argmax(dim=1).item()

    # Return predicted class
    return predicted_class

# Replace this with your input
text = "The effects can still be felt today"
prediction = predict(text, bert_model, tokenizer)

# If the class is 0, it is not COVID related and if it is 1, then it is COVID related.
print("Predicted class:", prediction)

Predicted class: 0
