In [2]:
# Libraries needed, may need to do a "pip3 install [package-name]"
# To install jupyer lab: "pip3 install jupyterlab"
# "pip3 install jupyterlab torch transformers pandas scikit-learn"
# Launching jupyter lab: "jupyter lab"
import torch
import pandas as pd
from torch.optim import AdamW
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification

In [1]:
# Splitting the data and tokenizing it
data = pd.read_csv("data/Train.csv")
print(data.head())

X_train, X_test, Y_train, Y_test = train_test_split(data['text'], data['target'], train_size=0.2, shuffle=True)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

train_tokens = tokenizer(list(X_train), padding=True, truncation=True)
test_tokens = tokenizer(list(X_test), padding=True, truncation=True)

NameError: name 'pd' is not defined

In [None]:
# This class allows the data to be formatted correctly with the DataLoader in PyTorch.
# We don't need this, but it simplfies the process a lot.
# DataLoader is a utility that simplifies loading and managing datasets, especially large ones.
# DataLoader allows us to use batches, which means we can group multiple inputs together instead of
# doing this one by one which is incredibly time saving.
class TokenData(Dataset):
    def __init__(self, train = False):
        if train:
            self.text_data = X_train
            self.tokens = train_tokens
            self.labels = list(Y_train)
        else:
            self.text_data = X_test
            self.tokens = test_tokens
            self.labels = list(Y_test)

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        sample = {}
        for k, v in self.tokens.items():
            sample[k] = torch.tensor(v[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Loading the DataLoader, Model, Optimizer, and Loss function
batch_size = 8

train_dataset = TokenData(train=True)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

test_dataset = TokenData(train=False)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

bert_model = BertForSequenceClassification.from_pretrained("bert-base-cased")
optimizer = AdamW(bert_model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

num_epochs = 1
device = "cpu"
bert_model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
for epoch in range(num_epochs):
    print("Epoch: ",(epoch + 1))
    # Sets the model to train mode.
    bert_model.train()
    # Here, we iterate over each batch in the train_loader dataset.
    for i,batch in enumerate(train_loader):
        # Move each batch to the CPU.
        batch = {k: v.to(device) for k, v in batch.items()}
        # We reset the gradients from the previous step before setting them for the current step.
        optimizer.zero_grad()

        # <-- TODO --->

        outputs = bert_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        pred = outputs.logits
        loss = loss_fn(pred, batch['labels'])
        loss.backward()
        optimizer.step()

        # <------------>

        # Calculating the running loss for logging purposes
        train_batch_loss = loss.item()
        train_last_loss = train_batch_loss / batch_size
        print('Training batch {} last loss: {}'.format(i + 1, train_last_loss))

    # Logging epoch-wise training loss
    print(f"\nTraining epoch {epoch + 1} loss: ",train_last_loss)
    # TRAINING BLOCK ENDS

    # Set the model to eval() mode.
    bert_model.eval()
    correct = 0
    test_pred = []

    # Testing the accuracy of our code on the test data
    for i, batch in enumerate(test_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        # We don't need gradients for testing
        with torch.no_grad():
            outputs = bert_model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])

        # <!-- TODO -->

        logits = outputs.logits
        loss = loss_fn(logits, batch['labels'])
        test_batch_loss = loss.item()
        test_last_loss = test_batch_loss / batch_size

        # <----------->
        print('Testing batch {} loss: {}'.format(i + 1, test_last_loss))

        # Comparing the predicted target with the labels in the batch
        correct += (logits.argmax(1) == batch['labels']).sum().item()
        print("Testing accuracy: ",correct/((i + 1) * batch_size))

    print(f"\nTesting epoch {epoch + 1} last loss: ",test_last_loss)

Epoch:  1
Training batch 1 last loss: 0.08928822726011276
Training batch 2 last loss: 0.08651192486286163
Training batch 3 last loss: 0.0942913293838501
Training batch 4 last loss: 0.0868922546505928
Training batch 5 last loss: 0.08840437233448029
Training batch 6 last loss: 0.08826050162315369
Training batch 7 last loss: 0.08297395706176758
Training batch 8 last loss: 0.08706354349851608
Training batch 9 last loss: 0.08159544318914413
Training batch 10 last loss: 0.08254213631153107
Training batch 11 last loss: 0.08098576217889786
Training batch 12 last loss: 0.08731159567832947
Training batch 13 last loss: 0.0872943177819252
Training batch 14 last loss: 0.08410470932722092
Training batch 15 last loss: 0.07968933135271072
Training batch 16 last loss: 0.08732444047927856
Training batch 17 last loss: 0.08102112263441086
Training batch 18 last loss: 0.0789869949221611
Training batch 19 last loss: 0.091135673224926
Training batch 20 last loss: 0.0795910432934761
Training batch 21 last los

In [None]:
# Define a function to predict a new text input
def predict(text, model, tokenizer):
    # Tokenize the text (make sure to use padding and truncation)
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Move tokens to the same device as the model
    tokens = {k: v.to(device) for k, v in tokens.items()}

    # Put model in evaluation mode and make prediction
    model.eval()
    with torch.no_grad():
        outputs = model(**tokens)

    # Extract logits and find the predicted class
    logits = outputs.logits
    predicted_class = logits.argmax(dim=1).item()

    # Return predicted class
    return predicted_class

# Replace this with your input
text = "The effects can still be felt today"
prediction = predict(text, bert_model, tokenizer)

# If the class is 0, it is not COVID related and if it is 1, then it is COVID related.
print("Predicted class:", prediction)

NameError: name 'bert_model' is not defined