## 1. Training


Specifying GPU as device.

In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('GPU(s) available: %d' % torch.cuda.device_count())

    print('Using GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using CPU.')
    device = torch.device("cpu")

GPU(s) available: 1
Using GPU: Tesla T4


In [2]:
import logging as logger
import json
import tqdm
import copy

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [4]:
#imports
import numpy as np
import random
import os
import pandas as pd
import time
import datetime
import csv
import math
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

Load and sample the data.

In [None]:
# path is the path of your dataset
# the directory needs to have:
# train.json
# valid.json
# test.json
# aggregated_soft_labels.pt
# train_out_ids.txt
# link to the original AGNews dataset - https://drive.google.com/drive/folders/1v55IKG2JN9fMtKJWU48B_5_DcPWGnpTq?usp=sharing
path = 'agnews_wLLM_5k/'

In [5]:
def load_json(path):
  with open( path, 'r') as jsonf:
    data = json.load(jsonf)

  return data

In [6]:
def _get_train_data( path, split = 'train') :

    data_path = path + split + ".json" # train.json
    ids4agglabels = split + "_out_ids" # train_out_ids.txt
    # train_out_ids.txt contains the IDs of texts for which atleast one Labeling function/LLM did not abstain
    # this is emitted by the label model - Snorkel
    ids_path = path + ids4agglabels + ".txt"

    train_ids = []
    with open(ids_path, 'r') as txtf:
      train_ids = txtf.read().split()
    print("len:", len(train_ids))

    logger.info(f'loading data from {data_path}')
    data = json.load(open(data_path, 'r'))
    ids = []
    labels = []
    weak_labels = []
    texts = []
    for i, id in enumerate(train_ids):
        # id is string
        ids.append(id)
        labels.append(int(data[id]['label']))
        weak_labels.append(data[id]['weak_labels'])
        texts.append(data[id]['data']['text'])

    return (ids, texts, labels, weak_labels)

In [7]:

def _get_valid_data( path, split = 'valid') :
    
    data_path = path + split + ".json"
    logger.info(f'loading data from {data_path}')
    data = json.load(open(data_path, 'r'))
    ids = []
    labels = []
    weak_labels = []
    texts = []
    for i, item in data.items():
        ids.append(i)
        labels.append(int(data[i]['label']))
        weak_labels.append(data[i]['weak_labels'])
        texts.append(data[i]['data']['text'])

    return (ids, texts, labels, weak_labels)

In [8]:
def _get_test_data( path, split = 'test') :
    
    data_path = path + split + ".json"
    logger.info(f'loading data from {data_path}')
    data = json.load(open(data_path, 'r'))
    ids = []
    labels = []
    weak_labels = []
    texts = []
    for i, item in data.items():
        ids.append(i)
        labels.append(int(data[i]['label']))
        weak_labels.append(data[i]['weak_labels'])
        texts.append(data[i]['data']['text'])

    return (ids, texts, labels, weak_labels)

Some tweets with labels 0(Against) and 1(Favor) each.

In [9]:
# path = 'agnews_wLLM_5k/'
train_ids, train_texts, train_labels, train_weak_labels  = _get_train_data(path)
valid_ids, valid_texts, valid_labels, valid_weak_labels  = _get_valid_data(path)
test_ids, test_texts, test_labels, test_weak_labels  = _get_test_data(path)
print(len(train_ids), len(train_texts), len(train_labels), len(train_weak_labels))
print(len(valid_ids), len(valid_texts), len(valid_labels), len(valid_weak_labels))
print(len(test_ids), len(test_texts), len(test_labels), len(test_weak_labels))

len: 3424
3424 3424 3424 3424
1000 1000 1000 1000
1000 1000 1000 1000


## Dataset Preparation
Tokenization & Padding/Truncation

In [10]:
# Load BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', 
                                           return_dict=True)

Loading BERT tokenizer...


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
# Encode input sentences and get tokenized sentences, correpsonding attention masks and labels.
def encode_everything(partition, sentences, labels):
    input_ids = []
    attention_masks = []
    agg_labels = None

    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
                          sent,
                          add_special_tokens = True, # Add cls, sep tokens
                          max_length = 128,           # pad/truncate to max_length
                          pad_to_max_length = True,
                          return_attention_mask = True,
                          return_tensors = 'pt',
                    )
      
        # input_ids - list of token-encoded sentences   
        input_ids.append(encoded_dict['input_ids'])
      
        # attention_masks - list of attention masks
        attention_masks.append(encoded_dict['attention_mask'])

    # creates corresponsing tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels, dtype=int)

    if partition == 'train':
        # code to load tensor of agg_labels - TODO @jagriti
        agg_labels_ = torch.load("agnews_wLLM_5k/aggregated_soft_labels.pt", map_location=device)
        agg_labels = torch.from_numpy(agg_labels_).to(device)
        # shape = (input_ids.size()[0], 4)

        # # initialize a random tensor of the given shape with float values
        # agg_labels = torch.randn(*shape, dtype=torch.float)
        # agg_labels = torch.nn.functional.softmax(agg_labels, dim=1)
        print("agg_labels:", agg_labels.shape)
        print("agg_labels:", agg_labels[0])
        print(type(agg_labels))

    # torch.tensor(input_ids.size[0],)

    # verify tensor dimensions are correct
    print("Input IDs tensor shape: " + str(input_ids.shape))
    print("Attention masks tensor shape: " + str(attention_masks.shape))
    print("Labels tensor shape: " + str(labels.shape))
    print("One Label: ", labels[0])

    return (input_ids, attention_masks, labels, agg_labels)
    # return None

# encode_everything('train', train_texts, train_labels)
train_input_ids, train_attention_masks, train_labels_, train_agg_labels = encode_everything('train', train_texts, train_labels)
valid_input_ids, valid_attention_masks, valid_labels_, _ = encode_everything('valid', valid_texts, valid_labels)
test_input_ids, test_attention_masks, test_labels_, _ = encode_everything('test', test_texts, test_labels)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


agg_labels: torch.Size([3424, 4])
agg_labels: tensor([0.0282, 0.0253, 0.9092, 0.0372], device='cuda:0', dtype=torch.float64)
<class 'torch.Tensor'>
Input IDs tensor shape: torch.Size([3424, 128])
Attention masks tensor shape: torch.Size([3424, 128])
Labels tensor shape: torch.Size([3424])
One Label:  tensor(2)
Input IDs tensor shape: torch.Size([1000, 128])
Attention masks tensor shape: torch.Size([1000, 128])
Labels tensor shape: torch.Size([1000])
One Label:  tensor(3)
Input IDs tensor shape: torch.Size([1000, 128])
Attention masks tensor shape: torch.Size([1000, 128])
Labels tensor shape: torch.Size([1000])
One Label:  tensor(2)




```
# This is formatted as code
```

# Train, val, test dataloaders


In [13]:
# Batch size = 16 to accommodate entire batch in GPU memory on colab
batch_size = 16

# print(type(train_input_ids), type(train_attention_masks), type(train_labels_), type(train_agg_labels))
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels_, train_agg_labels)
val_dataset = TensorDataset(valid_input_ids, valid_attention_masks, valid_labels_)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels_)

# Create the DataLoaders for training and val sets. 
train_dataloader = DataLoader(
            train_dataset,
            sampler = SequentialSampler(train_dataset), # Select batches randomly
            batch_size = batch_size
        )

val_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset), # Select batches sequentially.
            batch_size = batch_size
        )

test_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset), # Select batches sequentially.
            batch_size = batch_size
        )

<class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'>


Fine-tune COVIDTwitter-BERT and train classification model BertForSequenceClassification

In [14]:
# Load BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-cased",
                                                      num_labels=4,
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.cuda()

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

Hyperparameters tuned: 

**Learning Rate:** 5e-6 (Tried 1e-5, 1e-6, 5e-5, 5e-6)

**Batch Size:** 16 (Tried 32 as well but colab has GPU memory limitations)

**Epochs:** 10 (Tried 2,5,10,20)

**Warmup Steps:** 15 (Tried default 0, but 15 gives most optimal results)

Using AdamW as optimizer.
Rest everything is default.

In [15]:
epochs = 10
# epochs = 2
num_batches = len(train_dataloader)
num_val_batches = len(val_dataloader)
num_training_steps = epochs * num_batches # total training steps
# optimizer = AdamW(model.parameters(), eps = 1e-8, lr = 1e-5)
optimizer = AdamW(model.parameters(), eps = 1e-8, lr = 5e-6)
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps = 15,
    num_training_steps = num_training_steps # total training steps
)



In [16]:
import torch.nn.functional as F
from typing import Dict, Optional, Union

# loss function used by wrench during finetuning - https://github.com/JieyuZ2/wrench/blob/2647603e3718100cfdb18ec0b08edea80a3b5001/wrench/utils.py#L98
def cross_entropy_with_probs(
        input: torch.Tensor,
        target: torch.Tensor,
        weight: Optional[torch.Tensor] = None,
        reduction: str = "mean",
) -> torch.Tensor:
    """Calculate cross-entropy loss when targets are probabilities (floats), not ints.

    PyTorch's F.cross_entropy() method requires integer labels; it does accept
    probabilistic labels. We can, however, simulate such functionality with a for loop,
    calculating the loss contributed by each class and accumulating the results.
    Libraries such as keras do not require this workaround, as methods like
    "categorical_crossentropy" accept float labels natively.

    Note that the method signature is intentionally very similar to F.cross_entropy()
    so that it can be used as a drop-in replacement when target labels are changed from
    from a 1D tensor of ints to a 2D tensor of probabilities.

    Parameters
    ----------
    input
        A [num_points, num_classes] tensor of logits
    target
        A [num_points, num_classes] tensor of probabilistic target labels
    weight
        An optional [num_classes] array of weights to multiply the loss by per class
    reduction
        One of "none", "mean", "sum", indicating whether to return one loss per data
        point, the mean loss, or the sum of losses

    Returns
    -------
    torch.Tensor
        The calculated loss

    Raises
    ------
    ValueError
        If an invalid reduction keyword is submitted
    """
    if input.shape[1] == 1:
        input = input.squeeze()
        if target.ndim == 2:
            target = target[:, 1]
        return F.binary_cross_entropy_with_logits(input, target, weight=weight, reduction=reduction)
    else:

        if target.ndim == 1:
            return F.cross_entropy(input, target.long(), weight=weight, reduction=reduction)

        num_points, num_classes = input.shape
        # Note that t.new_zeros, t.new_full put tensor on same device as t
        cum_losses = input.new_zeros(num_points)
        for y in range(num_classes):
            target_temp = input.new_full((num_points,), y, dtype=torch.long)
            y_loss = F.cross_entropy(input, target_temp, reduction="none")
            if weight is not None:
                y_loss = y_loss * weight[y]
            cum_losses += target[:, y].float() * y_loss

    if reduction == "none":
        return cum_losses
    elif reduction == "mean":
        return cum_losses.mean()
    elif reduction == "sum":
        return cum_losses.sum()
    else:
        raise ValueError("Keyword 'reduction' must be one of ['none', 'mean', 'sum']")


In [17]:
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed = 20

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []
best_val_acc = 0.0
best_model = None

for epoch_i in range(0, epochs):
    print()
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print("Training Mode:")

    # Start time for epoch
    t_start = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Log progress every 20 batches.
        if step % 20 == 0 and not step == 0:
            training_time_elapsed = str(datetime.timedelta(seconds = math.floor(time.time() - t_start)))
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, num_batches, training_time_elapsed))

        # batch input ids, attention masks and labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        b_agg_labels = batch[3].to(device)

        # reset gradients
        model.zero_grad()        

        # forward pass
        result = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask,
                      #  labels=b_labels,
                       return_dict=True)
        
        logits = result.logits
        probs = F.softmax(logits, dim=1)
        loss = cross_entropy_with_probs(probs, b_agg_labels, reduction='none')
        loss = torch.mean(loss)

        # total training loss over batches
        total_loss += loss.item()
        loss.backward()

        # gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    # average loss over batches
    avg_train_loss = total_loss / num_batches           
    
    # Measure how long this epoch took.
    training_time = str(datetime.timedelta(seconds = math.floor(time.time() - t_start)))

    print()
    print("Average training loss: {0:.2f}".format(avg_train_loss))  #round to 2 decimal places
    print("Training epoch took: {:}".format(training_time))
        
    # Eval mode
    print()
    print("Validation Mode:")

    t_start = time.time()
    model.eval()

    # total_val_accuracy = 0
    total_val_loss = 0
    num_val_steps = 0

    predictions , pred_labels, true_labels = [], [], []
    for batch in val_dataloader:
        # batch input ids, attention masks, labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():        

            # forward pass
            result = model(b_input_ids, 
                           token_type_ids=None, 
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        # loss, logits
        logits = result.logits
        probs = F.softmax(logits, dim=1)
        loss = cross_entropy_with_probs(probs, b_labels, reduction='none')
        loss = torch.mean(loss)
        
        pred_labels_b = torch.argmax(logits, dim=1)
            
        # total val loss over batches
        total_val_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        pred_labels_b = pred_labels_b.to('cpu').numpy()
        
        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)
        pred_labels.append(pred_labels_b)

    print('Classification Results:')
    tp = 0 # true pos
    cnt = 0

    for j in range(len(true_labels)):
      for i in range(len(true_labels[j])):
        cnt += 1
        if true_labels[j][i] == pred_labels[j][i]:
          tp += 1

    acc = tp / cnt
    print()
    print()
    print("VAL STATS:")
    print("Accuracy: " + str(acc))
    if acc > best_val_acc:
      # SAVE better model
      print("SAVING BEST MODEL\n")
      best_val_acc = acc
      best_model = copy.deepcopy(model.state_dict())
    print()
    print()
        

    # avg val loss across batches.
    avg_val_loss = total_val_loss / num_val_batches
    
    # elapsed time
    val_time = str(datetime.timedelta(seconds = math.floor(time.time() - t_start)))
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(val_time))

print("Training complete.")


Training Mode:
  Batch    20  of    214.    Elapsed: 0:00:10.
  Batch    40  of    214.    Elapsed: 0:00:16.
  Batch    60  of    214.    Elapsed: 0:00:23.
  Batch    80  of    214.    Elapsed: 0:00:30.
  Batch   100  of    214.    Elapsed: 0:00:36.
  Batch   120  of    214.    Elapsed: 0:00:43.
  Batch   140  of    214.    Elapsed: 0:00:50.
  Batch   160  of    214.    Elapsed: 0:00:58.
  Batch   180  of    214.    Elapsed: 0:01:05.
  Batch   200  of    214.    Elapsed: 0:01:12.

Average training loss: 1.22
Training epoch took: 0:01:17

Validation Mode:
Classification Results:
[3 1 3 3 0 3 2 2 2 1 1 3 3 1 1 1] [3 1 0 3 0 2 2 2 2 1 1 3 3 1 1 1]
[1 1 3 0 2 2 2 2 2 2 2 2 2 2 3 3] [1 1 2 0 2 2 3 2 3 2 3 2 2 0 0 3]
[3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1] [2 3 3 3 3 3 2 3 3 1 1 1 0 1 1 1]
tp:  821 tn: 0 fp: 0 fn: 0


VAL STATS:
Accuracy: 0.821
SAVING BEST MODEL



  Validation Loss: 1.00
  Validation took: 0:00:07

Training Mode:
  Batch    20  of    214.    Elapsed: 0:00:06.
  Batch    40  of  

# Performance On Test Set

## Evaluate on test set


In [18]:
# Prediction on test set

# eval mode
print("LOADING BEST MODEL:\n")
model.load_state_dict(best_model)

model.eval()

predictions , pred_labels, true_labels = [], [], []

# Predict 
for batch in test_dataloader:
  # Add batch to GPU
  row = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = row
  
  with torch.no_grad():
      result = model(
          b_input_ids, 
          token_type_ids=None, 
          attention_mask=b_input_mask,
          return_dict=True
      )

  logits = result.logits
  pred_labels_b = torch.argmax(logits, dim=1) # get max value label as pred_class, no need of softmax here

  logits = logits.detach().cpu().numpy()
  pred_labels_b = pred_labels_b.to('cpu').numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)
  pred_labels.append(pred_labels_b)

print('Classification Results:')
tp = 0 # true pos
cnt = 0
for j in range(len(true_labels)):
  for i in range(len(true_labels[j])):
    if true_labels[j][i] == pred_labels[j][i]:
      tp += 1
    cnt += 1

acc = tp / cnt
print("Accuracy: " + str(acc))

LOADING BEST MODEL:

Classification Results:
Accuracy: 0.826


## Saving Model

In [19]:
# code in this cell is taken from huggingface run_glue.py
output_dir = './model_save/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

model_saved = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_saved.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


In [None]:
# Save model to google drive
# Mount Google Drive to this Notebook instance.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the model files to a directory in your Google Drive.
!cp -r ./model_save/ "./drive/My Drive/AGNEWS_BERT/"

## Loading saved model

In [None]:
# Save model to google drive
# Mount Google Drive to this Notebook instance.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
## Please run the first 3 cells before running this & below cells

# Load saved model & tokenizer from drive path
output_dir = "./drive/My Drive/AGNEWS_BERT/model_save"
model = BertForSequenceClassification.from_pretrained(output_dir)
model.to(device)
tokenizer = BertTokenizer.from_pretrained(output_dir)