In [1]:
import os
import re
import time
import torch
import random
import numpy as np
from tqdm import tqdm

from sklearn.utils import shuffle
from utils import load_json, dump_json
from transformers import BertTokenizer
from sklearn.metrics import f1_score, accuracy_score, roc_curve, auc
from torch.utils.tensorboard import SummaryWriter  
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# load data

In [2]:

datas = load_json("ps_sents_pair_with_label.json")

In [3]:
#shuffle datas
datas = shuffle(shuffle(datas,random_state=22),random_state=22)

In [4]:
train,test = train_test_split(datas,test_size=0.2, random_state=2022)
test,val = train_test_split(test,test_size=0.5, random_state=2022)

In [5]:
train[0]

{'sent1': "These achievements combined with academic progress resulted in the honour of signing the Head Teacher's Book, thirteen times.",
 'sent2': 'Since then I have been determined to not only help people in pain, I want to have the independence of making my own decisions and building friendly relationships with my clients.',
 'label': 0}

In [6]:
train1,train0,test1,test0,val1,val0=0,0,0,0,0,0


In [7]:
len(train),len(test),len(val)

(26870, 3359, 3359)

# Load the BERT tokenizer

In [8]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,cache_dir=os.path.join(os.getcwd(),"bert-base-uncased_cache"))


# find the maximum length

In [9]:

sents1 = [d["sent1"].lower() for d in datas]
sents2 = [d["sent2"].lower() for d in datas]
sents = list(set(sents1+sents2))
# Encode our concatenated data
encoded_tweets = [tokenizer.encode(sent, add_special_tokens=True) for sent in sents]
# Find the maximum length
max_len = max([len(sent) for sent in encoded_tweets])
print('Max length: ', max_len)

Max length:  141


In [10]:
# specify a MAX_LEN you like
MAX_LEN=150

In [11]:
def text_preprocessing(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """
    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# set device

In [12]:
def get_device():
    if torch.cuda.is_available():       
        device = torch.device("cuda")
        print(f'There are {torch.cuda.device_count()} GPU(s) available.')
        print('Device name:', torch.cuda.get_device_name(0))

    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")
    return device
device = get_device()

No GPU available, using the CPU instead.


# Create a function to tokenize a set of texts

In [13]:

def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #    (1) Tokenize the sentence
        #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #    (3) Truncate/Pad sentence to max length
        #    (4) Map tokens to their IDs
        #    (5) Create attention mask
        #    (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent["sent1"].lower()),#The first sequence to be encoded
            text_pair=text_preprocessing(sent["sent2"].lower()),# second sequence to be encoded
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                  # Max length to truncate/pad
            pad_to_max_length=True,         # Pad sentence to max length
#             return_tensors='pt',           # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

In [14]:
#Convert  labels to torch.Tensor
train_labels = torch.tensor([t["label"] for t in train])
test_labels = torch.tensor([t["label"] for t in test])
val_labels = torch.tensor([t["label"] for t in val])
train_labels,test_labels,val_labels

(tensor([0, 0, 0,  ..., 0, 0, 0]),
 tensor([0, 0, 0,  ..., 0, 0, 0]),
 tensor([0, 0, 0,  ..., 0, 0, 0]))

In [15]:
torch.count_nonzero(train_labels).item(),len(train_labels)-torch.count_nonzero(train_labels).item()

(5589, 21281)

In [16]:
torch.count_nonzero(test_labels).item(),len(test_labels)-torch.count_nonzero(test_labels).item()

(682, 2677)

In [17]:
torch.count_nonzero(val_labels).item(),len(val_labels)-torch.count_nonzero(val_labels).item()

(721, 2638)

In [18]:
# 
train_inputs, train_masks = preprocessing_for_bert(train)
test_inputs, test_masks = preprocessing_for_bert(test)
val_inputs, val_masks = preprocessing_for_bert(val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# Create PyTorch DataLoader

In [19]:
#Create PyTorch DataLoader

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 4

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Create BertClassifier

BERT-base consists of 12 transformer layers, each transformer layer takes in a list of token embeddings, and produces the same number of embeddings with the same hidden size (or dimensions) on the output. The output of the final transformer layer of the [CLS] token is used as the features of the sequence to feed a classifier.

The transformers library has the BertForSequenceClassification class which is designed for classification tasks. However, we will create a new class so we can specify our own choice of classifiers.

Below we will create a BertClassifier class with a BERT model to extract the last hidden layer of the [CLS] token and a single-hidden-layer feed-forward neural network as our classifier.

In [20]:
%%time
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 128, 2
        unfreeze_layers = ['layer.10','layer.11','bert.pooler','out.']
        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased',cache_dir=os.path.join(os.getcwd(),"bert-base-uncased_cache"))

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
#             nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
#             for name, param in self.bert.named_parameters():
#                 print(name,param.size())
 
#             print("*"*30)
#             print('\n')
 
            for name ,param in self.bert.named_parameters():
                param.requires_grad = False
                for ele in unfreeze_layers:
                    if ele in name:
                        param.requires_grad = True
                        break
#             #验证一下
#             for name, param in self.bert.named_parameters():
#                 if param.requires_grad:
#                     print(name,param.size())

        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

Wall time: 2.99 ms


# Optimizer & Learning Rate Scheduler

recommend following hyper-parameters
   - Batch size: 16 or 32       
   - Learning rate (Adam): 5e-5, 3e-5 or 2e-5  
   - Number of epochs: 2, 3, 4  

In [21]:
len_train_step = len(train_dataloader)
print('len_train_step: ',len_train_step)

def initialize_model(epochs=10,fp16=False,n_gpu=0,local_rank=-1):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=True)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=1e-6,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len_train_step * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(total_steps/2), # Default value
                                                num_training_steps=total_steps)
    #### Optional configuration
    if fp16:"Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
        try:
            from apex import amp
            fp16_opt_level = '01'
        except ImportError:
            raise ImportError("Please install apex from "
            "https://www.github.com/nvidia/apex to use fp16 training.")
        bert_classifier, optimizer = amp.initialize(
            bert_classifier, optimizer, opt_level=fp16_opt_level)
        "For fp16: Apex AMP optimization level "
        "selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html"
    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        bert_classifier = torch.nn.DataParallel(bert_classifier)
        
     # Distributed training (should be after apex fp16 initialization)
    if local_rank != -1:
        bert_classifier = torch.nn.parallel.DistributedDataParallel(
            bert_classifier, 
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True
            )
        
    return bert_classifier, optimizer, scheduler


len_train_step:  6718


# Training Loop

Training:

- Unpack our data from the dataloader and load the data onto the GPU
- Zero out gradients calculated in the previous pass
- Perform a forward pass to compute logits and loss
- Perform a backward pass to compute gradients (loss.backward())
- Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
- Update the model's parameters (optimizer.step())
- Update the learning rate (scheduler.step())

Evaluation:

- Unpack our data and load onto the GPU
- Forward pass
- Compute loss and accuracy rate over the validation set

In [22]:
writer = SummaryWriter('./log/lr=1e-6droupout=0.0warmup=0.5batchsize=4')

In [39]:
# Specify loss function
# loss_fn = nn.CrossEntropyLoss(reduction='mean',label_smoothing=0.05)
loss_fn = nn.CrossEntropyLoss(reduction='mean')
def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    one_cricle_length = len(train_dataloader)
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed/lr':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)
            #print('logits,b_labels')
            #print(logits)
            #print(b_labels)
            '''tensor([[-0.0491,  0.1918],
                       [-0.0062,  0.1743],
                       [-0.0856,  0.2257],
                       [-0.0044,  0.1629]], grad_fn=<AddmmBackward>)
                tensor([0, 1, 0, 0])
            '''
            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            #print(loss)
            #print(logits.shape,b_labels.shape,loss.shape)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()
#             b = 0.64
#             flood = (loss-b).abs()+b
#             flood.backward()
            
            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients",L2 norm fuc
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0, norm_type=2)

            # Update parameters and the learning rate
            op_params =optimizer.state_dict()['param_groups'][0]
            lr = op_params.get('lr')
            
            ########################################################################
            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                writer.add_scalar('lr=1e-6droupout=0.0warmup=0.5batchsize=4/train_loss_step', batch_loss / batch_counts, step + epoch_i*one_cricle_length)
                writer.add_scalar('lr=1e-6droupout=0.0warmup=0.5batchsize=4/train_lr_step', lr, step + epoch_i*one_cricle_length)

                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch
#                 writer.add_scalar('steploss/train_loss_step', batch_loss / batch_counts, step)
                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {lr}")
                
                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
            ###############################################################
            optimizer.step()
            scheduler.step()
        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_f1 = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            writer.add_scalar('lr=1e-6droupout=0.0warmup=0.5batchsize=4/train_loss', avg_train_loss, epoch_i)
            writer.add_scalar('lr=1e-6droupout=0.0warmup=0.5batchsize=4/val_loss', val_loss, epoch_i)
            writer.add_scalar('lr=1e-6droupout=0.0warmup=0.5batchsize=4val_f1', val_f1, epoch_i)
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_f1:^9.2f} | {time_elapsed:^15.6f}")
            print("-"*70)
        print("\n")
    writer.close()
    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []
    val_f1 = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        # accuracy = (preds == b_labels).cpu().numpy().mean() * 100
#         accuracy = (preds == b_labels).cpu().numpy().mean()
        f1 = f1_score(b_labels, preds, average='weighted')
        val_f1.append(f1)
#         val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_f1 = np.mean(val_f1)

    return val_loss, val_f1

# running

In [None]:
epochs=10
set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=epochs)
train(bert_classifier, train_dataloader, val_dataloader, epochs=epochs, evaluation=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  | Elapsed/lr
----------------------------------------------------------------------
   1    |   20    |   0.761838   |     -      |     -     | 5.954153021732658e-10
   1    |   40    |   0.763377   |     -      |     -     | 1.1908306043465317e-09
   1    |   60    |   0.777186   |     -      |     -     | 1.7862459065197973e-09
   1    |   80    |   0.765086   |     -      |     -     | 2.3816612086930634e-09
   1    |   100   |   0.750388   |     -      |     -     | 2.977076510866329e-09
   1    |   120   |   0.755693   |     -      |     -     | 3.5724918130395946e-09
   1    |   140   |   0.748822   |     -      |     -     | 4.167907115212861e-09
   1    |   160   |   0.762722   |     -      |     -     | 4.763322417386127e-09
   1    |   180   |   0.765131   |     -      |     -     | 5.358737719559392e-09
   1    |   200   |   0.765088   |     -      |     -     | 5.954153021732658e-09
   1    |   220

   1    |  1980   |   0.706651   |     -      |     -     | 5.894611491515332e-08
   1    |  2000   |   0.714023   |     -      |     -     | 5.954153021732658e-08
   1    |  2020   |   0.713647   |     -      |     -     | 6.013694551949984e-08
   1    |  2040   |   0.717290   |     -      |     -     | 6.073236082167311e-08
   1    |  2060   |   0.711208   |     -      |     -     | 6.132777612384638e-08
   1    |  2080   |   0.705910   |     -      |     -     | 6.192319142601964e-08
   1    |  2100   |   0.710987   |     -      |     -     | 6.251860672819291e-08
   1    |  2120   |   0.707816   |     -      |     -     | 6.311402203036618e-08
   1    |  2140   |   0.705626   |     -      |     -     | 6.370943733253945e-08
   1    |  2160   |   0.708541   |     -      |     -     | 6.43048526347127e-08
   1    |  2180   |   0.700679   |     -      |     -     | 6.490026793688599e-08
   1    |  2200   |   0.709295   |     -      |     -     | 6.549568323905924e-08
   1    |  2220  

   1    |  3980   |   0.663692   |     -      |     -     | 1.184876451324799e-07
   1    |  4000   |   0.591198   |     -      |     -     | 1.1908306043465316e-07
   1    |  4020   |   0.576435   |     -      |     -     | 1.1967847573682644e-07
   1    |  4040   |   0.611475   |     -      |     -     | 1.2027389103899968e-07
   1    |  4060   |   0.593397   |     -      |     -     | 1.2086930634117298e-07
   1    |  4080   |   0.579902   |     -      |     -     | 1.2146472164334622e-07
   1    |  4100   |   0.595288   |     -      |     -     | 1.220601369455195e-07
   1    |  4120   |   0.577261   |     -      |     -     | 1.2265555224769276e-07
   1    |  4140   |   0.619609   |     -      |     -     | 1.2325096754986603e-07
   1    |  4160   |   0.596826   |     -      |     -     | 1.2384638285203927e-07
   1    |  4180   |   0.618415   |     -      |     -     | 1.2444179815421257e-07
   1    |  4200   |   0.578730   |     -      |     -     | 1.2503721345638581e-07
   1  

   1    |  5960   |   0.520951   |     -      |     -     | 1.7743376004763322e-07
   1    |  5980   |   0.499683   |     -      |     -     | 1.780291753498065e-07
   1    |  6000   |   0.510507   |     -      |     -     | 1.7862459065197973e-07
   1    |  6020   |   0.538781   |     -      |     -     | 1.7922000595415303e-07
   1    |  6040   |   0.494235   |     -      |     -     | 1.798154212563263e-07
   1    |  6060   |   0.541044   |     -      |     -     | 1.8041083655849954e-07
   1    |  6080   |   0.488781   |     -      |     -     | 1.810062518606728e-07
   1    |  6100   |   0.610495   |     -      |     -     | 1.8160166716284608e-07
   1    |  6120   |   0.488126   |     -      |     -     | 1.8219708246501932e-07
   1    |  6140   |   0.479988   |     -      |     -     | 1.8279249776719262e-07
   1    |  6160   |   0.501548   |     -      |     -     | 1.8338791306936589e-07
   1    |  6180   |   0.506272   |     -      |     -     | 1.8398332837153916e-07
   1   

   2    |  1140   |   0.581667   |     -      |     -     | 2.3393867222387614e-07
   2    |  1160   |   0.475587   |     -      |     -     | 2.3453408752604941e-07
   2    |  1180   |   0.522047   |     -      |     -     | 2.3512950282822266e-07
   2    |  1200   |   0.572367   |     -      |     -     | 2.3572491813039593e-07
   2    |  1220   |   0.492985   |     -      |     -     | 2.3632033343256922e-07
   2    |  1240   |   0.563165   |     -      |     -     | 2.3691574873474247e-07
   2    |  1260   |   0.490675   |     -      |     -     | 2.3751116403691574e-07
   2    |  1280   |   0.496382   |     -      |     -     | 2.38106579339089e-07
   2    |  1300   |   0.487547   |     -      |     -     | 2.387019946412623e-07
   2    |  1320   |   0.548495   |     -      |     -     | 2.392974099434355e-07
   2    |  1340   |   0.563930   |     -      |     -     | 2.398928252456088e-07
   2    |  1360   |   0.419023   |     -      |     -     | 2.4048824054778206e-07
   2    |

In [None]:

#saving a checkpoint assuming the network class named ClassNet
checkpoint={'modle':bert_classifier,
             'model_state_dict':bert_classifier.state_dict(),
             'optimize_state_dict':optimizer.state_dict(),
             'epoch':epochs}
torch.save(checkpoint,'checkoutpoints/checkpoint_epoch_10_lr_1e-6_dropout_0_warmupstep_0.5_seed_42.pkl')

In [None]:
def load_checkpoint(filepath,optimizer):
    checkpoint = torch.load(filepath)
    model=checkpoint['modle']#提前网络结构
    model.load_state_dict(checkpoint['model_state_dict'])#加载网络权重参数
    optimizer=optimizer
    optimizer.load_state_dict(checkpoint['optimize_state_dict'])#加载优化器参数
    for parameter in model.parameters():
        parameter.requires_grad=False
    model.eval()
    return model
modle=load_checkpoint('checkoutpoints/checkpoint_epoch_10_lr_1e-6_dropout_0_warmupstep_0.5_seed_42.pkl',optimizer)

# Evaluation on Validation Set

In [None]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits)
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return probs

In [None]:
from sklearn.metrics import accuracy_score, roc_curve, auc
import matplotlib.pyplot as plt
def evaluate_roc(probs, y_true):
    """
    - Print AUC and accuracy on the test set
    - Plot ROC
    @params    probs (np.array): an array of predicted probabilities with shape (len(y_true), 2)
    @params    y_true (np.array): an array of the true values with shape (len(y_true),)
    """
    preds = probs[:, 1]
    fpr, tpr, threshold = roc_curve(y_true, preds)
    roc_auc = auc(fpr, tpr)
    print(f'AUC: {roc_auc:.4f}')
       
    # Get accuracy over the test set
    y_pred = np.where(preds >= 0.5, 1, 0)
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy*100:.2f}%')
    
    # Plot ROC AUC
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc) 
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [None]:
# Compute predicted probabilities on the test set
probs = bert_predict(modle, val_dataloader)

# Evaluate the Bert classifier
evaluate_roc(probs, test_labels)