### Credit to [Google Colab](https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/classification/BERT_Fine_Tuning_Sentence_Classification_v2.ipynb#scrollTo=JhUZO9vc_l6T)

In [1]:
import pandas as pd

import torch
from transformers import BertModel, BertTokenizer
import torch.nn as nn
from torch import optim

from transformers import BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = pd.read_parquet('train-00000-of-00001.parquet')
val = pd.read_parquet('validation-00000-of-00001.parquet')
test = pd.read_parquet('test-00000-of-00001.parquet')

In [3]:
df = pd.concat([train, val], ignore_index=True)

In [4]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [5]:
# Get the lists of sentences and their labels.
sentences = df['sentence'].values
labels = df['label'].values

In [6]:
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [7]:
# Print the original sentence.
print(' Original: ', sentences[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

 Original:  hide new secretions from the parental units 
Tokenized:  ['hide', 'new', 'secret', '##ions', 'from', 'the', 'parental', 'units']
Token IDs:  [5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197]


In [8]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []

# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'

                        # This function also supports truncation and conversion
                        # to pytorch tensors, but we need to do padding, so we
                        # can't use these features :( .
                        #max_length = 128,          # Truncate all sentences.
                        #return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  hide new secretions from the parental units 
Token IDs: [101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102]


In [9]:
print('Max sentence length: ', max([len(sen) for sen in input_ids]))

Max sentence length:  66


In [10]:
# We'll borrow the `pad_sequences` utility function to do this.
from keras.preprocessing.sequence import pad_sequences

# Set the maximum sequence length.
# I've chosen 64 somewhat arbitrarily. It's slightly larger than the
# maximum training sentence length of 47...
MAX_LEN = 72

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad our input tokens with value 0.
# "post" indicates that we want to pad and truncate at the end of the sequence,
# as opposed to the beginning.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post")

print('\nDone.')


Padding/truncating all sentences to 72 values...

Padding token: "[PAD]", ID: 0

Done.


In [11]:
# Create attention masks
attention_masks = []

# For each sentence...
for sent in input_ids:

    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]

    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

In [12]:
# Use train_test_split to split our data into train and validation sets for
# training
from sklearn.model_selection import train_test_split

# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels,
                                                            random_state=2018, test_size=0.1)
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
                                             random_state=2018, test_size=0.1)

In [13]:
# Convert all inputs and labels into torch tensors, the required datatype
# for our model.
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [14]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it
# here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.

batch_size = 32

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [15]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer

class BertForSentimentClassification(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', num_classes=2):
        super(BertForSentimentClassification, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.classifier(x)
        return logits

In [16]:
model = BertForSentimentClassification()

In [17]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
# I believe the 'W' stands for 'Weight Decay fix"
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(),
              lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
              eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
            )



In [18]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [19]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [20]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [21]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [22]:
def train_bert_sentiment_classifier(model,device,  num_epochs=4, learning_rate=2e-5):

    # Prepare optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Training loop
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            optimizer.zero_grad()
            outputs = model(b_input_ids,
                    attention_mask=b_input_mask
                    )
            loss = nn.CrossEntropyLoss()(outputs, b_labels)
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss:.4f}")

        # Evaluation
        model.eval()
        eval_accuracy = 0
        for batch in validation_dataloader:
            with torch.no_grad():

                b_input_ids = batch[0].to(device)
                b_input_mask = batch[1].to(device)
                b_labels = batch[2].to(device)

                outputs = model(b_input_ids,
                            attention_mask=b_input_mask
                            )
                predictions = torch.argmax(outputs, dim=-1)
                eval_accuracy += (predictions == b_labels).float().mean().item()

        eval_accuracy /= len(validation_dataloader)
        print(f"Validation Accuracy: {eval_accuracy:.4f}")

    return model

# Usage
trained_model = train_bert_sentiment_classifier(model,device)

Epoch 1/4: 100%|██████████| 1919/1919 [11:49<00:00,  2.70it/s]


Average training loss: 0.2177
Validation Accuracy: 0.9445


Epoch 2/4: 100%|██████████| 1919/1919 [11:51<00:00,  2.70it/s]


Average training loss: 0.1124
Validation Accuracy: 0.9485


Epoch 3/4: 100%|██████████| 1919/1919 [11:50<00:00,  2.70it/s]


Average training loss: 0.0720
Validation Accuracy: 0.9491


Epoch 4/4: 100%|██████████| 1919/1919 [11:50<00:00,  2.70it/s]


Average training loss: 0.0472
Validation Accuracy: 0.9498


In [23]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
import os
model_dir = '/content/drive/MyDrive/models/'
os.makedirs(os.path.dirname(model_dir), exist_ok=True)
filename = 'model.pt'

# Save the entire model
torch.save(trained_model, model_dir+filename)

In [28]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer

def predict_sentiment(model, text, device=None):
    """
    Make softmax predictions on input text using the trained BERT model.

    Args:
    - model: Trained BertForSentimentClassification model
    - text: Input text string
    - device: torch.device (if None, will use CUDA if available)

    Returns:
    - predictions: Dict containing softmax probabilities and predicted class
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.eval()
    model.to(device)

    # Load tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize input text
    encoded_input = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')

    # Move input to device
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    # Make prediction
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Apply softmax to get probabilities
    probabilities = F.softmax(outputs, dim=1)

    # Get predicted class
    _, predicted_class = torch.max(probabilities, dim=1)

    # Convert to numpy for easier handling
    probabilities = probabilities.cpu().numpy()[0]
    predicted_class = predicted_class.cpu().numpy()[0]

    # Create prediction dictionary
    predictions = {
            'negative': float(probabilities[0]),
            'positive': float(probabilities[1])
        }

    return predictions

# Usage example:
# Assuming you have a trained model
model = torch.load('model.pt', map_location=torch.device('cpu'))

# Make a prediction
text = """
A spate of new state laws, including on guns, minimum wage and gender transition care, went into effect as the calendar flipped to 2024. Perhaps the most significant change bans programs that promote diversity, equity and inclusion at publicly funded colleges and universities in Texas.
"""

result = predict_sentiment(model, text)
print(result)

{'negative': 0.9299237728118896, 'positive': 0.07007624208927155}
