In [None]:
!pip3 install torch torchvision
!pip3 install transformers
!pip3 install datasets
!pip3 install ipywidgets

In [1]:
from tqdm.notebook import tqdm
from datasets import load_dataset

dataset = load_dataset('imdb')

train_dataset = dataset['train'].shuffle(seed=1).select(range(100))
test_dataset = dataset['test'].shuffle(seed=1).select(range(100))

Found cached dataset imdb (/Users/jweix/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /Users/jweix/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-b30490d9bb1910a8.arrow
Loading cached shuffled indices for dataset at /Users/jweix/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-25a13d1ede6e3cb3.arrow


In [None]:
from collections import Counter

Counter(train_dataset['label'])

In [4]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# tokenize text input, 
encoded_input = tokenizer(train_dataset['text'], 
                          return_tensors='pt', 
                          padding='max_length', 
                          max_length=512, # pretrained berth expect max of 512 tokens
                          truncation=True)

# label does not need to be tokenized
# just covert it to tensor object
encoded_labels = torch.tensor(train_dataset['label'])


# import torch.nn.functional as F

# # Convert target tensor to one-hot encoded format
# target_one_hot = F.one_hot(encoded_labels, num_classes=2)
# target_one_hot

# wrap the train data and label into TensorDatasets
tensor_dataset = TensorDataset(encoded_input['input_ids'], 
                               encoded_input['attention_mask'], 
                               encoded_labels)

# split train dataset into multiple batch
train_loader = DataLoader(tensor_dataset, batch_size=16, shuffle=True)


# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Define loss function
loss_fn = torch.nn.CrossEntropyLoss()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
encoded_labels

In [5]:
# Train the model

# wrap range iterable in a tqdm object
training_cycles = tqdm(range(1)) 

for training_cycle in training_cycles:
    
    model.train()
    
    # wrap train_loader in a tqdm object
    train_loader_progress = tqdm(
        train_loader, 
        desc=f'Cycle {training_cycle:1d}', 
        leave=False, 
        disable=False)
    
    count = 0
    for batch in train_loader_progress:

        optimizer.zero_grad()

        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2].float()
        }

        outputs = model(**inputs)

        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
     
        print(count)
        train_loader_progress.set_postfix(batch = f'{count}')     
        
        count += 1

                        

  0%|          | 0/1 [00:00<?, ?it/s]

Cycle 0:   0%|          | 0/7 [00:00<?, ?it/s]

ValueError: Target size (torch.Size([16])) must be the same as input size (torch.Size([16, 2]))

In [None]:
train_dataset['text']

In [None]:
# making prediction

model.eval()

inputs = tokenizer(["a nice job by everyone concerned"], return_tensors="pt", padding=True)

print(inputs)
labels = torch.tensor([0])  # Batch size 1

outputs = model(**inputs, labels=labels) ## input is a dictionar, ** to pass it as kwargs

loss, logits = outputs[:2]

softmax_prob =torch.softmax(logits, dim=1)
print(loss)
print(logits)

In [None]:
from sklearn.metrics import accuracy_score

# Define data loader for testing data
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Evaluate the model
y_true = []
y_pred = []
model.eval()

for batch in test_loader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['label']
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    logits = outputs.logits
    _, predictions = torch.max(logits, dim=1)
    y_true += labels.tolist()
    y_pred += predictions.tolist()
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy:.2f}')


In [None]:
torch.max(torch.tensor([0.1,1.2]))

## Using pretrained model

In [None]:
model_untoned = BertForSequenceClassification.from_pretrained('bert-base-uncased')

In [None]:
inputs = tokenizer(["no, it is too late", "iam happy"], return_tensors="pt", padding=True)
print(inputs)
labels = torch.tensor([[0, 1]])  # Batch size 1

outputs = model_untoned(**inputs, labels=labels) ## input is a dictionar, ** to pass it as kwargs

loss, logits = outputs[:2]

softmax_prob =torch.softmax(logits, dim=1)
print(loss)
print(logits)

# Training example 2

In [None]:
import torch
import transformers

# Load the BERT tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

# Load the pre-trained BERT model
model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)


In [None]:
train_data = [
    ("The movie was great and I loved it!", "positive"),
    ("The movie was terrible and I hated it.", "negative"),
    ("The movie was okay but could have been better.", "neutral"),
    ("I'm not sure how I feel about the movie.", "neutral"),
    ("The movie was fantastic!", "positive"),
    ("I didn't like the movie at all.", "negative"),
    ("The movie was decent but not amazing.", "neutral")
]

label_map = {
    "positive": 0,
    "negative": 1,
    "neutral" : 2 
}


In [None]:
# Tokenize the training data
tokenized_train_data = [(tokenizer.encode_plus(review, add_special_tokens=True, max_length=128, padding='max_length', truncation=True), label) for review, label in train_data]

# Convert the tokenized training data to tensors
train_inputs = torch.tensor([tokenized_review['input_ids'] for tokenized_review, _ in tokenized_train_data])
train_attention_masks = torch.tensor([tokenized_review['attention_mask'] for tokenized_review, _ in tokenized_train_data])

# covert label to int representation
train_labels_int = [label_map[label] for _, label in tokenized_train_data]

# covert to one hot encoding formats
one_hot_labels = np.eye(len(label_map))[train_labels_int]

print(one_hot_labels)

train_labels = torch.tensor(one_hot_labels)


In [None]:
# Define the optimizer and learning rate
optimizer = transformers.AdamW(model.parameters(), lr=5e-5)

# Define the number of training epochs
num_epochs = 3

# Train the model
for epoch in range(num_epochs):
    # Set the model to training mode
    model.train()

    # Reset the loss for this epoch
    epoch_loss = 0.0

    # Iterate over the training data in batches
    for batch_inputs, batch_attention_masks, batch_labels in zip(train_inputs.split(8), train_attention_masks.split(8), train_labels.split(8)):
        # Zero the gradients for this batch
        optimizer.zero_grad()
        print(batch_inputs)
        print(batch_labels)
        # Forward pass through the model
        outputs = model(batch_inputs, attention_mask=batch_attention_masks, labels=batch_labels)

        # Calculate the loss for this batch
        loss = outputs.loss

        # Backward pass to calculate gradients
        loss.backward()

        # Update the parameters
        optimizer.step()

        # Accumulate the loss for this epoch
        epoch_loss += loss.item()

    # Print the loss for this epoch
    print("Epoch {}: Loss = {}".format(epoch + 1, epoch_loss))


In [None]:
model.eval()

# Define input text
text = "I loved it"

# Tokenize input text and convert to input IDs
inputs = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')

input_ids = inputs['input_ids']

# Define one-hot encoding for labels
one_hot_labels = torch.zeros((1, len(label_map)))
one_hot_labels[0][0] = 3  # set first label as positive

# Run model on input
outputs = model(input_ids=input_ids, labels=one_hot_labels)

# Get logits and convert to probabilities
logits = outputs[1]
probs = torch.nn.functional.softmax(logits, dim=-1)
print(outputs)
# Print probabilities for each label
for label, prob in zip(label_map.keys(), probs[0]):
    print(f"{label}: {prob:.4f}")

In [None]:
import numpy as np

# define labels
labels = ['positive', 'neutral', 'negative', 'negative']

# create a dictionary to map label strings to integers
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}

# convert labels to integer representation
label_ids = [label_map[label] for label in labels]

# perform one-hot encoding using numpy
one_hot_labels = np.eye(len(label_map))[label_ids]

print(one_hot_labels)


In [None]:
label_ids

In [None]:
np.eye(5)