In [None]:
from google.colab import drive

# Mount at the absolute path that the checkpoint-saving cell writes to
# ("/content/drive/MyDrive/..."), so the notebook does not depend on the
# kernel's working directory being /content.
drive.mount("/content/drive")

Mounted at ./drive


In [None]:
#REFERENCE: https://towardsdatascience.com/fine-tuning-bert-for-text-classification-54e7df642894

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.4 MB/s[0m eta [36m0:00:0

In [None]:
import random

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import trange
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# Load the labelled sentences and discard the "Unnamed: 0" column in one expression.
df = pd.read_csv("/content/classif23classes.csv").drop(columns=["Unnamed: 0"])
df

Unnamed: 0,text,label
0,cell1 should be greater than cl2,0
1,zdl should be greater than yka5,0
2,oS1 should be less or equal to d2,3
3,dZS1 should be less or equal to gf4,3
4,t1 should be greater or equal to c2,2
...,...,...
1173,zhte8 should not be equal to 35.,23
1174,MpD must not be equal to 25.,23
1175,LZR should not be equal to 40.,23
1176,p3 should not be equal to 70.,23


In [None]:
# Hold out 20% of the rows as the final test set; the fixed seed keeps the sample reproducible.
test = df.sample(random_state=1, frac=0.2)
test

Unnamed: 0,text,label
660,Ensure a distance greater than 27 is maintaine...,12
548,The distance between fktvi99 and cdko00 should...,10
1177,u90 must not be equal to 750.,23
321,Audrey should not be equal to Aurora.,5
1053,u needs to be lower than or equal to 65.,21
...,...,...
495,The spatial distinction between Zc7 and Ybn6 m...,11
785,The magnitude of the difference between Jg7rN ...,14
560,The separation between Hf45 and fer4l needs to...,10
767,The separation between Tm4aB and Gp2kH must eq...,16


In [None]:
# Remove the held-out rows so that training/validation never sees them.
# Re-assigning (instead of dropping in place) with errors="ignore" keeps the
# cell idempotent: re-running it no longer raises KeyError for rows that are
# already gone.
df = df.drop(index=test.index, errors="ignore")

In [None]:
# Sentences and their integer class ids, taken from the remaining (non-test) rows.
text, labels = df["text"].values, df["label"].values

In [None]:
# Tokenizer for the same 'bert-base-uncased' checkpoint that is fine-tuned below;
# inputs are lower-cased before tokenization.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def print_rand_sentence():
  '''Prints a table of the tokens and token IDs of one randomly chosen sample of `text`.'''
  sample = text[random.randint(0, len(text)-1)]
  # Tokenize once and derive the ids from those same tokens.
  sample_tokens = tokenizer.tokenize(sample)
  sample_ids = tokenizer.convert_tokens_to_ids(sample_tokens)
  # One row per token: (token, id).
  rows = np.array([sample_tokens, sample_ids]).T
  print(tabulate(rows, headers = ['Tokens', 'Token IDs'], tablefmt = 'fancy_grid'))

print_rand_sentence()

╒══════════╤═════════════╕
│ Tokens   │   Token IDs │
╞══════════╪═════════════╡
│ the      │        1996 │
├──────────┼─────────────┤
│ iq       │       26264 │
├──────────┼─────────────┤
│ score    │        3556 │
├──────────┼─────────────┤
│ of       │        1997 │
├──────────┼─────────────┤
│ student  │        3076 │
├──────────┼─────────────┤
│ ##3      │        2509 │
├──────────┼─────────────┤
│ should   │        2323 │
├──────────┼─────────────┤
│ be       │        2022 │
├──────────┼─────────────┤
│ higher   │        3020 │
├──────────┼─────────────┤
│ than     │        2084 │
├──────────┼─────────────┤
│ the      │        1996 │
├──────────┼─────────────┤
│ iq       │       26264 │
├──────────┼─────────────┤
│ score    │        3556 │
├──────────┼─────────────┤
│ of       │        1997 │
├──────────┼─────────────┤
│ student  │        3076 │
├──────────┼─────────────┤
│ ##7      │        2581 │
├──────────┼─────────────┤
│ .        │        1012 │
╘══════════╧═════════════╛


In [None]:
def preprocessing(input_text, tokenizer, max_length = 128):
  '''
  Encodes one sentence for BERT.

  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).

  Every sentence is padded, and explicitly truncated if necessary, to exactly
  `max_length` tokens so that the per-sentence tensors can be concatenated.
  `padding = 'max_length'` replaces the deprecated `pad_to_max_length = True`,
  and `truncation = True` states the truncation the tokenizer otherwise only
  applies implicitly (with a warning).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = max_length,
                        padding = 'max_length',
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


# Encode every training sentence, then stack the per-sentence tensors along dim 0.
encodings = [preprocessing(sample, tokenizer) for sample in text]

token_id = torch.cat([enc['input_ids'] for enc in encodings], dim = 0)
attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim = 0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
def print_rand_sentence_encoding():
  '''Displays tokens, token IDs and attention mask of a random encoded text sample'''
  index = random.randint(0, len(text) - 1)
  token_ids = token_id[index].tolist()
  attention = attention_masks[index].tolist()
  # Map each id straight back to its vocabulary token. This yields exactly one
  # token per id, whereas decoding to a string and re-tokenizing it is not
  # guaranteed to reproduce the same token sequence.
  tokens = tokenizer.convert_ids_to_tokens(token_ids)

  # One row per position: (token, id, mask).
  table = list(zip(tokens, token_ids, attention))
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

╒══════════╤═════════════╤══════════════════╕
│ Tokens   │   Token IDs │   Attention Mask │
╞══════════╪═════════════╪══════════════════╡
│ [CLS]    │         101 │                1 │
├──────────┼─────────────┼──────────────────┤
│ the      │        1996 │                1 │
├──────────┼─────────────┼──────────────────┤
│ absolute │        7619 │                1 │
├──────────┼─────────────┼──────────────────┤
│ di       │        4487 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ##spar   │       27694 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ##ity    │        3012 │                1 │
├──────────┼─────────────┼──────────────────┤
│ between  │        2090 │                1 │
├──────────┼─────────────┼──────────────────┤
│ cb       │       17324 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ##9      │        2683 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ##v      │        2615 │        

In [None]:
val_ratio = 0.22
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16

# Indices of the train and validation splits stratified by labels.
# A fixed random_state makes the split identical on every run, so validation
# numbers are comparable between runs.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state = 1)

# Train and validation sets: (input ids, attention mask, label) per example
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Prepare DataLoader: training batches are drawn in random order,
# validation batches in their stored order.
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [None]:
def b_tp(preds, labels, positive = 1):
  '''Returns True Positives (TP): samples predicted as `positive` whose actual class is `positive`'''
  preds, labels = np.asarray(preds), np.asarray(labels)
  return int(np.sum((preds == positive) & (labels == positive)))

def b_fp(preds, labels, positive = 1):
  '''Returns False Positives (FP): samples predicted as `positive` whose actual class is a different one'''
  preds, labels = np.asarray(preds), np.asarray(labels)
  return int(np.sum((preds == positive) & (labels != positive)))

def b_tn(preds, labels, positive = 1):
  '''Returns True Negatives (TN): samples not predicted as `positive` whose actual class is not `positive`'''
  preds, labels = np.asarray(preds), np.asarray(labels)
  return int(np.sum((preds != positive) & (labels != positive)))

def b_fn(preds, labels, positive = 1):
  '''Returns False Negatives (FN): samples not predicted as `positive` whose actual class is `positive`'''
  preds, labels = np.asarray(preds), np.asarray(labels)
  return int(np.sum((preds != positive) & (labels == positive)))

def b_metrics(preds, labels, positive = 1):
  '''
  Returns the following metrics for a batch:
    - accuracy    = (number of samples with predicted class == actual class) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)

  `preds` holds one row of class scores (logits) per sample; the predicted
  class is the argmax of each row. `labels` holds the actual class ids.

  Accuracy is computed over all classes. It used to be (TP + TN) / N, which
  only counts correct predictions of classes 0 and 1 and therefore
  under-reports accuracy for a model with more than two classes.

  Precision, recall and specificity are one-vs-rest for the class `positive`
  (default 1); with 0/1 labels this is the usual binary definition. A metric
  whose denominator is zero is returned as the string 'nan'.
  '''
  preds = np.argmax(preds, axis = 1).flatten()
  labels = np.asarray(labels).flatten()
  tp = b_tp(preds, labels, positive)
  tn = b_tn(preds, labels, positive)
  fp = b_fp(preds, labels, positive)
  fn = b_fn(preds, labels, positive)
  b_accuracy = int(np.sum(preds == labels)) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity

In [None]:
# Seed PyTorch first: the classification head is newly initialised (it is not
# part of the pre-trained checkpoint), so without a seed every run starts from
# different weights.
torch.manual_seed(1)

# Load the BertForSequenceClassification model with one output per class id (0..23)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 24,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 5e-5,
                              eps = 1e-08
                              )

# Run on GPU when one is available and fall back to CPU otherwise, instead of
# failing on a CPU-only runtime as an unconditional model.cuda() does.
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# No-op when the model already lives on `device`; guarantees that the model and
# the batches moved to `device` below are on the same device.
model.to(device)

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 4

for epoch in trange(1, epochs + 1, desc = 'Epoch'):
    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables: summed batch loss and number of optimisation steps
    tr_loss = 0
    nb_tr_steps = 0

    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass (passing labels makes the model return the loss)
        train_output = model(b_input_ids,
                             token_type_ids = None,
                             attention_mask = b_input_mask,
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables: one entry per validation batch
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids,
                              token_type_ids = None,
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    # Report the mean of the per-batch values; a metric that was undefined in
    # every batch is shown as NaN.
    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')

    # Checkpoint after the final epoch. Comparing against `epochs` (rather than
    # a hard-coded 4) keeps the save working when the epoch count is changed.
    if epoch == epochs:
      torch.save({
              'epoch': epoch,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(),
              'loss': tr_loss / nb_tr_steps,
              }, "/content/drive/MyDrive/BertForClassifforCSP/classif23classes.ckpt")

Epoch:  25%|██▌       | 1/4 [00:16<00:50, 16.86s/it]


	 - Train loss: 3.1453
	 - Validation Accuracy: 0.0000
	 - Validation Precision: NaN
	 - Validation Recall: NaN
	 - Validation Specificity: NaN


Epoch:  50%|█████     | 2/4 [00:33<00:33, 16.71s/it]


	 - Train loss: 2.5252
	 - Validation Accuracy: 0.0673
	 - Validation Precision: 0.8000
	 - Validation Recall: 0.5000
	 - Validation Specificity: 0.8750



Epoch:  75%|███████▌  | 3/4 [00:50<00:16, 16.98s/it]


	 - Train loss: 1.4309
	 - Validation Accuracy: 0.0962
	 - Validation Precision: 1.0000
	 - Validation Recall: 1.0000
	 - Validation Specificity: 1.0000


	 - Train loss: 0.6621
	 - Validation Accuracy: 0.0962
	 - Validation Precision: 1.0000
	 - Validation Recall: 1.0000
	 - Validation Specificity: 1.0000



Epoch: 100%|██████████| 4/4 [01:34<00:00, 23.53s/it]


## Inference on a single sentence

In [None]:
new_sentence = "the distance from X to Y should be different than the distance between C and B"

# Class id -> constraint type; the position in the list is the class id (0..23).
# NOTE(review): the if/elif chain this replaces mapped both 18 and 21 to
# "UNARY_le". 18 is taken to be "UNARY_gt" by analogy with the gt/lt/ge/le/eq/ne
# order of the other three families — confirm against the dataset's label encoding.
CLASS_NAMES = [
    "BINARY_gt", "BINARY_lt", "BINARY_ge", "BINARY_le", "BINARY_eq", "BINARY_ne",
    "BINARYDIST_gt", "BINARYDIST_lt", "BINARYDIST_ge", "BINARYDIST_le", "BINARYDIST_eq", "BINARYDIST_ne",
    "UNARYDIST_gt", "UNARYDIST_lt", "UNARYDIST_ge", "UNARYDIST_le", "UNARYDIST_eq", "UNARYDIST_ne",
    "UNARY_gt", "UNARY_lt", "UNARY_ge", "UNARY_le", "UNARY_eq", "UNARY_ne",
]

# We need Token IDs and Attention Mask for inference on the new sentence
encoding = preprocessing(new_sentence, tokenizer)
test_ids = encoding['input_ids']
test_attention_mask = encoding['attention_mask']

# Make sure dropout is disabled even if the model was last left in training mode
model.eval()

# Forward pass, calculate logit predictions
with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

# Index of the highest logit = predicted class id (computed once)
predicted_id = output.logits.argmax(dim = -1).item()
prediction = CLASS_NAMES[predicted_id] if 0 <= predicted_id < len(CLASS_NAMES) else "UNKNOWN"

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

Input Sentence:  the distance from X to Y should be different than the distance between C and B
Predicted Class:  BINARYDIST_ne




## Evaluation on the held-out test set

In [None]:
# Sentences and true class ids of the held-out test rows.
text_test, labels_test = test["text"].values, test["label"].values

In [None]:
# Ground-truth class ids of the held-out test sentences, aliased as `y_test`
# for comparison with `y_pred` when computing accuracy.
y_test = labels_test

In [None]:
# Make sure dropout is disabled: the predictions must not depend on whether
# the model was last left in training or evaluation mode.
model.eval()

# Predicted class id for every held-out test sentence, in the order of `text_test`
y_pred = []

for sentence in text_test:
  # Token IDs and Attention Mask for this sentence
  encoding = preprocessing(sentence, tokenizer)

  # Forward pass, calculate logit predictions
  with torch.no_grad():
    output = model(encoding['input_ids'].to(device),
                   token_type_ids = None,
                   attention_mask = encoding['attention_mask'].to(device))

  # Index of the highest logit = predicted class id
  y_pred.append(output.logits.argmax(dim = -1).item())




In [None]:
# Preview only the first predictions instead of dumping the whole list.
y_pred[:20]

In [None]:
# Fraction of held-out test sentences whose predicted class id equals the true label.
# (`accuracy_score` is imported in the notebook's import cell.)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.9703389830508474
