# Text Classification


## Dependencies

In [1]:
!pip install transformers

import torch
from torch.utils.data import (
    TensorDataset,
    DataLoader,
    RandomSampler,
    SequentialSampler
)

from transformers import (
    BertTokenizer,
    BertForSequenceClassification
)

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.2-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.5/268.5 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m119.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m83.2 MB/s[0m eta [36m0:00:

## Data

In [2]:
!wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'

--2023-07-05 18:54:30--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip’

smsspamcollection.z     [  <=>               ] 198.65K   408KB/s    in 0.5s    

2023-07-05 18:54:31 (408 KB/s) - ‘smsspamcollection.zip’ saved [203415]



In [3]:
!unzip -o smsspamcollection.zip

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [4]:
file_path = '/content/SMSSpamCollection'

# Read file into a DataFrame directly
df = pd.read_csv(file_path, sep='\t', names=['label', 'text'])

# Convert labels from 'spam'/'ham' to binary values
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
text = df.loc[:, "text"].values
labels = df.loc[:, "label"].values
print(text)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 ... 'Pity, * was in mood for that. So...any other suggestions?'
 "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"
 'Rofl. Its true to its name']


## Preprocessing

In [6]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
    )

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
def print_rand_sentence():
  '''Displays the tokens and respective IDs of a random text sample'''
  selected_text = random.choice(text)
  tokens = tokenizer.tokenize(selected_text)
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  table = list(zip(tokens, token_ids))
  print(tabulate(table, headers = ['Tokens', 'Token IDs'], tablefmt = 'fancy_grid'))

print_rand_sentence()

╒══════════╤═════════════╕
│ Tokens   │   Token IDs │
╞══════════╪═════════════╡
│ yu       │        9805 │
├──────────┼─────────────┤
│ ##p      │        2361 │
├──────────┼─────────────┤
│ .        │        1012 │
├──────────┼─────────────┤
│ .        │        1012 │
├──────────┼─────────────┤
│ .        │        1012 │
├──────────┼─────────────┤
│ hey      │        4931 │
├──────────┼─────────────┤
│ then     │        2059 │
├──────────┼─────────────┤
│ one      │        2028 │
├──────────┼─────────────┤
│ day      │        2154 │
├──────────┼─────────────┤
│ on       │        2006 │
├──────────┼─────────────┤
│ fr       │       10424 │
├──────────┼─────────────┤
│ ##i      │        2072 │
├──────────┼─────────────┤
│ we       │        2057 │
├──────────┼─────────────┤
│ can      │        2064 │
├──────────┼─────────────┤
│ ask      │        3198 │
├──────────┼─────────────┤
│ mi       │        2771 │
├──────────┼─────────────┤
│ ##wa     │        4213 │
├──────────┼─────────────┤
│

In [8]:
def preprocessing(input_text, tokenizer):
    preprocess = tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 32,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )
    return preprocess



token_id, attention_masks = [], []

for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids'])
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [9]:
print(token_id[np.random.randint(0, len(token_id)-1)]) # 101 = CLS; 102 = SEP

tensor([  101,  2196,  7499,  1037,  2154,  1999, 24471,  2166,  1012,  2204,
         2420,  2507,  1057,  8404,  1012,  2919,  2420,  2507,  1057,  3325,
         1012,  2119,  2024,  6827,  1999,  2166,   999,  2035,  2024,  5932,
        24618,   102])


In [10]:
def print_rand_sentence_encoding():
    '''Displays tokens, token IDs and attention mask of a random text sample'''
    index = random.randint(0, len(text) - 1)

    tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
    token_ids = token_id[index].tolist()
    attention = attention_masks[index].tolist()

    table = list(zip(tokens, token_id, attention))
    print(tabulate(table,
                   headers=['Tokens', 'Token IDs', 'Attention Mask'],
                   tablefmt='fancy_grid'))

print_rand_sentence_encoding()

╒══════════╤═════════════════════════════════════════════════════════════════════════════════╤══════════════════╕
│ Tokens   │ Token IDs                                                                       │   Attention Mask │
╞══════════╪═════════════════════════════════════════════════════════════════════════════════╪══════════════════╡
│ [CLS]    │ tensor([  101,  2175,  2127, 18414, 17583,  2391,  1010,  4689,  1012,  1012,   │                1 │
│          │          2800,  2069,  1999, 11829,  2483,  1050,  2307,  2088,  2474,  1041,   │                  │
│          │         28305,  1012,  1012,  1012, 25022,  2638,  2045,  2288, 26297, 28194,   │                  │
│          │          1012,   102])                                                          │                  │
├──────────┼─────────────────────────────────────────────────────────────────────────────────┼──────────────────┤
│ lo       │ tensor([  101,  7929,  2474,  2099,  1012,  1012,  1012, 16644, 15536,  254

## Data split

In [11]:
def create_dataloader(features, masks, labels, indices, batch_size):
    """Create DataLoader from given features, masks, and labels."""
    dataset = TensorDataset(features[indices], masks[indices], labels[indices])
    if indices[0] == 0:  # Training set, use RandomSampler
        sampler = RandomSampler(dataset)
    else:  # Validation set, use SequentialSampler
        sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataloader

val_ratio = 0.2
batch_size = 16  # Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size=val_ratio,
    shuffle=True,
    stratify=labels
)

# Train and validation DataLoaders
train_dataloader = create_dataloader(token_id, attention_masks, labels, train_idx, batch_size)
validation_dataloader = create_dataloader(token_id, attention_masks, labels, val_idx, batch_size)


In [12]:
print("Train loader size: ", len(train_dataloader))
print("Validation loader size: ", len(validation_dataloader))

Train loader size:  279
Validation loader size:  70


## Train and Metrics

In [17]:
def b_tp(preds, labels):
  '''Returns True Positives (TP): count of correct predictions of actual class 1'''
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
  '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
  '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
  '''
  Returns the following metrics:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
  '''
  preds = np.argmax(preds, axis = 1).flatten()
  labels = labels.flatten()
  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  fp = b_fp(preds, labels)
  fn = b_fn(preds, labels)
  b_accuracy = (tp + tn) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity


In [18]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 5e-5,
                              eps = 1e-08
                              )

# Run on GPU
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids,
                             token_type_ids = None,
                             attention_mask = b_input_mask,
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids,
                              token_type_ids = None,
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')


Epoch:  50%|█████     | 1/2 [00:36<00:36, 36.20s/it]


	 - Train loss: 0.0758
	 - Validation Accuracy: 0.9902
	 - Validation Precision: 0.9957
	 - Validation Recall: 0.9278
	 - Validation Specificity: 0.9989



Epoch: 100%|██████████| 2/2 [01:11<00:00, 35.82s/it]


	 - Train loss: 0.0283
	 - Validation Accuracy: 0.9857
	 - Validation Precision: 0.9693
	 - Validation Recall: 0.9097
	 - Validation Specificity: 0.9948






In [24]:
new_text = new_text = "Congratulations! As our loyal customer, you have been chosen to receive a £800 reward! To claim, call 09061701462. Use the claim code LX392. Valid for 24 hours only."

# Apply the tokenizer
encoding = preprocessing(new_text, tokenizer)
print(encoding)
# Convert lists to tensors
input_ids = torch.tensor(encoding['input_ids'])
attention_mask = torch.tensor(encoding['attention_mask'])

# Forward pass, calculate logit predictions
with torch.no_grad():
  output = model(input_ids.to(device), token_type_ids=None, attention_mask=attention_mask.to(device))

prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Ham'

print("\n\nPREDICTION:")
print('Input Text: ', new_text)
print('Predicted Class: ', prediction)

{'input_ids': tensor([[  101, 23156,   999,  2004,  2256,  8884,  8013,  1010,  2017,  2031,
          2042,  4217,  2000,  4374,  1037,  1069, 17914,  2692, 10377,   999,
          2000,  4366,  1010,  2655,  5641,  2692,  2575, 16576, 24096, 21472,
          2475,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}


PREDICTION:
Input Text:  Congratulations! As our loyal customer, you have been chosen to receive a £800 reward! To claim, call 09061701462. Use the claim code LX392. Valid for 24 hours only.
Predicted Class:  Spam


  input_ids = torch.tensor(encoding['input_ids'])
  attention_mask = torch.tensor(encoding['attention_mask'])
