## Using BERT to Classify Text



In [None]:
import os
import random
import datetime
import numpy as np
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from collections import Counter
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vectors, build_vocab_from_iterator


# ---- Device selection -------------------------------------------------------
USE_CUDA = torch.cuda.is_available()

if USE_CUDA:
    DEVICE = torch.device('cuda')
    print("Using cuda.")
else:
    DEVICE = torch.device('cpu')
    print("Using cpu.")

# ---- Reproducibility: seed every RNG the notebook relies on -----------------
seed = 30255
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if USE_CUDA:
    # Seed all visible GPUs, not only the current device.
    torch.cuda.manual_seed_all(seed)

# ---- Data location ----------------------------------------------------------
# Set to False when running outside Google Colab.
COLAB = True

if COLAB:
    from google.colab import drive
    MOUNT_POINT = '/content/gdrive'
    drive.mount(MOUNT_POINT)
    # Absolute path under the mount point, so it does not depend on the
    # notebook's current working directory.
    PATH = MOUNT_POINT + "/Shareddrives/Adv ML Project/Data/"
else:
    # Change path to appropriate location.
    # The trailing separator matters: the Colab PATH ends with one, and code
    # that appends a file name to PATH relies on it.
    PATH = "/Users/kilaf/OneDrive - The University of Chicago/CAPP255/Project/"
    

Using cuda.
Mounted at /content/gdrive


Step 1: Load in pre-processed data and split into train, validation and test datasets using seed 30255

In [None]:
# Pass the directory and the file name as separate components so the path is
# built correctly whether or not PATH ends with a separator.
df = pd.read_csv(os.path.join(PATH, "preprocessed_data.csv"))

In [None]:
# Number of rows loaded from the pre-processed CSV.
df.shape[0]

6115

Converting the classes we want to classify to integers

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the mapping from each distinct CLASS value to an integer id, then
# encode the whole column with it. `le` is kept so the mapping stays available.
le = LabelEncoder()
le.fit(df['CLASS'])
labels = le.transform(df['CLASS'])

# Sanity check: one label per row of df.
print(labels.shape[0])

6115


Tokenizing the DESCRIPTION column with the BERT tokenizer to create our dataloader with the integer-encoded labels

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m93.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
from transformers import BertTokenizer

# Load the tokenizer published with the uncased BERT-base checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def tokenize_text(text, max_length=512):
    """
    Encode one text with the module-level BERT ``tokenizer``.

    Special tokens are added, the text is truncated to ``max_length`` tokens
    and padded up to ``max_length`` so that every sample has the same length.

    Parameters
    ----------
    text : str
        The raw text to encode.
    max_length : int, default 512
        Length every sample is truncated/padded to. The default is the value
        this function has always used (an earlier docstring said 256, but the
        code never did). Pass a smaller value to reduce memory use.

    Returns
    -------
    The encoding returned by ``tokenizer.encode_plus``, requested as PyTorch
    tensors (``return_tensors='pt'``) and including the attention mask.
    """
    return tokenizer.encode_plus(text,
                                  add_special_tokens=True,
                                  max_length=max_length,
                                  padding='max_length',
                                  return_attention_mask=True,
                                  return_tensors='pt',
                                  truncation=True)

# Encode every description; the result is a Series holding one encoding
# (as returned by tokenize_text) per row of df.
descriptions = df['DESCRIPTION']
tokenized_text = descriptions.apply(tokenize_text)

Creating a vocabulary for the words

In [None]:
# Step 1: Collect the token strings that occur in the tokenized descriptions
all_tokens = []

for tokenized_sample in tokenized_text:
    # Each encoding carries a leading batch axis of size 1, hence the [0].
    input_ids = tokenized_sample['input_ids'].tolist()[0]
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    all_tokens.extend(tokens)

# Step 2: Write the unique tokens to a vocabulary file
vocab_file_path = 'vocab.txt'  # Path to the vocabulary file you want to create

# A token's line number in the file becomes its id, so the tokens are sorted
# to make the ids identical from run to run (set iteration order is not).
# The file is written as UTF-8 explicitly instead of relying on the platform
# default encoding.
with open(vocab_file_path, 'w', encoding='utf-8') as file:
    file.write('\n'.join(sorted(set(all_tokens))))

# Step 3: Build a tokenizer restricted to the custom vocabulary
# NOTE(review): `custom_tokenizer` is not referenced by the later cells shown in
# this notebook; the dataset below is built from `tokenized_text`, i.e. from the
# pretrained `tokenizer`. Its ids differ from the pretrained vocabulary's ids.
custom_tokenizer = BertTokenizer(vocab_file=vocab_file_path)

Apply the tokenizer to the text

In [None]:
# Second encoding pass over the descriptions, this time padded/truncated to 128
# tokens and without return_tensors.
# NOTE(review): `tokenized_data` is not read by any later cell visible here.
tokenized_data = []
for description in df['DESCRIPTION']:
    encoding = tokenizer.encode_plus(
        description,
        add_special_tokens=True,
        padding='max_length',
        max_length=128,
        truncation=True,
    )
    tokenized_data.append(encoding)

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# create DataLoader - need to combine the per-sample tensors into one tensor.
# Every encoding already carries a leading batch axis of size 1, so the samples
# are concatenated along that axis. This gives 2-D (num_samples, seq_len)
# tensors; stacking would insert an extra axis and hand the model a 3-D
# attention mask.
input_ids = torch.cat([row['input_ids'] for row in tokenized_text], dim=0)
attention_mask = torch.cat([row['attention_mask'] for row in tokenized_text], dim=0)

# Bundle the token ids, attention mask and integer labels sample by sample
dataset = TensorDataset(input_ids, attention_mask, torch.tensor(labels, dtype=torch.long))

In [None]:
# compute train/validation/test split sizes (70% / 15% / remainder)
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size

# split dataset randomly into train/validation/test sets.
# A dedicated generator seeded with `seed` makes the split identical every time
# this cell runs, regardless of how much the global RNG was used beforehand.
split_generator = torch.Generator().manual_seed(seed)
train_data, val_data, test_data = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)


# Save the splits as CSV files.
# NOTE(review): these relative paths write to the current working directory,
# not to PATH / Google Drive, and the cells contain tensors - confirm intent.
train_df = pd.DataFrame.from_records(train_data)
val_df = pd.DataFrame.from_records(val_data)
test_df = pd.DataFrame.from_records(test_data)

train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('val_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)


# create data loaders for each set.
# The recorded training run with batches of 32 ended in an OutOfMemoryError,
# so a smaller batch is used; tune this to the memory of the available GPU.
BATCH_SIZE = 8
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation metrics do not depend on sample order, so these are not shuffled.
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
import torch.nn as nn
from transformers import BertForSequenceClassification

# The classification head needs one output per distinct class. `labels` has one
# entry per sample, so its length is the dataset size, not the class count;
# the fitted LabelEncoder knows the distinct classes.
num_classes = len(le.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)



model.to(DEVICE)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### Step 2: Model Training

In [None]:
def train_model(model, train_data_loader, optimizer):
    """
    Train ``model`` for one pass over ``train_data_loader``.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask=...)``; the result must
        expose ``.logits``.
    train_data_loader : iterable
        Yields ``(input_ids, attention_mask, labels)`` batches.
    optimizer : torch.optim.Optimizer
        Optimizer over the model's parameters; stepped once per batch.

    Returns
    -------
    float
        Sample-weighted mean training loss of the pass (``nan`` when the
        loader yields no batches).
    """
    model.train()

    # Move each batch to wherever the model lives instead of consulting a
    # global flag; fall back to the CPU for a parameter-less model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    num_samples = 0
    for input_ids, attention_mask, labels in train_data_loader:
        # Drop the singleton axis at dim 1 from BOTH inputs so the model gets
        # (batch, seq_len) ids and mask; a no-op when they are already 2-D.
        input_ids = input_ids.squeeze(1).to(device)
        attention_mask = attention_mask.squeeze(1).to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # Labels are not passed to the model: the loss is computed once, here,
        # rather than a second time inside the forward pass.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = torch.nn.functional.cross_entropy(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        num_samples += batch_size

    return total_loss / num_samples if num_samples else float('nan')

In [None]:
def validate(model, data_loader, split_name='Validation'):
    """
    Evaluate ``model`` on every batch of ``data_loader`` without gradients.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask=...)``; the result must
        expose ``.logits``.
    data_loader : iterable
        Yields ``(input_ids, attention_mask, labels)`` batches.
    split_name : str, default 'Validation'
        Label used in the printed summary (e.g. ``'Test'``).

    Returns
    -------
    (avg_loss, accuracy) : tuple of float
        In that order: mean cross-entropy per sample and the fraction of
        correctly classified samples.

    Raises
    ------
    ValueError
        If ``data_loader`` yields no samples.
    """
    model.eval()  # set model to evaluation mode

    # Move each batch to wherever the model lives instead of consulting a
    # global flag; fall back to the CPU for a parameter-less model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # running totals over the whole data set
    total_loss = 0.0
    num_correct = 0
    num_samples = 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in data_loader:
            # Drop the singleton axis at dim 1 from BOTH inputs so the model
            # gets (batch, seq_len) ids and mask; a no-op when already 2-D.
            input_ids = input_ids.squeeze(1).to(device)
            attention_mask = attention_mask.squeeze(1).to(device)
            labels = labels.to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits

            # Sum (not mean) per batch so a smaller final batch is weighted
            # by its actual number of samples in the overall average.
            total_loss += torch.nn.functional.cross_entropy(
                logits, labels, reduction='sum'
            ).item()
            preds = logits.argmax(dim=1)
            num_correct += (preds == labels).sum().item()
            num_samples += labels.size(0)

    if num_samples == 0:
        raise ValueError('data_loader yielded no samples to evaluate')

    # compute average loss and accuracy
    avg_loss = total_loss / num_samples
    accuracy = num_correct / num_samples

    print(f'{split_name} Loss: {avg_loss:.4f} | Accuracy: {accuracy:.4f}')

    return avg_loss, accuracy

Setting up the Training and Validation with different hyperparameters

In [None]:
num_epochs = 5
learning_rate = 1e-5

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# With step_size=1 and gamma=0.1 the learning rate is divided by 10 after
# every epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Not used by train_model/validate, which call cross_entropy directly.
loss_fn = nn.CrossEntropyLoss()

# `val_loss` keeps its original name because the plotting cell reads it, but it
# has always received the SECOND value returned by validate(), i.e. the
# accuracy - which is what that cell's axis labels describe.
val_loss = []
# The actual per-epoch validation loss is recorded separately.
val_avg_loss = []
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')

    # train model on training set
    train_model(model, train_loader, optimizer)

    # validate model on validation set; validate returns (avg_loss, accuracy)
    epoch_loss, epoch_accuracy = validate(model, val_loader)
    val_avg_loss.append(epoch_loss)
    val_loss.append(epoch_accuracy)

    # update learning rate
    scheduler.step()

Epoch 1/5


OutOfMemoryError: ignored

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch validation accuracy. Despite its name, `val_loss` holds the
# second value returned by validate(), which is the accuracy.
# The x axis covers only the epochs that actually completed, so the cell still
# works when training stopped before `num_epochs` was reached.
epochs_done = range(1, len(val_loss) + 1)

fig, ax = plt.subplots()
ax.plot(epochs_done, val_loss)
ax.set_xlabel('Epoch')
ax.set_ylabel('Validation Accuracy')
ax.set_title('BERT: Validation Accuracy vs. Epoch')
plt.show()

In [None]:
# Final evaluation on the held-out test set.
# validate returns (avg_loss, accuracy), in that order.
loss, accuracy = validate(model, test_loader)

In [None]:
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# model.to(device)

# optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# loss_fn = nn.CrossEntropyLoss()


# epochs = 5
# for epoch in range(epochs):
#     train_loss = 0
#     val_loss = 0

#     model.train()
#     optimizer.zero_grad()
#     # Forward pass
#     for batch in train_loader:
#         optimizer.zero_grad()
#         input_ids, attention_mask, labels = batch
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         train_loss += loss.item()
#         loss.backward()
    
#     # Validation
#     model.eval()
#     with torch.no_grad():
#         val_outputs = model(
#             input_ids=val_texts['input_ids'].to(device),
#             attention_mask=val_texts['attention_mask'].to(device),
#             labels=val_labels.to(device))

#         val_loss = val_outputs.loss
#         val_acc = (val_outputs.logits.argmax(axis=1) == val_labels).float().mean()

#     print(f'Epoch {epoch+1}/{epochs}: Train loss: {loss:.3f}. Val loss: {val_loss:.3f}. Val acc: {val_acc:.3f}')