# Transformer for Sequence Classification

In [1]:
import pandas as pd
import numpy as np

create dataframe "data" and grab the first lines abstract

In [2]:
# create dataframe
data = pd.read_csv('award_data.csv') # loads and parses award data
data.iloc[0]['Abstract'] # prints out first lines abstract

  data = pd.read_csv('award_data.csv') # loads and parses award data


"The Department of Homeland Security (DHS) grapples with vast and diverse datasets collected daily, ranging from personal property scans to Stream of Commerce (SoC) data. To analyze and improve algorithms for detecting explosives and prohibited items, efficient curation and labeling are essential. However, DHS faces challenges, including data processing inefficiencies, dependency on human labeling, limited scalability, predictive analytics and threat detection obstacles, and inter-agency collaboration barriers.In response, Agile Data Decisions, Inc. (AgileDD) proposes an innovative solution called AI for Data Labeling and Curation at Scale (AI-DLCS). Leveraging their iQC human-in-the-loop AI platform and the CargoSeer AI platform, the project aims to address DHS's challenges. CargoSeer AI, developed by CargoSeer LTD, specializes in consignment inspection, utilizing a Large Foundation Model to automatically inspect scanned cargo for fraud. AgileDD plans to enhance these platforms with n

normalize data, get rid of data objects with NaN as the state

In [3]:
# convert state column to uppercase and removes \n and " "
data['State'] = data['State'].str.strip().str.upper()
# get unique states after cleaning
unique_states = data['State'].unique()
print("Unique States After Cleaning:", unique_states)
# replace NAN with pd.NA
data['State'].replace('NAN', pd.NA, inplace=True) 
# removes all rows where state is pd.NA
data.dropna(subset=['State'], inplace=True)
# re-check unique states to verify cleaning
unique_states_cleaned = data['State'].unique()
print("Final Unique States:", unique_states_cleaned)

Unique States After Cleaning: ['TX' 'CA' 'VT' 'MD' 'GA' 'MI' 'MA' 'NC' 'FL' 'CO' 'OH' 'CT' 'IL' 'AK'
 'NM' 'VA' 'OR' 'AL' 'NY' 'PA' 'WY' 'WI' 'MN' 'NV' 'IN' 'AZ' 'ID' 'KY'
 'DE' 'NJ' 'SC' 'MT' 'DC' 'MO' 'NE' 'HI' 'ND' 'WA' 'TN' 'UT' 'WV' 'OK'
 'RI' 'AR' 'KS' 'LA' 'IA' 'ME' 'NH' 'MS' 'PR' 'SD' 'AS' 'MH' nan 'VI']
Final Unique States: ['TX' 'CA' 'VT' 'MD' 'GA' 'MI' 'MA' 'NC' 'FL' 'CO' 'OH' 'CT' 'IL' 'AK'
 'NM' 'VA' 'OR' 'AL' 'NY' 'PA' 'WY' 'WI' 'MN' 'NV' 'IN' 'AZ' 'ID' 'KY'
 'DE' 'NJ' 'SC' 'MT' 'DC' 'MO' 'NE' 'HI' 'ND' 'WA' 'TN' 'UT' 'WV' 'OK'
 'RI' 'AR' 'KS' 'LA' 'IA' 'ME' 'NH' 'MS' 'PR' 'SD' 'AS' 'MH' 'VI']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['State'].replace('NAN', pd.NA, inplace=True)


create a dictionary d[state] = list of abstract associated with that state:
Ex: 
{
    "CALIFORNIA": ["Abstract 1", "Abstract 2", "Abstract 3"],
    "TEXAS": ["Abstract A", "Abstract B"],
    "OREGON": ["Abstract X"]
}


In [4]:
state_column = 'State'
proposal_title_column = 'Abstract'
data[state_column] = data[state_column].str.upper()
"""
data.groupby(state_column)
    creates grouped object
    rows are grouped together by their states
data.groupby(state_column)[proposal_title_column]
    select only the title column
data.groupby(state_column)[proposal_title_column].apply(list)
    converts each state's groups of absrtacts into a list
data.groupby(state_column)[proposal_title_column].apply(list).to_dict()
    converts to a dictionary: dict[state] = list of abstract of that state
"""
state_proposals = data.groupby(state_column)[proposal_title_column].apply(list).to_dict()
print(len(state_proposals))

55


## Dataset and Preprocessing

### Loading and Tokenizing Reviews

In [5]:
len(state_proposals['TN'])

1709

Split up the training (75%) and the test data (25%). Also create parallel ground truth arrays

In [6]:
import re

OR_Proposal = state_proposals['OR']
TN_Proposal = state_proposals['TN']

# split up the training and test data by 3/4 and 1/4 respectively
Train_OR_Proposal = OR_Proposal[:len(OR_Proposal)//4*3]
Test_OR_Proposal = OR_Proposal[len(OR_Proposal)//4*3:]
Train_TN_Proposal = TN_Proposal[:len(TN_Proposal)//4*3]
Test_TN_Proposal = TN_Proposal[len(TN_Proposal)//4*3:]

train_texts = []
train_labels = []
test_texts = []
test_labels = []

for test in Train_OR_Proposal:
    train_texts.append(str(test))
    train_labels.append(1)

# Load and label negative training reviews
for text in Train_TN_Proposal:
    train_texts.append(str(text))
    train_labels.append(0)  # negative label 0

# Load and label positive test reviews
for text in Test_OR_Proposal:
    test_texts.append(str(text))
    test_labels.append(1)  # positive label 1

# Load and label negative test reviews
for text in Test_TN_Proposal:
    test_texts.append(str(text))
    test_labels.append(0)  # negative label 0


print(f"Loaded {len(train_texts)} training reviews and {len(test_texts)} test reviews.")


Loaded 3240 training reviews and 1082 test reviews.


In [7]:
def tokenize(text):
    # Remove HTML tags
    text = re.sub(r"<.*?>", " ", text)
    # Keep only letters and standard punctuation (replace others with space)
    text = re.sub(r"[^a-zA-Z0-9\s]", ' ', text)
    # Lowercase the text
    text = text.lower()
    # Split into tokens by whitespace
    tokens = text.split()
    return tokens

# Tokenize all reviews
train_tokens = [tokenize(review) for review in train_texts]
test_tokens  = [tokenize(review) for review in test_texts]

# Peek at one tokenized example
print(train_texts[0][:100], "->", train_tokens[0][:20])


With the Artemis program, NASA currently plans to land the first woman and next man on the moon by 2 -> ['with', 'the', 'artemis', 'program', 'nasa', 'currently', 'plans', 'to', 'land', 'the', 'first', 'woman', 'and', 'next', 'man', 'on', 'the', 'moon', 'by', '2025']


## Building the Vocabulary
Here we're grabbing the most common 10,000 words. This removes rarer words, reducing dimensionality and making learning easier. <PAD> will hold padding introduced (necessary for tensors since all need to be the same size) and <UNK> will hold unknown/out-of-vocab words.
We're also assigning indexes to each word, with more frequent words takng lower indexes.

In [None]:
from collections import Counter

# bag of words - counts frequency of each token in the training set
word_counts = Counter(token for review in train_tokens for token in review)
print(f"Total unique tokens in training data: {len(word_counts)}")

vocab_size = 10_000 # limit vocab size to the to 10k
# returns 10_000 most common words
most_common = word_counts.most_common(vocab_size - 2) # -2 is for special tokens (<PAD> and <UNK>)
# each unique word gets its own unique index
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
for i, (word, freq) in enumerate(most_common, start=2):
    word_to_idx[word] = i

print(f"Vocabulary size (includingn PAD/UNK): {len(word_to_idx)}")


Total unique tokens in training data: 28888
Vocabulary size (includingn PAD/UNK): 10000


## Encoding and Padding Sequences

For each abstract (list of tokens), we're going to create an array of unique indexes that correspond to each unique word. We will also pad / truncate till each returned array is 250 tokens long.

In [None]:
max_length = 250
def encode_and_pad(tokens):
    # populate indices with tokens unique index, if token doesn't have an index, set it to 1 <UNK>
    indices = [word_to_idx.get(token, 1) for token in tokens] # 1 is for <UNK> for unknown
    if len(indices) > max_length:
        indices = indices[:max_length]
    # if length of abstracti < 250 then make the rest 0's which is identified as padding
    if len(indices) < max_length:
           indices += [0] * (max_length - len(indices))
    return indices

train_sequences = [encode_and_pad(tok_list) for tok_list in train_tokens]
test_sequences = [encode_and_pad(tok_list) for tok_list in test_tokens]

print("Example encoded review (first 20 indices):", train_sequences[0][:20])
print("Length of encoded review:", len(train_sequences[0]))

Example encoded review (first 20 indices): [12, 2, 9977, 39, 634, 152, 1486, 5, 2765, 2, 166, 6998, 3, 582, 2766, 19, 2, 4433, 15, 8956]
Length of encoded review: 250


## Create PyTorch Dataset and DataLoader

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

class ProposalDataset(Dataset):
    def __init__(self, sequences, labels):
        # for word_indicies in list_of_word_indicies, make it a tensory of long ints
        self.sequences = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
        # list_of_word_indicies parallel array of ground truth labels
        self.labels = torch.tensor(labels, dtype=torch.long)
    def __len__(self):
        return len(self.sequences)
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]
    
train_dataset = ProposalDataset(train_sequences, train_labels)
test_dataset =  ProposalDataset(test_sequences, test_labels)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Defining Models

We'll implement two models using PyTorch's nn.Module:

1. LSTM-based RNN: An Embedding layer followed by an LSTM (recurrent layer) and a linear layer to output the sentiment class.
2. Transformer Encoder: An Embedding layer (with added positional encoding) followed by Transformer encoder layers and a final linear layer for classification.

Both models will output a prediction for each input review (binary classification: positive or negative). We will use a size-2 output (for classes 0 and 1) and later apply a softmax or use CrossEntropyLoss which expects raw logits of size 2.

### Transform Encoder Model

In [11]:
from torch import nn

class TransformerClassifier(nn.Module):

    def __init__(self, vocab_size, embed_dim=100, num_heads=4, num_layers=2, ff_dim=256, max_len=250):
        super(TransformerClassifier, self).__init__()
        # maps each word to a vector of size embed_dim
        # each word gets an embedding vector, padding_idx=0 ensures padding token contributes nothing
        # to learned representation
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # each word gets a vector of size 100 to represent it's position in abstract, this will be learned
        # these will be added to token embeddings
        self.pos_embedding = nn.Embedding(max_len, embed_dim)  # learnable positional embeddings
        # Define a Transformer Encoder layer (Multi-Headed Attention)
        # contains both the self attention layer as well as the feed forward layer
        # ff_dim is size of hidden layer in feedforward nn
        # input shape is (batch, seq_len, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, 
                                                  dim_feedforward=ff_dim, batch_first=True)
        # this stacks two encoder_layers ontop of each other
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        # final representation to output two classes 
        self.fc = nn.Linear(embed_dim, 2)
  
    def forward(self, x):
        # x shape: (batch, seq_length)- seq_len should be 250 (number of abstract tokens)
        batch_size, seq_len = x.size()
        # positional encoding
        # torch.arange(0, seq_len) createes a tensor of size seq_len-1 that has digits from 0 to seq_len
        # torch.arange(0, seq_len).unsqueeze(0) adds a new dimension at index 0 -> (1,seq_len)
        # torch.arange(0, seq_len).unsqueeze(0).expand(batch_size, seq_len) -> (batch_size, seq_len)
        pos_indices = torch.arange(0, seq_len, device=x.device).unsqueeze(0).expand(batch_size, seq_len)
        # Get token embeddings and positional embeddings
        token_embeds = self.embedding(x)            # (batch, seq_len, embed_dim)
        pos_embeds = self.pos_embedding(pos_indices)  # (batch, seq_len, embed_dim)
        # Add token and position embeddings
        x_embedded = token_embeds + pos_embeds       # (batch, seq_len, embed_dim)
        # Create padding mask (True where padding token is present)
        pad_mask = (x == 0)  # shape: (batch, seq_len), True at padded indices
        # Pass through Transformer encoder
        enc_out = self.transformer(x_embedded, src_key_padding_mask=pad_mask)  # (batch, seq_len, embed_dim)
        # Aggregate the encoder outputs; we use mean pooling
        # for each sequence, get the average of all the token embeddings
        seq_avg = enc_out.mean(dim=1)               # (batch, embed_dim)
        # input into nn to get output
        logits = self.fc(seq_avg)                   # (batch, 2)
        return logits


In [12]:
import torch.optim as optim

# Select device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# Instantiate models
vocab_size = len(word_to_idx)
transformer_model = TransformerClassifier(vocab_size).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
trans_optimizer = optim.Adam(transformer_model.parameters(), lr=0.001)


Using device: cuda


In [13]:
num_epochs = 4
for epoch in range(num_epochs):
    transformer_model.train()
    total_loss = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        trans_optimizer.zero_grad()
        outputs = transformer_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        trans_optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Transformer Training loss: {avg_loss:.4f}")


Epoch 1/4, Transformer Training loss: 0.6726
Epoch 2/4, Transformer Training loss: 0.6112
Epoch 3/4, Transformer Training loss: 0.4686
Epoch 4/4, Transformer Training loss: 0.3212


In [14]:
transformer_model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = transformer_model(inputs)
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
accuracy = correct / total
print(f"Transformer Model Test Accuracy: {accuracy:.4f}")


Transformer Model Test Accuracy: 0.6257


  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)
