1. Ensure you fill in all cells containing `YOUR CODE HERE`, `YOUR ANSWER HERE`, and `NotImplementedError()`.
2. After you finish, `Restart the kernel & run` all cells in order.
3. Scores are awarded based on your code; a higher accuracy does not earn a higher grade. However, the accuracy is expected to be > 80%.

# Project II: Text Classification Using LSTM and Transformer Network
## Deadline: Nov 14, 11:59 pm

You have learned about the basics of neural network training and testing during the class. Let's proceed to the text classification tasks using simple Transformer and LSTM networks!
    

Let's get started!

# Part 1: LSTM Network (15 points)

**Import library**

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F


# Load data
df = pd.read_csv('sms_spam.csv')
df.head()


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**Data processing**

In [2]:
pad_index = 0
unknown_index = 1

# Tokenizing
def tokenize(text):
    """Lower-case ``text`` and split it on runs of whitespace."""
    lowered = text.lower()
    return lowered.split()

In [3]:
def build_vocab(tokenized_texts):
    """Build a frequency-ranked token-to-index mapping.

    Tokens are numbered from 2 upwards in descending order of frequency,
    so indices 0 and 1 are never assigned to a token.

    Returns:
        A ``(vocab, vocab_size)`` tuple where ``vocab`` maps token -> index
        and ``vocab_size`` is ``len(vocab) + 2``.
    """
    first_index = 2
    counts = Counter()
    for tokens in tokenized_texts:
        counts.update(tokens)

    ranked = counts.most_common()
    vocab = {word: index for index, (word, _) in enumerate(ranked, start=first_index)}
    return vocab, len(vocab) + first_index

In [4]:
# Tokenize every message and build the vocabulary from all of them.
# NOTE(review): this runs on the full DataFrame, i.e. before the train/test
# split further down, so tokens from the test messages are in the vocabulary.
texts = df['text'].apply(tokenize).tolist()
vocab, vocab_size = build_vocab(texts)
# Debug output: the whole token-to-index mapping and the reported size.
print(vocab)
print(vocab_size)
# TODO: delete later (debug check of the number of distinct tokens).
print(len(vocab))

13612
13610


In [5]:
# Convert tokens to integers, if token is not in vocab, assign unknown_index
def encode(tokens):
    """Return the vocabulary index of each token, using ``unknown_index`` for unseen tokens."""
    lookup = vocab.get
    return [lookup(tok, unknown_index) for tok in tokens]

# Integer-encode every tokenized message with the vocabulary built above.
encoded_texts = [encode(tokens) for tokens in texts]

# Convert the string labels in the 'type' column to integers
le = LabelEncoder()
labels = le.fit_transform(df['type']).tolist()

In [6]:
class TextDataset(Dataset):
    """Map-style dataset pairing integer-encoded texts with their labels."""

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` as LongTensors; the label has one element."""
        token_ids = torch.LongTensor(self.texts[idx])
        label = torch.LongTensor([self.labels[idx]])
        return token_ids, label

# Padding function
def collate_fn(batch):
    """Collate ``(text, label)`` samples into one padded batch.

    Returns:
        ``(texts, labels, text_lengths)``: the texts padded with ``pad_index``
        (batch first), the labels as a LongTensor, and the list of unpadded
        text lengths.
    """
    sequences, targets = zip(*batch)
    lengths = [len(seq) for seq in sequences]
    padded = pad_sequence(sequences, batch_first=True, padding_value=pad_index)
    return padded, torch.LongTensor(targets), lengths

# Train/test split: hold out 20% of the samples, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(encoded_texts, labels, test_size=0.2, random_state=42)

# Wrap each split in a Dataset.
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

# Batches of 32, padded per batch by collate_fn; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


**Question 1 (5 points):** Define network architecture

You can read this documentation: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html

In [7]:
class LSTMClassifier(nn.Module):
    """Text classifier: embedding -> single LSTM layer -> linear head."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()

        # Token lookup table; pad_idx is registered as the padding index.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=pad_idx,
        )
        # Batch-first LSTM over the embedded tokens.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Projects the final hidden state onto the output classes.
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text, text_lengths):
        """Return class logits for a padded batch.

        Args:
            text: padded token indices, batch first.
            text_lengths: unpadded length of each sequence in ``text``.
        """
        # Packing makes the LSTM stop at each sequence's true length, so the
        # final hidden state is not computed over padding.
        packed = pack_padded_sequence(
            self.embedding(text),
            text_lengths,
            batch_first=True,
            enforce_sorted=False,
        )
        _, (last_hidden, _cell) = self.lstm(packed)
        # Drop the leading layer dimension before the linear head.
        logits = self.fc(last_hidden.squeeze(0))
        return logits


**Question 2 (5 points):** Define training logic

In [8]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model hyperparameters.
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
# One output unit per distinct label value.
OUTPUT_DIM = len(np.unique(labels))
model = LSTMClassifier(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, pad_index).to(device)

# Adam with its default hyperparameters; cross-entropy loss over the class logits.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss().to(device)

def train(model, loader, optimizer, criterion):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: module called as ``model(texts, text_lengths)`` that returns
            class logits.
        loader: iterable of ``(texts, labels, text_lengths)`` batches; it must
            support ``len()``.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss called as ``criterion(logits, labels)``.

    Returns:
        Sum of the batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: if ``loader`` is empty.
    """
    model.train()

    # Follow the model's own device instead of relying on a module-level
    # global, so the function works wherever the model has been placed.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0
    for texts, labels, text_lengths in loader:
        optimizer.zero_grad()

        texts = texts.to(target_device)
        # Flatten labels to one dimension. Unlike squeeze(), reshape(-1) keeps
        # the batch dimension when the batch holds a single sample.
        labels = labels.to(target_device).reshape(-1)

        outputs = model(texts, text_lengths)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    return epoch_loss / len(loader)



**Question 3 (5 points):** Define eval logic

In [9]:
def evaluate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without computing gradients.

    Args:
        model: module called as ``model(texts, text_lengths)`` that returns
            class logits.
        loader: iterable of ``(texts, labels, text_lengths)`` batches; it must
            support ``len()``.
        criterion: loss called as ``criterion(logits, labels)``.

    Returns:
        ``(mean_loss, accuracy)``: the summed batch losses divided by
        ``len(loader)``, and the fraction of correctly predicted samples.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    model.eval()

    # Follow the model's own device instead of relying on a module-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for texts, labels, text_lengths in loader:
            texts = texts.to(target_device)
            # Flatten labels to one dimension. Unlike squeeze(), reshape(-1)
            # keeps the batch dimension when the batch holds a single sample.
            labels = labels.to(target_device).reshape(-1)

            outputs = model(texts, text_lengths)
            epoch_loss += criterion(outputs, labels).item()

            # The predicted class is the index of the largest logit.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    return epoch_loss / len(loader), accuracy

In [10]:
# Train for a fixed number of epochs, evaluating on the test loader after each one.
NUM_EPOCHS = 8
for epoch in range(NUM_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion)
    # The accuracy returned by evaluate() is discarded here; only the loss is reported.
    test_loss, _ = evaluate(model, test_loader, criterion)

    print(f"Epoch: {epoch+1}/{NUM_EPOCHS}")
    print(f"\tTrain Loss: {train_loss:.4f}")
    print(f"\tTest Loss: {test_loss:.4f}")

Epoch: 1/8
	Train Loss: 0.2565
	Test Loss: 0.0963
Epoch: 2/8
	Train Loss: 0.0882
	Test Loss: 0.0816
Epoch: 3/8
	Train Loss: 0.0516
	Test Loss: 0.0860
Epoch: 4/8
	Train Loss: 0.0225
	Test Loss: 0.0797
Epoch: 5/8
	Train Loss: 0.0173
	Test Loss: 0.0857
Epoch: 6/8
	Train Loss: 0.0082
	Test Loss: 0.0984
Epoch: 7/8
	Train Loss: 0.0072
	Test Loss: 0.1016
Epoch: 8/8
	Train Loss: 0.0024
	Test Loss: 0.0929


In [11]:
_, test_acc = evaluate(model, test_loader, criterion)
print(f"\tTest Accuracy: {test_acc*100:.2f}%")

	Test Accuracy: 97.94%


# Part 2: Transformer (15 points)

Useful resource: https://huggingface.co/docs/transformers/training

Import the needed packages

In [12]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from tqdm.notebook import tqdm

Load the data from the csv files.

**Question 4 (2 points):** To process the text data with the transformer, first load the CSV file with pandas. Please complete the function below.

In [13]:
def load_data(filepath):
    """Load the CSV file at ``filepath`` into a pandas DataFrame.

    Args:
        filepath: path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # Read the file the caller asked for rather than a hard-coded name.
    return pd.read_csv(filepath)

The loaded file is shown below.

In [14]:
df = load_data("sms_spam.csv")
print('The csv file is shown in below:\n')
df

The csv file is shown in below:



Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [15]:
def preprocess_data(df, max_length=64):
    """Tokenize the ``text`` column for BERT and integer-encode the ``type`` column.

    Args:
        df: DataFrame with a ``text`` column of strings and a ``type`` column
            of class names. A ``label`` column holding the integer class ids
            is added to it in place.
        max_length: every sequence is padded or truncated to this many tokens.

    Returns:
        ``(input_ids, attention_masks, labels, label_dict)``: three tensors
        with one row/entry per message, and the class-name -> integer mapping
        (numbered in order of first appearance in ``df``).
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    class_names = df["type"].unique().tolist()
    label_dict = {label: i for i, label in enumerate(class_names)}
    # Every value of the column is a key of label_dict, so map() converts the
    # whole column to integer ids.
    df['label'] = df["type"].map(label_dict)

    # Tokenize the whole column in one call. Padding and truncation are
    # requested explicitly, which replaces the deprecated `pad_to_max_length`
    # flag and removes the "Truncation was not explicitly activated" warning.
    encoded = tokenizer(
        df["text"].tolist(),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )

    input_ids = torch.tensor(encoded['input_ids'])
    attention_masks = torch.tensor(encoded['attention_mask'])
    return input_ids, attention_masks, torch.tensor(df['label'].values), label_dict

input_ids, attention_masks, labels, label_dict = preprocess_data(df)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [16]:
def split_dataset(input_ids, attention_masks, labels):
    """Bundle the tensors into one dataset and randomly split it 80/20.

    Returns:
        The two subsets produced by ``random_split``: the first holds
        ``int(0.8 * n)`` samples and the second holds the remainder.
    """
    full_dataset = TensorDataset(input_ids, attention_masks, labels)
    n_total = len(full_dataset)
    n_train = int(0.8 * n_total)
    n_test = n_total - n_train
    subsets = random_split(full_dataset, [n_train, n_test])
    return subsets

train_dataset, test_dataset = split_dataset(input_ids, attention_masks, labels)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_dataloader = DataLoader(test_dataset, shuffle=True, batch_size=8)


In [17]:
def create_model(label_dict):
    """Load pretrained ``bert-base-uncased`` with one output label per entry of ``label_dict``."""
    num_classes = len(label_dict)
    return BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)
model = create_model(label_dict)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Question 5 (3 points):** Please define the optimizer with Adam, AdamW, or SGD.

In [18]:
def setup_training(model, epochs=2, lr=5e-5, steps_per_epoch=None):
    """Create the optimizer and the linear learning-rate schedule for fine-tuning.

    Args:
        model: model whose parameters are optimized.
        epochs: number of training epochs the schedule should span.
        lr: initial learning rate.
        steps_per_epoch: optimizer steps per epoch. Defaults to the length of
            the notebook-level ``train_dataloader``.

    Returns:
        ``(optimizer, epochs, scheduler)``.
    """
    # Use PyTorch's AdamW; the AdamW class shipped with `transformers` is
    # deprecated. eps and weight_decay are set to the defaults documented for
    # that deprecated class so the optimization itself is unchanged.
    optimizer = optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)

    if steps_per_epoch is None:
        steps_per_epoch = len(train_dataloader)

    # No warm-up; the schedule is sized to the total number of optimizer steps.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=steps_per_epoch * epochs,
    )
    return optimizer, epochs, scheduler

optimizer, epochs, scheduler = setup_training(model)





**Question 6 (7 points):** Please define the training strategy using the model and the loss function.

In [19]:
def train_model(model, train_dataloader, optimizer, scheduler, epochs):
    """Fine-tune ``model`` for ``epochs`` passes over ``train_dataloader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes a ``logits`` attribute.
        train_dataloader: iterable of ``(input_ids, attention_mask, labels)``
            batches; it must support ``len()``.
        optimizer: optimizer over ``model``'s parameters.
        scheduler: learning-rate scheduler, stepped once per batch.
        epochs: number of passes over the data.

    Prints the loss every 20 steps and the average loss of each epoch.
    """
    # from_pretrained() hands the model back in evaluation mode, so training
    # mode (dropout active) has to be enabled explicitly.
    model.train()
    # Run on whatever device the model already lives on instead of assuming
    # that a GPU is present.
    device = next(model.parameters()).device

    for epoch in range(epochs):
        total_train_loss = 0
        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_attention_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()

            # Forward pass
            outputs = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
            logits = outputs.logits

            # Cross-entropy between the class logits and the integer labels
            loss = F.cross_entropy(logits, b_labels)

            # Backward pass
            loss.backward()

            # Clip the gradient norm to 1.0 to guard against exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update the weights, then advance the learning-rate schedule
            optimizer.step()
            scheduler.step()

            total_train_loss += loss.item()

            if step % 20 == 0:
                print('[epoch: %03d][iter: %04d][loss: %.6f]'%(epoch+1, step, loss.item()))

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f'Average Training Loss: {avg_train_loss:.4f}')

# Move the model to the device selected earlier in the notebook (GPU when
# available, otherwise CPU) instead of assuming CUDA is present.
train_model(model.to(device), train_dataloader, optimizer, scheduler, epochs)


[epoch: 001][iter: 0000][loss: 0.601646]
[epoch: 001][iter: 0020][loss: 0.459457]
[epoch: 001][iter: 0040][loss: 0.003186]
[epoch: 001][iter: 0060][loss: 0.799033]
[epoch: 001][iter: 0080][loss: 0.000676]
[epoch: 001][iter: 0100][loss: 0.002586]
[epoch: 001][iter: 0120][loss: 0.000974]
[epoch: 001][iter: 0140][loss: 0.008085]
[epoch: 001][iter: 0160][loss: 1.478441]
[epoch: 001][iter: 0180][loss: 0.000600]
[epoch: 001][iter: 0200][loss: 0.000447]
[epoch: 001][iter: 0220][loss: 0.000898]
[epoch: 001][iter: 0240][loss: 0.000656]
[epoch: 001][iter: 0260][loss: 0.022088]
[epoch: 001][iter: 0280][loss: 0.600906]
[epoch: 001][iter: 0300][loss: 0.001950]
[epoch: 001][iter: 0320][loss: 0.000850]
[epoch: 001][iter: 0340][loss: 0.000483]
[epoch: 001][iter: 0360][loss: 0.000342]
[epoch: 001][iter: 0380][loss: 0.000289]
[epoch: 001][iter: 0400][loss: 0.000287]
[epoch: 001][iter: 0420][loss: 0.002159]
[epoch: 001][iter: 0440][loss: 0.000270]
[epoch: 001][iter: 0460][loss: 0.000237]
[epoch: 001][ite

**Question 7 (3 points):** Please define the evaluation strategy using the trained model.

In [20]:
def evaluate_model(model, test_dataloader):
    """Run ``model`` over ``test_dataloader`` and collect logits and labels.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes a ``logits`` attribute.
        test_dataloader: iterable of ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        ``(predictions, true_labels)``: two lists with one NumPy array per
        batch, holding the logits and the labels respectively.
    """
    model.eval()
    # Run on whatever device the model already lives on instead of assuming
    # that a GPU is present.
    device = next(model.parameters()).device
    predictions, true_labels = [], []

    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_attention_mask = batch[1].to(device)
        # The labels are only collected, never fed to the model, so they do
        # not need to be moved to the model's device.
        b_labels = batch[2]

        with torch.no_grad():
            outputs = model(input_ids=b_input_ids, attention_mask=b_attention_mask)
            logits = outputs.logits

        predictions.append(logits.detach().cpu().numpy())
        true_labels.append(b_labels.cpu().numpy())

    return predictions, true_labels

predictions, true_labels = evaluate_model(model, test_dataloader)


Next, let us evaluate the model.

In [21]:
def compute_accuracy(predictions, true_labels):
    """Return the accuracy of batched logits against batched labels.

    Args:
        predictions: sequence of per-batch logit arrays, one row per sample.
        true_labels: sequence of per-batch label arrays.
    """
    # Flatten the per-batch arrays into one row (or label) per sample.
    logit_rows = []
    for batch_logits in predictions:
        logit_rows.extend(batch_logits)

    gold_labels = []
    for batch_labels in true_labels:
        gold_labels.extend(batch_labels)

    # The predicted class is the index of the largest logit in each row.
    predicted_ids = np.argmax(logit_rows, axis=1).flatten()
    return accuracy_score(gold_labels, predicted_ids)

accuracy = compute_accuracy(predictions, true_labels)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 99.64%


In [22]:
# Flatten the per-batch logits and labels into one entry per sample.
flat_predictions = [item for sublist in predictions for item in sublist]
# The predicted class is the index of the largest logit in each row.
predicted_label_ids = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]
# Per-class precision/recall/F1, labelled with the class names from label_dict.
report = classification_report(flat_true_labels, predicted_label_ids, target_names=label_dict.keys())
print(report)

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00       967
        spam       1.00      0.97      0.99       148

    accuracy                           1.00      1115
   macro avg       1.00      0.99      0.99      1115
weighted avg       1.00      1.00      1.00      1115



# Part 3: Text CNN (Grad student only) (10 points)

You can read this article first: https://medium.com/voice-tech-podcast/text-classification-using-cnn-9ade8155dfb9

Useful paper: https://arxiv.org/abs/1408.5882

**Question 8 (5 points)**: Define TextCNN

In [23]:
class TextCNN(nn.Module):
    """Convolutional text classifier.

    Pipeline: embedding -> parallel convolutions (one per filter size) ->
    ReLU -> max-over-time pooling -> dropout -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        n_filters: number of output channels of every convolution.
        filter_sizes: iterable of filter heights, in tokens.
        output_dim: number of output classes.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()

        filter_sizes = list(filter_sizes)
        # Remembered so forward() can extend inputs that are too short.
        self.pad_idx = pad_idx
        self.min_seq_len = max(filter_sizes, default=1)

        # Embedding
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # One convolution per filter size; each kernel spans `fs` tokens and
        # the full embedding width.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])

        # Fully connected layer over the concatenated pooled features
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class logits for ``text``, a [batch, seq_len] tensor of token indices."""
        # A convolution cannot run on a sequence shorter than its kernel, so
        # batches of very short messages are right-padded up to the largest
        # filter size.
        shortfall = self.min_seq_len - text.shape[1]
        if shortfall > 0:
            fill_value = 0 if self.pad_idx is None else self.pad_idx
            filler = text.new_full((text.shape[0], shortfall), fill_value)
            text = torch.cat([text, filler], dim=1)

        # [batch, seq_len, emb] -> [batch, 1, seq_len, emb]: add the channel
        # dimension expected by Conv2d.
        embedded = self.embedding(text).unsqueeze(1)

        # Each convolution yields [batch, n_filters, seq_len - fs + 1] once
        # the collapsed embedding axis has been squeezed out.
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]

        # Max-over-time pooling: keep the strongest response of every filter.
        pooled = [F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2) for feature_map in conved]

        # Concatenate the features of all filter sizes, apply dropout, classify.
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)


In [24]:
# TextCNN hyperparameters.
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2,3,4]
# NOTE(review): `labels` was last assigned by the Part 2 preprocessing cell;
# it is derived from the same 'type' column as the Part 1 labels.
OUTPUT_DIM = len(np.unique(labels))
DROPOUT = 0.5
PAD_IDX = pad_index

# Place the model on the device selected in Part 1 (GPU when available,
# otherwise CPU) instead of assuming CUDA is present.
model = TextCNN(vocab_size, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX).to(device)
optimizer = optim.Adam(model.parameters())
# Define the loss here so this part does not depend on the Part 1 object.
criterion = nn.CrossEntropyLoss().to(device)

**Question 9 (3 points):** Define training logic

In [25]:
def train(model, loader, optimizer, criterion):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: module called as ``model(text)`` that returns class logits.
        loader: iterable of ``(text, labels, text_lengths)`` batches; it must
            support ``len()``. The lengths are not used by this model.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss called as ``criterion(logits, labels)``.

    Returns:
        Sum of the batch losses divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: if ``loader`` is empty.
    """
    model.train()

    # Follow the model's own device rather than a hard-coded 'cuda', so the
    # loop also runs on CPU-only machines.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0
    for text, labels, _text_lengths in loader:
        text = text.to(target_device)
        # Flatten to one label per sample, whatever shape the collate step produced.
        labels = labels.to(target_device).reshape(-1)

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass and loss
        predictions = model(text)
        loss = criterion(predictions, labels)

        # Backward pass and weight update
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(loader)

**Question 10 (2 points)** Define evaluation logic

In [26]:
def evaluate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without computing gradients.

    Args:
        model: module called as ``model(text)`` that returns class logits.
        loader: iterable of ``(text, labels, text_lengths)`` batches; it must
            support ``len()``. The lengths are not used by this model.
        criterion: loss called as ``criterion(logits, labels)``.

    Returns:
        ``(mean_loss, accuracy)``: the summed batch losses divided by
        ``len(loader)``, and the fraction of correctly predicted samples.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    model.eval()

    # Follow the model's own device rather than a hard-coded 'cuda', so the
    # loop also runs on CPU-only machines.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for text, labels, _text_lengths in loader:
            text = text.to(target_device)
            # Flatten to one label per sample, whatever shape the collate step produced.
            labels = labels.to(target_device).reshape(-1)

            # A single forward pass per batch.
            predictions = model(text)

            epoch_loss += criterion(predictions, labels).item()

            # The predicted class is the index of the largest logit.
            predicted = predictions.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    return epoch_loss / len(loader), accuracy


In [27]:
# Train the TextCNN for a fixed number of epochs on the Part 1 loaders,
# evaluating on the test loader after each epoch.
NUM_EPOCHS = 10
for epoch in range(NUM_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion)
    # test_acc is computed every epoch but only the losses are printed here.
    test_loss, test_acc = evaluate(model, test_loader, criterion)

    print(f"Epoch: {epoch+1}/{NUM_EPOCHS}")
    print(f"\tTrain Loss: {train_loss:.4f}")
    print(f"\tTest Loss: {test_loss:.4f}")


Epoch: 1/10
	Train Loss: 0.3042
	Test Loss: 0.1459
Epoch: 2/10
	Train Loss: 0.1133
	Test Loss: 0.0887
Epoch: 3/10
	Train Loss: 0.0590
	Test Loss: 0.0785
Epoch: 4/10
	Train Loss: 0.0360
	Test Loss: 0.0773
Epoch: 5/10
	Train Loss: 0.0206
	Test Loss: 0.0763
Epoch: 6/10
	Train Loss: 0.0226
	Test Loss: 0.0799
Epoch: 7/10
	Train Loss: 0.0100
	Test Loss: 0.0758
Epoch: 8/10
	Train Loss: 0.0072
	Test Loss: 0.0841
Epoch: 9/10
	Train Loss: 0.0065
	Test Loss: 0.0869
Epoch: 10/10
	Train Loss: 0.0073
	Test Loss: 0.0873


In [28]:
_, test_acc = evaluate(model, test_loader, criterion)
print(f"\tTest Accuracy: {test_acc*100:.2f}%")

	Test Accuracy: 97.85%
