# CS4248 Project Notebook

In [16]:
# Ask PyTorch's CUDA caching allocator to use expandable segments
# (per the PyTorch CUDA memory-management notes this is meant to reduce fragmentation).
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Make CUDA kernel launches synchronous so errors surface at the offending call.
# NOTE(review): this is a debugging aid and slows GPU execution — consider removing for real training runs.
%env CUDA_LAUNCH_BLOCKING=1

env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
env: CUDA_LAUNCH_BLOCKING=1


In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import torch
from torch import nn, tensor, zeros, argmax
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
import sentencepiece as spm
import random
import torch.optim as optim
from torch.nn import BCELoss
from torch import ones_like, zeros_like, tensor
from tqdm import tqdm
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from IPython.display import display, clear_output

# Shared NLTK helpers (the WordNet and stopword corpora must already be downloaded)
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

# Run on the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed every random number generator used in this notebook with the same constant
# so that runs are reproducible (order: torch CPU, python, numpy, all CUDA devices)
for _seed_fn in (torch.manual_seed, random.seed, np.random.seed, torch.cuda.manual_seed_all):
    _seed_fn(42)
del _seed_fn

In [None]:
column_names = ['category', 'text']

# The CSV files have no header row, so the column names are supplied explicitly
train = pd.read_csv('fulltrain.csv', names=column_names)
X_train_original = train['text']
y_train_original = train['category'].astype('int16')  # int16 to reduce memory usage

# Pretraining subset: rows from the 1/4 mark up to the 2/5 mark of the training data.
# (The upper bound was originally the 1/2 mark, i.e. the full second quarter; it was
# lowered to 2 * (n // 5) because of memory constraints.)
pretrain_rows = slice(len(X_train_original) // 4, 2 * (len(X_train_original) // 5))
X_pretrain = X_train_original.iloc[pretrain_rows]
y_pretrain = y_train_original.iloc[pretrain_rows]

# Drop the references to the full training set so its memory can be reclaimed
del X_train_original, y_train_original, train, pretrain_rows

test = pd.read_csv('balancedtest.csv', names=column_names)

In [None]:
# CONSTANTS
# NOTE: this cell must run before the Generator class cell, because Generator.forward
# reads `word_limit` when the class is defined.

## HYPERPARAMETERS
vocab_size = 13_000 # SentencePiece vocabulary size; changing this value requires retraining the SentencePiece model
word_limit = 512  # Maximum number of tokens to generate (also caps the curriculum sequence length)
epochs = 10  # Number of epochs for adversarial training
batch_size = 64  # Batch size

## Generator
### Generator pretrain
gen_pretrain_epochs = 10  # Number of epochs for generator pretraining
gen_pretrain_lr = 0.05  # Learning rate (Adam) for generator pretraining
gen_pretrain_batch_size = 32  # Batch size for generator pretraining
### Generator hyperparameters
embed_dim = 1024  # Dimensionality of the generator's token embeddings
hidden_dim = 512  # Number of features in the hidden state of the generator LSTMs
enc_drop = 0.2  # Dropout rate passed to the encoder LSTM
dec_drop = 0.2  # Dropout rate passed to the decoder LSTM
temperature = 1 # Gumbel-softmax temperature (tau) used when sampling tokens
lr_gen_optim = 0.5  # Learning rate for generator optimizer
gen_lr_boost = 1.001  # Learning rate boost for generator optimizer (1.0 means no boost)
gen_lr_boost_freq = 100  # Frequency of learning rate boost for generator optimizer

## Discriminator1
### Discriminator1 pretrain
disc1_pretrain_epochs = 5  # Number of epochs for discriminator1 pretraining
disc1_pretrain_lr = 0.05  # Learning rate (Adam) for discriminator1 pretraining
disc1_pretrain_batch_size = 64  # Batch size for discriminator1 pretraining
### Discriminator1 hyperparameters
disc1_embed_dim = 256  # Dimensionality of discriminator1's token embeddings
disc1_hidden_dim = 128  # Number of features in the hidden state of discriminator1's LSTM
lr_disc1_optim = 0.0005  # Learning rate for discriminator1 optimizer
disc1_lr_boost = 1.001  # Learning rate boost for discriminator1 optimizer (1.0 means no boost)
disc1_lr_boost_freq = 100  # Frequency of learning rate boost for discriminator1 optimizer


## Preprocessing
#### Current
We use `SentencePieceTrainer` to train a SentencePiece model on all of the training data, with a vocabulary size of 13,000 (`vocab_size = 13_000`). We then use this model to tokenize the data.

The cell below assumes that the trained SentencePiece model file (`spm_model.model`) already exists in the same directory as this notebook.

In [None]:
# Load the trained SentencePiece tokenizer; the model file is expected to sit next to this notebook
sp = spm.SentencePieceProcessor(model_file='spm_model.model')

# Special-token ids as reported by the tokenizer model
bos_index = sp.bos_id()
eot_index = sp.eos_id()
pad_index = sp.pad_id()
# NOTE(review): SentencePiece reports -1 for a special token the model was trained without;
# a negative pad id would not be a valid embedding index — confirm the model defines a pad id.

# Encode every pretraining text as <bos> tokens <eos> and right-pad all of them to the
# length of the longest sequence
padded_pretrain = pad_sequence(
    [
        torch.tensor(sp.encode(text, out_type=int, add_bos=True, add_eos=True), dtype=torch.long)
        for text in X_pretrain
    ],
    batch_first=True,
    padding_value=pad_index,
)  # num_seq * max_seq

## Generator

In [None]:
class Generator(nn.Module):
    '''
    LSTM encoder-decoder that generates token sequences.

    The encoder (embedding + LSTM) turns an input batch of token ids into the initial
    (hidden, cell) state of the decoder. The decoder then emits one token per step,
    sampling from its output logits with a hard Gumbel-softmax.

    Args:
        vocabulary_size: number of token ids (size of the embeddings and of the output layer)
        embed_dim: dimensionality of the token embeddings
        hidden_dim: number of features in the hidden state of both LSTMs
        enc_drop: dropout value passed to the encoder LSTM
        dec_drop: dropout value passed to the decoder LSTM
        temperature: Gumbel-softmax temperature (tau) used when sampling
        word_limit: default maximum number of tokens produced by forward()
        eot_index: id of the end-of-text token; generation for a row stops after it
        padding_index: id of the padding token used to fill unused positions
        bos_index: id of the begin-of-sequence token placed at position 0
    '''
    def __init__(self, vocabulary_size, embed_dim, hidden_dim, enc_drop, dec_drop, temperature, word_limit, eot_index, padding_index, bos_index):
        super(Generator, self).__init__()
        self.vocabulary_size = vocabulary_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.enc_drop = enc_drop
        self.dec_drop = dec_drop
        self.temperature = temperature
        self.word_limit = word_limit
        self.eot_index = eot_index
        self.padding_index = padding_index
        self.bos_index = bos_index

        # Encoder. The LSTM is the last module of the Sequential, so calling
        # self.encode(...) returns the LSTM's (output, (h_n, c_n)) tuple.
        # NOTE: the LSTMs here have a single layer; PyTorch only applies `dropout`
        # between stacked layers, so the dropout values currently have no effect
        # (PyTorch emits a UserWarning about this when they are non-zero).
        self.encode = nn.Sequential(
            nn.Embedding(vocabulary_size, embed_dim, padding_idx=padding_index),
            nn.LSTM(embed_dim, hidden_dim, batch_first=True, dropout=enc_drop)
        )

        # Decoder
        self.dec_embed = nn.Embedding(vocabulary_size, embed_dim)
        self.dec_lstm = nn.LSTM(embed_dim, hidden_dim, dropout=dec_drop, batch_first=True)
        self.dec_softmax = nn.Sequential(
            nn.Linear(hidden_dim, vocabulary_size),
            # no softmax layer: this produces raw logits, Gumbel-softmax is applied in forward()
        )

    def pretrain(self, x): # x: batch_size * seq_len
        '''
        Teacher-forced decoder pass used for pretraining.

        Every input token is fed to the decoder and the logits for the following token
        are returned. Only the decoder modules (dec_embed, dec_lstm, dec_softmax) are
        used here, so the encoder is not trained by this path.

        Args:
            x: LongTensor of token ids, batch_size * seq_len
        Returns:
            Logits of shape batch_size * seq_len * vocab_size
        '''
        x = self.dec_embed(x) # batch_size * seq_len * embed_dim
        x, _ = self.dec_lstm(x) # batch_size * seq_len * hidden_dim
        x = self.dec_softmax(x) # batch_size * seq_len * vocab_size
        return x

    def forward(self, batch_indexes, max_len=None):
        '''
        Generate one sequence of at most max_len tokens per input row.

        Args:
            batch_indexes: LongTensor of token ids (batch_size * seq_len) fed to the encoder
            max_len: maximum generated length including the leading BOS token;
                defaults to the word_limit given to the constructor
        Returns:
            LongTensor of shape batch_size * max_len. Position 0 holds the BOS token and
            every position after a row's end-of-text token holds the padding token.
        '''
        if max_len is None:
            # Use the limit this instance was built with rather than a notebook-level global
            max_len = self.word_limit
        batch_size = batch_indexes.size(0)
        dev = batch_indexes.device  # keep every new tensor on the same device as the input

        # encode the input sequence
        _, (hn, cn) = self.encode(batch_indexes) # 1 * batch_size * hidden_dim

        # Samples hold the generated sequences, we fill with padding to initialize
        samples = torch.full((batch_size, max_len), self.padding_index, dtype=torch.long, device=dev) # batch_size * max_len
        samples[:, 0] = self.bos_index # set the first token to the BOS token
        mask = torch.ones_like(samples, dtype=torch.bool)  # True where a position may still be written

        # The decoder is primed with the BOS token for every row
        prev_word = torch.full((batch_size, 1), self.bos_index, dtype=torch.long, device=dev) # batch_size * 1

        for i in range(1, max_len): # Start from 1 because we have already set the first token
            # Generate next word
            word_tensor = self.dec_embed(prev_word) # batch_size * 1 * embed_dim
            _ , (hn, cn) = self.dec_lstm(word_tensor, (hn, cn)) # 1 * batch_size * hidden_dim
            del word_tensor
            dist = self.dec_softmax(hn) # 1 * batch_size * vocab_size
            dist = dist.squeeze(0) # batch_size * vocab_size
            dist = F.gumbel_softmax(dist, tau=self.temperature, hard=True) # one-hot, batch_size * vocab_size
            index = torch.argmax(dist, dim=-1) # batch_size
            prev_word = index.unsqueeze(1)

            # Write each row's own token where the row is still active. torch.where selects
            # element-wise; masked_scatter_ would instead consume the source consecutively and
            # hand rows tokens that were sampled for other rows once any row has finished.
            samples[:, i] = torch.where(mask[:, i], index, samples[:, i])

            # Rows that just produced end-of-text may not be written at any later position
            eot_generated = index == self.eot_index
            if i+1 < max_len:
                mask[:, i+1:] &= ~eot_generated.unsqueeze(1)

        return samples

## Discriminator 1

In [None]:
class Discriminator1(nn.Module):
    """
    LSTM binary classifier over token-id sequences.

    Tokens are embedded, run through a single LSTM, and the final hidden state is
    mapped by a linear layer and a sigmoid to one score in (0, 1) per sequence.
    """

    def __init__(self, vocabulary_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocabulary_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of token ids [batch_size, seq_length]; returns [batch_size, 1]."""
        embedded = self.embedding(x)  # [batch_size, seq_length, embedding_dim]

        # The LSTM returns (output, (h_n, c_n)); only the final hidden state is needed
        final_hidden = self.lstm(embedded)[1][0]  # [num_layers, batch_size, hidden_dim]
        last_layer_hidden = final_hidden[-1]  # [batch_size, hidden_dim]

        score = self.linear(last_layer_hidden)  # [batch_size, 1]
        return self.sigmoid(score)


## Generator - Discriminator 1 Integration

In [None]:
# Build both networks from the hyperparameters above and move them to the selected device.
# The discriminator is created first, so parameter initialisation consumes the seeded RNG in a fixed order.
discriminator1 = Discriminator1(
    vocabulary_size=vocab_size,
    embedding_dim=disc1_embed_dim,
    hidden_dim=disc1_hidden_dim,
).to(device)
generator = Generator(
    vocabulary_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    enc_drop=enc_drop,
    dec_drop=dec_drop,
    temperature=temperature,
    word_limit=word_limit,
    eot_index=eot_index,
    padding_index=pad_index,
    bos_index=bos_index,
).to(device)

# Binary cross-entropy on the discriminator's sigmoid outputs
bce = nn.BCELoss()

### Pretrain Generator

In [None]:
# Adam over every generator parameter, used by the generator pretraining loop
optimizer = torch.optim.Adam(params=generator.parameters(), lr=gen_pretrain_lr)

In [None]:
# Batches of rows from the padded token matrix (num_seq * max_seq), reshuffled every epoch
human_train = DataLoader(padded_pretrain, batch_size=gen_pretrain_batch_size, shuffle=True)

# Teacher-forced next-token pretraining of the generator's decoder
for epoch in range(gen_pretrain_epochs):
    pbar = tqdm(enumerate(human_train), total=len(human_train), desc=f"Epoch {epoch+1}/{gen_pretrain_epochs}")
    for i, sequences in pbar:
        seq_limit = min(max(50, i*(epoch+1)), word_limit) # Curriculum learning: Increase the sequence length as the training progresses
        # Forward pass
        inputs = sequences[:, :-1]  # Exclude the last token for input
        targets = sequences[:, 1:]  # Exclude the first token for targets
        inputs, targets = inputs[:, :seq_limit], targets[:, :seq_limit]  # Limit the sequence length
        inputs, targets = inputs.to(device), targets.to(device)
        logits = generator.pretrain(inputs)  # batch_size * seq_len * vocab_size

        # Compute loss. cross_entropy expects the class dimension second, hence the transpose.
        # Padding targets are ignored so the loss is averaged over real tokens only;
        # otherwise short texts would mostly train the decoder to predict padding.
        loss = F.cross_entropy(logits.transpose(1, 2), targets, ignore_index=pad_index)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        pbar.set_postfix({"Loss": loss.item(), "Sequence Length": seq_limit})

In [None]:
# Persist the pretrained generator weights (state dict only) next to the notebook
torch.save(obj=generator.state_dict(), f='generator_pretrain.pth')

In [None]:
# Sanity check: condition the pretrained generator on the first pretraining text
# (as a batch of one) and print the decoded sample
generated = generator(padded_pretrain[0].unsqueeze(0).to(device)).squeeze(0)
print(sp.decode(generated.tolist()))

### Pretrain Discriminator 1

In [None]:
# Batches of human-written token sequences, reshuffled every epoch
human_train = DataLoader(padded_pretrain, batch_size=disc1_pretrain_batch_size, shuffle=True)
discriminator1_optimizer_pretrain = optim.Adam(discriminator1.parameters(), lr=disc1_pretrain_lr)

# Pretrain discriminator1 to separate human text from the generator's samples
for epoch in range(disc1_pretrain_epochs):
    pbar = tqdm(enumerate(human_train), total=len(human_train), desc=f"Epoch {epoch+1}/{disc1_pretrain_epochs}")
    for i, human_data in pbar:
        seq_limit = min(max(50, i*(epoch+1)), word_limit) # Curriculum learning: Increase the sequence length as the training progresses
        human_data = human_data[:, :seq_limit].to(device)  # truncate first so less data is copied to the device

        # The generator is only sampled here, never updated, and its output is a tensor of
        # integer token ids. Running it without autograd avoids keeping the LSTM graph of
        # every decoding step in memory.
        with torch.no_grad():
            generated = generator(human_data, seq_limit)

        human_pred = discriminator1(human_data)  # batch_size * 1
        generated_pred = discriminator1(generated)  # batch_size * 1
        pred = torch.cat((human_pred, generated_pred), dim=0)
        # Soft labels: 0.9 for human text, 0.1 for generated text
        labels = torch.cat((torch.full_like(human_pred, 0.9), torch.full_like(generated_pred, 0.1)), dim=0)

        discriminator1_optimizer_pretrain.zero_grad()
        loss = bce(pred.squeeze(1), labels.squeeze(1))
        loss.backward()
        discriminator1_optimizer_pretrain.step()

        pbar.set_postfix({"Human prediction": human_pred.mean().item(), "Generated prediction": generated_pred.mean().item(), "Loss": loss.item(), "Sequence length": seq_limit})
        del labels, human_data, generated, loss

In [None]:
# Persist the pretrained discriminator1 weights (state dict only) next to the notebook
torch.save(obj=discriminator1.state_dict(), f='discriminator1_pretrain.pth')