# CS4248 Project Notebook

In [None]:
# Environment variables for the kernel process. They are set in this first cell,
# i.e. before the cell that imports torch.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (it is documented
# to make CUDA kernel launches synchronous) — confirm it is still wanted for full
# training runs, where it is expected to slow things down.
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
%env CUDA_LAUNCH_BLOCKING=1

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import torch
from torch import nn, tensor, zeros, argmax
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
import sentencepiece as spm
import random
import torch.optim as optim
from torch.nn import BCELoss
from torch import ones_like, zeros_like, tensor
from tqdm import tqdm
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from IPython.display import display, clear_output

# NLTK helpers shared by the notebook
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed every random number generator used below with the same constant so that
# runs are reproducible (order kept: torch, random, numpy, all CUDA devices)
SEED = 42
for seed_everything in (torch.manual_seed, random.seed, np.random.seed, torch.cuda.manual_seed_all):
    seed_everything(SEED)

In [None]:
# The CSV files have no header row; columns are (label, article text)
column_names = ['category', 'text']

# --- Training data ---------------------------------------------------------
train = pd.read_csv('fulltrain.csv', names=column_names)
X_train_original = train['text']
y_train_original = train['category'].astype('int16')  # int16 labels to reduce memory usage

#! Only the first quarter of the training data is used
quarter = len(X_train_original) // 4
X_train = X_train_original[:quarter]
y_train = y_train_original[:quarter]

# --- Test data -------------------------------------------------------------
test = pd.read_csv('balancedtest.csv', names=column_names)

In [None]:
# CONSTANTS
# All tunable values for the notebook live in this cell; later cells read them as globals.

## HYPERPARAMETERS
vocab_size = 13_000 # SentencePiece vocabulary size. Note that a change in this value will require a retraining of the SentencePiece model
word_limit = 512  # Maximum length (in tokens, including the leading BOS token) of a generated sequence
batch_size = 64  # Batch size for training
epochs = 10  # Number of epochs for adversarial training

## Generator
### Generator hyperparameters
embed_dim = 1024  # Dimensionality of the generator's token embeddings (encoder and decoder)
hidden_dim = 512  # Number of features in the hidden state of the generator LSTMs
enc_drop = 0.2  # Dropout argument passed to the encoder LSTM (NOTE(review): the LSTM is single-layer, so this presumably has no effect — confirm)
dec_drop = 0.2  # Dropout argument passed to the decoder LSTM (same caveat as enc_drop)
temperature = 1 # Temperature (tau) for Gumbel-softmax sampling
lr_gen_optim = 0.05  # Learning rate for generator optimizer
gen_lr_boost = 1  # StepLR gamma for the generator optimizer (1.0 means the learning rate never changes)
gen_lr_boost_freq = 50  # StepLR step_size for the generator: number of scheduler steps (one per batch) between multiplications by gamma

## Discriminator1
### Discriminator1 hyperparameters
disc1_embed_dim = 256  # Dimensionality of the discriminator's token embeddings (input size of its LSTM)
disc1_hidden_dim = 128  # Number of features in the hidden state of the discriminator LSTM (input size of its final linear layer)
lr_disc1_optim = 0.0005  # Learning rate for discriminator1 optimizer
disc1_lr_boost = 1  # StepLR gamma for the discriminator1 optimizer (1.0 means the learning rate never changes)
disc1_lr_boost_freq = 100  # StepLR step_size for discriminator1: number of scheduler steps (one per batch) between multiplications by gamma


## Preprocessing
#### Current
We use `SentencePieceTrainer` to train a SentencePiece model on all of the training data, and then use that model to tokenize the data. The vocabulary size is set by `vocab_size` in the constants cell above (currently 13,000).

Uncomment the two cells below to:
1. Generate a text-only file (`fulltrain_textonly.txt`) from `fulltrain.csv`
2. Train a SentencePiece model on that text-only file

In [None]:
# Used to extract the text from the CSV file

# # Path to your CSV file
# csv_file_path = 'fulltrain.csv'
# # Path to the output text file
# text_file_path = 'fulltrain_textonly.txt'

# # Load the CSV file
# df = pd.read_csv(csv_file_path, names=column_names)

# # Assuming the text column is named 'text'. Adjust if your column name is different
# texts = df['text']

# # Save the text column to a plain text file
# with open(text_file_path, 'w', encoding='utf-8') as f:
#     for text in texts:
#         f.write(text + '\n')




In [None]:
# Used to create the SentencePiece model and save it to a file
# We only need to run this once to create the model file

# spm.SentencePieceTrainer.train(input="fulltrain_textonly.txt", 
#                                model_prefix='spm_model', 
#                                vocab_size=vocab_size, 
#                                max_sentence_length=100_000,
#                                unk_id=0, bos_id=1, eos_id=2, pad_id=3)



The cell below assumes that the trained SentencePiece model file (`spm_model.model`) already exists.

In [None]:
# Load the trained SentencePiece model (expected next to this notebook)
sp = spm.SentencePieceProcessor(model_file='spm_model.model')

# Special-token ids as defined by the SentencePiece model
bos_index = sp.bos_id()
eot_index = sp.eos_id()
pad_index = sp.pad_id()

# Encode every training text as <bos> ... <eos> token ids, one 1-D integer tensor per text
tokens = [
    torch.tensor(sp.encode(text, out_type=int, add_bos=True, add_eos=True), dtype=int)
    for text in X_train
]

# Right-pad all sequences to the longest one -> num_seq * max_seq
padded = pad_sequence(tokens, batch_first=True, padding_value=pad_index)

## Generator

In [None]:
class Generator(nn.Module):
    """LSTM encoder-decoder that samples token sequences.

    The encoder (embedding + LSTM) turns an input batch of token ids into an
    initial (h, c) state; the decoder (embedding + LSTM + linear) then samples
    one token per step with a hard Gumbel-softmax, starting from BOS.

    Submodule attribute names (``encode``, ``dec_embed``, ``dec_lstm``,
    ``dec_softmax``) are part of the checkpoint format and must not change.

    Args:
        vocabulary_size: number of token ids.
        embed_dim: size of the token embeddings.
        hidden_dim: size of the LSTM hidden state.
        enc_drop: ``dropout`` argument of the encoder LSTM.
        dec_drop: ``dropout`` argument of the decoder LSTM.
        temperature: Gumbel-softmax temperature (tau).
        word_limit: default maximum generated length used by ``forward``.
        eot_index: id of the end-of-text token.
        padding_index: id of the padding token.
        bos_index: id of the beginning-of-sequence token.
    """

    def __init__(self, vocabulary_size, embed_dim, hidden_dim, enc_drop, dec_drop, temperature, word_limit, eot_index, padding_index, bos_index):
        super().__init__()
        self.vocabulary_size = vocabulary_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.enc_drop = enc_drop
        self.dec_drop = dec_drop
        self.temperature = temperature
        self.word_limit = word_limit
        self.eot_index = eot_index
        self.padding_index = padding_index
        self.bos_index = bos_index

        # Encoder: nn.Sequential returns whatever its last module returns, i.e. the
        # LSTM's (output, (h_n, c_n)) tuple.
        self.encode = nn.Sequential(
            nn.Embedding(vocabulary_size, embed_dim, padding_idx=padding_index),
            nn.LSTM(embed_dim, hidden_dim, batch_first=True, dropout=enc_drop)
        )

        # Decoder
        self.dec_embed = nn.Embedding(vocabulary_size, embed_dim)
        self.dec_lstm = nn.LSTM(embed_dim, hidden_dim, dropout=dec_drop, batch_first=True)
        self.dec_softmax = nn.Sequential(
            nn.Linear(hidden_dim, vocabulary_size),
            # no softmax layer here: forward() applies Gumbel-softmax to these logits
        )

    def pretrain(self, x):  # x: batch_size * seq_len token ids
        '''
        Teacher-forced pass through the decoder only.

        Returns next-token logits of shape batch_size * seq_len * vocab_size.
        The encoder is not used (and therefore not trained) by this path.
        '''
        x = self.dec_embed(x)    # batch_size * seq_len * embed_dim
        x, _ = self.dec_lstm(x)  # batch_size * seq_len * hidden_dim
        x = self.dec_softmax(x)  # batch_size * seq_len * vocab_size
        return x

    def forward(self, batch_indexes, max_len=None):
        '''
        Generate one sequence per input row.

        Args:
            batch_indexes: batch_size * seq_len tensor of token ids fed to the encoder.
            max_len: length of the generated sequences (including the leading BOS).
                Defaults to ``self.word_limit``.

        Returns:
            batch_size * max_len long tensor on the same device as ``batch_indexes``.
            Each row starts with BOS; positions after a row's first EOT hold the
            padding id. The result is built from argmax indices, so it carries no
            gradient back to the generator.
        '''
        if max_len is None:
            max_len = self.word_limit
        batch_size = batch_indexes.size(0)
        # Follow the input's device instead of relying on a notebook-level global.
        dev = batch_indexes.device

        # Encode the input sequence; (hn, cn) seed the decoder state.
        _, (hn, cn) = self.encode(batch_indexes)  # each 1 * batch_size * hidden_dim

        # Output buffer: all padding, with BOS in the first column.
        samples = torch.full((batch_size, max_len), self.padding_index, dtype=torch.long, device=dev)
        samples[:, 0] = self.bos_index
        # finished[r] becomes True once row r has emitted EOT; later tokens for that
        # row are discarded so the buffer keeps its padding.
        finished = torch.zeros(batch_size, dtype=torch.bool, device=dev)

        prev_word = torch.full((batch_size, 1), self.bos_index, dtype=torch.long, device=dev)  # batch_size * 1

        for i in range(1, max_len):  # column 0 already holds BOS
            word_tensor = self.dec_embed(prev_word)               # batch_size * 1 * embed_dim
            _, (hn, cn) = self.dec_lstm(word_tensor, (hn, cn))    # 1 * batch_size * hidden_dim
            logits = self.dec_softmax(hn).squeeze(0)              # batch_size * vocab_size
            one_hot = F.gumbel_softmax(logits, tau=self.temperature, hard=True)
            index = torch.argmax(one_hot, dim=-1)                 # batch_size
            prev_word = index.unsqueeze(1)

            # Row-aligned write. (masked_scatter_ must not be used here: it consumes
            # its source in order, so once any row has finished the remaining rows
            # would receive tokens that were sampled for other rows.)
            samples[:, i] = torch.where(finished, samples[:, i], index)
            finished |= (index == self.eot_index)

        return samples

## Discriminator 1

In [None]:
class Discriminator1(nn.Module):
    """Binary sequence classifier: embedding -> LSTM -> linear -> sigmoid.

    Takes a batch of token-id sequences and returns one score in (0, 1) per
    sequence, computed from the LSTM's final hidden state.
    """

    def __init__(self, vocabulary_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocabulary_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a [batch_size, seq_length] tensor of token ids; returns [batch_size, 1]."""
        embedded = self.embedding(x)              # [batch_size, seq_length, embedding_dim]
        _, (final_hidden, _) = self.lstm(embedded)  # [num_layers, batch_size, hidden_dim]
        last_layer_state = final_hidden[-1]       # [batch_size, hidden_dim]
        return self.sigmoid(self.linear(last_layer_state))


## Generator - Discriminator 1 Integration

In [None]:
# Models. The discriminator is built before the generator, which keeps the order
# in which the two models draw their random initial parameters.
discriminator1 = Discriminator1(
    vocabulary_size=vocab_size,
    embedding_dim=disc1_embed_dim,
    hidden_dim=disc1_hidden_dim,
).to(device)
generator = Generator(
    vocabulary_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    enc_drop=enc_drop,
    dec_drop=dec_drop,
    temperature=temperature,
    word_limit=word_limit,
    eot_index=eot_index,
    padding_index=pad_index,
    bos_index=bos_index,
).to(device)

# One Adagrad optimizer per model
generator_optimizer = optim.Adagrad(generator.parameters(), lr=lr_gen_optim)
discriminator1_optimizer = optim.Adagrad(discriminator1.parameters(), lr=lr_disc1_optim)

# Step schedulers: multiply the learning rate by `gamma` every `step_size` scheduler steps
generator_scheduler = lr_scheduler.StepLR(
    generator_optimizer, step_size=gen_lr_boost_freq, gamma=gen_lr_boost
)
discriminator1_scheduler = lr_scheduler.StepLR(
    discriminator1_optimizer, step_size=disc1_lr_boost_freq, gamma=disc1_lr_boost
)

# Binary cross-entropy on the discriminator's sigmoid outputs
bce = BCELoss()

### Pretrain Generator

In [None]:
# Load the pretrained generator weights.
# map_location remaps the checkpoint's tensors onto the active device, so a
# checkpoint saved on a GPU machine can still be loaded when only the CPU is available.
generator.load_state_dict(torch.load('generator_pretrain.pth', map_location=device))
generator.train()

In [None]:
# Sample one sequence from the pretrained generator, conditioned on the first
# training example, and print it as text
first_example = padded[0].unsqueeze(0).to(device)  # 1 * max_seq
generated = generator(first_example).squeeze(0)    # word_limit token ids
print(sp.decode(generated.tolist()))

### Pretrain Discriminator 1

In [None]:
# Load the pretrained discriminator weights.
# map_location remaps the checkpoint's tensors onto the active device, so a
# checkpoint saved on a GPU machine can still be loaded when only the CPU is available.
discriminator1.load_state_dict(torch.load('discriminator1_pretrain.pth', map_location=device))
discriminator1.train()

### Combined Train

In [None]:
# Save losses and predictions information for plotting
generator_losses = []
discriminator1_losses = []

discriminator_human_predictions = []
discriminator_generated_predictions = []

In [None]:
# Adversarial training: for every batch, take one generator step and one discriminator step.
#
# NOTE(review): Generator.forward returns integer token ids obtained via argmax, so
# generator_loss has no gradient path back to the generator's parameters; as written,
# generator_optimizer.step() cannot update them. Fixing that needs a differentiable
# generator output (and a discriminator that accepts it), which is outside this cell.
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    pbar = tqdm(range(0, len(padded), batch_size), desc='Training', leave=False)
    for i in pbar:
        iterations = i // batch_size  # index of this batch within the epoch
        # Slicing past the end simply yields a shorter final batch. The global
        # `batch_size` must not be overwritten here: doing so would shrink it for
        # every later epoch and break the `iterations` count.
        batch_indexes = padded[i:i+batch_size].to(device)  # <=batch_size * max_seq
        # Generated length grows from 50 up to word_limit.
        # NOTE(review): this uses the sample offset `i`, not `iterations` — confirm intended.
        seq_limit = min(max(50, i*(epoch+1)), word_limit)

        # Generator loss: the generator wants its samples to be scored as human (target 1)
        gen_batch = generator(batch_indexes, seq_limit)  # already padded to seq_limit
        predicted_fake_d1 = discriminator1(gen_batch)
        generator_loss = bce(predicted_fake_d1, torch.full_like(predicted_fake_d1, 1))

        # Discriminator loss.
        # Human sequences are truncated to the same length limit as the generated ones
        # (a slice end beyond the tensor's width is clamped automatically).
        batch_indexes = batch_indexes[:, :seq_limit]
        predicted_human_d1 = discriminator1(batch_indexes)
        # Score the generated batch in a separate forward pass for the discriminator
        # update. Re-using `predicted_fake_d1.detach()` here would cut the discriminator's
        # own graph, so it would never receive gradient from generated samples.
        predicted_fake_for_disc = discriminator1(gen_batch.detach())
        predictions = torch.cat((predicted_fake_for_disc, predicted_human_d1), dim=0)
        labels = torch.cat(
            (torch.full_like(predicted_fake_for_disc, 0.1), torch.full_like(predicted_human_d1, 0.9)),
            dim=0,
        )  # soft labels: 0.1 for generated, 0.9 for human
        discriminator1_loss = bce(predictions, labels)


        ######## DEBUG REGION ########
        # Redraw the loss curves every 10 iterations
        if iterations % 10 == 0:
            plt.figure(figsize=(10, 5))
            plt.plot(generator_losses, label='Generator Loss')
            plt.plot(discriminator1_losses, label='Discriminator Loss')
            plt.legend()
            plt.xlabel('Iterations')
            plt.ylabel('Loss')
            plt.title('Training Losses')

            clear_output(wait=True)
            display(plt.gcf())  # gcf - Get Current Figure
            plt.close()  # Close the figure to prevent it from being displayed again in the output
        ######## END DEBUG REGION ########

        # Train the generator
        generator_optimizer.zero_grad()
        generator_loss.backward()
        generator_optimizer.step()
        generator_scheduler.step()

        # Train the discriminator (zero_grad also discards the discriminator gradients
        # that generator_loss.backward() accumulated)
        discriminator1_optimizer.zero_grad()
        discriminator1_loss.backward()
        discriminator1_optimizer.step()
        discriminator1_scheduler.step()


        # Store the losses and predictions for plotting
        generator_losses.append(generator_loss.item())
        discriminator1_losses.append(discriminator1_loss.item())
        discriminator_human_predictions.append(predicted_human_d1.mean().item())
        discriminator_generated_predictions.append(predicted_fake_d1.mean().item())

        # Update the progress bar
        pbar.set_postfix({"Human prediction": predicted_human_d1.mean().item(), "Generated prediction": predicted_fake_d1.mean().item(), "Generator Loss": generator_loss.item(), "Discriminator Loss": discriminator1_loss.item()})

        # Cleanup to free memory
        del batch_indexes, gen_batch, predicted_fake_d1, predicted_fake_for_disc, predicted_human_d1, predictions, labels


### Test code region

In [None]:
# generator_test = Generator(vocab_size, embed_dim, hidden_dim, enc_drop, dec_drop, temperature, word_limit, eot_index).to(device)
# discriminator1_test = Discriminator1(vocab_size, embed_dim, hidden_dim).to(device)

# generator_test.load_state_dict(torch.load("generator_test_1_small.pth"))
# generator_test.eval()

# generator_optimizer_test = optim.Adam(generator_test.parameters(), lr=lr_gen_optim)
# discriminator1_optimizer_test = optim.Adam(discriminator1_test.parameters(), lr=lr_disc1_optim)

In [None]:
# Generate 100 tokens for three training examples (rows 4-6) and print each as text
gen_batch = generator(padded[4:7].to(device), 100)
for row_idx in range(3):
    gen_text = sp.decode(gen_batch[row_idx].tolist())
    print(gen_text)

In [None]:
# Discriminator scores: first for the three human examples (rows 4-6), then for
# the sequences generated from them in the previous cell
human_batch = padded[4:7].to(device)
for scored_batch in (human_batch, gen_batch):
    predictions = discriminator1(scored_batch)
    print(predictions)

In [None]:
# Plot the losses

# Plot the generator and discriminator losses recorded during training
plt.figure(figsize=(10, 5))
plt.title("Generator and Discriminator Loss During Training")
plt.plot(generator_losses, label="Generator")
plt.plot(discriminator1_losses, label="Discriminator")
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.legend()
# Save BEFORE showing: with the notebook's inline backend the figure is released by
# plt.show(), so a later savefig would write an empty image.
plt.savefig("losses_1.png")
plt.show()
plt.close()

In [None]:
# Mean discriminator prediction per iteration, for human and for generated batches
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Discriminator & Generator Predictions During Training")
ax.plot(discriminator_human_predictions, label="Human Data")
ax.plot(discriminator_generated_predictions, label="Generated Data")
ax.set_xlabel("Iterations")
ax.set_ylabel("Prediction")
ax.legend()
fig.savefig("predictions_1.png")  # written to disk before the figure is shown
plt.show()

In [None]:
# # generate text
# test = padded[83:99].to(device)
# generated_texts = generator.generate_text(test, sequence_lengths[83:99], len(test))


In [None]:
# for i, text in enumerate(generated_texts):
#     print(f"Generated text {i+1}:")
#     print(text[1:-1])
#     print(sp.decode([token for token in text[1:-1]]))
#     print()

In [None]:
# import torch

# def model_memory_usage_in_MB(model):
#     # Calculate the number of elements in the model parameters
#     num_params = sum(param.numel() for param in model.parameters())
    
#     # Assuming parameters are stored as 32-bit floats (4 bytes each), calculate memory usage in bytes
#     memory_usage_bytes = num_params * 4
    
#     # Convert bytes to megabytes
#     memory_usage_MB = memory_usage_bytes / (1024 ** 2)
    
#     return memory_usage_MB

# generator_memory = model_memory_usage_in_MB(generator)
# discriminator_memory = model_memory_usage_in_MB(discriminator1)

# print(f"Generator Memory Usage: {generator_memory:.2f} MB")
# print(f"Discriminator Memory Usage: {discriminator_memory:.2f} MB")
