# CS4248 Project Notebook

In [None]:
# CUDA-related environment variables, set via the IPython %env magic.
# NOTE(review): these presumably have to be set before torch initialises CUDA, so keep this
# cell ahead of the imports. CUDA_LAUNCH_BLOCKING=1 is a debugging aid (synchronous kernel
# launches) and is expected to slow training down — confirm it is still wanted.
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
%env CUDA_LAUNCH_BLOCKING=1

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import torch
from torch import nn, tensor, zeros, argmax, arange
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
import sentencepiece as spm
import matplotlib.pyplot as plt

# NLTK helpers; neither `lemmatizer` nor `stop_words` is referenced by the visible cells below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
column_names = ['category', 'text']

# `names=` is passed without `header=`, so pandas treats the first CSV row as data.
train = pd.read_csv('fulltrain.csv', names=column_names)
X_train = train['text']
y_train = train['category']
y_train = y_train.astype('int16')

#! use the first 1/4 of the training data (plain head slice, no shuffling)
X_train = X_train[:len(X_train)//4]
y_train = y_train[:len(y_train)//4]

# Second CSV with the same two columns; `test` is not referenced by the visible cells below.
test = pd.read_csv('balancedtest.csv', names=column_names)

In [None]:
# CONSTANTS

# HYPERPARAMETERS
# Vocabulary size: passed to the SentencePiece trainer (commented-out cell) and used as the
# input width of every vocabulary-sized layer in the models below.
vocab_size = 13_000
# Generator
global_batch_size = 64  # Batch size
embed_dim = 1024  # Dimensionality of word embeddings
hidden_dim = 512  # Number of features in the hidden state of the LSTM
enc_drop = 0.2  # Dropout rate for the encoder LSTM
dec_drop = 0.2  # Dropout rate for the decoder LSTM
temperature = 1 # Temperature for sampling
word_limit = 512  # Maximum number of words to generate TODO: for first batch only
# NOTE(review): 0.5 is passed to Adam as the generator learning rate — looks very large; confirm.
lr_gen_optim = 0.5  # Learning rate for generator optimizer

# Discriminator1
# NOTE(review): lstm_hidden_dim, dense_dim and dropout_prob are not referenced by the visible
# code; Discriminator1 is constructed with embed_dim / hidden_dim instead — confirm which is intended.
lstm_hidden_dim = 64  # Number of features in the hidden state of the LSTM
dense_dim = 32  # Number of features in the dense layer
dropout_prob = 0.2  # Dropout rate
lr_disc1_optim = 0.000005  # Learning rate for discriminator1 optimizer

# Cooperator
coop_embed_dim = 1024  # Width of the cooperator's input projection
coop_hidden_dim = 512  # Hidden size of the cooperator LSTM
coop_dropout = 0.2  # Dropout argument passed to the cooperator LSTM
coop_lr = 0.01  # Learning rate for the cooperator optimizer

# Discriminator 2
disc2_embed_dim = 512  # Width of discriminator2's input projection
disc2_hidden_dim = 256  # Hidden size of the discriminator2 LSTM
disc2_dropout = 0.4  # Dropout argument passed to the discriminator2 LSTM
disc2_lr = 0.01  # Learning rate for the discriminator2 optimizer

epochs = 25  # Number of epochs

## Preprocessing
#### Current
We use `SentencePieceTrainer` to train a SentencePiece model on all of the training data, and then use this model to tokenize the data. The vocabulary size is set by `vocab_size` in the hyperparameter cell (currently 13,000).

In [None]:
# Used to extract the text from the CSV file

# # Path to your CSV file
# csv_file_path = 'fulltrain.csv'
# # Path to the output text file
# text_file_path = 'fulltrain_textonly.txt'

# # Load the CSV file
# df = pd.read_csv(csv_file_path, names=column_names)

# # Assuming the text column is named 'text'. Adjust if your column name is different
# texts = df['text']

# # Save the text column to a plain text file
# with open(text_file_path, 'w', encoding='utf-8') as f:
#     for text in texts:
#         f.write(text + '\n')




In [None]:
# Used to create the SentencePiece model and save it to a file
# We only need to run this once to create the model file

# spm.SentencePieceTrainer.train(input="fulltrain_textonly.txt", 
#                                model_prefix='spm_model', 
#                                vocab_size=vocab_size, 
#                                max_sentence_length=100_000,
#                                unk_id=0, bos_id=1, eos_id=2, pad_id=3)



## Generator

In [None]:
class Generator(nn.Module):
    """LSTM encoder/decoder that re-generates a token sequence steered towards a target class.

    The encoder reads the input token ids; its final hidden and cell states are projected into
    one slot per target class, and the slot selected by ``batch_target_class`` initialises the
    decoder. The decoder then samples one token per step, starting from BOS, until ``max_len``
    positions are filled. Rows that have produced the end-of-text token keep padding afterwards.

    Args:
        vocabulary_size: Number of token ids.
        embed_dim: Width of the encoder embedding and of the decoder input projection.
        hidden_dim: Hidden size of both LSTMs.
        enc_drop: ``dropout`` argument of the encoder LSTM.
        dec_drop: ``dropout`` argument of the decoder LSTM.
        temperature: Sampling temperature used by ``forward``.
        word_limit: Default number of positions generated by ``forward``.
        eot_index: Token id that ends a generated sequence.
        padding_index: Token id used to fill unused positions.
        bos_index: Token id placed at position 0 of every generated sequence.
    """

    # Number of target classes; the encoder state is projected into one slot per class.
    NUM_CLASSES = 4

    def __init__(self, vocabulary_size, embed_dim, hidden_dim, enc_drop, dec_drop, temperature, word_limit, eot_index, padding_index, bos_index):
        super(Generator, self).__init__()
        self.vocabulary_size = vocabulary_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.enc_drop = enc_drop
        self.dec_drop = dec_drop
        self.temperature = temperature
        self.word_limit = word_limit
        self.eot_index = eot_index
        self.padding_index = padding_index
        self.bos_index = bos_index

        # Encoder: the LSTM is the last stage, so calling `encode` returns (output, (h_n, c_n)).
        # NOTE: nn.LSTM applies `dropout` only between stacked layers, so with the default
        # single layer it has no effect (PyTorch emits a warning).
        self.encode = nn.Sequential(
            nn.Embedding(vocabulary_size, embed_dim, padding_idx=padding_index),
            nn.LSTM(embed_dim, hidden_dim, batch_first=True, dropout=enc_drop)
        )

        # Project the encoder state into NUM_CLASSES class-specific states.
        self.transform_hidden = nn.Linear(hidden_dim, hidden_dim * self.NUM_CLASSES)
        self.transform_cell = nn.Linear(hidden_dim, hidden_dim * self.NUM_CLASSES)

        # Decoder: consumes a distribution over the vocabulary rather than a token id.
        self.dec_embed = nn.Linear(vocabulary_size, embed_dim)
        self.dec_lstm = nn.LSTM(embed_dim, hidden_dim, dropout=dec_drop, batch_first=True)
        self.dec_softmax = nn.Sequential(
            nn.Linear(hidden_dim, vocabulary_size),
            nn.Softmax(dim=-1)
        )

    def temperature_sampling(self, probabilities, temperature=1.0):
        """Pick one token id per row of ``probabilities``.

        Args:
            probabilities: Tensor whose last dimension is a probability distribution.
            temperature: Values <= 0 select the arg-max (greedy); otherwise the distribution
                is re-scaled by ``1 / temperature`` in log space and sampled.

        Returns:
            Long tensor with a trailing dimension of size 1 holding the chosen ids.
        """
        if temperature <= 0:
            # keepdim keeps the same (..., 1) shape as the multinomial branch, so greedy
            # decoding also works for batches.
            return torch.argmax(probabilities, dim=-1, keepdim=True)
        logits = torch.log(probabilities) / temperature
        scaled_probs = F.softmax(logits, dim=-1)
        return torch.multinomial(scaled_probs, 1)

    def forward(self, batch_indexes, batch_target_class, max_len=None):
        """Generate one sequence per input row.

        Args:
            batch_indexes: Long tensor of token ids, batch_size * seq_len.
            batch_target_class: Class id per row (tensor, sequence, or a single int applied
                to every row), each in ``[0, NUM_CLASSES)``.
            max_len: Number of positions to generate, including the leading BOS token.
                Defaults to ``self.word_limit``.

        Returns:
            Long tensor, batch_size * max_len, on the same device as ``batch_indexes``.
        """
        if max_len is None:
            max_len = self.word_limit
        batch_size = batch_indexes.size(0)
        device = batch_indexes.device

        # encode the input sequence
        _, (hn, cn) = self.encode(batch_indexes)  # 1 * batch_size * hidden_dim

        # transform the hidden and cell state and keep the slot of each row's target class
        rows = torch.arange(batch_size, device=device)
        target = torch.as_tensor(batch_target_class, device=device)
        state_shape = (batch_size, self.NUM_CLASSES, self.hidden_dim)
        hn = self.transform_hidden(hn).reshape(state_shape)[rows, target].unsqueeze(0)
        cn = self.transform_cell(cn).reshape(state_shape)[rows, target].unsqueeze(0)

        # Samples hold the generated sequences; start fully padded with BOS in position 0
        samples = torch.full((batch_size, max_len), self.padding_index, dtype=torch.long, device=device)
        samples[:, 0] = self.bos_index
        # finished[r] becomes True once row r has sampled the end-of-text token
        finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

        # Distribution fed to the decoder: one-hot on the BOS token for the first step
        dist = torch.zeros(batch_size, self.vocabulary_size, device=device)
        dist[:, self.bos_index] = 1

        for i in range(1, max_len):  # position 0 already holds BOS
            word_tensor = self.dec_embed(dist)  # batch_size * embed_dim
            _, (hn, cn) = self.dec_lstm(word_tensor.unsqueeze(1), (hn, cn))
            dist = self.dec_softmax(hn).squeeze(0)  # batch_size * vocab_size
            index = self.temperature_sampling(dist, self.temperature).view(-1)  # batch_size

            # Write row-by-row: finished rows keep their padding, active rows take their own
            # sample. (masked_scatter_ would hand out source values consecutively and so
            # shift tokens between rows once any row has finished.)
            samples[:, i] = torch.where(finished, samples[:, i], index)
            finished = finished | (index == self.eot_index)

        return samples

## Discriminator 1

In [None]:
import torch
import torch.nn as nn

class Discriminator1(nn.Module):
    """LSTM binary classifier scoring a token sequence (sigmoid output in (0, 1)).

    Args:
        vocabulary_size: Number of token ids (input width of the first layer).
        embedding_dim: Width of the per-token projection fed to the LSTM.
        hidden_dim: Hidden size of the LSTM.
        padding_index: Accepted for interface compatibility; not used by this model,
            so padded positions are processed like any other token.
    """

    def __init__(self, vocabulary_size, embedding_dim, hidden_dim, padding_index):
        super(Discriminator1, self).__init__()
        # A Linear layer (not nn.Embedding) so that soft distributions over the
        # vocabulary can be fed in as well as one-hot vectors.
        self.embedding = nn.Linear(vocabulary_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of sequences.

        Args:
            x: Either a float tensor [batch_size, seq_length, vocabulary_size] (one-hot or
                soft distributions) or an integer tensor [batch_size, seq_length] of token ids.

        Returns:
            Tensor [batch_size, 1] of sigmoid scores.
        """
        if torch.is_floating_point(x):
            x = self.embedding(x)  # Shape: [batch_size, seq_length, embedding_dim]
        else:
            # Token ids: selecting weight columns and adding the bias equals applying the
            # Linear layer to one-hot vectors, without materialising them.
            x = self.embedding.weight.t()[x] + self.embedding.bias

        _, (hn, _) = self.lstm(x)  # hn shape: [1, batch_size, hidden_dim] for 1 layer LSTM

        hn = hn[-1]  # Shape: [batch_size, hidden_dim]

        return self.sigmoid(self.linear(hn))  # Shape: [batch_size, 1]


## Generator-Discriminator 1 Integration

In [None]:
# Load the trained SentencePiece model and look up the ids of its special tokens.
sp = spm.SentencePieceProcessor(model_file='spm_model.model')
eot_index, pad_index, bos_index = sp.eos_id(), sp.pad_id(), sp.bos_id()

# Encode every training text as BOS ... EOS token ids, one integer tensor per text.
tokens = [
    torch.tensor(sp.encode(text, out_type=int, add_bos=True, add_eos=True), dtype=int)
    for text in X_train
]
# Right-pad all sequences to the length of the longest one: num_seq * max_seq
padded = pad_sequence(tokens, batch_first=True, padding_value=pad_index)

In [None]:
import torch.optim as optim
from torch.nn import BCELoss
from torch import ones_like, zeros_like, tensor
from tqdm import tqdm

# Binary cross-entropy on discriminator1's sigmoid output.
bce = BCELoss()

# Build both networks on the selected device, each with its own Adam optimizer.
generator = Generator(vocab_size, embed_dim, hidden_dim, enc_drop, dec_drop, temperature, word_limit, eot_index, pad_index, bos_index).to(device)
# NOTE(review): discriminator1 is built with the generator's embed_dim / hidden_dim rather than
# the Discriminator1 hyperparameters (lstm_hidden_dim, dense_dim) — confirm this is intended.
discriminator1 = Discriminator1(vocab_size, embed_dim, hidden_dim, pad_index).to(device)
generator_optimizer = optim.Adam(generator.parameters(), lr=lr_gen_optim)
discriminator1_optimizer = optim.Adam(discriminator1.parameters(), lr=lr_disc1_optim)

In [None]:
# Per-batch loss history of the generator / discriminator1 loop (plotted at the end).
# Re-running this cell resets the history.
generator_losses = []
discriminator1_losses = []

In [None]:
# Adversarial training of the generator against discriminator1.
# NOTE(review): `generator` returns sampled token ids, so no gradient can flow from the
# discriminator score back into the generator parameters; the generator step below only
# becomes effective once the generator emits something differentiable (or a policy-gradient
# style loss is used). discriminator1 must also be able to consume token-id tensors.
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    for i in tqdm(range(0, len(padded), global_batch_size), desc="Training", leave=False):
        batch_size = min(global_batch_size, len(padded) - i)
        batch_indexes = padded[i:i+batch_size].to(device)  # Get a batch of sequences batch_size * max_seq
        # Generator.forward requires a target class per sequence; draw one of the 4 classes
        # at random, as the complete-integration loop does.
        batch_target_class = torch.randint(4, (batch_size, ))

        # Train the generator: it wants discriminator1 to label its output as real (1)
        generator_optimizer.zero_grad()
        gen_batch = generator(batch_indexes, batch_target_class)  # no need to pad generator's output as it is already padded
        predicted_fake_d1 = discriminator1(gen_batch)
        generator_loss = bce(predicted_fake_d1, ones_like(predicted_fake_d1))
        generator_loss.backward()
        generator_optimizer.step()

        generator_losses.append(generator_loss.item())  # Store generator loss

        # Train the discriminator1. zero_grad() drops the gradients the generator step left
        # on the discriminator; the fakes are scored again so that this loss has its own
        # graph (a detached prediction would contribute no gradient for the fake half).
        discriminator1_optimizer.zero_grad()
        predicted_fake_d1 = discriminator1(gen_batch.detach())
        predicted_human_d1 = discriminator1(batch_indexes)
        predictions = torch.cat((predicted_fake_d1, predicted_human_d1), dim=0)
        labels = torch.cat((torch.full_like(predicted_fake_d1, 0.1), torch.full_like(predicted_human_d1, 0.9)), dim=0)  # 0.1 for fake, 0.9 for real to prevent fluctuations
        discriminator1_loss = bce(predictions, labels)
        discriminator1_loss.backward()
        discriminator1_optimizer.step()

        discriminator1_losses.append(discriminator1_loss.item())  # Store discriminator loss

        # Cleanup to free memory
        del batch_indexes, batch_target_class, gen_batch, predicted_fake_d1, predicted_human_d1, predictions, labels



torch.save(generator.state_dict(), "generator_test_1.pth")
torch.save(discriminator1.state_dict(), "discriminator1_test_1.pth")

## Cooperator

In [None]:
class Cooperator(nn.Module):
    """LSTM classifier producing a probability distribution over the 4 classes.

    Args:
        vocab_size: Number of token ids (input width of the first layer).
        embed_dim: Width of the per-token projection fed to the LSTM.
        hidden_dim: Hidden size of the LSTM.
        dropout: ``dropout`` argument of the LSTM. nn.LSTM applies it only between stacked
            layers, so with this single-layer LSTM it has no effect.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout):
        super(Cooperator, self).__init__()
        self.embedding = nn.Linear(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, dropout=dropout)
        self.linear = nn.Linear(hidden_dim, 4)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Either a float tensor [batch_size, seq_length, vocab_size] (one-hot or soft
                distributions) or an integer tensor [batch_size, seq_length] of token ids.

        Returns:
            Tensor [batch_size, 4] of class probabilities (rows sum to 1).
            NOTE(review): the notebook feeds this into nn.CrossEntropyLoss, which expects
            unnormalised logits — confirm whether probabilities are intended here.
        """
        if torch.is_floating_point(x):
            x = self.embedding(x)
        else:
            # Token ids: same result as the Linear layer applied to one-hot vectors.
            x = self.embedding.weight.t()[x] + self.embedding.bias
        _, (hn, _) = self.lstm(x)
        x = self.linear(hn[0])  # hn[0]: final hidden state of the single layer
        return self.softmax(x)

## Discriminator 2

In [None]:
class Discriminator2(nn.Module):
    """LSTM classifier producing a probability distribution over the 4 classes.

    Args:
        vocab_size: Number of token ids (input width of the first layer).
        embed_dim: Width of the per-token projection fed to the LSTM.
        hidden_dim: Hidden size of the LSTM.
        dropout: ``dropout`` argument of the LSTM. nn.LSTM applies it only between stacked
            layers, so with this single-layer LSTM it has no effect.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, dropout):
        # Must name this class: super(Cooperator, self) raises TypeError because a
        # Discriminator2 instance is not a Cooperator.
        super(Discriminator2, self).__init__()
        self.embedding = nn.Linear(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, dropout=dropout)
        self.linear = nn.Linear(hidden_dim, 4)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Either a float tensor [batch_size, seq_length, vocab_size] (one-hot or soft
                distributions) or an integer tensor [batch_size, seq_length] of token ids.

        Returns:
            Tensor [batch_size, 4] of class probabilities (rows sum to 1).
        """
        if torch.is_floating_point(x):
            x = self.embedding(x)
        else:
            # Token ids: same result as the Linear layer applied to one-hot vectors.
            x = self.embedding.weight.t()[x] + self.embedding.bias
        _, (hn, _) = self.lstm(x)
        x = self.linear(hn[0])  # hn[0]: final hidden state of the single layer
        return self.softmax(x)

In [None]:
def index_to_binary(batch_indexes, num_classes=None):
    """One-hot encode a batch of token-id sequences.

    Args:
        batch_indexes: Integer tensor of shape (batch_size, seq_size).
        num_classes: Size of the one-hot dimension; defaults to the notebook-level
            ``vocab_size``.

    Returns:
        Float tensor of shape (batch_size, seq_size, num_classes), on the same device as
        ``batch_indexes``, with a single 1 per position.
    """
    if num_classes is None:
        num_classes = vocab_size
    batch_size, seq_size = batch_indexes.shape
    batch_binary = torch.zeros((batch_size, seq_size, num_classes), device=batch_indexes.device)
    # scatter_ along the last axis sets exactly one entry per (row, position); this works
    # for any batch/sequence size, unlike index broadcasting with mismatched aranges.
    batch_binary.scatter_(2, batch_indexes.long().unsqueeze(-1), 1.0)
    return batch_binary

## Complete Integration

In [None]:
from torch import optim, nn, randint, cat, zeros, ones

# Build the two class-prediction networks and their Adam optimizers.
cooperator = Cooperator(vocab_size, coop_embed_dim, coop_hidden_dim, coop_dropout).to(device)
discriminator2 = Discriminator2(vocab_size, disc2_embed_dim, disc2_hidden_dim, disc2_dropout).to(device)
cooperator_optim = optim.Adam(cooperator.parameters(), lr=coop_lr)
discriminator2_optim = optim.Adam(discriminator2.parameters(), lr=disc2_lr)

# NOTE(review): nn.CrossEntropyLoss expects unnormalised logits, while Cooperator and
# Discriminator2 end in a Softmax layer — confirm which side should change.
CELoss = nn.CrossEntropyLoss()

In [None]:
# Per-batch loss history of the complete-integration loop, one list per network.
# Re-running this cell resets the history.
gen_losses = []
disc1_losses = []
coop_losses = []
disc2_losses = []

In [None]:
# Joint training of generator, discriminator1, cooperator and discriminator2.
# NOTE(review): this cell looks unfinished; the notes below list what to confirm before running it.
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    for i in tqdm(range(0, len(padded), global_batch_size), desc="Training", leave=False):
        batch_size = min(global_batch_size, len(padded) - i)
        batch_indexes = padded[i:i+batch_size].to(device)  # Get a batch of sequences batch_size * max_seq
        # NOTE(review): y_train is a pandas Series, so this slice is a Series rather than a
        # tensor on `device`; it is later passed to CELoss as a target — confirm the
        # conversion and that the category values lie in the range the loss expects.
        batch_human_class = y_train[i:i+batch_size]
        # One random target class in [0, 4) per sequence (created without a device argument).
        batch_target_class = randint(4, (batch_size, ))

        # Forward passes on the generated ("AI") batch ...
        # NOTE(review): gen_batch holds sampled token ids, while the three networks start
        # with nn.Linear(vocab_size, ...) layers — confirm the expected input format.
        gen_batch = generator(batch_indexes, batch_target_class)
        disc1_AI = discriminator1(gen_batch)
        coop_AI = cooperator(gen_batch)
        disc2_AI = discriminator2(gen_batch)

        # ... and on the human-written batch.
        disc1_human = discriminator1(batch_indexes)
        coop_human = cooperator(batch_indexes)
        disc2_human = discriminator2(batch_indexes)

        # Real/fake targets for discriminator1: 0 = generated, 1 = human.
        # NOTE(review): these are float tensors of shape (batch_size,) created without a
        # device argument, and CELoss is nn.CrossEntropyLoss; discriminator1 ends in a
        # sigmoid with one output per sequence (the earlier loop uses BCELoss) — confirm
        # the intended loss and target shape.
        zero_tensor = zeros(batch_size)
        one_tensor = ones(batch_size)
        disc1_loss = CELoss(cat((disc1_AI, disc1_human)), cat((zero_tensor, one_tensor)))
        # NOTE(review): the loss tensors themselves are appended here (no .item(), unlike
        # the earlier loop), which presumably keeps each batch's graph alive — confirm.
        disc1_losses.append(disc1_loss)
        # Generator objective: fool discriminator1, match the cooperator's prediction of the
        # target class, and increase discriminator2's loss on the target class.
        gen_loss = CELoss(disc1_AI, one_tensor) + CELoss(coop_AI, batch_target_class) - CELoss(disc2_AI, batch_target_class)
        gen_losses.append(gen_loss)
        # Cooperator / discriminator2 objectives: target-class loss on generated text scaled
        # by a discriminator1 term, plus the class loss on human text.
        coop_loss = CELoss(disc1_AI, zero_tensor) * CELoss(coop_AI, batch_target_class) + CELoss(coop_human, batch_human_class)
        coop_losses.append(coop_loss)
        disc2_loss = CELoss(disc1_AI, zero_tensor) * CELoss(disc2_AI, batch_target_class) + CELoss(disc2_human, batch_human_class)
        disc2_losses.append(disc2_loss)
        
        # NOTE(review): the four losses share forward results (e.g. disc1_AI), and each
        # backward() below runs after an optimizer step has already modified parameters in
        # place; this likely needs retain_graph / detach or a reordering — confirm.
        generator_optimizer.zero_grad()
        gen_loss.backward()
        generator_optimizer.step()
        

        discriminator1_optimizer.zero_grad()
        disc1_loss.backward()
        discriminator1_optimizer.step()

        cooperator_optim.zero_grad()
        coop_loss.backward()
        cooperator_optim.step()

        discriminator2_optim.zero_grad()
        disc2_loss.backward()
        discriminator2_optim.step()

        # delete afterwards

# Save all four networks. The generator / discriminator1 file names are the same ones the
# earlier training cell writes, so those checkpoints are overwritten.
torch.save(generator.state_dict(), "generator_test_1.pth")
torch.save(discriminator1.state_dict(), "discriminator1_test_1.pth")
torch.save(cooperator.state_dict(), "cooperator_test_1.pth")
torch.save(discriminator2.state_dict(), "discriminator2_test_1.pth")

### Test code region

In [None]:
# generator_test = Generator(vocab_size, embed_dim, hidden_dim, enc_drop, dec_drop, temperature, word_limit, eot_index).to(device)
# discriminator1_test = Discriminator1(vocab_size, embed_dim, hidden_dim).to(device)

# generator_test.load_state_dict(torch.load("generator_test_1_small.pth"))
# generator_test.eval()

# generator_optimizer_test = optim.Adam(generator_test.parameters(), lr=lr_gen_optim)
# discriminator1_optimizer_test = optim.Adam(discriminator1_test.parameters(), lr=lr_disc1_optim)

In [None]:
# print generated text
# Regenerate the first three training sequences, all steered towards class 3,
# then decode and show the first and the third result.
gen_batch = generator(padded[:3].to(device), 3)
gen_text = sp.decode(gen_batch[0].tolist())
print(gen_text)
gen_text = sp.decode(gen_batch[2].tolist())
print(gen_text)

In [None]:
# Plot the per-batch losses of the generator / discriminator1 training loop.

fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Generator and Discriminator Loss During Training")
ax.plot(generator_losses, label="Generator")
ax.plot(discriminator1_losses, label="Discriminator")
ax.set_xlabel("Iterations")
ax.set_ylabel("Loss")
ax.legend()
# Save before showing: plt.show() can release the figure, so saving afterwards
# writes an empty image.
fig.savefig("losses.png")
plt.show()
plt.close(fig)

In [None]:
# # generate text
# test = padded[83:99].to(device)
# generated_texts = generator.generate_text(test, sequence_lengths[83:99], len(test))


In [None]:
# for i, text in enumerate(generated_texts):
#     print(f"Generated text {i+1}:")
#     print(text[1:-1])
#     print(sp.decode([token for token in text[1:-1]]))
#     print()

In [None]:
# import torch

# def model_memory_usage_in_MB(model):
#     # Calculate the number of elements in the model parameters
#     num_params = sum(param.numel() for param in model.parameters())
    
#     # Assuming parameters are stored as 32-bit floats (4 bytes each), calculate memory usage in bytes
#     memory_usage_bytes = num_params * 4
    
#     # Convert bytes to megabytes
#     memory_usage_MB = memory_usage_bytes / (1024 ** 2)
    
#     return memory_usage_MB

# generator_memory = model_memory_usage_in_MB(generator)
# discriminator_memory = model_memory_usage_in_MB(discriminator1)

# print(f"Generator Memory Usage: {generator_memory:.2f} MB")
# print(f"Discriminator Memory Usage: {discriminator_memory:.2f} MB")
