# CS4248 Project Notebook

In [None]:
# CUDA-related environment variables, set in the first cell so they are in place before the cell that imports torch runs.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid that slows GPU work down — confirm it is still wanted for full training runs.
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
%env CUDA_LAUNCH_BLOCKING=1

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import torch
from torch import nn, tensor, zeros, argmax
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import sentencepiece as spm
import matplotlib.pyplot as plt

# Device shared by every later cell: the GPU when one is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NLTK helpers (English stop-word set and a WordNet lemmatizer).
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

In [None]:
column_names = ['category', 'text']

# Training data: the two CSV columns are named explicitly as label and raw text.
train = pd.read_csv('fulltrain.csv', names=column_names)

#! use 1/10 of the training data
X_train = train['text'][:len(train) // 10]
y_train = train['category'].astype('int16')[:len(train) // 10]

# Held-out test data, read with the same column layout.
test = pd.read_csv('balancedtest.csv', names=column_names)

In [None]:
# CONSTANTS
# Every value below is read by name in later cells (model construction and the training loop).

# HYPERPARAMETERS
# Vocabulary size handed to both models; the commented-out SentencePiece training cell uses the same variable.
# NOTE(review): the tokenizer is loaded from 'spm_model.model' on disk — confirm it was trained with this size.
vocab_size = 8_000
# Generator
batch_size = 64  # Number of sequences per training batch
embed_dim = 300  # Dimensionality of word embeddings (also passed to Discriminator1)
hidden_dim = 128  # Number of features in the hidden state of the LSTM
enc_drop = 0.2  # Passed as the `dropout` argument of the encoder nn.LSTM
dec_drop = 0.2  # Passed as the `dropout` argument of the decoder nn.LSTM
k = 5 # Top-k sampling width; stored on the Generator, but generate_text decodes with argmax and never reads it
word_limit = 512  # Maximum number of tokens per generated sequence (counting the start token) TODO: for first batch only
lr_gen_optim = 0.01  # Learning rate for the generator's Adam optimizer

# Discriminator1
conv_channels = 128  # Number of output channels in the convolutional layer
lstm_hidden_dim = 128  # Number of features in the hidden state of the LSTM
dense_dim = 64  # Number of features in the dense layer
dropout_prob = 0.2  # Dropout probability applied after the dense layer
lr_disc1_optim = 0.001  # Learning rate for discriminator1's Adam optimizer

epochs = 10  # Number of full passes over the training data

## Preprocessing
#### Current
We use `SentencePieceTrainer` to train a SentencePiece model on the text of the full training set, and then use that model to tokenize the data. The vocabulary size is set to 8,000 (`vocab_size`).

In [None]:
# Used to extract the text from the CSV file

# # Path to your CSV file
# csv_file_path = 'fulltrain.csv'
# # Path to the output text file
# text_file_path = 'fulltrain_textonly.txt'

# # Load the CSV file
# df = pd.read_csv(csv_file_path, names=column_names)

# # Assuming the text column is named 'text'. Adjust if your column name is different
# texts = df['text']

# # Save the text column to a plain text file
# with open(text_file_path, 'w', encoding='utf-8') as f:
#     for text in texts:
#         f.write(text + '\n')




In [None]:
# Used to create the SentencePiece model and save it to a file
# We only need to run this once to create the model file

# spm.SentencePieceTrainer.train(input="fulltrain_textonly.txt", 
#                                model_prefix='spm_model', 
#                                vocab_size=vocab_size, 
#                                max_sentence_length=100_000,
#                                user_defined_symbols=['<sot>'],
#                                unk_id=0, bos_id=-1, eos_id=1, pad_id=2)



## Generator

In [None]:
class Generator(nn.Module):
    """LSTM encoder-decoder that turns a batch of token-id sequences into generated token-id sequences.

    The encoder embeds and encodes each input sequence; its final hidden and cell
    states seed the decoder, which then emits one token per step. The decoder is
    fed the previous step's full probability distribution over the vocabulary
    (projected by a linear layer), not a single embedded token.

    NOTE(review): tokens are chosen with ``argmax``, which is not differentiable,
    so a loss computed from the returned token ids cannot propagate gradients
    back into this module. ``top_k_sampling`` is provided but is not called by
    ``generate_text``.

    Args:
        vocabulary_size: Number of token ids.
        embed_dim: Embedding size used by encoder and decoder.
        hidden_dim: Hidden-state size of both LSTMs.
        enc_drop: ``dropout`` argument of the encoder LSTM.
        dec_drop: ``dropout`` argument of the decoder LSTM.
        k: Top-k width, stored for ``top_k_sampling`` callers.
        word_limit: Maximum length of a generated sequence, start token included.
        eot_index: Token id that ends generation.
        sot_index: Token id every generated sequence starts with.
    """

    def __init__(self, vocabulary_size, embed_dim, hidden_dim, enc_drop, dec_drop, k, word_limit, eot_index, sot_index):
        super(Generator, self).__init__()
        self.vocabulary_size = vocabulary_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.enc_drop = enc_drop
        self.dec_drop = dec_drop
        self.k = k
        self.word_limit = word_limit
        self.eot_index = eot_index
        self.sot_index = sot_index

        # Encoder: token ids -> embeddings -> single-layer LSTM.
        self.enc_embed = nn.Embedding(vocabulary_size, embed_dim)
        self.enc_lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, dropout=enc_drop)

        # Decoder: previous distribution -> linear "embedding" -> LSTM -> softmax over the vocabulary.
        self.dec_embed = nn.Linear(vocabulary_size, embed_dim)
        self.dec_lstm = nn.LSTM(embed_dim, hidden_dim, dropout=dec_drop, batch_first=True)
        self.dec_softmax = nn.Sequential(
            nn.Linear(hidden_dim, vocabulary_size),
            nn.Softmax(dim=-1)
        )

    def top_k_sampling(self, probabilities, k):
        """Sample one token id from the ``k`` most probable entries of a single distribution.

        Args:
            probabilities: Distribution over the vocabulary, shaped ``(vocab,)`` or ``(1, vocab)``.
            k: Number of top entries to sample from; clamped to the vocabulary size.

        Returns:
            The sampled token id as a Python int.
        """
        k = min(k, probabilities.size(-1))
        top_k_probs, top_k_indices = torch.topk(probabilities, k, dim=-1)
        # multinomial renormalises the k weights, so they need not sum to 1.
        selected_indices = torch.multinomial(top_k_probs, 1)
        selected = top_k_indices.gather(-1, selected_indices)
        return selected.item()

    def generate_text(self, batch_indexes, batch_sequence_lengths, batch_size):
        """Encode each input sequence and greedily decode one generated sequence for it.

        Args:
            batch_indexes: Integer tensor ``(batch, max_len)`` of padded token ids.
            batch_sequence_lengths: Unpadded length of each row (list or tensor).
            batch_size: Number of rows to generate for; capped at the rows available.

        Returns:
            A list with one entry per input row, in the same order as the input.
            Each entry is a list of 0-dim integer tensors that starts with
            ``sot_index`` and stops after ``eot_index`` or once it holds
            ``word_limit`` tokens.
        """
        # Run on whatever device the module's weights live on instead of a notebook global.
        run_device = self.enc_embed.weight.device
        batch_indexes = batch_indexes.to(run_device)
        # pack_padded_sequence wants the lengths on the CPU.
        lengths = torch.as_tensor(batch_sequence_lengths, dtype=torch.int64).cpu()

        batch_embeddings = self.enc_embed(batch_indexes)
        # enforce_sorted=False lets PyTorch sort internally and hand the final states
        # back in input order, so generated row i belongs to input row i.
        packed_input = pack_padded_sequence(
            batch_embeddings, lengths, batch_first=True, enforce_sorted=False
        )

        # Encode the input sequences; only the final states are used.
        _, (batch_hn, batch_cn) = self.enc_lstm(packed_input)

        # One-hot distribution on the start token: the decoder's first input.
        init_dist = zeros(1, self.vocabulary_size, device=run_device)
        init_dist[:, self.sot_index] = 1

        gen_batch = []  # Generated sequences, one list per input row
        for i in range(min(batch_size, batch_hn.size(1))):
            prev_dist = init_dist
            # Keep explicit (layers=1, batch=1, hidden) states for this row.
            hn = batch_hn[:, i:i + 1, :].contiguous()
            cn = batch_cn[:, i:i + 1, :].contiguous()
            gen = [tensor(self.sot_index, device=run_device)]
            while True:
                # (1, vocab) -> (batch=1, seq=1, embed) for the batch_first LSTM.
                word_tensor = self.dec_embed(prev_dist).unsqueeze(1)
                _, (hn, cn) = self.dec_lstm(word_tensor, (hn, cn))
                del word_tensor
                prev_dist = self.dec_softmax(hn[-1])  # (1, vocab)
                index = argmax(prev_dist)  # greedy choice, 0-dim tensor
                gen.append(index)
                # `>=` (not `==`) so a word_limit below 2 cannot loop forever.
                if index == self.eot_index or len(gen) >= self.word_limit:
                    break
            gen_batch.append(gen)

        return gen_batch


## Discriminator 1

In [None]:
class Discriminator1(nn.Module):
    """Binary classifier over token-id sequences: embedding -> Conv1d -> LSTM -> dense -> sigmoid.

    Padding positions are not masked; every position of the input tensor is
    embedded and passed through the convolution and the LSTM.

    Args:
        vocabulary_size: Number of token ids.
        embedding_dim: Embedding size (also the Conv1d input channels).
        conv_channels: Conv1d output channels (also the LSTM input size).
        lstm_hidden_dim: Hidden-state size of the LSTM.
        dense_dim: Width of the fully connected layer.
        dropout_prob: Dropout probability applied after the dense layer.
    """

    def __init__(self, vocabulary_size, embedding_dim, conv_channels, lstm_hidden_dim, dense_dim, dropout_prob):
        super(Discriminator1, self).__init__()
        self.embedding = nn.Embedding(vocabulary_size, embedding_dim)

        # Conv1d consumes (batch, channels, length), so embedding_dim acts as in_channels.
        self.conv1d = nn.Conv1d(in_channels=embedding_dim, out_channels=conv_channels, kernel_size=5)
        self.relu = nn.ReLU()

        # The LSTM consumes (batch, seq_len, features); forward() permutes the conv output to match.
        self.lstm = nn.LSTM(input_size=conv_channels, hidden_size=lstm_hidden_dim, batch_first=True)

        # Classification head applied to the LSTM's final hidden state.
        self.dense = nn.Linear(lstm_hidden_dim, dense_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.output_layer = nn.Linear(dense_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score each sequence in the batch.

        Args:
            x: Integer tensor ``(batch, seq_len)`` of token ids.

        Returns:
            Tensor ``(batch, 1)`` of sigmoid outputs in ``[0, 1]``.
        """
        embedded = self.embedding(x)

        # Conv1d expects (batch, channels, length).
        x = embedded.permute(0, 2, 1)

        # Sequences shorter than the kernel (e.g. a generated "<sot> <eot>") would make
        # Conv1d raise; right-pad them with zeros up to the kernel width instead.
        shortfall = self.conv1d.kernel_size[0] - x.size(-1)
        if shortfall > 0:
            x = F.pad(x, (0, shortfall))

        x = self.conv1d(x)
        x = self.relu(x)

        # Back to (batch, seq_len, features) for the LSTM.
        x = x.permute(0, 2, 1)

        # Keep only the final hidden state: (1, batch, hidden) -> (batch, hidden).
        _, (hidden, _) = self.lstm(x)
        x = hidden[-1]

        # Dense head and sigmoid output.
        x = self.dense(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.output_layer(x)
        x = self.sigmoid(x)

        return x


## Generator-Discriminator 1 Integration

In [None]:
# Load the trained SentencePiece model from disk.
sp = spm.SentencePieceProcessor(model_file='spm_model.model')

# Ids of the special tokens used throughout the notebook.
pad_index = sp.pad_id()
sot_index = sp.piece_to_id('<sot>')
eot_index = sp.eos_id()

# Encode every training text as <sot> + pieces + <eos>, one integer tensor per text.
tokens = []
for text in X_train:
    piece_ids = [sot_index] + sp.encode(text, out_type=int) + [eot_index]
    tokens.append(torch.tensor(piece_ids, dtype=int))

# Unpadded lengths, then one right-padded matrix for the whole set.
sequence_lengths = list(map(len, tokens))
padded = nn.utils.rnn.pad_sequence(tokens, batch_first=True, padding_value=pad_index)  # num_seq * max_seq

In [None]:
import torch.optim as optim
from torch.nn import BCELoss
from torch import ones_like, zeros_like, tensor
from tqdm import tqdm

# Binary cross-entropy, shared by the generator and discriminator objectives.
bce = BCELoss()

# Build both networks from the hyperparameter cell and move them to the selected device.
generator = Generator(
    vocabulary_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    enc_drop=enc_drop,
    dec_drop=dec_drop,
    k=k,
    word_limit=word_limit,
    eot_index=eot_index,
    sot_index=sot_index,
).to(device)
discriminator1 = Discriminator1(
    vocabulary_size=vocab_size,
    embedding_dim=embed_dim,
    conv_channels=conv_channels,
    lstm_hidden_dim=lstm_hidden_dim,
    dense_dim=dense_dim,
    dropout_prob=dropout_prob,
).to(device)

# One Adam optimizer per network, each with its own learning rate.
generator_optimizer = optim.Adam(generator.parameters(), lr=lr_gen_optim)
discriminator1_optimizer = optim.Adam(discriminator1.parameters(), lr=lr_disc1_optim)

In [None]:
# Per-batch loss histories, plotted after training.
generator_losses = []
discriminator1_losses = []

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    for i in tqdm(range(0, len(padded), batch_size), desc="Training", leave=False):
        # Size of this batch only. The global `batch_size` must not be overwritten:
        # doing so made every epoch after the first run with the size of the last partial batch.
        current_batch_size = min(batch_size, len(padded) - i)
        batch_indexes = padded[i:i+current_batch_size].to(device)  # current_batch_size * max_seq
        batch_sequence_lengths = sequence_lengths[i:i+current_batch_size]  # Unpadded lengths for this batch

        # Train the generator: it is rewarded when the discriminator labels its output as human (1).
        # NOTE(review): generate_text picks tokens with argmax, so no gradient reaches the
        # generator's parameters from this loss — confirm the intended training signal.
        generator_optimizer.zero_grad()
        gen_batch = generator.generate_text(batch_indexes, batch_sequence_lengths, current_batch_size)
        gen_batch_padded = nn.utils.rnn.pad_sequence(
            [torch.stack([token.to(device) for token in seq]) for seq in gen_batch],
            batch_first=True,
            padding_value=pad_index,
        )
        predicted_fake_d1 = discriminator1(gen_batch_padded)
        generator_loss = bce(predicted_fake_d1, ones_like(predicted_fake_d1))
        generator_loss.backward()
        generator_optimizer.step()

        generator_losses.append(generator_loss.item())  # Store generator loss

        # Train the discriminator1: generated text -> 0, human text -> 1.
        # zero_grad also clears the gradients the generator step left on the discriminator.
        discriminator1_optimizer.zero_grad()
        # Fresh forward pass on the fakes: reusing the detached predictions from the
        # generator step would give the discriminator no gradient from generated text.
        predicted_fake_for_d1 = discriminator1(gen_batch_padded)
        predicted_human_d1 = discriminator1(batch_indexes)
        predictions = torch.cat((predicted_fake_for_d1, predicted_human_d1), dim=0)
        labels = torch.cat((zeros_like(predicted_fake_for_d1), ones_like(predicted_human_d1)), dim=0)
        discriminator1_loss = bce(predictions, labels)
        discriminator1_loss.backward()
        discriminator1_optimizer.step()

        discriminator1_losses.append(discriminator1_loss.item())  # Store discriminator loss

        # Cleanup to free memory
        del batch_indexes, gen_batch, gen_batch_padded, predicted_fake_d1
        del predicted_fake_for_d1, predicted_human_d1, predictions, labels


# Persist the trained weights.
torch.save(generator.state_dict(), "generator_test_1.pth")
torch.save(discriminator1.state_dict(), "discriminator1_test_1.pth")

### Test code region

In [None]:
# Plot the per-batch generator and discriminator losses recorded during training.

fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Generator and Discriminator Loss During Training")
ax.plot(generator_losses, label="Generator")
ax.plot(discriminator1_losses, label="Discriminator")
ax.set_xlabel("Iterations")
ax.set_ylabel("Loss")
ax.legend()
# Save before showing: once plt.show() has displayed the figure, a later savefig can write an empty image.
fig.savefig("losses.png")
plt.show()
plt.close(fig)

In [None]:
# generate text
# Use dedicated names so the `test` DataFrame loaded from balancedtest.csv is not overwritten.
sample_batch = padded[83:99].to(device)
sample_lengths = sequence_lengths[83:99]
# Inference only: no autograd bookkeeping is needed while sampling.
with torch.no_grad():
    generated_texts = generator.generate_text(sample_batch, sample_lengths, len(sample_batch))


In [None]:
# Show each generated sequence as raw token ids and as decoded text.
for i, text in enumerate(generated_texts):
    # Convert to plain ints once; skip the leading start-of-text token.
    token_ids = [int(token) for token in text][1:]
    # Strip the final token only when it is the end-of-text marker. A sequence cut off
    # at the word limit ends in a real token, which must be kept.
    if token_ids and token_ids[-1] == eot_index:
        token_ids = token_ids[:-1]
    print(f"Generated text {i+1}:")
    print(token_ids)
    print(sp.decode(token_ids))
    print()

In [None]:
import torch

def model_memory_usage_in_MB(model):
    """Return the memory taken by a model's parameters, in mebibytes.

    Each parameter is counted at its actual element size, so half-, single- and
    double-precision models are all reported correctly. Only tensors returned by
    ``model.parameters()`` are counted.

    Args:
        model: Any ``torch.nn.Module``.

    Returns:
        Parameter storage size as a float number of MB (bytes / 1024**2).
    """
    memory_usage_bytes = sum(
        param.numel() * param.element_size() for param in model.parameters()
    )

    # Convert bytes to megabytes
    return memory_usage_bytes / (1024 ** 2)

# Measure both networks in one pass, then report each size in MB.
generator_memory, discriminator_memory = (
    model_memory_usage_in_MB(network) for network in (generator, discriminator1)
)

print(f"Generator Memory Usage: {generator_memory:.2f} MB")
print(f"Discriminator Memory Usage: {discriminator_memory:.2f} MB")
