# CS4248 Project Notebook

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import torch
from torch import nn, tensor, zeros, argmax
import torch.nn.functional as F

# Shared NLP resources consumed by the preprocessing step.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
column_names = ['category', 'text']

# `names=` supplies the column labels for both files, so their first row is
# read as data rather than as a header.
train, test = (
    pd.read_csv(csv_path, names=column_names)
    for csv_path in ('fulltrain.csv', 'balancedtest.csv')
)

# Features and targets of the training split.
X_train, y_train = train['text'], train['category']

## Preprocessing
#### Current
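1. Lowercase the text
2. Tokenize the text
3. Lemmatization (optional, off by default)
4. Remove stop words (optional, off by default)
5. Remove punctuation, i.e. drop non-alphabetic tokens (optional, off by default)
6. Append the `<EOT>` end-of-text token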

In [None]:
def preprocess_text(text, lemmatize=False, remove_stopwords=False, remove_punctuations=False):
    """Turn a raw string into a list of tokens terminated by ``'<EOT>'``.

    The text is lowercased and split with ``word_tokenize``; the optional
    steps are then applied in this order: lemmatization, stop-word removal,
    removal of non-alphabetic tokens.

    Args:
        text: The raw input string.
        lemmatize: If true, replace every token by its lemma.
        remove_stopwords: If true, drop tokens found in ``stop_words``.
        remove_punctuations: If true, keep only tokens for which
            ``str.isalpha`` holds (this drops digits and mixed tokens too,
            not just punctuation).

    Returns:
        The list of tokens with the end-of-text marker ``'<EOT>'`` appended.
    """
    words = word_tokenize(text.lower())

    if lemmatize:
        words = list(map(lemmatizer.lemmatize, words))
    if remove_stopwords:
        words = [word for word in words if word not in stop_words]
    if remove_punctuations:
        words = list(filter(str.isalpha, words))

    # The end-of-text marker always closes the sequence.
    return words + ['<EOT>']

## Generator

In [None]:
class Generator(nn.Module):
    """LSTM encoder-decoder that samples one token sequence per input sequence.

    The encoder embeds a batch of token indices and runs it through an LSTM.
    The final hidden/cell state of each sequence seeds a decoder LSTM that is
    unrolled one token at a time: the previous output distribution is
    projected to embedding space by ``dec_embed``, fed to the decoder, turned
    into a new distribution over the vocabulary, and the next token is drawn
    from it with top-k sampling.

    Args:
        vocabulary_size: Number of distinct token indices; size of the
            embedding table and width of the output distribution.
        embed_dim: Dimensionality of the token embeddings.
        hidden_dim: Hidden size of both LSTMs.
        enc_drop: ``dropout`` argument passed to the encoder LSTM.
        dec_drop: ``dropout`` argument passed to the decoder LSTM.
        k: Number of most probable tokens to sample from at each step.
        word_limit: Maximum number of tokens generated per sequence.
        eot_index: Index of the end-of-text token. It is used both as the
            one-hot start symbol of the decoder and as the stop signal.
    """

    def __init__(self, vocabulary_size, embed_dim, hidden_dim, enc_drop, dec_drop, k, word_limit, eot_index):
        super().__init__()
        self.vocabulary_size = vocabulary_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.enc_drop = enc_drop
        self.dec_drop = dec_drop
        self.k = k
        self.word_limit = word_limit
        self.eot_index = eot_index

        # NOTE: both LSTMs have a single layer. PyTorch applies `dropout`
        # only between stacked LSTM layers, so a non-zero value here has no
        # effect and makes PyTorch emit a warning.

        # Encoder: the Sequential returns the LSTM's (output, (h_n, c_n)) tuple.
        self.encoder = nn.Sequential(
            nn.Embedding(vocabulary_size, embed_dim),
            nn.LSTM(embed_dim, hidden_dim, batch_first=True, dropout=enc_drop)
        )

        # Decoder: consumes a full distribution over the vocabulary (not an
        # index), hence a Linear projection instead of an Embedding.
        self.dec_embed = nn.Linear(vocabulary_size, embed_dim)
        self.dec_lstm = nn.LSTM(embed_dim, hidden_dim, dropout=dec_drop)
        self.dec_softmax = nn.Sequential(
            nn.Linear(hidden_dim, vocabulary_size),
            nn.Softmax(dim=-1)
        )

    def top_k_sampling(self, probabilities, k):
        """Sample one token index among the ``k`` most probable entries.

        Args:
            probabilities: Tensor of shape (1, vocabulary_size) holding
                non-negative weights for a single decoding step.
            k: Number of top entries to sample from. Values larger than the
                vocabulary size are clamped to it.

        Returns:
            The sampled vocabulary index as a Python int.
        """
        # torch.topk raises when k exceeds the size of the dimension.
        k = min(k, probabilities.size(-1))
        top_k_probs, top_k_indices = torch.topk(probabilities, k, dim=-1)
        # multinomial accepts unnormalised weights, so no renormalisation is
        # needed; it returns a position inside the top-k list ...
        position = torch.multinomial(top_k_probs, 1)
        # ... which is mapped back to the real vocabulary index here.
        return top_k_indices.gather(1, position).item()

    def generate_text(self, batch_indexes, batch_size):
        """Generate one token sequence for each of the first ``batch_size`` inputs.

        The returned indices are plain Python ints, so no autograd graph can
        reach the caller. Generation therefore runs under ``torch.no_grad()``
        to avoid keeping a graph alive across every decoding step.

        Args:
            batch_indexes: Integer tensor of token indices with shape
                (batch, seq_len).
            batch_size: Number of sequences of the batch to generate for.

        Returns:
            A list of ``batch_size`` lists of token indices. Each list holds
            at most ``word_limit`` indices and never contains ``eot_index``;
            a list is empty when end-of-text is sampled first.
        """
        # Follow the input's device instead of relying on a module-level global.
        run_device = batch_indexes.device

        with torch.no_grad():
            # Encode the input; only the final states are needed.
            _, (batch_hn, batch_cn) = self.encoder(batch_indexes)

            # The decoder starts from a one-hot end-of-text distribution.
            init_dist = torch.zeros(1, self.vocabulary_size, device=run_device)
            init_dist[:, self.eot_index] = 1

            gen_batch = []
            for i in range(batch_size):
                prev_dist = init_dist
                # (1, hidden_dim) states of sequence i; with a 2-D input the
                # decoder LSTM treats the step as unbatched.
                hn, cn = batch_hn[:, i, :], batch_cn[:, i, :]
                gen = []
                # Bounding the loop here also terminates for word_limit <= 0.
                while len(gen) < self.word_limit:
                    word_tensor = self.dec_embed(prev_dist)
                    _, (hn, cn) = self.dec_lstm(word_tensor, (hn, cn))
                    prev_dist = self.dec_softmax(hn)
                    index = self.top_k_sampling(prev_dist, self.k)
                    if index == self.eot_index:
                        break
                    gen.append(index)
                gen_batch.append(gen)

        return gen_batch


## Discriminator 1

In [None]:
class Discriminator1(nn.Module):
    """Conv1d + LSTM binary classifier over sequences of token indices.

    Pipeline: embedding -> Conv1d (kernel size 5) + ReLU -> LSTM (final
    hidden state only) -> dense + ReLU + dropout -> linear + sigmoid.

    Args:
        vocabulary_size: Number of rows of the embedding table.
        embedding_dim: Dimensionality of the token embeddings.
        conv_channels: Number of output channels of the convolution.
        lstm_hidden_dim: Hidden size of the LSTM.
        dense_dim: Width of the fully connected layer.
        dropout_prob: Dropout probability applied after the dense layer.
    """

    def __init__(self, vocabulary_size, embedding_dim, conv_channels, lstm_hidden_dim, dense_dim, dropout_prob):
        super().__init__()
        self.embedding = nn.Embedding(vocabulary_size, embedding_dim)

        # Conv1d expects input of shape (batch_size, channels, sequence_length),
        # so embedding_dim is used as in_channels.
        self.conv1d = nn.Conv1d(in_channels=embedding_dim, out_channels=conv_channels, kernel_size=5)
        self.relu = nn.ReLU()

        # The LSTM expects input of shape (batch_size, seq_len, features),
        # so the convolution output is permuted before being fed to it.
        self.lstm = nn.LSTM(input_size=conv_channels, hidden_size=lstm_hidden_dim, batch_first=True)

        # Fully connected layer and dropout (fed with the final hidden state only)
        self.dense = nn.Linear(lstm_hidden_dim, dense_dim)
        self.dropout = nn.Dropout(dropout_prob)

        # Output layer
        self.output_layer = nn.Linear(dense_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices with shape (batch, seq_len).
                Sequences shorter than the convolution kernel are
                right-padded with index 0, the value ``pad_sequence`` pads
                with by default.

        Returns:
            Tensor of shape (batch, 1) with values in (0, 1).
        """
        # Conv1d raises when the input is shorter than its kernel, which
        # happens for very short (or empty) sequences.
        min_len = self.conv1d.kernel_size[0]
        deficit = min_len - x.size(1)
        if deficit > 0:
            x = torch.cat((x, x.new_zeros((x.size(0), deficit))), dim=1)

        # Embedding layer: (batch, seq_len) -> (batch, seq_len, embedding_dim)
        x = self.embedding(x)

        # Conv1d expects (batch, channels, length), so permute the embedding output
        x = x.permute(0, 2, 1)
        x = self.relu(self.conv1d(x))

        # LSTM expects (batch, seq_len, features), so permute back
        x = x.permute(0, 2, 1)

        # Keep only the final hidden state: (1, batch, lstm_hidden_dim)
        _, (x, _) = self.lstm(x)
        x = x.squeeze(0)  # Drop the num_layers * num_directions dimension

        # Fully connected layer
        x = self.dropout(self.relu(self.dense(x)))

        # Output layer
        return self.sigmoid(self.output_layer(x))


## Generator-Discriminator 1 Integration

In [None]:
tokens = X_train.apply(preprocess_text)  # Tokenizing the text

# Sort the vocabulary so the word <-> index mapping is identical on every run;
# iterating a set of strings directly gives an order that can change between
# interpreter sessions. Indices start at 1 because 0 is the value
# `pad_sequence` pads with.
vocabulary = sorted({token for tokens_list in tokens for token in tokens_list})
word_index = {word: i for i, word in enumerate(vocabulary, start=1)}  # Mapping words to indices
index_word = {i: word for word, i in word_index.items()}  # Mapping indices to words
labels = train['category']
# get index of END OF TEXT token
eot_index = word_index['<EOT>']

# Every token comes from the vocabulary built above, so a direct lookup is
# safe and avoids silently producing an invalid index for a missing word.
indexes = [tensor([word_index[word] for word in seq]) for seq in tokens]
padded = nn.utils.rnn.pad_sequence(indexes, batch_first=True)  # num_seq * max_seq

In [None]:
import torch.optim as optim
from torch.nn import BCELoss
from torch import ones_like, zeros_like, tensor
from tqdm import tqdm

bce = BCELoss()

# HYPERPARAMETERS
# Generator
batch_size = 8  # Batch size
embed_dim = 512  # Dimensionality of word embeddings
hidden_dim = 128  # Number of features in the hidden state of the LSTM
enc_drop = 0.2  # Dropout rate for the encoder LSTM
dec_drop = 0.2  # Dropout rate for the decoder LSTM
k = 5 # Number of top k words to sample from
word_limit = 500  # Maximum number of words to generate TODO: for first batch only
lr_gen_optim = 0.001  # Learning rate for generator optimizer

# Discriminator1
conv_channels = 512  # Number of output channels in the convolutional layer
lstm_hidden_dim = 128  # Number of features in the hidden state of the LSTM
dense_dim = 64  # Number of features in the dense layer
dropout_prob = 0.2  # Dropout rate
lr_disc1_optim = 0.001  # Learning rate for discriminator1 optimizer

epochs = 10  # Number of epochs

# `word_index` assigns indices 1..len(word_index) and 0 is the padding value,
# so the models must handle len(word_index) + 1 distinct indices. Sizing them
# with len(word_index) puts the largest word index out of range of the
# embedding tables and of the generator's one-hot start vector.
vocabulary_size = len(word_index) + 1

generator = Generator(vocabulary_size, embed_dim, hidden_dim, enc_drop, dec_drop, k, word_limit, eot_index).to(device)
discriminator1 = Discriminator1(vocabulary_size, embed_dim, conv_channels, lstm_hidden_dim, dense_dim, dropout_prob).to(device)
generator_optimizer = optim.Adam(generator.parameters(), lr=lr_gen_optim)
discriminator1_optimizer = optim.Adam(discriminator1.parameters(), lr=lr_disc1_optim)


In [None]:
# Adversarial training: discriminator1 labels human text 1 and generated text 0.
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    for i in tqdm(range(0, len(padded), batch_size), desc="Training", leave=False):
        # The final batch may be short. Keep its size in a separate name:
        # rebinding `batch_size` would shrink every batch of later epochs.
        current_batch_size = min(batch_size, len(padded) - i)
        batch_indexes = padded[i:i+current_batch_size].to(device)  # current_batch_size * max_seq

        # Train the generator
        generator_optimizer.zero_grad()
        gen_batch = generator.generate_text(batch_indexes, current_batch_size)
        # dtype is forced because an empty generated sequence would otherwise
        # become a float tensor, which the embedding layer rejects.
        gen_batch_padded = nn.utils.rnn.pad_sequence(
            [tensor(seq, dtype=torch.long) for seq in gen_batch], batch_first=True
        ).to(device)
        predicted_fake_d1 = discriminator1(gen_batch_padded)
        # The generator wants its samples to be scored as human (label 1).
        # NOTE(review): generate_text returns plain integer indices, so this
        # loss carries no gradient back to the generator's parameters; a
        # differentiable or policy-gradient objective is needed for the
        # generator to actually learn.
        generator_loss = bce(predicted_fake_d1, ones_like(predicted_fake_d1))
        generator_loss.backward()
        generator_optimizer.step()

        # Train the discriminator1
        discriminator1_optimizer.zero_grad()
        # Score the fake batch again: the graph of the previous pass was freed
        # by backward(), and detaching those predictions would stop the
        # discriminator from learning anything from generated text.
        predicted_fake_d1 = discriminator1(gen_batch_padded)
        predicted_human_d1 = discriminator1(batch_indexes)
        predictions = torch.cat((predicted_fake_d1, predicted_human_d1), dim=0)
        # Named `d1_labels` so the dataset-level `labels` is not overwritten.
        d1_labels = torch.cat((zeros_like(predicted_fake_d1), ones_like(predicted_human_d1)), dim=0)
        discriminator1_loss = bce(predictions, d1_labels)
        discriminator1_loss.backward()
        discriminator1_optimizer.step()

        # Cleanup to free memory
        del batch_indexes, gen_batch, gen_batch_padded, predicted_fake_d1, predicted_human_d1, predictions, d1_labels
