In [18]:
from __future__ import unicode_literals, print_function, division

import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence

In [19]:
# Load the raw parallel corpus and keep only the rows in which both the
# reference sentence and its translation are present.
original_data = (
    pd.read_csv('../data/raw_filtered.tsv', sep='\t')
    .dropna(subset=['reference', 'translation'])
)

In [20]:
original_data.head()

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [21]:
# Load the cleaned training corpus (tab-separated) and discard every row that
# is missing either its reference or its translation.
df = (
    pd.read_csv('../data/final_cleaned.tsv', sep='\t')
    .dropna(subset=['reference', 'translation'])
)

In [22]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


### GENERATIVE MODEL

##### Takes the original toxic sentence as input and generates a non-toxic version of it

In [23]:
X = df['reference']
y = df['translation']

In [24]:
# prepare pandas dataframes of sentences for training
df = pd.DataFrame({'original': X, 'translation': y})
import spacy

nlp = spacy.load('en_core_web_sm')


def tokenize(text):
    """Split ``text`` with the spaCy tokenizer and return lower-cased token strings.

    Tokens that consist only of whitespace are left out of the result.
    """
    lowered_tokens = []
    for token in nlp.tokenizer(text):
        if token.is_space:
            continue
        lowered_tokens.append(token.text.lower())
    return lowered_tokens


# Tokenize both sides of the cleaned training frame.
df['original'] = df['original'].apply(tokenize)
df['translation'] = df['translation'].apply(tokenize)

# Keep only the first 100,000 rows of the raw corpus. The explicit copy makes the
# column assignments below write to an independent frame rather than to a slice
# of the filtered frame, which would trigger pandas' SettingWithCopyWarning.
original_data = original_data.iloc[:100000].copy()
original_data['reference'] = original_data['reference'].apply(tokenize)
original_data['translation'] = original_data['translation'].apply(tokenize)


# Collect every token seen on each side, across both the raw subset and the
# cleaned training frame.
source_vocab = set(token for tokens in original_data['reference'] for token in tokens).union(
    set(token for tokens in df['original'] for token in tokens))
target_vocab = set(token for tokens in original_data['translation'] for token in tokens).union(
    set(token for tokens in df['translation'] for token in tokens))

# Reserved tokens. <PAD> is listed first so that it receives id 0, which is also
# the default fill value used by torch.nn.utils.rnn.pad_sequence.
source_special_tokens = ["<PAD>", "<UNK>"]
target_special_tokens = ["<PAD>", "<SOS>", "<EOS>", "<UNK>"]
source_vocab.update(source_special_tokens)
target_vocab.update(target_special_tokens)

# Create vocabularies with integer mappings.
# Ids are assigned in a fixed order (special tokens, then all remaining tokens
# sorted) instead of set iteration order, so the mapping is identical on every
# kernel restart and stays compatible with previously saved weights/pickles.
source_vocab_to_int = {
    token: i
    for i, token in enumerate(source_special_tokens + sorted(source_vocab.difference(source_special_tokens)))
}
target_vocab_to_int = {
    token: i
    for i, token in enumerate(target_special_tokens + sorted(target_vocab.difference(target_special_tokens)))
}

# Reverse mappings.
# NOTE: <UNK> is already part of both vocabularies, so it must NOT be re-mapped
# to len(vocab) afterwards: that id would lie outside an embedding table created
# with len(vocab) rows and would leave the reverse maps with two <UNK> entries.
int_to_source_vocab = {i: token for token, i in source_vocab_to_int.items()}
int_to_target_vocab = {i: token for token, i in target_vocab_to_int.items()}


# Step 4: Sequence Padding
# Length (in tokens) of the longest sentence on each side of the training frame.
max_source_length = max(map(len, df['original']))
max_target_length = max(map(len, df['translation']))



In [25]:
# save all 4 vocabularies
import pickle

# Each lookup table is written to its own pickle file under ../models.
vocabulary_files = (
    ('../models/source_vocab_to_int.pickle', source_vocab_to_int),
    ('../models/target_vocab_to_int.pickle', target_vocab_to_int),
    ('../models/int_to_source_vocab.pickle', int_to_source_vocab),
    ('../models/int_to_target_vocab.pickle', int_to_target_vocab),
)

for pickle_path, lookup_table in vocabulary_files:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(lookup_table, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

In [26]:
from torch.utils.data import Dataset, DataLoader


# Define a custom dataset class for your data
class TranslationDataset(Dataset):
    """Map-style dataset that encodes (source, target) token lists as fixed-length id tensors.

    Args:
        source_data: iterable of token lists (e.g. a list or a pandas Series).
        target_data: iterable of token lists, aligned with ``source_data``.
        source_vocab_to_int: token -> id mapping for the source side.
        target_vocab_to_int: token -> id mapping for the target side; must contain
            ``'<PAD>'``.

    Each item is a dict with:
        ``source_sequence``: source ids, right-padded to the longest source sentence.
        ``target_sequence``: target ids, right-padded to the longest target sentence.
            When the target vocabulary contains both ``'<SOS>'`` and ``'<EOS>'`` the
            sentence is wrapped as ``<SOS> ... <EOS>`` before padding, which is the
            layout a decoder that reads ``target_sequence[0]`` as its start token needs.
        ``source_mask``: 1 for real source tokens, 0 for padding positions.
    """

    def __init__(self, source_data, target_data, source_vocab_to_int, target_vocab_to_int):
        # Materialise as plain lists so that integer indexing is positional. Indexing
        # a pandas Series with an int is label-based and raises KeyError when the
        # index has gaps (for example after rows were filtered out).
        self.source_data = list(source_data)
        self.target_data = list(target_data)
        self.source_vocab_to_int = source_vocab_to_int
        self.target_vocab_to_int = target_vocab_to_int

        self.pad_int = self.target_vocab_to_int['<PAD>']
        # The two vocabularies are indexed independently, so the source side needs
        # its own padding id; fall back to the target id only if it has none.
        self.source_pad_int = self.source_vocab_to_int.get('<PAD>', self.pad_int)

        self.sos_int = self.target_vocab_to_int.get('<SOS>')
        self.eos_int = self.target_vocab_to_int.get('<EOS>')
        self.add_boundaries = self.sos_int is not None and self.eos_int is not None

        # Padding lengths are derived from the data held by this dataset rather
        # than from notebook-level globals.
        self.max_source_length = max((len(tokens) for tokens in self.source_data), default=0)
        longest_target = max((len(tokens) for tokens in self.target_data), default=0)
        self.max_target_length = longest_target + (2 if self.add_boundaries else 0)

    def __len__(self):
        return len(self.source_data)

    @staticmethod
    def _encode(tokens, vocab_to_int):
        """Convert tokens to ids, using ``'<UNK>'`` for unseen tokens when the vocabulary has it."""
        unk_int = vocab_to_int.get('<UNK>')
        if unk_int is None:
            return [vocab_to_int[token] for token in tokens]
        return [vocab_to_int.get(token, unk_int) for token in tokens]

    def __getitem__(self, idx):
        source_ids = self._encode(self.source_data[idx], self.source_vocab_to_int)
        target_ids = self._encode(self.target_data[idx], self.target_vocab_to_int)

        if self.add_boundaries:
            target_ids = [self.sos_int] + target_ids + [self.eos_int]

        source_padding = self.max_source_length - len(source_ids)
        target_padding = self.max_target_length - len(target_ids)

        # Build the mask from the true length instead of comparing ids with the
        # padding id, so a real token can never be mistaken for padding.
        source_mask = [1] * len(source_ids) + [0] * source_padding

        source_ids = source_ids + [self.source_pad_int] * source_padding
        target_ids = target_ids + [self.pad_int] * target_padding

        return {
            'source_sequence': torch.tensor(source_ids, dtype=torch.long),
            'target_sequence': torch.tensor(target_ids, dtype=torch.long),
            'source_mask': torch.tensor(source_mask, dtype=torch.long)
        }


# Create datasets and data loaders
translation_dataset = TranslationDataset(df['original'], df['translation'], source_vocab_to_int, target_vocab_to_int)
batch_size = 32  # Adjust as needed

In [27]:
def custom_Collate(batch):
    """Merge dataset items into one time-major batch.

    Items whose source or target sequence has length zero are discarded. For every
    remaining key the per-item 1-D tensors are combined with ``pad_sequence`` using
    ``batch_first=False``, so each resulting tensor has the sequence dimension first
    and the batch dimension second; shorter sequences are filled with zeros.
    """
    kept_items = []
    for sample in batch:
        if sample['source_sequence'].shape[0] == 0 or sample['target_sequence'].shape[0] == 0:
            continue
        kept_items.append(sample)

    collated = {}
    for key in ('source_sequence', 'target_sequence', 'source_mask'):
        collated[key] = pad_sequence([sample[key] for sample in kept_items], batch_first=False)
    return collated

data_loader = DataLoader(translation_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_Collate)

In [28]:
import random
# create seq2seq model with attention, encoder and decoder
import torch.nn as nn
import torch.nn.functional as F


class Encoder(nn.Module):
    """LSTM encoder: embeds the source tokens and summarises them in the final LSTM state.

    Args:
        input_size: number of rows in the embedding table (source vocabulary size).
        hidden_size: LSTM hidden size.
        embedding_size: embedding dimension.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, hidden_size, embedding_size, num_layers, dropout):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(dropout)
        # Inter-layer dropout only exists with more than one layer; passing a
        # non-zero value for a single-layer LSTM just produces a warning.
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers,
                            dropout=dropout if num_layers > 1 else 0.0)

    def forward(self, source_sequence, source_mask):
        """Encode a time-major batch.

        Args:
            source_sequence: token ids, shape (source_sequence_length, batch_size).
            source_mask: same shape; summed over dim 0 to obtain each sentence's
                length, so it is assumed to hold 1 for real tokens (as a prefix)
                and 0 for trailing padding.

        Returns:
            outputs: (source_sequence_length, batch_size, hidden_size), zeros at
                padded positions.
            hidden, cell: (num_layers, batch_size, hidden_size).
        """
        embedded = self.dropout(self.embedding(source_sequence))
        # embedded shape: (source_sequence_length, batch_size, embedding_size)

        # pack_padded_sequence needs CPU lengths and rejects zero-length entries,
        # so an all-padding column is treated as a single-token sentence.
        lengths = source_mask.sum(0).long().clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted=False)
        packed_outputs, (hidden, cell) = self.lstm(packed)

        # Unpack so callers receive a regular padded tensor (as documented) instead
        # of a PackedSequence; total_length restores the full time dimension.
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs, total_length=source_sequence.size(0))
        # outputs shape: (source_sequence_length, batch_size, hidden_size)
        # hidden shape: (num_layers, batch_size, hidden_size)
        # cell shape: (num_layers, batch_size, hidden_size)

        return outputs, hidden, cell
    
class Decoder(nn.Module):
    """Single-step LSTM decoder.

    One call consumes one token per batch element together with the recurrent
    state and produces scores over the ``input_size`` output classes plus the
    updated state. ``encoder_outputs`` is accepted but not used by this class.
    """

    def __init__(self, input_size, hidden_size, embedding_size, num_layers, dropout):
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size
        self.embedding_size, self.num_layers = embedding_size, num_layers

        # Sub-modules are registered in the order embedding -> dropout -> lstm -> fc.
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout)
        self.fc = nn.Linear(hidden_size, input_size)

    def forward(self, target_sequence, hidden, cell, encoder_outputs):
        """Run one decoding step.

        ``target_sequence`` holds one token id per batch element, shape (batch_size,).
        ``hidden`` and ``cell`` are viewed as (num_layers, batch_size, hidden_size).
        Returns ``(prediction, hidden, cell)`` where ``prediction`` has shape
        (batch_size, input_size).
        """
        # Add a leading time dimension of length one: (1, batch_size).
        step_tokens = target_sequence.unsqueeze(0)

        state_shape = (self.num_layers, step_tokens.size(1), self.hidden_size)
        hidden = hidden.view(*state_shape)
        cell = cell.view(*state_shape)

        # (1, batch_size, embedding_size)
        step_embedding = self.dropout(self.embedding(step_tokens))

        # step_output: (1, batch_size, hidden_size)
        step_output, (hidden, cell) = self.lstm(step_embedding, (hidden, cell))

        # Drop the time dimension again before projecting onto the vocabulary.
        prediction = self.fc(step_output.squeeze(0))

        return prediction, hidden, cell
    
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper with teacher-forced training and greedy decoding.

    Args:
        encoder: module called as ``encoder(source_sequence, source_mask)`` and
            returning ``(encoder_outputs, hidden, cell)``.
        decoder: module called as ``decoder(tokens, hidden, cell, encoder_outputs)``
            and returning ``(scores, hidden, cell)``; must expose ``input_size``
            (the number of output classes).
        device: device on which intermediate tensors are created.
    """

    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source_sequence, source_mask, target_sequence, teacher_forcing_ratio=0.5):
        """Compute decoder scores for every target position.

        Args:
            source_sequence: (source_sequence_length, batch_size)
            source_mask: (source_sequence_length, batch_size)
            target_sequence: (target_sequence_length, batch_size); row 0 is fed to
                the decoder as the first input.
            teacher_forcing_ratio: probability of feeding the ground-truth token
                (instead of the decoder's own best guess) at each step.

        Returns:
            Tensor of shape (target_sequence_length, batch_size, target_vocab_size).
            Row 0 is never written and stays zero.
        """
        batch_size = source_sequence.shape[1]
        target_sequence_length = target_sequence.shape[0]
        target_vocab_size = self.decoder.input_size

        outputs = torch.zeros(target_sequence_length, batch_size, target_vocab_size, device=self.device)

        encoder_outputs, hidden, cell = self.encoder(source_sequence, source_mask)

        # First input to the decoder is the first row of the target batch.
        decoder_input = target_sequence[0, :]

        for t in range(1, target_sequence_length):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell, encoder_outputs)
            # output shape: (batch_size, target_vocab_size)
            outputs[t] = output

            # Index of the highest-scoring word for each batch element.
            best_guess = output.argmax(1)

            # Teacher forcing: feed the actual next word with the given probability,
            # otherwise the word the decoder just predicted.
            decoder_input = target_sequence[t] if random.random() < teacher_forcing_ratio else best_guess

        return outputs

    def predict(self, source_sequence, source_mask, target_vocab_to_int, int_to_target_vocab,
                max_target_length=100):
        """Greedy decoding.

        Args:
            source_sequence: (source_sequence_length, batch_size)
            source_mask: (source_sequence_length, batch_size)
            target_vocab_to_int: token -> id mapping; must contain ``'<SOS>'``. If it
                contains ``'<EOS>'``, each sentence is cut at its first ``<EOS>``.
            int_to_target_vocab: id -> token mapping used to build the output strings.
            max_target_length: decoding runs for at most ``max_target_length - 1`` steps.

        Returns:
            List with one space-joined string per batch element.
        """
        batch_size = source_sequence.shape[1]
        sos_int = target_vocab_to_int['<SOS>']
        eos_int = target_vocab_to_int.get('<EOS>')

        # Only the chosen token ids are kept per step; storing the full score
        # tensor for every step is unnecessary for greedy decoding.
        step_tokens = []
        with torch.no_grad():
            encoder_outputs, hidden, cell = self.encoder(source_sequence, source_mask)

            # First input to the decoder is the <SOS> token for every sentence.
            decoder_input = torch.full((batch_size,), sos_int, dtype=torch.long, device=self.device)
            finished = torch.zeros(batch_size, dtype=torch.bool, device=self.device)

            for _ in range(1, max_target_length):
                output, hidden, cell = self.decoder(decoder_input, hidden, cell, encoder_outputs)
                # The greedy choice becomes the next decoder input.
                decoder_input = output.argmax(1)
                step_tokens.append(decoder_input)

                if eos_int is not None:
                    finished |= decoder_input == eos_int
                    if bool(finished.all()):
                        # Every sentence has produced <EOS>; further steps are wasted work.
                        break

        # token_ids[t][i] is the id chosen at step t for batch element i.
        token_ids = torch.stack(step_tokens).cpu().tolist() if step_tokens else []

        # Convert the indices into actual words, stopping at the first <EOS>.
        decoded_words = []
        for i in range(batch_size):
            predicted_sentence = []
            for step in token_ids:
                token_id = step[i]
                if eos_int is not None and token_id == eos_int:
                    break
                predicted_sentence.append(int_to_target_vocab[token_id])
            decoded_words.append(' '.join(predicted_sentence))

        return decoded_words
    

In [34]:
# Hyperparameters
num_epochs = 1
learning_rate = 0.001
batch_size = 32

# Model hyperparameters
input_size_encoder = len(source_vocab_to_int)
input_size_decoder = len(target_vocab_to_int)
output_size = len(target_vocab_to_int)
encoder_embedding_size = 256
decoder_embedding_size = 256
hidden_size = 512
num_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5

In [35]:
print(input_size_encoder)
print(input_size_decoder)

84317
60639


In [36]:
# Initialize network
encoder_net = Encoder(input_size_encoder, hidden_size, encoder_embedding_size, num_layers, encoder_dropout)
decoder_net = Decoder(input_size_decoder, hidden_size, decoder_embedding_size, num_layers, decoder_dropout)
model = Seq2Seq(encoder_net, decoder_net, device).to(device)

In [37]:
# Loss and optimizer
pad_int = target_vocab_to_int['<PAD>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_int)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [40]:
# Train Network
# Make sure dropout is active: a later cell switches the model to eval mode, so
# re-running this cell afterwards would otherwise train without dropout.
model.train()

for epoch in range(num_epochs):
    for batch_idx, batch in enumerate(data_loader):

        source_sequence = batch['source_sequence'].to(device)
        target_sequence = batch['target_sequence'].to(device)
        source_mask = batch['source_mask'].to(device)

        optimizer.zero_grad()

        # Forward
        outputs = model(source_sequence, source_mask, target_sequence)
        # outputs shape: (target_sequence_length, batch_size, target_vocab_size)
        # Position 0 is only ever used as decoder input, so it is excluded from the loss.
        logits = outputs[1:].reshape(-1, outputs.shape[2])
        targets = target_sequence[1:].reshape(-1)

        loss = criterion(logits, targets)

        # Backward
        loss.backward()

        # Clip to avoid exploding gradient issues
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Print loss every 100 batches
        if batch_idx % 100 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Batch [{batch_idx}/{len(data_loader)}], Loss: {loss.item():.4f}')

        # Drop the references to this batch's score tensors before the next forward
        # pass allocates new ones; keeping them alive can raise peak GPU memory by
        # one full (target_len x batch x vocab) buffer.
        del outputs, logits, loss

OutOfMemoryError: CUDA out of memory. Tried to allocate 794.00 MiB. GPU 0 has a total capacty of 4.00 GiB of which 0 bytes is free. Of the allocated memory 2.86 GiB is allocated by PyTorch, and 617.95 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [39]:
import gc
gc.collect()

1350

In [None]:
test_sentence = "Now you are getting very nasty"

In [None]:
def prepare_sent(sentence: str):
    """Turn a raw sentence into a (sequence_length, 1) tensor of source-vocabulary ids.

    The sentence is tokenized with ``tokenize``; tokens missing from
    ``source_vocab_to_int`` are mapped to the ``'<UNK>'`` id. The result is moved
    to ``device`` and has a batch dimension of size one.
    """
    unk_int = source_vocab_to_int['<UNK>']
    token_ids = [source_vocab_to_int.get(word, unk_int) for word in tokenize(sentence)]
    # An explicit integer dtype keeps the tensor usable as embedding indices even
    # when the sentence yields no tokens (an empty list would default to float).
    return torch.tensor(token_ids, dtype=torch.long).unsqueeze(1).to(device)

In [None]:
sent = prepare_sent(test_sentence)

In [None]:
# predict
model.eval()
with torch.no_grad():
    translation = model.predict(sent,
                                torch.ones(sent.shape).to(device),
                                target_vocab_to_int,
                                int_to_target_vocab)
    
print(translation)

In [None]:
# save model
torch.save(model.state_dict(), '../models/seq2seq_NEW_model.pt')