In [60]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset


import numpy as np
import pandas as pd

import re
import nltk

# tokenizer
tokenizer = nltk.TweetTokenizer()

In [4]:
text_file = r"ara.txt\ara.txt"
columns = ["English", "Arabic", "Attribution"]
# header and names prevent the dataframe from starting with data
data = pd.read_csv(text_file, delimiter='\t', header=None, names=columns)

working_data = data.copy()
working_data.head()

Unnamed: 0,English,Arabic,Attribution
0,Hi.,مرحبًا.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Run!,اركض!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
2,Duck!,اخفض رأسك!,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Duck!,اخفضي رأسك!,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4,Duck!,اخفضوا رؤوسكم!,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [95]:
class TranslationDataset(Dataset):
    def __init__(self, data, tokenizer, stopwords):
        super(TranslationDataset, self).__init__()
        self.data = data

        #######################
        # English Preprocessing
        #######################

        self.data['cleaned_english'] = self.data['English'].apply(self.clean_data)
        self.data['tokenized_english'] = self.data['cleaned_english'].apply(lambda x: self.tokenize_text(x, tokenizer, stopwords))

        # get longest sequence
        self.longest_seq = self.get_longest_seq(list(self.data['tokenized_english'].values))

        # pad tokens
        self.data["padded_tokens"] = self.data['tokenized_english'].apply(lambda x: self.pad_tokens(x, self.longest_seq))

        # get word2idx
        self.word2idx = self.get_word2idx(list(self.data['padded_tokens'].values))

        # get idx2word
        self.idx2word = {idx : word for idx, word in enumerate(self.word2idx)}

        # get indices
        self.data['indices'] = self.data['padded_tokens'].apply(lambda x: self.get_indices(x, self.word2idx))

        ######################
        # Arabic Preprocessing
        ######################
        
        # clean arabic data
        self.data['cleaned_arabic'] = self.data['Arabic'].apply(self.clean_data)

        # tokenize arabic
        self.data['tokenized_arabic'] = self.data['cleaned_arabic'].apply(lambda x: self.tokenize_text(x, tokenizer, stopwords))

        # get longest sequence
        self.longest_seq_arab = self.get_longest_seq(list(self.data['tokenized_arabic'].values))

        # pad tokens
        self.data["padded_tokens_arabic"] = self.data['tokenized_arabic'].apply(lambda x: self.pad_tokens(x, self.longest_seq_arab))

        # get word2idx
        self.word2idx_arabic = self.get_word2idx(list(self.data['padded_tokens_arabic'].values))

        # get idx2word
        self.idx2word_arabic = {idx : word for idx, word in enumerate(self.word2idx_arabic)}

        # get indices
        self.data['indices_arabic'] = self.data['padded_tokens_arabic'].apply(lambda x: self.get_indices(x, self.word2idx_arabic))



    def clean_data(self, text):
        """
        cleans data and prepares it for tokenization
        Args:
            data: input_data (pd.DataFrame)
            tokenizer: nltk.tokenizer
        Returns:
            clean_data: List(List)
        """
        lower = text.lower()
        clean_text = re.sub(r'!"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~', '', lower)
        return '<SOS>' + clean_text + '<EOS>' 

    def tokenize_text(self, text, tokenizer, stopwords):
        """
        break texts into tokens
        Args:
            text (str): text to tokenize
            tokenizer (nltk.tokenizer): to convert texts into tokens
            stopwords (nltk.corpus.stopwords.words): list of words to not include in vocab 

        Return:
            tokens (List): a list of tokens
        """
        tokens = tokenizer.tokenize(text)
        tokens = [token for token in tokens if token not in stopwords] 
        return tokens

    def get_longest_seq(self, tokens):
        """
        Args:
            tokens: list of tokens
        Returns:
        longest_seq (int): longest sequence in datset
        """

        longest_seq = 0

        for token in tokens:
            if len(token) > longest_seq:
                longest_seq = len(token)
        
        return longest_seq

    def pad_tokens(self, tokens, max_len):
        """
        To make sure all tokens havethe same length
        Args: 
            tokens: list of tokens
        Returns:
            padded_tokens (List): list of padded tokens
        """
        if len(tokens) < max_len:
            return tokens + ['<PAD>' for _ in range(max_len - len(tokens))]
        return tokens

    def get_word2idx(self, all_tokens):
        word2idx ={
            '<SOS>' : 0,
            '<EOS>' : 1,
            '<PAD>' : 2,
            '<UNK>': 3,
        }

        for tokens in all_tokens:
            for token in tokens:
                if token not in word2idx:
                    word2idx[token] = len(word2idx)
                
        return word2idx
    
    def get_indices(self, tokens, vocab):
        indices = []
        for token in tokens:
                indices.append(vocab.get(token, vocab['<UNK>']))
        return indices

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        if torch.is_tensor(idx):
            idx = idx.tolist()

        english_indices = self.data.iloc[idx]['indices']
        arabic_indices = self.data.iloc[idx]['indices_arabic']

        return torch.tensor(english_indices, dtype=torch.long), torch.tensor(arabic_indices, dtype=torch.long)


In [96]:
tokenizer = nltk.TweetTokenizer()
stopwords = nltk.corpus.stopwords.words('english')

dataset = TranslationDataset(data, tokenizer, stopwords)
indices = torch.randperm(len(dataset))

train_size = int(0.7*len(indices))
train_dataset = Subset(dataset, indices=indices[:train_size])
val_dataset = Subset(dataset, indices=indices[train_size:])

batch_size=64
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [97]:
next(iter(train_dataloader))

[tensor([[   0, 2331, 1731,  ...,    2,    2,    2],
         [   0,   48,   40,  ...,    2,    2,    2],
         [   0,  872,  214,  ...,    2,    2,    2],
         ...,
         [   0,   46,  263,  ...,    2,    2,    2],
         [   0,  803,    5,  ...,    2,    2,    2],
         [   0, 1329,  894,  ...,    2,    2,    2]]),
 tensor([[   0, 3031, 6361,  ...,    2,    2,    2],
         [   0, 2102,   98,  ...,    2,    2,    2],
         [   0, 1659, 1645,  ...,    2,    2,    2],
         ...,
         [   0,  512,    5,  ...,    2,    2,    2],
         [   0,   26,   67,  ...,    2,    2,    2],
         [   0,   79, 8217,  ...,    2,    2,    2]])]

In [98]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        # src shape: [batch_size, seq_len]
        embedded = self.dropout(self.embedding(src))
        
        # outputs, (hidden, cell)
        # We only need the final hidden/cell states to pass to the decoder
        _, (hidden, cell) = self.rnn(embedded)
        
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        # input shape: [batch_size] (current token for each sequence in batch)
        # Add a dimension for seq_len which is 1
        input = input.unsqueeze(1) 
        
        embedded = self.dropout(self.embedding(input))
        
        # output shape: [batch_size, 1, hid_dim]
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        # prediction shape: [batch_size, output_dim]
        prediction = self.fc_out(output.squeeze(1))
        
        return prediction, hidden, cell

import random

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src: [batch_size, src_len]
        # trg: [batch_size, trg_len]
        
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        # tensor to store decoder outputs
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        # 1. Encode the source
        hidden, cell = self.encoder(src)
        
        # 2. First input to the decoder is the <SOS> token
        input = trg[:, 0]
        
        # 3. Loop through the target sequence (starting from the 2nd token)
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            # store prediction
            outputs[:, t, :] = output
            
            # decide whether to use teacher forcing
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1) 
            
            # if teacher forcing, use actual next token as next input; else use predicted token
            input = trg[:, t] if teacher_force else top1
            
        return outputs


# HYPERPARAMETERS
INPUT_DIM = len(dataset.word2idx)         # English vocab size
OUTPUT_DIM = len(dataset.word2idx_arabic) # Arabic vocab size
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# INITIALIZE
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = Seq2Seq(enc, dec, device).to(device)

# OPTIMIZER & LOSS
optimizer = torch.optim.Adam(model.parameters())

# We ignore the padding token so it doesn't affect the loss
TRG_PAD_IDX = dataset.word2idx_arabic['<PAD>']
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [99]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    
    for i, (src, trg) in enumerate(iterator):
        src, trg = src.to(device), trg.to(device)
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        # output shape: [batch, trg_len, output_dim]
        # trg shape: [batch, trg_len]
        
        # Reshape for loss function
        # Flatten outputs to [batch * trg_len, output_dim]
        # Flatten trg to [batch * trg_len]
        # We start from index 1 to ignore the <SOS> token 
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)
        
        loss = criterion(output, trg)
        loss.backward()
        
        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

# RUN TRAINING
N_EPOCHS = 10
CLIP = 1

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    print(f'Epoch: {epoch+1} | Train Loss: {train_loss:.3f}')

Epoch: 1 | Train Loss: 6.400
Epoch: 2 | Train Loss: 5.797
Epoch: 3 | Train Loss: 5.587
Epoch: 4 | Train Loss: 5.395
Epoch: 5 | Train Loss: 5.235
Epoch: 6 | Train Loss: 5.112
Epoch: 7 | Train Loss: 4.991
Epoch: 8 | Train Loss: 4.860
Epoch: 9 | Train Loss: 4.741
Epoch: 10 | Train Loss: 4.625
