### 1. Data Loading

In [None]:
# NOTE: Install any necessary libraries like pandas, numpy, torch, 

In [2]:
import pandas as pd # used for reading the CSV file containing the English-Spanish pairs

# Load the dataset
data = pd.read_csv('data.csv')
print(data.head()) # English is the source and spanish is the target

  english  spanish
0     Go.      Ve.
1     Go.    Vete.
2     Go.    Vaya.
3     Go.  Váyase.
4     Hi.    Hola.


### 2. Preprocess the data

In [None]:
import torchtext # for handling text data and datasets
from torchtext.legacy.data import Field, BucketIterator
import spacy # for efficient tokenization and preprocessing of text


In [69]:
# checking versions
print(torch.__version__)
print(spacy.__version__)
print(torchtext.__version__)

1.8.0+cpu
2.3.7
0.9.0


In [58]:
# Load spacy models for English and spanish
spacy_en = spacy.load('en_core_web_sm')
spacy_es = spacy.load('es_core_news_sm')

# Define tokenizers: this is to convert sentences into tokens, aka words.
# It'll break down text into smaller pieces that the model can understand
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

def tokenize_es(text):
    return [tok.text for tok in spacy_es.tokenizer(text)]

# Create fields for source and target languages
SRC = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize=tokenize_es, init_token='<sos>', eos_token='<eos>', lower=True)

# Create the dataset from the Pandas DataFrame
fields = {'english': ('src', SRC), 'spanish': ('trg', TRG)}
examples = [torchtext.legacy.data.Example.fromlist([data['english'][i], data['spanish'][i]], [('src', SRC), ('trg', TRG)]) for i in range(len(data))]
dataset = torchtext.legacy.data.Dataset(examples, [('src', SRC), ('trg', TRG)])

# Split the dataset into training, validation, and test sets
train_data, valid_data, test_data = dataset.split(split_ratio=[0.2, 0.4, 0.4])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def sort_key(ex):
    return len(ex.src)

BATCH_SIZE = 12
train_iterator = BucketIterator(
    train_data,
    batch_size=BATCH_SIZE,
    sort_within_batch = True,
    device=device,
    sort_key=sort_key
)

### 3. Build the Transformer model

In [59]:
# Import dependencies for building the neural network
import torch
import torch.nn as nn
import torch.optim as optim

In [60]:
# Define Transformer Object that inherits form nn.module
class Transformer(nn.Module):
    def __init__(self, input_dim, output_dim, emb_dim, n_heads, n_layers, dropout):
        super().__init__()
        # embedding layers to convert input and output tokens into dense vectors, aka embeddings
        self.embedding_src = nn.Embedding(input_dim, emb_dim)
        self.embedding_trg = nn.Embedding(output_dim, emb_dim)
        
        # transformer layer: the main part of the model which has the transformer architecture;
        # responsible for processing the sequences and capturing relationships between tokens
        self.transformer = nn.Transformer(emb_dim, n_heads, n_layers, n_layers, emb_dim * 4, dropout)
       
        # output layer: a fully connected (fc) layer that maps the transformers output to the target vocabulary size
        self.fc_out = nn.Linear(emb_dim, output_dim)
        
        # dropout: a technique used to prevent overfitting by randomly setting a fraction of the input units to 0 during training
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, trg):
        src_emb = self.dropout(self.embedding_src(src))
        trg_emb = self.dropout(self.embedding_trg(trg))
        output = self.transformer(src_emb, trg_emb)
        return self.fc_out(output)

In [61]:
# build vocabulary
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [62]:
# Define Hyperparameters for the model
INPUT_DIM = len(SRC.vocab) # input dimension
OUTPUT_DIM = len(TRG.vocab) # output dimension
EMB_DIM = 256 # embedding dimension
N_HEADS = 8 # number of attention heads
N_LAYERS = 3 # number of layers
DROPOUT = 0.1

In [63]:
# Instantiate the Model
model = Transformer(INPUT_DIM, OUTPUT_DIM, EMB_DIM, N_HEADS, N_LAYERS, DROPOUT)

### 4. Train the model

In [64]:
def train(model, iterator, optimizer, criterion, clip):
    model.train() # put the model in training mode
    epoch_loss = 0

    for batch in iterator: # iterate through batches
        src = batch.src
        trg = batch.trg
    
        optimizer.zero_grad() # zero the gradients to avoid accumulation form previous iterations
        output = model(src, trg[:-1, :])
        output_dim = output.shape[-1]
    
        output = output.view(-1, output_dim)
        trg = trg[1:, :].view(-1)
    
        loss = criterion(output, trg) # loss calculation
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
    
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)
    
    

In [65]:
# Delcaring optimizer and loss function
optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss(ignore_index=TRG.vocab.stoi[TRG.pad_token])

In [66]:
# Training loop
N_EPOCHS = 10

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, loss_fn, clip=1)
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.3f}')

Epoch 1, Train Loss: 4.497
Epoch 2, Train Loss: 3.955
Epoch 3, Train Loss: 3.712
Epoch 4, Train Loss: 3.552
Epoch 5, Train Loss: 3.466
Epoch 6, Train Loss: 3.396
Epoch 7, Train Loss: 3.300
Epoch 8, Train Loss: 3.243


KeyboardInterrupt: 