In [22]:
from datasets import load_dataset
from tqdm import tqdm


# Load the German-English ("de-en") configuration of the OPUS-100 dataset
dataset = load_dataset("Helsinki-NLP/opus-100", "de-en")

# Pull out the training and validation splits
train_data, val_data = dataset['train'], dataset['validation']

def preprocess_function(examples):
    """Split a batch of translation pairs into parallel source/target lists.

    Args:
        examples: Batch mapping whose ``'translation'`` entry is a sequence of
            dicts, each holding a German (``'de'``) and an English (``'en'``)
            sentence.

    Returns:
        Dict with ``'inputs'`` (the German sentences) and ``'targets'`` (the
        English sentences), both in batch order.
    """
    german, english = [], []
    for pair in examples['translation']:
        german.append(pair['de'])
        english.append(pair['en'])
    return {'inputs': german, 'targets': english}

# Derive the 'inputs'/'targets' fields for both splits, processing in batches
train_data = train_data.map(preprocess_function, batched=True)
val_data = val_data.map(preprocess_function, batched=True)

# Sanity check: show the first five training pairs
for i in range(5):
    example = train_data[i]
    print(f"Input (German): {example['inputs']}")
    print(f"Target (English): {example['targets']}\n")


Input (German): Deine Habgier wird noch dein Tod sein.
Target (English): It's greed that it's gonna be the death of you, 'cause you...

Input (German): - Vega.
Target (English): Vega.

Input (German): Sagen Sie einfach stopp.
Target (English): Just say when.

Input (German): - Warte.
Target (English): - Wait.

Input (German): Ich will nicht hier sein.
Target (English): I don't wanna be here.



In [23]:
import re
def clean_text(text):
    """Normalize a sentence: lowercase it, drop unwanted characters, tidy spaces.

    After lowercasing, every character other than ASCII letters, digits, the
    German letters ä/ö/ü/ß, the punctuation ``. , ! ?`` and whitespace is
    deleted (apostrophes and hyphens are therefore removed). Whitespace runs
    are then collapsed to a single space and the result is trimmed.

    Args:
        text: The sentence to normalize.

    Returns:
        The normalized sentence.
    """
    lowered = text.lower()
    # Delete everything outside the allowed character set
    kept = re.sub(r"[^a-zA-Z0-9äöüß.,!?\s]", "", lowered)
    # Collapse whitespace runs, then trim both ends
    return re.sub(r"\s+", " ", kept).strip()

# Preprocess the dataset
def preprocess_function(examples):
    """Build cleaned source/target lists from a batch of translation pairs.

    Every German (``'de'``) and English (``'en'``) sentence found under
    ``examples['translation']`` is passed through ``clean_text``.

    Returns:
        Dict with ``'inputs'`` (cleaned German) and ``'targets'`` (cleaned
        English), both in batch order.
    """
    pairs = examples['translation']
    return {
        'inputs': [clean_text(pair['de']) for pair in pairs],
        'targets': [clean_text(pair['en']) for pair in pairs],
    }

# Recompute 'inputs'/'targets' for both splits with the cleaning preprocessing
train_data = train_data.map(preprocess_function, batched=True)
val_data = val_data.map(preprocess_function, batched=True)

# Sanity check: show the first five cleaned training pairs
for i in range(5):
    example = train_data[i]
    print(f"Input (German): {example['inputs']}")
    print(f"Target (English): {example['targets']}\n")


Input (German): deine habgier wird noch dein tod sein.
Target (English): its greed that its gonna be the death of you, cause you...

Input (German): vega.
Target (English): vega.

Input (German): sagen sie einfach stopp.
Target (English): just say when.

Input (German): warte.
Target (English): wait.

Input (German): ich will nicht hier sein.
Target (English): i dont wanna be here.



In [24]:
from transformers import AutoTokenizer

# Load the pretrained "t5-small" tokenizer; it is used for both the German
# inputs and the English targets below.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of source/target sentences with the shared tokenizer.

    Both sides are truncated and padded to exactly 128 tokens.

    Args:
        examples: Batch mapping with ``'inputs'`` (source sentences) and
            ``'targets'`` (target sentences).

    Returns:
        The tokenizer output for the source sentences, with an extra
        ``'labels'`` entry holding the target sentences' token IDs.
    """
    # Identical settings for both sides so every sequence has the same length
    tok_kwargs = dict(max_length=128, truncation=True, padding="max_length")

    encoded = tokenizer(examples['inputs'], **tok_kwargs)
    encoded['labels'] = tokenizer(examples['targets'], **tok_kwargs)['input_ids']
    return encoded

# Apply tokenization to the train and validation splits in batches.
# Each split variable is rebound to its tokenized version.
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)


In [25]:
# Peek at the token IDs of the first two training examples
for i in range(2):
    example = train_data[i]
    print(f"Tokenized Input IDs (German): {example['input_ids']}")
    print(f"Tokenized Target IDs (English): {example['labels']}\n")


Tokenized Input IDs (German): [5922, 9809, 122, 972, 551, 763, 20, 77, 12, 26, 1110, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Tokenized Target IDs (English): [165, 30337, 24, 165, 3, 13366, 36, 8, 1687, 13, 25, 6, 1137, 25, 233, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Tokenized Input IDs (German): [3, 162, 122, 9, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [26]:
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import AutoTokenizer


In [27]:
# Custom Dataset class for PyTorch
class TranslationDataset(Dataset):
    """PyTorch ``Dataset`` view over tokenized translation examples.

    Wraps an indexable collection whose items provide ``'input_ids'`` and
    ``'labels'`` sequences and serves them as ``torch.long`` tensors.
    """

    def __init__(self, dataset):
        """Keep a reference to the underlying collection of examples."""
        self.dataset = dataset

    def __len__(self):
        """Return the number of examples in the wrapped collection."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, labels)`` for example ``idx`` as long tensors."""
        example = self.dataset[idx]
        return tuple(
            torch.tensor(example[key], dtype=torch.long)
            for key in ('input_ids', 'labels')
        )

In [28]:
# Training split: wrap the tokenized data and serve shuffled batches of 16
train_dataset = TranslationDataset(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Validation split: same batch size, original order
val_dataset = TranslationDataset(val_data)
val_dataloader = DataLoader(val_dataset, batch_size=16)


In [29]:
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer mapping source token IDs to target logits.

    Source and target sequences share one embedding table and one learned
    positional table. The embedded sequences are passed through
    ``nn.Transformer`` and a final linear layer projects the decoder output
    to vocabulary-sized logits.

    Args:
        vocab_size: Number of token IDs (embedding rows and output logits).
        d_model: Embedding and hidden width.
        nhead: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        dim_feedforward: Width of the feed-forward sublayers.
        max_seq_length: Longest sequence the positional table covers.
    """

    def __init__(self, vocab_size, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, max_seq_length=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Learned positional table, broadcast over the batch dimension
        self.positional_encoding = nn.Parameter(torch.randn(1, max_seq_length, d_model))
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
        )
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        """Compute target-vocabulary logits for every target position.

        Args:
            src: Source token IDs, shape ``(batch, src_len)``.
            tgt: Target token IDs fed to the decoder, shape ``(batch, tgt_len)``.

        Returns:
            Logits of shape ``(batch, tgt_len, vocab_size)``. Position ``t``
            depends only on ``src`` and ``tgt[:, :t + 1]``.
        """
        src_emb = self.embedding(src) + self.positional_encoding[:, :src.size(1), :]
        tgt_emb = self.embedding(tgt) + self.positional_encoding[:, :tgt.size(1), :]

        # Causal mask: -inf above the diagonal stops each decoder position from
        # attending to later target tokens. Without it the decoder can read the
        # very tokens it is supposed to predict.
        tgt_len = tgt.size(1)
        causal_mask = torch.triu(
            torch.full(
                (tgt_len, tgt_len),
                float("-inf"),
                dtype=tgt_emb.dtype,
                device=tgt_emb.device,
            ),
            diagonal=1,
        )

        # nn.Transformer is sequence-first here, so swap batch and sequence axes
        output = self.transformer(
            src_emb.transpose(0, 1),
            tgt_emb.transpose(0, 1),
            tgt_mask=causal_mask,
        )
        return self.fc_out(output.transpose(0, 1))

# Build the model, sized to the tokenizer's vocabulary
vocab_size = tokenizer.vocab_size
model = TransformerModel(vocab_size)

# Training setup: cross-entropy that skips padding targets, optimized with Adam
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=5e-5)

In [31]:
# Release cached MPS (Apple GPU) memory. Guarded so the cell also runs on
# machines where the MPS backend is not available.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
