In [None]:
from google.colab import drive, auth
import pandas as pd
import io
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import re
import numpy as np

drive.mount('/content/drive')
auth.authenticate_user()
FILE_ID = "1Ia0lMueQGicyHzacvLCmtVxiiALbaydQ"

def download_file_from_drive(file_id):
    """Downloads corpus from Google Drive using the file ID."""
    service = build('drive', 'v3')
    request = service.files().get_media(fileId=file_id)
    file_handle = io.BytesIO()

    downloader = MediaIoBaseDownload(file_handle, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
    file_handle.seek(0)
    return file_handle


file_handle = download_file_from_drive(FILE_ID)
df = pd.read_csv(file_handle)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Preproccessing

In [None]:
print(df.head())
print(len(df["English"]))
print(len(df["Greek"]))

                                             English  \
0                          Resumption of the session   
1  I declare resumed the session of the European ...   
2  Although, as you will have seen, the dreaded '...   
3  You have requested a debate on this subject in...   
4  In the meantime, I should like to observe a mi...   

                                               Greek  
0                              Επαvάληψη της συvσδoυ  
1  Κηρύσσω την επανάληψη της συνόδου του Ευρωπαϊκ...  
2  Όπως μπορέσατε να διαπιστώσετε, ο περίφημος "ι...  
3  Επιθυμείτε μία συζήτηση επί του θέματος τις επ...  
4  Επί του παρόντος θα ήθελα, όπως μου ζήτησαν ορ...  
1235977
1235977


In [None]:
missing_english = df['English'].isnull().sum()
print(f"Missing values in English column: {missing_english}")

missing_greek = df['Greek'].isnull().sum()
print(f"Missing values in Greek column: {missing_greek}")

Missing values in English column: 2354
Missing values in Greek column: 5767


In [None]:
import tensorflow as tf
import tensorflow_text as tf_text
import sentencepiece as spm

# Preprocessing Hyperparameters
MAX_VOCAB_SIZE = 32000 #Same as the paper
BATCH_SIZE = 128
MIN_TOKENS = 3
MAX_TOKENS = 20
MAX_SEQ_LENGTH = MAX_TOKENS + 2  # +2 for start and end tokens

df = df.dropna(subset=["English", "Greek"])

def clean_text(text):
    text = text.lower().strip()
    text = re.sub(r"\s+", " ", text)
    return text

df["English"] = df["English"].apply(clean_text)
df["Greek"] = df["Greek"].apply(clean_text)

# Filter by token length (without start/end tokens)
df = df[df["English"].str.split().str.len().between(MIN_TOKENS, MAX_TOKENS)]
df = df[df["Greek"].str.split().str.len().between(MIN_TOKENS, MAX_TOKENS)]

# Shuffle data
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into train/val/test (80/10/10)
total = len(df)
train_df = df.iloc[:int(0.8*total)]
val_df = df.iloc[int(0.8*total):int(0.9*total)]
test_df = df.iloc[int(0.9*total):]

# Print dataset sizes
print("Training set size:", len(train_df))
print("Validation set size:", len(val_df))
print("Test set size:", len(test_df))

# Combine training data for both languages to train a shared vocabulary
combined_train_text = pd.concat([train_df["English"], train_df["Greek"]], axis=0)
combined_train_text_path = "combined_train_text.txt"
combined_train_text.to_csv(combined_train_text_path, index=False, header=False)

spm.SentencePieceTrainer.Train(
    input=combined_train_text_path,
    model_prefix="spm_model",
    vocab_size=MAX_VOCAB_SIZE,
    character_coverage=1.0,
    model_type='bpe',
    max_sentence_length=10000
)

# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load("spm_model.model")

# Ensure start_token_id and end_token_id are outside the normal token range
vocab_size = sp.get_piece_size()
start_token_id = vocab_size + 1
end_token_id = vocab_size + 2

print("Trained SentencePiece vocab size:", sp.get_piece_size())

# Create tf_text SentencepieceTokenizer
tokenizer = tf_text.SentencepieceTokenizer(
    model=tf.io.gfile.GFile("spm_model.model", 'rb').read(),
    out_type=tf.int32,
    nbest_size=0,
    add_bos=False, add_eos=False
)

def vectorize_pairs(en, gr):
    # Tokenize English
    en_tokens = tokenizer.tokenize(en)
    en_tokens = en_tokens.to_tensor(shape=[tf.shape(en)[0], MAX_SEQ_LENGTH], default_value=0)

    # Tokenize Greek
    gr_tokens = tokenizer.tokenize(gr)
    # prepend start_token_id and append end_token_id to each sequence
    gr_tokens = tf.ragged.map_flat_values(
        lambda row: tf.concat([[start_token_id], row, [end_token_id]], axis=0),
        gr_tokens
    )

    # Convert to dense with padding/truncation
    gr_tokens = gr_tokens.to_tensor(shape=[tf.shape(gr)[0], MAX_SEQ_LENGTH], default_value=0)

    return (en_tokens, gr_tokens)

train_eng = tf.data.Dataset.from_tensor_slices(train_df["English"].values)
train_gr = tf.data.Dataset.from_tensor_slices(train_df["Greek"].values)
val_eng = tf.data.Dataset.from_tensor_slices(val_df["English"].values)
val_gr = tf.data.Dataset.from_tensor_slices(val_df["Greek"].values)

train_ds = tf.data.Dataset.zip((train_eng, train_gr)).batch(BATCH_SIZE).map(vectorize_pairs).prefetch(tf.data.AUTOTUNE)
val_ds = tf.data.Dataset.zip((val_eng, val_gr)).batch(BATCH_SIZE).map(vectorize_pairs).prefetch(tf.data.AUTOTUNE)

print("Vocabulary size:", sp.get_piece_size())
print("A few random tokens from the vocab:")
for i in [0, 10, 100, 500, 9999]:
    print(i, sp.id_to_piece(i))

for inp, tar in train_ds.take(1):
    print("Sample batch from train_ds:")
    print("Input shape:", inp.shape)
    print("Target shape:", tar.shape)
    print("First input sequence (token IDs):", inp[0, :20])
    print("First target sequence (token IDs):", tar[0, :20])

    # Convert a few tokens back to text for sanity check
    first_inp_tokens = inp[0].numpy()
    first_tar_tokens = tar[0].numpy()
    # Filter out padding (0)
    first_inp_tokens = [t for t in first_inp_tokens if t != 0]
    first_tar_tokens = [t for t in first_tar_tokens if t != 0]

    print("Reconstructed input text:", " ".join([sp.id_to_piece(int(t)) for t in first_inp_tokens if t < vocab_size]))
    print("Reconstructed target text:", " ".join([sp.id_to_piece(int(t)) for t in first_tar_tokens if t < vocab_size]))

for inp, tar in val_ds.take(1):
    print("Validation batch sample:")
    print("Input shape:", inp.shape)
    print("Target shape:", tar.shape)
    print("Input tokens (first sequence):", inp[0, :20])
    print("Target tokens (first sequence):", tar[0, :20])
    break

Training set size: 366815
Validation set size: 45852
Test set size: 45852
Trained SentencePiece vocab size: 32000
Vocabulary size: 32000
A few random tokens from the vocab:
0 <unk>
10 ▁α
100 ▁and
500 ▁fa
9999 ▁νομικό
Sample batch from train_ds:
Input shape: (128, 22)
Target shape: (128, 22)
First input sequence (token IDs): tf.Tensor(
[  137    71  2254  5660 31819     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0], shape=(20,), dtype=int32)
First target sequence (token IDs): tf.Tensor(
[32001   119  2148 29408     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0], shape=(20,), dtype=int32)
Reconstructed input text: ▁it ▁is ▁really ▁considerable .
Reconstructed target text: ▁είναι ▁πραγματικά ▁αξιοσημείωτο
Validation batch sample:
Input shape: (128, 22)
Target shape: (128, 22)
Input tokens (first sequence): tf.Tensor(
[  120    71   375   623 21718 22248 22227    43   200    48 26530 31819
     0     0

Model

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

def get_angles(pos, i, d_model):
      # Ensures float32 operations
      pos = tf.cast(pos, tf.float32)
      i = tf.cast(i, tf.float32)
      return pos / tf.pow(10000., (2 * (i//2)) / tf.cast(d_model, tf.float32))

def positional_encoding(position, d_model):
    angle_rads = get_angles(
        tf.range(position)[:, tf.newaxis],
        tf.range(d_model)[tf.newaxis, :],
        d_model
    )
    sines = tf.sin(angle_rads[:, 0::2])
    cosines = tf.cos(angle_rads[:, 1::2])
    angle_rads = tf.TensorArray(tf.float32, size=d_model)
    # Combine sines and cosines back:
    even_out = tf.reshape(sines, (-1, tf.shape(sines)[1])) # sines shape: position x d_model/2
    odd_out = tf.reshape(cosines, (-1, tf.shape(cosines)[1])) # same

    angle_rads = tf.reshape(tf.stack([even_out, odd_out], axis=2), (1, position, d_model))
    return tf.cast(angle_rads, tf.float32)


class PositionalEmbedding(layers.Layer):
    def __init__(self, vocab_size, d_model, max_length):
        super().__init__()
        self.d_model = d_model
        self.embedding = layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = positional_encoding(max_length, d_model)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x) * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[:, :length, :]
        return x

def scaled_dot_product_attention(q, k, v, mask):
    # Compute attention weights and output
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        scaled_logits += (mask * -1e9)

    weights = tf.nn.softmax(scaled_logits, axis=-1)
    output = tf.matmul(weights, v)
    return output, weights

class MultiHeadAttention(layers.Layer):
    # Multi-head attention mechanism
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.depth = d_model // num_heads
        self.wq = layers.Dense(d_model)
        self.wk = layers.Dense(d_model)
        self.wv = layers.Dense(d_model)
        self.dense = layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        # Split the last dimension into (num_heads, depth)
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        scaled_attention, _ = scaled_dot_product_attention(q, k, v, mask)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.num_heads*self.depth))
        return self.dense(concat_attention)

def point_wise_feed_forward_network(d_model, dff):
        # Fully connected feed-forward network
        return tf.keras.Sequential([
            layers.Dense(dff, activation='relu'),
            layers.Dense(d_model)
        ])

class EncoderLayer(layers.Layer):
    # A single layer of the Transformer encoder
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, x, mask, training):
        attn_output = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, look_ahead_mask=None, padding_mask=None, training=False):
        attn1 = self.mha1(q=x, k=x, v=x, mask=look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(x + attn1)

        attn2 = self.mha2(q=out1, k=enc_output, v=enc_output, mask=padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(out1 + attn2)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(out2 + ffn_output)

        return out3

class Encoder(layers.Layer):
    # The full Transformer encoder: multiple encoder layers
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size,
                 maximum_position_encoding, rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = PositionalEmbedding(vocab_size, d_model, maximum_position_encoding)
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = layers.Dropout(rate)

    def call(self, x, mask, training):
        x = self.embedding(x)
        x = self.dropout(x, training=training)
        for enc_layer in self.enc_layers:
            x = enc_layer(x=x, mask=mask, training=training)
        return x

class Decoder(layers.Layer):
    # The full Transformer decoder: multiple decoder layers
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size,
                 maximum_position_encoding, rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = PositionalEmbedding(vocab_size, d_model, maximum_position_encoding)
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = layers.Dropout(rate)

    def call(self, x, enc_output, look_ahead_mask, padding_mask, training):
        x = self.embedding(x)
        x = self.dropout(x, training=training)
        for dec_layer in self.dec_layers:
            x = dec_layer(x=x, enc_output=enc_output,
                          look_ahead_mask=look_ahead_mask,
                          padding_mask=padding_mask,
                          training=training)
        return x

class Transformer(tf.keras.Model):
    # The full Transformer model: encoder + decoder + final linear layer
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 target_vocab_size, pe_input, pe_target, rate=0.1):
        super().__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               input_vocab_size, pe_input, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               target_vocab_size, pe_target, rate)
        self.final_layer = layers.Dense(target_vocab_size)

    def call(self, enc_inputs, dec_inputs, enc_padding_mask,
             look_ahead_mask, dec_padding_mask, training):
        enc_output = self.encoder(x=enc_inputs, mask=enc_padding_mask, training=training)
        dec_output = self.decoder(x=dec_inputs, enc_output=enc_output, look_ahead_mask=look_ahead_mask, padding_mask=dec_padding_mask, training=training)
        final_output = self.final_layer(dec_output)
        return final_output

Training


In [None]:
import psutil
import GPUtil
import math
# Hyperparameters (Following "Attention is All You Need")
EPOCHS = 20
LEARNING_RATE = 1e-3
NUM_LAYERS = 6
D_MODEL = 256
NUM_HEADS = 8
DFF = 1024
DROPOUT_RATE = 0.1
warmup_steps = 4000

input_vocab_size = sp.get_piece_size()
target_vocab_size = input_vocab_size

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
train_loss = tf.keras.metrics.Mean(name='train_loss')
val_loss = tf.keras.metrics.Mean(name='val_loss')


#Follows learning rate scheduling from "Attention is All You Need"
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

transformer = Transformer(
    num_layers=NUM_LAYERS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dff=DFF,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    pe_input=MAX_SEQ_LENGTH,
    pe_target=MAX_SEQ_LENGTH,
    rate=DROPOUT_RATE
)

# Loss and metrics
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
learning_rate_schedule = CustomSchedule(D_MODEL)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate_schedule,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9
)

def create_padding_mask(seq):
    # Mask out padding tokens
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    # Prevent decoder from attending to future tokens
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)

@tf.function
def train_step(inp, tar):
    # tar_inp is everything except last token, tar_real is everything except first token
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    # Check how many tokens are non-padding in tar_real
    non_pad_tokens = tf.reduce_sum(
        tf.cast(tf.math.logical_not(tf.math.equal(tar_real, 0)), tf.float32)
    )
    tf.print("Number of non-padding tokens in tar_real:", non_pad_tokens)

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions = transformer(
            enc_inputs=inp,
            dec_inputs=tar_inp,
            enc_padding_mask=enc_padding_mask,
            look_ahead_mask=combined_mask,
            dec_padding_mask=dec_padding_mask,
            training=True
        )
        loss = loss_function(tar_real, predictions)

    # Print out the loss to confirm it's non-zero
    tf.print("Loss:", loss)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    avg_grad = tf.reduce_mean([tf.reduce_mean(tf.abs(g)) for g in gradients if g is not None])
    tf.print("Average gradient:", avg_grad)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    train_loss(loss)

@tf.function
def val_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    predictions = transformer(
        enc_inputs=inp,
        dec_inputs=tar_inp,
        enc_padding_mask=enc_padding_mask,
        look_ahead_mask=combined_mask,
        dec_padding_mask=dec_padding_mask,
        training=True
    )

    v_loss = loss_function(tar_real, predictions)
    val_loss(v_loss)

sample_input = tf.ones((1, MAX_SEQ_LENGTH), dtype=tf.int32)
sample_target = tf.ones((1, MAX_SEQ_LENGTH), dtype=tf.int32)
_ = transformer(
    enc_inputs=sample_input,
    dec_inputs=sample_target[:, :-1],
    enc_padding_mask=create_padding_mask(sample_input),
    look_ahead_mask=create_look_ahead_mask(MAX_SEQ_LENGTH-1),
    dec_padding_mask=create_padding_mask(sample_input),
    training=False
)

def get_resource_usage():
    # CPU usage
    cpu = psutil.cpu_percent()

    # RAM usage
    ram = psutil.virtual_memory().percent

    # GPU info
    try:
        gpus = GPUtil.getGPUs()
        if gpus:
            gpu = gpus[0]  # Get first GPU
            gpu_name = gpu.name
            gpu_util = gpu.load * 100
            gpu_mem = gpu.memoryUtil * 100
            gpu_info = f"GPU ({gpu_name}): {gpu_util:.1f}% (MEM: {gpu_mem:.1f}%)"
        else:
            gpu_info = "No GPU found"
    except:
        gpu_info = "GPU info unavailable"

    return f"CPU: {cpu:.1f}% | RAM: {ram:.1f}% | {gpu_info}"

# Training loop
for epoch in range(EPOCHS):
    train_loss.reset_state()
    val_loss.reset_state()

    # Training
    for (batch, (inp, tar)) in enumerate(train_ds):
        train_step(inp, tar)
        print(f"Resources: {get_resource_usage()}")
        print(f"Training Epoch {epoch+1}, Batch {batch+1}...")

    # Validation
    for (batch, (inp, tar)) in enumerate(val_ds):
        val_step(inp, tar)

    # Print epoch results
    print(f"Epoch {epoch+1}/{EPOCHS}, "
          f"Train Loss: {train_loss.result():.4f}, "
          f"Val Loss: {val_loss.result():.4f}")

    # Save the model weights after each epoch
    transformer.save_weights(f"transformer_weights_epoch_{epoch+1}.weights.h5")
    print("Model weights saved.")

# Save final weights
transformer.save_weights("/content/drive/MyDrive/transformer_weights_final.weights.h5")
print("Model training completed and weights saved.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Resources: CPU: 13.1% | RAM: 7.8% | GPU (NVIDIA A100-SXM4-40GB): 64.0% (MEM: 16.1%)
Training Epoch 20, Batch 1867...
Number of non-padding tokens in tar_real: 1758
Loss: 2.23072147
Average gradient: 0.000861086301
Resources: CPU: 15.2% | RAM: 7.8% | GPU (NVIDIA A100-SXM4-40GB): 69.0% (MEM: 16.1%)
Training Epoch 20, Batch 1868...
Number of non-padding tokens in tar_real: 1744
Loss: 2.31198359
Average gradient: 0.000789158454
Resources: CPU: 13.0% | RAM: 7.8% | GPU (NVIDIA A100-SXM4-40GB): 69.0% (MEM: 16.1%)
Training Epoch 20, Batch 1869...
Number of non-padding tokens in tar_real: 1803
Loss: 2.17277026
Average gradient: 0.000709340151
Resources: CPU: 12.9% | RAM: 7.8% | GPU (NVIDIA A100-SXM4-40GB): 66.0% (MEM: 16.1%)
Training Epoch 20, Batch 1870...
Number of non-padding tokens in tar_real: 1731
Loss: 2.34794021
Average gradient: 0.000825890806
Resources: CPU: 13.1% | RAM: 7.8% | GPU (NVIDIA A100-SXM4-40GB): 66.0% (MEM: 16

Evaluation

In [None]:
import sacrebleu

# Parameters
beam_size = 1
max_length = 20
start_token_id = 1

weights = download_file_from_drive('18dtss4j9kAxVzhS1vJp-jG4Xr1Ct94Zf')

transformer.load_weights(weights)
print("Weights loaded successfully!")

def beam_search_decode(model, input_seq, beam_size=5, max_length=50, debug=False):
    enc_padding_mask = create_padding_mask(input_seq)
    enc_output = model.encoder(x=input_seq, mask=enc_padding_mask, training=False)

    sequences = [(tf.constant([[start_token_id]], dtype=tf.int32), 0.0)]

    # Debug: Print shape and first few tokens of input_seq
    if debug:
        print("DEBUG: input_seq shape:", input_seq.shape)
        print("DEBUG: input_seq (first row):", input_seq[0, :10])

    for step in range(max_length):
        all_candidates = []
        for seq, score in sequences:
            look_ahead_mask = create_look_ahead_mask(tf.shape(seq)[1])
            dec_padding_mask = create_padding_mask(input_seq)
            combined_mask = tf.maximum(create_padding_mask(seq), look_ahead_mask)

            predictions = model.decoder(
                x=seq, enc_output=enc_output,
                look_ahead_mask=combined_mask,
                padding_mask=dec_padding_mask,
                training=False
            )
            predictions = model.final_layer(predictions)[:, -1, :]  # [1, vocab_size]

            log_probs = tf.nn.log_softmax(predictions, axis=-1)[0]  # [vocab_size]

            # Debug: Print details at the first time step for the first sequence
            if debug and step == 0:
                print("DEBUG: log_probs shape:", log_probs.shape)
                print("DEBUG: top 10 log_probs:", tf.math.top_k(log_probs, k=10))

            top_k_log_probs, top_k_ids = tf.math.top_k(log_probs, k=beam_size)

            # Debug: At first step, print chosen top tokens
            if debug and step == 0:
                print("DEBUG: top_k_ids:", top_k_ids.numpy())
                print("DEBUG: top_k_log_probs:", top_k_log_probs.numpy())

            for i in range(beam_size):
                new_seq = tf.concat([seq, tf.expand_dims([top_k_ids[i]], 0)], axis=-1)
                new_score = score + float(top_k_log_probs[i])
                all_candidates.append((new_seq, new_score))

        all_candidates = sorted(all_candidates, key=lambda x: x[1], reverse=True)
        sequences = all_candidates[:beam_size]

    best_seq, best_score = sequences[0]
    return best_seq[0, 1:].numpy()

# Choose 100 random samples from test_df
test_samples = test_df.sample(100, random_state=42)

references = []
predictions_text = []

# Debug on only the first sample to avoid too much clutter
debug_first_sample = True

for i, (eng, gr_ref) in enumerate(zip(test_samples["English"].values, test_samples["Greek"].values)):
    ragged = tokenizer.tokenize([eng])
    input_seq = ragged.to_tensor(shape=[1, MAX_SEQ_LENGTH])

    pred_tokens = beam_search_decode(transformer, input_seq, beam_size=beam_size, max_length=max_length, debug=debug_first_sample)
    debug_first_sample = False  # Only debug the first sample

    pred_text = " ".join([sp.id_to_piece(int(t)) for t in pred_tokens])
    references.append(gr_ref.strip())
    predictions_text.append(pred_text.strip())

# Debug: Print first 5 predictions and references
print("DEBUG: First 5 predictions vs references:")
for i in range(5):
    print("EN:", test_samples["English"].values[i])
    print("REF:", references[i])
    print("PRED:", predictions_text[i])
    print("---")

df_results = pd.DataFrame({
    "English": test_samples["English"].values,
    "Greek_Reference": test_samples["Greek"].values,
    "Predicted_Translation": predictions_text
})

df_results.head()

references_for_sacrebleu = [references]

bleu = sacrebleu.corpus_bleu(predictions_text, references_for_sacrebleu)
chrf = sacrebleu.corpus_chrf(predictions_text, references_for_sacrebleu)
ter = sacrebleu.corpus_ter(predictions_text, references_for_sacrebleu)

# Simple token-level accuracy
token_accuracies = []
for pred, ref in zip(predictions_text, references):
    pred_tokens = pred.split()
    ref_tokens = ref.split()
    matches = sum(p == r for p, r in zip(pred_tokens, ref_tokens))
    total = min(len(pred_tokens), len(ref_tokens))
    token_accuracies.append(matches / total if total > 0 else 0)
token_accuracy = np.mean(token_accuracies)

print("--------- Dataset Info ---------")
print("Training set size:", len(train_df))
print("Validation set size:", len(val_df))
print("Test set size:", len(test_df))
print(f"Tokenized Vocabulary Size: {sp.get_piece_size()}")
print("--------- Preprocessing Hyperparameters ---------")
print(f"MAX_VOCAB_SIZE:{MAX_VOCAB_SIZE}")
print(f"MAX_SEQ_LENGTH:{MAX_SEQ_LENGTH}")
print(f"BATCH_SIZE:{BATCH_SIZE}")
print(f"MIN_TOKENS (per sentence):{MIN_TOKENS}")
print(f"MAX_TOKENS (per sentence):{MAX_TOKENS}")
print("--------- Model Hyperparameters ---------")
print(f"EPOCHS: {EPOCHS}")
print(f"LEARNING_RATE: {LEARNING_RATE}")
print(f"NUM_LAYERS: {NUM_LAYERS}")
print(f"D_MODEL: {D_MODEL}")
print(f"NUM_HEADS: {NUM_HEADS}")
print(f"DFF: {DFF}")
print(f"DROPOUT_RATE: {DROPOUT_RATE}")
print("--------- Evaluation Metrics ---------")
print(f"BLEU: {bleu.score:.2f}")
print(f"CHRF: {chrf.score:.2f}")
print(f"TER: {ter.score:.2f}")

Weights loaded successfully!
DEBUG: input_seq shape: (1, 22)
DEBUG: input_seq (first row): tf.Tensor([9257   71  700   14 1726  263   14 1484 3818   58], shape=(10,), dtype=int32)
DEBUG: log_probs shape: (32000,)
DEBUG: top 10 log_probs: TopKV2(values=<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([-1.2894046, -1.6825116, -1.8038304, -2.9072692, -3.140919 ,
       -3.394728 , -3.5649388, -3.9261518, -4.640788 , -4.732126 ],
      dtype=float32)>, indices=<tf.Tensor: shape=(10,), dtype=int32, numpy=
array([10901,   102,    18,  1554,   943,   926,    61,  5221,  8471,
          66], dtype=int32)>)
DEBUG: top_k_ids: [10901]
DEBUG: top_k_log_probs: [-1.2894046]
DEBUG: First 5 predictions vs references:
EN: india is now the country with the most cases of aids.
REF: η ινδία είναι τώρα η χώρα με τα περισσότερα περιστατικά aids.
PRED: ▁ινδία ▁η ▁χώρα ▁είναι ▁πλέον ▁η ▁χώρα ▁που ▁έχει ▁τη ▁χώρα ▁αυτή ▁να ▁υποφέρει ▁από ▁το ▁aids . ▁" ▁αναλογία
---
EN: in most cases, costly collective wag