In [None]:
!python3 -m venv venv


In [None]:
!source venv/bin/activate

In [None]:
import torch
# Sanity check: report whether the installed PyTorch build exposes the MPS
# backend (is_built) and whether it is usable on this machine (is_available).
print("MPS available:", torch.backends.mps.is_available())
print("MPS built:", torch.backends.mps.is_built())


MPS available: True
MPS built: True


In [None]:
!pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu


Looking in indexes: https://download.pytorch.org/whl/nightly/cpu


In [None]:
!pip install --quiet sentencepiece sacrebleu

import os
import zipfile
import math
import random
import urllib.request
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import sentencepiece as spm
import sacrebleu

# Reproducibility: seed PyTorch, NumPy and Python's RNG (plus every CUDA
# device when CUDA is present).
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Device preference order: MPS first, then CUDA, otherwise CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print("Using device:", device)


Using device: mps


In [None]:
DATA_URL = "https://github.com/odashi/small_parallel_enja/archive/refs/heads/master.zip"
ZIP_PATH = Path("small_parallel_enja.zip")
EXTRACT_DIR = Path("small_parallel_enja-master")

# Fetch the archive once. The download goes to a temporary ".part" file that is
# renamed only after urlretrieve returns, so an interrupted download can never
# be mistaken for a complete ZIP on the next run.
if not ZIP_PATH.exists():
    print("Downloading dataset ZIP...")
    partial_zip = ZIP_PATH.with_name(ZIP_PATH.name + ".part")
    try:
        urllib.request.urlretrieve(DATA_URL, partial_zip)
        partial_zip.replace(ZIP_PATH)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial_zip.unlink(missing_ok=True)
else:
    print("ZIP already present.")

# Unpack into the current directory unless the extracted folder already exists.
if not EXTRACT_DIR.exists():
    print("Extracting...")
    with zipfile.ZipFile(ZIP_PATH, "r") as zf:
        zf.extractall(".")
else:
    print("Already extracted.")


def find_split_files(root: Path):
    """Search ``root`` recursively for the corpus split files.

    Returns a dict with the keys ``train.en``, ``train.ja``, ``dev.en``,
    ``dev.ja``, ``test.en`` and ``test.ja``. Each value is the first entry
    under ``root`` (in ``rglob`` order) whose name matches, or ``None`` when
    nothing matches. For the dev split, ``tune.<lang>`` is used when
    ``dev.<lang>`` is absent.
    """
    # Index every entry by name once; setdefault keeps the first occurrence.
    first_seen = {}
    for entry in root.rglob("*"):
        first_seen.setdefault(entry.name, entry)

    # Accepted file names per output key, in order of preference.
    accepted = {
        "train.en": ("train.en",),
        "train.ja": ("train.ja",),
        "dev.en": ("dev.en", "tune.en"),
        "dev.ja": ("dev.ja", "tune.ja"),
        "test.en": ("test.en",),
        "test.ja": ("test.ja",),
    }
    return {
        key: next((first_seen[name] for name in names if name in first_seen), None)
        for key, names in accepted.items()
    }

# Resolve the split files inside the extracted archive; the bare expression on
# the last line displays the resulting mapping as the cell output.
files = find_split_files(EXTRACT_DIR)
files


ZIP already present.
Already extracted.


{'train.en': PosixPath('small_parallel_enja-master/train.en'),
 'train.ja': PosixPath('small_parallel_enja-master/train.ja'),
 'dev.en': PosixPath('small_parallel_enja-master/dev.en'),
 'dev.ja': PosixPath('small_parallel_enja-master/dev.ja'),
 'test.en': PosixPath('small_parallel_enja-master/test.en'),
 'test.ja': PosixPath('small_parallel_enja-master/test.ja')}

In [None]:
# Special-token ids (padding, unknown, begin/end of sentence) that are handed to
# the SentencePiece trainer and used for masking and decoding.
PAD_ID, UNK_ID, BOS_ID, EOS_ID = 0, 1, 2, 3
# Vocabulary sizes requested for the English and Japanese tokenizers.
VOCAB_EN = 8000
VOCAB_JA = 8000

# Read text files
def read_lines(path: Path):
    """Return every line of the UTF-8 text file at ``path``, whitespace-stripped.

    Blank lines are kept as empty strings, so the returned list has one entry
    per line of the file.
    """
    stripped = []
    with open(path, encoding="utf-8") as handle:
        for raw in handle:
            stripped.append(raw.strip())
    return stripped

assert files["train.en"] and files["train.ja"], "train.{en,ja} not found"


def _read_optional_split(key):
    """Read the lines of split ``key``; return [] when that file was not found."""
    split_path = files[key]
    return read_lines(split_path) if split_path else []


# Training data is mandatory; dev and test fall back to empty lists.
train_en, train_ja = read_lines(files["train.en"]), read_lines(files["train.ja"])

dev_en, dev_ja = _read_optional_split("dev.en"), _read_optional_split("dev.ja")
test_en, test_ja = _read_optional_split("test.en"), _read_optional_split("test.ja")

def aligned_pairs(src, tgt):
    """Pair ``src`` and ``tgt`` position by position, skipping incomplete pairs.

    A position is dropped when either side is empty (falsy). Pairing stops at
    the end of the shorter input.
    """
    kept = []
    for source_line, target_line in zip(src, tgt):
        if not (source_line and target_line):
            continue
        kept.append((source_line, target_line))
    return kept

train_pairs = aligned_pairs(train_en, train_ja)

# Dev and test are optional: if either side is missing the pair list is empty.
dev_pairs = []
if dev_en and dev_ja:
    dev_pairs = aligned_pairs(dev_en, dev_ja)

test_pairs = []
if test_en and test_ja:
    test_pairs = aligned_pairs(test_en, test_ja)

print(f"Train pairs: {len(train_pairs)}")
print(f"Dev pairs:   {len(dev_pairs)}")
print(f"Test pairs:  {len(test_pairs)}")

def train_sp(sentences, prefix, vocab_size):
    """Return a SentencePiece processor for ``sentences``, training one if needed.

    A cached ``{prefix}.model`` is reused only when it matches the requested
    configuration (vocabulary size and PAD/UNK/BOS/EOS ids). Previously any
    existing model file was reused blindly, so changing ``vocab_size`` had no
    effect until the file was deleted by hand.

    Args:
        sentences: iterable of training sentences (one string each).
        prefix: file prefix; the trainer writes ``{prefix}.model`` / ``.vocab``
            and the corpus dump goes to ``{prefix}_spm_input.txt``.
        vocab_size: requested number of pieces.

    Returns:
        A ``spm.SentencePieceProcessor`` loaded from ``{prefix}.model``.
    """
    model_path = Path(f"{prefix}.model")
    wanted = (int(vocab_size), PAD_ID, UNK_ID, BOS_ID, EOS_ID)

    if model_path.exists():
        cached = spm.SentencePieceProcessor(model_file=str(model_path))
        found = (
            cached.get_piece_size(),
            cached.pad_id(),
            cached.unk_id(),
            cached.bos_id(),
            cached.eos_id(),
        )
        if found == wanted:
            return cached
        print(f"Cached {model_path} does not match the requested settings; retraining.")

    # Only dump the corpus when training is actually going to run.
    txt = Path(f"{prefix}_spm_input.txt")
    txt.write_text("\n".join(sentences), encoding="utf-8")
    spm.SentencePieceTrainer.train(
        (
            f"--input={txt} --model_prefix={prefix} --vocab_size={vocab_size} "
            f"--pad_id={PAD_ID} --unk_id={UNK_ID} --bos_id={BOS_ID} --eos_id={EOS_ID} "
            "--character_coverage=1.0 --input_sentence_size=2000000 "
            "--shuffle_input_sentence=true --minloglevel=2"
        )
    )
    return spm.SentencePieceProcessor(model_file=str(model_path))


# One tokenizer per language, each fitted on its own side of the training pairs.
en_sp = train_sp([en_text for en_text, _ in train_pairs], "en", VOCAB_EN)
ja_sp = train_sp([ja_text for _, ja_text in train_pairs], "ja", VOCAB_JA)

print("EN vocab:", en_sp.get_piece_size(), "| JA vocab:", ja_sp.get_piece_size())


Train pairs: 50000
Dev pairs:   500
Test pairs:  500
EN vocab: 4000 | JA vocab: 4000


In [None]:
def add_bos_eos(ids):
    """Return a new list: ``ids`` with BOS_ID prepended and EOS_ID appended."""
    head = [BOS_ID]
    tail = [EOS_ID]
    return head + ids + tail

class EnJaDataset(Dataset):
    """Map-style dataset yielding (source ids, target ids) for EN->JA pairs.

    Each item is a pair of 1-D LongTensors: the tokenized sentence wrapped in
    BOS ... EOS. Sentences longer than the limit are truncated *before* the
    markers are added, so every sequence still ends with EOS (the previous
    post-hoc slice cut the EOS off long sentences).

    Args:
        pairs: sequence of ``(english, japanese)`` strings.
        en_sp / ja_sp: tokenizers exposing ``encode(text, out_type=int)``.
        max_src_len / max_tgt_len: maximum sequence length including BOS/EOS
            (limits below 2 still yield the two markers).
    """

    def __init__(self, pairs, en_sp, ja_sp, max_src_len=128, max_tgt_len=128):
        self.pairs = pairs
        self.en_sp = en_sp
        self.ja_sp = ja_sp
        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len

    def __len__(self):
        return len(self.pairs)

    @staticmethod
    def _encode(text, tokenizer, max_len):
        """Tokenize ``text``, truncate to leave room for BOS/EOS, then wrap it."""
        piece_ids = tokenizer.encode(text, out_type=int)
        piece_ids = piece_ids[: max(max_len - 2, 0)]
        return torch.tensor(add_bos_eos(piece_ids), dtype=torch.long)

    def __getitem__(self, idx):
        en, ja = self.pairs[idx]
        return (
            self._encode(en, self.en_sp, self.max_src_len),
            self._encode(ja, self.ja_sp, self.max_tgt_len),
        )

def pad_sequences(seqs, pad=PAD_ID):
    """Stack 1-D tensors into one ``(len(seqs), longest)`` LongTensor.

    Each row holds one sequence left-aligned; the remaining cells are ``pad``.
    """
    lengths = [seq.size(0) for seq in seqs]
    padded = torch.full((len(seqs), max(lengths)), pad, dtype=torch.long)
    for row, (seq, length) in enumerate(zip(seqs, lengths)):
        padded[row, :length] = seq
    return padded

def collate_fn(batch):
    """Collate ``(src, tgt)`` tensor pairs into padded batch tensors plus masks.

    Returns, in order:
        src:  (B, S) padded source ids
        tgt_inp: (B, T) decoder input, the padded target without its last column
        tgt_out: (B, T) decoder labels, the padded target without its first column
        src_key_padding_mask: (B, S) bool, True where ``src`` is PAD
        tgt_key_padding_mask: (B, T) bool, True where ``tgt_inp`` is PAD
        attn_mask: (1, T, T) bool, True strictly above the diagonal (future positions)
    """
    sources, targets = zip(*batch)
    src = pad_sequences(sources, PAD_ID)
    tgt = pad_sequences(targets, PAD_ID)

    # Teacher forcing: the labels are the decoder input shifted left by one.
    tgt_inp, tgt_out = tgt[:, :-1], tgt[:, 1:]

    steps = tgt_inp.size(1)
    causal = torch.ones((steps, steps), dtype=torch.bool).triu(diagonal=1)
    return (
        src,
        tgt_inp,
        tgt_out,
        src.eq(PAD_ID),
        tgt_inp.eq(PAD_ID),
        causal.unsqueeze(0),  # leading dim broadcasts over the batch
    )


MAX_TRAIN = None  # a truthy int limits training to the first MAX_TRAIN pairs
BATCH_SIZE = 64

train_subset = train_pairs[:MAX_TRAIN] if MAX_TRAIN else train_pairs
train_data = EnJaDataset(train_subset, en_sp, ja_sp)
# Without a dev/test split, fall back to slices of the training pairs.
val_data = EnJaDataset(dev_pairs or train_pairs[:2000], en_sp, ja_sp)
test_data = EnJaDataset(test_pairs or train_pairs[-2000:], en_sp, ja_sp)


def _make_loader(dataset, shuffle):
    """Build a DataLoader with the notebook-wide batch size and collate function."""
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        collate_fn=collate_fn,
        drop_last=False,
    )


train_loader = _make_loader(train_data, True)
val_loader = _make_loader(val_data, False)
test_loader = _make_loader(test_data, False)

len(train_loader), len(val_loader), len(test_loader)


(782, 8, 8)

In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    The table is precomputed for ``max_len`` positions and stored as the buffer
    ``pe`` of shape (1, max_len, d_model): even feature columns hold sines, odd
    columns hold cosines. Odd ``d_model`` is supported (the last sine column
    simply has no cosine partner); the original assignment raised a shape
    error in that case.
    """

    def __init__(self, d_model: int, max_len: int = 10000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = pos * div  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # Only floor(d_model / 2) cosine columns exist when d_model is odd.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, L, D)

    def forward(self, x):
        """Return ``x + pe[:, :L]`` for ``x`` of shape (B, L, D), with L <= max_len."""
        L = x.size(1)
        return x + self.pe[:, :L, :]

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention over batch-first inputs.

    Args (forward):
        q: (B, Lq, D) queries; k, v: (B, Lk, D) keys and values.
        attn_mask: optional bool mask, True = "do not attend". Accepts either
            (Lq, Lk) or (B or 1, Lq, Lk); the 2-D form is now promoted to 3-D
            instead of being mis-broadcast.
        key_padding_mask: optional (B, Lk) bool mask, True = padded key.

    Returns:
        (B, Lq, D) tensor after the output projection.

    Note: a query row whose keys are all masked produces NaN (softmax over
    all ``-inf``); callers must keep at least one visible key per row.
    """

    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        assert d_model % n_heads == 0
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads  # per-head feature size

        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.o_proj = nn.Linear(d_model, d_model)
        self.drop = nn.Dropout(dropout)

    def forward(self, q, k, v, attn_mask=None, key_padding_mask=None):
        B, Lq, _ = q.size()
        Lk = k.size(1)

        def split_heads(x, L):
            # (B, L, D) -> (B, H, L, d_k)
            return x.view(B, L, self.n_heads, self.d_k).transpose(1, 2)

        Q = split_heads(self.q_proj(q), Lq)
        K = split_heads(self.k_proj(k), Lk)
        V = split_heads(self.v_proj(v), Lk)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)  # (B, H, Lq, Lk)

        if attn_mask is not None:
            if attn_mask.dim() == 2:
                attn_mask = attn_mask.unsqueeze(0)  # (Lq, Lk) -> (1, Lq, Lk)
            # Insert the head axis so the mask broadcasts over all heads.
            scores = scores.masked_fill(attn_mask.unsqueeze(1), float("-inf"))

        if key_padding_mask is not None:
            # (B, Lk) -> (B, 1, 1, Lk): hide padded keys from every head and query.
            scores = scores.masked_fill(key_padding_mask.unsqueeze(1).unsqueeze(2), float("-inf"))

        attn = torch.softmax(scores, dim=-1)
        attn = self.drop(attn)
        context = torch.matmul(attn, V)  # (B, H, Lq, d_k)
        context = context.transpose(1, 2).contiguous().view(B, Lq, self.d_model)  # (B, Lq, D)
        return self.o_proj(context)

class FeedForward(nn.Module):
    """Position-wise MLP: Linear(d_model -> d_ff), GELU, dropout, Linear(d_ff -> d_model)."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.lin1 = nn.Linear(d_model, d_ff)
        self.lin2 = nn.Linear(d_ff, d_model)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``; the shape is preserved."""
        hidden = F.gelu(self.lin1(x))
        hidden = self.drop(hidden)
        return self.lin2(hidden)

class EncoderLayer(nn.Module):
    """Pre-norm Transformer encoder layer: self-attention then feed-forward.

    Each sub-layer is applied to a LayerNorm'd copy of its input and added back
    to the un-normalised residual stream (with dropout on the sub-layer output).
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.drop = nn.Dropout(dropout)

    def forward(self, src, src_key_padding_mask=None):
        """Encode ``src`` (B, S, D); ``src_key_padding_mask`` is (B, S), True = PAD."""
        x = src
        # Normalise once and reuse for q, k and v (previously computed three times).
        normed = self.norm1(x)
        attn_out = self.self_attn(normed, normed, normed,
                                  attn_mask=None, key_padding_mask=src_key_padding_mask)
        x = x + self.drop(attn_out)
        ff_out = self.ff(self.norm2(x))
        x = x + self.drop(ff_out)
        return x

class DecoderLayer(nn.Module):
    """Pre-norm Transformer decoder layer.

    Order of sub-layers: masked self-attention, cross-attention over the
    encoder ``memory``, feed-forward. Each sub-layer reads a LayerNorm'd copy
    of the residual stream and its (dropped-out) output is added back.
    ``memory`` is used as keys/values exactly as given; it is not normalised
    inside this layer.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.cross_attn = MultiHeadAttention(d_model, n_heads, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.drop = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_attn_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Decode ``tgt`` (B, T, D) against ``memory`` (B, S, D).

        ``tgt_attn_mask`` hides future target positions, ``tgt_key_padding_mask``
        hides padded target keys and ``memory_key_padding_mask`` hides padded
        source keys (True = hidden in all three).
        """
        x = tgt
        # Normalise once and reuse for q, k and v (previously computed three times).
        normed = self.norm1(x)
        self_out = self.self_attn(normed, normed, normed,
                                  attn_mask=tgt_attn_mask, key_padding_mask=tgt_key_padding_mask)
        x = x + self.drop(self_out)
        cross_out = self.cross_attn(self.norm2(x), memory, memory,
                                    attn_mask=None, key_padding_mask=memory_key_padding_mask)
        x = x + self.drop(cross_out)
        ff_out = self.ff(self.norm3(x))
        x = x + self.drop(ff_out)
        return x

class TransformerFromScratch(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    Token embeddings plus sinusoidal positions feed ``num_enc`` encoder layers
    and ``num_dec`` decoder layers; a linear layer maps decoder states to
    target-vocabulary logits. All tensors are batch-first.

    NOTE(review): the layers are pre-norm but no final LayerNorm is applied
    after either stack, and embeddings are not scaled by sqrt(d_model). Both
    are left as-is because changing them would invalidate saved checkpoints.
    """

    def __init__(self, src_vocab, tgt_vocab, d_model=512, n_heads=8, num_enc=6, num_dec=6, d_ff=2048, dropout=0.1, max_len=4096):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab, d_model, padding_idx=PAD_ID)
        self.tgt_emb = nn.Embedding(tgt_vocab, d_model, padding_idx=PAD_ID)
        self.pos = PositionalEncoding(d_model, max_len)
        self.enc_layers = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(num_enc)])
        self.dec_layers = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(num_dec)])
        self.out = nn.Linear(d_model, tgt_vocab)
        self.drop = nn.Dropout(dropout)

        # Xavier-initialise every weight matrix (biases and LayerNorm vectors are skipped).
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

        # The loop above also overwrote the all-zero padding rows that
        # nn.Embedding(padding_idx=...) sets up; restore them.
        with torch.no_grad():
            self.src_emb.weight[PAD_ID].zero_()
            self.tgt_emb.weight[PAD_ID].zero_()

    def encode(self, src, src_key_padding_mask):
        """Return encoder states (B, S, D) for source ids ``src`` (B, S)."""
        x = self.drop(self.pos(self.src_emb(src)))
        for layer in self.enc_layers:
            x = layer(x, src_key_padding_mask=src_key_padding_mask)
        return x

    def decode(self, tgt, memory, tgt_attn_mask, tgt_key_padding_mask, memory_key_padding_mask):
        """Return decoder states (B, T, D); logits are obtained via ``self.out``."""
        x = self.drop(self.pos(self.tgt_emb(tgt)))
        for layer in self.dec_layers:
            x = layer(x, memory, tgt_attn_mask=tgt_attn_mask,
                      tgt_key_padding_mask=tgt_key_padding_mask,
                      memory_key_padding_mask=memory_key_padding_mask)
        return x

    def forward(self, src, tgt_inp, src_key_padding_mask, tgt_key_padding_mask, tgt_sub_mask):
        """Run encoder and decoder and return target logits of shape (B, T, Vtgt)."""
        memory = self.encode(src, src_key_padding_mask)  # (B, S, D)
        dec = self.decode(tgt_inp, memory, tgt_sub_mask, tgt_key_padding_mask, src_key_padding_mask)  # (B, T, D)
        logits = self.out(dec)  # (B, T, Vtgt)
        return logits


In [None]:
def noam_schedule(step, d_model, warmup):
    """Learning rate of the "Noam" schedule at optimisation step ``step``.

    Linear warm-up for ``warmup`` steps followed by inverse-square-root decay,
    all scaled by ``d_model ** -0.5``. Steps below 1 are treated as step 1.
    ``warmup <= 0`` disables the warm-up phase (pure inverse-sqrt decay)
    instead of failing on ``warmup ** -1.5``.
    """
    step = max(step, 1)
    scale = d_model ** -0.5
    if warmup <= 0:
        return scale * step ** -0.5
    return scale * min(step ** -0.5, step * (warmup ** -1.5))

class NoamOpt:
    """Thin optimizer wrapper that applies the Noam learning-rate schedule.

    Every ``step()`` advances an internal counter, writes the scheduled rate
    into all parameter groups of the wrapped optimizer and then steps it.
    """

    def __init__(self, d_model, warmup, optimizer):
        self.optimizer, self.d_model, self.warmup = optimizer, d_model, warmup
        self._step = 0  # number of updates performed so far

    def step(self):
        """Set the learning rate for the next update, then run that update."""
        self._step = self._step + 1
        rate = noam_schedule(self._step, self.d_model, self.warmup)
        for group in self.optimizer.param_groups:
            group["lr"] = rate
        self.optimizer.step()

    def zero_grad(self):
        """Clear the gradients held by the wrapped optimizer."""
        self.optimizer.zero_grad()

def compute_loss(logits, targets):
    """Mean token-level cross-entropy, ignoring PAD targets.

    Args:
        logits: (B, T, V) unnormalised scores.
        targets: (B, T) gold token ids.

    ``reshape`` is used instead of ``view`` because the labels are a column
    slice of the padded batch (non-contiguous). ``view`` fails on such a
    tensor whenever it has not been copied to another device, e.g. when
    running on CPU.
    """
    vocab = logits.size(-1)
    return F.cross_entropy(logits.reshape(-1, vocab), targets.reshape(-1), ignore_index=PAD_ID)

@torch.no_grad()
def greedy_decode(model, src, src_key_padding_mask, max_len=100):
    model.eval()
    B = src.size(0)
    memory = model.encode(src, src_key_padding_mask)  # (B,S,D)
    ys = torch.full((B, 1), BOS_ID, dtype=torch.long, device=src.device)
    for _ in range(max_len - 1):
        T = ys.size(1)
        subsequent = torch.triu(torch.ones((T, T), dtype=torch.bool, device=src.device), diagonal=1)
        logits = model.decode(ys, memory, subsequent.unsqueeze(0), ys.eq(PAD_ID), src_key_padding_mask)
        next_token = model.out(logits[:, -1, :]).argmax(-1, keepdim=True)  # greedy
        ys = torch.cat([ys, next_token], dim=1)
        if (next_token == EOS_ID).all():  # all sequences ended
            break
    return ys

def ids_to_text(ids_batch, sp):
    """Decode a batch of id sequences to strings with tokenizer ``sp``.

    For each row: a leading BOS is dropped, everything from the first EOS
    onward is discarded and any PAD ids are removed before ``sp.decode`` is
    called. Rows may be tensors or plain sequences of ints.

    The previous version only removed an EOS in the very last position, so
    padded rows (EOS followed by PAD) and rows that continued past EOS were
    decoded together with the trailing ids.
    """
    outs = []
    for ids in ids_batch:
        ids = ids.tolist() if hasattr(ids, "tolist") else list(ids)
        if ids and ids[0] == BOS_ID:
            ids = ids[1:]
        if EOS_ID in ids:
            ids = ids[: ids.index(EOS_ID)]
        ids = [tok for tok in ids if tok != PAD_ID]
        outs.append(sp.decode(ids))
    return outs

@torch.no_grad()
def evaluate_bleu(model, loader, en_sp, ja_sp, max_len=100, limit_batches=None):
    """Corpus-level BLEU of greedy translations over ``loader``.

    Args:
        model: translation model passed to ``greedy_decode``.
        loader: yields the 6-tuples produced by ``collate_fn``.
        en_sp: unused; kept for interface compatibility.
        ja_sp: target-side tokenizer used to detokenize hypotheses and references.
        max_len: maximum decoded length per sentence.
        limit_batches: if truthy, stop after this many batches.

    Returns:
        The sacreBLEU corpus score as a float (0.0 when the loader is empty).

    sacreBLEU expects ``references`` as a list of reference *streams*, each as
    long as the hypothesis list. The earlier ``[[r] for r in ...]`` layout made
    every sentence its own stream, so only the first hypothesis was scored,
    against all references at once.
    """
    hyps = []
    refs = []
    for b, (src, _tgt_inp, tgt_out, src_kpm, _tgt_kpm, _tgt_sub) in enumerate(loader):
        src = src.to(device)
        src_kpm = src_kpm.to(device)
        pred_ids = greedy_decode(model, src, src_kpm, max_len=max_len)
        hyps.extend(ids_to_text(pred_ids, ja_sp))
        # References are only converted to Python lists, so they can stay on CPU.
        refs.extend(ids_to_text(tgt_out, ja_sp))
        if limit_batches and (b + 1) >= limit_batches:
            break
    if not hyps:
        return 0.0
    # Single reference stream, aligned one-to-one with the hypotheses.
    return sacrebleu.corpus_bleu(hyps, [refs]).score


In [None]:
SRC_VOCAB = en_sp.get_piece_size()
TGT_VOCAB = ja_sp.get_piece_size()

# Model and optimisation hyperparameters.
d_model, n_heads = 256, 4
num_enc, num_dec = 4, 4
d_ff = 1024
dropout = 0.1
warmup_steps = 2000
MAX_EPOCHS = 10

model = TransformerFromScratch(
    SRC_VOCAB,
    TGT_VOCAB,
    d_model=d_model,
    n_heads=n_heads,
    num_enc=num_enc,
    num_dec=num_dec,
    d_ff=d_ff,
    dropout=dropout,
    max_len=1024,
).to(device)

# The optimizer's own learning rate is overwritten by the Noam wrapper on every step.
opt = torch.optim.AdamW(model.parameters(), betas=(0.9, 0.98), eps=1e-9, weight_decay=0.0)
sched = NoamOpt(d_model, warmup_steps, opt)

best_bleu = 0.0
for epoch in range(1, MAX_EPOCHS + 1):
    model.train()
    total_loss = 0.0

    for i, batch in enumerate(train_loader, 1):
        # Move the whole collated batch (ids and masks) to the training device.
        src, tgt_inp, tgt_out, src_kpm, tgt_kpm, tgt_sub = (t.to(device) for t in batch)

        logits = model(src, tgt_inp, src_kpm, tgt_kpm, tgt_sub)
        loss = compute_loss(logits, tgt_out)

        sched.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        sched.step()

        total_loss += loss.item()

        if i % 200 == 0:
            print(f"Epoch {epoch} | Step {i}/{len(train_loader)} | "
                  f"Loss {total_loss / i:.4f}")

    # evaluate_bleu leaves the model in eval mode; model.train() above resets it.
    bleu = evaluate_bleu(model, val_loader, en_sp, ja_sp, max_len=100, limit_batches=50)
    print(f"Epoch {epoch} done. TrainLoss {total_loss/len(train_loader):.4f} "
          f"| Val BLEU {bleu:.2f}")

    # Keep only the checkpoint with the best validation BLEU so far.
    if not bleu > best_bleu:
        continue
    best_bleu = bleu
    checkpoint = {
        "model": model.state_dict(),
        "cfg": {
            "d_model": d_model,
            "n_heads": n_heads,
            "num_enc": num_enc,
            "num_dec": num_dec,
            "d_ff": d_ff,
            "dropout": dropout,
        },
    }
    torch.save(checkpoint, "enja_transformer.pt")
    print("✓ Saved checkpoint with BLEU =", best_bleu)


Epoch 1 | Step 200/782 | Loss 5.9563
Epoch 1 | Step 400/782 | Loss 5.3883
Epoch 1 | Step 600/782 | Loss 5.0256
Epoch 1 done. TrainLoss 4.7296 | Val BLEU 47.90
✓ Saved checkpoint with BLEU = 47.89736254435747
Epoch 2 | Step 200/782 | Loss 3.3514
Epoch 2 | Step 400/782 | Loss 3.2190
Epoch 2 | Step 600/782 | Loss 3.0986
Epoch 2 done. TrainLoss 2.9953 | Val BLEU 9.31
Epoch 3 | Step 200/782 | Loss 2.4745
Epoch 3 | Step 400/782 | Loss 2.4222
Epoch 3 | Step 600/782 | Loss 2.3705
Epoch 3 done. TrainLoss 2.3211 | Val BLEU 10.50
Epoch 4 | Step 200/782 | Loss 1.9854
Epoch 4 | Step 400/782 | Loss 1.9584
Epoch 4 | Step 600/782 | Loss 1.9274
Epoch 4 done. TrainLoss 1.9033 | Val BLEU 16.28
Epoch 5 | Step 200/782 | Loss 1.6587
Epoch 5 | Step 400/782 | Loss 1.6595
Epoch 5 | Step 600/782 | Loss 1.6484
Epoch 5 done. TrainLoss 1.6393 | Val BLEU 8.62
Epoch 6 | Step 200/782 | Loss 1.4548
Epoch 6 | Step 400/782 | Loss 1.4626
Epoch 6 | Step 600/782 | Loss 1.4626
Epoch 6 done. TrainLoss 1.4608 | Val BLEU 8.19


In [None]:
def load_model(ckpt_path, SRC_VOCAB, TGT_VOCAB, device):
    """Rebuild a ``TransformerFromScratch`` from a checkpoint and return it in eval mode.

    Args:
        ckpt_path: path to a file saved as ``{"model": state_dict, "cfg": {...}}``.
        SRC_VOCAB / TGT_VOCAB: vocabulary sizes used to size the embeddings.
        device: device to load the tensors onto and to place the model on.

    Raises:
        FileNotFoundError: if ``ckpt_path`` does not exist.
    """
    if not Path(ckpt_path).exists():
        raise FileNotFoundError(f"{ckpt_path} not found")

    # The checkpoint contains only tensors and plain Python values, so the
    # restricted (non-arbitrary-pickle) loader is sufficient.
    ckpt = torch.load(ckpt_path, map_location=device, weights_only=True)
    cfg = ckpt["cfg"]

    model = TransformerFromScratch(
        SRC_VOCAB, TGT_VOCAB,
        d_model=cfg["d_model"], n_heads=cfg["n_heads"],
        num_enc=cfg["num_enc"], num_dec=cfg["num_dec"],
        d_ff=cfg["d_ff"], dropout=cfg["dropout"], max_len=2048
    ).to(device)

    state_dict = dict(ckpt["model"])
    saved_pe = state_dict.get("pos.pe")
    if saved_pe is None or saved_pe.shape != model.pos.pe.shape:
        # The positional table is a fixed sinusoid, not a learned weight. When
        # the saved table has another length, keep the model's freshly computed
        # one rather than zero-filling the positions beyond the saved length.
        state_dict["pos.pe"] = model.pos.pe.detach().clone()

    model.load_state_dict(state_dict)
    model.eval()
    return model


In [None]:
# Checkpoint path
ckpt_path = "enja_transformer.pt"

# Reuse load_model instead of repeating its body here; if no checkpoint was
# written, say so explicitly and evaluate whatever model is in memory.
if Path(ckpt_path).exists():
    model = load_model(ckpt_path, SRC_VOCAB, TGT_VOCAB, device)
else:
    print(f"Checkpoint {ckpt_path} not found; evaluating the model currently in memory.")

# Evaluate BLEU
test_bleu = evaluate_bleu(model, test_loader, en_sp, ja_sp, max_len=100, limit_batches=50)
print(f"Test BLEU: {test_bleu:.2f}")

# Translation function
def translate_sentences(model, sentences, src_tokenizer, tgt_tokenizer, max_len=100):
    """Translate raw source sentences with greedy decoding.

    Args:
        model: translation model; it is switched to eval mode.
        sentences: iterable of source strings.
        src_tokenizer / tgt_tokenizer: tokenizers for encoding the input and
            decoding the output.
        max_len: maximum decoded length per sentence (including BOS).

    Returns:
        A list with one translated string per input sentence; ``[]`` for empty
        input (which previously crashed while padding an empty batch).
    """
    model.eval()
    sentences = list(sentences)
    if not sentences:
        return []
    with torch.no_grad():
        src_ids = [
            torch.tensor(add_bos_eos(src_tokenizer.encode(s, out_type=int)), dtype=torch.long)
            for s in sentences
        ]
        src = pad_sequences(src_ids, PAD_ID).to(device)
        src_kpm = src.eq(PAD_ID)  # already on the same device as src
        ys = greedy_decode(model, src, src_kpm, max_len=max_len)
        return ids_to_text(ys, tgt_tokenizer)

# Sample sentences
samples = [
    "Hello, hey",
    "welcome",
    "We are doing our best",
    "this is research assignment",
]

print("\nSample translations:")
sample_translations = translate_sentences(model, samples, en_sp, ja_sp, max_len=60)
for en, ja in zip(samples, sample_translations):
    print(f"EN: {en}\nJA: {ja}\n")



Sample translations:
EN: Hello, hey
JA: 彼 は 私 の を 見 て い る 。 が 。 が い 。 い 。 う 。 。 。

EN: welcome
JA: 私 は 部屋 に 行 い 。 で す 。 す 。 す 。 。 。 。 。 。

EN: We are doing our best
JA: この 本 は 何 の を 読 る 。 が な い 。 す 。 。 。 。 。 。

EN: this is research assignment
JA: この この 本 は この この この 本き で す 。 で す 。 す 。 。 。 。 。

