In [8]:
!pip install kaggle

import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import re
from collections import Counter
from tqdm import tqdm
import math



In [9]:
from google.colab import files

uploaded = files.upload()

Saving small_vocab_en.csv to small_vocab_en (1).csv


In [10]:
from google.colab import files

uploaded = files.upload()  # Pilih kedua file: small_vocab_en.csv & small_vocab_fr.csv

Saving small_vocab_fr.csv to small_vocab_fr (2).csv


In [11]:
with open('small_vocab_en.csv', 'r', encoding='utf-8') as f:
    en_lines = [line.strip() for line in f if line.strip()]

with open('small_vocab_fr.csv', 'r', encoding='utf-8') as f:
    fr_lines = [line.strip() for line in f if line.strip()]

if len(en_lines) != len(fr_lines):
    min_len = min(len(en_lines), len(fr_lines))
    en_lines = en_lines[:min_len]
    fr_lines = fr_lines[:min_len]
    print(f"⚠️ Jumlah baris tidak sama! Dipotong ke {min_len} baris.")

import pandas as pd
df = pd.DataFrame({
    'src': en_lines,
    'tgt': fr_lines
})

print("✅ Dataset berhasil dibuat!")
print(df.head())

✅ Dataset berhasil dibuat!
                                                 src  \
0  new jersey is sometimes quiet during autumn , ...   
1  the united states is usually chilly during jul...   
2  california is usually quiet during march , and...   
3  the united states is sometimes mild during jun...   
4  your least liked fruit is the grape , but my l...   

                                                 tgt  
0  new jersey est parfois calme pendant l' automn...  
1  les états-unis est généralement froid en juill...  
2  california est généralement calme en mars , et...  
3  les états-unis est parfois légère en juin , et...  
4  votre moins aimé fruit est le raisin , mais mo...  


In [12]:
import re

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\sàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['src'] = df['src'].apply(preprocess)
df['tgt'] = df['tgt'].apply(preprocess)

In [13]:
class Vocab:
    def __init__(self, sentences, max_vocab=10000):
        word_freq = Counter()
        for sent in sentences:
            word_freq.update(sent.split())

        top_words = word_freq.most_common(max_vocab - 2)
        self.word2idx = {'<pad>': 0, '<unk>': 1}
        self.idx2word = {0: '<pad>', 1: '<unk>'}
        for word, _ in top_words:
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word

    def encode(self, sentence, max_len=50):
        tokens = sentence.split()[:max_len]
        ids = [self.word2idx.get(token, self.word2idx['<unk>']) for token in tokens]
        ids += [self.word2idx['<pad>']] * (max_len - len(ids))
        return ids

src_vocab = Vocab(df['src'], max_vocab=5000)
tgt_vocab = Vocab(df['tgt'], max_vocab=5000)

print(f"Vocab size - English: {len(src_vocab.word2idx)}, French: {len(tgt_vocab.word2idx)}")

Vocab size - English: 202, French: 332


In [14]:
MAX_LEN = 20

src_encoded = [src_vocab.encode(sent, MAX_LEN) for sent in df['src']]
tgt_encoded = [tgt_vocab.encode(sent, MAX_LEN) for sent in df['tgt']]


src_tensor = torch.tensor(src_encoded, dtype=torch.long)
tgt_tensor = torch.tensor(tgt_encoded, dtype=torch.long)

In [15]:
class TranslationDataset(Dataset):
    def __init__(self, src, tgt):
        self.src = src
        self.tgt = tgt

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        return self.src[idx], self.tgt[idx]

dataset = TranslationDataset(src_tensor, tgt_tensor)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

BATCH_SIZE = 100
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [16]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return x

class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=128, nhead=4, num_layers=2, dim_feedforward=256, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def create_pad_mask(self, seq, pad_idx=0):
        return (seq == pad_idx)

    def create_tgt_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz), diagonal=1).bool()
        return mask

    def forward(self, src, tgt):
        src_pad_mask = self.create_pad_mask(src)
        tgt_pad_mask = self.create_pad_mask(tgt)
        tgt_mask = self.create_tgt_mask(tgt.size(1)).to(tgt.device)

        src_emb = self.pos_encoder(self.src_embedding(src) * math.sqrt(self.d_model))
        tgt_emb = self.pos_encoder(self.tgt_embedding(tgt) * math.sqrt(self.d_model))

        output = self.transformer(
            src=src_emb,
            tgt=tgt_emb,
            src_key_padding_mask=src_pad_mask,
            tgt_key_padding_mask=tgt_pad_mask,
            tgt_mask=tgt_mask
        )
        return self.fc_out(output)

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

model = TransformerModel(
    src_vocab_size=len(src_vocab.word2idx),
    tgt_vocab_size=len(tgt_vocab.word2idx),
    d_model=128,
    nhead=4,
    num_layers=2
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_losses = []
val_losses = []
val_accuracies = []

model.train()
for batch_idx, (src_batch, tgt_batch) in enumerate(tqdm(train_loader, desc="Training")):
    src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)

    tgt_input = tgt_batch[:, :-1]
    tgt_output = tgt_batch[:, 1:]

    optimizer.zero_grad()
    output = model(src_batch, tgt_input)
    loss = criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
    loss.backward()
    optimizer.step()

    train_losses.append(loss.item())

    model.eval()
    with torch.no_grad():
        val_loss = 0
        correct = 0
        total = 0
        for val_src, val_tgt in val_loader:
            val_src, val_tgt = val_src.to(device), val_tgt.to(device)
            val_input = val_tgt[:, :-1]
            val_label = val_tgt[:, 1:]

            val_out = model(val_src, val_input)
            vloss = criterion(val_out.reshape(-1, val_out.size(-1)), val_label.reshape(-1))
            val_loss += vloss.item()

            pred = val_out.argmax(dim=-1)
            mask = (val_label != 0)
            correct += (pred[mask] == val_label[mask]).sum().item()
            total += mask.sum().item()

        avg_val_loss = val_loss / len(val_loader)
        val_acc = correct / total if total > 0 else 0

        val_losses.append(avg_val_loss)
        val_accuracies.append(val_acc)

        print(f"Batch {batch_idx+1} | Train Loss: {loss.item():.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

    model.train()


    break

Device: cpu


  output = torch._nested_tensor_from_mask(
Training:   0%|          | 0/1103 [00:29<?, ?it/s]

Batch 1 | Train Loss: 6.0399 | Val Loss: 5.4484 | Val Acc: 0.0524





In [18]:
def translate_sentence(model, sentence, src_vocab, tgt_vocab, device, max_len=20):
    model.eval()
    with torch.no_grad():

        src = src_vocab.encode(sentence.lower(), max_len)
        src = torch.tensor(src).unsqueeze(0).to(device)

        tgt = torch.tensor([[1]])
        tgt = tgt.to(device)

        for _ in range(max_len - 1):
            output = model(src, tgt)
            pred_token = output[:, -1, :].argmax(dim=-1).item()
            tgt = torch.cat([tgt, torch.tensor([[pred_token]]).to(device)], dim=1)
            if pred_token == 0:  # <pad> atau akhir
                break

        tokens = tgt.squeeze().cpu().tolist()
        words = [tgt_vocab.idx2word.get(t, '<unk>') for t in tokens if t != 0]
        return ' '.join(words)

test_sentence = "hello how are you"
translated = translate_sentence(model, test_sentence, src_vocab, tgt_vocab, device)
print(f"Input (EN): {test_sentence}")
print(f"Output (FR): {translated}")

Input (EN): hello how are you
Output (FR): <unk> est étaient gelés pense moteur <unk> est novembre trop froid blanche mais il gelés il gelés il gelés préférés
