In [1]:
import re
import torch
import torch.nn as nn
import spacy
import pandas as pd
from torch.utils.data import DataLoader , random_split
from torch.utils.tensorboard import SummaryWriter
from datasets import load_dataset
from collections import Counter 
from torchtext.vocab import build_vocab_from_iterator
from model import buildTransformers


2025-02-17 00:45:58.033493: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "ar-en" (Arabic-English) configuration of the Helsinki-NLP OPUS
# Wikipedia dataset through the `datasets` library.
ds = load_dataset("Helsinki-NLP/opus_wikipedia", "ar-en")


In [3]:
# Take the "translation" column of the train split; each element is a dict
# holding the paired sentences under the "ar" and "en" keys.
dataSet = ds["train"]["translation"]


In [4]:
# Number of sentence pairs available before any filtering.
len(dataSet)


151136

In [5]:
def cleanText(text, lang="ar"):
    """Reduce a sentence to the letters of a single language.

    Args:
        text: Raw sentence.
        lang: "ar" keeps only characters in U+0621..U+064A plus whitespace;
            "en" keeps only ASCII letters plus whitespace. Any other value
            leaves the characters untouched and only strips the ends.

    Returns:
        The cleaned sentence with runs of whitespace collapsed to a single
        space and no leading/trailing whitespace.
    """
    if lang == "ar":
        text = re.sub(r'[^\u0621-\u064A\s]', '', text)
    elif lang == "en":
        text = re.sub(r'[^a-zA-Z\s]', '', text)
    else:
        # Unknown language: nothing to filter, just trim the ends.
        return text.strip()

    # Strip *after* filtering: deleting a leading/trailing symbol (e.g. "1. ")
    # exposes whitespace that an up-front strip() would leave behind.
    return re.sub(r'\s+', ' ', text).strip()

# Drop pairs whose "translation" is just the source copied verbatim
# (identical once spaces are ignored).
dataSet = [data for data in dataSet if data["ar"].replace(" ", "") != data["en"].replace(" ", "")]

# Keep only the first 1/32 of the filtered pairs. Subsampling *before*
# cleaning yields exactly the same rows but avoids cleaning the 31/32 of the
# corpus that is thrown away immediately afterwards.
# NOTE: re-running this cell shrinks `dataSet` again; re-run the loading cell first.
dataSet = dataSet[:len(dataSet) // 32]

# Build fresh cleaned records instead of mutating the loaded dicts in place.
dataSet = [
    {**data, "ar": cleanText(data["ar"], lang="ar"), "en": cleanText(data["en"], lang="en")}
    for data in dataSet
]

df = pd.DataFrame(dataSet)
ar = df["ar"].tolist()
en = df["en"].tolist()
dataSet[:15],len(dataSet)


([{'ar': 'إستونيا بالإستونية رسميا جمهورية إستونيا بالإستونية هي دولة تقع في منطقة بحر البلطيق بشمال أوروبا',
   'en': 'Estonia officially the Republic of Estonia is a country in the Baltic region of Northern Europe'},
  {'ar': 'يحدها من الشمال خليج فنلندا ومن الغرب بحر البلطيق ومن الجنوب لاتفيا كم وإلى الشرق من بحيرة بيبوس والاتحاد الروسي كم',
   'en': 'It is bordered to the north by the Gulf of Finland to the west by the Baltic Sea to the south by Latvia km and to the east by Lake Peipus and Russia km'},
  {'ar': 'وعبر بحر البلطيق تقع السويد في الغرب وفنلندا في الشمال',
   'en': 'Across the Baltic Sea lies Sweden in the west and Finland in the north'},
  {'ar': ' ما قبل التاريخ أصبح استقرار الإنسان في إستونيا ممكنا قبل حوالي إلى سنة عندما ذاب الجليد من أخر عصر جليدي',
   'en': 'HistoryPrehistoryHuman settlement in Estonia became possible to years ago when the ice from the last glacial era melted'},
  {'ar': 'وفقا للالتأريخ الكاربوني تم اللاستعمار حوالي سنة مضت في بداية الألفية التاسع

In [6]:
# Blank (untrained) spaCy pipelines, used here purely to split text into tokens.
spacyAr = spacy.blank("ar")
spacyEn = spacy.blank("en")


def tokenizerEng(text):
    """Return the token strings spaCy's English pipeline produces for `text`."""
    return [token.text for token in spacyEn(text)]


def tokenizerAr(text):
    """Return the token strings spaCy's Arabic pipeline produces for `text`."""
    return [token.text for token in spacyAr(text)]


def yield_tokens(data, tokenizer):
    """Lazily yield `tokenizer(text)` for every text in `data`, in order."""
    yield from map(tokenizer, data)

def buildVocab(data, tokenizer):
    """Build a torchtext vocabulary from the tokens of every text in `data`.

    The four special tokens are registered first, and any token missing from
    the vocabulary resolves to the index of '<unk>'.
    """
    special_tokens = ['<unk>', '<pad>', '<sos>', '<eos>']
    token_stream = yield_tokens(data, tokenizer)
    vocabulary = build_vocab_from_iterator(token_stream, specials=special_tokens)

    # Out-of-vocabulary lookups fall back to <unk> instead of raising.
    unk_index = vocabulary['<unk>']
    vocabulary.set_default_index(unk_index)
    return vocabulary


In [7]:

# One vocabulary per language, each built from the cleaned sentences with the
# matching spaCy tokenizer.
vocabAr = buildVocab(ar, tokenizerAr)  
vocabEn = buildVocab(en, tokenizerEng)  


In [8]:
def dataProcess(ar, en, seq_length=None):
    """Convert parallel raw sentences into pairs of token-id tensors.

    Each sentence is tokenized, mapped through its vocabulary and wrapped in
    <sos> ... <eos>.

    Args:
        ar: Iterable of Arabic sentences.
        en: Iterable of English sentences, aligned with `ar`.
        seq_length: If given, every sequence is truncated or right-padded with
            <pad> to exactly this many ids. Truncated sequences still end with
            <eos>. If None, sequences keep their natural length.

    Returns:
        A list of `(arabic_ids, english_ids)` tuples of `torch.long` tensors.

    Raises:
        ValueError: If `seq_length` is smaller than 2 (no room for <sos>/<eos>).
    """
    if seq_length is not None and seq_length < 2:
        raise ValueError("seq_length must be at least 2 to hold <sos> and <eos>")

    # Special-token ids are constant; look them up once, not once per sentence.
    sosAr, eosAr, padAr = vocabAr['<sos>'], vocabAr['<eos>'], vocabAr['<pad>']
    sosEn, eosEn, padEn = vocabEn['<sos>'], vocabEn['<eos>'], vocabEn['<pad>']

    def _fit(ids, eos_idx, pad_idx):
        """Truncate or right-pad `ids` to `seq_length` (no-op when it is None)."""
        if seq_length is None:
            return ids
        if len(ids) > seq_length:
            # Keep the end-of-sequence marker: a plain slice would cut it off
            # and the model would never see where long sentences stop.
            return ids[:seq_length - 1] + [eos_idx]
        return ids + [pad_idx] * (seq_length - len(ids))

    data = []
    for rawAr, rawEn in zip(ar, en):
        tokensAr = [sosAr] + [vocabAr[token] for token in tokenizerAr(rawAr)] + [eosAr]
        tokensEn = [sosEn] + [vocabEn[token] for token in tokenizerEng(rawEn)] + [eosEn]

        data.append((
            torch.tensor(_fit(tokensAr, eosAr, padAr), dtype=torch.long),
            torch.tensor(_fit(tokensEn, eosEn, padEn), dtype=torch.long),
        ))
    return data


In [9]:
# Encode every pair as fixed-length id tensors; 350 must match the
# srcSeqLen/tgtSeqLen the model is built with.
dataSet = dataProcess(ar,en,350)

# 60 % of the pairs for training, the remaining 40 % held out for evaluation.
trainSize = int(0.6 * len(dataSet))
testSize = len(dataSet) - trainSize

# Seed the split so the same pairs land in each subset on every run;
# otherwise losses are not comparable across kernel restarts.
trainDataset, testDataset = random_split(
    dataSet,
    [trainSize, testSize],
    generator=torch.Generator().manual_seed(42),
)


In [10]:
# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else  'cpu')
batchSize = 32

# Training batches are reshuffled every epoch.
trainLoader = DataLoader(
    trainDataset,
    batch_size=batchSize,
    shuffle=True
    )

# Evaluation needs no shuffling. A fixed order keeps the batch composition,
# and therefore the per-batch-averaged validation loss, reproducible.
testLoader = DataLoader(
    testDataset,
    batch_size=batchSize,
    shuffle=False
    )


In [11]:
# Vocabulary sizes for the source (Arabic) and target (English) sides.
srcVocabSize = len(vocabAr)
tgtVocabSize = len(vocabEn)

# Build the encoder-decoder model from the project-local `model` module.
# The sequence lengths match the 350 used when encoding the dataset.
# n=6 corresponds to the 6 encoder blocks shown in the printed module below;
# h is presumably the number of attention heads (confirm in model.py).
model = buildTransformers(
    srcVocabSize=srcVocabSize,
    tgtVocabSize=tgtVocabSize,
    srcSeqLen=350,
    tgtSeqLen=350,
    dModel=512,
    n=6,
    h=4,
    dropout=0.1,
    dFF=2048
)
# Last expression of the cell, so the notebook also displays the module repr.
model.to(device)


Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderBlock(
        (Attention): MultiHeadAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (Wq): Linear(in_features=512, out_features=512, bias=True)
          (Wk): Linear(in_features=512, out_features=512, bias=True)
          (Wv): Linear(in_features=512, out_features=512, bias=True)
          (Wo): Linear(in_features=512, out_features=512, bias=True)
        )
        (feedForward): FeedForward(
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (residualConnection): ModuleList(
          (0-1): 2 x ResidualConnection(
            (dropout): Dropout(p=0.1, inplace=False)
            (norm): LayerNormalization()
          )
        )
      )
    )
    (norm): LayerNormalization()
  )
  (decoder): Decoder(
    (layers): Modul

In [12]:
# Adam over all model parameters with a fixed learning rate of 1e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# ignore_index: target positions holding the English <pad> id do not
# contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=vocabEn['<pad>'])


In [13]:
def generate_mask(src, tgt, src_pad_idx, tgt_pad_idx, device):
    """Build boolean attention masks for a source/target batch.

    For 2-D `(batch, seq)` id tensors this returns:
      * a source mask of shape `(batch, 1, 1, src_len)` that is True wherever
        the source id differs from `src_pad_idx`;
      * a target mask of shape `(batch, 1, tgt_len, tgt_len)` that is True
        where the attended position is not `tgt_pad_idx` AND is not in the
        future (lower-triangular constraint).

    `device` is where the lower-triangular matrix is placed before it is
    combined with the target padding mask.
    """
    tgt_len = tgt.size(1)

    # Position i may only attend to positions <= i.
    no_peek = torch.ones(tgt_len, tgt_len).tril().bool().to(device)

    # Insert two singleton axes so the padding masks broadcast over the
    # middle dimensions.
    src_keep = src.ne(src_pad_idx)[:, None, None, :]
    tgt_keep = tgt.ne(tgt_pad_idx)[:, None, None, :]

    return src_keep, tgt_keep & no_peek

# Training loop with padding/causal masking and a validation pass per epoch.
num_epochs = 20
best_loss = float('inf')
trainLoss = []
valLoss = []

# The pad ids never change, so resolve them once instead of on every batch.
srcPadIdx = vocabAr['<pad>']
tgtPadIdx = vocabEn['<pad>']


def _batchLoss(src, tgt):
    """Run one teacher-forced forward pass on a batch and return its loss.

    The decoder is fed `tgt` without its last position and is scored against
    `tgt` shifted left by one. Shared by the training and validation loops so
    both compute the loss in exactly the same way.
    """
    src, tgt = src.to(device), tgt.to(device)

    tgt_input = tgt[:, :-1]
    tgt_output = tgt[:, 1:].reshape(-1)

    src_mask, tgt_mask = generate_mask(src, tgt_input, srcPadIdx, tgtPadIdx, device)

    output = model(src, tgt_input, src_mask, tgt_mask)
    # reshape (rather than view) also works if the model output is not contiguous.
    output = output.reshape(-1, tgtVocabSize)

    return criterion(output, tgt_output)


for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_idx, (src, tgt) in enumerate(trainLoader):
        optimizer.zero_grad()
        loss = _batchLoss(src, tgt)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()

        # Print the running average every 50 batches
        if (batch_idx + 1) % 50 == 0:
            avg_batch_loss = total_loss / (batch_idx + 1)
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(trainLoader)}], Loss: {avg_batch_loss:.4f}")

    # Validation: same loss computation, no gradients, dropout disabled via eval().
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for src, tgt in testLoader:
            val_loss += _batchLoss(src, tgt).item()

    avg_train_loss = total_loss / len(trainLoader)
    avg_val_loss = val_loss / len(testLoader)

    # Save the weights whenever the validation loss reaches a new minimum.
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        torch.save(model.state_dict(), "best_transformer.pth")
    trainLoss.append(avg_train_loss)
    valLoss.append(avg_val_loss)
    print(f"Epoch [{epoch+1}/{num_epochs}] | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

print("Training completed.")


Epoch [1/20], Batch [50/69], Loss: 8.0625
Epoch [1/20] | Train Loss: 7.8183 | Val Loss: 7.0433
Epoch [2/20], Batch [50/69], Loss: 6.7695
Epoch [2/20] | Train Loss: 6.7471 | Val Loss: 6.8144
Epoch [3/20], Batch [50/69], Loss: 6.4126
Epoch [3/20] | Train Loss: 6.3822 | Val Loss: 6.6371
Epoch [4/20], Batch [50/69], Loss: 6.0634
Epoch [4/20] | Train Loss: 6.0417 | Val Loss: 6.5431
Epoch [5/20], Batch [50/69], Loss: 5.7578
Epoch [5/20] | Train Loss: 5.7392 | Val Loss: 6.4750
Epoch [6/20], Batch [50/69], Loss: 5.4847
Epoch [6/20] | Train Loss: 5.4637 | Val Loss: 6.4747
Epoch [7/20], Batch [50/69], Loss: 5.2310
Epoch [7/20] | Train Loss: 5.2245 | Val Loss: 6.4869
Epoch [8/20], Batch [50/69], Loss: 5.0014
Epoch [8/20] | Train Loss: 4.9977 | Val Loss: 6.4493
Epoch [9/20], Batch [50/69], Loss: 4.7895
Epoch [9/20] | Train Loss: 4.7776 | Val Loss: 6.4501
Epoch [10/20], Batch [50/69], Loss: 4.5833
Epoch [10/20] | Train Loss: 4.5727 | Val Loss: 6.4323
Epoch [11/20], Batch [50/69], Loss: 4.3753
Epoch

In [31]:
# Load the best checkpoint written during training. map_location moves the
# stored tensors onto the current device, so a checkpoint saved on a GPU
# also loads on a CPU-only machine.
model.load_state_dict(torch.load("best_transformer.pth", map_location=device))

def pad_sequence(seq, max_length, pad_token):
    """Return `seq` right-padded with `pad_token`, or cut, to `max_length` items."""
    missing = max_length - len(seq)
    if missing > 0:
        return seq + [pad_token] * missing
    # Already long enough: keep only the first `max_length` items.
    return seq[:max_length]


# Improved translation function with beam search
def translate_sentence(sentence, beam_size=3, max_length=350):
    """Translate an Arabic sentence into English with beam search.

    Args:
        sentence: Raw Arabic text; tokenized with `tokenizerAr` and wrapped in
            <sos>/<eos> before being padded or cut to `max_length`.
        beam_size: Number of hypotheses kept after every decoding step.
        max_length: Source padding length and maximum number of decoding steps.

    Returns:
        The best hypothesis (highest length-normalised log-probability) as a
        space-joined string of English tokens, without <sos> and without the
        closing <eos> if one was produced.
    """
    model.eval()

    sos_idx, eos_idx = vocabEn['<sos>'], vocabEn['<eos>']
    src_pad_idx = vocabAr['<pad>']

    # Encode and pad the source sentence.
    tokens = [vocabAr['<sos>']] + [vocabAr[token] for token in tokenizerAr(sentence)] + [vocabAr['<eos>']]
    tokens = pad_sequence(tokens, max_length, src_pad_idx)
    src_tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)

    # Hide the padding positions from attention.
    src_mask = (src_tensor != src_pad_idx).unsqueeze(1).unsqueeze(2)

    def _normalised(candidate):
        """Score a (sequence, log-prob) pair by its per-token log-probability."""
        seq, score = candidate
        return score / len(seq)

    # Each hypothesis is (token ids so far, summed log-probability).
    beam = [([sos_idx], 0)]

    for _ in range(max_length):
        # Every hypothesis has emitted <eos>: further steps cannot change anything.
        if all(seq[-1] == eos_idx for seq, _score in beam):
            break

        new_beam = []
        for seq, score in beam:
            if seq[-1] == eos_idx:
                # Finished hypotheses are carried over unchanged.
                new_beam.append((seq, score))
                continue

            tgt_tensor = torch.tensor(seq, dtype=torch.long).unsqueeze(0).to(device)
            tgt_len = len(seq)

            # Causal mask: position i may only attend to positions <= i.
            tgt_mask = torch.tril(torch.ones(tgt_len, tgt_len)).bool().to(device)
            tgt_mask = tgt_mask.unsqueeze(0).unsqueeze(0)

            with torch.no_grad():
                output = model(src_tensor, tgt_tensor, src_mask, tgt_mask)

            # Distribution over the next token, taken from the last position.
            log_probs = torch.log_softmax(output[:, -1, :], dim=-1)
            # Never ask topk for more entries than the vocabulary holds.
            k = min(beam_size, log_probs.size(-1))
            top_probs, top_indices = log_probs[0].topk(k)

            for log_p, idx in zip(top_probs.tolist(), top_indices.tolist()):
                new_beam.append((seq + [idx], score + log_p))

        # Keep the beam_size best candidates by length-normalised score.
        new_beam.sort(key=_normalised, reverse=True)
        beam = new_beam[:beam_size]

    best_seq = max(beam, key=_normalised)[0]

    # Drop <sos>. Drop the final token only if it really is <eos>: a hypothesis
    # that hit max_length without finishing ends in a real word worth keeping.
    body = best_seq[1:]
    if body and body[-1] == eos_idx:
        body = body[:-1]

    return ' '.join(vocabEn.lookup_token(tok) for tok in body)

# Smoke test: translate an empty input string.
print(translate_sentence(""))


 


In [32]:
# The empty string is not in the vocabulary, so the lookup falls back to the
# default index set in buildVocab (the index of '<unk>').
vocabAr[""]


0