In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import spacy
from torch.nn.utils.rnn import pad_sequence
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TranslationDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, src_tokenizer, tgt_tokenizer, src_vocab, tgt_vocab):
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src_text = self.src_texts[idx]
        tgt_text = self.tgt_texts[idx]
        src_tokens = [self.src_vocab.get(token.text, self.src_vocab["<unk>"]) for token in self.src_tokenizer(src_text)]
        tgt_tokens = [self.tgt_vocab.get(token.text, self.tgt_vocab["<unk>"]) for token in self.tgt_tokenizer(tgt_text)]
        return torch.tensor(src_tokens), torch.tensor(tgt_tokens)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0)

    def forward(self, x):
        return x + self.encoding[:, :x.size(1), :].to(x.device)

class AttentionLayer(nn.Module):
    def __init__(self, d_model, n_heads):
        super(AttentionLayer, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=d_model, num_heads=n_heads, batch_first=True)

    def forward(self, query, key, value, mask=None):
        return self.attention(query, key, value, attn_mask=mask)[0]

class TransformerEncoder(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, num_layers):
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads, dim_feedforward=ff_dim, batch_first=True)
            for _ in range(num_layers)
        ])

    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, src_key_padding_mask=mask)
        return x

class TransformerDecoder(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, num_layers):
        super(TransformerDecoder, self).__init__()
        self.layers = nn.ModuleList([
            nn.TransformerDecoderLayer(d_model=d_model, nhead=n_heads, dim_feedforward=ff_dim, batch_first=True)
            for _ in range(num_layers)
        ])

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        for layer in self.layers:
            tgt = layer(tgt, memory, tgt_mask=tgt_mask, memory_key_padding_mask=memory_mask)
        return tgt

class TransformerTranslator(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, n_heads, ff_dim, num_layers):
        super(TransformerTranslator, self).__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.encoder = TransformerEncoder(d_model, n_heads, ff_dim, num_layers)
        self.decoder = TransformerDecoder(d_model, n_heads, ff_dim, num_layers)
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.positional_encoding(self.src_embedding(src))
        tgt = self.positional_encoding(self.tgt_embedding(tgt))
        memory = self.encoder(src, mask=src_mask)
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=src_mask)
        return self.fc_out(output)

# Custom collate function
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_padded = pad_sequence([torch.cat((torch.tensor([1]), seq, torch.tensor([2]))) for seq in src_batch],
                              batch_first=True, padding_value=0)
    tgt_padded = pad_sequence([torch.cat((torch.tensor([1]), seq, torch.tensor([2]))) for seq in tgt_batch],
                              batch_first=True, padding_value=0)
    max_len_src = src_padded.size(1)
    max_len_tgt = tgt_padded.size(1)

    # Adjust masks for padding
    src_padding_mask = (src_padded == 0)
    tgt_padding_mask = (tgt_padded == 0)

    return src_padded, tgt_padded, src_padding_mask, tgt_padding_mask

def generate_square_subsequent_mask(sz):
    mask = torch.triu(torch.ones((sz, sz), device='cpu'), diagonal=1)
    mask = mask.masked_fill(mask == 1, float('-inf'))
    return mask

# Saving model weights
def save_model(model, file_path):
    torch.save(model.state_dict(), file_path)
    print(f"Model weights saved to {file_path}")

# Loading model weights
def load_model(model, file_path):
    model.load_state_dict(torch.load(file_path, map_location=device))
    model.eval()
    print(f"Model weights loaded from {file_path}")




In [58]:
def translate_sentence(model, sentence, src_vocab, tgt_vocab, nlp_src, max_len=50):
    model.eval()
    
    src_tokens = [src_vocab.get(token.text, src_vocab["<unk>"]) for token in nlp_src(sentence)]
    src_tensor = torch.tensor([src_tokens]).to(device)
    
    src_mask = None
    memory = model.encoder(model.positional_encoding(model.src_embedding(src_tensor)), src_mask)
    
    tgt_tokens = [tgt_vocab["<sos>"]]
    for _ in range(max_len):
        tgt_tensor = torch.tensor([tgt_tokens]).to(device)
        tgt_mask = generate_square_subsequent_mask(tgt_tensor.size(1)).to(device)
        tgt_positional = model.positional_encoding(model.tgt_embedding(tgt_tensor))
        output = model.decoder(tgt_positional, memory, tgt_mask=tgt_mask, memory_mask=src_mask)
        output = model.fc_out(output)
        next_token = output.argmax(-1)[:, -1].item()
        tgt_tokens.append(next_token)
        
        if next_token == tgt_vocab["<eos>"]:
            break
    
    inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
    translated_sentence = " ".join([inv_tgt_vocab[token] for token in tgt_tokens if token not in (0, tgt_vocab["<sos>"], tgt_vocab["<eos>"])])
    return translated_sentence


# Data preparation
nlp_src = spacy.load("en_core_web_sm")
nlp_tgt = spacy.load("fr_core_news_sm")

src_vocab = {word: idx for idx, word in enumerate(["<pad>", "<sos>", "<eos>", "<unk>"] + list(nlp_src.vocab.strings))}
tgt_vocab = {word: idx for idx, word in enumerate(["<pad>", "<sos>", "<eos>", "<unk>"] + list(nlp_tgt.vocab.strings))}

# Adjust the dataset to use streaming if disk space is limited
from datasets import load_dataset

data = load_dataset("wmt14", "fr-en", streaming=True)
train_data = data["train"]

src_texts = []
tgt_texts = []
for example in train_data:
    src_sentence = example["translation"]["en"]
    word_count = len(src_sentence.split())
    if word_count<8:
        src_texts.append(src_sentence)
        tgt_texts.append(example["translation"]["fr"])
    if len(src_texts) >= 100000:  # Limit to 100,000 examples
        break
print(len(src_texts))

train_dataset = TranslationDataset(src_texts, tgt_texts, nlp_src, nlp_tgt, src_vocab, tgt_vocab)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)  # Adjust batch size

# Example usage
src_vocab_size = len(src_vocab)
tgt_vocab_size = len(tgt_vocab)
d_model = 512
n_heads = 8
ff_dim = 2048
num_layers = 6

model = TransformerTranslator(src_vocab_size, tgt_vocab_size, d_model, n_heads, ff_dim, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)








100000


In [59]:
file_path="transformer_weights.pth"

load_model(model, file_path)

test_sentence = "Hello world"
translated = translate_sentence(model, test_sentence, src_vocab, tgt_vocab, nlp_src)
print("Translated:", translated)


  model.load_state_dict(torch.load(file_path, map_location=device))


Model weights loaded from transformer_weights.pth
Translated: Les États-Unis évoquées mondial évoquées mondial évoquées mondial évoquées dramatique sensible leur -je Lamy mondial gérer leur -je Lamy mondial -je Lamy mondial évoquées dramatique leur -je Lamy mondial -je Lamy -je Lamy mondial -je Lamy mondial -je Lamy -je Lamy mondial -je Lamy mondial -je Lamy mondial -je Lamy


In [37]:
print("Translated:", translated)

Translated: <unk> monde <unk> <unk> <unk> <unk> les <unk> <unk> à <unk> monde plus <unk> <unk> <unk> les deux actualité actualité actualité actualité <unk> plus <unk> <unk> <unk> <unk> d' actualité monde <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>


In [64]:
# Training loop
import time


for epoch in range(3):
    model.train()
    count=0
    mtime=0
    for src, tgt, src_padding_mask, tgt_padding_mask in train_loader:
        
        count+=1
        start_time = time.time()
        src, tgt = src.to(device), tgt.to(device)
        src_padding_mask, tgt_padding_mask = src_padding_mask.to(device), tgt_padding_mask.to(device)
        optimizer.zero_grad()
        
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:].reshape(-1)
        
        src_mask = None
        tgt_mask = generate_square_subsequent_mask(tgt_input.size(1)).to(device)
        
        
        output = model(src, tgt_input, src_mask=src_mask, tgt_mask=tgt_mask)
        output = output.reshape(-1, tgt_vocab_size)
        
        loss = criterion(output, tgt_output)
        loss.backward()
        optimizer.step()
        #optimizer.zero_grad()

        
        
        # Очистка GPU памяти
        torch.cuda.empty_cache()
        end_time= time.time()
        mtime+=(end_time-start_time)
        if count==1000:
            count=0
            b_time=mtime
            mtime=0
            print(f"batch time {b_time} mSec , Loss: {loss.item()}")
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")
    
    if epoch%3==0:
        # Сохранение весов модели в файл
        model_path = "transformer_weights.pth"
        torch.save(model.state_dict(), model_path)
        print(f"Weights saved to {model_path}")
        # Test translation
        test_sentence = "Hello world"
        translated = translate_sentence(model, test_sentence, src_vocab, tgt_vocab, nlp_src)
        print("Translated:", translated)

batch time 106.11256885528564 mSec , Loss: 0.3247288465499878
batch time 103.96519589424133 mSec , Loss: 0.481305867433548
batch time 103.47373080253601 mSec , Loss: 0.5381783843040466
Epoch 1, Loss: 0.4056829810142517
Weights saved to transformer_weights.pth
Translated: Les États-Unis : Autre programme Autre programme : les réseaux : leur propre leur propre touche à leur propre touche à leur propre touche à la famine en Lamy : Lamy : leur rends par leur propre leur rends Lamy : Lamy à leur rends Lamy : Lamy : Lamy
batch time 106.54590964317322 mSec , Loss: 0.3038889467716217
batch time 102.06413555145264 mSec , Loss: 0.4543510377407074
batch time 103.74764585494995 mSec , Loss: 0.3406851887702942
Epoch 2, Loss: 0.3881976306438446
batch time 103.97550845146179 mSec , Loss: 0.2372269183397293
batch time 104.35857677459717 mSec , Loss: 0.2705625295639038
batch time 103.89511132240295 mSec , Loss: 0.2870049476623535
Epoch 3, Loss: 0.23062372207641602


In [54]:
model_path = "transformer_weights.pth"
torch.save(model.state_dict(), model_path)

In [57]:
test_sentence = "What is your name"
translated = translate_sentence(model, test_sentence, src_vocab, tgt_vocab, nlp_src)
print("Translated:", translated)

Translated: Il n ' y faut que votre n ' était que votre n ' est que votre nom n n ' est que votre n ' appelle que votre nom n n n n n n ' appelle que votre n n n ' appelle que votre n n n


In [63]:
for i in range(25):
    print(f"{src_texts[i]} - {tgt_texts[i]}")

Resumption of the session - Reprise de la session
Madam President, on a point of order. - Madame la Présidente, c'est une motion de procédure.
Madam President, on a point of order. - Madame la Présidente, c'est une motion de procédure.
It is the case of Alexander Nikitin. - Il s'agit du cas d'Alexandre Nikitin.
Why are there no fire instructions? - Comment se fait-il qu'il n'y ait pas de consignes en cas d'incendie ?
Why are no-smoking areas not enforced? - Comment se fait-il que l'on ne respecte pas les zones non fumeurs ?
We do not know what is happening. - Nous ne savons pas ce qui se passe.
Agenda - Ordre des travaux
Relating to Wednesday: - En ce qui concerne le mercredi :
(Applause from the PSE Group) - (Applaudissements du groupe PSE)
We therefore respect whatever Parliament may decide. - Par conséquent, nous respectons les décisions que pourrait prendre le Parlement dans ce sens.
(Parliament rejected the request) President. - (Le Parlement rejette la demande) La Présidente.
Tha

In [9]:
print(src_texts[2])

Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.


In [10]:
print(tgt_texts[2])

Comme vous avez pu le constater, le grand "bogue de l'an 2000" ne s'est pas produit. En revanche, les citoyens d'un certain nombre de nos pays ont été victimes de catastrophes naturelles qui ont vraiment été terribles.


In [62]:
print(len(train_loader))
count=0
for src, tgt,_,o in train_loader:
    count+=1
    print(len(src[0]))
    if count==10:
        break


3125
11
12
12
11
12
11
11
11
11
12
