In [None]:
import torch
import torch.nn as nn
import math

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# `device` is a plain string that later cells read as a module-level global.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [None]:
!nvidia-smi

Tue Jul 29 08:53:14 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8             10W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Transformer Model

In [None]:
from transformers import BertTokenizer, BertModel
from torch.utils.checkpoint import checkpoint
# One tokenizer per language: English (source) and Serbian (target).
en_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
sr_tokenizer = BertTokenizer.from_pretrained("classla/bcms-bertic")
# Guarantee that both tokenizers expose a pad token.
# NOTE(review): adding a token grows the vocabulary, so the model's embedding
# tables would have to cover the new id — confirm before relying on this branch.
if sr_tokenizer.pad_token is None:
  sr_tokenizer.add_special_tokens({'pad_token': '<pad>'})
  print("added sr pad")
if en_tokenizer.pad_token is None:
  print("added en pad")
  en_tokenizer.add_special_tokens({'pad_token': '<pad>'})
# Show the English pad token id.
print(en_tokenizer.pad_token_id)


class TransformerTranslator(nn.Module):
    """Encoder-decoder Transformer that maps source token ids to target-vocabulary logits.

    Token ids are embedded, combined with sinusoidal positional encodings and fed
    through a batch-first ``nn.Transformer``; a final linear layer projects the
    decoder states onto the target vocabulary.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and width
            of the output projection.
        d_model: Embedding / hidden size.
        nhead: Number of attention heads.
        num_encoder_layers: Depth of the encoder stack.
        num_decoder_layers: Depth of the decoder stack.
        dim_feedforward: Hidden size of the position-wise feed-forward blocks.
        norm_first: Forwarded to ``nn.Transformer`` (pre-norm when True).
        activation: Feed-forward activation name, forwarded to ``nn.Transformer``.
        dropout: Dropout probability for the positional encoding and the Transformer.
        use_checkpointing: When True, each encoder/decoder layer is run under
            activation checkpointing to trade compute for memory.
    """

    def __init__(
        self,
        src_vocab_size=32000,  # English vocabulary size
        tgt_vocab_size=32000,  # Serbian vocabulary size
        d_model=1024,           # Embedding dimension
        nhead=16,              # Attention heads
        num_encoder_layers=10,
        num_decoder_layers=8,
        dim_feedforward=3072,  # Hidden layer size
        norm_first=True,
        activation="gelu",
        dropout=0.2,
        use_checkpointing=False  # New parameter to control checkpointing
    ):
        super().__init__()
        self.use_checkpointing = use_checkpointing
        self.d_model = d_model

        # Embedding layers
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Transformer
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
            activation=activation,
            norm_first=norm_first
        )

        # Output layer
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    @staticmethod
    def _mask_on(mask, target_device):
        """Return ``mask`` moved to ``target_device``; ``None`` passes through unchanged."""
        return None if mask is None else mask.to(target_device)

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Run a teacher-forced pass and return logits of shape (batch, tgt_len, tgt_vocab_size).

        Args:
            src: Source token ids, (batch, src_len).
            tgt: Decoder input token ids, (batch, tgt_len).
            src_key_padding_mask: Optional bool mask, True at padded source positions.
            tgt_key_padding_mask: Optional bool mask, True at padded target positions.
            memory_key_padding_mask: Optional mask for cross-attention; defaults to
                ``src_key_padding_mask`` when omitted.
        """
        # The scale is applied AFTER the positional encoding; the decoding
        # helpers in this notebook do the same, so the order is kept on purpose.
        # NOTE(review): the usual recipe scales the embedding before adding the
        # positional signal — changing it here requires retraining.
        src = self.pos_encoder(self.src_embedding(src)) * math.sqrt(self.d_model)
        tgt = self.pos_encoder(self.tgt_embedding(tgt)) * math.sqrt(self.d_model)

        # Follow the activations' device instead of a notebook-level global.
        run_device = tgt.device
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(1)).to(run_device)

        if memory_key_padding_mask is None:
            memory_key_padding_mask = src_key_padding_mask
        # Tensor.to() is not in-place, so the results must be re-bound.
        src_key_padding_mask = self._mask_on(src_key_padding_mask, run_device)
        tgt_key_padding_mask = self._mask_on(tgt_key_padding_mask, run_device)
        memory_key_padding_mask = self._mask_on(memory_key_padding_mask, run_device)

        if self.use_checkpointing:
            output = self._checkpointed_transformer_forward(
                src=src,
                tgt=tgt,
                tgt_mask=tgt_mask,
                src_key_padding_mask=src_key_padding_mask,
                tgt_key_padding_mask=tgt_key_padding_mask,
                memory_key_padding_mask=memory_key_padding_mask
            )
        else:
            output = self.transformer(
                src=src,
                tgt=tgt,
                tgt_mask=tgt_mask,
                src_key_padding_mask=src_key_padding_mask,
                tgt_key_padding_mask=tgt_key_padding_mask,
                memory_key_padding_mask=memory_key_padding_mask
            )

        return self.fc_out(output)

    def _checkpointed_transformer_forward(self, src, tgt, tgt_mask, src_key_padding_mask, tgt_key_padding_mask, memory_key_padding_mask):
        """Layer-by-layer equivalent of ``self.transformer(...)`` under activation checkpointing.

        Every mask is forwarded to every layer and the stacks' final norms are
        applied, so the result matches the non-checkpointed path.
        Keyword arguments to ``checkpoint`` require ``use_reentrant=False`` (PyTorch >= 2.0).
        """
        encoder = self.transformer.encoder
        decoder = self.transformer.decoder

        memory = src
        for layer in encoder.layers:
            memory = checkpoint(
                layer,
                memory,
                src_key_padding_mask=src_key_padding_mask,
                use_reentrant=False,
            )
        # nn.Transformer's encoder/decoder own a final LayerNorm that the
        # individual layers do not apply themselves.
        if encoder.norm is not None:
            memory = encoder.norm(memory)

        output = tgt
        for layer in decoder.layers:
            output = checkpoint(
                layer,
                output,
                memory=memory,
                tgt_mask=tgt_mask,
                tgt_key_padding_mask=tgt_key_padding_mask,
                memory_key_padding_mask=memory_key_padding_mask,
                use_reentrant=False,
            )
        if decoder.norm is not None:
            output = decoder.norm(output)
        return output


class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position signals to batch-first embeddings, then applies dropout.

    Args:
        d_model: Embedding width (odd values are supported).
        dropout: Dropout probability applied after the addition.
        max_len: Longest sequence length for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Registered as a buffer so it follows the module through .to()/.cuda()
        # and is stored in the state dict under the key 'pe'.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])`` for ``x`` of shape (batch, seq_len, d_model).

        Raises:
            ValueError: If ``seq_len`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"Sequence length {seq_len} exceeds the positional encoding table ({self.pe.size(0)})"
            )
        # Follow the input's device rather than a notebook-level global.
        x = x + self.pe[:seq_len].to(x.device)
        return self.dropout(x)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ElectraTokenizer'. 
The class this function is called from is 'BertTokenizer'.


0


# Dataset / DataLoader functions

In [None]:
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    """Parallel-text dataset that tokenizes one (English, Serbian) pair per item.

    Args:
        en_texts: Sequence of English samples (anything the tokenizer accepts as
            a single-sample input, e.g. a string or a one-element list).
        sr_texts: Sequence of Serbian samples, aligned index-by-index with ``en_texts``.
        en_tokenizer: Callable tokenizer applied to the English side.
        sr_tokenizer: Callable tokenizer applied to the Serbian side.
        max_length: Truncation length passed to both tokenizers.

    Raises:
        ValueError: If the two text sequences differ in length.
    """

    def __init__(self, en_texts, sr_texts, en_tokenizer, sr_tokenizer, max_length=512):
        if len(en_texts) != len(sr_texts):
            raise ValueError(
                f"Parallel corpora differ in length: {len(en_texts)} vs {len(sr_texts)}"
            )
        self.en_texts = en_texts
        self.sr_texts = sr_texts
        self.en_tokenizer = en_tokenizer
        self.sr_tokenizer = sr_tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.en_texts)

    def _encode(self, tokenizer, text):
        """Tokenize one sample without padding and return the raw tokenizer output."""
        return tokenizer(text, return_tensors="pt", padding=False, truncation=True, max_length=self.max_length)

    def __getitem__(self, idx):
        """Return unpadded 1-D id/mask tensors for the pair at ``idx``."""
        en_encoded = self._encode(self.en_tokenizer, self.en_texts[idx])
        sr_encoded = self._encode(self.sr_tokenizer, self.sr_texts[idx])
        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # collapse a length-1 sequence into a 0-dim tensor.
        return {
            "input_ids": en_encoded["input_ids"].squeeze(0),
            "src_attention_mask": en_encoded["attention_mask"].squeeze(0),
            "labels": sr_encoded["input_ids"].squeeze(0),
            "tgt_attention_mask": sr_encoded["attention_mask"].squeeze(0),
        }

def create_dataset(dataset, batch_size=8, shuffle=False):
    """Wrap ``dataset`` in a DataLoader whose batches are padded to one shared length.

    Both language sides of a batch are padded to the same length (the longest
    sequence on either side, capped at 512) with the notebook-level
    ``en_tokenizer`` / ``sr_tokenizer``, and the tensors are moved to ``device``.

    Args:
        dataset: Dataset yielding dicts with "input_ids", "src_attention_mask",
            "labels" and "tgt_attention_mask".
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles every epoch.

    Returns:
        A ``torch.utils.data.DataLoader``.
    """
    if sr_tokenizer.pad_token is None:
        sr_tokenizer.add_special_tokens({'pad_token': '<pad>'})
    if en_tokenizer.pad_token is None:
        en_tokenizer.add_special_tokens({'pad_token': '<pad>'})

    def collate_fn(batch):
        en_lengths = [len(x["input_ids"]) for x in batch]
        sr_lengths = [len(x["labels"]) for x in batch]
        max_len = min(512, max(max(en_lengths), max(sr_lengths)))  # Shared max length

        # Pad both sides equally
        en_padded = en_tokenizer.pad(
            {"input_ids": [x["input_ids"] for x in batch],
             "attention_mask": [x["src_attention_mask"] for x in batch]},
            padding="max_length",
            max_length=max_len,
            return_tensors="pt"
        )

        sr_padded = sr_tokenizer.pad(
            {"input_ids": [x["labels"] for x in batch],
             "attention_mask": [x["tgt_attention_mask"] for x in batch]},
            padding="max_length",
            max_length=max_len,
            return_tensors="pt"
        )

        # Tensors are moved to the device inside the collate function; the
        # loader below keeps the DataLoader default of in-process loading.
        return {
            "input_ids": en_padded["input_ids"].to(device),
            "src_attention_mask": en_padded["attention_mask"].to(device),
            "labels": sr_padded["input_ids"].to(device),
            "tgt_attention_mask": sr_padded["attention_mask"].to(device),
        }

    # The statements that used to follow this return were unreachable and
    # referenced an undefined `dataloader`; they have been removed.
    return DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle
    )

# Validation data and training function

In [None]:
import pandas as pd
# Load the validation pairs from the mock legal CSV.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/LegalCSV/Val_data/mock_legal_validation.csv", encoding="utf-8")
x_val = df['english'].to_list()
y_val = df['serbian'].to_list()
# Every sample is wrapped in a one-element list.
x_val = [[x] for x in x_val]
y_val = [[x] for x in y_val]

# TranslationDataset expects (en_texts, sr_texts, en_tokenizer, sr_tokenizer);
# the tokenizers were previously passed in the opposite order, which encoded
# English text with the Serbian vocabulary and vice versa.
dataset_val = TranslationDataset(x_val, y_val, en_tokenizer, sr_tokenizer)
test_val_loader = create_dataset(dataset_val)

In [None]:
import matplotlib.pyplot as plt
import torch.optim as optim
from torch.amp import autocast, GradScaler
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW



# Build the translator with its default hyper-parameters and move it to the selected device.
model = TransformerTranslator()
model.to(device)
# AdamW over every model parameter.
optimizer = AdamW(
    model.parameters(),
    lr=2e-4,
    weight_decay=0.001
)
# Token-level cross-entropy; positions labelled with the Serbian pad id are ignored.
criterion = nn.CrossEntropyLoss(
    ignore_index=sr_tokenizer.pad_token_id,
    reduction='mean')

def _loader_for_epoch(epoch, epoch_len, w_dataloader, p_dataloader, all_dataloader):
    """Curriculum schedule: words for the first 5% of epochs, phrases until 15%, then everything."""
    if epoch < epoch_len * 0.05:
        return w_dataloader
    if epoch < epoch_len * 0.15:
        return p_dataloader
    return all_dataloader


def _translation_loss(batch):
    """Teacher-forced loss for one batch: decoder sees labels[:, :-1] and predicts labels[:, 1:]."""
    src_key_padding_mask = (batch["src_attention_mask"] == 0).to(device)
    tgt_key_padding_mask = (batch["tgt_attention_mask"][:, :-1] == 0).to(device)
    outputs = model(
        src=batch["input_ids"],
        tgt=batch["labels"][:, :-1],
        src_key_padding_mask=src_key_padding_mask,
        tgt_key_padding_mask=tgt_key_padding_mask
    )
    return criterion(
        outputs.reshape(-1, outputs.size(-1)),
        batch["labels"][:, 1:].reshape(-1)
    )


def train_model(w_dataloader, all_dataloader, p_dataloader, val_loader=None, epoch_len=150,
                accumulation_steps=2, log_every=10):
    """Train the notebook-level ``model`` with a word -> phrase -> full-data curriculum.

    Uses the notebook-level ``optimizer``, ``criterion`` and ``device``. Mixed
    precision (fp16 + loss scaling) is enabled only when running on CUDA.

    Args:
        w_dataloader: Loader used for the first 5% of epochs.
        all_dataloader: Loader used from 15% of the epochs onwards.
        p_dataloader: Loader used between 5% and 15% of the epochs.
        val_loader: Optional loader evaluated after every epoch.
        epoch_len: Number of epochs.
        accumulation_steps: Micro-batches accumulated per optimizer step.
        log_every: Print a progress line every this many optimizer steps.
    """
    model.to(device)
    model.train()

    device_type = torch.device(device).type
    use_amp = device_type == "cuda"  # fp16 autocast + GradScaler only make sense on the GPU
    scaler = GradScaler(enabled=use_amp)

    # The scheduler must cover the optimizer steps that will really happen,
    # which depends on which loader each epoch uses.
    total_steps = sum(
        math.ceil(len(_loader_for_epoch(e, epoch_len, w_dataloader, p_dataloader, all_dataloader)) / accumulation_steps)
        for e in range(epoch_len)
    )
    warmup_steps = int(0.1 * total_steps)  # 10% warmup

    # Initialize scheduler ONCE before training loop
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    optimizer_steps = 0
    previous_loader = None

    for epoch in range(epoch_len):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # Force cleanup before epoch
            print(f"\nGPU Memory Start: {torch.cuda.memory_allocated()/1e9:.2f}GB")

        dataloader = _loader_for_epoch(epoch, epoch_len, w_dataloader, p_dataloader, all_dataloader)
        if previous_loader is not None and dataloader is not previous_loader:
            print(f"Switched to dataloader with batch size: {dataloader.batch_size}")
        previous_loader = dataloader

        num_batches = len(dataloader)
        epoch_loss = 0.0
        # Gradients are cleared once here and then only after each optimizer
        # step, so micro-batch gradients really accumulate in between.
        optimizer.zero_grad()

        for i, batch in enumerate(dataloader):
            with autocast(device_type=device_type, dtype=torch.float16, enabled=use_amp):
                loss = _translation_loss(batch)

            epoch_loss += loss.item()
            # Divide so the accumulated gradient is an average over the group.
            # A trailing group with fewer micro-batches is therefore slightly down-weighted.
            scaler.scale(loss / accumulation_steps).backward()

            if (i + 1) % accumulation_steps == 0 or i == num_batches - 1:
                scaler.unscale_(optimizer)
                # clip_grad_norm_ returns the pre-clipping norm; read it here,
                # because the gradients are gone after zero_grad().
                grad_norm = float(torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0))

                scale_before = scaler.get_scale()
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                # The scaler lowers its scale when it skipped the step because of
                # inf/nan gradients; do not advance the schedule in that case.
                if scaler.get_scale() >= scale_before:
                    scheduler.step()
                optimizer_steps += 1

                if grad_norm > 5:
                    print(f"Grad norm: {grad_norm}")

                if optimizer_steps % log_every == 0:
                    current_lr = scheduler.get_last_lr()[0]
                    print(f"Epoch {epoch:03d} | Batch {i:03d} | "
                          f"LR {current_lr:.2e} | Train Loss {loss.item():.4f}")
                    print("-------------------------------------")
                    print(en_tokenizer.decode(batch['input_ids'][0]))
                    print(sr_tokenizer.decode(batch['labels'][0]))
                    print("-------------------------------------")

        avg_train_loss = epoch_loss / max(num_batches, 1)

        # --- Validation Phase ---
        avg_val_loss = float("nan")  # reported as nan when no validation loader is given
        if val_loader is not None:
            model.eval()
            val_loss = 0.0
            with torch.inference_mode():
                with autocast(device_type=device_type, dtype=torch.float16, enabled=use_amp):
                    for batch in val_loader:
                        val_loss += _translation_loss(batch).item()
            avg_val_loss = val_loss / max(len(val_loader), 1)
            model.train()

        # --- Epoch Logging ---
        current_lr = scheduler.get_last_lr()[0]
        print(f"\n[Epoch {epoch + 1:03d}/{epoch_len:03d}] "
              f"LR = {current_lr:.2e} | "
              f"Train Loss = {avg_train_loss:.4f} | "
              f"Val Loss = {avg_val_loss:.4f} | "
              f"Δ Loss = {avg_val_loss - avg_train_loss:+.4f}\n")

        # TODO: best-model checkpointing / early stopping is not implemented;
        # the earlier draft was commented out and compared a value with itself.





# Translation functions


In [None]:
def translate_single(
    model,
    english_text,
    en_tokenizer,
    sr_tokenizer,
    max_length=64,
    beam_width=5,
    length_penalty=0.6,
    device="cuda" if torch.cuda.is_available() else "cpu",
    verbose=False
):
    """Translate one English text with beam search and return the decoded Serbian string.

    Args:
        model: Translator exposing ``src_embedding``, ``tgt_embedding``,
            ``pos_encoder``, ``transformer``, ``fc_out`` and ``d_model``.
        english_text: Text to translate.
        en_tokenizer: Source tokenizer.
        sr_tokenizer: Target tokenizer; its [CLS]/[SEP] ids start and end a hypothesis.
        max_length: Source truncation length and maximum number of decoding steps.
        beam_width: Number of hypotheses kept after every step.
        length_penalty: Exponent of the length normalisation used for the final pick.
        device: Device on which the model and inputs are placed.
        verbose: When True, print the best hypothesis after each step and all final beams.

    Returns:
        The best hypothesis decoded without special tokens ("" if no beam survives).
    """
    # Tokenize input with [CLS]/[SEP]
    inputs = en_tokenizer(
        english_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
        add_special_tokens=True  # Ensure [CLS] and [SEP] are added
    ).to(device)
    model.to(device)

    cls_id = sr_tokenizer.cls_token_id
    sep_id = sr_tokenizer.sep_token_id
    assert cls_id is not None, "Tokenizer missing [CLS] token"
    assert sep_id is not None, "Tokenizer missing [SEP] token"
    beams = [([cls_id], 0.0)]  # Start with [CLS]

    model.eval()
    with torch.inference_mode():
        # The source padding mask never changes during decoding.
        src_padding_mask = inputs["attention_mask"] == 0

        # Encode the source once; same embedding/scale order as the model's forward().
        src_emb = model.pos_encoder(model.src_embedding(inputs["input_ids"])) * math.sqrt(model.d_model)
        memory = model.transformer.encoder(src_emb, src_key_padding_mask=src_padding_mask)

        for step in range(max_length):
            candidates = []
            for seq, score in beams:
                # Finished hypotheses are carried over unchanged.
                if seq[-1] == sep_id:
                    candidates.append((seq, score))
                    continue

                tgt = torch.tensor([seq], device=device)
                tgt_emb = model.pos_encoder(model.tgt_embedding(tgt)) * math.sqrt(model.d_model)
                output = model.transformer.decoder(
                    tgt_emb,
                    memory,
                    tgt_mask=model.transformer.generate_square_subsequent_mask(tgt.size(1)).to(device),
                    memory_key_padding_mask=src_padding_mask
                )
                logits = model.fc_out(output[:, -1, :])

                # Normalise over the WHOLE vocabulary before picking the top-k.
                # Normalising only the k best logits would give every expansion
                # inflated scores that cannot be compared across beams.
                log_probs = torch.log_softmax(logits.float(), dim=-1)
                k = min(beam_width, log_probs.size(-1))
                topk_scores, topk_tokens = torch.topk(log_probs, k)

                for token_score, token_id in zip(topk_scores[0].tolist(), topk_tokens[0].tolist()):
                    candidates.append((seq + [token_id], score + token_score))

            # Keep the best `beam_width` hypotheses, best first.
            beams = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_width]
            if verbose and beams:
                print(f"Step {step}: Top beam -> {sr_tokenizer.decode(beams[0][0])}")

            # Early stopping if all beams end with [SEP]
            if all(beam[0][-1] == sep_id for beam in beams):
                break

    if not beams:  # e.g. beam_width == 0
        return ""
    if verbose:
        print([sr_tokenizer.decode(seq) for seq, _score in beams])
    # Final pick uses length-normalised scores.
    best_beam = max(beams, key=lambda x: x[1] / (len(x[0])**length_penalty))
    return sr_tokenizer.decode(best_beam[0], skip_special_tokens=True)

In [None]:
def greedy_translate_single(
    model,
    english_text,
    en_tokenizer,
    sr_tokenizer,
    max_length=64,
    length_penalty=0.6,
    device="cuda" if torch.cuda.is_available() else "cpu"
):
    """Translate one English text by always taking the single most likely next token.

    Decoding starts from the target tokenizer's [CLS] id and stops at [SEP] or
    after ``max_length`` generated tokens. ``length_penalty`` is accepted for
    signature parity with the beam-search variant and is not used.

    Returns:
        The generated sequence decoded without special tokens.
    """
    encoded = en_tokenizer(
        english_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
        add_special_tokens=True  # Ensure [CLS] and [SEP] are added
    ).to(device)

    model.to(device)
    cls_id = sr_tokenizer.cls_token_id
    sep_id = sr_tokenizer.sep_token_id
    assert cls_id is not None, "Tokenizer missing [CLS] token"
    assert sep_id is not None, "Tokenizer missing [SEP] token"
    generated = [cls_id]

    model.eval()
    with torch.inference_mode():
        source_states = model.pos_encoder(model.src_embedding(encoded["input_ids"])) * math.sqrt(model.d_model)
        source_states = source_states.to(device)
        memory = model.transformer.encoder(
            source_states,
            src_key_padding_mask=(encoded["attention_mask"] == 0)
        ).to(device)

        # One token is appended per iteration, so this runs at most max_length times.
        while len(generated) - 1 < max_length and generated[-1] != sep_id:
            decoder_ids = torch.tensor(generated, device=device).unsqueeze(0)
            decoder_states = model.pos_encoder(model.tgt_embedding(decoder_ids)) * math.sqrt(model.d_model)
            causal_mask = model.transformer.generate_square_subsequent_mask(decoder_ids.size(1)).to(device)

            hidden = model.transformer.decoder(
                decoder_states,
                memory,
                tgt_mask=causal_mask,
                memory_key_padding_mask=(encoded["attention_mask"] == 0)
            )

            last_step_logits = model.fc_out(hidden[:, -1, :])
            _best_score, best_token = torch.topk(last_step_logits, 1)
            generated.append(best_token.squeeze(-1).item())

    return sr_tokenizer.decode(generated, skip_special_tokens=True)


# Data Importing and Cleaning

In [None]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0


In [None]:
from docx import Document
import re
import os

docs_file = '/content/drive/MyDrive/Colab Notebooks/TranslatedDocs'
x_train = []
y_train = []


# Each .docx is read from the first row of its first table: cell 0 goes to
# x_train, cell 1 to y_train. Filenames are sorted because os.listdir returns
# entries in arbitrary order, which would make every derived list irreproducible.
for filename in sorted(os.listdir(docs_file)):
    if not filename.endswith('.docx'):
        continue
    full_path = os.path.join(docs_file, filename)
    doc = Document(full_path)

    # Skip documents that do not contain the expected two-column table.
    if not doc.tables or not doc.tables[0].rows or len(doc.tables[0].rows[0].cells) < 2:
        print(f"Skipping {filename}: no two-column table found")
        continue

    first_row = doc.tables[0].rows[0]
    source_text = first_row.cells[0].text
    target_text = first_row.cells[1].text

    # Character counts before/after collapsing double newlines (sanity check).
    x_train.append(source_text)
    print(len(source_text))
    print(len(source_text.replace('\n\n', '\n')))
    print("===================")
    y_train.append(target_text)
    print(len(target_text))
    print(len(target_text.replace('\n\n', '\n')))
    print("######################")

test_x = []
test_y = []

def replace_consecutive_newlines(data):
    """Collapse every run of two or more newlines into a single newline.

    Strings are processed directly, lists are processed element-wise
    (recursively), and any other value is returned unchanged. A single call is
    enough regardless of how long a run of newlines is.
    """
    if isinstance(data, str):
        return re.sub(r'\n{2,}', '\n', data)
    if isinstance(data, list):
        return [replace_consecutive_newlines(item) for item in data]
    return data  # Other data types are passed through untouched

def normalize_spaces(text):
    """Squash every whitespace run in ``text`` to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def easy_split(text):
    """Cut ``text`` right after every newline and drop the whitespace-only pieces.

    The kept pieces still carry their trailing newline.
    """
    kept = []
    for piece in re.split(r'(?<=\n)', text):
        if piece.strip() != '':
            kept.append(piece)
    return kept

x_train_para = []
y_train_para = []

# NOTE(review): the sample output of this cell shows at least one document whose
# first cell holds the Serbian text — verify which column is which language
# before trusting the 'english' / 'serbian' labels written below.
for dummy_x, dummy_y in zip(x_train, y_train):
    # Collapse blank lines until none remain.
    while '\n\n' in dummy_x:
        dummy_x = replace_consecutive_newlines(dummy_x)
    while '\n\n' in dummy_y:
        dummy_y = replace_consecutive_newlines(dummy_y)

    # One entry per non-empty line, with whitespace normalised.
    t_x = [normalize_spaces(x) for x in easy_split(dummy_x)]
    t_y = [normalize_spaces(y) for y in easy_split(dummy_y)]

    test_x.append(t_x)
    test_y.append(t_y)

    # Paragraphs are paired purely by position, so a count mismatch means the
    # pairs after the first divergence are unreliable.
    if len(t_x) != len(t_y):
        print(f"Warning: paragraph count mismatch ({len(t_x)} vs {len(t_y)}); "
              f"pairing by position up to the shorter side")

    # Keep only pairs whose first-column paragraph is longer than 10 characters.
    new_x = []
    new_y = []
    for para_x, para_y in zip(t_x, t_y):
        if len(para_x) > 10:
            new_x.append(para_x)
            new_y.append(para_y)

    # Show one sample pair per document when there are enough paragraphs.
    if len(new_x) > 5:
        print(new_x[5])
        print("----------")
        print(new_y[5])
        print("%%%%%%%%%%%%%%%%%%%")

    # Every sample is stored as a one-element list.
    for para_x, para_y in zip(new_x, new_y):
        x_train_para.append([para_x])
        y_train_para.append([para_y])

# Flatten the accumulated pairs of ALL documents. The previous version used the
# last document's `new_x` and took `item[0]`, i.e. the first character of each string.
x_flat = [item[0] for item in x_train_para]
y_flat = [item[0] for item in y_train_para]

df = pd.DataFrame({'english': x_flat, 'serbian': y_flat})
df.to_csv('/content/drive/MyDrive/Colab Notebooks/LegalCSV/Docs/doc_paragraphs.csv', index=False)



12943
12891
14038
13975
######################
78163
77944
81689
81467
######################
2584
2572
2745
2736
######################
15050
14974
16630
16566
######################
6946
6895
6981
6929
######################
58333
58130
65556
65353
######################
122745
122667
129578
129506
######################
57872
57673
64755
64553
######################
Услуга Penetration test подразумева проверу функционалности и безбедности система и могућности угрожавања безбедности података које системи користе, препознавањем и експлоатацијом рањивости, и то система из прилога 1.
----------
Service Penetration test involves checking the functionality and security of systems and data capabilities of security threats that the systems use, by recognizing and exploiting vulnerabilities of the systems listed in Annex 1.
%%%%%%%%%%%%%%%%%%%
7) Plaćanje
----------
7) Payments
%%%%%%%%%%%%%%%%%%%
Vlastodavac se obavezuje da će isplatiti naknadu iz prethodnog stava na osnovu fakture punomoćn

In [None]:
# Dump every paragraph (first-column side followed by second-column side) into
# one UTF-8 text file, one paragraph per line.
lines = x_train_para + y_train_para
with open("/content/drive/MyDrive/Colab Notebooks/TranslatedDocs/raw_legal_text.txt", "w", encoding="utf-8") as f:
  f.writelines(line[0] + '\n' for line in lines)

# CSV Importing and Cleaning

In [None]:
import random

def combine_dataset(x_terms, y_terms, seed=None):
    """Shuffle two aligned lists together so that pairs stay matched.

    Args:
        x_terms: Source-side samples.
        y_terms: Target-side samples, aligned index-by-index with ``x_terms``.
        seed: Optional seed for a reproducible shuffle. When omitted, the
            module-level ``random`` state is used.

    Returns:
        ``(x, y)`` — two new lists in the same shuffled order.

    Raises:
        ValueError: If the inputs differ in length (zip would silently drop the surplus).
    """
    x_terms = list(x_terms)
    y_terms = list(y_terms)
    if len(x_terms) != len(y_terms):
        raise ValueError(
            f"Cannot combine lists of different length: {len(x_terms)} vs {len(y_terms)}"
        )

    all_terms = list(zip(x_terms, y_terms))
    if not all_terms:
        # zip(*[]) yields nothing to unpack, so handle the empty case explicitly.
        return [], []

    if seed is None:
        random.shuffle(all_terms)
    else:
        random.Random(seed).shuffle(all_terms)

    x, y = zip(*all_terms)
    return list(x), list(y)

def _load_pairs(csv_path):
    """Read an english/serbian CSV and return two aligned lists of one-element lists.

    Header names are stripped of surrounding whitespace (one of the files is
    read with an 'english ' column), and rows missing either side are dropped
    so that no NaN reaches the tokenizers.
    """
    frame = pd.read_csv(csv_path, encoding="utf-8")
    frame.columns = frame.columns.str.strip()
    frame = frame.dropna(subset=['english', 'serbian'])
    english = [[text] for text in frame['english'].astype(str)]
    serbian = [[text] for text in frame['serbian'].astype(str)]
    return english, serbian


# Basic words
x_train_common_words, y_train_common_words = _load_pairs(
    "/content/drive/MyDrive/Colab Notebooks/LegalCSV/Reci/osnovne_reci.csv"
)

# Law-related words
x_train_legal_words, y_train_legal_words = _load_pairs(
    "/content/drive/MyDrive/Colab Notebooks/LegalCSV/Reci/legal_terms_en_sr.csv"
)

# Phrases
x_train_legal_phrases, y_train_legal_phrases = _load_pairs(
    "/content/drive/MyDrive/Colab Notebooks/LegalCSV/Fraze/common_law_phrases.csv"
)

# For now the document paragraphs are not included
raw_all_x_train = x_train_legal_phrases + x_train_legal_words + x_train_common_words
raw_all_y_train = y_train_legal_phrases + y_train_legal_words + y_train_common_words
all_x_train, all_y_train = combine_dataset(raw_all_x_train, raw_all_y_train)



In [None]:
dataset_word = TranslationDataset(x_train_common_words + x_train_legal_words, y_train_common_words + y_train_legal_words, en_tokenizer, sr_tokenizer)
dataloader_word = create_dataset(dataset_word)
# Sanity-check only the first few batches; printing every batch of the loader
# floods the cell output without adding information.
for batch_index, batch in enumerate(dataloader_word):
  if batch_index >= 3:
    break
  print("===========================")
  print(en_tokenizer.decode(batch["input_ids"][0]))
  print(batch["src_attention_mask"])
  print(sr_tokenizer.decode(batch['labels'][0]))
  print(batch['tgt_attention_mask'])

[CLS] of [SEP]
tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]], device='cuda:0')
[CLS] o [SEP]
tensor([[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]], device='cuda:0')
[CLS] upon [SEP] [PAD] [PAD]
tensor([[1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0]], device='cuda:0')
[CLS] po [SEP] [PAD] [PAD]
tensor([[1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0]], device='cuda:0')
[CLS] shall [SEP] [PAD] [PAD]
tensor([[1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [

In [None]:
# Full shuffled corpus (phrases + words).
dataset_all = TranslationDataset(all_x_train, all_y_train, en_tokenizer, sr_tokenizer)
dataloader_all = create_dataset(dataset_all)

# Legal phrases only.
dataset_phrase = TranslationDataset(x_train_legal_phrases, y_train_legal_phrases, en_tokenizer, sr_tokenizer)
dataloader_phrase = create_dataset(dataset_phrase)

# Single words (legal terms followed by common words).
dataset_word = TranslationDataset(x_train_legal_words + x_train_common_words, y_train_legal_words + y_train_common_words, en_tokenizer, sr_tokenizer)
dataloader_word = create_dataset(dataset_word)

# Peek at the first word batch: sample index 6 decoded on both sides, plus both masks.
batch = next(iter(dataloader_word), None)
if batch is not None:
  print(en_tokenizer.decode(batch["input_ids"][6]))
  print(batch["src_attention_mask"])
  print(batch["tgt_attention_mask"])
  print(sr_tokenizer.decode(batch["labels"][6]))
  print("--------------------")



[CLS] legal [SEP] [PAD]
tensor([[1, 1, 1, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 0]], device='cuda:0')
tensor([[1, 1, 1, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 1],
        [1, 1, 1, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 0],
        [1, 1, 1, 0]], device='cuda:0')
[CLS] pravni [SEP] [PAD]
--------------------


# Saving and inputs


In [None]:
# Train for 40 epochs with the curriculum loaders: words first, then phrases, then the full set.
# NOTE(review): no val_loader is passed, so train_model skips its validation phase.
train_model(w_dataloader=dataloader_word, p_dataloader=dataloader_phrase, all_dataloader=dataloader_all, epoch_len=40)


GPU Memory Start: 1.19GB




-------------------------------------
[CLS] liability [SEP] [PAD]
[CLS] odgovornost [SEP] [PAD]
-------------------------------------
-------------------------------------
[CLS] injunction [SEP] [PAD] [PAD] [PAD]
[CLS] sudska zabrana [SEP] [PAD]
-------------------------------------
-------------------------------------
[CLS] penalty [SEP] [PAD]
[CLS] kazna [SEP] [PAD]
-------------------------------------
-------------------------------------
[CLS] justice [SEP] [PAD] [PAD] [PAD]
[CLS] pravda [SEP] [PAD] [PAD] [PAD]
-------------------------------------
-------------------------------------
[CLS] tax [SEP] [PAD] [PAD] [PAD]
[CLS] porez [SEP] [PAD] [PAD] [PAD]
-------------------------------------
-------------------------------------
[CLS] intellectual property [SEP] [PAD] [PAD]
[CLS] intelektualna svojina [SEP]
-------------------------------------
-------------------------------------
[CLS] of [SEP]
[CLS] od [SEP]
-------------------------------------
-------------------------------

In [None]:
# Persist the whole model object (not just its state dict) to Drive.
# NOTE(review): pickling the full module ties the file to this exact class
# definition; saving model.state_dict() is the more portable option.
torch.save(model, '/content/drive/MyDrive/Colab Notebooks/SavedModels/model.pth')

In [None]:
# Reload the pickled model object from Drive.
# SECURITY: weights_only=False runs the full unpickler, which can execute
# arbitrary code — only load files you created yourself.
model = torch.load('/content/drive/MyDrive/Colab Notebooks/SavedModels/model.pth', weights_only=False)

In [None]:
# Show one aligned (English, Serbian) pair from the shuffled training lists.
print(all_x_train[2])
print(all_y_train[2])

['lease']
['zakup']


In [None]:
# Beam-search translation of a single English term.
translation = translate_single(
    model=model,
    english_text="lease",
    en_tokenizer=en_tokenizer,
    sr_tokenizer=sr_tokenizer
)
print(translation)

Step 0: Top beam -> [CLS]
Step 0: Top beam -> [CLS] odredba
Step 1: Top beam -> [CLS] odredba
Step 1: Top beam -> [CLS] odredba [SEP]
Step 2: Top beam -> [CLS] odredba [SEP]
Step 2: Top beam -> [CLS] odredba [SEP]
['[CLS] zabrana otkrivanja [SEP]', '[CLS] odbrana [SEP]', '[CLS] nalog [SEP]', '[CLS] od [SEP]', '[CLS] odredba [SEP]']
odredba


In [None]:
# Greedy translation of the same term, for comparison with the beam-search result.
translation = greedy_translate_single(
    model=model,
    english_text="lease",
    en_tokenizer=en_tokenizer,
    sr_tokenizer=sr_tokenizer
)
print(translation)

odredba
