In [35]:
#!pip install transformers
#!pip install Levenshtein

In [36]:
# Importing required packages
import pandas as pd
import numpy as np
import random
import math
import re
import torch
from tqdm import tqdm

from transformers import MT5Tokenizer, MT5ForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration
from Levenshtein import distance as levenshtein_distance

In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [38]:
mask_per = 15
model_size = 'large'
tokenizer = MT5Tokenizer.from_pretrained(f"google/mt5-{model_size}")
model = MT5ForConditionalGeneration.from_pretrained(f"google/mt5-{model_size}")



In [39]:
# model_size = '3b'
# tokenizer = T5Tokenizer.from_pretrained(f"google-t5/t5-{model_size}")
# model = T5ForConditionalGeneration.from_pretrained(f"google-t5/t5-{model_size}")

# model_size='xl'

In [40]:
TITLE = 'nrc'
TYPE = 'baseline' #'unique' #'duplicates'

MASKED_INPUT_PATH = f'..\\data\\{TYPE}\\masked_input\\masked_input_{mask_per}'

masked_data = pd.read_csv(f'{MASKED_INPUT_PATH}\\masked_sequences_{TITLE}.csv')

In [41]:
masked_data.head()

Unnamed: 0,Input sequence (ground truth),Input sequence (masked),Masked tokens,Number of masked tokens,Masking percentage,Duplicate count
0,Met een spectaculaire reddingsoperatie wist he...,Met een spectaculaire reddingsoperatie wist he...,"['actie voor de', '120 gijzelaars die']",6,15.0,1
1,Met een spectaculaire reddingsoperatie wist he...,<extra_id_0> reddingsoperatie wist het Isralis...,"['Met een spectaculaire', 'prijs aan Palestijn...",6,15.0,1
2,Op 1 juni begint de Nederlander Hermen Hulst a...,Op 1 juni begint de Nederlander Hermen Hulst a...,"['van Sony Interactive', 'gamebedrijf door een...",7,15.0,1
3,De liberaal-conservatieve Nieuw-Vlaamse Allian...,De liberaal-conservatieve <extra_id_0> blijft ...,"['Nieuw-Vlaamse Alliantie (N-VA)', 'de regiona...",6,15.0,1
4,De elfde editie van de Willem IV Rally was een...,De <extra_id_0> de Willem IV Rally was een gro...,"['elfde editie van', 'van de organisatie. Dit']",7,15.0,1


In [42]:
# Handle multiple gpus if available
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = torch.nn.DataParallel(model)

model.to(device)

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 1024)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 1024)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=Fals

# Functions

In [43]:
# Combine tokens into final output sequence
def flatten(lst):
  flat_list = []
  for item in lst:
    if isinstance(item, list):
      flat_list.extend(flatten(item))
    else:
      flat_list.append(item)
  return flat_list

In [44]:
def similarity(masked_tokens, filled_tokens):

  distances = []
  # Iterate through each pair of tokens
  for masked_token, filled_token in zip(masked_tokens, filled_tokens):
    try:
      # Calculate Levenshtein Distance between the two tokens
      distance = levenshtein_distance(masked_token, filled_token)

      # Normalize distance by dividing by the length of the longer token
      max_length = max(len(masked_token), len(filled_token))
      normalized_distance = distance / max_length

      # Append the normalized distance to the list of distances
      distances.append(normalized_distance)
    except:
      distances.append(None)

  # Return the list of normalized distances
  return distances

In [45]:
def fill_mask(input_data, model_size=model_size, sequence_id=None, model=model, tokenizer=tokenizer, device=device):

  # Retrieve input sequence data
  input_sequence = input_data['Input sequence (ground truth)']
  input_sequence_masked = input_data['Input sequence (masked)']
  masked_tokens = [item.strip("'") for item in input_data['Masked tokens'].strip("[]").split(", ")]
  n_masks = input_data['Number of masked tokens']
  duplicate_count = input_data['Duplicate count']

  # Tokenize the input sequence
  input_ids = tokenizer(input_sequence_masked, return_tensors="pt").input_ids
  input_ids = input_ids.to(device)

  # Generate the output sequence
  #with torch.no_grad():
  #  sequence_ids = model.generate(input_ids, max_length=len(input_sequence_masked)+(len(masked_tokens))) # max_length = length masked input sequence + length masked input tokens

  # Generate the output sequence, handle multiple gpu's if available
  with torch.no_grad():
      if isinstance(model, torch.nn.DataParallel):
          sequence_ids = model.module.generate(input_ids, max_length=len(input_sequence_masked) + len(masked_tokens))
      else:
          sequence_ids = model.generate(input_ids, max_length=len(input_sequence_masked) + len(masked_tokens))

  sequence_ids = sequence_ids.cpu()

  # Decode the generated sequence
  generated_sequence = tokenizer.batch_decode(sequence_ids, skip_special_tokens=False)[0]

  # Combine the input sequence and generated sequence, replacing masked tokens with model output
  filled_tokens = []
  output_sequence = []
  input_tokens = input_sequence_masked.split()
  generated_tokens = generated_sequence.split()
  print(input_tokens)
  print(generated_tokens)

  try:
    input_token_index = 0
    mask_tag_id = 0
    for token in input_tokens:
      # If mask tag is found in input tokens, append every token between the mask tag and the next mask tag
      if "<extra_id_" in token:
        # If mask tag_id is higher than number of masked tokens, skip.
        if mask_tag_id > n_masks:
          continue
        else:
          # Define mask tags
          mask_tag = f"<extra_id_{mask_tag_id}>"
          mask_tag_next = f"<extra_id_{mask_tag_id+1}>"

          # Retrieve indices for tokens between mask tags
          start_index = [i for i, item in enumerate(generated_tokens) if mask_tag in item][0]

          # If next mask_id is not in the generated tokens, end_index is last generated token
          try:
            end_index = [i for i, item in enumerate(generated_tokens) if mask_tag_next in item][0]
          except:
            end_index = len(generated_tokens)

          # Append in between tokens
          tokens_to_append = generated_tokens[start_index+1:end_index]
          filled_tokens.append(tokens_to_append)
          output_sequence.append(tokens_to_append)

          # Update indices
          input_token_index +=1
          mask_tag_id = mask_tag_id+1

      # Append input sequence token
      else:
        output_sequence.append(input_tokens[input_token_index])
        input_token_index+=1

    # Flatten sequence
    output_sequence_flat = flatten(output_sequence)

    # Remove nested list
    filled_tokens = [' '.join(tokens) for tokens in filled_tokens]

    # Clean possible </s> tag form last token
    last_token = output_sequence_flat[-1]
    if last_token.endswith("</s>"):
      output_sequence_flat[-1] = last_token[:-4]
      filled_tokens[-1] = last_token[:-4]

    final_output_sequence = " ".join(output_sequence_flat)

    # Calculate similarity
    levenshtein_distances = similarity(masked_tokens, filled_tokens)
    try:
      levenshtein_avg = round((sum(levenshtein_distances))/(len(levenshtein_distances)), 2)
    except:
      levenshtein_avg = None

  # If error, e.g. if not all tags are in generated output, make output empty and therefore invalid
  except:
    filled_tokens = None
    final_output_sequence = None
    levenshtein_distances = None
    levenshtein_avg = None

  print(f"Sequence id:                      {sequence_id}")
  print(f"Input sequence (masked):          {input_sequence_masked}")
  print(f"Final model output sequence:      {final_output_sequence}")
  print(f"Input sequence (ground truth):    {input_sequence}")
  print(f"Masked tokens:                    {masked_tokens}")
  print(f"Filled tokens:                    {filled_tokens}")
  print(f"Levenshtein distance (avg):       {levenshtein_avg}")
  print("")

  results_df_temp = None
  # Check if the DataFrame exists
  if results_df_temp is None:
      results_df_temp = pd.DataFrame(columns=["Sequence id", "Model size",
                                              "Input sequence (ground truth)", "Input sequence (masked)",
                                              "Model output sequence", "Masked tokens", "Filled tokens",
                                              "Number of masked tokens", "Masking percentage", "Duplicate count",
                                              "Levenshtein distances", "Levenshtein distance (avg)"])

  new_data = {
        "Sequence id": [sequence_id],
        "Model size": [model_size],
        "Input sequence (ground truth)": [input_sequence],
        "Input sequence (masked)": [input_sequence_masked],
        "Model output sequence": [final_output_sequence],
        "Masked tokens": [masked_tokens],
        "Filled tokens": [filled_tokens],
        "Number of masked tokens": [n_masks],
        "Masking percentage": [mask_per],
        "Duplicate count": [duplicate_count],
        "Levenshtein distances": [levenshtein_distances],
        "Levenshtein distance (avg)": [levenshtein_avg]
  }

  results_df_temp = pd.DataFrame(new_data)


  return results_df_temp

# Generating

In [46]:
# Loop over sequences and fill_mask
results_df = None

for i in range(len(masked_data)):
  print(f"seq {i}")
  results_df_temp = fill_mask(masked_data.iloc[i], sequence_id=f"seq{i}-0") 
  # Add results to df
  results_df = pd.concat([results_df, results_df_temp], ignore_index=True)


seq 0


['Met', 'een', 'spectaculaire', 'reddingsoperatie', 'wist', 'het', 'Isralische', 'leger', 'zaterdag', 'vier', 'gijzelaars', 'te', 'bevrijden.', 'De', 'prijs', 'aan', 'Palestijnse', 'zijde', 'is', 'echter', 'ongekend', 'hoog.', 'Wat', 'betekent', 'deze', '<extra_id_0>', 'ongeveer', '<extra_id_1>', 'nog', 'vastzitten?', 'En', 'waarom', 'zwijgen', 'de', 'meeste', 'wereldleiders?', 'Vijf', 'vragen', 'beantwoord.']
['<pad>', '<extra_id_0>', 'prijs', 'voor', '<extra_id_1>', '200.000', 'Palestijnen', 'die', '<extra_id_2>', '200.000', 'mensen', 'die', 'er', '<extra_id_3>', 'prijs', 'voor', 'de', '200.000', 'mensen', 'die', 'er', 'nu', 'al', '<extra_id_15>', 'een', 'jaar', '<extra_id_2>', '200.000', 'mensen', 'die', 'er', '<extra_id_2>', '200.000', 'mensen', 'die', 'er', 'nu', '<extra_id_2>', '200.000', 'mensen', 'die', 'er', '<extra_id_16>', 'prijs', 'voor', 'de', '<extra_id_17>', 'prijs', 'voor', 'de', '<extra_id_18>', '200.000', 'mensen', 'die', 'er', '<extra_id_19>t', '<extra_id_20>', '200.

In [47]:
results_df

Unnamed: 0,Sequence id,Model size,Input sequence (ground truth),Input sequence (masked),Model output sequence,Masked tokens,Filled tokens,Number of masked tokens,Masking percentage,Duplicate count,Levenshtein distances,Levenshtein distance (avg)
0,seq0-0,large,Met een spectaculaire reddingsoperatie wist he...,Met een spectaculaire reddingsoperatie wist he...,Met een spectaculaire reddingsoperatie wist he...,"[actie voor de, 120 gijzelaars die]","[prijs voor, 200.000 Palestijnen die]",6,15,1,"[0.6153846153846154, 0.6956521739130435]",0.66
1,seq1-0,large,Met een spectaculaire reddingsoperatie wist he...,<extra_id_0> reddingsoperatie wist het Isralis...,In een reddingsoperatie wist het Isralische le...,"[Met een spectaculaire, prijs aan Palestijnse]","[In een, spanning aan Israëlische]",6,15,1,"[0.8095238095238095, 0.625]",0.72
2,seq2-0,large,Op 1 juni begint de Nederlander Hermen Hulst a...,Op 1 juni begint de Nederlander Hermen Hulst a...,Op 1 juni begint de Nederlander Hermen Hulst a...,"[van Sony Interactive, gamebedrijf door een tu...","[van Sony Computer, speler van zijn]",7,15,1,"[0.5, 0.8125]",0.66
3,seq3-0,large,De liberaal-conservatieve Nieuw-Vlaamse Allian...,De liberaal-conservatieve <extra_id_0> blijft ...,De liberaal-conservatieve Partij voor de Vrijh...,"[Nieuw-Vlaamse Alliantie (N-VA), de regionale ...","[Partij voor de Vrijheid, de regionale verkiez...",6,15,1,"[0.9, 0.038461538461538464]",0.47
4,seq4-0,large,De elfde editie van de Willem IV Rally was een...,De <extra_id_0> de Willem IV Rally was een gro...,De eerste editie van de Willem IV Rally was ee...,"[elfde editie van, van de organisatie. Dit]","[eerste editie van, Dit]",7,15,1,"[0.23529411764705882, 0.8695652173913043]",0.55
...,...,...,...,...,...,...,...,...,...,...,...,...
95,seq95-0,large,Zelden won een renner een van de drie grote ro...,Zelden won een renner een <extra_id_0> grote r...,Zelden won een renner een etappe in grote rond...,"[van de drie, Tadej Pogacar de Giro]","[etappe in, de Giro]",7,15,1,"[0.7272727272727273, 0.6666666666666666]",0.70
96,seq96-0,large,De van televisie bekende Henritte Momma is ein...,<extra_id_0> televisie bekende Henritte Momma ...,De op televisie bekende Henritte Momma is eind...,"[De van, haar de afgelopen]","[De op, haar de afgelopen]",5,15,1,"[0.5, 0.0]",0.25
97,seq97-0,large,De laatste verkeerstuin van Nederland in zijn ...,De laatste verkeerstuin van Nederland in zijn ...,De laatste verkeerstuin van Nederland in zijn ...,[subsidie stopzet. Eeuwig],[plannen wil aanpassen. Een],3,15,1,[0.8076923076923077],0.81
98,seq98-0,large,Nog nooit kregen zo weinig babys borstvoeding....,Nog nooit kregen <extra_id_0> borstvoeding. Er...,Nog nooit kregen ze borstvoeding. Er wordt te ...,[zo weinig babys],[ze],3,15,1,[0.8666666666666667],0.87


In [48]:
#results_df.to_csv(f"..\\results\\20240520\{TITLE}\\{mask_per}\\results_{TITLE}_{model_size}.csv", index=False)
results_df.to_csv(f"..\\results\\working_results\\mT5\\{TYPE}\\results_{mask_per}\\{TITLE}\\results_{TITLE}_{model_size}.csv", index=False)

**Sliding window**

In [49]:
# # Loop over sentences and fill_mask
# results_df = None
# for i in range(len(duplicates_groene)):

#   input_token_len = len(duplicates_groene.text[i].split())

#   # Skip is sequence is to short
#   if input_token_len < 10:
#     continue

#   # Sliding window start_index, n_masks
#   n_masks=3
#   for idx in range(input_token_len - n_masks):
#     print(f"Start index: {idx}")
#     # Generate results
#     results_df_temp = fill_mask(duplicates_groene.text[i], mask_start_index=idx, n_masks=n_masks, sequence_id=f"seq{i}")
#     # Add results to df
#     results_df = pd.concat([results_df, results_df_temp], ignore_index=True)

In [50]:
results_df

Unnamed: 0,Sequence id,Model size,Input sequence (ground truth),Input sequence (masked),Model output sequence,Masked tokens,Filled tokens,Number of masked tokens,Masking percentage,Duplicate count,Levenshtein distances,Levenshtein distance (avg)
0,seq0-0,large,Met een spectaculaire reddingsoperatie wist he...,Met een spectaculaire reddingsoperatie wist he...,Met een spectaculaire reddingsoperatie wist he...,"[actie voor de, 120 gijzelaars die]","[prijs voor, 200.000 Palestijnen die]",6,15,1,"[0.6153846153846154, 0.6956521739130435]",0.66
1,seq1-0,large,Met een spectaculaire reddingsoperatie wist he...,<extra_id_0> reddingsoperatie wist het Isralis...,In een reddingsoperatie wist het Isralische le...,"[Met een spectaculaire, prijs aan Palestijnse]","[In een, spanning aan Israëlische]",6,15,1,"[0.8095238095238095, 0.625]",0.72
2,seq2-0,large,Op 1 juni begint de Nederlander Hermen Hulst a...,Op 1 juni begint de Nederlander Hermen Hulst a...,Op 1 juni begint de Nederlander Hermen Hulst a...,"[van Sony Interactive, gamebedrijf door een tu...","[van Sony Computer, speler van zijn]",7,15,1,"[0.5, 0.8125]",0.66
3,seq3-0,large,De liberaal-conservatieve Nieuw-Vlaamse Allian...,De liberaal-conservatieve <extra_id_0> blijft ...,De liberaal-conservatieve Partij voor de Vrijh...,"[Nieuw-Vlaamse Alliantie (N-VA), de regionale ...","[Partij voor de Vrijheid, de regionale verkiez...",6,15,1,"[0.9, 0.038461538461538464]",0.47
4,seq4-0,large,De elfde editie van de Willem IV Rally was een...,De <extra_id_0> de Willem IV Rally was een gro...,De eerste editie van de Willem IV Rally was ee...,"[elfde editie van, van de organisatie. Dit]","[eerste editie van, Dit]",7,15,1,"[0.23529411764705882, 0.8695652173913043]",0.55
...,...,...,...,...,...,...,...,...,...,...,...,...
95,seq95-0,large,Zelden won een renner een van de drie grote ro...,Zelden won een renner een <extra_id_0> grote r...,Zelden won een renner een etappe in grote rond...,"[van de drie, Tadej Pogacar de Giro]","[etappe in, de Giro]",7,15,1,"[0.7272727272727273, 0.6666666666666666]",0.70
96,seq96-0,large,De van televisie bekende Henritte Momma is ein...,<extra_id_0> televisie bekende Henritte Momma ...,De op televisie bekende Henritte Momma is eind...,"[De van, haar de afgelopen]","[De op, haar de afgelopen]",5,15,1,"[0.5, 0.0]",0.25
97,seq97-0,large,De laatste verkeerstuin van Nederland in zijn ...,De laatste verkeerstuin van Nederland in zijn ...,De laatste verkeerstuin van Nederland in zijn ...,[subsidie stopzet. Eeuwig],[plannen wil aanpassen. Een],3,15,1,[0.8076923076923077],0.81
98,seq98-0,large,Nog nooit kregen zo weinig babys borstvoeding....,Nog nooit kregen <extra_id_0> borstvoeding. Er...,Nog nooit kregen ze borstvoeding. Er wordt te ...,[zo weinig babys],[ze],3,15,1,[0.8666666666666667],0.87


In [51]:
# from google.colab import files

# results_df.to_csv('results_df.csv')
# files.download('results_df.csv')