In [269]:
import pandas as pd
import random
import math

In [270]:
TITLE = 'nrc'
TYPE = 'baseline' # 'duplicates' 'unique'
MASK_PERC = 0.30
MASK_LEVEL_FOLDER = 'masked_input_30'

In [271]:
df = pd.read_csv(f"..\\data\\{TYPE}\\text_{TYPE}_{TITLE}.csv")

# Filter out rows with less than 20 characters in the 'text' column
data = df[df['text'].str.len() >= 20]

# Reindex the DataFrame
data = data.reset_index(drop=True)

# Display the first few rows of the filtered DataFrame
print(data.head())

   Unnamed: 0                                               text  text_count  \
0           2  Met een spectaculaire reddingsoperatie wist he...           1   
1           6  Met een spectaculaire reddingsoperatie wist he...           1   
2           7  Op 1 juni begint de Nederlander Hermen Hulst a...           1   
3           9  De liberaal-conservatieve Nieuw-Vlaamse Allian...           1   
4          14  De elfde editie van de Willem IV Rally was een...           1   

                                                link paywall  
0  Hoe vang je een planetoïde? China heeft groots...     ja   
1  https://www.nrc.nl/nieuws/2024/05/24/follow-th...     ja   
2       Borstvoeding geven is niet meer gewoon - NRC     ja   
3  https://www.nrc.nl/nieuws/2024/05/31/trump-zaa...     ja   
4  https://www.nrc.nl/nieuws/2024/05/29/zelfstand...     ja   


In [272]:
import re
def clean_input(input_sequence):
    # Remove unknown characters using regex
    cleaned_sequence = re.sub(r'[^\x00-\x7F]+', '', input_sequence)
    return cleaned_sequence

In [273]:
# Combine tokens into final output sentence
def flatten(lst):
  flat_list = []
  for item in lst:
    if isinstance(item, list):
      flat_list.extend(flatten(item))
    else:
      flat_list.append(item)
  return flat_list

In [274]:
def find_valid_spans(input_tokens, span_lengths, gap=1, max_attempts=100):
    """
    Finds valid spans for masking in the input tokens while ensuring gaps between spans.

    Args:
    - input_tokens (list): List of input tokens.
    - span_lengths (list): List of lengths for each span.
    - gap (int): Minimum gap required between spans.
    - max_attempts (int): Maximum number of attempts to find valid spans.

    Returns:
    - list: Sorted list of indices for the valid spans.

    Raises:
    - ValueError: If valid spans cannot be found after the specified number of attempts.
    """
    
    for attempt in range(max_attempts):
        selected_indices = set()  # Set to store indices of selected spans
        used_indices = set()      # Set to store indices that are already used
        random.shuffle(span_lengths)  # Shuffle span lengths for variability
        potential_starts = list(range(len(input_tokens)))  # Potential start indices for spans
        random.shuffle(potential_starts)  # Shuffle start indices for randomness

        for span_length in span_lengths:
            found = False
            for start_index in potential_starts:
                # Ensure the span does not exceed the input length
                if start_index > len(input_tokens) - span_length:
                    continue

                # Define the range of the current span
                span_range = range(start_index, start_index + span_length)

                # Check if the span range and the adjacent indices are not used
                if all(idx not in used_indices for idx in span_range) and \
                   all(idx - 1 not in used_indices for idx in span_range) and \
                   all(idx + 1 not in used_indices for idx in span_range):
                    selected_indices.update(span_range)  # Add span indices to the set
                    used_indices.update(span_range)      # Mark these indices as used
                    potential_starts = [idx for idx in potential_starts if idx >= start_index + span_length + gap]
                    found = True
                    break

            if not found:
                break

        # Check if the total selected indices match the required span lengths
        if len(selected_indices) == sum(span_lengths):
            return sorted(list(selected_indices))

        # Relax the gap constraint after half the attempts
        if attempt == max_attempts // 2:
            gap = max(1, gap - 1)  # Reduce gap but not below 1

    raise ValueError("Failed to find non-overlapping spans with required gap after multiple attempts.")

In [275]:
def mask_input(input_data, mask_perc=0.15, noise_span_length=3):
    """
    Mask input text data by replacing a percentage of tokens with mask tags.

    Args:
    - input_data (object): Object containing input text data and additional information.
    - mask_perc (float): Percentage of tokens to mask. Default is 0.15 (15%).
    - noise_span_length (int): Desired average length of noise spans. Default is 3.

    Returns:
    - DataFrame: DataFrame containing masked sequences, masked tokens, and other information.
    """

    # Clean input text to ensure it doesn't contain unknown characters
    input_sequence = clean_input(input_data['text'])
    input_tokens = input_sequence.split()

    # Skip if sequence is too small
    if len(input_tokens) < 5:
        input_tokens = ""

    # Get the count of duplicate text (if any)
    duplicate_count = input_data['text_count']

    # Round number of masked tokens, at least 1 token masked
    n_masks = max(1, round(len(input_tokens) * mask_perc))

    try:
        if n_masks <= noise_span_length:
            span_lengths = [n_masks]
            n_noise_spans = 1
        else:
            n_noise_spans = int(round(n_masks / noise_span_length, 0))
            base_span_length = n_masks // n_noise_spans
            remaining_masks = n_masks % n_noise_spans

            # Initialize span lengths with the base span length
            span_lengths = [base_span_length] * n_noise_spans

            # Distribute remaining masks to achieve closer to the desired average span length
            for i in range(remaining_masks):
                span_lengths[i] += 1

        mask_tokens_indices = find_valid_spans(input_tokens, span_lengths)

        # Replace masked tokens with mask tags
        masked_input_tokens = []  # Full input tokens list containing masking tags
        masked_tokens = []        # List of masked tokens
        mask_id = 0
        i = 0
        while i < len(input_tokens):
            if i in mask_tokens_indices:
                # Find consecutive indices
                consecutive_indices = [i]
                while i + 1 < len(input_tokens) and i + 1 in mask_tokens_indices:
                    consecutive_indices.append(i + 1)
                    i += 1

                # Replace consecutive indices with a single mask token
                masked_input_tokens.append(f"<extra_id_{mask_id}>")
                masked_tokens.append([input_tokens[j] for j in consecutive_indices])
                mask_id += 1
                i += 1
            else:
                masked_input_tokens.append(input_tokens[i])
                i += 1

        n_masks_final = len(flatten(masked_tokens))

        # Verify if the final number of masked tokens matches the expected number
        if n_masks_final != n_masks:
            raise ValueError(f"Expected to mask {n_masks} tokens, but masked {n_masks_final} tokens.")

        input_sequence_masked = " ".join(masked_input_tokens)
        masked_tokens = [' '.join(tokens) for tokens in masked_tokens]

        print(input_sequence)
        print(masked_input_tokens)
        print(span_lengths)
        print(mask_tokens_indices)

    except Exception as e:
        print(f"Error during masking: {e}")
        # If an error occurs during masking, set values to None
        input_sequence = None
        input_sequence_masked = None
        masked_tokens = None
        n_masks = None
        duplicate_count = None

    # Initialize DataFrame to store masked sequences and other information
    masked_sequences_df_temp = None

    # Check if the DataFrame exists, if not, create it
    if masked_sequences_df_temp is None:
        masked_sequences_df_temp = pd.DataFrame(columns=["Input sequence (ground truth)", "Input sequence (masked)", "Masked tokens"])

    # Create a new row of data for the DataFrame
    new_data = {
        "Input sequence (ground truth)": [input_sequence],
        "Input sequence (masked)": [input_sequence_masked],
        "Masked tokens": [masked_tokens],
        "Number of masked tokens": [n_masks],
        "Masking percentage": [mask_perc*100],
        "Duplicate count": [duplicate_count]
    }

    # Add the new data to the DataFrame
    masked_sequences_df_temp = pd.DataFrame(new_data)

    return masked_sequences_df_temp

In [276]:
masked_sequences_df = None
data = data

for i in range(len(data)):
  print(i)
  masked_sequences_df_temp = mask_input(data.iloc[i], mask_perc=MASK_PERC, noise_span_length=3)
  # Add results to df
  masked_sequences_df = pd.concat([masked_sequences_df, masked_sequences_df_temp], ignore_index=True)

0
Met een spectaculaire reddingsoperatie wist het Isralische leger zaterdag vier gijzelaars te bevrijden. De prijs aan Palestijnse zijde is echter ongekend hoog. Wat betekent deze actie voor de ongeveer 120 gijzelaars die nog vastzitten? En waarom zwijgen de meeste wereldleiders? Vijf vragen beantwoord.
['Met', 'een', 'spectaculaire', 'reddingsoperatie', 'wist', 'het', 'Isralische', '<extra_id_0>', 'te', 'bevrijden.', 'De', 'prijs', 'aan', 'Palestijnse', 'zijde', 'is', 'echter', 'ongekend', 'hoog.', 'Wat', 'betekent', 'deze', 'actie', 'voor', 'de', 'ongeveer', '120', '<extra_id_1>', 'vastzitten?', 'En', 'waarom', '<extra_id_2>', 'wereldleiders?', '<extra_id_3>']
[4, 3, 3, 3]
[7, 8, 9, 10, 30, 31, 32, 36, 37, 38, 40, 41, 42]
1
Met een spectaculaire reddingsoperatie wist het Isralische leger zaterdag vier gijzelaars te bevrijden. De prijs aan Palestijnse zijde is echter ongekend hoog. Wat betekent deze actie voor de ongeveer 120 gijzelaars die nog vastzitten? En waarom zwijgen de meeste 

In [277]:
masked_sequences_df = masked_sequences_df.dropna(axis='rows')
masked_sequences_df

Unnamed: 0,Input sequence (ground truth),Input sequence (masked),Masked tokens,Number of masked tokens,Masking percentage,Duplicate count
0,Met een spectaculaire reddingsoperatie wist he...,Met een spectaculaire reddingsoperatie wist he...,"[leger zaterdag vier gijzelaars, gijzelaars di...",13,30.0,1
1,Met een spectaculaire reddingsoperatie wist he...,Met een spectaculaire <extra_id_0> Isralische ...,"[reddingsoperatie wist het, te bevrijden. De p...",13,30.0,1
2,Op 1 juni begint de Nederlander Hermen Hulst a...,Op 1 juni begint de Nederlander Hermen Hulst <...,"[als ceo van, Entertainment, de maker, Hij kri...",14,30.0,1
3,De liberaal-conservatieve Nieuw-Vlaamse Allian...,De liberaal-conservatieve Nieuw-Vlaamse <extra...,"[Alliantie (N-VA) blijft de, Belgische media o...",13,30.0,1
4,De elfde editie van de Willem IV Rally was een...,<extra_id_0> van de Willem IV Rally was een gr...,"[De elfde editie, het Ronald McDonald, Rotterd...",15,30.0,1
...,...,...,...,...,...,...
95,Zelden won een renner een van de drie grote ro...,Zelden won een renner een van de drie grote ro...,"[veel overmacht als, grotendeels zijn, die al ...",14,30.0,1
96,De van televisie bekende Henritte Momma is ein...,De van televisie bekende Henritte Momma is ein...,"[haar de afgelopen, zorgzame moeder van de, ku...",10,30.0,1
97,De laatste verkeerstuin van Nederland in zijn ...,De laatste verkeerstuin van Nederland in zijn ...,"[de gemeente Utrecht de, zonde, vinden de]",7,30.0,1
98,Nog nooit kregen zo weinig babys borstvoeding....,Nog nooit kregen zo <extra_id_0> Er wordt te <...,"[weinig babys borstvoeding., weinig ruimte voor]",6,30.0,1


In [278]:
masked_sequences_df.iloc[2]['Input sequence (masked)']

'Op 1 juni begint de Nederlander Hermen Hulst <extra_id_0> Sony Interactive <extra_id_1> van PlayStation. Voorheen produceerde Hulst de succesvolle gameseries Killzone en Horizon. <extra_id_2> taak om s werelds grootste gamebedrijf door een tumultueuze <extra_id_3> game-industrie <extra_id_4>'

In [279]:
masked_sequences_df.to_csv(f"..\\data\\{TYPE}\\masked_input\\{MASK_LEVEL_FOLDER}\\masked_sequences_{TITLE}.csv", index=False)