In [1]:
# paraphrase english 10k
# paraphrase spanish 10k
# back-translation 10k
# adversarial 10k - random text
# perturbation 10k - swamp delete update
# english synonym 10k - nltk
# spanish synonym 10k - nltk
# concatenate 10k
# total 80k synthetic data

In [2]:
import pandas as pd
import os
from parrot import Parrot
import torch
import warnings
import random
import numpy as np
import tqdm
from transformers import pipeline
import string

# seed
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
   torch.cuda.manual_seed_all(seed)

warnings.filterwarnings("ignore")

device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
df = pd.read_parquet("exp-data/en-es-train-val.parquet")

In [4]:
df

Unnamed: 0,dataset,split,EN,ES,length
0,english-spanish-translator,train,Please answer me.,"Por favor, contestame.",7
1,english-spanish-translator,train,I didn't ask.,Yo no pregunt√©.,8
2,mustc-en-es-text-only,train,"Because after all, the people we are coupled t...","Porque despu√©s de todo, las personas con las q...",28
3,OPUS-books-EN-ES,train,"""Faith, that's true, Mr. Spilett,"" replied the...","‚ÄìSe√±or Spilett, tiene usted raz√≥n ‚Äìrespondi√≥ e...",23
4,wikipedia_en_es_m2m,train,"Greece has many islands, with estimates rangin...","Grecia tiene muchas islas, con estimaciones qu...",338
...,...,...,...,...,...
109995,corpus-en-es,val,We should not only be paying attention to road...,No s√≥lo debemos prestar atenci√≥n a la segurida...,42
109996,english-spanish-translator,val,I can't wait to be a father.,No puedo esperar a ser padre.,12
109997,OPUS-books-EN-ES,val,"""I am, then, your prisoner?""","¬øSoy, pues, vuestra prisionera?",15
109998,wikipedia_en_es_m2m,val,"The cryosphere (from the Greek kryos, ""cold"", ...","La criosfera (del griego kryos, ""fr√≠o"", ""gelad...",348


In [5]:
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
dfs = [df.iloc[i*10000:(i+1)*10000].reset_index(drop=True) for i in range(8)]
dfs[-1] = df.iloc[70_000:100_000].reset_index(drop=True)  # last chunk may be smaller due to length filtering

In [6]:
#Init models (make sure you init ONLY once if you integrate this to your code)
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5", use_gpu=True)
paraph_df = []
max_paraph_per_sentence = 2
with torch.no_grad():
    for i, row in tqdm.tqdm(dfs[0].iterrows(), total=len(dfs[0])):
        en_phrase = row['EN']
        para_phrases = parrot.augment(input_phrase=en_phrase, 
                                      max_return_phrases=5, 
                                      max_length=row['length'] + 5, 
                                      use_gpu=True)
        if para_phrases is None:
            para_phrases = []
        for k, para_phrase in enumerate(para_phrases):
            new_row = row.to_dict()
            para_phrase = para_phrase[0].strip()
            if para_phrase == en_phrase.strip() or para_phrase == '':
                continue
            new_row['synthetic'] = para_phrase
            new_row['method'] = 'paraphrase'
            new_row['target'] = 'EN'
            paraph_df.append(new_row)
            if k+1 >= max_paraph_per_sentence:
                break


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  0%|          | 0/10000 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [55:57<00:00,  2.98it/s] 


In [7]:
paraph_df = pd.DataFrame(paraph_df)
# if EN and EN-Paraphrase are the same, drop the row
paraph_df = paraph_df[paraph_df['EN'] != paraph_df['synthetic']].reset_index(drop=True)
# replace EN with synthetic
paraph_df['EN'] = paraph_df['synthetic']
paraph_df = paraph_df.drop(columns=['synthetic'])
paraph_df

Unnamed: 0,dataset,split,EN,ES,length,method,target
0,english-spanish-translator,train,it's awful weather today,Hoy hace un clima p√©simo.,12,paraphrase,EN
1,english-spanish-translator,train,the weather is horrible,Hoy hace un clima p√©simo.,12,paraphrase,EN
2,wikipedia_en_es_m2m,train,horsepower hp is a unit of measurement of powe...,La potencia de caballo (hp) es una unidad de m...,349,paraphrase,EN
3,wikipedia_en_es_m2m,train,horsepower hp is a unit of measurement of powe...,La potencia de caballo (hp) es una unidad de m...,349,paraphrase,EN
4,mustc-en-es-text-only,train,coconut shells crushed in a machine mixed with...,"Por ejemplo, las c√°scaras de coco trituradas e...",76,paraphrase,EN
...,...,...,...,...,...,...,...
9002,mustc-en-es-text-only,train,we can never go back to a place and find out e...,No podemos volver a un lugar y encontrarlo exa...,26,paraphrase,EN
9003,mustc-en-es-text-only,train,there may be more than three million muslims i...,En EE.UU. hay m√°s de 3 millones de musulmanes.,20,paraphrase,EN
9004,OPUS-books-EN-ES,train,i don't want to hide anything from you',No puedo ni quiero ocultarle nada...,15,paraphrase,EN
9005,mustc-en-es-text-only,train,ten days later i'm on call at the san francisc...,"Diez d√≠as m√°s tarde, estoy de guardia en el Ho...",47,paraphrase,EN


In [8]:
from transformers import pipeline
import torch
paraphraser_es = pipeline(
    task="text2text-generation",
    model="p-serna/mt5-small-spanish-paraphraser",
    tokenizer="p-serna/mt5-small-spanish-paraphraser",
    device=0 if torch.cuda.is_available() else -1,
)

def generate_spanish_paraphrases(texts, num_return_sequences=3, max_length=128):
    results = {}
    for text in texts:
        outputs = paraphraser_es(
            f"parafrasea: {text}",
            num_return_sequences=num_return_sequences,
            max_new_tokens=max_length,
            clean_up_tokenization_spaces=True,
            truncation=True,
            num_beams=num_return_sequences
        )
        results[text] = [item["generated_text"] for item in outputs]
    return results

spanish_paraphrases = generate_spanish_paraphrases(['¬øPuedes recomendar algunos restaurantes exclusivos en Nueva York?',])
spanish_paraphrases

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'¬øPuedes recomendar algunos restaurantes exclusivos en Nueva York?': ['¬øPuedes recomendar algunos restaurantes exclusivos en Nueva York?',
  '¬øC√≥mo puedo recomendar algunos restaurantes exclusivos en Nueva York?',
  '¬øPuedes recomendar algunos restaurantes exclusivos de Nueva York?']}

In [9]:
spanish_paraph_df = []
max_spanish_paraph_per_sentence = 2

for i, row in tqdm.tqdm(dfs[1].iterrows(), total=len(dfs[1])):
    es_phrase = row["ES"]
    spans = generate_spanish_paraphrases(
        [es_phrase], num_return_sequences=2, max_length=row['length'] + 5
    ).get(es_phrase, [])
    kept = 0
    for span in spans:
        candidate = span.strip()
        if not candidate or candidate == es_phrase.strip():
            continue
        new_row = row.to_dict()
        new_row["synthetic"] = candidate
        new_row["method"] = "spanish_paraphrase"
        new_row['target'] = 'ES'
        spanish_paraph_df.append(new_row)
        kept += 1
        if kept >= max_spanish_paraph_per_sentence:
            break

  0%|          | 9/10000 [00:01<32:14,  5.16it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 7401/10000 [55:19<37:51,  1.14it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [10]:
spanish_paraph_df = pd.DataFrame(spanish_paraph_df)
spanish_paraph_df = spanish_paraph_df[spanish_paraph_df["ES"] != spanish_paraph_df["synthetic"]].reset_index(drop=True)
# replace ES with synthetic
spanish_paraph_df["ES"] = spanish_paraph_df["synthetic"]
spanish_paraph_df = spanish_paraph_df.drop(columns=["synthetic"])
spanish_paraph_df

Unnamed: 0,dataset,split,EN,ES,length,method,target
0,mustc-en-es-text-only,train,"But, in 2007, Bhumika and Nepal's LGBT rights ...","Parafrasea: Pero, en 2007, la organizaci√≥n de ...",52,spanish_paraphrase,ES
1,mustc-en-es-text-only,train,"But, in 2007, Bhumika and Nepal's LGBT rights ...","Pero, en 2007, la organizaci√≥n de derechos LGB...",52,spanish_paraphrase,ES
2,OPUS-books-EN-ES,val,"It was the dog of the engineer, Cyrus Harding.","Parafrasea, era el perro del ingeniero Ciro Sm...",13,spanish_paraphrase,ES
3,OPUS-books-EN-ES,val,"It was the dog of the engineer, Cyrus Harding.",Parafrasea era el perro del ingeniero Ciro Smith.,13,spanish_paraphrase,ES
4,OPUS-books-EN-ES,train,Gideon Spilett was tall.,Parafrasea: Ged√≥n Spillett era alto y ten√≠a un...,21,spanish_paraphrase,ES
...,...,...,...,...,...,...,...
19113,wikipedia_en_es_m2m,train,Year 235 (CCXXXV) was a common year starting o...,"En ese momento, el a√±o 235 (CCXXXV) fue un a√±o...",304,spanish_paraphrase,ES
19114,mustc-en-es-text-only,train,(Laughter) I looked in the rear-view mirror an...,Parafrasea: S√© que suena como cualquier cosa p...,46,spanish_paraphrase,ES
19115,mustc-en-es-text-only,train,(Laughter) I looked in the rear-view mirror an...,Parafrasea: S√© que suena como cualquier cosa p...,46,spanish_paraphrase,ES
19116,mustc-en-es-text-only,train,That was the debate.,"Parafrasea, ese era el debate.",6,spanish_paraphrase,ES


In [11]:
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")

Device set to use cuda:0


In [12]:

back_translation_rows = []
batch_size = 32

k = 0
for start in tqdm.tqdm(range(0, len(dfs[2]), batch_size), desc="Back-translation ES‚ÜíEN"):
    batch = dfs[2].iloc[start:start + batch_size]
    translations = pipe(batch["ES"].tolist())
    for row, translated in zip(batch.to_dict("records"), translations):
        augmented = row.copy()
        augmented["synthetic"] = translated["translation_text"].strip()
        augmented["method"] = "back_translation"
        augmented["target"] = "EN"
        back_translation_rows.append(augmented)

Back-translation ES‚ÜíEN: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 313/313 [28:21<00:00,  5.44s/it]


In [13]:
back_translation_df = pd.DataFrame(back_translation_rows)
back_translation_df = back_translation_df[
    back_translation_df["synthetic"].str.strip().ne(back_translation_df["EN"].str.strip())
].reset_index(drop=True)

# replace EN with synthetic
back_translation_df["EN"] = back_translation_df["synthetic"]
back_translation_df = back_translation_df.drop(columns=["synthetic"])

back_translation_df.head()

Unnamed: 0,dataset,split,EN,ES,length,method,target
0,OPUS-books-EN-ES,train,"""Sure!"" replied Trifaldin, ""that in Candaya, n...","-¬°Claro est√°! -respondi√≥ Trifald√≠n-, que en Ca...",39,back_translation,EN
1,OPUS-books-EN-ES,val,"""Did he ever give any sign of having taken opi...",‚Äì‚Äì¬øAlguna vez dio se√±ales de haber tomado opio?,18,back_translation,EN
2,english-spanish-translator,train,You want to leave your bag in the trunk?,Quieres dejar tu bolso en el maletero?,14,back_translation,EN
3,OPUS-books-EN-ES,val,It seemed that the series of supernatural inci...,Parec√≠a que la serie de incidentes sobrenatura...,89,back_translation,EN
4,mustc-en-es-text-only,train,"Helping my children to be careful, healthy, an...","Ayudar a mis hijos a ser adultos cuidadosos, s...",34,back_translation,EN


In [15]:


NOISE_CHAR_POOL = (
    list(string.digits + string.punctuation)
    + ["‚ú®", "üî•", "üöÄ", "üí•", "üí°", "üåÄ", "üîí", "üß¨", "üå™Ô∏è", "üßä", "üï∂Ô∏è", "üéØ", "‚ö°", "üõ∞Ô∏è", "ü§ñ", "üß®", "üí¨", "üï≥Ô∏è",
       "¬ø", "¬°", "¬ß", "¬∂", "¬©", "‚Ñ¢", "Œ©", "Œª", "Œî", "Œ≤", "Êº¢", "Â≠ó", "Èõ∂", "Èæç", "\u200b"]
)


def random_fragment(min_len: int = 4, max_len: int = 16) -> str:
    length = random.randint(min_len, max_len)
    fragment = "".join(random.choice(NOISE_CHAR_POOL) for _ in range(length))
    if random.random() < 0.35:
        fragment = fragment * random.randint(2, 3)
    if random.random() < 0.25:
        fragment = f"{fragment}\n{random_fragment(2, 4)}"
    return fragment


def inject_random_segments(text: str, max_len: int = 4) -> str:
    words = text.split()
    max_len = min(max_len, len(text))
    max_len = min(max_len, len(text)// 3)
    if not words:
        return text
    rand_text = "".join([NOISE_CHAR_POOL[random.randint(0, len(NOISE_CHAR_POOL)-1)] for _ in range(random.randint(1, max_len))])
    # randomly add spaces in the rand_text
    rand_text_ = ""
    for ch in rand_text:
        rand_text_ += ch
        if random.random() < 0.3:
            rand_text_ += " "
    rand_words = rand_text_.strip().split()
    # randomly insert rand_words into words
    insert_positions = sorted(random.sample(range(len(words) + 1), len(rand_words)))
    for offset, (pos, rand_word) in enumerate(zip(insert_positions, rand_words)):
        words.insert(pos + offset, rand_word)
    return " ".join(words)
    


def wrap_words_with_repeated_chars(text: str, wrap_prob: float = 0.35) -> str:
    words = text.split()
    wrapped = []
    for word in words:
        if random.random() < wrap_prob:
            repeated = random.choice(NOISE_CHAR_POOL) * random.randint(1, 4)
            wrapped.append(f"{repeated} {word} {repeated}")
        else:
            wrapped.append(word)
    return " ".join(wrapped)

def randomize_whitespace(text: str, newline_prob: float = 0.3, tab_prob: float = 0.15) -> str:
    buffer = []
    for ch in text:
        if ch == " ":
            roll = random.random()
            if roll < newline_prob:
                buffer.append("\n")
            elif roll < newline_prob + tab_prob:
                buffer.append("\t")
            else:
                buffer.append(" " * random.randint(1, 3))
        else:
            buffer.append(ch)
    return "".join(buffer)


def interleave_noise_lines(text: str, max_lines: int = 3) -> str:
    lines = text.split("\n")
    max_insert = min(max_lines, len(lines) + 1)
    if max_insert == 0:
        return text
    inserts = random.randint(1, max_insert)
    positions = sorted(random.sample(range(len(lines) + 1), inserts))
    for offset, pos in enumerate(positions):
        lines.insert(pos + offset, random_fragment(1, 4))
    return "\n".join(lines)


def apply_adversarial_perturbations(text: str) -> str:
    if not isinstance(text, str) or not text.strip():
        return text
    operations = [
        lambda t: inject_random_segments(t, max_len=20),
        lambda t: wrap_words_with_repeated_chars(t, wrap_prob=0.1),
        lambda t: randomize_whitespace(t, newline_prob=0.1, tab_prob=0.1),
        lambda t: interleave_noise_lines(t, max_lines=2),
    ]
    random.shuffle(operations)
    perturbed = text
    try:
        perturbed = operations[0](perturbed)
    except:
        perturbed = text
    return perturbed


# def adversarialize_english_column(frame, source_col: str = "EN", target_col: str = "EN_adv", apply_prob: float = 0.6):
#     frame[target_col] = frame[source_col].apply(
#         lambda txt: apply_adversarial_perturbations(txt) if isinstance(txt, str) and random.random() < apply_prob else txt
#     )
#     return frame

# apply_adversarial_perturbations("This is a sample case of adversarial perturbation.")

for i, row in tqdm.tqdm(dfs[3].iterrows(), total=len(dfs[3])):
    en_phrase = row['EN']
    augmented_phrase = apply_adversarial_perturbations(en_phrase)
    new_row = row.to_dict()
    new_row['synthetic'] = augmented_phrase
    new_row['method'] = 'adversarial_perturbation'
    new_row['target'] = 'EN'
    dfs[3].at[i, 'synthetic'] = augmented_phrase
    dfs[3].at[i, 'method'] = 'adversarial_perturbation'
    dfs[3].at[i, 'target'] = 'EN'
    
adversarial_df = dfs[3]
# remove if EN and synthetic are the same
adversarial_df = adversarial_df[
    adversarial_df["EN"].str.strip().ne(adversarial_df["synthetic"].str.strip())
].reset_index(drop=True)

# replace EN with synthetic
adversarial_df["EN"] = adversarial_df["synthetic"]
adversarial_df = adversarial_df.drop(columns=["synthetic"])

adversarial_df

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 16846.92it/s]


Unnamed: 0,dataset,split,EN,ES,length,method,target
0,english-spanish-translator,train,She drank [ two glasses of wine at the party. 209,Ella se bebi√≥ dos vasos de vino en la fiesta.,18,adversarial_perturbation,EN
1,OPUS-books-EN-ES,train,"{-+üî•üå™Ô∏èüï≥ ""Queer spot to choose, is it not?"" sai...","-Extra√±a elecci√≥n para vivir, ¬øno es eso? -dij...",41,adversarial_perturbation,EN
2,english-spanish-translator,train,] Do Èõ∂üß¨ you have any /Œ≤ brothers? ?0ü§ñ,Ten√©s hermanos?,6,adversarial_perturbation,EN
3,mustc-en-es-text-only,train,‚ô´ üß®¬©üï∂Ô∏è^‚ú®¬∂ when black men ski.,‚ô´ cuando los hombres negros esqu√≠an.,15,adversarial_perturbation,EN
4,english-spanish-translator,train,Tom should 006üß¨Œª still be at work.,Tom todav√≠a deber√≠a estar en el trabajo.,16,adversarial_perturbation,EN
...,...,...,...,...,...,...,...
9309,english-spanish-translator,train,¬ø-\n¬ø3\nDo you have a twin sister?\n7üï≥Ô∏èŒ©\n$üí•6,Tienes una hermana gemela?,10,adversarial_perturbation,EN
9310,wikipedia_en_es_m2m,train,"John Adam Belushi (January 24, 1949 ‚Äì March 5,...",John Adam Belushi (24 de enero de 1949 ‚Äì 5 de ...,308,adversarial_perturbation,EN
9311,mustc-en-es-text-only,train,"The same adulation of Hitler, praise of the Th...","La misma admiraci√≥n por Hitler, alabanzas al T...",32,adversarial_perturbation,EN
9312,mustc-en-es-text-only,train,That is üéØ the change needed to Œî~+Èæç3üõ∞ restore ...,Ese es el cambio necesario para restaurar el b...,30,adversarial_perturbation,EN


In [16]:
dfs[3]

Unnamed: 0,dataset,split,EN,ES,length,synthetic,method,target
0,english-spanish-translator,train,She drank two glasses of wine at the party.,Ella se bebi√≥ dos vasos de vino en la fiesta.,18,She drank [ two glasses of wine at the party. 209,adversarial_perturbation,EN
1,OPUS-books-EN-ES,train,"""Queer spot to choose, is it not?"" said he as ...","-Extra√±a elecci√≥n para vivir, ¬øno es eso? -dij...",41,"{-+üî•üå™Ô∏èüï≥ ""Queer spot to choose, is it not?"" sai...",adversarial_perturbation,EN
2,english-spanish-translator,train,Do you have any brothers?,Ten√©s hermanos?,6,] Do Èõ∂üß¨ you have any /Œ≤ brothers? ?0ü§ñ,adversarial_perturbation,EN
3,mustc-en-es-text-only,train,‚ô´ when black men ski.,‚ô´ cuando los hombres negros esqu√≠an.,15,‚ô´ üß®¬©üï∂Ô∏è^‚ú®¬∂ when black men ski.,adversarial_perturbation,EN
4,english-spanish-translator,train,Tom should still be at work.,Tom todav√≠a deber√≠a estar en el trabajo.,16,Tom should 006üß¨Œª still be at work.,adversarial_perturbation,EN
...,...,...,...,...,...,...,...,...
9995,english-spanish-translator,train,Do you have a twin sister?,Tienes una hermana gemela?,10,¬ø-\n¬ø3\nDo you have a twin sister?\n7üï≥Ô∏èŒ©\n$üí•6,adversarial_perturbation,EN
9996,wikipedia_en_es_m2m,train,"John Adam Belushi (January 24, 1949 ‚Äì March 5,...",John Adam Belushi (24 de enero de 1949 ‚Äì 5 de ...,308,"John Adam Belushi (January 24, 1949 ‚Äì March 5,...",adversarial_perturbation,EN
9997,mustc-en-es-text-only,train,"The same adulation of Hitler, praise of the Th...","La misma admiraci√≥n por Hitler, alabanzas al T...",32,"The same adulation of Hitler, praise of the Th...",adversarial_perturbation,EN
9998,mustc-en-es-text-only,train,That is the change needed to restore energy ba...,Ese es el cambio necesario para restaurar el b...,30,That is üéØ the change needed to Œî~+Èæç3üõ∞ restore ...,adversarial_perturbation,EN


In [17]:
def swap_adjacent_chars(text: str) -> str:
    if not isinstance(text, str) or len(text) < 2:
        return text
    idx = random.randrange(len(text) - 1)
    chars = list(text)
    chars[idx], chars[idx + 1] = chars[idx + 1], chars[idx]
    return "".join(chars)


def delete_random_char(text: str) -> str:
    if not isinstance(text, str) or not text:
        return text
    idx = random.randrange(len(text))
    return text[:idx] + text[idx + 1:]


def replace_random_char(text: str) -> str:
    if not isinstance(text, str) or not text:
        return text
    idx = random.randrange(len(text))
    replacement_pool = string.ascii_letters + string.digits + string.punctuation
    replacement = random.choice(replacement_pool)
    return text[:idx] + replacement + text[idx + 1:]


def swap_adjacent_words(text: str) -> str:
    words = text.split()
    if len(words) < 2:
        return text
    idx = random.randrange(len(words) - 1)
    words[idx], words[idx + 1] = words[idx + 1], words[idx]
    return " ".join(words)


def delete_random_word(text: str) -> str:
    words = text.split()
    if not words:
        return text
    idx = random.randrange(len(words))
    del words[idx]
    return " ".join(words)


def replace_random_word(text: str) -> str:
    words = text.split()
    if len(words) < 2:
        return text
    idx = random.randrange(len(words))
    replacement = random.choice(words[:idx] + words[idx + 1:])
    words[idx] = replacement
    return " ".join(words)


CHAR_OPS = [swap_adjacent_chars, delete_random_char, replace_random_char]
WORD_OPS = [swap_adjacent_words, delete_random_word, replace_random_word]


def perturb_text(text: str, char_ratio: float = 0.5) -> str:
    if not isinstance(text, str):
        return text
    ops_pool = CHAR_OPS if random.random() < char_ratio else WORD_OPS
    op = random.choice(ops_pool)
    result = op(text)
    return result if result else text


# def augment_with_char_word_ops(frame, source_col="EN", n_samples=5, random_state=None):
#     sampled = frame.sample(n=min(n_samples, len(frame)), random_state=random_state).copy()
#     sampled["synthetic"] = sampled[source_col].apply(lambda txt: perturb_text(txt, char_ratio=0.6))
#     sampled = sampled[sampled["synthetic"].ne(sampled[source_col])]
#     sampled["method"] = "char_word_ops"
#     sampled["target"] = source_col
#     return sampled.reset_index(drop=True)


# char_word_augmented_df = augment_with_char_word_ops(df, source_col="EN", n_samples=20, random_state=seed)
# char_word_augmented_df.head()

# examples
# perturb_text("This is a sample text for perturbation.")
for i, row in tqdm.tqdm(dfs[4].iterrows(), total=len(dfs[4])):
    en_phrase = row['EN']
    augmented_phrase = perturb_text(en_phrase, char_ratio=0.6)
    new_row = row.to_dict()
    new_row['synthetic'] = augmented_phrase
    new_row['method'] = 'char_word_perturbation'
    new_row['target'] = 'EN'
    dfs[4].at[i, 'synthetic'] = augmented_phrase
    dfs[4].at[i, 'method'] = 'char_word_perturbation'
    dfs[4].at[i, 'target'] = 'EN'

dfs[4]
perturbation_df = dfs[4]
# remove if EN and synthetic are the same
perturbation_df = perturbation_df[
    perturbation_df["EN"].str.strip().ne(perturbation_df["synthetic"].str.strip())
].reset_index(drop=True)    
# replace EN with synthetic
perturbation_df["EN"] = perturbation_df["synthetic"]
perturbation_df = perturbation_df.drop(columns=["synthetic"])
perturbation_df

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 20887.88it/s]


Unnamed: 0,dataset,split,EN,ES,length,method,target
0,Document-Translation-en-es,train,"Liverpool has rejected an offer from Chelsea, ...",El Liverpool ha rechazado una oferta del Chels...,458,char_word_perturbation,EN
1,english-spanish-translator,train,I'm a thirty-year-ld unmarried woman.,Soy una mujer soltera de treinta a√±os.,16,char_word_perturbation,EN
2,OPUS-books-EN-ES,train,But what's of purpose of this refuge?,Pero ¬øpara qu√© este refugio?,12,char_word_perturbation,EN
3,corpus-en-es,train,This is why the placing of signs with informat...,"Por eso, la colocaci√≥n de carteles con informa...",70,char_word_perturbation,EN
4,mustc-en-es-text-only,train,The whole business of politics has been effect...,Todo este negocio de la pol√≠tica se ha abandon...,43,char_word_perturbation,EN
...,...,...,...,...,...,...,...
9927,english-spanish-translator,train,Experience is the name everyone gives to|their...,Experiencia es el nombre que todo el mundo le ...,21,char_word_perturbation,EN
9928,OPUS-books-EN-ES,train,"La brute qu‚Äôelle avait pour mari, et qui l‚Äôava...","Su est√∫pido marido, que la hab√≠a desposado sob...",96,char_word_perturbation,EN
9929,OPUS-books-EN-ES,train,is That very interesting.,Eso es muy interesante.,10,char_word_perturbation,EN
9930,corpus-en-es,train,"You have heard words of appreciation, from Mr ...","Ha escuchado usted palabras de agradecimiento,...",90,char_word_perturbation,EN


In [18]:
import nltk
from nltk.corpus import wordnet as wn

nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

def synonym_replacement(sentence, max_replacements=2):
    words = sentence.split()
    replaceable = [i for i, w in enumerate(words) if wn.synsets(w)]
    random.shuffle(replaceable)
    replaced = 0
    for idx in replaceable:
        syns = wn.synsets(words[idx])
        lemmas = {lemma.replace("_", " ") for syn in syns for lemma in syn.lemma_names()}
        lemmas.discard(words[idx])
        if not lemmas:
            continue
        words[idx] = random.choice(list(lemmas))
        replaced += 1
        if replaced >= max_replacements:
            break
    return " ".join(words)

english_synonym_rows = []
for i, row in tqdm.tqdm(dfs[5].iterrows(), total=len(dfs[5]), desc="EN synonym replacement"):
    original = row["EN"]
    augmented = synonym_replacement(original)
    if augmented.strip() == original.strip():
        continue
    new_row = row.to_dict()
    new_row["synthetic"] = augmented
    new_row["method"] = "english_synonym"
    new_row["target"] = "EN"
    english_synonym_rows.append(new_row)

english_synonym_df = pd.DataFrame(english_synonym_rows)
# remove if EN and synthetic are the same
english_synonym_df = english_synonym_df[
    english_synonym_df["EN"].str.strip().ne(english_synonym_df["synthetic"].str.strip())
].reset_index(drop=True)
# replace EN with synthetic
english_synonym_df["EN"] = english_synonym_df["synthetic"]
english_synonym_df = english_synonym_df.drop(columns=["synthetic"])
english_synonym_df.head()

EN synonym replacement: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:07<00:00, 1273.52it/s]


Unnamed: 0,dataset,split,EN,ES,length,method,target
0,OPUS-books-EN-ES,train,"This post was one of those, now far more numer...","Entonces decidi√≥ escuchar, observar, orientars...",69,english_synonym,EN
1,OPUS-books-EN-ES,train,"Have they non obey me?""",¬øNo se me ha obedecido?,10,english_synonym,EN
2,wikipedia_en_es_m2m,train,"In mathematics, a complete measure (or, more p...","En matem√°ticas, una medida completa (o, m√°s pr...",331,english_synonym,EN
3,OPUS-books-EN-ES,train,"""Silence!"" exclaim Felton; ""we constitute here.""",¬°Silencio! dijo Felton . Hemos llegado.,17,english_synonym,EN
4,mustc-en-es-text-only,train,It's the same principle: it's got O on one sid...,Sigue el mismo principio: toma el ox√≠geno por ...,58,english_synonym,EN


In [19]:
def spanish_synonym_replacement(sentence, max_replacements=2):
    words = sentence.split()
    replaceable = [i for i, w in enumerate(words) if wn.synsets(w, lang="spa")]
    random.shuffle(replaceable)
    replaced = 0
    for idx in replaceable:
        syns = wn.synsets(words[idx], lang="spa")
        lemmas = {
            lemma.replace("_", " ")
            for syn in syns
            for lemma in syn.lemma_names(lang="spa")
        }
        lemmas.discard(words[idx])
        if not lemmas:
            continue
        words[idx] = random.choice(list(lemmas))
        replaced += 1
        if replaced >= max_replacements:
            break
    return " ".join(words)

spanish_synonym_rows = []
for i, row in tqdm.tqdm(dfs[6].iterrows(), total=len(dfs[6]), desc="ES synonym replacement"):
    original = row["ES"]
    augmented = spanish_synonym_replacement(original)
    if augmented.strip() == original.strip():
        continue
    new_row = row.to_dict()
    new_row["synthetic"] = augmented
    new_row["method"] = "spanish_synonym"
    new_row["target"] = "ES"
    spanish_synonym_rows.append(new_row)

spanish_synonym_df = pd.DataFrame(spanish_synonym_rows)
spanish_synonym_df = spanish_synonym_df[
    spanish_synonym_df["ES"].str.strip().ne(spanish_synonym_df["synthetic"].str.strip())
].reset_index(drop=True)
# replace ES with synthetic
spanish_synonym_df["ES"] = spanish_synonym_df["synthetic"]
spanish_synonym_df = spanish_synonym_df.drop(columns=["synthetic"])

spanish_synonym_df.head()


ES synonym replacement: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:01<00:00, 8835.17it/s]


Unnamed: 0,dataset,split,EN,ES,length,method,target
0,OPUS-books-EN-ES,train,"No one, at least, can think I have not done en...","Al menos, nadie puede cogitaci√≥n que no he hec...",49,spanish_synonym,ES
1,mustc-en-es-text-only,train,What has also happened is in the course of the...,Lo que adem√°s ha sucedido durante estos viajes...,57,spanish_synonym,ES
2,mustc-en-es-text-only,train,Here's what I've learned from studying shocks ...,Esto he aprendido estudiando crisis y desastre...,48,spanish_synonym,ES
3,OPUS-books-EN-ES,train,"'How pleased I am to see you!' said Vronsky, a...","‚Äì¬°Cu√°nto celebro verte! ‚Äìdijo Vronsky, mostran...",45,spanish_synonym,ES
4,wikipedia_en_es_m2m,train,The Grand Union Canal in England is part of th...,El Canal de la Gran Uni√≥n en Inglaterra es par...,332,spanish_synonym,ES


In [20]:
# concatenate two random english phrases and output their spanish translation
def proc(row1, row2):
    if random.random() < 0.5:
        en = f"{row1['EN']} {row2['EN']}"
        es = f"{row1['ES']} {row2['ES']}"
    else:
        en = f"{row2['EN']} {row1['EN']}"
        es = f"{row2['ES']} {row1['ES']}"
    return pd.Series({"EN": en, "ES": es, "split": row1["split"], 'dataset': 'mixed'})

concat_rows = []
for i in tqdm.tqdm(range(len(dfs[7])), desc="Concatenate EN-ES pairs"):
    row1 = dfs[7].iloc[i]
    j = random.randint(0, len(dfs[7]) - 1)
    while j == i or dfs[7].iloc[j]["split"] != row1["split"]:
        j = random.randint(0, len(dfs[7]) - 1)
    row2 = dfs[7].iloc[j]
    new_row = proc(row1, row2).to_dict()
    new_row.update({
        'method': 'concatenate',
        'target': 'BOTH'
    })
    concat_rows.append(new_row)

concat_df = pd.DataFrame(concat_rows)
concat_df.head()

Concatenate EN-ES pairs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30000/30000 [00:03<00:00, 8920.30it/s]


Unnamed: 0,EN,ES,split,dataset,method,target
0,What time is your plane scheduled to take off?...,A qu√© hora sale tu avi√≥n? Entonces qu√© hacemos...,train,mixed,concatenate,BOTH
1,Hand me the wrench. It was lit only by the dim...,P√°same la llave. Iluminado solo por el tenue r...,train,mixed,concatenate,BOTH
2,Burnham had designed a railroad station that w...,Burnham hab√≠a dise√±ado una estaci√≥n de trenes ...,train,mixed,concatenate,BOTH
3,"Now, I'm not here to tell you what kind of leg...",No estoy aqu√≠ para decirles qu√© leyes deber√≠an...,val,mixed,concatenate,BOTH
4,Cet assassin inconnu proc√©dait d‚Äôune fa√ßon si ...,Hab√≠a un no s√© qu√© de met√≥dico e incomprensibl...,train,mixed,concatenate,BOTH


In [21]:
# collect all synthetic dataframes
synthetic_dfs = [
    paraph_df,
    spanish_paraph_df,
    back_translation_df,
    adversarial_df,
    perturbation_df,
    english_synonym_df,
    spanish_synonym_df,
    concat_df,
]
synthetic_df = pd.concat(synthetic_dfs).reset_index(drop=True)

In [22]:
synthetic_df

Unnamed: 0,dataset,split,EN,ES,length,method,target
0,english-spanish-translator,train,it's awful weather today,Hoy hace un clima p√©simo.,12.0,paraphrase,EN
1,english-spanish-translator,train,the weather is horrible,Hoy hace un clima p√©simo.,12.0,paraphrase,EN
2,wikipedia_en_es_m2m,train,horsepower hp is a unit of measurement of powe...,La potencia de caballo (hp) es una unidad de m...,349.0,paraphrase,EN
3,wikipedia_en_es_m2m,train,horsepower hp is a unit of measurement of powe...,La potencia de caballo (hp) es una unidad de m...,349.0,paraphrase,EN
4,mustc-en-es-text-only,train,coconut shells crushed in a machine mixed with...,"Por ejemplo, las c√°scaras de coco trituradas e...",76.0,paraphrase,EN
...,...,...,...,...,...,...,...
104975,mixed,train,"My hedge was begun and carried on, I believe, ...","Interrump√≠ las labores de inmediato y, para em...",,concatenate,BOTH
104976,mixed,train,I hate lawyers. Do you like pork loin?,Odio a los abogados. Le gusta el lomo de cerdo?,,concatenate,BOTH
104977,mixed,train,Gluten is a structural protein naturally found...,El gluten es una prote√≠na estructural que se e...,,concatenate,BOTH
104978,mixed,train,I should like to emphasise that there is no ro...,Me gustar√≠a resaltar que en este tema no hay l...,,concatenate,BOTH


In [23]:
synthetic_df['method'].unique()

array(['paraphrase', 'spanish_paraphrase', 'back_translation',
       'adversarial_perturbation', 'char_word_perturbation',
       'english_synonym', 'spanish_synonym', 'concatenate'], dtype=object)

In [24]:
synthetic_df['method'].value_counts()

method
concatenate                 30000
spanish_paraphrase          19118
char_word_perturbation       9932
english_synonym              9818
adversarial_perturbation     9314
paraphrase                   9007
spanish_synonym              8903
back_translation             8888
Name: count, dtype: int64

In [25]:
# pip install transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "HuggingFaceTB/SmolLM-135M"
device = "cuda" # for GPU usage or "cpu" for CPU usage
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [26]:
target_column_map = {"EN": "EN", "ES": "ES", "BOTH": "EN"}

dedup_groups = []
for (dataset_name, target_name), group in synthetic_df.groupby(["dataset", "target"], dropna=False):
    subset_col = target_column_map.get(target_name, "EN")
    dedup_groups.append(group.drop_duplicates(subset=subset_col).reset_index(drop=True))

xsynthetic_df = pd.concat(dedup_groups, ignore_index=True)

In [27]:
xsynthetic_df['method'].value_counts()

method
concatenate                 30000
spanish_paraphrase          19065
char_word_perturbation       9932
english_synonym              9818
adversarial_perturbation     9314
paraphrase                   8991
spanish_synonym              8902
back_translation             8883
Name: count, dtype: int64

In [28]:
xsynthetic_df

Unnamed: 0,dataset,split,EN,ES,length,method,target
0,Document-Translation-en-es,train,german midfielder daniel bierofka suffered a f...,El centrocampista alem√°n Daniel Bierofka sufri...,294.0,paraphrase,EN
1,Document-Translation-en-es,train,the german midfielder daniel bierofka suffered...,El centrocampista alem√°n Daniel Bierofka sufri...,294.0,paraphrase,EN
2,Document-Translation-en-es,train,at a time of maximum tension between the gener...,En un momento de m√°xima tensi√≥n entre la Gener...,477.0,paraphrase,EN
3,Document-Translation-en-es,train,listen to it on your phone if you have an ipho...,Esc√∫chalo en tu m√≥vi . Si tienes un iPhone o...,289.0,paraphrase,EN
4,Document-Translation-en-es,train,the recent events have again shown that realit...,Los recientes acontecimientos han demostrado (...,271.0,paraphrase,EN
...,...,...,...,...,...,...,...
104900,wikipedia_en_es_m2m,train,Smallfilms is a British television production ...,Smallfilms es una compa√±√≠a de producci√≥n de te...,370.0,spanish_synonym,ES
104901,wikipedia_en_es_m2m,train,A raster image processor (RIP) is a component ...,Un procesador de imagen de raster (RIP) es un ...,359.0,spanish_synonym,ES
104902,wikipedia_en_es_m2m,val,Events Pre-1600 1568 ‚Äì The Spanish Duke of Alb...,Eventos Pre-1600 1568 ‚Äì El duque espa√±ol de Al...,346.0,spanish_synonym,ES
104903,wikipedia_en_es_m2m,train,z/OS is a 64-bit operating system for IBM z/Ar...,z/OS es un sistema operativo de 64 bits para I...,303.0,spanish_synonym,ES


In [29]:
# apply length
template = "English: {english} Spanish: {spanish} <|END|>"
xsynthetic_df['length'] = xsynthetic_df.apply(
    lambda row: len(tokenizer.encode(
        template.format(english=row['EN'], spanish=row['ES']),
        return_tensors="pt",
    )[0]),
    axis=1
)

In [30]:
xsynthetic_df = xsynthetic_df[xsynthetic_df['length'] <= 512].reset_index(drop=True)


In [31]:
# number of samples per method
xsynthetic_df['method'].value_counts()

method
concatenate                 21887
spanish_paraphrase          17583
paraphrase                   8980
char_word_perturbation       8606
english_synonym              8473
back_translation             7867
adversarial_perturbation     7838
spanish_synonym              7560
Name: count, dtype: int64

In [32]:
# max sample per method
max_samples_per_method = 10_000
balanced_dfs = []
for method, group in xsynthetic_df.groupby('method'):
    if len(group) > max_samples_per_method:
        balanced_dfs.append(
            group.sample(n=max_samples_per_method, random_state=seed).reset_index(drop=True)
        )
    else:
        balanced_dfs.append(group.reset_index(drop=True))
balanced_synthetic_df = pd.concat(balanced_dfs).reset_index(drop=True)
balanced_synthetic_df['method'].value_counts()

method
concatenate                 10000
spanish_paraphrase          10000
paraphrase                   8980
char_word_perturbation       8606
english_synonym              8473
back_translation             7867
adversarial_perturbation     7838
spanish_synonym              7560
Name: count, dtype: int64

In [33]:
balanced_synthetic_df

Unnamed: 0,dataset,split,EN,ES,length,method,target
0,Document-Translation-en-es,train,"4¬∂\n*Êº¢\n5|ÈæçÂ≠ó\n27 people, aged between 16 and 7...","27 personas, de entre 16 y 79 a√±os, resultaron...",377,adversarial_perturbation,EN
1,Document-Translation-en-es,train,Tidying up your house\nhad never\tbeen so...,Ordenar tu casa nunca hab√≠a tenido tanta impor...,505,adversarial_perturbation,EN
2,Document-Translation-en-es,train,=‚ö°Èæç\nFifty-three people with arrest warrants w...,Cincuenta y tres personas con √≥rdenes de captu...,448,adversarial_perturbation,EN
3,Document-Translation-en-es,val,Two deer\tspecimens from the Citadel pack...,Dos ejemplares de ciervos de la manada de la C...,511,adversarial_perturbation,EN
4,Document-Translation-en-es,train,The\nWho will give a memorable concert fo...,The Who dar√° un conmemorable concierto para to...,407,adversarial_perturbation,EN
...,...,...,...,...,...,...,...
69319,wikipedia_en_es_m2m,train,"In chemistry and atomic physics, the main grou...","En qu√≠mica y f√≠sica at√≥mica, el grupo principa...",505,spanish_synonym,ES
69320,wikipedia_en_es_m2m,train,A Dungeon Master is the organizer of a Dungeon...,Dungeon Master adem√°s puede referirse a: Gamin...,484,spanish_synonym,ES
69321,wikipedia_en_es_m2m,train,Total war is warfare that includes any and all...,La guerra total es la guerra que incluye todos...,511,spanish_synonym,ES
69322,wikipedia_en_es_m2m,train,"In telecommunications, asynchronous operation ...","En telecomunicaciones, operaci√≥n as√≠ncrona o o...",271,spanish_synonym,ES


In [34]:
null_summary = df.isnull().any()
print(null_summary)
print(f"Any column contains nulls: {null_summary.any()}")

dataset    False
split      False
EN         False
ES         False
length     False
dtype: bool
Any column contains nulls: False


In [35]:
balanced_synthetic_df.to_parquet("exp-data/synthetic-en-es-data.parquet", index=False)

In [1]:
import pandas as pd
aug_df = pd.read_parquet("exp-data/synthetic-en-es-data.parquet")

In [2]:
aug_df

Unnamed: 0,dataset,split,EN,ES,length,method,target
0,Document-Translation-en-es,train,"4¬∂\n*Êº¢\n5|ÈæçÂ≠ó\n27 people, aged between 16 and 7...","27 personas, de entre 16 y 79 a√±os, resultaron...",377,adversarial_perturbation,EN
1,Document-Translation-en-es,train,Tidying up your house\nhad never\tbeen so...,Ordenar tu casa nunca hab√≠a tenido tanta impor...,505,adversarial_perturbation,EN
2,Document-Translation-en-es,train,=‚ö°Èæç\nFifty-three people with arrest warrants w...,Cincuenta y tres personas con √≥rdenes de captu...,448,adversarial_perturbation,EN
3,Document-Translation-en-es,val,Two deer\tspecimens from the Citadel pack...,Dos ejemplares de ciervos de la manada de la C...,511,adversarial_perturbation,EN
4,Document-Translation-en-es,train,The\nWho will give a memorable concert fo...,The Who dar√° un conmemorable concierto para to...,407,adversarial_perturbation,EN
...,...,...,...,...,...,...,...
69319,wikipedia_en_es_m2m,train,"In chemistry and atomic physics, the main grou...","En qu√≠mica y f√≠sica at√≥mica, el grupo principa...",505,spanish_synonym,ES
69320,wikipedia_en_es_m2m,train,A Dungeon Master is the organizer of a Dungeon...,Dungeon Master adem√°s puede referirse a: Gamin...,484,spanish_synonym,ES
69321,wikipedia_en_es_m2m,train,Total war is warfare that includes any and all...,La guerra total es la guerra que incluye todos...,511,spanish_synonym,ES
69322,wikipedia_en_es_m2m,train,"In telecommunications, asynchronous operation ...","En telecomunicaciones, operaci√≥n as√≠ncrona o o...",271,spanish_synonym,ES


In [4]:
aug_df['split'].value_counts(normalize=True)

split
train    0.908733
val      0.091267
Name: proportion, dtype: float64

In [5]:
train_proc_df = pd.read_parquet("exp-data/en-es-train-val.parquet")

In [6]:
train_proc_df

Unnamed: 0,dataset,split,EN,ES,length
0,english-spanish-translator,train,I heard it thunder in the distance.,Escuch√© truenos en la lejan√≠a.,28
1,english-spanish-translator,train,The divers were trapped in the cave.,Los buzos quedaron atrapados en la caverna.,32
2,english-spanish-translator,train,He told me that he was very tired then.,√âl me dijo que estaba muy cansado en ese momento.,37
3,mustc-en-es-text-only,train,"From a governmental perspective, from a social...",Aspiramos a que todos tengan acceso a la energ...,68
4,mustc-en-es-text-only,train,"But there's a problem with this, and that is, ...","Pero hay un problema con esto, y es que los se...",100
...,...,...,...,...,...
109995,english-spanish-translator,val,Stand up for yourself.,Lev√°ntate por ti mismo.,23
109996,mustc-en-es-text-only,val,"I pause and I think, how could all of this go ...","Me detengo y pienso, ¬øc√≥mo podr√≠a todo esto te...",80
109997,english-spanish-translator,val,I guarantee I'll get you a job.,Te aseguro que te conseguir√© un empleo.,34
109998,mustc-en-es-text-only,val,And I think computers today are doing quite th...,Y creo que las computadoras de hoy est√°n hacie...,42
