## Compilation of data augmentation methods


In [7]:
import random
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import pandas as pd
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
!git clone https://github.com/jasonwei20/eda_nlp.git

Cloning into 'eda_nlp'...
remote: Enumerating objects: 396, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 396 (delta 45), reused 44 (delta 44), pack-reused 336[K
Receiving objects: 100% (396/396), 20.40 MiB | 12.52 MiB/s, done.
Resolving deltas: 100% (194/194), done.


In [1]:
!git clone https://github.com/linv24/da-stance-detection.git

Cloning into 'da-stance-detection'...
remote: Enumerating objects: 177, done.[K
remote: Counting objects: 100% (177/177), done.[K
remote: Compressing objects: 100% (159/159), done.[K
remote: Total 177 (delta 13), reused 175 (delta 11), pack-reused 0[K
Receiving objects: 100% (177/177), 770.34 KiB | 2.81 MiB/s, done.
Resolving deltas: 100% (13/13), done.
Filtering content: 100% (116/116), 3.53 GiB | 64.89 MiB/s, done.


In [8]:
test_data_path = '/content/drive/MyDrive/ASDA_Nov_2023[52]/datasets/hillary/train.tsv'
test_data = pd.read_csv(test_data_path, sep='\t')
print(test_data)

                                                 Tweet         Target 1  \
0    I just gave an unhealthy amount of my hard-ear...  Hillary Clinton   
1    @HillaryClinton Hillary pandering with her log...  Hillary Clinton   
2    @HomeOfUncleSam @ScotsFyre @RWNutjob1 @SA_Hart...  Hillary Clinton   
3    Because Communist Breadlines are not my thing!...  Hillary Clinton   
4    I want America to great again #WhyImNotVotingF...  Hillary Clinton   
..                                                 ...              ...   
580  2 people that are surprising? Trump &Fiorina. ...  Hillary Clinton   
581  So we're to believe that "Santa" is a hairstyl...  Hillary Clinton   
582  @oreillyfactor Who gives a rate ass what a Old...  Hillary Clinton   
583  @Ered604 welcome 2 Twitter~PROUD to be #UNITEB...  Hillary Clinton   
584  @kryptickaos_ I would have to agree, but I wou...  Hillary Clinton   

     Stance 1  
0           0  
1           0  
2           0  
3           0  
4           0  
.. 

##Synonym Replacement

Replace certain words with their synonyms

Synonyms come from wordnet

In [30]:
def get_pos_tag(word):
    tag = nltk.pos_tag([word])[0][1]
    return {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
        'V': wordnet.VERB
    }.get(tag[0], wordnet.NOUN)  # Default to NOUN for unrecognized tags

def get_synonyms(word, pos_tag):
    synonyms = []
    for syn in wordnet.synsets(word, pos=pos_tag):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name().replace('_', ' '))
    return synonyms

def replace_with_synonym(word, synonyms, replace_prob):
    if synonyms and random.random() < replace_prob:
        return random.choice(synonyms)
    return word

def augment_with_synonyms(text, replace_prob=0.):
    sentences = sent_tokenize(text)
    augmented_sentences = []
    for sentence in sentences:
        tokenized = word_tokenize(sentence)
        augmented_tokens = []
        for token in tokenized:
            pos_tag = get_pos_tag(token)
            synonyms = get_synonyms(token, pos_tag)
            augmented_tokens.append(replace_with_synonym(token, synonyms, replace_prob))
        augmented_sentence = ' '.join(augmented_tokens)
        augmented_sentences.append(augmented_sentence)
    return ' '.join(augmented_sentences)

In [31]:
# Example text for augmentation
original_text = "Create easily interpretable topics with Large Language Models — With the advent of Llama 2, running strong LLMs locally has become more and more a reality."

# Augment the text with synonyms

print("Original Text:", original_text)
count_of_generated_examples = 2
for gen in range(count_of_generated_examples):
    augmented_text = augment_with_synonyms(original_text)
    print(f"Augmented Text {gen+1}:", augmented_text,'\n')

Original Text: Create easily interpretable topics with Large Language Models — With the advent of Llama 2, running strong LLMs locally has become more and more a reality.
Augmented Text 1: Create well interpretable topics with Large linguistic process example — With the advent of llama 2 , running strong LLMs locally has become more and more A reality . 

Augmented Text 2: Create easy explainable topics with tumid Language model — With the advent of Llama 2 , running strong LLM locally make become more and more a reality . 



In [35]:
new_data = {'Tweet': [], 'Target 1': [], "Stance 1": []}

for row in test_data.itertuples():
  augmented = {row.Tweet}
  for i in range(2):
    #Choose replace prob here
    augmented_text = augment_with_synonyms(row.Tweet, 0.5)
    if augmented_text not in augmented:
      new_data["Target 1"].append(row._2)
      new_data["Stance 1"].append(row._3)
      new_data["Tweet"].append(augmented_text)
      augmented.add(augmented_text)
  # if random.random() < 0.1:
  #     print("Original Text: ", row.Tweet)
  #     print("Augmented Text: ", augmented)
augmented_data = pd.DataFrame(new_data)

final_data = pd.concat([test_data, augmented_data], ignore_index=True)

final_data.to_csv('fianl_sr_data.tsv', sep='\t', index=False)
print(test_data.shape)
print(final_data.shape)

(585, 3)
(1737, 3)


##EDA
Synonym Replacement + Random Deletion + Random Insertion + Random Swap

In [24]:
# takes in sem eval data (Tweet Target Stance) and converts into (Label Sentence) for eda
new_data = {'label': [], 'sentence': []}
label_to_ix = {}
ix_to_label = {}
ix = 0

for row in test_data.itertuples():
  targ_stance = (row._2, row._3) # tuple of stance and target
  if targ_stance not in label_to_ix:
    ix_to_label[ix] = targ_stance
    label_to_ix[targ_stance] = ix
    ix+=1
  new_data["sentence"].append(row.Tweet)
  new_data["label"].append(label_to_ix[targ_stance])
processed_data = pd.DataFrame(new_data)
processed_data.to_csv('processed_data.tsv', sep='\t', index=False)
print(processed_data)

     label                                           sentence
0        0  I just gave an unhealthy amount of my hard-ear...
1        0  @HillaryClinton Hillary pandering with her log...
2        0  @HomeOfUncleSam @ScotsFyre @RWNutjob1 @SA_Hart...
3        0  Because Communist Breadlines are not my thing!...
4        0  I want America to great again #WhyImNotVotingF...
..     ...                                                ...
580      2  2 people that are surprising? Trump &Fiorina. ...
581      2  So we're to believe that "Santa" is a hairstyl...
582      2  @oreillyfactor Who gives a rate ass what a Old...
583      2  @Ered604 welcome 2 Twitter~PROUD to be #UNITEB...
584      2  @kryptickaos_ I would have to agree, but I wou...

[585 rows x 2 columns]


In [25]:
!python eda_nlp/code/augment.py --input=./processed_data.tsv --output=augmented_data.tsv --num_aug=3 --alpha_sr=0.05 --alpha_rd=0.1 --alpha_ri=0.1 --alpha_rs=0.1

generated augmented sentences with eda for ./processed_data.tsv to augmented_data.tsv with num_aug=3


In [26]:
# augment = pd.read_csv('./augmented_data.tsv', sep='\t')
# print(augment.shape)
# print(test_data.shape)

(2343, 2)
(585, 3)


In [29]:
# reverts to original data format
augmented_data = pd.read_csv('./augmented_data.tsv', sep='\t')
final_eda_data = {'Tweet': [], 'Target 1': [], "Stance 1": []}

for row in augmented_data.itertuples():
  if (row.label == 'label'):
    continue
  target, stance = ix_to_label[int(row.label)]
  final_eda_data['Tweet'].append(row.sentence)
  final_eda_data['Target 1'].append(target)
  final_eda_data["Stance 1"].append(stance)
final_eda_data = pd.DataFrame(final_eda_data)
final_eda_data.to_csv('final_eda_data.tsv', sep='\t', index=False)
print(final_eda_data.shape)

(2340, 3)


##CBERT (CMLM) WIP

Use masking to predict words in tweets

In [40]:
!git clone https://github.com/1024er/cbert_aug.git

Cloning into 'cbert_aug'...
remote: Enumerating objects: 558, done.[K
remote: Total 558 (delta 0), reused 0 (delta 0), pack-reused 558[K
Receiving objects: 100% (558/558), 5.09 MiB | 15.02 MiB/s, done.
Resolving deltas: 100% (290/290), done.


In [41]:
test_data = pd.read_csv('/content/drive/MyDrive/ASDA_Nov_2023[52]/datasets/hillary/train.tsv', sep='\t')
print(test_data)

                                                 Tweet         Target 1  \
0    I just gave an unhealthy amount of my hard-ear...  Hillary Clinton   
1    @HillaryClinton Hillary pandering with her log...  Hillary Clinton   
2    @HomeOfUncleSam @ScotsFyre @RWNutjob1 @SA_Hart...  Hillary Clinton   
3    Because Communist Breadlines are not my thing!...  Hillary Clinton   
4    I want America to great again #WhyImNotVotingF...  Hillary Clinton   
..                                                 ...              ...   
580  2 people that are surprising? Trump &Fiorina. ...  Hillary Clinton   
581  So we're to believe that "Santa" is a hairstyl...  Hillary Clinton   
582  @oreillyfactor Who gives a rate ass what a Old...  Hillary Clinton   
583  @Ered604 welcome 2 Twitter~PROUD to be #UNITEB...  Hillary Clinton   
584  @kryptickaos_ I would have to agree, but I wou...  Hillary Clinton   

     Stance 1  
0           0  
1           0  
2           0  
3           0  
4           0  
.. 

In [45]:
new_data = {'sentence': [], 'label': []}
seen = {}
ix = 0
for row in test_data.itertuples():
  targ_stance = (row._2, row._3)
  if targ_stance not in seen:
    seen[targ_stance] = ix
    ix+=1
  new_data["sentence"].append(row.Tweet)
  new_data["label"].append(seen[targ_stance])
processed_data = pd.DataFrame(new_data)
processed_data.to_csv('processed_data.tsv', sep='\t', index=False)

dev 10% train 70% test 20%


##Back Translation WIP

Translate to a romance language then back to english

In [None]:
!pip install transformers
!pip install mosestokenizer
!pip install sentencepiece




In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Helper function to download data for a language
def download(model_name):
  tokenizer = MarianTokenizer.from_pretrained(model_name)
  model = MarianMTModel.from_pretrained(model_name)
  return tokenizer, model

# download model for English -> Romance
tmp_lang_tokenizer, tmp_lang_model = download('Helsinki-NLP/opus-mt-en-ROMANCE')
# download model for Romance -> English
src_lang_tokenizer, src_lang_model = download('Helsinki-NLP/opus-mt-ROMANCE-en')

(…)MANCE/resolve/main/tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

(…)us-mt-en-ROMANCE/resolve/main/source.spm:   0%|          | 0.00/779k [00:00<?, ?B/s]

(…)us-mt-en-ROMANCE/resolve/main/target.spm:   0%|          | 0.00/799k [00:00<?, ?B/s]

(…)us-mt-en-ROMANCE/resolve/main/vocab.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

(…)s-mt-en-ROMANCE/resolve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

(…)ANCE/resolve/main/generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

(…)CE-en/resolve/main/tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

(…)us-mt-ROMANCE-en/resolve/main/source.spm:   0%|          | 0.00/800k [00:00<?, ?B/s]

(…)us-mt-ROMANCE-en/resolve/main/target.spm:   0%|          | 0.00/779k [00:00<?, ?B/s]

(…)us-mt-ROMANCE-en/resolve/main/vocab.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

(…)s-mt-ROMANCE-en/resolve/main/config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

(…)E-en/resolve/main/generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
def translate(texts, model, tokenizer, language):
  """Translate texts into a target language"""
  # Format the text as expected by the model
  formatter_fn = lambda txt: f"{txt}" if language == "en" else f">>{language}<< {txt}"
  original_texts = [formatter_fn(txt) for txt in texts]

  # Tokenize (text to tokens)
  tokens = tokenizer.prepare_seq2seq_batch(original_texts)
  print("TOKENS:",  type(tokens))
  # Translate
  translated = model.generate(**tokens)

  # Decode (tokens to text)
  translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)

  return translated_texts

def back_translate(texts, language_src, language_dst):
  """Implements back translation"""
  # Translate from source to target language
  translated = translate(texts, tmp_lang_model, tmp_lang_tokenizer, language_dst)

  # Translate from target language back to source language
  back_translated = translate(translated, src_lang_model, src_lang_tokenizer, language_src)

  return back_translated

In [None]:
src_texts = np.array(['I might be late tonight', 'What a movie, so bad', 'That was very kind'])
back_texts = back_translate(src_texts, "en", "fr")

print(back_texts)
# ['I might be late tonight.', 'What a movie, so bad', 'That was very kind of you.']

TOKENS: <class 'transformers.tokenization_utils_base.BatchEncoding'>


AttributeError: ignored

##ASDA WIP

Method from [Target-Aware Data Augmentation for Stance Detection](https://aclanthology.org/2021.naacl-main.148.pdf)



```
Input :Training dataset Dtrain
Total training steps S
Auxiliary sentence A
Batch size B
Language model M
Proportion of sentence to mask p
1 for each i = 1, 2, ..., S do
  2 Batchi = ∅
    3 for each j = 1, 2, ..., B do
      4 Randomly sample a sentence s from
      Dtrain
      5 Randomly mask words of s with
      probability p to obtain sm
      6 Prepend the auxiliary sentence A
      that contains corresponding target
      and label information to the sm to
      obtain sˆ
      7 Batchi = Batchi ∪ {sˆ}
    8 end
  9 Fine-tune the language model M with
  Batchi
10 end
11 return M
```

