<a href="https://colab.research.google.com/github/ipietri/w266_Final_Project/blob/master/notebooks/RtGender-Notebooks/grey_scale_augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# For Greyscaling Data Augmentation 

References:
[Code Source](https://github.com/ainagari/scalar_adjs)

* [BERT Knows Punta Cana is not just beautiful, it’s gorgeous: Ranking Scalar Adjectives with Contextualised Representations](https://aclanthology.org/2020.emnlp-main.598.pdf)
* [Scalar Adjective Identification and Multilingual Ranking](https://arxiv.org/abs/2105.01180)
* [Identifying and Ordering Scalar Adjectives Using Lexical Substitution](https://www.proquest.com/openview/aade435a5bbdcf41e2b8c24e648826cc/1.pdf?pq-origsite=gscholar&cbl=18750)
* [A Gold Standard for Scalar Adjectives](https://aclanthology.org/L16-1424/)

In [None]:
import copy
import transformers
from transformers import BertTokenizer, BertConfig, BertModel, AutoTokenizer, AutoModel, FlaubertTokenizer, FlaubertModel, AutoConfig, FlaubertConfig
import pickle


In [None]:
# Language code for the run (presumably the language of the input text;
# it is not read anywhere in the visible part of this notebook).
language_str = "en"
# Whether to exclude the last BPE piece of a word when the word is split into
# multiple wordpieces.
# NOTE(review): this is the *string* "True", not the boolean True, so it is
# truthy even if edited to "False" — confirm how consumers compare it.
exclude_last_bpe ="True"
# Pretrained checkpoint name; special_tokenization also inspects this string
# to choose the sentence boundary tokens.
model_name = "bert-base-uncased"
# Tokenizer loaded for `model_name`, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

In [None]:
# from extract_representations.py
def special_tokenization(sentence, tokenizer, model_name):
    """Tokenize *sentence* word by word and record where each word landed.

    The sentence is split on whitespace; every word is passed to
    ``tokenizer.tokenize`` separately and the resulting pieces are
    concatenated between a start and an end marker (``<s>``/``</s>`` when
    "flaubert" occurs in *model_name*, ``[CLS]``/``[SEP]`` otherwise).

    Returns:
        A pair ``(tok_sent, map_ori_to_bert)``: the full list of tokens
        including both markers, and a list holding, for each whitespace
        word in order, the tuple of indices its pieces occupy in
        ``tok_sent``.
    """
    if "flaubert" in model_name:
        start_marker, end_marker = '<s>', '</s>'
    else:
        start_marker, end_marker = '[CLS]', '[SEP]'

    tok_sent = [start_marker]
    map_ori_to_bert = []

    for word in sentence.split():
        first_index = len(tok_sent)  # where this word's first piece will sit
        pieces = tokenizer.tokenize(word)
        tok_sent.extend(pieces)
        # Every word gets at least one index, even if it produced no pieces.
        span = max(len(pieces), 1)
        map_ori_to_bert.append(tuple(range(first_index, first_index + span)))

    tok_sent.append(end_marker)
    return tok_sent, map_ori_to_bert

## Augment

In [None]:
# Module-level accumulators used by augment_greyscaling: it appends to
# `infos`, `labels_list` and `augmented_text_list` and returns all four.
# They are created once here, so their contents persist across calls.
(infos, final_dict, labels_list, augmented_text_list) = ([], [], [], [])

def _replace_scale_word(words, position, scale_word):
    """Return a copy of *words* with ``words[position]`` set to *scale_word*.

    When the word directly before *position* is the article "a" or "an", it
    is switched so that it agrees with the first letter of *scale_word*.
    """
    replaced = list(words)
    # Only look at the preceding word when there is one: at position 0 the
    # index -1 would silently wrap around to the last word of the sentence.
    if position > 0:
        starts_with_vowel = scale_word.startswith(("a", "e", "i", "o", "u"))
        preceding = replaced[position - 1]
        if preceding == "a" and starts_with_vowel:
            replaced[position - 1] = "an"
        elif preceding == "an" and not starts_with_vowel:
            replaced[position - 1] = "a"
    replaced[position] = scale_word
    return replaced


def augment_greyscaling(dict_from_df, datanames, text_col, scale_col):
    """Create augmented texts by swapping scale words for their milder words.

    For every record in *dict_from_df*, every name in *datanames* and every
    word entry under ``record['new_col'][data_name]``, one augmented sentence
    is produced per item in that entry's ``'milder_words'``: the word at the
    entry's ``'position'`` is replaced and a preceding "a"/"an" is adjusted.
    Apostrophes are removed from the text before it is split on single spaces.

    Side effects (all on module-level objects):
      * a row ``[label, augmented_text]`` is appended to ``test_df``;
      * ``labels_list`` and ``augmented_text_list`` are extended;
      * one info dict per augmented sentence is appended to ``infos``.

    Args:
        dict_from_df: mapping of example id -> record; each record is indexed
            with *text_col*, *scale_col* and ``'new_col'``.
        datanames: iterable of keys to look up under ``record['new_col']``.
        text_col: record key holding the sentence text.
        scale_col: record key whose value is stored as the label of every
            augmented sentence.

    Returns:
        The module-level ``(labels_list, augmented_text_list, infos,
        final_dict)``. ``final_dict`` is returned untouched by this function.

    NOTE(review): ``scale`` and ``test_df`` are not defined in this function;
    they must exist at module level before it is called — confirm.
    """
    for example_id, record in dict_from_df.items():
        for data_name in datanames:
            for word_info in record['new_col'][data_name].values():
                milder_words = word_info['milder_words']
                if not milder_words:
                    # No milder words on this scale: nothing to generate.
                    continue

                # Assumes the word occurs at a single location in the text.
                position = word_info['position']
                original_text = record[text_col]
                label = record[scale_col]
                base_words = original_text.replace("'", "").split(" ")

                for scale_word in milder_words:
                    # Start from the original words each time so one
                    # replacement never leaks into the next.
                    example = ' '.join(
                        _replace_scale_word(base_words, position, scale_word)
                    )

                    test_df.loc[len(test_df)] = [label, example]
                    labels_list.append(label)
                    augmented_text_list.append(example)

                    bert_tokenized_sentence, mapp = special_tokenization(
                        example, tokenizer, model_name
                    )

                    # Build a fresh dict per augmented sentence; reusing one
                    # dict would make every entry in `infos` alias the last
                    # replacement.
                    # NOTE(review): `mapp` is built from a whitespace split
                    # while `position` indexes a split on single spaces; they
                    # agree only if the text has no repeated spaces — confirm.
                    infos.append({
                        text_col: copy.deepcopy(original_text),
                        "position": [position],
                        # Stored under the "id" key (previously the id value
                        # itself was used as the key).
                        "id": example_id,
                        "bert_tokenized_sentence": bert_tokenized_sentence,
                        "bert_position": mapp[position],
                        "scale": scale,
                        "lemma": scale_word,
                    })

    return labels_list, augmented_text_list, infos, final_dict
