# **Data augmentation: POS-driven method (replacing adj+adv using BERT)**

In [None]:
# Mount Google Drive at /content/drive; the pickled data sets opened later in this
# notebook are read from (and written to) paths under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle
import numpy as np

from nltk.tag.stanford import StanfordPOSTagger
from nltk.tokenize import word_tokenize

from nltk.corpus import wordnet

from lxml import html
import requests

# Install word tokenizer:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Install POS-tagger:
!wget --no-check-certificate 'https://nlp.stanford.edu/software/stanford-tagger-4.2.0.zip'
!unzip stanford-tagger-4.2.0.zip

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


--2024-06-17 09:30:48--  https://nlp.stanford.edu/software/stanford-tagger-4.2.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://downloads.cs.stanford.edu/nlp/software/stanford-tagger-4.2.0.zip [following]
--2024-06-17 09:30:48--  https://downloads.cs.stanford.edu/nlp/software/stanford-tagger-4.2.0.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78034596 (74M) [application/zip]
Saving to: ‘stanford-tagger-4.2.0.zip’


2024-06-17 09:31:00 (6.89 MB/s) - ‘stanford-tagger-4.2.0.zip’ saved [78034596/78034596]

Archive:  stanford-tagger-4.2.0.zip
   creating: stanford-postagger-full-2020-11-17/
  inflating: stanford-postagger-full-2020-1

In [None]:
!pip install -U tensorflow==2.15

Collecting tensorflow==2.15
  Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.16,>=2.15 (from tensorflow==2.15)
  Downloading tensorboard-2.15.2-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow-estimator<2.16,>=2.15.0 (from tensorflow==2.15)
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl (441 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.0/442.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras<2.16,>=2.15.0 (from tensorflow==2.15)
  Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting

In [None]:
from transformers import pipeline

## **Load the data**

Load train set:

In [None]:
# Load the pickled training set. The context manager closes the file even when
# unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code - only load files that come
# from a trusted source.
with open("drive/MyDrive/data/7-classes/train_set_paragraphs.pkl","rb") as f_in:
#with open("drive/MyDrive/data/7-classes/train_set_sentences.pkl","rb") as f_in:
  data_train = pickle.load(f_in)

Classes to augment:

In [None]:
# Keep the (text, label) pair of every training record whose label belongs to the
# classes chosen for augmentation.
# Alternative selections (previously kept as commented-out lines): class 0 only,
# or classes 0, 2, 3, 4 and 6.
data_train_positive = [
  (record[1], record[2])
  for record in data_train
  if record[2] in (0, 3, 6)
]

In [None]:
# Preview the first five selected (text, label) pairs.
data_train_positive[:5]

[('Questionnaire for arable farmers/ horticulture businesses/ plant production / food processors (= suppliers of residual streams)\n\nBlock “Introduction and ice breaker”\n\nWhich type of residue stream (apple, tomato, grape, potatoes, or brewer’s grains) do you deal with in your company?\n\nOnly with residues of the potato processing stream.',
  6),
 ('Questionnaire for arable farmers/ horticulture businesses/ plant production / food processors (= suppliers of residual streams)\n\nBlock “strengths and challenges”\n\nPlease describe your challenges regarding the valorization of your agricultural by-products from apple-/tomato-/grape-/potato processing /brewer’s grains  as of today, e.g.:\n\nSmells: Same',
  0),
 ('Questionnaire for arable farmers/ horticulture businesses/ plant production / food processors (= suppliers of residual streams)\n\nBlock “current structures”\n\nHow satisfied are you with your current valorization of apple-/tomato-/grape-/potato processing/brewer’s grains? Wh

In [None]:
# Number of segments selected for augmentation.
len(data_train_positive)

247

## **Perform augmentation**

### **POS-driven method (adj+adv): replace all adjectives and adverbs in each segment using masked word prediction**

Experiment (augmentation) name:

In [None]:
# Identifier of this augmentation run; it becomes part of the output pickle's file name.
experiment: str = '19'

Number of augmented variants to generate per segment (`k`); variant *j* uses the *j*-th predicted word for every mask:

In [None]:
# Number of predictions kept per masked word (the first k returned by the unmasker),
# and therefore the number of augmented variants built from each segment.
k: int = 1

Define POS-tagger:

In [None]:
# Locations of the tagger model and jar inside the archive extracted under /content.
tagger_model = '/content/stanford-postagger-full-2020-11-17/models/english-left3words-distsim.tagger'
tagger_jar = '/content/stanford-postagger-full-2020-11-17/stanford-postagger-4.2.0.jar'

# POS tagger used below to find the adjectives and adverbs of every segment.
st = StanfordPOSTagger(tagger_model, tagger_jar, encoding='utf-8')

Perform POS tagging, then mask every adjective and adverb (tokens tagged `JJ` or `RB`) in each segment:

In [None]:
# Tag every selected segment and build its masked version: each token tagged 'JJ'
# or 'RB' becomes '[MASK]'; every other token is kept and followed by one space.
# classified_segments collects (tagged tokens, label) pairs and masked_segments
# collects (masked text, label) pairs, both in the order of data_train_positive.
classified_segments = []
masked_segments = []
for i, (segment_text, segment_label) in enumerate(data_train_positive):
  tokenized_text = word_tokenize(segment_text, language='english')
  classified_text = st.tag(tokenized_text)
  masked_text = ""
  for word, tag in classified_text:
    if tag in ('JJ', 'RB'):
      masked_text += '[MASK] '
    elif word in ("’", "."):
      # Attach the sign to the previous token. Only a separating space is removed:
      # unconditionally cutting the last character would delete a real character
      # whenever the previous token was itself "’" or "." (no trailing space).
      if masked_text.endswith(" "):
        masked_text = masked_text[:-1]
      masked_text += word
    else:
      masked_text += word + " "
  classified_segments.append((classified_text, segment_label))
  masked_segments.append((masked_text, segment_label))
  if i % 10 == 0:
    print("Process",i,"segment")

Process 0 segment
Process 10 segment
Process 20 segment
Process 30 segment
Process 40 segment
Process 50 segment
Process 60 segment
Process 70 segment
Process 80 segment
Process 90 segment
Process 100 segment
Process 110 segment
Process 120 segment
Process 130 segment
Process 140 segment
Process 150 segment
Process 160 segment
Process 170 segment
Process 180 segment
Process 190 segment
Process 200 segment
Process 210 segment
Process 220 segment
Process 230 segment
Process 240 segment


In [None]:
# Preview the first five (masked text, label) pairs.
masked_segments[:5]

[('Questionnaire for [MASK] farmers/ horticulture businesses/ plant production / food processors ( = suppliers of [MASK] streams ) Block “ Introduction and ice breaker ” Which type of residue stream ( apple , tomato , grape , potatoes , or brewer’s grains ) do you deal with in your company ? [MASK] with residues of the potato processing stream.',
  6),
 ('Questionnaire for [MASK] farmers/ horticulture businesses/ plant production / food processors ( = suppliers of [MASK] streams ) Block “ strengths and challenges ” Please describe your challenges regarding the valorization of your [MASK] by-products from [MASK] processing /brewer’s grains as of today , e.g.: Smells : [MASK] ',
  0),
 ('Questionnaire for [MASK] farmers/ horticulture businesses/ plant production / food processors ( = suppliers of [MASK] streams ) Block “ [MASK] structures ” How [MASK] are you with your [MASK] valorization of [MASK] processing/brewer’s grains ? Why ? We have got our [MASK] biogas plant since 1980 and we a

Predict masked words in the resulting sentences:

In [None]:
# Checkpoint used for both the masked-language model and its tokenizer.
model_name = "bert-base-uncased"

# Fill-mask pipeline that proposes replacements for the [MASK] placeholders.
bert_unmasker = pipeline(
  task="fill-mask",
  model=model_name,
  tokenizer=model_name,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def generateCombinations(masked_text, top_k=None, unmasker=None):
  """Return the candidate replacement tokens for every [MASK] in ``masked_text``.

  Args:
    masked_text: text containing one or more ``[MASK]`` placeholders.
    top_k: number of candidates to request and keep per mask. Defaults to the
      notebook-level ``k``.
    unmasker: callable invoked as ``unmasker(masked_text, top_k=...)``. Defaults to
      the notebook-level ``bert_unmasker`` fill-mask pipeline.

  Returns:
    A list with one entry per mask, in the order returned by the unmasker. Each
    entry is a list of at most ``top_k`` strings taken from the ``token_str`` field
    of the predictions.
  """
  if top_k is None:
    top_k = k
  if unmasker is None:
    unmasker = bert_unmasker

  # Ask explicitly for top_k candidates. Without it the pipeline returns its own
  # default number of predictions, so a larger k would leave too few candidates
  # and the per-variant indexing done by the caller would raise IndexError.
  predicted_words = unmasker(masked_text, top_k=top_k)

  # With a single mask the result is a flat list of prediction dicts rather than
  # one list per mask; wrap it so both cases share the same shape.
  if predicted_words and isinstance(predicted_words[0], dict):
    predicted_words = [predicted_words]

  return [
    [prediction['token_str'] for prediction in word_candidates[:top_k]]
    for word_candidates in predicted_words
  ]

def maskedPhrase2NewText(generated_combinations,combination_id,classified_phrase):
  """Rebuild a segment with its 'JJ'/'RB' tokens replaced by generated words.

  Args:
    generated_combinations: one list of candidate words per 'JJ'/'RB' token of
      ``classified_phrase``, in token order.
    combination_id: index of the candidate to take from each of those lists.
    classified_phrase: iterable of ``(word, tag)`` pairs.

  Returns:
    The tokens joined into one string. Every token is followed by a single space,
    except "’" and ".", which are attached to the preceding token and get no
    trailing space.

  Raises:
    IndexError: if a candidate list is missing or shorter than ``combination_id + 1``.
  """
  predicted_text = ""
  word_id = 0  # position of the next 'JJ'/'RB' token within generated_combinations
  for word, tag in classified_phrase:
    if tag in ('JJ', 'RB'):
      predicted_text += generated_combinations[word_id][combination_id] + " "
      word_id += 1
    elif word in ("’", "."):
      # Remove the separating space only when there is one. Cutting the last
      # character unconditionally deleted a real character whenever the previous
      # token was itself "’" or "." (e.g. "end" "." "’" became "end’").
      if predicted_text.endswith(" "):
        predicted_text = predicted_text[:-1]
      predicted_text += word
      # NOTE(review): nothing is inserted after ".", so consecutive sentences end
      # up glued ("end.Next"); confirm whether a space should follow the period.
    else:
      predicted_text += word + " "

  return predicted_text

In [None]:
# Build k augmented variants of every masked segment from the predicted words.
new_training_set = []
count_without_mask = 0   # segments skipped: no mask at all, or max_masks masks and more
count_already_exist = 0  # variants dropped because they reproduce the source segment
max_masks = 50           # segments with this many masks or more are not augmented

for i, (masked_text, segment_label) in enumerate(masked_segments):
  nb_combinations = masked_text.count("[MASK]")
  if 0 < nb_combinations < max_masks:
    tagged_tokens = classified_segments[i][0]
    generated_combinations = generateCombinations(masked_text)
    # Rebuild the source segment with exactly the joining rules used for the
    # variants. Comparing against the raw text (newlines deleted) practically never
    # matched, because the raw text has different spacing around punctuation, so
    # unchanged variants were added as if they were new.
    original_words = [[word] for word, tag in tagged_tokens if tag in ('JJ', 'RB')]
    original_phrase = maskedPhrase2NewText(original_words, 0, tagged_tokens)
    # NOTE(review): the comparison is case-sensitive; confirm whether variants that
    # differ from the source only by letter case should count as new.
    for j in range(k):
      new_phrase = maskedPhrase2NewText(generated_combinations, j, tagged_tokens)
      if new_phrase.strip() != original_phrase.strip():
        new_training_set.append((new_phrase, segment_label))
      else:
        count_already_exist += 1
  else:
    count_without_mask += 1
  if i % 10 == 0:
    print("Process",i,"segment")

Process 0 segment
Process 10 segment
Process 20 segment
Process 30 segment
Process 40 segment
Process 50 segment
Process 60 segment
Process 70 segment
Process 80 segment
Process 90 segment
Process 100 segment
Process 110 segment
Process 120 segment
Process 130 segment
Process 140 segment
Process 150 segment
Process 160 segment
Process 170 segment
Process 180 segment
Process 190 segment
Process 200 segment
Process 210 segment
Process 220 segment
Process 230 segment
Process 240 segment


Generated phrases:

In [None]:
# Preview the first five generated (text, label) pairs.
new_training_set[:5]

[('Questionnaire for potato farmers/ horticulture businesses/ plant production / food processors ( = suppliers of residue streams ) Block “ Introduction and ice breaker ” Which type of residue stream ( apple , tomato , grape , potatoes , or brewer’s grains ) do you deal with in your company ? deal with residues of the potato processing stream.',
  6),
 ('Questionnaire for food farmers/ horticulture businesses/ plant production / food processors ( = suppliers of food streams ) Block “ strengths and challenges ” Please describe your challenges regarding the valorization of your food by-products from food processing /brewer’s grains as of today , e.g.: Smells : ; ',
  0),
 ('Questionnaire for small farmers/ horticulture businesses/ plant production / food processors ( = suppliers of food streams ) Block “ of structures ” How successful are you with your waste valorization of the processing/brewer’s grains ? Why ? We have got our own biogas plant since 1980 and we are a a farmers and the v

Create new segments:

In [None]:
# Turn every generated (text, label) pair into a 3-field record with -1 in the
# first field (presumably mirroring the layout of the data_train records - confirm),
# then append the new records to the original training set.
data_new = [(-1, new_text, new_label) for new_text, new_label in new_training_set]

data_augmented = data_train + data_new

Some stats:

In [None]:
# Number of generated records per class label (0 to 6), followed by the total.
for class_id in range(7):
  class_size = sum(1 for record in data_new if record[2] == class_id)
  print('Class ' + str(class_id) + ':', class_size)
print('Total:', len(data_new))

Class 0: 126
Class 1: 0
Class 2: 0
Class 3: 96
Class 4: 0
Class 5: 0
Class 6: 24
Total: 246


In [None]:
# Size of the augmented training set (original records plus generated ones).
len(data_augmented)

777

Save results:

In [None]:
# Write the augmented training set. The context manager closes the file even when
# pickling raises, so no handle is leaked and buffered data is not left unflushed.
with open("drive/MyDrive/data/7-classes/train_set_paragraphs_augm-"+experiment+".pkl","wb") as f_out:
#with open("drive/MyDrive/data/7-classes/train_set_sentences_augm-"+experiment+".pkl","wb") as f_out:
  pickle.dump(data_augmented,f_out)