## Data Augmentation: Paraphrasing

In [None]:
import os
import sys

# Put the parent directory on sys.path so modules located there can be imported
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [1]:
from utils.database import *
from utils.files import *
from tqdm import tqdm
from bson import ObjectId
import pandas as pd 
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM
from datasets import load_from_disk, Dataset, ClassLabel, Value, Features
from huggingface_hub import InferenceClient
from transformers import BertTokenizer
import matplotlib.pyplot as plt
from utils.preprocessing import *
from utils.accelerators import *
from utils.multithreading import *
from utils.database import *
from utils.model import *
from utils.files import *
from datasets import Dataset
from tqdm import tqdm
import statistics
import hashlib
import random
import time
import math
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Topic to augment; it is interpolated into the dataset paths that are loaded and saved
# and passed to compile_prompt. Other values noted by the author: "energie", "kinder".
topic = "cannabis" #"energie" #"kinder" "cannabis"

## Get Predictions

### Load Model

In [3]:
# Load the tokenizer and the seq2seq model used for paraphrasing
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig

model_name = "CohereForAI/aya-101"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 8-bit weights are requested through an explicit quantization config: passing
# `load_in_8bit=True` directly to `from_pretrained` is deprecated in transformers.
# `device_map="auto"` leaves the placement of the weights to the library.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

Loading checkpoint shards: 100%|██████████| 11/11 [00:10<00:00,  1.00it/s]


In [4]:
# if torch.cuda.device_count() > 1:
#     print(f"Using {torch.cuda.device_count()} GPUs!")
#     model = torch.nn.DataParallel(model)

# # Move model to GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)

### Prompt Template

In [5]:
# Prompt sent to the model; `{webpage_text}` is the only placeholder it contains.
PARAPHRASE_TEMPLATE = (
    "Paraphrase the following text, retaining the original language, meaning, and topic. "
    "Please return only the paraphrased text without any additional content or formatting.\n"
    "\n"
    'Text: "{webpage_text}"\n'
    "\n"
    "Paraphrased Text:"
)

# Render the template once with placeholder text to check the layout
paraphrase_test = PARAPHRASE_TEMPLATE.format(
    webpage_text='Lorem ipsum dolor sit amet, consectetur adipiscing elit.'
)
print(paraphrase_test)


Paraphrase the following text, retaining the original language, meaning, and topic. Please return only the paraphrased text without any additional content or formatting.

Text: "Lorem ipsum dolor sit amet, consectetur adipiscing elit."

Paraphrased Text:


### Generation Parameters

In [6]:
# Keyword arguments forwarded unchanged to `model.generate`
params = dict(
    do_sample=True,
    early_stopping=True,
    # num_beam_groups=2,
    num_beams=2,
    num_return_sequences=1,
    max_new_tokens=1024,
    min_new_tokens=1,
    output_scores=True,
    # repetition_penalty=1.0,
    temperature=1.2,
    top_k=50,
    top_p=1.0,
)

### Load Dataset

In [7]:
# Load the pre-processed, chunked dataset for the selected topic from disk
# (the path is relative to the notebook's working directory).
# Alternative input path kept by the author:
#dataset = load_from_disk(f"../../data/tmp/processed_dataset_buff_{topic}_split_chunkified")
dataset = load_from_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_chunkified_random")

# Display the splits and their columns
dataset

DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'is_topic', 'label', 'chunk_id'],
        num_rows: 2651
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'is_topic', 'label', 'chunk_id'],
        num_rows: 295
    })
})

## Helper Functions

In [8]:
def compile_prompt(article, template, topic, lang = 'German'):
    """Fill a prompt template with the text of an article.

    Args:
        article: Mapping whose "text" entry holds the text to insert
            (looked up with `.get`, so a missing key yields None).
        template: Format string; it may reference `{topic}`, `{lang}` and
            `{webpage_text}`. Placeholders it does not use are simply ignored.
        topic: Value substituted for `{topic}`.
        lang: Value substituted for `{lang}`.

    Returns:
        The formatted prompt string.
    """
    substitutions = {
        "topic": topic,
        "lang": lang,
        "webpage_text": article.get("text"),
    }
    return template.format(**substitutions)

In [9]:
def calculate_input_length(prompt):
    """Return the number of tokens the prompt is encoded into.

    The notebook-level `tokenizer` is used without special tokens, truncation
    or padding; the size of dimension 1 of `input_ids` is taken as the length.
    """
    encoding = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
        truncation=False,
        padding=False,
    )
    return encoding.input_ids.size(1)

In [10]:
def generate_answers(model, tokenizer, prompt, params, remove_input=True):
    """Generate one or more completions for a prompt.

    Args:
        model: Object exposing `generate(input_ids, **params)`. If it has a
            `device` attribute the input ids are moved there; otherwise "cuda"
            is used, as before.
        tokenizer: Object exposing `encode`, `decode` and `model_max_length`.
        prompt: Prompt text to encode.
        params: Keyword arguments forwarded unchanged to `model.generate`.
        remove_input: If True, every occurrence of the decoded prompt is removed
            from each decoded output and the result is stripped. If False the
            decoded output is returned untouched.

    Returns:
        A list with one decoded string per generated sequence.
    """
    # Follow the model's own device instead of hard-coding "cuda", so the same
    # code also runs when the model was placed elsewhere (e.g. on CPU).
    device = getattr(model, "device", None)
    if device is None:
        device = "cuda"
    encoded_input = tokenizer.encode(prompt, return_tensors="pt").to(device)

    if encoded_input.size(1) > tokenizer.model_max_length:
        # The ids are passed on unchanged, so the message must not claim truncation.
        print("Input exceeds tokenizer.model_max_length; it is NOT truncated.")

    generated_outputs = model.generate(encoded_input, **params)
    # When `return_dict_in_generate=True` is among the params, the token ids
    # live in `.sequences`; a plain tensor result is used directly.
    sequences = getattr(generated_outputs, "sequences", generated_outputs)

    # Decoded prompt, used to cut the echoed input out of each output
    input_text_wo_st = tokenizer.decode(encoded_input[0], skip_special_tokens=True)

    outputs = []
    for sequence in sequences:
        decoded_text = tokenizer.decode(sequence, skip_special_tokens=True)
        if remove_input:
            decoded_text = decoded_text.replace(input_text_wo_st, "").strip()
        outputs.append(decoded_text)

    return outputs

In [11]:
# def parse_response(output_text):
#     """Determines if the model's output signifies "Yes" (1) or "No" (0)."""
#     text = output_text.lower()
#     return 1 if "yes" in text else 0 if "no" in text else ValueError("Ambiguous response.")


## Generate new Training Examples

### Test on an Example

In [12]:
def paraphrase(text, tokenizer, model, template, params):
    """Return a single paraphrase of `text`.

    The text is wrapped into a prompt via `compile_prompt` (using the
    notebook-level `topic`), sent through `generate_answers`, and the first
    generated answer is returned.
    """
    article = {"text": text}
    candidates = generate_answers(
        model,
        tokenizer,
        compile_prompt(article, template, topic),
        params,
    )
    return candidates[0]

In [13]:
# Smoke test: paraphrase one short German sentence with the loaded model and
# print the original next to the result.
#text = "Das hier ist ein Test."
text = "Der Konsum von Cannabis ist in Deutschland verboten."
paraphrased_text = paraphrase(text, tokenizer, model, PARAPHRASE_TEMPLATE, params)
print("Original text:", text)
print("Paraphrased text:", paraphrased_text)

Original text: Der Konsum von Cannabis ist in Deutschland verboten.
Paraphrased text: Cannabis ist in Deutschland verboten.


## Iterate over Training Dataset

In [14]:
# Fraction of the positive training examples that gets paraphrased.
# NOTE(review): the value in use is 0.004 (0.4 %), while the earlier comments in
# this cell spoke of 20 % (0.2) — confirm which one is intended before a full run.
SAMPLE_FRACTION = 0.004

# Keep only the positive examples (label == 1) of the training split
positive_examples = dataset['train'].filter(lambda example: example['label'] == 1)

# Shuffle with a fixed seed and take the first SAMPLE_FRACTION of the rows as the sample
positive_examples_shuffled = positive_examples.shuffle(seed=42)
num_samples = int(len(positive_examples_shuffled) * SAMPLE_FRACTION)
sampled_examples = positive_examples_shuffled.select(range(num_samples))

# Register the sample as its own split so it is stored together with train/test
dataset['positive_sampled'] = sampled_examples
dataset

DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'is_topic', 'label', 'chunk_id'],
        num_rows: 2651
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'is_topic', 'label', 'chunk_id'],
        num_rows: 295
    })
    positive_sampled: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'is_topic', 'label', 'chunk_id'],
        num_rows: 5
    })
})

In [15]:
def generate_new_data_points(text, n_examples=1):
    """Produce `n_examples` paraphrases of `text`.

    Each paraphrase is a separate call to `paraphrase` using the notebook-level
    tokenizer, model, prompt template and generation parameters.

    Returns:
        A list with `n_examples` generated strings.
    """
    new_texts = []
    for _ in range(n_examples):
        new_texts.append(paraphrase(text, tokenizer, model, PARAPHRASE_TEMPLATE, params))
    return new_texts

In [16]:
# Build one augmented split per n in 1..5, each holding n paraphrases per sampled example
for n in range(1, 6):
    print(f"Generating {n} new examples for each original example...")

    # Every paraphrase becomes a copy of its source record with only 'text' replaced
    expanded_examples = [
        {**example, 'text': new_text}
        for example in tqdm(sampled_examples)
        for new_text in generate_new_data_points(example['text'], n)
    ]

    # Turn the records into a Dataset and register it as split 'expanded_<n>'
    expanded_dataset = Dataset.from_pandas(pd.DataFrame(expanded_examples))
    dataset[f'expanded_{n}'] = expanded_dataset

    print(f"Completed generating {n} new examples for each original example.")

Generating 1 new examples for each original example...


100%|██████████| 5/5 [03:42<00:00, 44.43s/it]


Completed generating 1 new examples for each original example.
Generating 2 new examples for each original example...


100%|██████████| 5/5 [05:25<00:00, 65.19s/it] 


Completed generating 2 new examples for each original example.
Generating 3 new examples for each original example...


100%|██████████| 5/5 [12:06<00:00, 145.26s/it]


Completed generating 3 new examples for each original example.
Generating 4 new examples for each original example...


100%|██████████| 5/5 [11:03<00:00, 132.78s/it]


Completed generating 4 new examples for each original example.
Generating 5 new examples for each original example...


100%|██████████| 5/5 [15:24<00:00, 184.91s/it]

Completed generating 5 new examples for each original example.





In [None]:
# Show the first record of the 'expanded_1' split
dataset['expanded_1'][0]

In [17]:
# Show the first record of the 'expanded_1' split to inspect a generated example
dataset['expanded_1'][0]

{'_id': '64a0946b749484eec84ef27b',
 'batch_id': 16,
 'domain': 'mdr.de',
 'view_url': 'www.mdr.de/brisant/cannabis-legalisierung-278.html',
 'lang': 'de',
 'text': 'Menü Startseite Sendungen TV - Programm Live Mediathek Teletext Service Über uns Zur optimalen Darstellung unserer Webseite benötigen Sie Javascript. Bitte aktivieren sie dies in Ihrem Browser. Brisant Zur Brisant - Startseite Startseite Prominent Ratgeber Podcast Redaktion Service Neuer Bereich Rauschmittel Legalisierung von Cannabis - Das soll sich beim Besitz, Konsum und Kauf jetzt ändern Hauptinhalt Stand : 29. Juni 2023, 11 : 45 Uhr Die geplante Freigabe von Cannabis ist in Deutschland umstritten. Nun hat die Bundesregierung die teilweise Legalisierung der Droge auf den Weg gebracht.',
 'text_length': 10054,
 'word_count': 1340,
 'is_topic': True,
 'label': 1,
 'chunk_id': 0}

## Save Generated Training Examples

In [18]:
# Persist the whole DatasetDict (all splits it currently holds) under a topic-specific path.
# NOTE(review): the recorded output of this cell ends in
# "AttributeError: 'list' object has no attribute 'save_to_disk'", which suggests one of the
# splits was a plain list when the cell was last run. The loop that builds the 'expanded_<n>'
# splits now stores Dataset objects, so re-run the notebook to confirm the save completes.
dataset.save_to_disk(f"../../data/tmp/augmented_dataset_{topic}_paraphrasing")

Saving the dataset (1/1 shards): 100%|██████████| 2651/2651 [00:00<00:00, 234352.74 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 295/295 [00:00<00:00, 51188.14 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5/5 [00:00<00:00, 855.32 examples/s] 


AttributeError: 'list' object has no attribute 'save_to_disk'