# Prepare Dataset for Processing

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from datasets import Dataset, load_from_disk
from transformers import AutoTokenizer
from multiprocessing import Pool
from utils.preprocessing import *
from utils.database import *
from utils.files import *
import transformers

  from .autonotebook import tqdm as notebook_tqdm


The code `os.environ["TOKENIZERS_PARALLELISM"] = "false"` disables parallel tokenization in HuggingFace's libraries. It's a way to suppress warnings and prevent potential issues tied to multi-core tokenization.
See: https://stackoverflow.com/questions/62691279/how-to-disable-tokenizers-parallelism-true-false-warning

In [2]:
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# transformers.utils.logging.set_verbosity_error()

## Import Raw Dataset

In [3]:
dataset = load_from_disk('../data/input/articles')

In [4]:
describeDataset(dataset)

Number of rows: 50
Column names: ['id', 'title', 'url', 'text']
Features (schema): {'id': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'url': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None)}


## Process Dataset

### Define Prompt Template

In [5]:
# PROMPT_TEMPLATE = "Output a response given the Output rules and Article.\nOutput Rules: Identify if" \
#     " there is one, multiple, or zero {elt}s in the article.\nIf the number of {elt}s == 0, then output " \
#     "'None'.\nIf the number of {elt}s > 0, then output the names of the {elt}s as a python list.\n" \
#     "Article: {article_text}"

PROMPT_TEMPLATE = "Who is the {elt} in the following text?\nText: {article_text}"

# Test the template with a dummy text
print(PROMPT_TEMPLATE.format(elt='hero',
      article_text='Lorem ipsum dolor sit amet, consectetur adipiscing elit.'))


Who is the hero in the following text?
Text: Lorem ipsum dolor sit amet, consectetur adipiscing elit.


### Expand Dataset

In [6]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
print("Input window length:", tokenizer.model_max_length)

Input window length: 512


Functions to segment articles into chunks fitting within the input window:

In [7]:
def calcInputLength(prompt):
    """Calculate the length of the input after"""
    return tokenizer(prompt, return_tensors="pt").input_ids.real.shape[1]


template_length = calcInputLength(
    PROMPT_TEMPLATE.format(elt='villain', article_text=' '))
print(template_length)

chunk_size = tokenizer.model_max_length - template_length
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=30,
    # separators=['.', '?', '!', "\n\n", "\n", " ", ""],
    length_function=calcInputLength)


def split_text(text, n_tokens, tokenizer, overlap=10):
    """Splits the input text into chunks with n_tokens tokens using HuggingFace tokenizer, with an overlap of overlap tokens."""
    tokens = tokenizer.tokenize(text)
    chunks = []
    i = 0
    while i < len(tokens):
        chunk = tokens[i:i+n_tokens]
        chunks.append(tokenizer.convert_tokens_to_string(chunk))
        i += n_tokens - overlap

    return chunks

13


For each article, distinct prompts identify 'hero', 'villain', and 'victim'. If an article exceeds the model's input size, it's divided into chunks, generating additional prompts. It seems that one article results in about 10 to 12 prompts.

In [8]:
def expandRow(row):
    """
    Generate prompts based on various roles and text chunks from the input row.
    """
    roles = ['hero', 'villain', 'victim']
    prompts = []

    # Split the text into chunks
    # text_splitter.split_text(row.get('text'))
    text_chunks = split_text(row.get('text'), 450, tokenizer, overlap=10)

    # Generate prompts for each role and text chunk
    for role in roles:
        for chunk_id, text_chunk in enumerate(text_chunks):
            prompt = PROMPT_TEMPLATE.format(elt=role, article_text=text_chunk)
            new_row = {
                **row,
                'prompt': prompt,
                'role': role,
                'chunk': chunk_id,
                'chunk_length': calcInputLength(text_chunk)
            }
            prompts.append(new_row)

    return prompts

Process datataset using multiple proesses:

In [9]:
num_processes = 12

with Pool(processes=num_processes) as pool:
    # The pool.map function applies the expandRow function to each row in dataset
    # and returns a list of results. Each result is a list, so we flatten the list using itertools.chain.
    dataset_hvv = list(pool.map(expandRow, dataset))

# Flatten the resulting list of lists
dataset_hvv = [item for sublist in dataset_hvv for item in sublist]

# Convert the list of dictionaries into a Dataset
dataset_hvv = Dataset.from_dict(
    {key: [dic[key] for dic in dataset_hvv] for key in dataset_hvv[0]})

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Token indices sequence length is longer than the specified maximum sequence length for this model (652 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (839 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1109 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1058 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (673 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

In [10]:
dataset_hvv.save_to_disk('data/input/articles_chunkified')

Saving the dataset (1/1 shards): 100%|██████████| 651/651 [00:00<00:00, 17371.97 examples/s]


### Tokenize Dataset

In [11]:
def tokenizeInputs(example):
    """Tokenize the inputs"""

    tokenized_inputs = tokenizer(example["prompt"], max_length=tokenizer.model_max_length,
                                    truncation=True, is_split_into_words=False, add_special_tokens=True, padding="max_length")

    # Combine original data with the tokenized inputs
    example.update(tokenized_inputs)
    return example

tokenized_dataset = dataset_hvv.map(tokenizeInputs)

Map: 100%|██████████| 651/651 [00:00<00:00, 740.17 examples/s]


## Calculate some basic Statistics

In [12]:
def calculate_prompt_length(row):
    row['prompt_length'] = calcInputLength(row['prompt'])
    return row

# Assuming the dataset object supports a map operation
tokenized_dataset = tokenized_dataset.map(calculate_prompt_length)

# Assuming the dataset object can be iterated like a list
min_length = min(row['prompt_length'] for row in tokenized_dataset)
max_length = max(row['prompt_length'] for row in tokenized_dataset)
total_length = sum(row['prompt_length'] for row in tokenized_dataset)
avg_length = total_length / len(tokenized_dataset)

print("Minimum prompt length:", min_length)
print("Maximum prompt length:", max_length)
print("Average prompt length:", avg_length)

Map: 100%|██████████| 651/651 [00:00<00:00, 740.70 examples/s]


Minimum prompt length: 37
Maximum prompt length: 464
Average prompt length: 408.27342549923196


In [13]:
print(tokenized_dataset[0]["prompt"])

Who is the hero in the following text?
Text: मेरा शहर Link Copied रहें हर खबर से अपडेट, डाउनलोड करें Android Hindi News App, iOS Hindi News App और Amarujala Hindi News APP अपने मोबाइल पे|Get all India News in Hindi related to live update of politics, sports, entertainment, technology and education etc. Stay updated with us for all breaking news from India News and more news in Hindi. Next Article Please wait... Please wait... Delete All Cookies Followed


## Save Dataset to Disk

In [14]:
tokenized_dataset.save_to_disk('../data/input/articles_tokenized')

Saving the dataset (1/1 shards): 100%|██████████| 651/651 [00:00<00:00, 9867.88 examples/s] 
