# Processing Dataset


### Functions



In [2]:
from datasets import * #suprisingly the only import needed.

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# A dictionary of arabic letters from dotted to dotless. (credit: @kaddu341)
letter_dict = {
    "ا": "ا",
    "أ": "ا",
    "إ": "ا",
    "آ": "ا",
    "ب": "ب",
    "ت": "ب",
    "ث": "ب",
    "ج": "ح",
    "ح": "ح",
    "خ": "ح",
    "د": "د",
    "ذ": "د",
    "ر": "ر",
    "ز": "ر",
    "س": "س",
    "ش": "س",
    "ص": "ص",
    "ض": "ص",
    "ط": "ط",
    "ظ": "ط",
    "ع": "ع",
    "غ": "ع",
    "ف": "ف",
    "ق": "ف",
    "ك": "ك",
    "ل": "ل",
    "م": "م",
    "ن": "ن",
    "و": "و",
    "ؤ": "و",
    "ه": "ه",
    "ة": "ه",
    "ي": "ى",
    "ى": "ى",
    "ئ": "ى",
    " ": " ",
}
letter_set = set(letter_dict.keys())


In [4]:
# Replaces multiple spaces, or whatever char designated by the perameter into one char. (helper function)
def replace(string, char):
    while char+char in string:
        string = string.replace(char+char, char)

    return string


In [5]:
# Essential function, parses arabic dotted text into dotless text. 
# Returns a dict with keys "clean" and "dotless". 
# This format is required for the datasets.map() function.

#unused, see below functions
def parse_text(text):
    
  text = text["text"].strip(" ")
  
  clean_chars = [char for char in text if char in letter_set]
  clean_string = ''.join(clean_chars)

  dotless_chars = [letter_dict[char] for char in clean_chars]
  dotless_string = ''.join(dotless_chars)
      
  clean_string = replace(clean_string, ' ')
  dotless_string = replace(dotless_string, '_')
  
  return {"clean": clean_string, "dotless": dotless_string}


In [6]:
def parse_clean_text(text):
    
  text = text["text"].strip(" ")
  
  clean_chars = [char for char in text if char in letter_set]
  clean_string = ''.join(clean_chars)
      
  clean_string = replace(clean_string, ' ')

  
  return {"clean": clean_string}

In [7]:
def parse_dotless_text(text):
    
  text = text["clean"].strip(" ")

  dotless_chars = [letter_dict[char] for char in text]
  dotless_string = ''.join(text)
      
  dotless_string = replace(dotless_string, '_')
  
  return {"dotless": dotless_string}

In [25]:
# Used for the datasets.filter() function to filter out examples/rows that have examples of less than 20 chars.
#def filterEmpty(x):
#    return not len(x['clean']) < 20

#Not used, lambda function used instead

### Loading, saving, and trimming Oscar (unshuffled_deduplicated_ar) dataset

In [9]:
#load the oscar unshuffled_deduplicated_ar dataset from huggingface 
#datasetLoad = load_dataset("oscar", "unshuffled_deduplicated_ar") #Oscar Arabic dataset

In [10]:
#save the dataset to "ar-arrow-datasets/" Apache Arrow datafolder
#datasetLoad.save_to_disk('ar-arrow-datasets')

In [11]:
#load from file
full_dataset = load_from_disk('ar-arrow-datasets')

In [12]:
print(full_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 9006977
    })
})


### Applying maps, filters, and splits
This process of applying maps and filters is very slow especially for big datasets. 

It is possible to implement a batched approach similar ot batched tokenization that would speed it up greatly. 

In the .map() add perameter batched=True. However the function called needs to be compatible with this approach.

I will implement this soon.

Generally this is not the way to handle big data. However this is fine for the AR-dotless-small dataset

In [61]:
# Gets the index of the #th space.
def num_word(text, num):
    space_count = 0
    index = -1
    for i in range(num):
        index = text.find(' ', index + 1)
    return index

In [76]:
# Chunks examples into smaller examples, at @words_per_chuck words each
def chunk_examples(examples, words_per_chunk=8):
    chunks = []
    for sentence in examples["clean"]:
        words = sentence.split()
        num_words = len(words)
        start = 0
        while start < num_words:
            end = min(start + words_per_chunk, num_words)
            chunk = ' '.join(words[start:end])
            chunks.append(chunk)
            start = end
    
    return {"clean": chunks}

In [74]:
dataset = full_dataset['train'].select(range(100)) #Trim to number of examples
dataset = dataset.remove_columns(["id"])
print(dataset, '\n')

dataset = dataset.filter(lambda x: not len(x['text']) < 20) #Apply the filter to remove useless empty examples.
print(dataset, '\n')

dataset = dataset.map(parse_clean_text, remove_columns="text") #Apply the function (convert dotted to dotless)
set_state_1 = dataset["clean"][0]
print(dataset, '\n')

# test to see how many examples are under 20 characters. Goal of filterEmpty() to eliminate
a = 0
for i in dataset['clean']:
    if len(i) < 40:
        a += 1
print(a, '\n')

dataset = dataset.map(chunk_examples, batched=True, remove_columns=dataset.column_names) # applies teh chuncking
set_state_2 = [dataset["clean"][0], dataset["clean"][1]]
print(dataset, '\n')

dataset = dataset.map(parse_dotless_text) #Apply the function (convert dotted to dotless)
set_state_3 = [dataset["dotless"][0], dataset["dotless"][1]]
print(dataset, '\n')

dataset = dataset.filter(lambda x: not len(x['clean']) < 20) #Apply the filter to remove useless empty examples.
print(dataset, '\n')


Dataset({
    features: ['text'],
    num_rows: 100
}) 

Dataset({
    features: ['text'],
    num_rows: 100
}) 

Dataset({
    features: ['clean'],
    num_rows: 100
}) 

0 

Dataset({
    features: ['clean'],
    num_rows: 4602
}) 

Dataset({
    features: ['clean', 'dotless'],
    num_rows: 4602
}) 

Dataset({
    features: ['clean', 'dotless'],
    num_rows: 4567
}) 



In [73]:
print(set_state_1,"\n\n")
print(set_state_2,"\n\n")
print(set_state_3)

مرحبا بك عزيز الزائر نتمنى لك أوقاتا سعيدة معنا وأن نزداد شرفا بخدمتك ولا تنسى التسجيل معنا لتستفيد ب 


['مرحبا بك عزيز الزائر نتمنى لك أوقاتا سعيدة', 'معنا وأن نزداد شرفا بخدمتك ولا تنسى التسجيل'] 


['مرحبا بك عزيز الزائر نتمنى لك أوقاتا سعيدة', 'معنا وأن نزداد شرفا بخدمتك ولا تنسى التسجيل']


In [30]:
# test to see how many examples are under 20 characters. Goal of filterEmpty() to eliminate
a = 0
for i in dataset['clean']:
    if len(i) < 20:
        a += 1
a

0

In [13]:
dataset_clean = dataset.train_test_split(train_size= 0.8) #create 80/20 split into train and test datasets
dataset_clean

DatasetDict({
    train: Dataset({
        features: ['clean', 'dotless'],
        num_rows: 79658
    })
    test: Dataset({
        features: ['clean', 'dotless'],
        num_rows: 19915
    })
})

### Save and upload to hugging face

In [14]:
#save the cleaned dataset to "clean-arrow-datasets/" Apache Arrow datafolder
dataset_clean.save_to_disk("clean-arrow-datasets")

Saving the dataset (2/2 shards): 100%|██████████| 79658/79658 [00:00<00:00, 251503.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 19915/19915 [00:00<00:00, 242508.32 examples/s]


In [143]:
#required token id. Run 'huggingface-cli login'
dataset_clean.push_to_hub("dot-ammar/AR-dotless-small", private=True)

Creating parquet from Arrow format: 100%|██████████| 40/40 [00:01<00:00, 37.10ba/s]
Creating parquet from Arrow format: 100%|██████████| 40/40 [00:01<00:00, 37.47ba/s]s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [00:29<00:00, 14.70s/it]
Deleting unused files from dataset repository: 100%|██████████| 2/2 [00:00<00:00,  8.21it/s]
Creating parquet from Arrow format: 100%|██████████| 20/20 [00:00<00:00, 36.05ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:04<00:00,  4.77s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  8.47it/s]
Downloading metadata: 100%|██████████| 622/622 [00:00<00:00, 6.41MB/s]
