# Processing Dataset


### Functions



In [None]:
from datasets import * #suprisingly the only import needed.

In [None]:
# Maps every supported Arabic letter (and the space) to its dotless base form.
# (credit: @kaddu341)
# Each entry is (dotless base, every character that collapses to that base).
_DOTLESS_GROUPS = (
    ("ا", ("ا", "أ", "إ", "آ")),
    ("ب", ("ب", "ت", "ث")),
    ("ح", ("ج", "ح", "خ")),
    ("د", ("د", "ذ")),
    ("ر", ("ر", "ز")),
    ("س", ("س", "ش")),
    ("ص", ("ص", "ض")),
    ("ط", ("ط", "ظ")),
    ("ع", ("ع", "غ")),
    ("ف", ("ف", "ق")),
    ("ك", ("ك",)),
    ("ل", ("ل",)),
    ("م", ("م",)),
    ("ن", ("ن",)),
    ("و", ("و", "ؤ")),
    ("ه", ("ه", "ة")),
    ("ى", ("ي", "ى", "ئ")),
    (" ", (" ",)),
)
letter_dict = {
    letter: base
    for base, letters in _DOTLESS_GROUPS
    for letter in letters
}
# The characters the cleaning functions keep; anything else is filtered out.
letter_set = set(letter_dict)


In [None]:
# Collapses runs of a repeated character (e.g. multiple spaces) into a single one. (helper function)
def replace(string, char):
    """Collapse every run of consecutive `char` in `string` into one `char`.

    Args:
        string: The text to process.
        char: The character (or substring) whose adjacent repeats are squeezed.

    Returns:
        `string` with no two adjacent occurrences of `char`. When `char` is
        empty, `string` is returned unchanged.
    """
    # An empty `char` is contained in every string, so the loop below would
    # never terminate without this guard.
    if not char:
        return string

    doubled = char + char
    # Every pass shortens the remaining runs; stop once no doubled occurrence is left.
    while doubled in string:
        string = string.replace(doubled, char)

    return string


In [None]:
# Parses dotted Arabic text into dotless text for a single (non-batched) example.
# Currently unused: the batched parse_clean_text / parse_dotless_text functions
# are used instead.
def parse_text(text):
  """Clean one example and derive its dotless form.

  Args:
      text: A dict with the raw string under the key "text".

  Returns:
      A dict with keys "clean" and "dotless" (the shape `datasets.map()`
      expects from a non-batched function). "clean" keeps only the characters
      found in `letter_set`, with runs of spaces squeezed to one and no
      leading/trailing space; "dotless" is "clean" mapped through
      `letter_dict`, so both strings always line up character for character.
  """
  # Keep only the characters that have a dotless mapping.
  kept = ''.join(char for char in text["text"] if char in letter_set)

  # Squeeze and trim spaces AFTER filtering: removing punctuation or digits
  # can leave runs of spaces and stray spaces at either end.
  clean_string = replace(kept, ' ').strip(' ')

  # Derive the dotless text from the already-squeezed clean text so the two
  # outputs cannot disagree about spacing.
  dotless_string = ''.join(letter_dict[char] for char in clean_string)
  # NOTE(review): no entry of letter_dict produces '_', so this squeeze looks
  # like a no-op; kept for parity with parse_dotless_text - confirm before removing.
  dotless_string = replace(dotless_string, '_')

  return {"clean": clean_string, "dotless": dotless_string}


In [None]:
def parse_clean_text(example):
  """Batched cleaning step for `datasets.map(..., batched=True)`.

  Args:
      example: A batch dict whose "text" entry is a list of raw strings.

  Returns:
      A dict {"clean": [...]} with one cleaned string per input string. Each
      cleaned string keeps only the characters found in `letter_set`, has runs
      of spaces squeezed to a single space, and has no leading/trailing space.
  """
  clean = []

  for text in example["text"]:
    # NOTE(review): the space is the only whitespace in letter_set, so newlines
    # and tabs are removed outright and the words around them get joined -
    # confirm this is intended.
    kept = ''.join(char for char in text if char in letter_set)

    # Trim AFTER filtering: dropping leading/trailing punctuation or digits
    # would otherwise leave a stray space at the edge of the cleaned text.
    clean.append(replace(kept, ' ').strip(' '))

  return {"clean": clean}

In [None]:
def parse_dotless_text(example):
  """Batched dotless conversion for `datasets.map(..., batched=True)`.

  Every string under example["clean"] is mapped character by character
  through `letter_dict`, then passed through `replace(..., '_')` to squeeze
  repeated underscores. Returns {"dotless": [...]}, one string per input.
  A character missing from `letter_dict` raises KeyError.
  """
  return {
    "dotless": [
      replace(''.join(letter_dict[symbol] for symbol in clean_text), '_')
      for clean_text in example["clean"]
    ]
  }

In [None]:
# Smoke test of the two batched helpers on a single sentence.
# Both helpers loop over the value stored under "text" / "clean", so the sample
# has to be a LIST of strings; a bare string would be processed one character
# at a time.
sample_batch = {"text": ["ولا تنسى التسجيل معنا لتستفيد بكل جديد"]}
print(sample_batch, "\n")

clean_batch = parse_clean_text(sample_batch)
print(clean_batch, "\n")

dotless_batch = parse_dotless_text(clean_batch)
print(dotless_batch, "\n")


In [None]:
# Intended for datasets.filter(): drops examples/rows whose clean text is shorter than 20 characters.
#def filterEmpty(x):
#    return not len(x['clean']) < 20

#Not used, lambda function used instead

### Loading, saving, and trimming Oscar (unshuffled_deduplicated_ar) dataset

In [None]:
#load the oscar unshuffled_deduplicated_ar dataset from huggingface 
#datasetLoad = load_dataset("oscar", "unshuffled_deduplicated_ar") #Oscar Arabic dataset

In [None]:
#save the dataset to "ar-arrow-datasets/" Apache Arrow datafolder
#datasetLoad.save_to_disk('ar-arrow-datasets')

In [None]:
# Load the dataset from the local 'ar-arrow-datasets' folder, the same path the
# commented-out save_to_disk() call above writes to.
full_dataset = load_from_disk('ar-arrow-datasets')

In [None]:
# Show a summary of the loaded dataset object.
print(full_dataset)

### Applying maps, filters, and splits
This process of applying maps and filters is very slow especially for big datasets. 

It is possible to implement a batched approach, similar to batched tokenization, that would speed it up greatly. 

In the .map() call, add the parameter batched=True. However, the function being called needs to be compatible with this approach.

I will implement this soon.

Generally this is not the way to handle big data. However this is fine for the AR-dotless-small dataset

In [None]:
# Gets the index of the num-th space.
def num_word(text, num):
    """Return the index of the `num`-th space in `text`, counting from 1.

    Args:
        text: The string to search.
        num: Which space to locate (1 = first space).

    Returns:
        The index of that space, or -1 when `text` contains fewer than `num`
        spaces (or when `num` is less than 1).
    """
    index = -1
    for _ in range(num):
        index = text.find(' ', index + 1)
        if index == -1:
            # Stop at the first miss: searching again from index + 1 == 0
            # would restart at the beginning and report an earlier space.
            break
    return index

In [None]:
import math
exampleToWordsLen = [] # filled by chunk_examples(): number of chunks each original example was split into

In [None]:
# Peek at the first few chunk counts. The list starts empty and is only filled
# when chunk_examples() runs, so on a top-to-bottom run this prints [].
print(exampleToWordsLen[:5])

In [None]:

# Chunks examples into smaller examples of at most `words_per_chunk` words each.
def chunk_examples(examples, words_per_chunk=8):
    """Split every cleaned text of a batch into fixed-size word chunks.

    Written for `datasets.map(..., batched=True)`; the returned batch can hold
    more rows than the input batch.

    Side effect: appends one entry per input text to the module-level list
    `exampleToWordsLen`, holding the number of chunks that text produced.

    Args:
        examples: A batch dict whose "clean" entry is a list of strings.
        words_per_chunk: Maximum number of whitespace-separated words per
            chunk; must be at least 1.

    Returns:
        A dict {"clean": [...]} with all chunks in input order. The last chunk
        of a text may be shorter; a text without words yields no chunk.

    Raises:
        ValueError: If `words_per_chunk` is smaller than 1.
    """
    # A negative size would never advance through the words, and zero would
    # fail with an unrelated division error, so reject both up front.
    if words_per_chunk < 1:
        raise ValueError(f"words_per_chunk must be >= 1, got {words_per_chunk!r}")

    chunks = []
    for sentence in examples["clean"]:
        words = sentence.split()
        pieces = [
            ' '.join(words[start:start + words_per_chunk])
            for start in range(0, len(words), words_per_chunk)
        ]
        # Record how many chunks this original example was split into.
        exampleToWordsLen.append(len(pieces))
        chunks.extend(pieces)

    return {"clean": chunks}

In [None]:
# Words per chunk, passed explicitly to chunk_examples and reused below to
# work out how many chunks the first example produces.
WORDS_PER_CHUNK = 8

dataset = full_dataset['train'].select(range(100000)) # Trim to the first 100,000 examples
dataset = dataset.remove_columns(["id"])
print(dataset, '\n')

# Drop raw rows whose text is shorter than 20 characters.
dataset = dataset.filter(lambda x: not len(x['text']) < 20)
print(dataset, '\n')

# Cleaning step: keep only the characters known to letter_dict (no dotless conversion yet).
dataset = dataset.map(parse_clean_text, batched=True, remove_columns="text")
set_state_1 = dataset["clean"][0]
print(dataset, '\n')

# Diagnostic only: how many cleaned examples are shorter than 40 characters.
a = 0
for i in dataset['clean']:
    if len(i) < 40:
        a += 1
print(a, '\n')

# Number of chunks the first cleaned example turns into. Derived directly from
# that example instead of reading exampleToWordsLen[0]: the global list keeps
# growing across re-runs of this cell and stays empty whenever map() does not
# actually call chunk_examples (e.g. a result reloaded from cache).
first_example_chunks = math.ceil(len(set_state_1.split()) / WORDS_PER_CHUNK)

# Split every cleaned example into chunks of WORDS_PER_CHUNK words.
dataset = dataset.map(
    chunk_examples,
    batched=True,
    remove_columns=dataset.column_names,
    fn_kwargs={"words_per_chunk": WORDS_PER_CHUNK},
)
set_state_2 = dataset["clean"][0:first_example_chunks]
print(dataset, '\n')

# Add the "dotless" column (convert dotted to dotless).
dataset = dataset.map(parse_dotless_text, batched=True)
set_state_3 = dataset["dotless"][0:first_example_chunks]
print(dataset, '\n')

# Drop chunks whose clean text is shorter than 20 characters.
dataset = dataset.filter(lambda x: not len(x['clean']) < 20)
print(dataset, '\n')

In [None]:
# Show the snapshots taken during processing: the first cleaned example, the
# leading chunks taken after chunking, and the leading dotless chunks.
print(set_state_1,"\n\n")
print(set_state_2,"\n\n")
print(set_state_3)

In [None]:
# Sanity check: count the remaining examples whose clean text is shorter than
# 20 characters (the kind of row the length filter is meant to eliminate).
a = sum(1 for clean_text in dataset['clean'] if len(clean_text) < 20)
a

In [None]:
# Not an 80/20 split: train_size is ~1.0, so practically every row ends up in
# 'train'. The split is used here to obtain a dict-like object holding a 'train' split.
# NOTE(review): no seed is passed; if train_test_split shuffles by default the
# row order will differ between runs - confirm and pass a seed if needed.
dataset_clean = dataset.train_test_split(train_size= 0.999999999)
del dataset_clean['test'] # no longer using a test set
dataset_clean


### Save and upload to hugging face

In [142]:
# Save the cleaned dataset to the "AR-dotless-mediumPlus-arrow/" folder.
dataset_clean.save_to_disk("AR-dotless-mediumPlus-arrow")

Saving the dataset (2/2 shards): 100%|██████████| 4446330/4446330 [00:14<00:00, 313548.54 examples/s]


In [143]:
# Upload the cleaned dataset to the Hub repository "dot-ammar/AR-dotless-mediumPlus".
# Requires an access token: run 'huggingface-cli login' beforehand.
dataset_clean.push_to_hub("dot-ammar/AR-dotless-mediumPlus")

Creating parquet from Arrow format: 100%|██████████| 2224/2224 [00:27<00:00, 81.49ba/s]
Creating parquet from Arrow format: 100%|██████████| 2224/2224 [00:26<00:00, 84.88ba/s]]
Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [01:36<00:00, 48.38s/it]
